Diffstat (limited to 'src/libffmpeg/libavcodec/alpha')
-rw-r--r--  src/libffmpeg/libavcodec/alpha/dsputil_alpha.c      246
-rw-r--r--  src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S  306
-rw-r--r--  src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c    110
-rw-r--r--  src/libffmpeg/libavcodec/alpha/regdef.h              45
4 files changed, 552 insertions(+), 155 deletions(-)
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 5e1aa2093..9a3fb1eac 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -22,6 +22,8 @@
void simple_idct_axp(DCTELEM *block);
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
@@ -103,145 +105,183 @@ void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
}
#endif
-/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
- Since the immediate result could be greater than 255, we do the
- shift first. The result is too low by one if the bytes were both
- odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
-{
- UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
- l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
- l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
- return l1 + l2 + correction;
+static void clear_blocks_axp(DCTELEM *blocks) {
+ uint64_t *p = (uint64_t *) blocks;
+ int n = sizeof(DCTELEM) * 6 * 64;
+
+ do {
+ p[0] = 0;
+ p[1] = 0;
+ p[2] = 0;
+ p[3] = 0;
+ p[4] = 0;
+ p[5] = 0;
+ p[6] = 0;
+ p[7] = 0;
+ p += 8;
+ n -= 8 * 8;
+ } while (n);
}
-/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
- The '1' only has an effect when one byte is even and the other odd,
- i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
- Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
- UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
- l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
- l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
- return l1 + l2 + correction;
+ return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
-static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+static inline uint64_t avg2(uint64_t a, uint64_t b)
{
- UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
- + ((l2 & ~BYTE_VEC(0x03)) >> 2)
- + ((l3 & ~BYTE_VEC(0x03)) >> 2)
- + ((l4 & ~BYTE_VEC(0x03)) >> 2);
- UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
- + (l2 & BYTE_VEC(0x03))
- + (l3 & BYTE_VEC(0x03))
- + (l4 & BYTE_VEC(0x03))
- + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
- return r1 + r2;
+ return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
-static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+ each iteration. */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
- UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
- + ((l2 & ~BYTE_VEC(0x03)) >> 2)
- + ((l3 & ~BYTE_VEC(0x03)) >> 2)
- + ((l4 & ~BYTE_VEC(0x03)) >> 2);
- UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
- + (l2 & BYTE_VEC(0x03))
- + (l3 & BYTE_VEC(0x03))
- + (l4 & BYTE_VEC(0x03))
- + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
+ uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+ uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
+ + (l2 & BYTE_VEC(0x03))
+ + (l3 & BYTE_VEC(0x03))
+ + (l4 & BYTE_VEC(0x03))
+ + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
return r1 + r2;
}
+#endif
-#define PIXOPNAME(suffix) put ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP(LOAD, STORE) \
+ do { \
+ STORE(LOAD(pixels), block); \
+ pixels += line_size; \
+ block += line_size; \
+ } while (--h)
-#define PIXOPNAME(suffix) put_no_rnd ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP_X2(LOAD, STORE) \
+ do { \
+ uint64_t pix1, pix2; \
+ \
+ pix1 = LOAD(pixels); \
+ pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+ STORE(AVG2(pix1, pix2), block); \
+ pixels += line_size; \
+ block += line_size; \
+ } while (--h)
-/* The following functions are untested. */
-#if 0
+#define OP_Y2(LOAD, STORE) \
+ do { \
+ uint64_t pix = LOAD(pixels); \
+ do { \
+ uint64_t next_pix; \
+ \
+ pixels += line_size; \
+ next_pix = LOAD(pixels); \
+ STORE(AVG2(pix, next_pix), block); \
+ block += line_size; \
+ pix = next_pix; \
+ } while (--h); \
+ } while (0)
+
+#define OP_XY2(LOAD, STORE) \
+ do { \
+ uint64_t pix1 = LOAD(pixels); \
+ uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+ uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
+ + (pix2 & BYTE_VEC(0x03)); \
+ uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
+ + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
+ \
+ do { \
+ uint64_t npix1, npix2; \
+ uint64_t npix_l, npix_h; \
+ uint64_t avg; \
+ \
+ pixels += line_size; \
+ npix1 = LOAD(pixels); \
+ npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+ npix_l = (npix1 & BYTE_VEC(0x03)) \
+ + (npix2 & BYTE_VEC(0x03)); \
+ npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
+ + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
+ avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+ + pix_h + npix_h; \
+ STORE(avg, block); \
+ \
+ block += line_size; \
+ pix_l = npix_l; \
+ pix_h = npix_h; \
+ } while (--h); \
+ } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
+static void OPNAME ## _pixels ## SUFF ## _axp \
+ (uint8_t *restrict block, const uint8_t *restrict pixels, \
+ int line_size, int h) \
+{ \
+ if ((size_t) pixels & 0x7) { \
+ OPKIND(uldq, STORE); \
+ } else { \
+ OPKIND(ldq, STORE); \
+ } \
+}
-#define PIXOPNAME(suffix) avg ## suffix
-#define BTYPE UINT8
+#define PIXOP(OPNAME, STORE) \
+ MAKE_OP(OPNAME, , OP, STORE) \
+ MAKE_OP(OPNAME, _x2, OP_X2, STORE) \
+ MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \
+ MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
+PIXOP(avg, STORE);
+
+/* Not rounding primitives. */
#undef AVG2
#undef AVG4
+#undef AVG4_ROUNDER
#undef STORE
-
-#define PIXOPNAME(suffix) avg_no_rnd ## suffix
-#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
-#define PIXOPNAME(suffix) sub ## suffix
-#define BTYPE DCTELEM
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, block) do { \
- UINT64 xxx = l; \
- (block)[0] -= (xxx >> 0) & 0xff; \
- (block)[1] -= (xxx >> 8) & 0xff; \
- (block)[2] -= (xxx >> 16) & 0xff; \
- (block)[3] -= (xxx >> 24) & 0xff; \
- (block)[4] -= (xxx >> 32) & 0xff; \
- (block)[5] -= (xxx >> 40) & 0xff; \
- (block)[6] -= (xxx >> 48) & 0xff; \
- (block)[7] -= (xxx >> 56) & 0xff; \
-} while (0)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
#undef STORE
-
-#endif
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
void dsputil_init_alpha(void)
{
- put_pixels_tab[0] = put_pixels_axp;
+ put_pixels_tab[0] = put_pixels_axp_asm;
put_pixels_tab[1] = put_pixels_x2_axp;
put_pixels_tab[2] = put_pixels_y2_axp;
put_pixels_tab[3] = put_pixels_xy2_axp;
- put_no_rnd_pixels_tab[0] = put_pixels_axp;
+ put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
+ avg_pixels_tab[0] = avg_pixels_axp;
+ avg_pixels_tab[1] = avg_pixels_x2_axp;
+ avg_pixels_tab[2] = avg_pixels_y2_axp;
+ avg_pixels_tab[3] = avg_pixels_xy2_axp;
+
+ avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
+ avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
+ avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
+ avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;
+
+ clear_blocks = clear_blocks_axp;
+
/* amask clears all bits that correspond to present features. */
if (amask(AMASK_MVI) == 0) {
put_pixels_clamped = put_pixels_clamped_mvi_asm;
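[Note, not part of the patch] The rewritten avg2/avg2_no_rnd above replace the old mask-and-shift averaging with the carry-free identities floor((a+b)/2) = (a & b) + ((a ^ b) >> 1) and ceil((a+b)/2) = (a | b) - ((a ^ b) >> 1), applied to eight packed bytes at once by masking off the low bit of each byte before the shift. A minimal C sketch that spot-checks both identities against a per-byte reference; the BYTE_VEC macro here is a local stand-in for the one used in the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

    /* Rounded byte-wise average: per byte, (b1 + b2 + 1) >> 1. */
    static uint64_t avg2(uint64_t a, uint64_t b)
    {
        return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
    }

    /* Truncating byte-wise average: per byte, (b1 + b2) >> 1. */
    static uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
    {
        return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
    }

    int main(void)
    {
        uint64_t a = 0x00ff7f80fe01aa55ULL;
        uint64_t b = 0xff000180017f55aaULL;
        for (int i = 0; i < 8; i++) {
            unsigned x = (a >> (8 * i)) & 0xff, y = (b >> (8 * i)) & 0xff;
            unsigned r  = (avg2(a, b)        >> (8 * i)) & 0xff;
            unsigned nr = (avg2_no_rnd(a, b) >> (8 * i)) & 0xff;
            printf("%3u %3u -> rnd %3u (want %3u), no_rnd %3u (want %3u)\n",
                   x, y, r, (x + y + 1) >> 1, nr, (x + y) >> 1);
        }
        return 0;
    }

Because each byte's average never exceeds 255, neither the addition nor the subtraction can carry or borrow into the neighbouring byte, which is what makes the trick safe on packed quadwords.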
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
new file mode 100644
index 000000000..5349e443c
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
@@ -0,0 +1,306 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+#ifdef HAVE_AV_CONFIG_H
+#include "config.h"
+#endif
+
+/* Some nicer register names. */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+ .set noat
+ .set noreorder
+ .arch pca56
+ .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ * int line_size, int h)
+ */
+ .align 6
+ .globl put_pixels_axp_asm
+ .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+ .frame sp, 0, ra
+ .prologue 0
+
+#ifdef HAVE_GPROF
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+#endif
+
+ and a1, 7, t0
+ beq t0, $aligned
+
+ .align 4
+$unaligned:
+ ldq_u t0, 0(a1)
+ ldq_u t1, 8(a1)
+ addq a1, a2, a1
+ nop
+
+ ldq_u t2, 0(a1)
+ ldq_u t3, 8(a1)
+ addq a1, a2, a1
+ nop
+
+ ldq_u t4, 0(a1)
+ ldq_u t5, 8(a1)
+ addq a1, a2, a1
+ nop
+
+ ldq_u t6, 0(a1)
+ ldq_u t7, 8(a1)
+ extql t0, a1, t0
+ addq a1, a2, a1
+
+ extqh t1, a1, t1
+ addq a0, a2, t8
+ extql t2, a1, t2
+ addq t8, a2, t9
+
+ extqh t3, a1, t3
+ addq t9, a2, ta
+ extql t4, a1, t4
+ or t0, t1, t0
+
+ extqh t5, a1, t5
+ or t2, t3, t2
+ extql t6, a1, t6
+ or t4, t5, t4
+
+ extqh t7, a1, t7
+ or t6, t7, t6
+ stq t0, 0(a0)
+ stq t2, 0(t8)
+
+ stq t4, 0(t9)
+ subq a3, 4, a3
+ stq t6, 0(ta)
+ addq ta, a2, a0
+
+ bne a3, $unaligned
+ ret
+
+ .align 4
+$aligned:
+ ldq t0, 0(a1)
+ addq a1, a2, a1
+ ldq t1, 0(a1)
+ addq a1, a2, a1
+
+ ldq t2, 0(a1)
+ addq a1, a2, a1
+ ldq t3, 0(a1)
+ addq a1, a2, a1
+
+ ldq t4, 0(a1)
+ addq a1, a2, a1
+ ldq t5, 0(a1)
+ addq a1, a2, a1
+
+ ldq t6, 0(a1)
+ addq a1, a2, a1
+ ldq t7, 0(a1)
+ addq a1, a2, a1
+
+ addq a0, a2, t8
+ stq t0, 0(a0)
+ addq t8, a2, t9
+ stq t1, 0(t8)
+
+ addq t9, a2, ta
+ stq t2, 0(t9)
+ addq ta, a2, tb
+ stq t3, 0(ta)
+
+ addq tb, a2, tc
+ stq t4, 0(tb)
+ addq tc, a2, td
+ stq t5, 0(tc)
+
+ addq td, a2, te
+ stq t6, 0(td)
+ addq te, a2, a0
+ stq t7, 0(te)
+
+ subq a3, 8, a3
+ bne a3, $aligned
+
+ ret
+ .end put_pixels_axp_asm
+
+/************************************************************************
+ * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+ * int line_size)
+ */
+ .align 6
+ .globl put_pixels_clamped_mvi_asm
+ .ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+ .frame sp, 0, ra
+ .prologue 0
+
+#ifdef HAVE_GPROF
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+#endif
+
+ lda t8, -1
+ lda t9, 8 # loop counter
+ zap t8, 0xaa, t8 # 00ff00ff00ff00ff
+
+ .align 4
+1: ldq t0, 0(a0)
+ ldq t1, 8(a0)
+ ldq t2, 16(a0)
+ ldq t3, 24(a0)
+
+ maxsw4 t0, zero, t0
+ subq t9, 2, t9
+ maxsw4 t1, zero, t1
+ lda a0, 32(a0)
+
+ maxsw4 t2, zero, t2
+ addq a1, a2, ta
+ maxsw4 t3, zero, t3
+ minsw4 t0, t8, t0
+
+ minsw4 t1, t8, t1
+ minsw4 t2, t8, t2
+ minsw4 t3, t8, t3
+ pkwb t0, t0
+
+ pkwb t1, t1
+ pkwb t2, t2
+ pkwb t3, t3
+ stl t0, 0(a1)
+
+ stl t1, 4(a1)
+ addq ta, a2, a1
+ stl t2, 0(ta)
+ stl t3, 4(ta)
+
+ bne t9, 1b
+ ret
+ .end put_pixels_clamped_mvi_asm
+
+/************************************************************************
+ * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+ * int line_size)
+ */
+ .align 6
+ .globl add_pixels_clamped_mvi_asm
+ .ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+ .frame sp, 0, ra
+ .prologue 0
+
+#ifdef HAVE_GPROF
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+#endif
+
+ lda t1, -1
+ lda th, 8
+ zap t1, 0x33, tg
+ nop
+
+ srl tg, 1, t0
+ xor tg, t0, tg # 0x8000800080008000
+ zap t1, 0xaa, tf # 0x00ff00ff00ff00ff
+
+ .align 4
+1: ldl t1, 0(a1) # pix0 (try to hit cache line soon)
+ ldl t4, 4(a1) # pix1
+ addq a1, a2, te # pixels += line_size
+ ldq t0, 0(a0) # shorts0
+
+ ldl t7, 0(te) # pix2 (try to hit cache line soon)
+ ldl ta, 4(te) # pix3
+ ldq t3, 8(a0) # shorts1
+ ldq t6, 16(a0) # shorts2
+
+ ldq t9, 24(a0) # shorts3
+ unpkbw t1, t1 # 0 0 (quarter/op no.)
+ and t0, tg, t2 # 0 1
+ unpkbw t4, t4 # 1 0
+
+ bic t0, tg, t0 # 0 2
+ unpkbw t7, t7 # 2 0
+ and t3, tg, t5 # 1 1
+ addq t0, t1, t0 # 0 3
+
+ xor t0, t2, t0 # 0 4
+ unpkbw ta, ta # 3 0
+ and t6, tg, t8 # 2 1
+ maxsw4 t0, zero, t0 # 0 5
+
+ bic t3, tg, t3 # 1 2
+ bic t6, tg, t6 # 2 2
+ minsw4 t0, tf, t0 # 0 6
+ addq t3, t4, t3 # 1 3
+
+ pkwb t0, t0 # 0 7
+ xor t3, t5, t3 # 1 4
+ maxsw4 t3, zero, t3 # 1 5
+ addq t6, t7, t6 # 2 3
+
+ xor t6, t8, t6 # 2 4
+ and t9, tg, tb # 3 1
+ minsw4 t3, tf, t3 # 1 6
+ bic t9, tg, t9 # 3 2
+
+ maxsw4 t6, zero, t6 # 2 5
+ addq t9, ta, t9 # 3 3
+ stl t0, 0(a1) # 0 8
+ minsw4 t6, tf, t6 # 2 6
+
+ xor t9, tb, t9 # 3 4
+ maxsw4 t9, zero, t9 # 3 5
+ lda a0, 32(a0) # block += 16;
+ pkwb t3, t3 # 1 7
+
+ minsw4 t9, tf, t9 # 3 6
+ subq th, 2, th
+ pkwb t6, t6 # 2 7
+ pkwb t9, t9 # 3 7
+
+ stl t3, 4(a1) # 1 8
+ addq te, a2, a1 # pixels += line_size
+ stl t6, 0(te) # 2 8
+ stl t9, 4(te) # 3 8
+
+ bne th, 1b
+ ret
+ .end add_pixels_clamped_mvi_asm
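[Note, not part of the patch] put_pixels_clamped_mvi_asm above clamps four signed 16-bit coefficients at a time with maxsw4/minsw4 against 0 and 0x00ff00ff00ff00ff and packs them to bytes with pkwb. A plain-C sketch of the same semantics for an 8x8 block, assuming DCTELEM is a 16-bit integer as elsewhere in this libavcodec:

    #include <stdint.h>

    typedef int16_t DCTELEM;

    /* Scalar equivalent of put_pixels_clamped_mvi_asm: clamp each
       coefficient to 0..255 and store it as one byte of the
       destination row; advance by line_size between rows. */
    static void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                                       int line_size)
    {
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 8; j++) {
                int v = block[i * 8 + j];
                if (v < 0)   v = 0;
                if (v > 255) v = 255;
                pixels[j] = (uint8_t) v;
            }
            pixels += line_size;
        }
    }

add_pixels_clamped_mvi_asm does the same, except that the clamped value is the sum of the coefficient and the existing pixel rather than the coefficient alone.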
diff --git a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
index eb1997eee..0be327079 100644
--- a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
@@ -23,69 +23,75 @@
extern UINT8 zigzag_end[64];
-static void dct_unquantize_h263_axp(MpegEncContext *s,
- DCTELEM *block, int n, int qscale)
+static void dct_unquantize_h263_axp(MpegEncContext *s, DCTELEM *block,
+ int n, int qscale)
{
- int i, level;
- UINT64 qmul, qadd;
+ int i, n_coeffs;
+ uint64_t qmul, qadd;
+ uint64_t correction;
+ DCTELEM *orig_block = block;
+ DCTELEM block0;
- ASM_ACCEPT_MVI;
-
if (s->mb_intra) {
- if (n < 4)
- block[0] = block[0] * s->y_dc_scale;
- else
- block[0] = block[0] * s->c_dc_scale;
- /* Catch up to aligned point. */
- qmul = s->qscale << 1;
- qadd = (s->qscale - 1) | 1;
- for (i = 1; i < 4; ++i) {
- level = block[i];
- if (level) {
- if (level < 0) {
- level = level * qmul - qadd;
- } else {
- level = level * qmul + qadd;
- }
- block[i] = level;
- }
- }
- block += 4;
- i = 60 / 4;
+ if (!s->h263_aic) {
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ }
+ n_coeffs = 64; // does not always use zigzag table
} else {
- i = zigzag_end[s->block_last_index[n]] / 4;
+ n_coeffs = zigzag_end[s->block_last_index[n]];
}
- qmul = s->qscale << 1;
+
+ qmul = qscale << 1;
qadd = WORD_VEC((qscale - 1) | 1);
- do {
- UINT64 levels, negmask, zeromask, corr;
- levels = ldq(block);
- if (levels == 0)
- continue;
- zeromask = cmpbge(0, levels);
- zeromask &= zeromask >> 1;
- /* Negate all negative words. */
- negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */
- negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
- corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */
- levels ^= negmask;
- levels += corr;
+ /* This mask kills spill from negative subwords to the next subword. */
+ correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */
+
+ for(i = 0; i < n_coeffs; block += 4, i += 4) {
+ uint64_t levels, negmask, zeros, add;
+
+ levels = ldq(block);
+ if (levels == 0)
+ continue;
+
+#ifdef __alpha_max__
+ /* I don't think the speed difference justifies runtime
+ detection. */
+ ASM_ACCEPT_MVI;
+ negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
+ negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+#else
+ negmask = cmpbge(WORD_VEC(0x7fff), levels);
+ negmask &= (negmask >> 1) | (1 << 7);
+ negmask = zap(-1, negmask);
+#endif
+
+ zeros = cmpbge(0, levels);
+ zeros &= zeros >> 1;
+ /* zeros |= zeros << 1 is not needed since qadd <= 255, so
+ zapping the lower byte suffices. */
- levels = levels * qmul;
- levels += zap(qadd, zeromask);
+ levels *= qmul;
+ levels -= correction & (negmask << 16);
- /* Re-negate negative words. */
- levels -= corr;
- levels ^= negmask;
+ /* Negate qadd for negative levels. */
+ add = qadd ^ negmask;
+ add += WORD_VEC(0x0001) & negmask;
+ /* Set qadd to 0 for levels == 0. */
+ add = zap(add, zeros);
- stq(levels, block);
- } while (block += 4, --i);
+ levels += add;
+
+ stq(levels, block);
+ }
+
+ if (s->mb_intra && !s->h263_aic)
+ orig_block[0] = block0;
}
void MPV_common_init_axp(MpegEncContext *s)
{
- if (amask(AMASK_MVI) == 0) {
- if (s->out_format == FMT_H263)
- s->dct_unquantize = dct_unquantize_h263_axp;
- }
+ s->dct_unquantize_h263 = dct_unquantize_h263_axp;
}
diff --git a/src/libffmpeg/libavcodec/alpha/regdef.h b/src/libffmpeg/libavcodec/alpha/regdef.h
new file mode 100644
index 000000000..7e7fc06b2
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/regdef.h
@@ -0,0 +1,45 @@
+/* Some BSDs don't seem to have regdef.h... sigh */
+#ifndef alpha_regdef_h
+#define alpha_regdef_h
+
+#define v0 $0 /* function return value */
+
+#define t0 $1 /* temporary registers (caller-saved) */
+#define t1 $2
+#define t2 $3
+#define t3 $4
+#define t4 $5
+#define t5 $6
+#define t6 $7
+#define t7 $8
+
+#define s0 $9 /* saved-registers (callee-saved registers) */
+#define s1 $10
+#define s2 $11
+#define s3 $12
+#define s4 $13
+#define s5 $14
+#define s6 $15
+#define fp s6 /* frame-pointer (s6 in frame-less procedures) */
+
+#define a0 $16 /* argument registers (caller-saved) */
+#define a1 $17
+#define a2 $18
+#define a3 $19
+#define a4 $20
+#define a5 $21
+
+#define t8 $22 /* more temps (caller-saved) */
+#define t9 $23
+#define t10 $24
+#define t11 $25
+#define ra $26 /* return address register */
+#define t12 $27
+
+#define pv t12 /* procedure-variable register */
+#define AT $at /* assembler temporary */
+#define gp $29 /* global pointer */
+#define sp $30 /* stack pointer */
+#define zero $31 /* reads as zero, writes are noops */
+
+#endif /* alpha_regdef_h */