Diffstat (limited to 'src/libffmpeg/libavcodec/alpha')
-rw-r--r-- | src/libffmpeg/libavcodec/alpha/asm.h             | 181
-rw-r--r-- | src/libffmpeg/libavcodec/alpha/dsputil_alpha.c   | 113
-rw-r--r-- | src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c |   3
3 files changed, 160 insertions, 137 deletions
diff --git a/src/libffmpeg/libavcodec/alpha/asm.h b/src/libffmpeg/libavcodec/alpha/asm.h
index 0f4685f11..ceaf0be4a 100644
--- a/src/libffmpeg/libavcodec/alpha/asm.h
+++ b/src/libffmpeg/libavcodec/alpha/asm.h
@@ -20,122 +20,115 @@
 #ifndef LIBAVCODEC_ALPHA_ASM_H
 #define LIBAVCODEC_ALPHA_ASM_H
 
-#include <stdint.h>
+#include <inttypes.h>
 
 #define AMASK_BWX (1 << 0)
 #define AMASK_FIX (1 << 1)
+#define AMASK_CIX (1 << 2)
 #define AMASK_MVI (1 << 8)
 
-static inline uint64_t BYTE_VEC(uint64_t x)
+inline static uint64_t BYTE_VEC(uint64_t x)
 {
     x |= x << 8;
     x |= x << 16;
     x |= x << 32;
     return x;
 }
-static inline uint64_t WORD_VEC(uint64_t x)
+inline static uint64_t WORD_VEC(uint64_t x)
 {
     x |= x << 16;
     x |= x << 32;
     return x;
 }
 
-static inline int32_t ldl(const void* p)
-{
-    return *(const int32_t*) p;
-}
-static inline uint64_t ldq(const void* p)
-{
-    return *(const uint64_t*) p;
-}
-/* FIXME ccc doesn't seem to get it? Use inline asm? */
-static inline uint64_t ldq_u(const void* p)
-{
-    return *(const uint64_t*) ((uintptr_t) p & ~7ul);
-}
-static inline void stl(uint32_t l, void* p)
-{
-    *(uint32_t*) p = l;
-}
-static inline void stq(uint64_t l, void* p)
-{
-    *(uint64_t*) p = l;
-}
+#define ldq(p) (*(const uint64_t *) (p))
+#define ldl(p) (*(const int32_t *) (p))
+#define stl(l, p) do { *(uint32_t *) (p) = (l); } while (0)
+#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)
 
 #ifdef __GNUC__
-#define OPCODE1(name) \
-static inline uint64_t name(uint64_t l) \
-{ \
-    uint64_t r; \
-    asm (#name " %1, %0" : "=r" (r) : "r" (l)); \
-    return r; \
-}
-
-#define OPCODE2(name) \
-static inline uint64_t name(uint64_t l1, uint64_t l2) \
-{ \
-    uint64_t r; \
-    asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2)); \
-    return r; \
-}
-
-/* We don't want gcc to move this around or combine it with another
-   rpcc, so mark it volatile. */
-static inline uint64_t rpcc(void)
-{
-    uint64_t r;
-    asm volatile ("rpcc %0" : "=r" (r));
-    return r;
-}
-
-static inline uint64_t uldq(const void* v)
-{
-    struct foo {
-        unsigned long l;
-    } __attribute__((packed));
-
-    return ((const struct foo*) v)->l;
-}
+#define ASM_ACCEPT_MVI asm (".arch pca56")
+struct unaligned_long { uint64_t l; } __attribute__((packed));
+#define ldq_u(p) (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
+#define uldq(a) (((const struct unaligned_long *) (a))->l)
+
+#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 2
+#define cmpbge __builtin_alpha_cmpbge
+/* Avoid warnings. */
+#define extql(a, b) __builtin_alpha_extql(a, (uint64_t) (b))
+#define extqh(a, b) __builtin_alpha_extqh(a, (uint64_t) (b))
+#define zap __builtin_alpha_zap
+#define zapnot __builtin_alpha_zapnot
+#define amask __builtin_alpha_amask
+#define implver __builtin_alpha_implver
+#define rpcc __builtin_alpha_rpcc
+#define minub8 __builtin_alpha_minub8
+#define minsb8 __builtin_alpha_minsb8
+#define minuw4 __builtin_alpha_minuw4
+#define minsw4 __builtin_alpha_minsw4
+#define maxub8 __builtin_alpha_maxub8
+#define maxsb8 __builtin_alpha_maxsb8
+#define maxuw4 __builtin_alpha_maxuw4
+#define maxsw4 __builtin_alpha_maxsw4
+#define perr __builtin_alpha_perr
+#define pklb __builtin_alpha_pklb
+#define pkwb __builtin_alpha_pkwb
+#define unpkbl __builtin_alpha_unpkbl
+#define unpkbw __builtin_alpha_unpkbw
+#else
+#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extql(a, b) ({ uint64_t __r; asm ("extql %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extqh(a, b) ({ uint64_t __r; asm ("extqh %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define zap(a, b) ({ uint64_t __r; asm ("zap %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define amask(a) ({ uint64_t __r; asm ("amask %1,%0" : "=r" (__r) : "rI" (a)); __r; })
+#define implver() ({ uint64_t __r; asm ("implver %0" : "=r" (__r)); __r; })
+#define rpcc() ({ uint64_t __r; asm volatile ("rpcc %0" : "=r" (__r)); __r; })
+#define minub8(a, b) ({ uint64_t __r; asm ("minub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; asm ("minsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; asm ("minuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; asm ("minsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; asm ("maxub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; asm ("maxsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; asm ("maxuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; asm ("maxsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b) ({ uint64_t __r; asm ("perr %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a) ({ uint64_t __r; asm ("pklb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define pkwb(a) ({ uint64_t __r; asm ("pkwb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define unpkbl(a) ({ uint64_t __r; asm ("unpkbl %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define unpkbw(a) ({ uint64_t __r; asm ("unpkbw %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#endif
 
-#elif defined(__DECC) /* Compaq "ccc" compiler */
+#elif defined(__DECC) /* Digital/Compaq "ccc" compiler */
 
 #include <c_asm.h>
 
-#define OPCODE1(name) \
-static inline uint64_t name(uint64_t l) \
-{ \
-    return asm (#name " %a0, %v0", l); \
-}
-
-#define OPCODE2(name) \
-static inline uint64_t name(uint64_t l1, uint64_t l2) \
-{ \
-    return asm (#name " %a0, %a1, %v0", l1, l2); \
-}
-
-static inline uint64_t rpcc(void)
-{
-    return asm ("rpcc %v0");
-}
-
-static inline uint64_t uldq(const void* v)
-{
-    return *(const __unaligned uint64_t *) v;
-}
-
+#define ASM_ACCEPT_MVI
+#define ldq_u(a) asm ("ldq_u %v0,0(%a0)", a)
+#define uldq(a) (*(const __unaligned uint64_t *) (a))
+#define cmpbge(a, b) asm ("cmpbge %a0,%a1,%v0", a, b)
+#define extql(a, b) asm ("extql %a0,%a1,%v0", a, b)
+#define extqh(a, b) asm ("extqh %a0,%a1,%v0", a, b)
+#define zap(a, b) asm ("zap %a0,%a1,%v0", a, b)
+#define zapnot(a, b) asm ("zapnot %a0,%a1,%v0", a, b)
+#define amask(a) asm ("amask %a0,%v0", a)
+#define implver() asm ("implver %v0")
+#define rpcc() asm ("rpcc %v0")
+#define minub8(a, b) asm ("minub8 %a0,%a1,%v0", a, b)
+#define minsb8(a, b) asm ("minsb8 %a0,%a1,%v0", a, b)
+#define minuw4(a, b) asm ("minuw4 %a0,%a1,%v0", a, b)
+#define minsw4(a, b) asm ("minsw4 %a0,%a1,%v0", a, b)
+#define maxub8(a, b) asm ("maxub8 %a0,%a1,%v0", a, b)
+#define maxsb8(a, b) asm ("maxsb8 %a0,%a1,%v0", a, b)
+#define maxuw4(a, b) asm ("maxuw4 %a0,%a1,%v0", a, b)
+#define maxsw4(a, b) asm ("maxsw4 %a0,%a1,%v0", a, b)
+#define perr(a, b) asm ("perr %a0,%a1,%v0", a, b)
+#define pklb(a) asm ("pklb %a0,%v0", a)
+#define pkwb(a) asm ("pkwb %a0,%v0", a)
+#define unpkbl(a) asm ("unpkbl %a0,%v0", a)
+#define unpkbw(a) asm ("unpkbw %a0,%v0", a)
+
+#else
+#error "Unknown compiler!"
 #endif
 
-OPCODE1(amask);
-OPCODE1(unpkbw);
-OPCODE1(pkwb);
-OPCODE2(extql);
-OPCODE2(extqh);
-OPCODE2(zap);
-OPCODE2(cmpbge);
-OPCODE2(minsw4);
-OPCODE2(minuw4);
-OPCODE2(minub8);
-OPCODE2(maxsw4);
-OPCODE2(maxuw4);
-OPCODE2(perr);
-
 #endif /* LIBAVCODEC_ALPHA_ASM_H */
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 3a54904f4..5e1aa2093 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -22,58 +22,86 @@
 void simple_idct_axp(DCTELEM *block);
 
-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes. */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
 {
     int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+
+    ASM_ACCEPT_MVI;
+
     do {
-        UINT64 shorts;
+        uint64_t shorts0, shorts1;
 
-        shorts = ldq(block);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
 
-        shorts = ldq(block + 4);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
 
-        pixels += line_size;
-        block += 8;
+        pixels += line_size;
+        block += 8;
     } while (--i);
 }
 
-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                            int line_size)
 {
-    int i = 8;
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-). */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1; /* 0x8000800080008000 */
+
+    ASM_ACCEPT_MVI;
+
     do {
-        UINT64 shorts;
-
-        shorts = ldq(block);
-        shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-        shorts += unpkbw(ldl(pixels));
-        shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-        shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-        shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-        shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-        stl(pkwb(shorts), pixels);
-
-        /* next 4 */
-        shorts = ldq(block + 4);
-        shorts &= ~WORD_VEC(0x8000);
-        shorts += unpkbw(ldl(pixels + 4));
-        shorts &= ~WORD_VEC(0x8000);
-        shorts = minuw4(shorts, WORD_VEC(0x4000));
-        shorts &= ~WORD_VEC(0x4000);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
-
-        pixels += line_size;
-        block += 8;
-    } while (--i);
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
 }
+#endif
 
 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
@@ -216,8 +244,7 @@ void dsputil_init_alpha(void)
 
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
-        fprintf(stderr, "MVI extension detected\n");
-        put_pixels_clamped = put_pixels_clamped_axp;
-        add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
     }
 }
diff --git a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
index d0af5e1d3..eb1997eee 100644
--- a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
@@ -28,6 +28,9 @@ static void dct_unquantize_h263_axp(MpegEncContext *s,
 {
     int i, level;
     UINT64 qmul, qadd;
+
+    ASM_ACCEPT_MVI;
+
     if (s->mb_intra) {
         if (n < 4)
             block[0] = block[0] * s->y_dc_scale;
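
Two constants in add_pixels_clamped_mvi above are built with zap, which clears the bytes of its first operand selected by the bits of an 8-bit mask: zap(-1, 0xaa) leaves 0x00ff00ff00ff00ff, and zap(-1, 0x33) followed by signmask ^= signmask >> 1 leaves 0x8000800080008000. The "signed subword add (MMX paddw)" sequence then performs four independent 16-bit additions inside one 64-bit register: the top bit of each word is saved and cleared so the plain 64-bit add cannot carry across word boundaries, and the saved bits are folded back in with an XOR. The portable C sketch below demonstrates the same trick without Alpha intrinsics; the helper names are invented for illustration only, and it relies on the same precondition as the patch, namely that one operand holds unpacked pixel bytes no larger than 255.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for Alpha zap: clear byte i of x wherever bit i of mask is set. */
static uint64_t zap_emul(uint64_t x, unsigned mask)
{
    for (int i = 0; i < 8; i++)
        if (mask & (1u << i))
            x &= ~((uint64_t) 0xff << (8 * i));
    return x;
}

/* Add four packed 16-bit words without carries crossing word boundaries.
   Exact whenever every word of b fits in 15 bits, which holds for
   unpacked pixel bytes (at most 255). */
static uint64_t paddw_swar(uint64_t a, uint64_t b)
{
    uint64_t signmask = zap_emul(~0ull, 0x33);
    signmask ^= signmask >> 1;          /* 0x8000800080008000 */

    uint64_t signs = a & signmask;      /* save each word's top bit */
    a &= ~signmask;                     /* clear it so the add stays in-lane */
    a += b;
    return a ^ signs;                   /* fold the saved bits back in */
}

int main(void)
{
    assert(zap_emul(~0ull, 0xaa) == 0x00ff00ff00ff00ffull); /* the clamp mask */

    uint64_t blockw = 0x7fff800012340001ull; /* four arbitrary 16-bit words */
    uint64_t pixw   = 0x00ff000000ff00ffull; /* words <= 255, like unpkbw output */
    uint64_t sum    = paddw_swar(blockw, pixw);

    for (int w = 0; w < 4; w++) {
        uint16_t bw = blockw >> (16 * w), pw = pixw >> (16 * w);
        assert((uint16_t) (sum >> (16 * w)) == (uint16_t) (bw + pw));
    }
    printf("%016llx\n", (unsigned long long) sum);
    return 0;
}

After this modular per-word add, maxsw4/minsw4 clamp each word to 0..255 and pkwb packs the four words back into four bytes, which is what the stl at the end of the loop stores.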
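The first dsputil_alpha.c hunk ends in the middle of a comment about averaging 8 unsigned bytes in parallel as (b1 + b2) >> 1; the code it describes lies outside this diff. A common way to compute such a per-byte average without the intermediate sum overflowing a byte is the identity a + b = (a & b) * 2 + (a ^ b), which gives the truncating average as (a & b) + (((a ^ b) & 0xfefefefefefefefe) >> 1). The sketch below illustrates that general technique in portable C; it is not necessarily the exact formulation the file goes on to use.

#include <assert.h>
#include <stdint.h>

/* Truncating per-byte average of eight unsigned bytes packed in a and b.
   a + b = (a & b) * 2 + (a ^ b), so (a + b) >> 1 = (a & b) + ((a ^ b) >> 1).
   Masking with 0xfe.. keeps the shift from dragging bits across byte lanes,
   and each result byte is at most 255, so the final add cannot carry either. */
static uint64_t avg8_down(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & 0xfefefefefefefefeull) >> 1);
}

int main(void)
{
    uint64_t a = 0xff00804213feab01ull;
    uint64_t b = 0x01ff7f4214ff5403ull;
    uint64_t r = avg8_down(a, b);

    for (int i = 0; i < 8; i++) {
        unsigned ab = (a >> (8 * i)) & 0xff;
        unsigned bb = (b >> (8 * i)) & 0xff;
        assert(((r >> (8 * i)) & 0xff) == (ab + bb) / 2);
    }
    return 0;
}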