Diffstat (limited to 'src/libffmpeg/libavcodec/alpha/dsputil_alpha.c')
-rw-r--r--  src/libffmpeg/libavcodec/alpha/dsputil_alpha.c | 246
1 file changed, 143 insertions, 103 deletions
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 5e1aa2093..9a3fb1eac 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -22,6 +22,8 @@
 void simple_idct_axp(DCTELEM *block);
 
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        int line_size, int h);
 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);
 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
@@ -103,145 +105,183 @@ void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
 }
 #endif
 
-/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
-   Since the immediate result could be greater than 255, we do the
-   shift first. The result is too low by one if the bytes were both
-   odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
-{
-    UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
-    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
-    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
-    return l1 + l2 + correction;
+static void clear_blocks_axp(DCTELEM *blocks) {
+    uint64_t *p = (uint64_t *) blocks;
+    int n = sizeof(DCTELEM) * 6 * 64;
+
+    do {
+        p[0] = 0;
+        p[1] = 0;
+        p[2] = 0;
+        p[3] = 0;
+        p[4] = 0;
+        p[5] = 0;
+        p[6] = 0;
+        p[7] = 0;
+        p += 8;
+        n -= 8 * 8;
+    } while (n);
 }
 
-/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
-   The '1' only has an effect when one byte is even and the other odd,
-   i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
-   Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
 {
-    UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
-    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
-    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
-    return l1 + l2 + correction;
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
 }
 
-static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+static inline uint64_t avg2(uint64_t a, uint64_t b)
 {
-    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    UINT64 r2 = ((  (l1 & BYTE_VEC(0x03))
-                  + (l2 & BYTE_VEC(0x03))
-                  + (l3 & BYTE_VEC(0x03))
-                  + (l4 & BYTE_VEC(0x03))
-                  + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
-    return r1 + r2;
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
 }
 
-static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration. */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
 {
-    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    UINT64 r2 = ((  (l1 & BYTE_VEC(0x03))
-                  + (l2 & BYTE_VEC(0x03))
-                  + (l3 & BYTE_VEC(0x03))
-                  + (l4 & BYTE_VEC(0x03))
-                  + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
     return r1 + r2;
 }
+#endif
 
-#define PIXOPNAME(suffix) put ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP(LOAD, STORE) \
+    do { \
+        STORE(LOAD(pixels), block); \
+        pixels += line_size; \
+        block += line_size; \
+    } while (--h)
 
-#define PIXOPNAME(suffix) put_no_rnd ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP_X2(LOAD, STORE) \
+    do { \
+        uint64_t pix1, pix2; \
+ \
+        pix1 = LOAD(pixels); \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+        STORE(AVG2(pix1, pix2), block); \
+        pixels += line_size; \
+        block += line_size; \
+    } while (--h)
 
-/* The following functions are untested. */
-#if 0
+#define OP_Y2(LOAD, STORE) \
+    do { \
+        uint64_t pix = LOAD(pixels); \
+        do { \
+            uint64_t next_pix; \
+ \
+            pixels += line_size; \
+            next_pix = LOAD(pixels); \
+            STORE(AVG2(pix, next_pix), block); \
+            block += line_size; \
+            pix = next_pix; \
+        } while (--h); \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE) \
+    do { \
+        uint64_t pix1 = LOAD(pixels); \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
+                       + (pix2 & BYTE_VEC(0x03)); \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
+ \
+        do { \
+            uint64_t npix1, npix2; \
+            uint64_t npix_l, npix_h; \
+            uint64_t avg; \
+ \
+            pixels += line_size; \
+            npix1 = LOAD(pixels); \
+            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+            npix_l = (npix1 & BYTE_VEC(0x03)) \
+                   + (npix2 & BYTE_VEC(0x03)); \
+            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
+                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
+            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+                + pix_h + npix_h; \
+            STORE(avg, block); \
+ \
+            block += line_size; \
+            pix_l = npix_l; \
+            pix_h = npix_h; \
+        } while (--h); \
+    } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
+static void OPNAME ## _pixels ## SUFF ## _axp \
+    (uint8_t *restrict block, const uint8_t *restrict pixels, \
+     int line_size, int h) \
+{ \
+    if ((size_t) pixels & 0x7) { \
+        OPKIND(uldq, STORE); \
+    } else { \
+        OPKIND(ldq, STORE); \
+    } \
+}
 
-#define PIXOPNAME(suffix) avg ## suffix
-#define BTYPE UINT8
+#define PIXOP(OPNAME, STORE) \
+    MAKE_OP(OPNAME, ,     OP,     STORE) \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE) \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE) \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives. */
 #define AVG2 avg2
 #define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
+PIXOP(avg, STORE);
+
+/* Not rounding primitives. */
 #undef AVG2
 #undef AVG4
+#undef AVG4_ROUNDER
 #undef STORE
-
-#define PIXOPNAME(suffix) avg_no_rnd ## suffix
-#define BTYPE UINT8
 #define AVG2 avg2_no_rnd
 #define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
 
-#define PIXOPNAME(suffix) sub ## suffix
-#define BTYPE DCTELEM
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, block) do { \
-    UINT64 xxx = l; \
-    (block)[0] -= (xxx >> 0) & 0xff; \
-    (block)[1] -= (xxx >> 8) & 0xff; \
-    (block)[2] -= (xxx >> 16) & 0xff; \
-    (block)[3] -= (xxx >> 24) & 0xff; \
-    (block)[4] -= (xxx >> 32) & 0xff; \
-    (block)[5] -= (xxx >> 40) & 0xff; \
-    (block)[6] -= (xxx >> 48) & 0xff; \
-    (block)[7] -= (xxx >> 56) & 0xff; \
-} while (0)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
 #undef STORE
-
-#endif
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
 
 void dsputil_init_alpha(void)
 {
-    put_pixels_tab[0] = put_pixels_axp;
+    put_pixels_tab[0] = put_pixels_axp_asm;
     put_pixels_tab[1] = put_pixels_x2_axp;
     put_pixels_tab[2] = put_pixels_y2_axp;
     put_pixels_tab[3] = put_pixels_xy2_axp;
 
-    put_no_rnd_pixels_tab[0] = put_pixels_axp;
+    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
     put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
 
+    avg_pixels_tab[0] = avg_pixels_axp;
+    avg_pixels_tab[1] = avg_pixels_x2_axp;
+    avg_pixels_tab[2] = avg_pixels_y2_axp;
+    avg_pixels_tab[3] = avg_pixels_xy2_axp;
+
+    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
+    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
+    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
+    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;
+
+    clear_blocks = clear_blocks_axp;
+
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
         put_pixels_clamped = put_pixels_clamped_mvi_asm;
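
The rewritten avg2() and avg2_no_rnd() rely on the carry-free byte-lane identities (x | y) - ((x ^ y) >> 1) == (x + y + 1) >> 1 and (x & y) + ((x ^ y) >> 1) == (x + y) >> 1; masking the XOR with BYTE_VEC(0xfe) before the shift keeps the bit shifted out of each byte from leaking into the lane below, and no lane can carry or borrow into its neighbour. Below is a minimal standalone sketch that checks both packed averages against their scalar definitions; the BYTE_VEC macro and the xorshift helper are local assumptions made for the test, not taken from the Alpha headers in this tree.

/* Standalone check of the packed-byte average identities used by the new
   avg2()/avg2_no_rnd().  BYTE_VEC here is a local stand-in that replicates
   a byte value into all eight lanes of a 64-bit word. */
#include <stdint.h>
#include <stdio.h>

#define BYTE_VEC(x) ((uint64_t)(x) * 0x0101010101010101ULL)

/* Rounding average, per byte lane: (x + y + 1) >> 1. */
static uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

/* Truncating average, per byte lane: (x + y) >> 1. */
static uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

/* Small deterministic generator so the test needs no libc RNG state. */
static uint64_t xorshift64(uint64_t *s)
{
    *s ^= *s << 13;
    *s ^= *s >> 7;
    *s ^= *s << 17;
    return *s;
}

int main(void)
{
    uint64_t seed = 0x123456789abcdef0ULL;

    for (int i = 0; i < 1000000; i++) {
        uint64_t a = xorshift64(&seed);
        uint64_t b = xorshift64(&seed);
        uint64_t r = avg2(a, b);
        uint64_t n = avg2_no_rnd(a, b);

        /* Compare each byte lane against the scalar definition. */
        for (int k = 0; k < 8; k++) {
            unsigned x = (a >> (8 * k)) & 0xff;
            unsigned y = (b >> (8 * k)) & 0xff;

            if (((r >> (8 * k)) & 0xff) != ((x + y + 1) >> 1) ||
                ((n >> (8 * k)) & 0xff) != ((x + y) >> 1)) {
                printf("mismatch in lane %d\n", k);
                return 1;
            }
        }
    }
    printf("all byte lanes match\n");
    return 0;
}

Compiled on its own, the check exercises random 64-bit inputs and reports a mismatch if either packed average ever differs from its per-byte reference; the same no-carry argument is what lets the OP_X2/OP_Y2/OP_XY2 macros average eight pixels per iteration without unpacking them.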