diff options
author | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2004-04-25 18:57:04 +0000 |
---|---|---|
committer | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2004-04-25 18:57:04 +0000 |
commit | a2a44876712f079610f0396fb9a682ea47e05b6e (patch) | |
tree | 1f3e328dfe6a5f9fa7c79e7a23bf6310be2827fd | |
parent | acb7dc0f256afc24e875a168da989ef25d86b7b7 (diff) | |
download | xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.gz xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.bz2 |
ffmpeg sync
CVS patchset: 6437
CVS date: 2004/04/25 18:57:04
51 files changed, 10694 insertions, 2072 deletions
@@ -12,7 +12,7 @@ updates (the word 'maintainer' is intentionally avoided here). project version mediator ----------------------------------------------------------------------- -ffmpeg build 4707 Mike Melanson +ffmpeg build 4710 Mike Melanson goom 1.9dev5 gsm610 1.0.10 Mike Melanson liba52 0.7.4 diff --git a/configure.ac b/configure.ac index 6ae201efb..10f01a955 100644 --- a/configure.ac +++ b/configure.ac @@ -1962,6 +1962,7 @@ src/libffmpeg/libavcodec/i386/Makefile src/libffmpeg/libavcodec/mlib/Makefile src/libffmpeg/libavcodec/alpha/Makefile src/libffmpeg/libavcodec/ppc/Makefile +src/libffmpeg/libavcodec/sparc/Makefile src/libffmpeg/libavcodec/libpostproc/Makefile src/libflac/Makefile src/liblpcm/Makefile diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am index 8e5d53df3..7bbcbd281 100644 --- a/src/libffmpeg/libavcodec/Makefile.am +++ b/src/libffmpeg/libavcodec/Makefile.am @@ -1,6 +1,6 @@ include $(top_srcdir)/misc/Makefile.common -SUBDIRS = armv4l i386 mlib alpha ppc libpostproc +SUBDIRS = armv4l i386 mlib alpha ppc sparc libpostproc ## some files here are #included by others... go figure. 
EXTRA_DIST = fdctref.c motion_est_template.c svq3.c wmv2.c \ @@ -80,6 +80,7 @@ libavcodec_la_LDFLAGS = \ $(top_builddir)/src/libffmpeg/libavcodec/i386/libavcodec_mmx.la \ $(top_builddir)/src/libffmpeg/libavcodec/mlib/libavcodec_mlib.la \ $(top_builddir)/src/libffmpeg/libavcodec/ppc/libavcodec_ppc.la \ + $(top_builddir)/src/libffmpeg/libavcodec/sparc/libavcodec_sparc.la \ -avoid-version -module diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h index 510bd41d2..731bcd375 100644 --- a/src/libffmpeg/libavcodec/avcodec.h +++ b/src/libffmpeg/libavcodec/avcodec.h @@ -24,7 +24,7 @@ extern "C" { #define FFMPEG_VERSION_INT 0x000408 #define FFMPEG_VERSION "0.4.8" -#define LIBAVCODEC_BUILD 4707 +#define LIBAVCODEC_BUILD 4710 #define LIBAVCODEC_VERSION_INT FFMPEG_VERSION_INT #define LIBAVCODEC_VERSION FFMPEG_VERSION @@ -449,7 +449,7 @@ typedef struct AVPanScan{ \ /**\ * Motion vector table\ - * - encoding: unused\ + * - encoding: set by user\ * - decoding: set by lavc\ */\ int16_t (*motion_val[2])[2];\ @@ -457,7 +457,7 @@ typedef struct AVPanScan{ /**\ * Macroblock type table\ * mb_type_base + mb_width + 2\ - * - encoding: unused\ + * - encoding: set by user\ * - decoding: set by lavc\ */\ uint32_t *mb_type;\ @@ -545,13 +545,20 @@ typedef struct AVPanScan{ * - decoding: set by lavc\ */\ short *dct_coeff;\ +\ + /**\ + * Motion referece frame index\ + * - encoding: set by user\ + * - decoding: set by lavc\ + */\ + int8_t *ref_index[2]; #define FF_QSCALE_TYPE_MPEG1 0 #define FF_QSCALE_TYPE_MPEG2 1 #define FF_BUFFER_TYPE_INTERNAL 1 #define FF_BUFFER_TYPE_USER 2 ///< Direct rendering buffers (image is (de)allocated by user) -#define FF_BUFFER_TYPE_SHARED 4 ///< buffer from somewher else, dont dealloc image (data/base) +#define FF_BUFFER_TYPE_SHARED 4 ///< buffer from somewher else, dont dealloc image (data/base), all other tables are not shared #define FF_BUFFER_TYPE_COPY 8 ///< just a (modified) copy of some other buffer, dont dealloc anything @@ 
-847,6 +854,7 @@ typedef struct AVCodecContext { #define FF_BUG_QPEL_CHROMA2 256 #define FF_BUG_DIRECT_BLOCKSIZE 512 #define FF_BUG_EDGE 1024 +#define FF_BUG_HPEL_CHROMA 2048 //#define FF_BUG_FAKE_SCALABILITY 16 //autodetection should work 100% /** @@ -1567,6 +1575,22 @@ typedef struct AVCodecContext { * - decoding: set by execute() */ void *thread_opaque; + + /** + * Motion estimation threshold. under which no motion estimation is + * performed, but instead the user specified motion vectors are used + * + * - encoding: set by user + * - decoding: unused + */ + int me_threshold; + + /** + * Macroblock threshold. under which the user specified macroblock types will be used + * - encoding: set by user + * - decoding: unused + */ + int mb_threshold; } AVCodecContext; @@ -1676,6 +1700,7 @@ extern AVCodec h263p_encoder; extern AVCodec flv_encoder; extern AVCodec rv10_encoder; extern AVCodec rv20_encoder; +extern AVCodec dvvideo_encoder; extern AVCodec mjpeg_encoder; extern AVCodec ljpeg_encoder; extern AVCodec mpeg4_encoder; @@ -1826,7 +1851,10 @@ ImgReSampleContext *img_resample_init(int output_width, int output_height, ImgReSampleContext *img_resample_full_init(int owidth, int oheight, int iwidth, int iheight, int topBand, int bottomBand, - int leftBand, int rightBand); + int leftBand, int rightBand, + int padtop, int padbottom, + int padleft, int padright); + void img_resample(ImgReSampleContext *s, AVPicture *output, const AVPicture *input); @@ -1901,6 +1929,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode); void avcodec_get_context_defaults(AVCodecContext *s); AVCodecContext *avcodec_alloc_context(void); +void avcodec_get_frame_defaults(AVFrame *pic); AVFrame *avcodec_alloc_frame(void); int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic); @@ -2102,8 +2131,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size); /* for static data only */ /* call av_free_static to release all staticaly allocated 
tables */ void av_free_static(void); -void *__av_mallocz_static(void** location, unsigned int size); -#define av_mallocz_static(p, s) __av_mallocz_static((void **)(p), s) +void *av_mallocz_static(unsigned int size); /* add by bero : in adx.c */ int is_adx(const unsigned char *buf,size_t bufsize); @@ -2115,6 +2143,7 @@ void img_copy(AVPicture *dst, const AVPicture *src, #include <stdarg.h> +#define AV_LOG_QUIET -1 #define AV_LOG_ERROR 0 #define AV_LOG_INFO 1 #define AV_LOG_DEBUG 2 diff --git a/src/libffmpeg/libavcodec/cabac.c b/src/libffmpeg/libavcodec/cabac.c index 27e63045b..0e3e14f56 100644 --- a/src/libffmpeg/libavcodec/cabac.c +++ b/src/libffmpeg/libavcodec/cabac.c @@ -113,7 +113,7 @@ void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4], c->mps_state[2*i+0]= 2*mps_state[i]; c->mps_state[2*i+1]= 2*mps_state[i]+1; - if(lps_state[i]){ + if( i ){ c->lps_state[2*i+0]= 2*lps_state[i]; c->lps_state[2*i+1]= 2*lps_state[i]+1; }else{ diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h index 59b128cef..de9382a13 100644 --- a/src/libffmpeg/libavcodec/common.h +++ b/src/libffmpeg/libavcodec/common.h @@ -6,6 +6,11 @@ #ifndef COMMON_H #define COMMON_H +// xine: disable DEBUG for ffmpeg (too noisy) +#ifdef DEBUG +#undef DEBUG +#endif + #if defined(WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__) # define CONFIG_WIN32 #endif @@ -131,7 +136,7 @@ static inline float floorf(float f) { /* windows */ -# ifndef __MINGW32__ +# if !defined(__MINGW32__) && !defined(__CYGWIN__) # define int64_t_C(c) (c ## i64) # define uint64_t_C(c) (c ## i64) @@ -204,40 +209,30 @@ static inline float floorf(float f) { /* debug stuff */ -# ifndef DEBUG -# ifndef NDEBUG +# if !defined(DEBUG) && !defined(NDEBUG) # define NDEBUG -# endif # endif # include <assert.h> /* dprintf macros */ -# if defined(CONFIG_WIN32) && !defined(__MINGW32__) +# if defined(CONFIG_WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__) inline void dprintf(const char* 
fmt,...) {} # else -#if __GNUC__ -#ifdef DEBUG -#define dprintf(fmt,args...) printf(fmt, ## args) -#else -#define dprintf(fmt,args...) -#endif -#else -#ifdef DEBUG -#define dprintf(...) printf(__VA_ARGS__) -#else -#define dprintf(...) -#endif -#endif +# ifdef DEBUG +# define dprintf(fmt,...) av_log(NULL, AV_LOG_DEBUG, fmt, __VA_ARGS__) +# else +# define dprintf(fmt,...) +# endif # endif /* !CONFIG_WIN32 */ # define av_abort() do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0) //rounded divison & shift -#define RSHIFT(a,b) ((a) > 0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b)) +#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b)) /* assume b>0 */ #define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b)) #define ABS(a) ((a) >= 0 ? (a) : (-(a))) @@ -291,6 +286,7 @@ struct PutBitContext; typedef void (*WriteDataFunc)(void *, uint8_t *, int); +/* buf and buf_end must be present and used by every alternative writer. 
*/ typedef struct PutBitContext { #ifdef ALT_BITSTREAM_WRITER uint8_t *buf, *buf_end; @@ -327,11 +323,6 @@ static inline int put_bits_count(PutBitContext *s) #endif } -static inline int put_bits_left(PutBitContext* s) -{ - return (s->buf_end - s->buf) * 8 - put_bits_count(s); -} - /* pad the end of the output stream with zeros */ static inline void flush_put_bits(PutBitContext *s) { @@ -354,7 +345,7 @@ void align_put_bits(PutBitContext *s); void put_string(PutBitContext * pbc, char *s, int put_zero); /* bit input */ - +/* buffer, buffer_end and size_in_bits must be present and used by every reader */ typedef struct GetBitContext { const uint8_t *buffer, *buffer_end; #ifdef ALT_BITSTREAM_READER @@ -386,7 +377,7 @@ typedef struct RL_VLC_ELEM { uint8_t run; } RL_VLC_ELEM; -#ifdef ARCH_SPARC64 +#ifdef ARCH_SPARC #define UNALIGNED_STORES_ARE_BAD #endif @@ -437,7 +428,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value) bit_buf<<=bit_left; bit_buf |= value >> (n - bit_left); #ifdef UNALIGNED_STORES_ARE_BAD - if (3 & (int) s->buf_ptr) { + if (3 & (intptr_t) s->buf_ptr) { s->buf_ptr[0] = bit_buf >> 24; s->buf_ptr[1] = bit_buf >> 16; s->buf_ptr[2] = bit_buf >> 8; @@ -924,11 +915,6 @@ static inline void init_get_bits(GetBitContext *s, #endif } -static inline int get_bits_left(GetBitContext *s) -{ - return s->size_in_bits - get_bits_count(s); -} - int check_marker(GetBitContext *s, const char *msg); void align_get_bits(GetBitContext *s); int init_vlc(VLC *vlc, int nb_bits, int nb_codes, @@ -1080,7 +1066,7 @@ static inline int get_xbits_trace(GetBitContext *s, int n, char *file, char *fun #define get_vlc(s, vlc) get_vlc_trace(s, (vlc)->table, (vlc)->bits, 3, __FILE__, __PRETTY_FUNCTION__, __LINE__) #define get_vlc2(s, tab, bits, max) get_vlc_trace(s, tab, bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__) -#define tprintf printf +#define tprintf(...) av_log(NULL, AV_LOG_DEBUG, __VA_ARGS__) #else //TRACE #define tprintf(...) 
{} @@ -1182,6 +1168,12 @@ static inline int clip(int a, int amin, int amax) return a; } +static inline int clip_uint8(int a) +{ + if (a&(~255)) return (-a)>>31; + else return a; +} + /* math */ extern const uint8_t ff_sqrt_tab[128]; @@ -1290,6 +1282,9 @@ tend= rdtsc();\ #define malloc please_use_av_malloc #define free please_use_av_free #define realloc please_use_av_realloc +#define time time_is_forbidden_due_to_security_issues +#define rand rand_is_forbidden_due_to_state_trashing +#define srand srand_is_forbidden_due_to_state_trashing #if !(defined(LIBAVFORMAT_BUILD) || defined(_FRAMEHOOK_H)) #define printf please_use_av_log #define fprintf please_use_av_log diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c index 7f26bd98a..fce0b8163 100644 --- a/src/libffmpeg/libavcodec/dsputil.c +++ b/src/libffmpeg/libavcodec/dsputil.c @@ -3286,6 +3286,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) #ifdef HAVE_MLIB dsputil_init_mlib(c, avctx); #endif +#ifdef ARCH_SPARC + dsputil_init_vis(c,avctx); +#endif #ifdef ARCH_ALPHA dsputil_init_alpha(c, avctx); #endif diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h index 35e965db0..730e1489d 100644 --- a/src/libffmpeg/libavcodec/dsputil.h +++ b/src/libffmpeg/libavcodec/dsputil.h @@ -76,6 +76,12 @@ void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, int coeff_count, uint8_t *dest, int stride); +void vp3_dsp_init_sse2(void); +void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, uint8_t *dest, int stride); +void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, uint8_t *dest, int stride); + /* minimum alignment rules ;) if u notice errors in the align stuff, need more alignment for some asm code for some cpu @@ -378,6 +384,8 @@ static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b) one or more 
MultiMedia extension */ int mm_support(void); +#define __align16 __attribute__ ((aligned (16))) + #if defined(HAVE_MMX) #undef emms_c @@ -413,7 +421,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); #elif defined(ARCH_ARMV4L) /* This is to use 4 bytes read to the IDCT pointers for some 'zero' - line ptimizations */ + line optimizations */ #define __align8 __attribute__ ((aligned (4))) void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx); @@ -425,6 +433,12 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx); void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx); +#elif defined(ARCH_SPARC) + +/* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */ +#define __align8 __attribute__ ((aligned (8))) +void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx); + #elif defined(ARCH_ALPHA) #define __align8 __attribute__ ((aligned (8))) diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c index 08be11d45..5f1eaaa3b 100644 --- a/src/libffmpeg/libavcodec/dv.c +++ b/src/libffmpeg/libavcodec/dv.c @@ -1,6 +1,7 @@ /* * DV decoder * Copyright (c) 2002 Fabrice Bellard. + * Copyright (c) 2004 Roman Shaposhnik. * * DV encoder * Copyright (c) 2003 Roman Shaposhnik. 
@@ -33,20 +34,18 @@ #include "simple_idct.h" #include "dvdata.h" -typedef struct DVVideoDecodeContext { +typedef struct DVVideoContext { const DVprofile* sys; AVFrame picture; + uint8_t *buf; uint8_t dv_zigzag[2][64]; - uint8_t dv_idct_shift[2][22][64]; + uint8_t dv_idct_shift[2][2][22][64]; void (*get_pixels)(DCTELEM *block, const uint8_t *pixels, int line_size); void (*fdct[2])(DCTELEM *block); void (*idct_put[2])(uint8_t *dest, int line_size, DCTELEM *block); - - GetBitContext gb; - DCTELEM block[5*6][64] __align8; -} DVVideoDecodeContext; +} DVVideoContext; #define TEX_VLC_BITS 9 @@ -58,15 +57,18 @@ typedef struct DVVideoDecodeContext { #define DV_VLC_MAP_LEV_SIZE 512 #endif +/* MultiThreading */ +static uint8_t** dv_anchor; + /* XXX: also include quantization */ -static RL_VLC_ELEM *dv_rl_vlc[1]; +static RL_VLC_ELEM *dv_rl_vlc; /* VLC encoding lookup table */ static struct dv_vlc_pair { uint32_t vlc; uint8_t size; } (*dv_vlc_map)[DV_VLC_MAP_LEV_SIZE] = NULL; -static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm) +static void dv_build_unquantize_tables(DVVideoContext *s, uint8_t* perm) { int i, q, j; @@ -76,29 +78,34 @@ static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm) for(i = 1; i < 64; i++) { /* 88 table */ j = perm[i]; - s->dv_idct_shift[0][q][j] = + s->dv_idct_shift[0][0][q][j] = dv_quant_shifts[q][dv_88_areas[i]] + 1; + s->dv_idct_shift[1][0][q][j] = s->dv_idct_shift[0][0][q][j] + 1; } /* 248DCT */ for(i = 1; i < 64; i++) { /* 248 table */ - s->dv_idct_shift[1][q][i] = + s->dv_idct_shift[0][1][q][i] = dv_quant_shifts[q][dv_248_areas[i]] + 1; + s->dv_idct_shift[1][1][q][i] = s->dv_idct_shift[0][1][q][i] + 1; } } } static int dvvideo_init(AVCodecContext *avctx) { - DVVideoDecodeContext *s = avctx->priv_data; + DVVideoContext *s = avctx->priv_data; DSPContext dsp; static int done=0; int i, j; if (!done) { - int i; VLC dv_vlc; + uint16_t new_dv_vlc_bits[NB_DV_VLC*2]; + uint8_t new_dv_vlc_len[NB_DV_VLC*2]; + 
uint8_t new_dv_vlc_run[NB_DV_VLC*2]; + int16_t new_dv_vlc_level[NB_DV_VLC*2]; done = 1; @@ -106,13 +113,42 @@ static int dvvideo_init(AVCodecContext *avctx) if (!dv_vlc_map) return -ENOMEM; + /* dv_anchor lets each thread know its Id */ + dv_anchor = av_malloc(12*27*sizeof(void*)); + if (!dv_anchor) { + av_free(dv_vlc_map); + return -ENOMEM; + } + for (i=0; i<12*27; i++) + dv_anchor[i] = (void*)(size_t)i; + + /* it's faster to include sign bit in a generic VLC parsing scheme */ + for (i=0, j=0; i<NB_DV_VLC; i++, j++) { + new_dv_vlc_bits[j] = dv_vlc_bits[i]; + new_dv_vlc_len[j] = dv_vlc_len[i]; + new_dv_vlc_run[j] = dv_vlc_run[i]; + new_dv_vlc_level[j] = dv_vlc_level[i]; + + if (dv_vlc_level[i]) { + new_dv_vlc_bits[j] <<= 1; + new_dv_vlc_len[j]++; + + j++; + new_dv_vlc_bits[j] = (dv_vlc_bits[i] << 1) | 1; + new_dv_vlc_len[j] = dv_vlc_len[i] + 1; + new_dv_vlc_run[j] = dv_vlc_run[i]; + new_dv_vlc_level[j] = -dv_vlc_level[i]; + } + } + /* NOTE: as a trick, we use the fact the no codes are unused to accelerate the parsing of partial codes */ - init_vlc(&dv_vlc, TEX_VLC_BITS, NB_DV_VLC, - dv_vlc_len, 1, 1, dv_vlc_bits, 2, 2); + init_vlc(&dv_vlc, TEX_VLC_BITS, j, + new_dv_vlc_len, 1, 1, new_dv_vlc_bits, 2, 2); - dv_rl_vlc[0] = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM)); - if (!dv_rl_vlc[0]) { + dv_rl_vlc = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM)); + if (!dv_rl_vlc) { + av_free(dv_anchor); av_free(dv_vlc_map); return -ENOMEM; } @@ -124,18 +160,15 @@ static int dvvideo_init(AVCodecContext *avctx) if(len<0){ //more bits needed run= 0; level= code; - } else if (code == (NB_DV_VLC - 1)) { - /* EOB */ - run = 0; - level = 256; } else { - run= dv_vlc_run[code] + 1; - level= dv_vlc_level[code]; + run= new_dv_vlc_run[code] + 1; + level= new_dv_vlc_level[code]; } - dv_rl_vlc[0][i].len = len; - dv_rl_vlc[0][i].level = level; - dv_rl_vlc[0][i].run = run; + dv_rl_vlc[i].len = len; + dv_rl_vlc[i].level = level; + dv_rl_vlc[i].run = run; } + free_vlc(&dv_vlc); for (i = 
0; i < NB_DV_VLC - 1; i++) { if (dv_vlc_run[i] >= DV_VLC_MAP_RUN_SIZE || dv_vlc_level[i] >= DV_VLC_MAP_LEV_SIZE) @@ -202,13 +235,19 @@ static int dvvideo_init(AVCodecContext *avctx) return 0; } +static int dvvideo_end(AVCodecContext *avctx) +{ + avcodec_default_free_buffers(avctx); + return 0; +} + // #define VLC_DEBUG +// #define printf(...) av_log(NULL, AV_LOG_ERROR, __VA_ARGS__) typedef struct BlockInfo { const uint8_t *shift_table; const uint8_t *scan_table; uint8_t pos; /* position in block */ - uint8_t eob_reached; /* true if EOB has been reached */ uint8_t dct_mode; uint8_t partial_bit_count; uint16_t partial_bit_buffer; @@ -228,141 +267,88 @@ static const int mb_area_start[5] = { 1, 6, 21, 43, 64 }; #warning only works with ALT_BITSTREAM_READER #endif +static inline int get_bits_left(GetBitContext *s) +{ + return s->size_in_bits - get_bits_count(s); +} + +static inline int get_bits_size(GetBitContext *s) +{ + return s->size_in_bits; +} + +static inline int put_bits_left(PutBitContext* s) +{ + return (s->buf_end - s->buf) * 8 - put_bits_count(s); +} + /* decode ac coefs */ -static void dv_decode_ac(DVVideoDecodeContext *s, - BlockInfo *mb, DCTELEM *block, int last_index) +static void dv_decode_ac(GetBitContext *gb, BlockInfo *mb, DCTELEM *block) { - int last_re_index; - int shift_offset = mb->shift_offset; + int last_index = get_bits_size(gb); const uint8_t *scan_table = mb->scan_table; const uint8_t *shift_table = mb->shift_table; int pos = mb->pos; - int level, pos1, sign, run; - int partial_bit_count; -#ifndef ALT_BITSTREAM_READER //FIXME - int re_index=0; - int re1_index=0; -#endif - OPEN_READER(re, &s->gb); + int partial_bit_count = mb->partial_bit_count; + int level, pos1, run, vlc_len, index; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); -#ifdef VLC_DEBUG - printf("start\n"); -#endif - /* if we must parse a partial vlc, we do it here */ - partial_bit_count = mb->partial_bit_count; if (partial_bit_count > 0) { - uint8_t buf[4]; - uint32_t v; - int 
l, l1; - GetBitContext gb1; - - /* build the dummy bit buffer */ - l = 16 - partial_bit_count; - UPDATE_CACHE(re, &s->gb); -#ifdef VLC_DEBUG - printf("show=%04x\n", SHOW_UBITS(re, &s->gb, 16)); -#endif - v = (mb->partial_bit_buffer << l) | SHOW_UBITS(re, &s->gb, l); - buf[0] = v >> 8; - buf[1] = v; -#ifdef VLC_DEBUG - printf("v=%04x cnt=%d %04x\n", - v, partial_bit_count, (mb->partial_bit_buffer << l)); -#endif - /* try to read the codeword */ - init_get_bits(&gb1, buf, 4*8); - { - OPEN_READER(re1, &gb1); - UPDATE_CACHE(re1, &gb1); - GET_RL_VLC(level, run, re1, &gb1, dv_rl_vlc[0], - TEX_VLC_BITS, 2); - l = re1_index; - CLOSE_READER(re1, &gb1); - } -#ifdef VLC_DEBUG - printf("****run=%d level=%d size=%d\n", run, level, l); -#endif - /* compute codeword length */ - l1 = (level != 256 && level != 0); - /* if too long, we cannot parse */ - l -= partial_bit_count; - if ((re_index + l + l1) > last_index) - return; - /* skip read bits */ - last_re_index = 0; /* avoid warning */ - re_index += l; - /* by definition, if we can read the vlc, all partial bits - will be read (otherwise we could have read the vlc before) */ - mb->partial_bit_count = 0; - UPDATE_CACHE(re, &s->gb); - goto handle_vlc; + re_cache = ((unsigned)re_cache >> partial_bit_count) | + (mb->partial_bit_buffer << (sizeof(re_cache)*8 - partial_bit_count)); + re_index -= partial_bit_count; + mb->partial_bit_count = 0; } /* get the AC coefficients until last_index is reached */ for(;;) { - UPDATE_CACHE(re, &s->gb); #ifdef VLC_DEBUG - printf("%2d: bits=%04x index=%d\n", - pos, SHOW_UBITS(re, &s->gb, 16), re_index); + printf("%2d: bits=%04x index=%d\n", pos, SHOW_UBITS(re, gb, 16), re_index); #endif - last_re_index = re_index; - GET_RL_VLC(level, run, re, &s->gb, dv_rl_vlc[0], - TEX_VLC_BITS, 2); - handle_vlc: + /* our own optimized GET_RL_VLC */ + index = NEG_USR32(re_cache, TEX_VLC_BITS); + vlc_len = dv_rl_vlc[index].len; + if (vlc_len < 0) { + index = NEG_USR32((unsigned)re_cache << TEX_VLC_BITS, -vlc_len) + 
dv_rl_vlc[index].level; + vlc_len = TEX_VLC_BITS - vlc_len; + } + level = dv_rl_vlc[index].level; + run = dv_rl_vlc[index].run; + + /* gotta check if we're still within gb boundaries */ + if (re_index + vlc_len > last_index) { + /* should be < 16 bits otherwise a codeword could have been parsed */ + mb->partial_bit_count = last_index - re_index; + mb->partial_bit_buffer = NEG_USR32(re_cache, mb->partial_bit_count); + re_index = last_index; + break; + } + re_index += vlc_len; + #ifdef VLC_DEBUG - printf("run=%d level=%d\n", run, level); + printf("run=%d level=%d\n", run, level); #endif - if (level == 256) { - if (re_index > last_index) { - cannot_read: - /* put position before read code */ - re_index = last_re_index; - mb->eob_reached = 0; - break; - } - /* EOB */ - mb->eob_reached = 1; - break; - } else if (level != 0) { - if ((re_index + 1) > last_index) - goto cannot_read; - sign = SHOW_SBITS(re, &s->gb, 1); - level = (level ^ sign) - sign; - LAST_SKIP_BITS(re, &s->gb, 1); - pos += run; - /* error */ - if (pos >= 64) { - goto read_error; - } + pos += run; + if (pos >= 64) + break; + + if (level) { pos1 = scan_table[pos]; - level = level << (shift_table[pos1] + shift_offset); - block[pos1] = level; - // printf("run=%d level=%d shift=%d\n", run, level, shift_table[pos1]); - } else { - if (re_index > last_index) - goto cannot_read; - /* level is zero: means run without coding. 
No - sign is coded */ - pos += run; - /* error */ - if (pos >= 64) { - read_error: -#if defined(VLC_DEBUG) || 1 - av_log(NULL, AV_LOG_ERROR, "error pos=%d\n", pos); -#endif - /* for errors, we consider the eob is reached */ - mb->eob_reached = 1; - break; - } - } + block[pos1] = level << shift_table[pos1]; + } + + UPDATE_CACHE(re, gb); } - CLOSE_READER(re, &s->gb); + CLOSE_READER(re, gb); mb->pos = pos; } -static inline void bit_copy(PutBitContext *pb, GetBitContext *gb, int bits_left) +static inline void bit_copy(PutBitContext *pb, GetBitContext *gb) { + int bits_left = get_bits_left(gb); while (bits_left >= 16) { put_bits(pb, 16, get_bits(gb, 16)); bits_left -= 16; @@ -373,60 +359,56 @@ static inline void bit_copy(PutBitContext *pb, GetBitContext *gb, int bits_left) } /* mb_x and mb_y are in units of 8 pixels */ -static inline void dv_decode_video_segment(DVVideoDecodeContext *s, +static inline void dv_decode_video_segment(DVVideoContext *s, uint8_t *buf_ptr1, const uint16_t *mb_pos_ptr) { int quant, dc, dct_mode, class1, j; int mb_index, mb_x, mb_y, v, last_index; DCTELEM *block, *block1; - int c_offset, bits_left; + int c_offset; uint8_t *y_ptr; - BlockInfo mb_data[5 * 6], *mb, *mb1; void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block); uint8_t *buf_ptr; PutBitContext pb, vs_pb; + GetBitContext gb; + BlockInfo mb_data[5 * 6], *mb, *mb1; + DCTELEM sblock[5*6][64] __align8; uint8_t mb_bit_buffer[80 + 4]; /* allow some slack */ - int mb_bit_count; uint8_t vs_bit_buffer[5 * 80 + 4]; /* allow some slack */ - int vs_bit_count; - - memset(s->block, 0, sizeof(s->block)); + + memset(sblock, 0, sizeof(sblock)); /* pass 1 : read DC and AC coefficients in blocks */ buf_ptr = buf_ptr1; - block1 = &s->block[0][0]; + block1 = &sblock[0][0]; mb1 = mb_data; init_put_bits(&vs_pb, vs_bit_buffer, 5 * 80); - vs_bit_count = 0; - for(mb_index = 0; mb_index < 5; mb_index++) { + for(mb_index = 0; mb_index < 5; mb_index++, mb1 += 6, block1 += 6 * 64) { /* skip header */ quant 
= buf_ptr[3] & 0x0f; buf_ptr += 4; init_put_bits(&pb, mb_bit_buffer, 80); - mb_bit_count = 0; mb = mb1; block = block1; for(j = 0;j < 6; j++) { - /* NOTE: size is not important here */ - init_get_bits(&s->gb, buf_ptr, 14*8); + last_index = block_sizes[j]; + init_get_bits(&gb, buf_ptr, last_index); /* get the dc */ - dc = get_bits(&s->gb, 9); + dc = get_bits(&gb, 9); dc = (dc << (32 - 9)) >> (32 - 9); - dct_mode = get_bits1(&s->gb); + dct_mode = get_bits1(&gb); mb->dct_mode = dct_mode; mb->scan_table = s->dv_zigzag[dct_mode]; - class1 = get_bits(&s->gb, 2); - mb->shift_offset = (class1 == 3); - mb->shift_table = s->dv_idct_shift[dct_mode] + class1 = get_bits(&gb, 2); + mb->shift_table = s->dv_idct_shift[class1 == 3][dct_mode] [quant + dv_quant_offset[class1]]; dc = dc << 2; /* convert to unsigned because 128 is not added in the standard IDCT */ dc += 1024; block[0] = dc; - last_index = block_sizes[j]; buf_ptr += last_index >> 3; mb->pos = 0; mb->partial_bit_count = 0; @@ -434,88 +416,64 @@ static inline void dv_decode_video_segment(DVVideoDecodeContext *s, #ifdef VLC_DEBUG printf("MB block: %d, %d ", mb_index, j); #endif - dv_decode_ac(s, mb, block, last_index); + dv_decode_ac(&gb, mb, block); /* write the remaining bits in a new buffer only if the block is finished */ - bits_left = last_index - get_bits_count(&s->gb); - if (mb->eob_reached) { - mb->partial_bit_count = 0; - mb_bit_count += bits_left; - bit_copy(&pb, &s->gb, bits_left); - } else { - /* should be < 16 bits otherwise a codeword could have - been parsed */ - mb->partial_bit_count = bits_left; - mb->partial_bit_buffer = get_bits(&s->gb, bits_left); - } + if (mb->pos >= 64) + bit_copy(&pb, &gb); + block += 64; mb++; } - flush_put_bits(&pb); - /* pass 2 : we can do it just after */ #ifdef VLC_DEBUG - printf("***pass 2 size=%d MB#=%d\n", mb_bit_count, mb_index); + printf("***pass 2 size=%d MB#=%d\n", put_bits_count(&pb), mb_index); #endif block = block1; mb = mb1; - init_get_bits(&s->gb, mb_bit_buffer, 
80*8); - for(j = 0;j < 6; j++) { - if (!mb->eob_reached && get_bits_count(&s->gb) < mb_bit_count) { - dv_decode_ac(s, mb, block, mb_bit_count); + init_get_bits(&gb, mb_bit_buffer, put_bits_count(&pb)); + flush_put_bits(&pb); + for(j = 0;j < 6; j++, block += 64, mb++) { + if (mb->pos < 64 && get_bits_left(&gb) > 0) { + dv_decode_ac(&gb, mb, block); /* if still not finished, no need to parse other blocks */ - if (!mb->eob_reached) { - /* we could not parse the current AC coefficient, - so we add the remaining bytes */ - bits_left = mb_bit_count - get_bits_count(&s->gb); - if (bits_left > 0) { - mb->partial_bit_count += bits_left; - mb->partial_bit_buffer = - (mb->partial_bit_buffer << bits_left) | - get_bits(&s->gb, bits_left); - } - goto next_mb; - } + if (mb->pos < 64) + break; } - block += 64; - mb++; } /* all blocks are finished, so the extra bytes can be used at the video segment level */ - bits_left = mb_bit_count - get_bits_count(&s->gb); - vs_bit_count += bits_left; - bit_copy(&vs_pb, &s->gb, bits_left); - next_mb: - mb1 += 6; - block1 += 6 * 64; + if (j >= 6) + bit_copy(&vs_pb, &gb); } /* we need a pass other the whole video segment */ - flush_put_bits(&vs_pb); - #ifdef VLC_DEBUG - printf("***pass 3 size=%d\n", vs_bit_count); + printf("***pass 3 size=%d\n", put_bits_count(&vs_pb)); #endif - block = &s->block[0][0]; + block = &sblock[0][0]; mb = mb_data; - init_get_bits(&s->gb, vs_bit_buffer, 5 * 80*8); + init_get_bits(&gb, vs_bit_buffer, put_bits_count(&vs_pb)); + flush_put_bits(&vs_pb); for(mb_index = 0; mb_index < 5; mb_index++) { for(j = 0;j < 6; j++) { - if (!mb->eob_reached) { + if (mb->pos < 64) { #ifdef VLC_DEBUG printf("start %d:%d\n", mb_index, j); #endif - dv_decode_ac(s, mb, block, vs_bit_count); + dv_decode_ac(&gb, mb, block); } + if (mb->pos >= 64 && mb->pos < 127) + av_log(NULL, AV_LOG_ERROR, "AC EOB marker is absent pos=%d\n", mb->pos); block += 64; mb++; } } /* compute idct and place blocks */ - block = &s->block[0][0]; + block = 
&sblock[0][0]; mb = mb_data; for(mb_index = 0; mb_index < 5; mb_index++) { v = *mb_pos_ptr++; @@ -790,7 +748,7 @@ static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos) * horrible and the weighting is missing. But it's missing from the * decoding step also -- so at least we're on the same page with decoder ;-) */ -static inline void dv_encode_video_segment(DVVideoDecodeContext *s, +static inline void dv_encode_video_segment(DVVideoContext *s, uint8_t *dif, const uint16_t *mb_pos_ptr) { @@ -801,6 +759,7 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s, uint8_t* ptr; int do_edge_wrap; DCTELEM block[64] __align8; + DCTELEM sblock[5*6][64] __align8; EncBlockInfo enc_blks[5*6]; PutBitContext pbs[5*6]; PutBitContext* pb; @@ -854,7 +813,7 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s, } enc_blk->dct_mode = dv_guess_dct_mode(block); - enc_blk->mb = &s->block[mb_index*6+j][0]; + enc_blk->mb = &sblock[mb_index*6+j][0]; enc_blk->area_q[0] = enc_blk->area_q[1] = enc_blk->area_q[2] = enc_blk->area_q[3] = 0; enc_blk->partial_bit_count = 0; enc_blk->partial_bit_buffer = 0; @@ -906,15 +865,31 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s, flush_put_bits(&pbs[j]); } +static int dv_decode_mt(AVCodecContext *avctx, void* sl) +{ + DVVideoContext *s = avctx->priv_data; + int slice = (size_t)sl; + dv_decode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80], + &s->sys->video_place[slice*5]); + return 0; +} + +static int dv_encode_mt(AVCodecContext *avctx, void* sl) +{ + DVVideoContext *s = avctx->priv_data; + int slice = (size_t)sl; + dv_encode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80], + &s->sys->video_place[slice*5]); + return 0; +} + /* NOTE: exactly one frame must be given (120000 bytes for NTSC, 144000 bytes for PAL) */ static int dvvideo_decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size) { - DVVideoDecodeContext *s = 
avctx->priv_data; - int ds, vs; - const uint16_t *mb_pos_ptr; + DVVideoContext *s = avctx->priv_data; *data_size=0; /* special case for last picture */ @@ -925,7 +900,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, if (!s->sys || buf_size < s->sys->frame_size) return -1; /* NOTE: we only accept several full frames */ - if(s->picture.data[0]) avctx->release_buffer(avctx, &s->picture); @@ -940,24 +914,10 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, s->picture.interlaced_frame = 1; s->picture.top_field_first = 0; - /* for each DIF segment */ - mb_pos_ptr = s->sys->video_place; - for (ds = 0; ds < s->sys->difseg_size; ds++) { - buf += 6 * 80; /* skip DIF segment header */ - - for(vs = 0; vs < 27; vs++) { - if ((vs % 3) == 0) - buf += 80; /* skip audio block */ - -#ifdef VLC_DEBUG - printf("********************* %d, %d **********************\n", ds, vs); -#endif - dv_decode_video_segment(s, buf, mb_pos_ptr); - buf += 5 * 80; - mb_pos_ptr += 5; - } - } - + s->buf = buf; + avctx->execute(avctx, dv_decode_mt, (void**)&dv_anchor[0], NULL, + s->sys->difseg_size * 27); + emms_c(); /* return image */ @@ -970,9 +930,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size, void *data) { - DVVideoDecodeContext *s = c->priv_data; - const uint16_t *mb_pos_ptr; - int ds, vs; + DVVideoContext *s = c->priv_data; s->sys = dv_codec_profile(c); if (!s->sys) @@ -981,41 +939,34 @@ static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size, c->pix_fmt = s->sys->pix_fmt; s->picture = *((AVFrame *)data); - /* for each DIF segment */ - mb_pos_ptr = s->sys->video_place; - for (ds = 0; ds < s->sys->difseg_size; ds++) { - buf += 6 * 80; /* skip DIF segment header */ - - for(vs = 0; vs < 27; vs++) { - if ((vs % 3) == 0) - buf += 80; /* skip audio block */ - -#ifdef VLC_DEBUG - printf("********************* %d, %d **********************\n", ds, vs); -#endif - 
dv_encode_video_segment(s, buf, mb_pos_ptr); - buf += 5 * 80; - mb_pos_ptr += 5; - } - } + s->buf = buf; + c->execute(c, dv_encode_mt, (void**)&dv_anchor[0], NULL, + s->sys->difseg_size * 27); emms_c(); return s->sys->frame_size; } -static int dvvideo_end(AVCodecContext *avctx) -{ - avcodec_default_free_buffers(avctx); - return 0; -} +AVCodec dvvideo_encoder = { + "dvvideo", + CODEC_TYPE_VIDEO, + CODEC_ID_DVVIDEO, + sizeof(DVVideoContext), + dvvideo_init, + dvvideo_encode_frame, + dvvideo_end, + NULL, + CODEC_CAP_DR1, + NULL +}; AVCodec dvvideo_decoder = { "dvvideo", CODEC_TYPE_VIDEO, CODEC_ID_DVVIDEO, - sizeof(DVVideoDecodeContext), + sizeof(DVVideoContext), dvvideo_init, - dvvideo_encode_frame, + NULL, dvvideo_end, dvvideo_decode_frame, CODEC_CAP_DR1, diff --git a/src/libffmpeg/libavcodec/dvdata.h b/src/libffmpeg/libavcodec/dvdata.h index e6e0986ba..e60d99448 100644 --- a/src/libffmpeg/libavcodec/dvdata.h +++ b/src/libffmpeg/libavcodec/dvdata.h @@ -218,7 +218,7 @@ static const uint8_t dv_vlc_run[409] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, +127, }; static const uint8_t dv_vlc_level[409] = { diff --git a/src/libffmpeg/libavcodec/error_resilience.c b/src/libffmpeg/libavcodec/error_resilience.c index c6b10a79c..b7aeebddf 100644 --- a/src/libffmpeg/libavcodec/error_resilience.c +++ b/src/libffmpeg/libavcodec/error_resilience.c @@ -45,7 +45,7 @@ static void put_dc(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t { int dc, dcu, dcv, y, i; for(i=0; i<4; i++){ - dc= s->dc_val[0][mb_x*2+1 + (i&1) + (mb_y*2+1 + (i>>1))*(s->mb_width*2+2)]; + dc= s->dc_val[0][mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride]; if(dc<0) dc=0; else if(dc>2040) dc=2040; for(y=0; y<8; y++){ @@ -55,8 +55,8 @@ static void put_dc(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t } } } - dcu = s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)]; - dcv = s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)]; + dcu = s->dc_val[1][mb_x + 
mb_y*s->mb_stride]; + dcv = s->dc_val[2][mb_x + mb_y*s->mb_stride]; if (dcu<0 ) dcu=0; else if(dcu>2040) dcu=2040; if (dcv<0 ) dcv=0; @@ -209,8 +209,8 @@ static void h_block_filter(MpegEncContext *s, uint8_t *dst, int w, int h, int st int left_damage = left_status&(DC_ERROR|AC_ERROR|MV_ERROR); int right_damage= right_status&(DC_ERROR|AC_ERROR|MV_ERROR); int offset= b_x*8 + b_y*stride*8; - int16_t *left_mv= s->current_picture.motion_val[0][s->block_wrap[0]*((b_y<<(1-is_luma)) + 1) + ( b_x <<(1-is_luma))]; - int16_t *right_mv= s->current_picture.motion_val[0][s->block_wrap[0]*((b_y<<(1-is_luma)) + 1) + ((b_x+1)<<(1-is_luma))]; + int16_t *left_mv= s->current_picture.motion_val[0][s->b8_stride*(b_y<<(1-is_luma)) + ( b_x <<(1-is_luma))]; + int16_t *right_mv= s->current_picture.motion_val[0][s->b8_stride*(b_y<<(1-is_luma)) + ((b_x+1)<<(1-is_luma))]; if(!(left_damage||right_damage)) continue; // both undamaged @@ -269,8 +269,8 @@ static void v_block_filter(MpegEncContext *s, uint8_t *dst, int w, int h, int st int top_damage = top_status&(DC_ERROR|AC_ERROR|MV_ERROR); int bottom_damage= bottom_status&(DC_ERROR|AC_ERROR|MV_ERROR); int offset= b_x*8 + b_y*stride*8; - int16_t *top_mv= s->current_picture.motion_val[0][s->block_wrap[0]*(( b_y <<(1-is_luma)) + 1) + (b_x<<(1-is_luma))]; - int16_t *bottom_mv= s->current_picture.motion_val[0][s->block_wrap[0]*(((b_y+1)<<(1-is_luma)) + 1) + (b_x<<(1-is_luma))]; + int16_t *top_mv= s->current_picture.motion_val[0][s->b8_stride*( b_y <<(1-is_luma)) + (b_x<<(1-is_luma))]; + int16_t *bottom_mv= s->current_picture.motion_val[0][s->b8_stride*((b_y+1)<<(1-is_luma)) + (b_x<<(1-is_luma))]; if(!(top_damage||bottom_damage)) continue; // both undamaged @@ -378,8 +378,8 @@ int score_sum=0; int j; int best_score=256*256*256*64; int best_pred=0; - const int mot_stride= mb_width*2+2; - const int mot_index= mb_x*2 + 1 + (mb_y*2+1)*mot_stride; + const int mot_stride= s->b8_stride; + const int mot_index= mb_x*2 + mb_y*2*mot_stride; int prev_x= 
s->current_picture.motion_val[0][mot_index][0]; int prev_y= s->current_picture.motion_val[0][mot_index][1]; @@ -672,14 +672,15 @@ void ff_er_frame_end(MpegEncContext *s){ av_log(s->avctx, AV_LOG_INFO, "concealing errors\n"); if(s->current_picture.motion_val[0] == NULL){ - int size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2); + int size = s->b8_stride * 2 * s->mb_height; Picture *pic= s->current_picture_ptr; av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n"); for(i=0; i<2; i++){ - pic->motion_val_base[i]= av_mallocz((size+1) * 2 * sizeof(uint16_t)); //FIXME size - pic->motion_val[i]= pic->motion_val_base[i]+1; + pic->ref_index[i]= av_mallocz(size * sizeof(uint8_t)); + pic->motion_val_base[i]= av_mallocz((size+2) * 2 * sizeof(uint16_t)); + pic->motion_val[i]= pic->motion_val_base[i]+2; } pic->motion_subsample_log2= 3; s->current_picture= *s->current_picture_ptr; @@ -845,17 +846,17 @@ void ff_er_frame_end(MpegEncContext *s){ s->mb_intra=0; s->mb_skiped=0; if(IS_8X8(mb_type)){ - int mb_index= mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0]; + int mb_index= mb_x*2 + mb_y*2*s->b8_stride; int j; s->mv_type = MV_TYPE_8X8; for(j=0; j<4; j++){ - s->mv[0][j][0] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->block_wrap[0] ][0]; - s->mv[0][j][1] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->block_wrap[0] ][1]; + s->mv[0][j][0] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->b8_stride ][0]; + s->mv[0][j][1] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->b8_stride ][1]; } }else{ s->mv_type = MV_TYPE_16X16; - s->mv[0][0][0] = s->current_picture.motion_val[0][ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][0]; - s->mv[0][0][1] = s->current_picture.motion_val[0][ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][1]; + s->mv[0][0][0] = s->current_picture.motion_val[0][ mb_x*2 + mb_y*2*s->b8_stride ][0]; + s->mv[0][0][1] = s->current_picture.motion_val[0][ mb_x*2 + mb_y*2*s->b8_stride ][1]; } 
s->dsp.clear_blocks(s->block[0]); @@ -870,7 +871,7 @@ void ff_er_frame_end(MpegEncContext *s){ if(s->pict_type==B_TYPE){ for(mb_y=0; mb_y<s->mb_height; mb_y++){ for(mb_x=0; mb_x<s->mb_width; mb_x++){ - int xy= mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0]; + int xy= mb_x*2 + mb_y*2*s->b8_stride; const int mb_xy= mb_x + mb_y * s->mb_stride; const int mb_type= s->current_picture.mb_type[mb_xy]; error= s->error_status_table[mb_xy]; @@ -930,7 +931,7 @@ void ff_er_frame_end(MpegEncContext *s){ dest_cb= s->current_picture.data[1] + mb_x*8 + mb_y*8 *s->uvlinesize; dest_cr= s->current_picture.data[2] + mb_x*8 + mb_y*8 *s->uvlinesize; - dc_ptr= &s->dc_val[0][mb_x*2+1 + (mb_y*2+1)*(s->mb_width*2+2)]; + dc_ptr= &s->dc_val[0][mb_x*2 + mb_y*2*s->b8_stride]; for(n=0; n<4; n++){ dc=0; for(y=0; y<8; y++){ @@ -939,7 +940,7 @@ void ff_er_frame_end(MpegEncContext *s){ dc+= dest_y[x + (n&1)*8 + (y + (n>>1)*8)*s->linesize]; } } - dc_ptr[(n&1) + (n>>1)*(s->mb_width*2+2)]= (dc+4)>>3; + dc_ptr[(n&1) + (n>>1)*s->b8_stride]= (dc+4)>>3; } dcu=dcv=0; @@ -950,18 +951,18 @@ void ff_er_frame_end(MpegEncContext *s){ dcv+=dest_cr[x + y*(s->uvlinesize)]; } } - s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)]= (dcu+4)>>3; - s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)]= (dcv+4)>>3; + s->dc_val[1][mb_x + mb_y*s->mb_stride]= (dcu+4)>>3; + s->dc_val[2][mb_x + mb_y*s->mb_stride]= (dcv+4)>>3; } } #if 1 /* guess DC for damaged blocks */ - guess_dc(s, s->dc_val[0] + s->mb_width*2+3, s->mb_width*2, s->mb_height*2, s->mb_width*2+2, 1); - guess_dc(s, s->dc_val[1] + s->mb_width +3, s->mb_width , s->mb_height , s->mb_width +2, 0); - guess_dc(s, s->dc_val[2] + s->mb_width +3, s->mb_width , s->mb_height , s->mb_width +2, 0); + guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1); + guess_dc(s, s->dc_val[1], s->mb_width , s->mb_height , s->mb_stride, 0); + guess_dc(s, s->dc_val[2], s->mb_width , s->mb_height , s->mb_stride, 0); #endif /* filter luma DC */ - filter181(s->dc_val[0] + s->mb_width*2+3, 
s->mb_width*2, s->mb_height*2, s->mb_width*2+2); + filter181(s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride); #if 1 /* render DC only intra */ diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c index 8a60ff08b..ec776eb98 100644 --- a/src/libffmpeg/libavcodec/h263.c +++ b/src/libffmpeg/libavcodec/h263.c @@ -75,7 +75,7 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr); static void mpeg4_encode_visual_object_header(MpegEncContext * s); static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_number); #endif //CONFIG_ENCODERS -static void mpeg4_decode_sprite_trajectory(MpegEncContext * s); +static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb); static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr, int *dir_ptr); #ifdef CONFIG_ENCODERS @@ -577,12 +577,13 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){ } else if(IS_INTERLACED(colocated_mb_type)){ s->mv_type = MV_TYPE_FIELD; for(i=0; i<2; i++){ + int field_select= s->next_picture.ref_index[0][s->block_index[2*i]]; if(s->top_field_first){ - time_pp= s->pp_field_time - s->p_field_select_table[i][mb_index] + i; - time_pb= s->pb_field_time - s->p_field_select_table[i][mb_index] + i; + time_pp= s->pp_field_time - field_select + i; + time_pb= s->pb_field_time - field_select + i; }else{ - time_pp= s->pp_field_time + s->p_field_select_table[i][mb_index] - i; - time_pb= s->pb_field_time + s->p_field_select_table[i][mb_index] - i; + time_pp= s->pp_field_time + field_select - i; + time_pb= s->pb_field_time + field_select - i; } s->mv[0][i][0] = s->p_field_mv_table[i][0][mb_index][0]*time_pb/time_pp + mx; s->mv[0][i][1] = s->p_field_mv_table[i][0][mb_index][1]*time_pb/time_pp + my; @@ -610,7 +611,7 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){ void ff_h263_update_motion_val(MpegEncContext * s){ const int mb_xy = s->mb_y * s->mb_stride + s->mb_x; 
//FIXME a lot of thet is only needed for !low_delay - const int wrap = s->block_wrap[0]; + const int wrap = s->b8_stride; const int xy = s->block_index[0]; s->current_picture.mbskip_table[mb_xy]= s->mb_skiped; @@ -631,10 +632,13 @@ void ff_h263_update_motion_val(MpegEncContext * s){ for(i=0; i<2; i++){ s->p_field_mv_table[i][0][mb_xy][0]= s->mv[0][i][0]; s->p_field_mv_table[i][0][mb_xy][1]= s->mv[0][i][1]; - s->p_field_select_table[i][mb_xy]= s->field_select[0][i]; } + s->current_picture.ref_index[0][xy ]= + s->current_picture.ref_index[0][xy + 1]= s->field_select[0][0]; + s->current_picture.ref_index[0][xy + wrap ]= + s->current_picture.ref_index[0][xy + wrap + 1]= s->field_select[0][1]; } - + /* no update if 8X8 because it has been done during parsing */ s->current_picture.motion_val[0][xy][0] = motion_x; s->current_picture.motion_val[0][xy][1] = motion_y; @@ -985,7 +989,7 @@ void mpeg4_encode_mb(MpegEncContext * s, } /* motion vectors: 16x16 mode */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); h263_encode_motion(s, motion_x - pred_x, s->f_code); h263_encode_motion(s, motion_y - pred_y, s->f_code); @@ -1009,7 +1013,7 @@ void mpeg4_encode_mb(MpegEncContext * s, } /* motion vectors: 16x8 interlaced mode */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); pred_y /=2; put_bits(&s->pb, 1, s->field_select[0][0]); @@ -1037,7 +1041,7 @@ void mpeg4_encode_mb(MpegEncContext * s, for(i=0; i<4; i++){ /* motion vectors: 8x8 mode*/ - h263_pred_motion(s, i, &pred_x, &pred_y); + h263_pred_motion(s, i, 0, &pred_x, &pred_y); h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][0] - pred_x, s->f_code); h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][1] - pred_y, s->f_code); @@ -1185,7 +1189,7 @@ void h263_encode_mb(MpegEncContext * s, } /* motion vectors: 16x16 mode */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, 
&pred_x, &pred_y); if (!s->umvplus) { h263_encode_motion(s, motion_x - pred_x, 1); @@ -1212,7 +1216,7 @@ void h263_encode_mb(MpegEncContext * s, for(i=0; i<4; i++){ /* motion vectors: 8x8 mode*/ - h263_pred_motion(s, i, &pred_x, &pred_y); + h263_pred_motion(s, i, 0, &pred_x, &pred_y); motion_x= s->current_picture.motion_val[0][ s->block_index[i] ][0]; motion_y= s->current_picture.motion_val[0][ s->block_index[i] ][1]; @@ -1435,16 +1439,16 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr) /* find prediction */ if (n < 4) { - x = 2 * s->mb_x + 1 + (n & 1); - y = 2 * s->mb_y + 1 + ((n & 2) >> 1); - wrap = s->mb_width * 2 + 2; + x = 2 * s->mb_x + (n & 1); + y = 2 * s->mb_y + ((n & 2) >> 1); + wrap = s->b8_stride; dc_val = s->dc_val[0]; ac_val = s->ac_val[0][0]; scale = s->y_dc_scale; } else { - x = s->mb_x + 1; - y = s->mb_y + 1; - wrap = s->mb_width + 2; + x = s->mb_x; + y = s->mb_y; + wrap = s->mb_stride; dc_val = s->dc_val[n - 4 + 1]; ac_val = s->ac_val[n - 4 + 1][0]; scale = s->c_dc_scale; @@ -1456,8 +1460,10 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr) c = dc_val[(x) + (y - 1) * wrap]; /* No prediction outside GOB boundary */ - if (s->first_slice_line && ((n < 2) || (n > 3))) - c = 1024; + if(s->first_slice_line && n!=3){ + if(n!=2) c= 1024; + if(n!=1 && s->mb_x == s->resync_mb_x) a= 1024; + } pred_dc = 1024; /* just DC prediction */ if (a != 1024 && c != 1024) @@ -1480,16 +1486,16 @@ static void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n) /* find prediction */ if (n < 4) { - x = 2 * s->mb_x + 1 + (n & 1); - y = 2 * s->mb_y + 1 + (n>> 1); - wrap = s->mb_width * 2 + 2; + x = 2 * s->mb_x + (n & 1); + y = 2 * s->mb_y + (n>> 1); + wrap = s->b8_stride; dc_val = s->dc_val[0]; ac_val = s->ac_val[0][0]; scale = s->y_dc_scale; } else { - x = s->mb_x + 1; - y = s->mb_y + 1; - wrap = s->mb_width + 2; + x = s->mb_x; + y = s->mb_y; + wrap = s->mb_stride; dc_val = s->dc_val[n - 4 + 1]; ac_val = s->ac_val[n 
- 4 + 1][0]; scale = s->c_dc_scale; @@ -1560,78 +1566,15 @@ static void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n) ac_val1[8 + i] = block[s->dsp.idct_permutation[i ]]; } -int16_t *h263_pred_motion(MpegEncContext * s, int block, +int16_t *h263_pred_motion(MpegEncContext * s, int block, int dir, int *px, int *py) { - int xy, wrap; - int16_t *A, *B, *C, *mot_val; - static const int off[4]= {2, 1, 1, -1}; - - wrap = s->block_wrap[0]; - xy = s->block_index[block]; - - mot_val = s->current_picture.motion_val[0][xy]; - - A = s->current_picture.motion_val[0][xy - 1]; - /* special case for first (slice) line */ - if (s->first_slice_line && block<3) { - // we cant just change some MVs to simulate that as we need them for the B frames (and ME) - // and if we ever support non rectangular objects than we need to do a few ifs here anyway :( - if(block==0){ //most common case - if(s->mb_x == s->resync_mb_x){ //rare - *px= *py = 0; - }else if(s->mb_x + 1 == s->resync_mb_x && s->h263_pred){ //rare - C = s->current_picture.motion_val[0][xy + off[block] - wrap]; - if(s->mb_x==0){ - *px = C[0]; - *py = C[1]; - }else{ - *px = mid_pred(A[0], 0, C[0]); - *py = mid_pred(A[1], 0, C[1]); - } - }else{ - *px = A[0]; - *py = A[1]; - } - }else if(block==1){ - if(s->mb_x + 1 == s->resync_mb_x && s->h263_pred){ //rare - C = s->current_picture.motion_val[0][xy + off[block] - wrap]; - *px = mid_pred(A[0], 0, C[0]); - *py = mid_pred(A[1], 0, C[1]); - }else{ - *px = A[0]; - *py = A[1]; - } - }else{ /* block==2*/ - B = s->current_picture.motion_val[0][xy - wrap]; - C = s->current_picture.motion_val[0][xy + off[block] - wrap]; - if(s->mb_x == s->resync_mb_x) //rare - A[0]=A[1]=0; - - *px = mid_pred(A[0], B[0], C[0]); - *py = mid_pred(A[1], B[1], C[1]); - } - } else { - B = s->current_picture.motion_val[0][xy - wrap]; - C = s->current_picture.motion_val[0][xy + off[block] - wrap]; - *px = mid_pred(A[0], B[0], C[0]); - *py = mid_pred(A[1], B[1], C[1]); - } - return mot_val; -} - -// 
identical to above but with s->current_picture->motion_val, the above one will be removed, and this renamed to it -int16_t *h263_pred_motion2(MpegEncContext * s, int block, int dir, - int *px, int *py) -{ - int xy, wrap; + int wrap; int16_t *A, *B, *C, (*mot_val)[2]; static const int off[4]= {2, 1, 1, -1}; wrap = s->b8_stride; - xy = 2*(s->mb_x + s->mb_y * wrap); - - mot_val = s->current_picture.motion_val[dir] + xy; + mot_val = s->current_picture.motion_val[dir] + s->block_index[block]; A = mot_val[ - 1]; /* special case for first (slice) line */ @@ -1785,7 +1728,7 @@ static void init_mv_penalty_and_fcode(MpegEncContext *s) else{ int val, bit_size, range, code; - bit_size = s->f_code - 1; + bit_size = f_code - 1; range = 1 << bit_size; val=mv; @@ -2386,6 +2329,7 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number) time_div= s->time/s->time_increment_resolution; time_mod= s->time%s->time_increment_resolution; time_incr= time_div - s->last_time_base; + assert(time_incr >= 0); while(time_incr--) put_bits(&s->pb, 1, 1); @@ -2994,13 +2938,12 @@ void ff_mpeg4_init_partitions(MpegEncContext *s) uint8_t *start= pbBufPtr(&s->pb); uint8_t *end= s->pb.buf_end; int size= end - start; - int pb_size = size/3; - int pb2_size= size/3; - int tex_size= size - pb_size - pb2_size; + int pb_size = (((int)start + size/3)&(~3)) - (int)start; + int tex_size= (size - 2*pb_size)&(~3); set_put_bits_buffer_size(&s->pb, pb_size); init_put_bits(&s->tex_pb, start + pb_size , tex_size); - init_put_bits(&s->pb2 , start + pb_size + tex_size, pb2_size); + init_put_bits(&s->pb2 , start + pb_size + tex_size, pb_size); } void ff_mpeg4_merge_partitions(MpegEncContext *s) @@ -3165,7 +3108,7 @@ static int mpeg4_decode_video_packet_header(MpegEncContext *s) skip_bits(&s->gb, 3); /* intra dc vlc threshold */ //FIXME dont just ignore everything if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE){ - mpeg4_decode_sprite_trajectory(s); + mpeg4_decode_sprite_trajectory(s, 
&s->gb); av_log(s->avctx, AV_LOG_ERROR, "untested\n"); } @@ -3196,10 +3139,10 @@ void ff_mpeg4_clean_buffers(MpegEncContext *s) { int c_wrap, c_xy, l_wrap, l_xy; - l_wrap= s->block_wrap[0]; - l_xy= s->mb_y*l_wrap*2 + s->mb_x*2; - c_wrap= s->block_wrap[4]; - c_xy= s->mb_y*c_wrap + s->mb_x; + l_wrap= s->b8_stride; + l_xy= (2*s->mb_y-1)*l_wrap + s->mb_x*2 - 1; + c_wrap= s->mb_stride; + c_xy= (s->mb_y-1)*c_wrap + s->mb_x - 1; #if 0 /* clean DC */ @@ -3372,7 +3315,7 @@ static int mpeg4_decode_partition_a(MpegEncContext *s){ }else{ /* P/S_TYPE */ int mx, my, pred_x, pred_y, bits; int16_t * const mot_val= s->current_picture.motion_val[0][s->block_index[0]]; - const int stride= s->block_wrap[0]*2; + const int stride= s->b8_stride*2; try_again: bits= show_bits(&s->gb, 17); @@ -3430,7 +3373,7 @@ try_again: if ((cbpc & 16) == 0) { /* 16x16 motion prediction */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); if(!s->mcsel){ mx = h263_decode_motion(s, pred_x, s->f_code); if (mx >= 0xffff) @@ -3454,7 +3397,7 @@ try_again: int i; s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; for(i=0;i<4;i++) { - int16_t *mot_val= h263_pred_motion(s, i, &pred_x, &pred_y); + int16_t *mot_val= h263_pred_motion(s, i, 0, &pred_x, &pred_y); mx = h263_decode_motion(s, pred_x, s->f_code); if (mx >= 0xffff) return -1; @@ -3708,7 +3651,7 @@ static void preview_obmc(MpegEncContext *s){ int cbpc, i, pred_x, pred_y, mx, my; int16_t *mot_val; const int xy= s->mb_x + 1 + s->mb_y * s->mb_stride; - const int stride= s->block_wrap[0]*2; + const int stride= s->b8_stride*2; for(i=0; i<4; i++) s->block_index[i]+= 2; @@ -3748,7 +3691,7 @@ static void preview_obmc(MpegEncContext *s){ if ((cbpc & 16) == 0) { s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0; /* 16x16 motion prediction */ - mot_val= h263_pred_motion(s, 0, &pred_x, &pred_y); + mot_val= h263_pred_motion(s, 0, 0, &pred_x, &pred_y); if (s->umvplus) mx = h263p_decode_umotion(s, pred_x); else @@ 
-3766,7 +3709,7 @@ static void preview_obmc(MpegEncContext *s){ } else { s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; for(i=0;i<4;i++) { - mot_val = h263_pred_motion(s, i, &pred_x, &pred_y); + mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y); if (s->umvplus) mx = h263p_decode_umotion(s, pred_x); else @@ -3858,7 +3801,7 @@ int ff_h263_decode_mb(MpegEncContext *s, s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0; /* 16x16 motion prediction */ s->mv_type = MV_TYPE_16X16; - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); if (s->umvplus) mx = h263p_decode_umotion(s, pred_x); else @@ -3883,7 +3826,7 @@ int ff_h263_decode_mb(MpegEncContext *s, s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; s->mv_type = MV_TYPE_8X8; for(i=0;i<4;i++) { - mot_val = h263_pred_motion(s, i, &pred_x, &pred_y); + mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y); if (s->umvplus) mx = h263p_decode_umotion(s, pred_x); else @@ -3977,7 +3920,7 @@ int ff_h263_decode_mb(MpegEncContext *s, //FIXME UMV if(USES_LIST(mb_type, 0)){ - int16_t *mot_val= h263_pred_motion2(s, 0, 0, &mx, &my); + int16_t *mot_val= h263_pred_motion(s, 0, 0, &mx, &my); s->mv_dir = MV_DIR_FORWARD; mx = h263_decode_motion(s, mx, 1); @@ -3990,7 +3933,7 @@ int ff_h263_decode_mb(MpegEncContext *s, } if(USES_LIST(mb_type, 1)){ - int16_t *mot_val= h263_pred_motion2(s, 0, 1, &mx, &my); + int16_t *mot_val= h263_pred_motion(s, 0, 1, &mx, &my); s->mv_dir |= MV_DIR_BACKWARD; mx = h263_decode_motion(s, mx, 1); @@ -4145,7 +4088,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s, s->field_select[0][0]= get_bits1(&s->gb); s->field_select[0][1]= get_bits1(&s->gb); - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); for(i=0; i<2; i++){ mx = h263_decode_motion(s, pred_x, s->f_code); @@ -4163,7 +4106,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s, s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0; /* 16x16 motion prediction */ 
s->mv_type = MV_TYPE_16X16; - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); mx = h263_decode_motion(s, pred_x, s->f_code); if (mx >= 0xffff) @@ -4180,7 +4123,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s, s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; s->mv_type = MV_TYPE_8X8; for(i=0;i<4;i++) { - mot_val = h263_pred_motion(s, i, &pred_x, &pred_y); + mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y); mx = h263_decode_motion(s, pred_x, s->f_code); if (mx >= 0xffff) return -1; @@ -5117,11 +5060,15 @@ int h263_decode_picture_header(MpegEncContext *s) s->qscale = get_bits(&s->gb, 5); } + s->mb_width = (s->width + 15) / 16; + s->mb_height = (s->height + 15) / 16; + s->mb_num = s->mb_width * s->mb_height; + /* PEI */ while (get_bits1(&s->gb) != 0) { skip_bits(&s->gb, 8); } - + if(s->h263_slice_structured){ if (get_bits1(&s->gb) != 1) { av_log(s->avctx, AV_LOG_ERROR, "SEPB1 marker missing\n"); @@ -5181,7 +5128,7 @@ int h263_decode_picture_header(MpegEncContext *s) return 0; } -static void mpeg4_decode_sprite_trajectory(MpegEncContext * s) +static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb) { int i; int a= 2<<s->sprite_warping_accuracy; @@ -5201,17 +5148,17 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s) int length; int x=0, y=0; - length= get_vlc(&s->gb, &sprite_trajectory); + length= get_vlc(gb, &sprite_trajectory); if(length){ - x= get_xbits(&s->gb, length); + x= get_xbits(gb, length); } - if(!(s->divx_version==500 && s->divx_build==413)) skip_bits1(&s->gb); /* marker bit */ + if(!(s->divx_version==500 && s->divx_build==413)) skip_bits1(gb); /* marker bit */ - length= get_vlc(&s->gb, &sprite_trajectory); + length= get_vlc(gb, &sprite_trajectory); if(length){ - y=get_xbits(&s->gb, length); + y=get_xbits(gb, length); } - skip_bits1(&s->gb); /* marker bit */ + skip_bits1(gb); /* marker bit */ //printf("%d %d %d %d\n", x, y, i, s->sprite_warping_accuracy); d[i][0]= x; 
d[i][1]= y; @@ -5840,7 +5787,7 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){ } if(s->pict_type == S_TYPE && (s->vol_sprite_usage==STATIC_SPRITE || s->vol_sprite_usage==GMC_SPRITE)){ - mpeg4_decode_sprite_trajectory(s); + mpeg4_decode_sprite_trajectory(s, gb); if(s->sprite_brightness_change) av_log(s->avctx, AV_LOG_ERROR, "sprite_brightness_change not supported\n"); if(s->vol_sprite_usage==STATIC_SPRITE) av_log(s->avctx, AV_LOG_ERROR, "static sprite not supported\n"); } @@ -6111,8 +6058,10 @@ int flv_h263_decode_picture_header(MpegEncContext *s) s->height = height; s->pict_type = I_TYPE + get_bits(&s->gb, 2); - if (s->pict_type > P_TYPE) + s->dropable= s->pict_type > P_TYPE; + if (s->dropable) s->pict_type = P_TYPE; + skip_bits1(&s->gb); /* deblocking flag */ s->chroma_qscale= s->qscale = get_bits(&s->gb, 5); diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c index 88db359fe..aaf38b172 100644 --- a/src/libffmpeg/libavcodec/h263dec.c +++ b/src/libffmpeg/libavcodec/h263dec.c @@ -42,8 +42,8 @@ int ff_h263_decode_init(AVCodecContext *avctx) s->workaround_bugs= avctx->workaround_bugs; // set defaults + MPV_decode_defaults(s); s->quant_precision=5; - s->progressive_sequence=1; s->decode_mb= ff_h263_decode_mb; s->low_delay= 1; avctx->pix_fmt= PIX_FMT_YUV420P; @@ -551,6 +551,8 @@ retry: s->workaround_bugs|= FF_BUG_EDGE; } + if(s->divx_version) + s->workaround_bugs|= FF_BUG_HPEL_CHROMA; #if 0 if(s->divx_version==500) s->padding_bug_score= 256*256*256*64; @@ -714,7 +716,8 @@ assert(s->current_picture.pict_type == s->pict_type); ff_print_debug_info(s, pict); } else { *pict= *(AVFrame*)&s->last_picture; - ff_print_debug_info(s, pict); + if(pict) + ff_print_debug_info(s, pict); } /* Return the Picture timestamp as the frame number */ diff --git a/src/libffmpeg/libavcodec/h264.c b/src/libffmpeg/libavcodec/h264.c index 3f60e35e8..fa254e93b 100644 --- a/src/libffmpeg/libavcodec/h264.c +++ b/src/libffmpeg/libavcodec/h264.c 
@@ -31,6 +31,8 @@ #include "h264data.h" #include "golomb.h" +#include "cabac.h" + #undef NDEBUG #include <assert.h> @@ -162,6 +164,8 @@ typedef struct H264Context{ unsigned int top_samples_available; unsigned int topright_samples_available; unsigned int left_samples_available; + uint8_t (*top_border)[16+2*8]; + uint8_t left_border[17+2*9]; /** * non zero coeff count cache. @@ -248,9 +252,9 @@ typedef struct H264Context{ int chroma_offset[2][16][2]; //deblock - int disable_deblocking_filter_idc; - int slice_alpha_c0_offset_div2; - int slice_beta_offset_div2; + int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0 + int slice_alpha_c0_offset; + int slice_beta_offset; int redundant_pic_count; @@ -282,6 +286,22 @@ typedef struct H264Context{ GetBitContext *inter_gb_ptr; DCTELEM mb[16*24] __align8; + + /** + * Cabac + */ + CABACContext cabac; + uint8_t cabac_state[399]; + int cabac_init_idc; + + /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? 
luma_cbp */ + uint16_t *cbp_table; + /* chroma_pred_mode for i4x4 or i16x16, else 0 */ + uint8_t *chroma_pred_mode_table; + int last_qscale_diff; + int16_t (*mvd_table[2])[2]; + int16_t mvd_cache[2][5*8][2]; + }H264Context; static VLC coeff_token_vlc[4]; @@ -295,6 +315,7 @@ static VLC run7_vlc; static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); +static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr); static inline uint32_t pack16to32(int a, int b){ #ifdef WORDS_BIGENDIAN @@ -610,9 +631,52 @@ static inline void fill_caches(H264Context *h, int mb_type){ *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else) *(uint32_t*)h->mv_cache [list][scan8[4 ]]= *(uint32_t*)h->mv_cache [list][scan8[12]]= 0; + + if( h->pps.cabac ) { + /* XXX beurk, Load mvd */ + if(IS_INTER(topleft_type)){ + const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride; + *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy]; + }else{ + *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0; + } + + if(IS_INTER(top_type)){ + const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; + *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0]; + *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1]; + *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2]; + *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3]; + }else{ + *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]= + *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]= + *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= + *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0; + } + if(IS_INTER(left_type[0])){ + const int b_xy= h->mb2b_xy[left_xy[0]] + 3; + 
*(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]]; + *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]]; + }else{ + *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]= + *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0; + } + if(IS_INTER(left_type[1])){ + const int b_xy= h->mb2b_xy[left_xy[1]] + 3; + *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]]; + *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]]; + }else{ + *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]= + *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0; + } + *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]= + *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]= + *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else) + *(uint32_t*)h->mvd_cache [list][scan8[4 ]]= + *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0; + } } //FIXME - } #endif } @@ -920,6 +984,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){ *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0; } + if( h->pps.cabac ) { + /* FIXME needed ? 
*/ + for(y=0; y<4; y++){ + *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= + *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0; + } + } for(y=0; y<2; y++){ *(uint16_t*)s->current_picture.motion_val[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101; } @@ -931,6 +1002,12 @@ static inline void write_back_motion(H264Context *h, int mb_type){ *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]; *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]; } + if( h->pps.cabac ) { + for(y=0; y<4; y++){ + *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y]; + *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y]; + } + } for(y=0; y<2; y++){ s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y]; s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y]; @@ -988,6 +1065,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c dst[di++]= 0; dst[di++]= 0; si+=3; + continue; }else //next start code break; } @@ -1001,6 +1079,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c return dst; } +#if 0 /** * @param src the data which should be escaped * @param dst the target buffer, dst+1 == src is allowed as a special case @@ -1073,6 +1152,7 @@ static void encode_rbsp_trailing(PutBitContext *pb){ length= (-put_bits_count(pb))&7; if(length) put_bits(pb, length, 0); } +#endif /** * identifies the exact end of the bitstream @@ -1132,6 +1212,7 @@ static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){ } } +#if 0 /** * dct tranforms the 16 dc values. * @param qp quantization parameter ??? 
FIXME @@ -1169,6 +1250,8 @@ static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){ block[stride*10+offset]= (z0 - z3)>>1; } } +#endif + #undef xStride #undef stride @@ -1194,6 +1277,7 @@ static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){ block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1; } +#if 0 static void chroma_dc_dct_c(DCTELEM *block){ const int stride= 16*2; const int xStride= 16; @@ -1214,6 +1298,7 @@ static void chroma_dc_dct_c(DCTELEM *block){ block[stride*1 + xStride*0]= (a-c); block[stride*1 + xStride*1]= (e-b); } +#endif /** * gets the chroma qp. @@ -1232,18 +1317,6 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){ uint8_t *cm = cropTbl + MAX_NEG_CROP; block[0] += 32; -#if 1 - for(i=0; i<4; i++){ - const int z0= block[i + 4*0] + block[i + 4*2]; - const int z1= block[i + 4*0] - block[i + 4*2]; - const int z2= (block[i + 4*1]>>1) - block[i + 4*3]; - const int z3= block[i + 4*1] + (block[i + 4*3]>>1); - - block[i + 4*0]= z0 + z3; - block[i + 4*1]= z1 + z2; - block[i + 4*2]= z1 - z2; - block[i + 4*3]= z0 - z3; - } for(i=0; i<4; i++){ const int z0= block[0 + 4*i] + block[2 + 4*i]; @@ -1251,18 +1324,6 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){ const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i]; const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1); - dst[0 + i*stride]= cm[ dst[0 + i*stride] + ((z0 + z3) >> 6) ]; - dst[1 + i*stride]= cm[ dst[1 + i*stride] + ((z1 + z2) >> 6) ]; - dst[2 + i*stride]= cm[ dst[2 + i*stride] + ((z1 - z2) >> 6) ]; - dst[3 + i*stride]= cm[ dst[3 + i*stride] + ((z0 - z3) >> 6) ]; - } -#else - for(i=0; i<4; i++){ - const int z0= block[0 + 4*i] + block[2 + 4*i]; - const int z1= block[0 + 4*i] - block[2 + 4*i]; - const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i]; - const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1); - block[0 + 4*i]= z0 + z3; block[1 + 4*i]= z1 + z2; block[2 + 4*i]= z1 - z2; @@ -1280,9 +1341,9 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, 
int stride){ dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ]; dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ]; } -#endif } +#if 0 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){ int i; //FIXME try int temp instead of block @@ -1315,6 +1376,7 @@ static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int st block[3*4 + i]= z3 - 2*z2; } } +#endif //FIXME need to check that this doesnt overflow signed 32 bit for low qp, iam not sure, its very close //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away) @@ -2106,10 +2168,15 @@ static void init_pred_ptrs(H264Context *h){ static void free_tables(H264Context *h){ av_freep(&h->intra4x4_pred_mode); + av_freep(&h->chroma_pred_mode_table); + av_freep(&h->cbp_table); + av_freep(&h->mvd_table[0]); + av_freep(&h->mvd_table[1]); av_freep(&h->non_zero_count); av_freep(&h->slice_table_base); + av_freep(&h->top_border); h->slice_table= NULL; - + av_freep(&h->mb2b_xy); av_freep(&h->mb2b8_xy); } @@ -2124,8 +2191,17 @@ static int alloc_tables(H264Context *h){ int x,y; CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t)) + CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t)) CHECKED_ALLOCZ(h->slice_table_base , big_mb_num * sizeof(uint8_t)) + CHECKED_ALLOCZ(h->top_border , s->mb_width * (16+8+8) * sizeof(uint8_t)) + + if( h->pps.cabac ) { + CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t)) + CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t)) + CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t)); + CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t)); + } memset(h->slice_table_base, -1, big_mb_num * sizeof(uint8_t)); h->slice_table= h->slice_table_base + s->mb_stride + 1; @@ -2166,6 +2242,8 @@ static int decode_init(AVCodecContext *avctx){ H264Context *h= avctx->priv_data; MpegEncContext * const s = &h->s; + MPV_decode_defaults(s); + 
s->avctx = avctx; common_init(h); @@ -2173,7 +2251,6 @@ static int decode_init(AVCodecContext *avctx){ s->workaround_bugs= avctx->workaround_bugs; // set defaults - s->progressive_sequence=1; // s->decode_mb= ff_h263_decode_mb; s->low_delay= 1; avctx->pix_fmt= PIX_FMT_YUV420P; @@ -2205,6 +2282,66 @@ static void frame_start(H264Context *h){ // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1; } +static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ + MpegEncContext * const s = &h->s; + int i; + + src_y -= linesize; + src_cb -= uvlinesize; + src_cr -= uvlinesize; + + h->left_border[0]= h->top_border[s->mb_x][15]; + for(i=1; i<17; i++){ + h->left_border[i]= src_y[15+i* linesize]; + } + + *(uint64_t*)(h->top_border[s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize); + *(uint64_t*)(h->top_border[s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize); + + if(!(s->flags&CODEC_FLAG_GRAY)){ + h->left_border[17 ]= h->top_border[s->mb_x][16+7]; + h->left_border[17+9]= h->top_border[s->mb_x][24+7]; + for(i=1; i<9; i++){ + h->left_border[i+17 ]= src_cb[7+i*uvlinesize]; + h->left_border[i+17+9]= src_cr[7+i*uvlinesize]; + } + *(uint64_t*)(h->top_border[s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); + *(uint64_t*)(h->top_border[s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); + } +} + +static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ + MpegEncContext * const s = &h->s; + int temp8, i; + uint64_t temp64; + + src_y -= linesize + 1; + src_cb -= uvlinesize + 1; + src_cr -= uvlinesize + 1; + +#define XCHG(a,b,t,xchg)\ +t= a;\ +if(xchg)\ + a= b;\ +b= t; + + for(i=0; i<17; i++){ + XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg); + } + + XCHG(*(uint64_t*)(h->top_border[s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg); + 
XCHG(*(uint64_t*)(h->top_border[s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1); + + if(!(s->flags&CODEC_FLAG_GRAY)){ + for(i=0; i<9; i++){ + XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg); + XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg); + } + XCHG(*(uint64_t*)(h->top_border[s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1); + XCHG(*(uint64_t*)(h->top_border[s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1); + } +} + static void hl_decode_mb(H264Context *h){ MpegEncContext * const s = &h->s; const int mb_x= s->mb_x; @@ -2240,6 +2377,9 @@ static void hl_decode_mb(H264Context *h){ } if(IS_INTRA(mb_type)){ + if(h->deblocking_filter) + xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); + if(!(s->flags&CODEC_FLAG_GRAY)){ h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize); h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize); @@ -2257,6 +2397,9 @@ static void hl_decode_mb(H264Context *h){ if(!topright_avail){ tr= ptr[3 - linesize]*0x01010101; topright= (uint8_t*) &tr; + }else if(i==5 && h->deblocking_filter){ + tr= *(uint32_t*)h->top_border[mb_x+1]; + topright= (uint8_t*) &tr; } h->pred4x4[ dir ](ptr, topright, linesize); @@ -2275,6 +2418,8 @@ static void hl_decode_mb(H264Context *h){ else svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); } + if(h->deblocking_filter) + xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); }else if(s->codec_id == CODEC_ID_H264){ hl_motion(h, dest_y, dest_cb, dest_cr, s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab, @@ -2331,10 +2476,10 @@ static void hl_decode_mb(H264Context *h){ } } } -} - -static void decode_mb_cabac(H264Context *h){ -// MpegEncContext * const s = &h->s; + if(h->deblocking_filter) { + backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize); + filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr); + } } /** @@ -2979,9 +3124,16 @@ static int decode_slice_header(H264Context *h){ if(s->current_picture.reference) 
decode_ref_pic_marking(h); - //FIXME CABAC stuff - s->qscale = h->pps.init_qp + get_se_golomb(&s->gb); //slice_qp_delta + if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ) + h->cabac_init_idc = get_ue_golomb(&s->gb); + + h->last_qscale_diff = 0; + s->qscale = h->pps.init_qp + get_se_golomb(&s->gb); + if(s->qscale<0 || s->qscale>51){ + av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale); + return -1; + } //FIXME qscale / qp ... stuff if(h->slice_type == SP_TYPE){ get_bits1(&s->gb); /* sp_for_switch_flag */ @@ -2990,14 +3142,19 @@ static int decode_slice_header(H264Context *h){ get_se_golomb(&s->gb); /* slice_qs_delta */ } + h->deblocking_filter = 1; + h->slice_alpha_c0_offset = 0; + h->slice_beta_offset = 0; if( h->pps.deblocking_filter_parameters_present ) { - h->disable_deblocking_filter_idc= get_ue_golomb(&s->gb); - if( h->disable_deblocking_filter_idc != 1 ) { - h->slice_alpha_c0_offset_div2= get_se_golomb(&s->gb); - h->slice_beta_offset_div2= get_se_golomb(&s->gb); + h->deblocking_filter= get_ue_golomb(&s->gb); + if(h->deblocking_filter < 2) + h->deblocking_filter^= 1; // 1<->0 + + if( h->deblocking_filter ) { + h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1; + h->slice_beta_offset = get_se_golomb(&s->gb) << 1; } - }else - h->disable_deblocking_filter_idc= 0; + } #if 0 //FMO if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5) @@ -3012,7 +3169,7 @@ static int decode_slice_header(H264Context *h){ s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1], h->ref_count[0], h->ref_count[1], s->qscale, - h->disable_deblocking_filter_idc + h->deblocking_filter ); } @@ -3122,7 +3279,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++; #else if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++; - ? 
== prefix > 2 or sth + /* ? == prefix > 2 or sth */ #endif tprintf("level: %d suffix_length:%d\n", level[i], suffix_length); } @@ -3186,7 +3343,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in * decodes a macroblock * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed */ -static int decode_mb(H264Context *h){ +static int decode_mb_cavlc(H264Context *h){ MpegEncContext * const s = &h->s; const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; int mb_type, partition_count, cbp; @@ -3223,6 +3380,7 @@ static int decode_mb(H264Context *h){ write_back_motion(h, mb_type); s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type + s->current_picture.qscale_table[mb_xy]= s->qscale; h->slice_table[ mb_xy ]= h->slice_num; h->prev_mb_skiped= 1; @@ -3303,7 +3461,9 @@ decode_intra_mb: skip_bits(&s->gb, 384); //FIXME check /fix the bitstream readers + //FIXME deblock filter, non_zero_count_cache init ... memset(h->non_zero_count[mb_xy], 16, 16); + s->current_picture.qscale_table[mb_xy]= s->qscale; return 0; } @@ -3607,71 +3767,1470 @@ decode_intra_mb: nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; } }else{ - memset(&h->non_zero_count_cache[8], 0, 8*5); + uint8_t * const nnz= &h->non_zero_count_cache[0]; + fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; } + s->current_picture.qscale_table[mb_xy]= s->qscale; write_back_non_zero_count(h); return 0; } -static int decode_slice(H264Context *h){ +static int decode_cabac_mb_type( H264Context *h ) { MpegEncContext * const s = &h->s; - const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F; - s->mb_skip_run= -1; - -#if 1 - for(;;){ - int ret= decode_mb(h); - - hl_decode_mb(h); - - if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ? 
- s->mb_y++; - ret= decode_mb(h); - - hl_decode_mb(h); - s->mb_y--; + if( h->slice_type == I_TYPE ) { + const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; + int ctx = 0; + int mb_type; + + if( s->mb_x > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-1] ) ) + ctx++; + if( s->mb_y > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-s->mb_stride] ) ) + ctx++; + + if( get_cabac( &h->cabac, &h->cabac_state[3+ctx] ) == 0 ) + return 0; /* I4x4 */ + + if( get_cabac_terminate( &h->cabac ) ) + return 25; /* PCM */ + + mb_type = 1; /* I16x16 */ + if( get_cabac( &h->cabac, &h->cabac_state[3+3] ) ) + mb_type += 12; /* cbp_luma != 0 */ + + if( get_cabac( &h->cabac, &h->cabac_state[3+4] ) ) { + if( get_cabac( &h->cabac, &h->cabac_state[3+5] ) ) + mb_type += 4 * 2; /* cbp_chroma == 2 */ + else + mb_type += 4 * 1; /* cbp_chroma == 1 */ + } + if( get_cabac( &h->cabac, &h->cabac_state[3+6] ) ) + mb_type += 2; + if( get_cabac( &h->cabac, &h->cabac_state[3+7] ) ) + mb_type += 1; + return mb_type; + + } else if( h->slice_type == P_TYPE ) { + if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) { + /* P-type */ + if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) { + if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 ) + return 0; /* P_L0_D16x16; */ + else + return 3; /* P_8x8; */ + } else { + if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 ) + return 2; /* P_L0_D8x16; */ + else + return 1; /* P_L0_D16x8; */ + } + } else { + int mb_type; + /* I-type */ + if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 ) + return 5+0; /* I_4x4 */ + if( get_cabac_terminate( &h->cabac ) ) + return 5+25; /*I_PCM */ + mb_type = 5+1; /* I16x16 */ + if( get_cabac( &h->cabac, &h->cabac_state[17+1] ) ) + mb_type += 12; /* cbp_luma != 0 */ + + if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) ) { + if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) ) + mb_type += 4 * 2; /* cbp_chroma == 2 */ + else + mb_type += 4 * 1; /* cbp_chroma == 1 */ + } + if( get_cabac( &h->cabac, &h->cabac_state[17+3] 
) ) + mb_type += 2; + if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) ) + mb_type += 1; + + return mb_type; } + } else { + /* TODO do others frames types */ + return -1; + } +} - if(ret<0){ - av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y); - ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask); +static int decode_cabac_mb_skip( H264Context *h) { + MpegEncContext * const s = &h->s; + const int mb_xy = s->mb_x + s->mb_y*s->mb_stride; + const int mba_xy = mb_xy - 1; + const int mbb_xy = mb_xy - s->mb_stride; + int ctx = 0; + + if( s->mb_x > 0 && !IS_SKIP( s->current_picture.mb_type[mba_xy] ) ) + ctx++; + if( s->mb_y > 0 && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ) ) + ctx++; + + if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE) + return get_cabac( &h->cabac, &h->cabac_state[11+ctx] ); + else /* B-frame */ + return get_cabac( &h->cabac, &h->cabac_state[24+ctx] ); +} - return -1; +static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) { + int mode = 0; + + if( get_cabac( &h->cabac, &h->cabac_state[68] ) ) + return pred_mode; + + if( get_cabac( &h->cabac, &h->cabac_state[69] ) ) + mode += 1; + if( get_cabac( &h->cabac, &h->cabac_state[69] ) ) + mode += 2; + if( get_cabac( &h->cabac, &h->cabac_state[69] ) ) + mode += 4; + if( mode >= pred_mode ) + return mode + 1; + else + return mode; +} + +static int decode_cabac_mb_chroma_pre_mode( H264Context *h) { + MpegEncContext * const s = &h->s; + const int mb_xy = s->mb_x + s->mb_y*s->mb_stride; + const int mba_xy = mb_xy - 1; + const int mbb_xy = mb_xy - s->mb_stride; + + int ctx = 0; + + /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */ + if( s->mb_x > 0 && h->chroma_pred_mode_table[mba_xy] != 0 ) + ctx++; + + if( s->mb_y > 0 && h->chroma_pred_mode_table[mbb_xy] != 0 ) + ctx++; + + if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 ) + return 0; 
+ + if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 ) + return 1; + if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 ) + return 2; + else + return 3; +} + +static const uint8_t block_idx_x[16] = { + 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3 +}; +static const uint8_t block_idx_y[16] = { + 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 +}; +static const uint8_t block_idx_xy[4][4] = { + { 0, 2, 8, 10}, + { 1, 3, 9, 11}, + { 4, 6, 12, 14}, + { 5, 7, 13, 15} +}; + +static int decode_cabac_mb_cbp_luma( H264Context *h) { + MpegEncContext * const s = &h->s; + const int mb_xy = s->mb_x + s->mb_y*s->mb_stride; + + int cbp = 0; + int i8x8; + + h->cbp_table[mb_xy] = 0; /* FIXME aaahahahah beurk */ + + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { + int mba_xy = -1; + int mbb_xy = -1; + int x, y; + int ctx = 0; + + x = block_idx_x[4*i8x8]; + y = block_idx_y[4*i8x8]; + + if( x > 0 ) + mba_xy = mb_xy; + else if( s->mb_x > 0 ) + mba_xy = mb_xy - 1; + + if( y > 0 ) + mbb_xy = mb_xy; + else if( s->mb_y > 0 ) + mbb_xy = mb_xy - s->mb_stride; + + /* No need to test for skip as we put 0 for skip block */ + if( mba_xy >= 0 ) { + int i8x8a = block_idx_xy[(x-1)&0x03][y]/4; + if( ((h->cbp_table[mba_xy] >> i8x8a)&0x01) == 0 ) + ctx++; } - - if(++s->mb_x >= s->mb_width){ - s->mb_x=0; - ff_draw_horiz_band(s, 16*s->mb_y, 16); - if(++s->mb_y >= s->mb_height){ - tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits); - if(get_bits_count(&s->gb) == s->gb.size_in_bits){ - ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); + if( mbb_xy >= 0 ) { + int i8x8b = block_idx_xy[x][(y-1)&0x03]/4; + if( ((h->cbp_table[mbb_xy] >> i8x8b)&0x01) == 0 ) + ctx += 2; + } - return 0; + if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) { + cbp |= 1 << i8x8; + h->cbp_table[mb_xy] = cbp; /* FIXME aaahahahah beurk */ + } + } + return cbp; +} +static int decode_cabac_mb_cbp_chroma( H264Context *h) { + MpegEncContext * const s = &h->s; 
+ const int mb_xy = s->mb_x + s->mb_y*s->mb_stride; + int ctx; + int cbp_a, cbp_b; + + /* No need to test for skip */ + if( s->mb_x > 0 ) + cbp_a = (h->cbp_table[mb_xy-1]>>4)&0x03; + else + cbp_a = -1; + + if( s->mb_y > 0 ) + cbp_b = (h->cbp_table[mb_xy-s->mb_stride]>>4)&0x03; + else + cbp_b = -1; + + ctx = 0; + if( cbp_a > 0 ) ctx++; + if( cbp_b > 0 ) ctx += 2; + if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 ) + return 0; + + ctx = 4; + if( cbp_a == 2 ) ctx++; + if( cbp_b == 2 ) ctx += 2; + if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) ) + return 2; + else + return 1; +} +static int decode_cabac_mb_dqp( H264Context *h) { + MpegEncContext * const s = &h->s; + int mbn_xy; + int ctx = 0; + int val = 0; + + if( s->mb_x > 0 ) + mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1; + else + mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride; + + if( mbn_xy >= 0 && h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) ) + ctx++; + + while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) { + if( ctx < 2 ) + ctx = 2; + else + ctx = 3; + val++; + } + + if( val&0x01 ) + return (val + 1)/2; + else + return -(val + 1)/2; +} +static int decode_cabac_mb_sub_type( H264Context *h ) { + if( get_cabac( &h->cabac, &h->cabac_state[21] ) ) + return 0; /* 8x8 */ + if( !get_cabac( &h->cabac, &h->cabac_state[22] ) ) + return 1; /* 8x4 */ + if( get_cabac( &h->cabac, &h->cabac_state[23] ) ) + return 2; /* 4x8 */ + return 3; /* 4x4 */ +} + +static int decode_cabac_mb_ref( H264Context *h, int list, int n ) { + int refa = h->ref_cache[list][scan8[n] - 1]; + int refb = h->ref_cache[list][scan8[n] - 8]; + int ref = 0; + int ctx = 0; + + if( refa > 0 ) + ctx++; + if( refb > 0 ) + ctx += 2; + + while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) { + ref++; + if( ctx < 4 ) + ctx = 4; + else + ctx = 5; + } + return ref; +} + +static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) { + int amvd = abs( 
h->mvd_cache[list][scan8[n] - 1][l] ) + + abs( h->mvd_cache[list][scan8[n] - 8][l] ); + int ctxbase = (l == 0) ? 40 : 47; + int ctx; + int mvd = 0; + + if( amvd < 3 ) + ctx = 0; + else if( amvd > 32 ) + ctx = 2; + else + ctx = 1; + + while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) { + mvd++; + if( ctx < 3 ) + ctx = 3; + else if( ctx < 6 ) + ctx++; + } + + if( mvd >= 9 ) { + int k = 3; + while( get_cabac_bypass( &h->cabac ) ) { + mvd += 1 << k; + k++; + } + while( k-- ) { + if( get_cabac_bypass( &h->cabac ) ) + mvd += 1 << k; + } + } + if( mvd != 0 && get_cabac_bypass( &h->cabac ) ) + return -mvd; + return mvd; +} + + +static int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) { + MpegEncContext * const s = &h->s; + const int mb_xy = s->mb_x + s->mb_y*s->mb_stride; + int mba_xy = -1; + int mbb_xy = -1; + + int nza = -1; + int nzb = -1; + int ctx = 0; + + if( cat == 0 ) { + if( s->mb_x > 0 ) { + mba_xy = mb_xy - 1; + if( IS_INTRA16x16(s->current_picture.mb_type[mba_xy] ) ) + nza = h->cbp_table[mba_xy]&0x100; + } + if( s->mb_y > 0 ) { + mbb_xy = mb_xy - s->mb_stride; + if( IS_INTRA16x16(s->current_picture.mb_type[mbb_xy] ) ) + nzb = h->cbp_table[mbb_xy]&0x100; + } + } else if( cat == 1 || cat == 2 ) { + int i8x8a, i8x8b; + int x, y; + + x = block_idx_x[idx]; + y = block_idx_y[idx]; + + if( x > 0 ) + mba_xy = mb_xy; + else if( s->mb_x > 0 ) + mba_xy = mb_xy - 1; + + if( y > 0 ) + mbb_xy = mb_xy; + else if( s->mb_y > 0 ) + mbb_xy = mb_xy - s->mb_stride; + + /* No need to test for skip */ + if( mba_xy >= 0 ) { + i8x8a = block_idx_xy[(x-1)&0x03][y]/4; + + if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) && + ((h->cbp_table[mba_xy]&0x0f)>>i8x8a)) + nza = h->non_zero_count_cache[scan8[idx] - 1]; + } + + if( mbb_xy >= 0 ) { + i8x8b = block_idx_xy[x][(y-1)&0x03]/4; + + if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) && + ((h->cbp_table[mbb_xy]&0x0f)>>i8x8b)) + nzb = h->non_zero_count_cache[scan8[idx] - 8]; + } + } else if( cat == 3 
) { + if( s->mb_x > 0 ) { + mba_xy = mb_xy - 1; + + if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) && + (h->cbp_table[mba_xy]&0x30) ) + nza = (h->cbp_table[mba_xy]>>(6+idx))&0x01; + } + if( s->mb_y > 0 ) { + mbb_xy = mb_xy - s->mb_stride; + + if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) && + (h->cbp_table[mbb_xy]&0x30) ) + nzb = (h->cbp_table[mbb_xy]>>(6+idx))&0x01; + } + } else if( cat == 4 ) { + int idxc = idx % 4 ; + if( idxc == 1 || idxc == 3 ) + mba_xy = mb_xy; + else if( s->mb_x > 0 ) + mba_xy = mb_xy -1; + + if( idxc == 2 || idxc == 3 ) + mbb_xy = mb_xy; + else if( s->mb_y > 0 ) + mbb_xy = mb_xy - s->mb_stride; + + if( mba_xy >= 0 && + !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) && + (h->cbp_table[mba_xy]&0x30) == 0x20 ) + nza = h->non_zero_count_cache[scan8[16+idx] - 1]; + + if( mbb_xy >= 0 && + !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) && + (h->cbp_table[mbb_xy]&0x30) == 0x20 ) + nzb = h->non_zero_count_cache[scan8[16+idx] - 8]; + } + + if( ( mba_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) || + ( mba_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) ) || + nza > 0 ) + ctx++; + + if( ( mbb_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) || + ( mbb_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) ) || + nzb > 0 ) + ctx += 2; + + return ctx + 4 * cat; +} + +static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) { + const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride; + const uint16_t *qmul= dequant_coeff[qp]; + static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; + static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; + static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 }; + + int coeff[16]; + + int last = 0; + int coeff_count = 0; + int nz[16] = {0}; + int i; + + int abslevel1 = 0; + int abslevelgt1 = 0; + + /* cat: 0-> DC 16x16 n = 0 + 
* 1-> AC 16x16 n = luma4x4idx + * 2-> Luma4x4 n = luma4x4idx + * 3-> DC Chroma n = iCbCr + * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx + */ + + /* read coded block flag */ + if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) { + if( cat == 1 || cat == 2 ) + h->non_zero_count_cache[scan8[n]] = 0; + else if( cat == 4 ) + h->non_zero_count_cache[scan8[16+n]] = 0; + + return 0; + } + + while( last < max_coeff - 1 ) { + int ctx = FFMIN( last, max_coeff - 2 ); + + if( get_cabac( &h->cabac, &h->cabac_state[105+significant_coeff_flag_offset[cat]+ctx] ) == 0 ) { + nz[last++] = 0; + } + else { + nz[last++] = 1; + coeff_count++; + if( get_cabac( &h->cabac, &h->cabac_state[166+last_significant_coeff_flag_offset[cat]+ctx] ) ) { + while( last < max_coeff ) { + nz[last++] = 0; + } + break; + } + } + } + if( last == max_coeff -1 ) { + nz[last++] = 1; + coeff_count++; + } + + if( cat == 0 && coeff_count > 0 ) + h->cbp_table[mb_xy] |= 0x100; + else if( cat == 1 || cat == 2 ) + h->non_zero_count_cache[scan8[n]] = coeff_count; + else if( cat == 3 && coeff_count > 0 ) + h->cbp_table[mb_xy] |= 0x40 << n; + else if( cat == 4 ) + h->non_zero_count_cache[scan8[16+n]] = coeff_count; + + for( i = coeff_count - 1; i >= 0; i-- ) { + int coeff_abs_m1; + + int ctx = (abslevelgt1 != 0 ? 
0 : FFMIN( 4, abslevel1 + 1 )) + coeff_abs_level_m1_offset[cat]; + + if( get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) == 0 ) { + coeff_abs_m1 = 0; + } else { + coeff_abs_m1 = 1; + ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat]; + while( coeff_abs_m1 < 14 && get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) ) { + coeff_abs_m1++; + } + } + + if( coeff_abs_m1 >= 14 ) { + int j = 0; + while( get_cabac_bypass( &h->cabac ) ) { + coeff_abs_m1 += 1 << j; + j++; + } + + while( j-- ) { + if( get_cabac_bypass( &h->cabac ) ) + coeff_abs_m1 += 1 << j ; + } + } + if( get_cabac_bypass( &h->cabac ) ) + coeff[i] = -1 *( coeff_abs_m1 + 1 ); + else + coeff[i] = coeff_abs_m1 + 1; + + if( coeff_abs_m1 == 0 ) + abslevel1++; + else + abslevelgt1++; + } + + if( cat == 0 || cat == 3 ) { /* DC */ + int j; + for( i = 0, j = 0; j < coeff_count; i++ ) { + if( nz[i] ) { + block[scantable[i]] = coeff[j]; + + j++; + } + } + + } else { /* AC */ + int j; + for( i = 0, j = 0; j < coeff_count; i++ ) { + if( nz[i] ) { + block[scantable[i]] = coeff[j] * qmul[scantable[i]]; + + j++; + } + } + } + return 0; +} + +/** + * decodes a macroblock + * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed + */ +static int decode_mb_cabac(H264Context *h) { + MpegEncContext * const s = &h->s; + const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; + int mb_type, partition_count, cbp = 0; + + s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?) 
+ + if( h->slice_type == B_TYPE ) { + av_log( h->s.avctx, AV_LOG_ERROR, "B-frame not supported with CABAC\n" ); + return -1; + } + if( h->sps.mb_aff ) { + av_log( h->s.avctx, AV_LOG_ERROR, "Fields not supported with CABAC\n" ); + return -1; + } + + if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) { + /* read skip flags */ + if( decode_cabac_mb_skip( h ) ) { + int mx, my; + + /* skip mb */ + mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; + + memset(h->non_zero_count[mb_xy], 0, 16); + memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui +#if 0 + if(h->sps.mb_aff && s->mb_skip_run==0 && (s->mb_y&1)==0){ + h->mb_field_decoding_flag= get_bits1(&s->gb); + } + if(h->mb_field_decoding_flag) + mb_type|= MB_TYPE_INTERLACED; +#endif + + fill_caches(h, mb_type); //FIXME check what is needed and what not ... + pred_pskip_motion(h, &mx, &my); + fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); + fill_rectangle( h->mvd_cache[0][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); + fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); + write_back_motion(h, mb_type); + + s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type + s->current_picture.qscale_table[mb_xy]= s->qscale; + h->slice_table[ mb_xy ]= h->slice_num; + h->cbp_table[mb_xy] = 0; + h->chroma_pred_mode_table[mb_xy] = 0; + h->last_qscale_diff = 0; + + h->prev_mb_skiped= 1; + + return 0; + + } + } + h->prev_mb_skiped = 0; + + if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) { + av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" ); + return -1; + } + + if( h->slice_type == P_TYPE ) { + if( mb_type < 5) { + partition_count= p_mb_type_info[mb_type].partition_count; + mb_type= p_mb_type_info[mb_type].type; + } else { + mb_type -= 5; + goto decode_intra_mb; + } + } else { + assert(h->slice_type == I_TYPE); +decode_intra_mb: + partition_count = 0; + cbp= i_mb_type_info[mb_type].cbp; + h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; 
+ mb_type= i_mb_type_info[mb_type].type; + } +#if 0 + if(h->mb_field_decoding_flag) + mb_type |= MB_TYPE_INTERLACED; +#endif + + s->current_picture.mb_type[mb_xy]= mb_type; + h->slice_table[ mb_xy ]= h->slice_num; + + if(IS_INTRA_PCM(mb_type)) { + /* TODO */ + h->cbp_table[mb_xy] = 0xf +4*2; + h->chroma_pred_mode_table[mb_xy] = 0; + s->current_picture.qscale_table[mb_xy]= s->qscale; + return -1; + } + + fill_caches(h, mb_type); + + if( IS_INTRA( mb_type ) ) { + if( IS_INTRA4x4( mb_type ) ) { + int i; + for( i = 0; i < 16; i++ ) { + int pred = pred_intra_mode( h, i ); + h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred ); + + //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] ); + } + write_back_intra_pred_mode(h); + if( check_intra4x4_pred_mode(h) < 0 ) return -1; + } else { + h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode ); + if( h->intra16x16_pred_mode < 0 ) return -1; + } + h->chroma_pred_mode_table[mb_xy] = + h->chroma_pred_mode = decode_cabac_mb_chroma_pre_mode( h ); + + h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode ); + if( h->chroma_pred_mode < 0 ) return -1; + } else if( partition_count == 4 ) { + int i, j, sub_partition_count[4], list, ref[2][4]; + + /* Only P-frame */ + for( i = 0; i < 4; i++ ) { + h->sub_mb_type[i] = decode_cabac_mb_sub_type( h ); + sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count; + h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type; + } + + for( list = 0; list < 2; list++ ) { + if( h->ref_count[list] > 0 ) { + for( i = 0; i < 4; i++ ) { + if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){ + if( h->ref_count[list] > 1 ) + ref[list][i] = decode_cabac_mb_ref( h, list, 4*i ); + else + ref[list][i] = 0; + } else { + ref[list][i] = -1; + } + h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]= + h->ref_cache[list][ 
scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i]; + } + } + } + + for(list=0; list<2; list++){ + + for(i=0; i<4; i++){ + //h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]= + //h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i]; + + if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){ + const int sub_mb_type= h->sub_mb_type[i]; + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; + for(j=0; j<sub_partition_count[i]; j++){ + int mpx, mpy; + int mx, my; + const int index= 4*i + block_width*j; + int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ]; + int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ]; + pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy); + + mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 ); + my = mpy + decode_cabac_mb_mvd( h, list, index, 1 ); + tprintf("final mv:%d %d\n", mx, my); + + if(IS_SUB_8X8(sub_mb_type)){ + mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= + mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; + mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= + mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; + + mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= + mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx; + mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= + mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy; + }else if(IS_SUB_8X4(sub_mb_type)){ + mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx; + mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my; + + mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx; + mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy; + }else if(IS_SUB_4X8(sub_mb_type)){ + mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx; + mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my; + + mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx; + mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy; + }else{ + assert(IS_SUB_4X4(sub_mb_type)); + mv_cache[ 0 ][0]= mx; + mv_cache[ 0 ][1]= my; + + mvd_cache[ 0 ][0]= mx - mpx; + mvd_cache[ 0 ][1]= my - mpy; + } + } }else{ - ff_er_add_slice(s, 
s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); + uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0]; + uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0]; + p[0] = p[1] = p[8] = p[9] = 0; + pd[0]= pd[1]= pd[8]= pd[9]= 0; + } + } + } + } else if( !IS_DIRECT(mb_type) ) { + int list, mx, my, i, mpx, mpy; + if(IS_16X16(mb_type)){ + for(list=0; list<2; list++){ + if(IS_DIR(mb_type, 0, list)){ + if(h->ref_count[list] > 0 ){ + const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0; + fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); + } + } + } + for(list=0; list<2; list++){ + if(IS_DIR(mb_type, 0, list)){ + pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy); + + mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 ); + my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 ); + tprintf("final mv:%d %d\n", mx, my); + + fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4); + fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); + } + } + } + else if(IS_16X8(mb_type)){ + for(list=0; list<2; list++){ + if(h->ref_count[list]>0){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + const int ref= h->ref_count[list] > 1 ? 
decode_cabac_mb_ref( h, list, 8*i ) : 0; + fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); + } + } + } + } + for(list=0; list<2; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy); + mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 ); + my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 ); + tprintf("final mv:%d %d\n", mx, my); + fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4); + fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); + } + } + } + }else{ + assert(IS_8X16(mb_type)); + for(list=0; list<2; list++){ + if(h->ref_count[list]>0){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ //FIXME optimize + const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0; + fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); + } + } + } + } + for(list=0; list<2; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy); + mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 ); + my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 ); + + tprintf("final mv:%d %d\n", mx, my); + fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4); + fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); + } + } + } + } + } + + if( IS_INTER( mb_type ) ) { + h->chroma_pred_mode_table[mb_xy] = 0; + write_back_motion( h, mb_type ); + } + + if( !IS_INTRA16x16( mb_type ) ) { + cbp = decode_cabac_mb_cbp_luma( h ); + cbp |= decode_cabac_mb_cbp_chroma( h ) << 4; + } + + h->cbp_table[mb_xy] = cbp; + + if( cbp || IS_INTRA16x16( mb_type ) ) { + const uint8_t *scan, *dc_scan; + int dqp; + + if(IS_INTERLACED(mb_type)){ + scan= field_scan; + dc_scan= luma_dc_field_scan; + }else{ + scan= zigzag_scan; + dc_scan= luma_dc_zigzag_scan; 
+ } + + h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h ); + s->qscale += dqp; + if(((unsigned)s->qscale) > 51){ + if(s->qscale<0) s->qscale+= 52; + else s->qscale-= 52; + } + h->chroma_qp = get_chroma_qp(h, s->qscale); + + if( IS_INTRA16x16( mb_type ) ) { + int i; + //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); + if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, s->qscale, 16) < 0) + return -1; + if( cbp&15 ) { + for( i = 0; i < 16; i++ ) { + //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i ); + if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, s->qscale, 15) < 0 ) + return -1; + } + } else { + fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); + } + } else { + int i8x8, i4x4; + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { + if( cbp & (1<<i8x8) ) { + for( i4x4 = 0; i4x4 < 4; i4x4++ ) { + const int index = 4*i8x8 + i4x4; + //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); + if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, s->qscale, 16) < 0 ) + return -1; + } + } else { + uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; + nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; + } + } + } + + if( cbp&0x30 ){ + int c; + for( c = 0; c < 2; c++ ) { + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); + if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->chroma_qp, 4) < 0) return -1; + } + } + + if( cbp&0x20 ) { + int c, i; + for( c = 0; c < 2; c++ ) { + for( i = 0; i < 4; i++ ) { + const int index = 16 + 4 * c + i; + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); + if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->chroma_qp, 15) < 0) + return -1; } } + } else { + uint8_t * const nnz= &h->non_zero_count_cache[0]; + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; } - - if(get_bits_count(&s->gb) >= 
s->gb.size_in_bits && s->mb_skip_run<=0){ - if(get_bits_count(&s->gb) == s->gb.size_in_bits){ - ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); + } else { + memset( &h->non_zero_count_cache[8], 0, 8*5 ); + } + + s->current_picture.qscale_table[mb_xy]= s->qscale; + write_back_non_zero_count(h); + + return 0; +} + + +static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { + int i, d; + const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )]; + + for( i = 0; i < 4; i++ ) { + if( bS[i] == 0 ) { + pix += 4 * stride; + continue; + } + + if( bS[i] < 4 ) { + const int tc0 = tc0_table[index_a][bS[i] - 1]; + /* 4px edge length */ + for( d = 0; d < 4; d++ ) { + const int p0 = pix[-1]; + const int p1 = pix[-2]; + const int p2 = pix[-3]; + const int q0 = pix[0]; + const int q1 = pix[1]; + const int q2 = pix[2]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + int tc = tc0; + int i_delta; + + if( ABS( p2 - p0 ) < beta ) { + pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 ); + tc++; + } + if( ABS( q2 - q0 ) < beta ) { + pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); + tc++; + } + + i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */ + pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ + } + pix += stride; + } + }else{ + /* 4px edge length */ + for( d = 0; d < 4; d++ ) { + const int p0 = pix[-1]; + const int p1 = pix[-2]; + const int p2 = pix[-3]; + + const int q0 = pix[0]; + const int q1 = pix[1]; + const int q2 = pix[2]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + + if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ + if( ABS( p2 - p0 ) < beta) + 
{ + const int p3 = pix[-4]; + /* p0', p1', p2' */ + pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; + pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; + pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + } else { + /* p0' */ + pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + } + if( ABS( q2 - q0 ) < beta) + { + const int q3 = pix[3]; + /* q0', q1', q2' */ + pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; + pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + } else { + /* q0' */ + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + }else{ + /* p0', q0' */ + pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + } + pix += stride; + } + } + } +} +static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { + int i, d; + const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )]; + + for( i = 0; i < 4; i++ ) { + if( bS[i] == 0 ) { + pix += 2 * stride; + continue; + } + + if( bS[i] < 4 ) { + const int tc = tc0_table[index_a][bS[i] - 1] + 1; + /* 2px edge length (because we use same bS than the one for luma) */ + for( d = 0; d < 2; d++ ){ + const int p0 = pix[-1]; + const int p1 = pix[-2]; + const int q0 = pix[0]; + const int q1 = pix[1]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + + pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */ + pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ + } + pix += stride; + } + }else{ + /* 2px edge length (because we use same bS than the one for luma) */ + for( d = 0; d < 2; d++ ){ + const int p0 = pix[-1]; + const int p1 = pix[-2]; + const int q0 = pix[0]; + const int q1 = pix[1]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + + pix[-1] = ( 2*p1 + p0 + q1 + 2 ) 
>> 2; /* p0' */ + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ + } + pix += stride; + } + } + } +} + +static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { + int i, d; + const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )]; + const int pix_next = stride; + + for( i = 0; i < 4; i++ ) { + if( bS[i] == 0 ) { + pix += 4; + continue; + } + + if( bS[i] < 4 ) { + const int tc0 = tc0_table[index_a][bS[i] - 1]; + /* 4px edge length */ + for( d = 0; d < 4; d++ ) { + const int p0 = pix[-1*pix_next]; + const int p1 = pix[-2*pix_next]; + const int p2 = pix[-3*pix_next]; + const int q0 = pix[0]; + const int q1 = pix[1*pix_next]; + const int q2 = pix[2*pix_next]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + + int tc = tc0; + int i_delta; + + if( ABS( p2 - p0 ) < beta ) { + pix[-2*pix_next] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 ); + tc++; + } + if( ABS( q2 - q0 ) < beta ) { + pix[pix_next] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); + tc++; + } + + i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-pix_next] = clip_uint8( p0 + i_delta ); /* p0' */ + pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ + } + pix++; + } + }else{ + /* 4px edge length */ + for( d = 0; d < 4; d++ ) { + const int p0 = pix[-1*pix_next]; + const int p1 = pix[-2*pix_next]; + const int p2 = pix[-3*pix_next]; + const int q0 = pix[0]; + const int q1 = pix[1*pix_next]; + const int q2 = pix[2*pix_next]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + + const int p3 = pix[-4*pix_next]; + const int q3 = pix[ 3*pix_next]; + + if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ + if( ABS( p2 - p0 ) < beta) { + /* p0', p1', p2' */ + pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + 
q1 + 4 ) >> 3; + pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; + pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + } else { + /* p0' */ + pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + } + if( ABS( q2 - q0 ) < beta) { + /* q0', q1', q2' */ + pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; + pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + } else { + /* q0' */ + pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + }else{ + /* p0', q0' */ + pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + } + pix++; + } + } + } +} + +static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { + int i, d; + const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); + const int alpha = alpha_table[index_a]; + const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )]; + const int pix_next = stride; + + for( i = 0; i < 4; i++ ) + { + if( bS[i] == 0 ) { + pix += 2; + continue; + } + + if( bS[i] < 4 ) { + int tc = tc0_table[index_a][bS[i] - 1] + 1; + /* 2px edge length (see deblocking_filter_edgecv) */ + for( d = 0; d < 2; d++ ) { + const int p0 = pix[-1*pix_next]; + const int p1 = pix[-2*pix_next]; + const int q0 = pix[0]; + const int q1 = pix[1*pix_next]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + + int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + + pix[-pix_next] = clip_uint8( p0 + i_delta ); /* p0' */ + pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ + } + pix++; + } + }else{ + /* 2px edge length (see deblocking_filter_edgecv) */ + for( d = 0; d < 2; d++ ) { + const int p0 = pix[-1*pix_next]; + const int p1 = pix[-2*pix_next]; + const int q0 = pix[0]; + const int q1 = pix[1*pix_next]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { + + pix[-pix_next] = ( 2*p1 + p0 + q1 + 2 
) >> 2; /* p0' */ + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ + } + pix++; + } + } + } +} + +static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) { + MpegEncContext * const s = &h->s; + const int mb_xy= mb_x + mb_y*s->mb_stride; + int linesize, uvlinesize; + int dir; + + /* FIXME Implement deblocking filter for field MB */ + if( h->sps.mb_aff ) { + return; + } + linesize = s->linesize; + uvlinesize = s->uvlinesize; + + /* dir : 0 -> vertical edge, 1 -> horizontal edge */ + for( dir = 0; dir < 2; dir++ ) + { + int start = 0; + int edge; + + /* test picture boundary */ + if( ( dir == 0 && mb_x == 0 ) || ( dir == 1 && mb_y == 0 ) ) { + start = 1; + } + /* FIXME test slice boundary */ + if( h->deblocking_filter == 2 ) { + } + + /* Calculate bS */ + for( edge = start; edge < 4; edge++ ) { + /* mbn_xy: neighbour macroblock (how that works for field ?) */ + int mbn_xy = edge > 0 ? mb_xy : ( dir == 0 ? mb_xy -1 : mb_xy - s->mb_stride ); + int bS[4]; + int qp; + + if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) || + IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) { + bS[0] = bS[1] = bS[2] = bS[3] = ( edge == 0 ? 4 : 3 ); + } else { + int i; + for( i = 0; i < 4; i++ ) { + int x = dir == 0 ? edge : i; + int y = dir == 0 ? i : edge; + int b_idx= 8 + 4 + x + 8*y; + int bn_idx= b_idx - (dir ? 
8:1); + + if( h->non_zero_count_cache[b_idx] != 0 || + h->non_zero_count_cache[bn_idx] != 0 ) { + bS[i] = 2; + } + else if( h->slice_type == P_TYPE ) { + if( h->ref_cache[0][b_idx] != h->ref_cache[0][bn_idx] || + ABS( h->mv_cache[0][b_idx][0] - h->mv_cache[0][bn_idx][0] ) >= 4 || + ABS( h->mv_cache[0][b_idx][1] - h->mv_cache[0][bn_idx][1] ) >= 4 ) + bS[i] = 1; + else + bS[i] = 0; + } + else { + /* FIXME Add support for B frame */ + return; + } + } + + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) + continue; + } + /* Filter edge */ + qp = ( s->qscale + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; + if( dir == 0 ) { + filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp ); + if( (edge&1) == 0 ) { + int chroma_qp = ( h->chroma_qp + + get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; + filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp ); + filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp ); + } + } else { + filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp ); + if( (edge&1) == 0 ) { + int chroma_qp = ( h->chroma_qp + + get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1; + filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp ); + filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp ); + } + } + } + } +} + +static int decode_slice(H264Context *h){ + MpegEncContext * const s = &h->s; + const int part_mask= s->partitioned_frame ? 
(AC_END|AC_ERROR) : 0x7F; + + s->mb_skip_run= -1; + + if( h->pps.cabac ) { + int i; + + /* realign */ + align_get_bits( &s->gb ); + + /* init cabac */ + ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 ); + ff_init_cabac_decoder( &h->cabac, + s->gb.buffer + get_bits_count(&s->gb)/8, + ( s->gb.size_in_bits - get_bits_count(&s->gb) ) ); + /* calculate pre-state */ + for( i= 0; i < 399; i++ ) { + int pre; + if( h->slice_type == I_TYPE ) + pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 ); + else + pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 ); + + if( pre <= 63 ) + h->cabac_state[i] = 2 * ( 63 - pre ) + 0; + else + h->cabac_state[i] = 2 * ( pre - 64 ) + 1; + } + + for(;;){ + int ret = decode_mb_cabac(h); + int eos = get_cabac_terminate( &h->cabac ); /* End of Slice flag */ + + hl_decode_mb(h); + + /* XXX: useless as decode_mb_cabac it doesn't support that ... */ + if( ret >= 0 && h->sps.mb_aff ) { //FIXME optimal? or let mb_decode decode 16x32 ? 
+ s->mb_y++; + + ret = decode_mb_cabac(h); + eos = get_cabac_terminate( &h->cabac ); + + hl_decode_mb(h); + s->mb_y--; + } + + if( ret < 0 ) { + av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y); + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask); + return -1; + } + + if( ++s->mb_x >= s->mb_width ) { + s->mb_x = 0; + ff_draw_horiz_band(s, 16*s->mb_y, 16); + if( ++s->mb_y >= s->mb_height ) { + tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits); + } + } + + if( eos || s->mb_y >= s->mb_height ) { + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); return 0; - }else{ + } +#if 0 + /* TODO test over-reading in cabac code */ + else if( read too much in h->cabac ) { + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask); + return -1; + } +#endif + } + + } else { + for(;;){ + int ret = decode_mb_cavlc(h); + + hl_decode_mb(h); + + if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ? 
+ s->mb_y++; + ret = decode_mb_cavlc(h); + + hl_decode_mb(h); + s->mb_y--; + } + + if(ret<0){ + av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y); ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask); return -1; } + + if(++s->mb_x >= s->mb_width){ + s->mb_x=0; + ff_draw_horiz_band(s, 16*s->mb_y, 16); + if(++s->mb_y >= s->mb_height){ + tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits); + + if(get_bits_count(&s->gb) == s->gb.size_in_bits ) { + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); + + return 0; + }else{ + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); + + return -1; + } + } + } + + if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){ + if(get_bits_count(&s->gb) == s->gb.size_in_bits ){ + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask); + + return 0; + }else{ + ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask); + + return -1; + } + } } } -#endif + #if 0 for(;s->mb_y < s->mb_height; s->mb_y++){ for(;s->mb_x < s->mb_width; s->mb_x++){ @@ -4022,7 +5581,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){ buf_index += consumed; - if(h->nal_ref_idc < s->hurry_up) + if( s->hurry_up == 1 && h->nal_ref_idc == 0 ) continue; switch(h->nal_unit_type){ @@ -4035,7 +5594,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){ s->data_partitioning = 0; if(decode_slice_header(h) < 0) return -1; - if(h->redundant_pic_count==0) + if(h->redundant_pic_count==0 && s->hurry_up < 5 ) decode_slice(h); break; case NAL_DPA: @@ -4054,7 +5613,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){ init_get_bits(&h->inter_gb, ptr, bit_length); h->inter_gb_ptr= 
&h->inter_gb; - if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning) + if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning && s->hurry_up < 5 ) decode_slice(h); break; case NAL_SEI: @@ -4099,6 +5658,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){ assert(h->mmco_index==0); ff_er_frame_end(s); + MPV_frame_end(s); return buf_index; diff --git a/src/libffmpeg/libavcodec/h264data.h b/src/libffmpeg/libavcodec/h264data.h index 40a252253..5480becd4 100644 --- a/src/libffmpeg/libavcodec/h264data.h +++ b/src/libffmpeg/libavcodec/h264data.h @@ -528,3 +528,598 @@ static const int quant_coeff[52][16]={ { 1260, 819, 1260, 819, 819, 524, 819, 524, 1260, 819, 1260, 819, 819, 524, 819, 524,}, { 1170, 728, 1170, 728, 728, 456, 728, 456, 1170, 728, 1170, 728, 728, 456, 728, 456,}, }; + + +/* Deblocking filter (p153) */ +static const int alpha_table[52] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, + 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, + 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, + 80, 90,101,113,127,144,162,182,203,226, + 255, 255 +}; +static const int beta_table[52] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, + 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, + 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, + 18, 18 +}; +static const int tc0_table[52][3] = { + { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, + { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, + { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 }, + { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 }, + { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 }, + { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 }, + { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 }, + { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 
}, { 7,10,14 }, { 8,11,16 }, + { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 } +}; + +/* Cabac pre state table */ + +static const int cabac_context_init_I[399][2] = +{ + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 unsused for I */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, + + /* 24- 39 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + + /* 40 - 53 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, + + /* 54 - 59 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 -> 87 */ + { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, + { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, + { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, + { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, + { -12, 115 },{ -16, 122 }, + + /* 88 -> 104 */ + { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, + { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, + { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, + { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, + { -22, 125 }, + + /* 105 -> 135 */ + { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, + { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, + { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, + { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, + { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, + { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, + { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, + { 14, 62 }, { -13, 108 },{ -15, 100 }, + + /* 136 -> 165 */ + { -13, 
101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, + { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, + { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, + { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, + { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, + { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, + { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, + { 0, 62 }, { 12, 72 }, + + /* 166 -> 196 */ + { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, + { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, + { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, + { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, + { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, + { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, + { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, + { 0, 89 }, { 26, -19 }, { 22, -17 }, + + /* 197 -> 226 */ + { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, + { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, + { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, + { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, + { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, + { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, + { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, + { 12, 68 }, { 2, 97 }, + + /* 227 -> 251 */ + { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, + { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, + { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, + { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, + { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, + { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, + { -4, 65 }, + + /* 252 -> 275 */ + { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, + { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, + { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, + { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, + { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, + { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 -> 307 */ + { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, + { 
-1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, + { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, + { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, + { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, + { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, + { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, + { 9, 64 }, { -12, 104 },{ -11, 97 }, + + /* 308 -> 337 */ + { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, + { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, + { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, + { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, + { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, + { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, + { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, + { 5, 64 }, { 12, 70 }, + + /* 338 -> 368 */ + { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, + { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, + { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, + { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, + { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, + { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, + { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, + { -12, 109 },{ 36, -35 }, { 36, -34 }, + + /* 369 -> 398 */ + { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, + { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, + { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, + { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, + { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, + { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, + { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, + { 29, 39 }, { 19, 66 } +}; + +static const int cabac_context_init_PB[3][399][2] = +{ + /* i_cabac_init_idc == 0 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, + { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, + { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, + { 17, 50 }, + + /* 24 - 39 */ + { 18, 64 }, { 9, 43 }, 
{ 29, 0 }, { 26, 67 }, + { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, + { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, + { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, + + /* 40 - 53 */ + { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, + { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, + { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, + { -3, 81 }, { 0, 88 }, + + /* 54 - 59 */ + { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, + { -7, 72 }, { 1, 58 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 87 */ + { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, + { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, + { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, + { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, + { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, + { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, + { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, + { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, + { 0, 68 }, { -4, 69 }, { -8, 88 }, + + /* 105 -> 165 */ + { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, + { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, + { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, + { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, + { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, + { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, + { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, + { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, + { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, + { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, + { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, + { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, + { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, + { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, + { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, + { 9, 69 }, + + /* 166 - 226 */ + { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, + { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, + { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, + { 3, 54 }, 
{ 2, 58 }, { 4, 63 }, { 6, 51 }, + { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, + { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, + { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, + { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, + { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, + { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, + { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, + { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, + { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, + { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, + { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, + { -9, 108 }, + + /* 227 - 275 */ + { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, + { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, + { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, + { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, + { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, + { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, + { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, + { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, + { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, + { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, + { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, + { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, + { -8, 85 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, + { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, + { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, + { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, + { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, + { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, + { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, + { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, + { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, + { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, + { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, + { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, + { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, + { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 
32 }, + { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, + { 26, 43 }, + + /* 338 - 398 */ + { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, + { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, + { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, + { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, + { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, + { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, + { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, + { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, + { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, + { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, + { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, + { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, + { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, + { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, + { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, + { 11, 86 }, + + + }, + + /* i_cabac_init_idc == 1 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, + { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, + { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, + { 10, 54 }, + + /* 24 - 39 */ + { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, + { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, + { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, + { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, + + /* 40 - 53 */ + { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, + { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, + { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, + { -7, 86 },{ -5, 95 }, + + /* 54 - 59 */ + { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, + { -5, 72 },{ 0, 61 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, + { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, + { -24, 102 
}, { -23, 97 }, { -27, 119 }, { -24, 99 }, + { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, + { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, + { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, + { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, + { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, + { 0, 68 }, { -7, 74 }, { -9, 88 }, + + /* 105 -> 165 */ + { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, + { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, + { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, + { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, + { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, + { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, + { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, + { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, + { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, + { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, + { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, + { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, + { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, + { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, + { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, + { 0, 89 }, + + /* 166 - 226 */ + { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, + { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, + { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, + { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, + { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, + { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, + { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, + { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, + { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, + { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, + { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, + { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, + { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, + { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, + { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, + { -10, 116 }, + + /* 227 - 275 */ + { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, + 
{ -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, + { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, + { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, + { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, + { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, + { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, + { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, + { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, + { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, + { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, + { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, + { -4, 78 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, + { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, + { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, + { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, + { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, + { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, + { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, + { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, + { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, + { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, + { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, + { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, + { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, + { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, + { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, + { 18, 50 }, + + /* 338 - 398 */ + { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, + { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, + { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, + { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, + { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, + { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, + { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, + { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, + { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, + { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, + { 
46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, + { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, + { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, + { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, + { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, + { 11, 83 }, + + }, + + /* i_cabac_init_idc == 2 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, + { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, + { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, + { 14, 57 }, + + /* 24 - 39 */ + { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, + { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, + { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, + { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, + + /* 40 - 53 */ + { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, + { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, + { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, + { -3, 90 },{ -1, 101 }, + + /* 54 - 59 */ + { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, + { -7, 50 },{ 1, 60 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, + { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, + { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, + { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, + { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, + { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, + { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, + { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, + { 3, 68 }, { -8, 71 }, { -13, 98 }, + + /* 105 -> 165 */ + { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, + { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, + { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, + { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, + { -1, 69 }, { 0, 62 }, { 
-7, 51 }, { -4, 47 }, + { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, + { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, + { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, + { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, + { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, + { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, + { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, + { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, + { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, + { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, + { -22, 127 }, + + /* 166 - 226 */ + { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, + { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, + { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, + { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, + { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, + { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, + { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, + { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, + { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, + { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, + { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, + { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, + { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, + { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, + { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, + { -24, 127 }, + + /* 227 - 275 */ + { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, + { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, + { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, + { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, + { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, + { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, + { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, + { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, + { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, + { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, + { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, + { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, + { -10, 87 }, + + /* 276 a bit 
special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, + { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, + { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, + { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, + { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, + { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, + { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, + { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, + { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, + { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, + { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, + { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, + { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, + { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, + { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, + { 25, 42 }, + + /* 338 - 398 */ + { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, + { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, + { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, + { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, + { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, + { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, + { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, + { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, + { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, + { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, + { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, + { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, + { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, + { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, + { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, + { 25, 61 }, + } +}; diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am index d7b2bb4f8..eaa8d0f75 100644 --- a/src/libffmpeg/libavcodec/i386/Makefile.am +++ b/src/libffmpeg/libavcodec/i386/Makefile.am @@ -18,7 +18,8 @@ libavcodec_mmx_src = \ motion_est_mmx.c \ mpegvideo_mmx.c \ simple_idct_mmx.c \ - 
vp3dsp_mmx.c + vp3dsp_mmx.c \ + vp3dsp_sse2.c libavcodec_mmx_dummy = libavcodec_mmx_dummy.c diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index 15dc8eec2..772c9c1f0 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -2147,9 +2147,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } /* VP3 optimized DSP functions */ - c->vp3_dsp_init = vp3_dsp_init_mmx; - c->vp3_idct_put = vp3_idct_put_mmx; - c->vp3_idct_add = vp3_idct_add_mmx; + if (mm_flags & MM_SSE2) { + c->vp3_dsp_init = vp3_dsp_init_sse2; + c->vp3_idct_put = vp3_idct_put_sse2; + c->vp3_idct_add = vp3_idct_add_sse2; + } else { + c->vp3_dsp_init = vp3_dsp_init_mmx; + c->vp3_idct_put = vp3_idct_put_mmx; + c->vp3_idct_add = vp3_idct_add_mmx; + } #ifdef CONFIG_ENCODERS c->get_pixels = get_pixels_mmx; diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c index 877160773..7af576971 100644 --- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -50,7 +50,14 @@ static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; -static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; +struct +{ + const long fdct_r_row_sse2[4] ATTR_ALIGN(16); +} fdct_r_row_sse2 ATTR_ALIGN(16)= +{{ + RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW +}}; +//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table 16384, 16384, -8867, -21407, @@ -126,7 +133,12 @@ static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff 6270, 26722, 6270, -17855, }; -static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table 
+struct +{ + const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16); +} tab_frw_01234567_sse2 ATTR_ALIGN(16) = +{{ +//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ C4, C4, C5, C7, C2, C6, C3, -C7, \ -C4, C4, C7, C3, C6, -C2, C7, -C5, \ @@ -252,7 +264,8 @@ TABLE_SSE2 #define C6 12299 #define C7 6270 TABLE_SSE2 -}; +}}; + static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) { @@ -392,7 +405,7 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) "FDCT_ROW_SSE2_H2 80 192 \n\t" "FDCT_ROW_SSE2 80 \n\t" : - : "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) ); } diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h index 7e94cfd9b..ad684bc5a 100644 --- a/src/libffmpeg/libavcodec/i386/mmx.h +++ b/src/libffmpeg/libavcodec/i386/mmx.h @@ -240,4 +240,28 @@ typedef union { #define sfence() __asm__ __volatile__ ("sfence\n\t") +/* SSE2 */ +#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm) +#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm) +#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm) +#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm) + +#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm) + +#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg) +#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var) +#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) +#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) +#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) +#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) + +#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var) + +#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, 
imm, reg) +#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg) + +#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) +#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) + + #endif /* AVCODEC_I386MMX_H */ diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c index 59020466f..76007a1d1 100644 --- a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c +++ b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c @@ -46,213 +46,216 @@ static uint16_t idct_cosine_table[7] = { #define r7 mm7 /* from original comments: The Macro does IDct on 4 1-D Dcts */ -#define BeginIDCT() \ +#define BeginIDCT() { \ movq_m2r(*I(3), r2); \ movq_m2r(*C(3), r6); \ movq_r2r(r2, r4); \ movq_m2r(*J(5), r7); \ - pmulhw_r2r(r6, r4); \ + pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \ movq_m2r(*C(5), r1); \ - pmulhw_r2r(r7, r6); \ + pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \ movq_r2r(r1, r5); \ - pmulhw_r2r(r2, r1); \ + pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \ movq_m2r(*I(1), r3); \ - pmulhw_r2r(r7, r5); \ - movq_m2r(*C(1), r0); \ - paddw_r2r(r2, r4); \ - paddw_r2r(r7, r6); \ - paddw_r2r(r1, r2); \ + pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \ + movq_m2r(*C(1), r0); /* (all registers are in use) */ \ + paddw_r2r(r2, r4); /* r4 = c3*i3 */ \ + paddw_r2r(r7, r6); /* r6 = c3*i5 */ \ + paddw_r2r(r1, r2); /* r2 = c5*i3 */ \ movq_m2r(*J(7), r1); \ - paddw_r2r(r5, r7); \ - movq_r2r(r0, r5); \ - pmulhw_r2r(r3, r0); \ - paddsw_r2r(r7, r4); \ - pmulhw_r2r(r1, r5); \ + paddw_r2r(r5, r7); /* r7 = c5*i5 */ \ + movq_r2r(r0, r5); /* r5 = c1 */ \ + pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \ + paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \ + pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \ movq_m2r(*C(7), r7); \ - psubsw_r2r(r2, r6); \ - paddw_r2r(r3, r0); \ - pmulhw_r2r(r7, r3); \ + psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \ + paddw_r2r(r3, r0); /* r0 = c1*i1 */ \ + pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \ movq_m2r(*I(2), r2); \ - pmulhw_r2r(r1, 
r7); \ - paddw_r2r(r1, r5); \ - movq_r2r(r2, r1); \ - pmulhw_m2r(*C(2), r2); \ - psubsw_r2r(r5, r3); \ + pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \ + paddw_r2r(r1, r5); /* r5 = c1*i7 */ \ + movq_r2r(r2, r1); /* r1 = i2 */ \ + pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \ + psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \ movq_m2r(*J(6), r5); \ - paddsw_r2r(r7, r0); \ - movq_r2r(r5, r7); \ - psubsw_r2r(r4, r0); \ - pmulhw_m2r(*C(2), r5); \ - paddw_r2r(r1, r2); \ - pmulhw_m2r(*C(6), r1); \ - paddsw_r2r(r4, r4); \ - paddsw_r2r(r0, r4); \ - psubsw_r2r(r6, r3); \ - paddw_r2r(r7, r5); \ - paddsw_r2r(r6, r6); \ - pmulhw_m2r(*C(6), r7); \ - paddsw_r2r(r3, r6); \ - movq_r2m(r4, *I(1)); \ - psubsw_r2r(r5, r1); \ + paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \ + movq_r2r(r5, r7); /* r7 = i6 */ \ + psubsw_r2r(r4, r0); /* r0 = A - C */ \ + pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \ + paddw_r2r(r1, r2); /* r2 = c2*i2 */ \ + pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \ + paddsw_r2r(r4, r4); /* r4 = C + C */ \ + paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \ + psubsw_r2r(r6, r3); /* r3 = B - D */ \ + paddw_r2r(r7, r5); /* r5 = c2*i6 */ \ + paddsw_r2r(r6, r6); /* r6 = D + D */ \ + pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \ + paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \ + movq_r2m(r4, *I(1)); /* save C. at I(1) */ \ + psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \ movq_m2r(*C(4), r4); \ - movq_r2r(r3, r5); \ - pmulhw_r2r(r4, r3); \ - paddsw_r2r(r2, r7); \ - movq_r2m(r6, *I(2)); \ - movq_r2r(r0, r2); \ + movq_r2r(r3, r5); /* r5 = B - D */ \ + pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \ + paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \ + movq_r2m(r6, *I(2)); /* save D. at I(2) */ \ + movq_r2r(r0, r2); /* r2 = A - C */ \ movq_m2r(*I(0), r6); \ - pmulhw_r2r(r4, r0); \ - paddw_r2r(r3, r5); \ + pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \ + paddw_r2r(r3, r5); /* r5 = B. 
= c4 * (B - D) */ \ movq_m2r(*J(4), r3); \ - psubsw_r2r(r1, r5); \ - paddw_r2r(r0, r2); \ - psubsw_r2r(r3, r6); \ + psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \ + paddw_r2r(r0, r2); /* r2 = A. = c4 * (A - C) */ \ + psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \ movq_r2r(r6, r0); \ - pmulhw_r2r(r4, r6); \ - paddsw_r2r(r3, r3); \ - paddsw_r2r(r1, r1); \ - paddsw_r2r(r0, r3); \ - paddsw_r2r(r5, r1); \ - pmulhw_r2r(r3, r4); \ - paddsw_r2r(r0, r6); \ - psubsw_r2r(r2, r6); \ - paddsw_r2r(r2, r2); \ - movq_m2r(*I(1), r0); \ - paddsw_r2r(r6, r2); \ - paddw_r2r(r3, r4); \ - psubsw_r2r(r1, r2); + pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \ + paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \ + paddsw_r2r(r1, r1); /* r1 = H + H */ \ + paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \ + paddsw_r2r(r5, r1); /* r1 = H. = B. + H */ \ + pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \ + paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \ + psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \ + paddsw_r2r(r2, r2); /* r2 = A. + A. */ \ + movq_m2r(*I(1), r0); /* r0 = C. */ \ + paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \ + paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \ + psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \ +} /* RowIDCT gets ready to transpose */ -#define RowIDCT() \ +#define RowIDCT() { \ \ - BeginIDCT() \ + BeginIDCT(); \ \ - movq_m2r(*I(2), r3); \ - psubsw_r2r(r7, r4); \ - paddsw_r2r(r1, r1); \ - paddsw_r2r(r7, r7); \ - paddsw_r2r(r2, r1); \ - paddsw_r2r(r4, r7); \ - psubsw_r2r(r3, r4); \ - psubsw_r2r(r5, r6); \ + movq_m2r(*I(2), r3); /* r3 = D. */ \ + psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ + paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ + paddsw_r2r(r7, r7); /* r7 = G + G */ \ + paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ + paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ + psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ + paddsw_r2r(r3, r3); \ + psubsw_r2r(r5, r6); /* r6 = R6 = F. - B..
*/ \ paddsw_r2r(r5, r5); \ - paddsw_r2r(r4, r3); \ - paddsw_r2r(r6, r5); \ - psubsw_r2r(r0, r7); \ + paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ + paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ + psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ paddsw_r2r(r0, r0); \ - movq_r2m(r1, *I(1)); \ - paddsw_r2r(r7, r0); + movq_r2m(r1, *I(1)); /* save R1 */ \ + paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ +} /* Column IDCT normalizes and stores final results */ -#define ColumnIDCT() \ +#define ColumnIDCT() { \ \ - BeginIDCT() \ + BeginIDCT(); \ \ - paddsw_m2r(*Eight, r2); \ - paddsw_r2r(r1, r1); \ - paddsw_r2r(r2, r1); \ - psraw_i2r(4, r2); \ - psubsw_r2r(r7, r4); \ - psraw_i2r(4, r1); \ - movq_m2r(*I(2), r3); \ - paddsw_r2r(r7, r7); \ - movq_r2m(r2, *I(2)); \ - paddsw_r2r(r4, r7); \ - movq_r2m(r1, *I(1)); \ - psubsw_r2r(r3, r4); \ - paddsw_m2r(*Eight, r4); \ - paddsw_r2r(r3, r3); \ - paddsw_r2r(r4, r3); \ - psraw_i2r(4, r4); \ - psubsw_r2r(r5, r6); \ - psraw_i2r(4, r3); \ - paddsw_m2r(*Eight, r6); \ - paddsw_r2r(r5, r5); \ - paddsw_r2r(r6, r5); \ - psraw_i2r(4, r6); \ - movq_r2m(r4, *J(4)); \ - psraw_i2r(4, r5); \ - movq_r2m(r3, *I(3)); \ - psubsw_r2r(r0, r7); \ - paddsw_m2r(*Eight, r7); \ - paddsw_r2r(r0, r0); \ - paddsw_r2r(r7, r0); \ - psraw_i2r(4, r7); \ - movq_r2m(r6, *J(6)); \ - psraw_i2r(4, r0); \ - movq_r2m(r5, *J(5)); \ - movq_r2m(r7, *J(7)); \ - movq_r2m(r0, *I(0)); - + paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \ + paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ + paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ + psraw_i2r(4, r2); /* r2 = NR2 */ \ + psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ + psraw_i2r(4, r1); /* r1 = NR1 */ \ + movq_m2r(*I(2), r3); /* r3 = D. */ \ + paddsw_r2r(r7, r7); /* r7 = G + G */ \ + movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \ + paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ + movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \ + psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. 
*/ \ + paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \ + paddsw_r2r(r3, r3); /* r3 = D. + D. */ \ + paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ + psraw_i2r(4, r4); /* r4 = NR4 */ \ + psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ + psraw_i2r(4, r3); /* r3 = NR3 */ \ + paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \ + paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \ + paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ + psraw_i2r(4, r6); /* r6 = NR6 */ \ + movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \ + psraw_i2r(4, r5); /* r5 = NR5 */ \ + movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \ + psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ + paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \ + paddsw_r2r(r0, r0); /* r0 = C. + C. */ \ + paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ + psraw_i2r(4, r7); /* r7 = NR7 */ \ + movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \ + psraw_i2r(4, r0); /* r0 = NR0 */ \ + movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \ + movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \ + movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \ +} /* Following macro does two 4x4 transposes in place. At entry (we assume): - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 + r0 = a3 a2 a1 a0 + I(1) = b3 b2 b1 b0 + r2 = c3 c2 c1 c0 + r3 = d3 d2 d1 d0 - At exit, we have: + r4 = e3 e2 e1 e0 + r5 = f3 f2 f1 f0 + r6 = g3 g2 g1 g0 + r7 = h3 h2 h1 h0 - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 + At exit, we have: - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 + I(0) = d0 c0 b0 a0 + I(1) = d1 c1 b1 a1 + I(2) = d2 c2 b2 a2 + I(3) = d3 c3 b3 a3 + + J(4) = h0 g0 f0 e0 + J(5) = h1 g1 f1 e1 + J(6) = h2 g2 f2 e2 + J(7) = h3 g3 f3 e3 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. Since r1 is free at entry, we calculate the Js first. 
*/ -#define Transpose() \ - movq_r2r(r4, r1); \ - punpcklwd_r2r(r5, r4); \ - movq_r2m(r0, *I(0)); \ - punpckhwd_r2r(r5, r1); \ - movq_r2r(r6, r0); \ - punpcklwd_r2r(r7, r6); \ - movq_r2r(r4, r5); \ - punpckldq_r2r(r6, r4); \ - punpckhdq_r2r(r6, r5); \ - movq_r2r(r1, r6); \ +#define Transpose() { \ + movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \ + punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \ + movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \ + punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \ + movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \ + punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \ + movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \ + punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \ + punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \ + movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \ movq_r2m(r4, *J(4)); \ - punpckhwd_r2r(r7, r0); \ + punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \ movq_r2m(r5, *J(5)); \ - punpckhdq_r2r(r0, r6); \ - movq_m2r(*I(0), r4); \ - punpckldq_r2r(r0, r1); \ - movq_m2r(*I(1), r5); \ - movq_r2r(r4, r0); \ + punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \ + movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \ + punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \ + movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \ + movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \ movq_r2m(r6, *J(7)); \ - punpcklwd_r2r(r5, r0); \ + punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \ movq_r2m(r1, *J(6)); \ - punpckhwd_r2r(r5, r4); \ - movq_r2r(r2, r5); \ - punpcklwd_r2r(r3, r2); \ - movq_r2r(r0, r1); \ - punpckldq_r2r(r2, r0); \ - punpckhdq_r2r(r2, r1); \ - movq_r2r(r4, r2); \ + punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \ + movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \ + punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \ + movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \ + punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \ + punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \ + movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \ movq_r2m(r0, *I(0)); \ - punpckhwd_r2r(r3, r5); \ + punpckhwd_r2r(r3, 
r5); /* r5 = d3 c3 d2 c2 */ \ movq_r2m(r1, *I(1)); \ - punpckhdq_r2r(r5, r4); \ - punpckldq_r2r(r5, r2); \ + punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \ + punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \ movq_r2m(r4, *I(3)); \ - movq_r2m(r2, *I(2)); - + movq_r2m(r2, *I(2)); \ +} void vp3_dsp_init_mmx(void) { @@ -263,7 +266,7 @@ void vp3_dsp_init_mmx(void) idct_constants[--j] = 0; } while (j); - idct_constants[0] = idct_constants[5] = + idct_constants[0] = idct_constants[5] = idct_constants[10] = idct_constants[15] = 65535; j = 1; @@ -272,7 +275,7 @@ void vp3_dsp_init_mmx(void) p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1]; } while (++j <= 7); - idct_constants[44] = idct_constants[45] = + idct_constants[44] = idct_constants[45] = idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift; } @@ -292,254 +295,240 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, #define C(x) (idct_constants + 16 + (x - 1) * 4) #define Eight (idct_constants + 44) - movq_m2r(*input_data, r0); - pmullw_m2r(*dequant_matrix, r0); - movq_m2r(*(input_data + 8), r1); - pmullw_m2r(*(dequant_matrix + 8), r1); - movq_m2r(*M(0), r2); - movq_r2r(r0, r3); - movq_m2r(*(input_data + 4), r4); - psrlq_i2r(16, r0); - pmullw_m2r(*(dequant_matrix + 4), r4); - pand_r2r(r2, r3); - movq_r2r(r0, r5); - movq_r2r(r1, r6); - pand_r2r(r2, r5); - psllq_i2r(32, r6); - movq_m2r(*M(3), r7); - pxor_r2r(r5, r0); - pand_r2r(r6, r7); - por_r2r(r3, r0); - pxor_r2r(r7, r6); - por_r2r(r7, r0); - movq_m2r(*M(3), r7); - movq_r2r(r4, r3); - movq_r2m(r0, *output_data); - - pand_r2r(r2, r3); - movq_m2r(*(input_data + 16), r0); - psllq_i2r(16, r3); - pmullw_m2r(*(dequant_matrix + 16), r0); - pand_r2r(r1, r7); - por_r2r(r3, r5); - por_r2r(r6, r7); - movq_m2r(*(input_data + 12), r3); - por_r2r(r5, r7); - pmullw_m2r(*(dequant_matrix + 12), r3); - psrlq_i2r(16, r4); - movq_r2m(r7, *(output_data + 8)); - - movq_r2r(r4, r5); - movq_r2r(r0, r7); - psrlq_i2r(16, r4); - psrlq_i2r(48, r7); - 
movq_r2r(r2, r6); - pand_r2r(r2, r5); - pand_r2r(r4, r6); - movq_r2m(r7, *(output_data + 40)); - - pxor_r2r(r6, r4); - psrlq_i2r(32, r1); - por_r2r(r5, r4); - movq_m2r(*M(3), r7); - pand_r2r(r2, r1); - movq_m2r(*(input_data + 24), r5); - psllq_i2r(16, r0); - pmullw_m2r(*(dequant_matrix + 24), r5); - pand_r2r(r0, r7); - movq_r2m(r1, *(output_data + 32)); - - por_r2r(r4, r7); - movq_r2r(r3, r4); - pand_r2r(r2, r3); - movq_m2r(*M(2), r1); - psllq_i2r(32, r3); - por_r2r(r3, r7); - movq_r2r(r5, r3); - psllq_i2r(48, r3); - pand_r2r(r0, r1); - movq_r2m(r7, *(output_data + 16)); - - por_r2r(r3, r6); - movq_m2r(*M(1), r7); - por_r2r(r1, r6); - movq_m2r(*(input_data + 28), r1); - pand_r2r(r4, r7); - pmullw_m2r(*(dequant_matrix + 28), r1); - por_r2r(r6, r7); - pand_m2r(*M(1), r0); - psrlq_i2r(32, r4); - movq_r2m(r7, *(output_data + 24)); - - movq_r2r(r4, r6); - movq_m2r(*M(3), r7); - pand_r2r(r2, r4); - movq_m2r(*M(1), r3); - pand_r2r(r1, r7); - pand_r2r(r5, r3); - por_r2r(r4, r0); - psllq_i2r(16, r3); - por_r2r(r0, r7); - movq_m2r(*M(2), r4); - por_r2r(r3, r7); - movq_m2r(*(input_data + 40), r0); - movq_r2r(r4, r3); - pmullw_m2r(*(dequant_matrix + 40), r0); - pand_r2r(r5, r4); - movq_r2m(r7, *(output_data + 4)); - - por_r2r(r4, r6); - movq_r2r(r3, r4); - psrlq_i2r(16, r6); - movq_r2r(r0, r7); - pand_r2r(r1, r4); - psllq_i2r(48, r7); - por_r2r(r4, r6); - movq_m2r(*(input_data + 44), r4); - por_r2r(r6, r7); - pmullw_m2r(*(dequant_matrix + 44), r4); - psrlq_i2r(16, r3); - movq_r2m(r7, *(output_data + 12)); - - pand_r2r(r1, r3); - psrlq_i2r(48, r5); - pand_r2r(r2, r1); - movq_m2r(*(input_data + 52), r6); - por_r2r(r3, r5); - pmullw_m2r(*(input_data + 52), r6); - psrlq_i2r(16, r0); - movq_r2r(r4, r7); - movq_r2r(r2, r3); - psllq_i2r(48, r7); - pand_r2r(r0, r3); - pxor_r2r(r3, r0); - psllq_i2r(32, r3); - por_r2r(r5, r7); - movq_r2r(r6, r5); - pand_m2r(*M(1), r6); - por_r2r(r3, r7); - psllq_i2r(32, r6); - por_r2r(r1, r0); - movq_r2m(r7, *(output_data + 20)); - - por_r2r(r6, r0); - 
movq_m2r(*(input_data + 60), r7); - movq_r2r(r5, r6); - pmullw_m2r(*(input_data + 60), r7); - psrlq_i2r(32, r5); - pand_r2r(r2, r6); - movq_r2r(r5, r1); - movq_r2m(r0, *(output_data + 28)); - - pand_r2r(r2, r1); - movq_m2r(*(input_data + 56), r0); - movq_r2r(r7, r3); - pmullw_m2r(*(dequant_matrix + 56), r0); - psllq_i2r(16, r3); - pand_m2r(*M(3), r7); - pxor_r2r(r1, r5); - por_r2r(r5, r6); - movq_r2r(r3, r5); - pand_m2r(*M(3), r5); - por_r2r(r1, r7); - movq_m2r(*(input_data + 48), r1); - pxor_r2r(r5, r3); - pmullw_m2r(*(dequant_matrix + 48), r1); - por_r2r(r3, r7); - por_r2r(r5, r6); - movq_r2r(r0, r5); - movq_r2m(r7, *(output_data + 60)); - - psrlq_i2r(16, r5); - pand_m2r(*M(2), r5); - movq_r2r(r0, r7); - por_r2r(r5, r6); - pand_r2r(r2, r0); - pxor_r2r(r0, r7); - psllq_i2r(32, r0); - movq_r2m(r6, *(output_data + 52)); - - psrlq_i2r(16, r4); - movq_m2r(*(input_data + 36), r5); - psllq_i2r(16, r7); - pmullw_m2r(*(dequant_matrix + 36), r5); - movq_r2r(r7, r6); - movq_m2r(*M(2), r3); - psllq_i2r(16, r6); - pand_m2r(*M(3), r7); - pand_r2r(r1, r3); - por_r2r(r0, r7); - movq_r2r(r1, r0); - pand_m2r(*M(3), r1); - por_r2r(r3, r6); - movq_r2r(r4, r3); - psrlq_i2r(32, r1); - pand_r2r(r2, r3); - por_r2r(r1, r7); - por_r2r(r3, r7); - movq_r2r(r4, r3); - pand_m2r(*M(1), r3); - movq_r2r(r5, r1); - movq_r2m(r7, *(output_data + 44)); - - psrlq_i2r(48, r5); - movq_m2r(*(input_data + 32), r7); - por_r2r(r3, r6); - pmullw_m2r(*(dequant_matrix + 32), r7); - por_r2r(r5, r6); - pand_m2r(*M(2), r4); - psllq_i2r(32, r0); - movq_r2m(r6, *(output_data + 36)); - - movq_r2r(r0, r6); - pand_m2r(*M(3), r0); - psllq_i2r(16, r6); - movq_m2r(*(input_data + 20), r5); - movq_r2r(r1, r3); - pmullw_m2r(*(dequant_matrix + 40), r5); - psrlq_i2r(16, r1); - pand_m2r(*M(1), r1); - por_r2r(r4, r0); - pand_r2r(r7, r2); - por_r2r(r1, r0); - por_r2r(r2, r0); - psllq_i2r(16, r3); - movq_r2r(r3, r4); - movq_r2r(r5, r2); - movq_r2m(r0, *(output_data + 56)); - - psrlq_i2r(48, r2); - pand_m2r(*M(2), r4); - 
por_r2r(r2, r6); - movq_m2r(*M(1), r2); - por_r2r(r4, r6); - pand_r2r(r7, r2); - psllq_i2r(32, r3); - por_m2r(*(output_data + 40), r3); - - por_r2r(r2, r6); - movq_m2r(*M(3), r2); - psllq_i2r(16, r5); - movq_r2m(r6, *(output_data + 48)); - - pand_r2r(r5, r2); - movq_m2r(*M(2), r6); - pxor_r2r(r2, r5); - pand_r2r(r7, r6); - psrlq_i2r(32, r2); - pand_m2r(*M(3), r7); - por_r2r(r2, r3); - por_m2r(*(output_data + 32), r7); - - por_r2r(r3, r6); - por_r2r(r5, r7); - movq_r2m(r6, *(output_data + 40)); - movq_r2m(r7, *(output_data + 32)); + unsigned char *input_bytes = (unsigned char *)input_data; + unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix; + unsigned char *output_data_bytes = (unsigned char *)output_data; + + movq_m2r(*(input_bytes), r0); + pmullw_m2r(*(dequant_matrix_bytes), r0); /* r0 = 03 02 01 00 */ + movq_m2r(*(input_bytes+16), r1); + pmullw_m2r(*(dequant_matrix_bytes+16), r1); /* r1 = 13 12 11 10 */ + movq_m2r(*M(0), r2); /* r2 = __ __ __ FF */ + movq_r2r(r0, r3); /* r3 = 03 02 01 00 */ + movq_m2r(*(input_bytes+8), r4); + psrlq_i2r(16, r0); /* r0 = __ 03 02 01 */ + pmullw_m2r(*(dequant_matrix_bytes+8), r4); /* r4 = 07 06 05 04 */ + pand_r2r(r2, r3); /* r3 = __ __ __ 00 */ + movq_r2r(r0, r5); /* r5 = __ 03 02 01 */ + movq_r2r(r1, r6); /* r6 = 13 12 11 10 */ + pand_r2r(r2, r5); /* r5 = __ __ __ 01 */ + psllq_i2r(32, r6); /* r6 = 11 10 __ __ */ + movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */ + pxor_r2r(r5, r0); /* r0 = __ 03 02 __ */ + pand_r2r(r6, r7); /* r7 = 11 __ __ __ */ + por_r2r(r3, r0); /* r0 = __ 03 02 00 */ + pxor_r2r(r7, r6); /* r6 = __ 10 __ __ */ + por_r2r(r7, r0); /* r0 = 11 03 02 00 = R0 */ + movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */ + movq_r2r(r4, r3); /* r3 = 07 06 05 04 */ + movq_r2m(r0, *(output_data_bytes)); /* write R0 = r0 */ + pand_r2r(r2, r3); /* r3 = __ __ __ 04 */ + movq_m2r(*(input_bytes+32), r0); + psllq_i2r(16, r3); /* r3 = __ __ 04 __ */ + pmullw_m2r(*(dequant_matrix_bytes+32), r0); /* r0 = 23 22 21 20 */ + 
pand_r2r(r1, r7); /* r7 = 13 __ __ __ */ + por_r2r(r3, r5); /* r5 = __ __ 04 01 */ + por_r2r(r6, r7); /* r7 = 13 10 __ __ */ + movq_m2r(*(input_bytes+24), r3); + por_r2r(r5, r7); /* r7 = 13 10 04 01 = R1 */ + pmullw_m2r(*(dequant_matrix_bytes+24), r3); /* r3 = 17 16 15 14 */ + psrlq_i2r(16, r4); /* r4 = __ 07 06 05 */ + movq_r2m(r7, *(output_data_bytes+16)); /* write R1 = r7 */ + movq_r2r(r4, r5); /* r5 = __ 07 06 05 */ + movq_r2r(r0, r7); /* r7 = 23 22 21 20 */ + psrlq_i2r(16, r4); /* r4 = __ __ 07 06 */ + psrlq_i2r(48, r7); /* r7 = __ __ __ 23 */ + movq_r2r(r2, r6); /* r6 = __ __ __ FF */ + pand_r2r(r2, r5); /* r5 = __ __ __ 05 */ + pand_r2r(r4, r6); /* r6 = __ __ __ 06 */ + movq_r2m(r7, *(output_data_bytes+80)); /* partial R9 = __ __ __ 23 */ + pxor_r2r(r6, r4); /* r4 = __ __ 07 __ */ + psrlq_i2r(32, r1); /* r1 = __ __ 13 12 */ + por_r2r(r5, r4); /* r4 = __ __ 07 05 */ + movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */ + pand_r2r(r2, r1); /* r1 = __ __ __ 12 */ + movq_m2r(*(input_bytes+48), r5); + psllq_i2r(16, r0); /* r0 = 22 21 20 __ */ + pmullw_m2r(*(dequant_matrix_bytes+48), r5); /* r5 = 33 32 31 30 */ + pand_r2r(r0, r7); /* r7 = 22 __ __ __ */ + movq_r2m(r1, *(output_data_bytes+64)); /* partial R8 = __ __ __ 12 */ + por_r2r(r4, r7); /* r7 = 22 __ 07 05 */ + movq_r2r(r3, r4); /* r4 = 17 16 15 14 */ + pand_r2r(r2, r3); /* r3 = __ __ __ 14 */ + movq_m2r(*M(2), r1); /* r1 = __ FF __ __ */ + psllq_i2r(32, r3); /* r3 = __ 14 __ __ */ + por_r2r(r3, r7); /* r7 = 22 14 07 05 = R2 */ + movq_r2r(r5, r3); /* r3 = 33 32 31 30 */ + psllq_i2r(48, r3); /* r3 = 30 __ __ __ */ + pand_r2r(r0, r1); /* r1 = __ 21 __ __ */ + movq_r2m(r7, *(output_data_bytes+32)); /* write R2 = r7 */ + por_r2r(r3, r6); /* r6 = 30 __ __ 06 */ + movq_m2r(*M(1), r7); /* r7 = __ __ FF __ */ + por_r2r(r1, r6); /* r6 = 30 21 __ 06 */ + movq_m2r(*(input_bytes+56), r1); + pand_r2r(r4, r7); /* r7 = __ __ 15 __ */ + pmullw_m2r(*(dequant_matrix_bytes+56), r1); /* r1 = 37 36 35 34 */ + por_r2r(r6, r7); /* r7 = 30 
21 15 06 = R3 */ + pand_m2r(*M(1), r0); /* r0 = __ __ 20 __ */ + psrlq_i2r(32, r4); /* r4 = __ __ 17 16 */ + movq_r2m(r7, *(output_data_bytes+48)); /* write R3 = r7 */ + movq_r2r(r4, r6); /* r6 = __ __ 17 16 */ + movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */ + pand_r2r(r2, r4); /* r4 = __ __ __ 16 */ + movq_m2r(*M(1), r3); /* r3 = __ __ FF __ */ + pand_r2r(r1, r7); /* r7 = 37 __ __ __ */ + pand_r2r(r5, r3); /* r3 = __ __ 31 __ */ + por_r2r(r4, r0); /* r0 = __ __ 20 16 */ + psllq_i2r(16, r3); /* r3 = __ 31 __ __ */ + por_r2r(r0, r7); /* r7 = 37 __ 20 16 */ + movq_m2r(*M(2), r4); /* r4 = __ FF __ __ */ + por_r2r(r3, r7); /* r7 = 37 31 20 16 = R4 */ + movq_m2r(*(input_bytes+80), r0); + movq_r2r(r4, r3); /* r3 = __ __ FF __ */ + pmullw_m2r(*(dequant_matrix_bytes+80), r0); /* r0 = 53 52 51 50 */ + pand_r2r(r5, r4); /* r4 = __ 32 __ __ */ + movq_r2m(r7, *(output_data_bytes+8)); /* write R4 = r7 */ + por_r2r(r4, r6); /* r6 = __ 32 17 16 */ + movq_r2r(r3, r4); /* r4 = __ FF __ __ */ + psrlq_i2r(16, r6); /* r6 = __ __ 32 17 */ + movq_r2r(r0, r7); /* r7 = 53 52 51 50 */ + pand_r2r(r1, r4); /* r4 = __ 36 __ __ */ + psllq_i2r(48, r7); /* r7 = 50 __ __ __ */ + por_r2r(r4, r6); /* r6 = __ 36 32 17 */ + movq_m2r(*(input_bytes+88), r4); + por_r2r(r6, r7); /* r7 = 50 36 32 17 = R5 */ + pmullw_m2r(*(dequant_matrix_bytes+88), r4); /* r4 = 57 56 55 54 */ + psrlq_i2r(16, r3); /* r3 = __ __ FF __ */ + movq_r2m(r7, *(output_data_bytes+24)); /* write R5 = r7 */ + pand_r2r(r1, r3); /* r3 = __ __ 35 __ */ + psrlq_i2r(48, r5); /* r5 = __ __ __ 33 */ + pand_r2r(r2, r1); /* r1 = __ __ __ 34 */ + movq_m2r(*(input_bytes+104), r6); + por_r2r(r3, r5); /* r5 = __ __ 35 33 */ + pmullw_m2r(*(dequant_matrix_bytes+104), r6); /* r6 = 67 66 65 64 */ + psrlq_i2r(16, r0); /* r0 = __ 53 52 51 */ + movq_r2r(r4, r7); /* r7 = 57 56 55 54 */ + movq_r2r(r2, r3); /* r3 = __ __ __ FF */ + psllq_i2r(48, r7); /* r7 = 54 __ __ __ */ + pand_r2r(r0, r3); /* r3 = __ __ __ 51 */ + pxor_r2r(r3, r0); /* r0 = __ 53 52 __ */ + 
psllq_i2r(32, r3); /* r3 = __ 51 __ __ */ + por_r2r(r5, r7); /* r7 = 54 __ 35 33 */ + movq_r2r(r6, r5); /* r5 = 67 66 65 64 */ + pand_m2r(*M(1), r6); /* r6 = __ __ 65 __ */ + por_r2r(r3, r7); /* r7 = 54 51 35 33 = R6 */ + psllq_i2r(32, r6); /* r6 = 65 __ __ __ */ + por_r2r(r1, r0); /* r0 = __ 53 52 34 */ + movq_r2m(r7, *(output_data_bytes+40)); /* write R6 = r7 */ + por_r2r(r6, r0); /* r0 = 65 53 52 34 = R7 */ + movq_m2r(*(input_bytes+120), r7); + movq_r2r(r5, r6); /* r6 = 67 66 65 64 */ + pmullw_m2r(*(dequant_matrix_bytes+120), r7); /* r7 = 77 76 75 74 */ + psrlq_i2r(32, r5); /* r5 = __ __ 67 66 */ + pand_r2r(r2, r6); /* r6 = __ __ __ 64 */ + movq_r2r(r5, r1); /* r1 = __ __ 67 66 */ + movq_r2m(r0, *(output_data_bytes+56)); /* write R7 = r0 */ + pand_r2r(r2, r1); /* r1 = __ __ __ 66 */ + movq_m2r(*(input_bytes+112), r0); + movq_r2r(r7, r3); /* r3 = 77 76 75 74 */ + pmullw_m2r(*(dequant_matrix_bytes+112), r0); /* r0 = 73 72 71 70 */ + psllq_i2r(16, r3); /* r3 = 76 75 74 __ */ + pand_m2r(*M(3), r7); /* r7 = 77 __ __ __ */ + pxor_r2r(r1, r5); /* r5 = __ __ 67 __ */ + por_r2r(r5, r6); /* r6 = __ __ 67 64 */ + movq_r2r(r3, r5); /* r5 = 76 75 74 __ */ + pand_m2r(*M(3), r5); /* r5 = 76 __ __ __ */ + por_r2r(r1, r7); /* r7 = 77 __ __ 66 */ + movq_m2r(*(input_bytes+96), r1); + pxor_r2r(r5, r3); /* r3 = __ 75 74 __ */ + pmullw_m2r(*(dequant_matrix_bytes+96), r1); /* r1 = 63 62 61 60 */ + por_r2r(r3, r7); /* r7 = 77 75 74 66 = R15 */ + por_r2r(r5, r6); /* r6 = 76 __ 67 64 */ + movq_r2r(r0, r5); /* r5 = 73 72 71 70 */ + movq_r2m(r7, *(output_data_bytes+120)); /* store R15 = r7 */ + psrlq_i2r(16, r5); /* r5 = __ 73 72 71 */ + pand_m2r(*M(2), r5); /* r5 = __ 73 __ __ */ + movq_r2r(r0, r7); /* r7 = 73 72 71 70 */ + por_r2r(r5, r6); /* r6 = 76 73 67 64 = R14 */ + pand_r2r(r2, r0); /* r0 = __ __ __ 70 */ + pxor_r2r(r0, r7); /* r7 = 73 72 71 __ */ + psllq_i2r(32, r0); /* r0 = __ 70 __ __ */ + movq_r2m(r6, *(output_data_bytes+104)); /* write R14 = r6 */ + psrlq_i2r(16, r4); /* r4 = 
__ 57 56 55 */ + movq_m2r(*(input_bytes+72), r5); + psllq_i2r(16, r7); /* r7 = 72 71 __ __ */ + pmullw_m2r(*(dequant_matrix_bytes+72), r5); /* r5 = 47 46 45 44 */ + movq_r2r(r7, r6); /* r6 = 72 71 __ __ */ + movq_m2r(*M(2), r3); /* r3 = __ FF __ __ */ + psllq_i2r(16, r6); /* r6 = 71 __ __ __ */ + pand_m2r(*M(3), r7); /* r7 = 72 __ __ __ */ + pand_r2r(r1, r3); /* r3 = __ 62 __ __ */ + por_r2r(r0, r7); /* r7 = 72 70 __ __ */ + movq_r2r(r1, r0); /* r0 = 63 62 61 60 */ + pand_m2r(*M(3), r1); /* r1 = 63 __ __ __ */ + por_r2r(r3, r6); /* r6 = 71 62 __ __ */ + movq_r2r(r4, r3); /* r3 = __ 57 56 55 */ + psrlq_i2r(32, r1); /* r1 = __ __ 63 __ */ + pand_r2r(r2, r3); /* r3 = __ __ __ 55 */ + por_r2r(r1, r7); /* r7 = 72 70 63 __ */ + por_r2r(r3, r7); /* r7 = 72 70 63 55 = R13 */ + movq_r2r(r4, r3); /* r3 = __ 57 56 55 */ + pand_m2r(*M(1), r3); /* r3 = __ __ 56 __ */ + movq_r2r(r5, r1); /* r1 = 47 46 45 44 */ + movq_r2m(r7, *(output_data_bytes+88)); /* write R13 = r7 */ + psrlq_i2r(48, r5); /* r5 = __ __ __ 47 */ + movq_m2r(*(input_bytes+64), r7); + por_r2r(r3, r6); /* r6 = 71 62 56 __ */ + pmullw_m2r(*(dequant_matrix_bytes+64), r7); /* r7 = 43 42 41 40 */ + por_r2r(r5, r6); /* r6 = 71 62 56 47 = R12 */ + pand_m2r(*M(2), r4); /* r4 = __ 57 __ __ */ + psllq_i2r(32, r0); /* r0 = 61 60 __ __ */ + movq_r2m(r6, *(output_data_bytes+72)); /* write R12 = r6 */ + movq_r2r(r0, r6); /* r6 = 61 60 __ __ */ + pand_m2r(*M(3), r0); /* r0 = 61 __ __ __ */ + psllq_i2r(16, r6); /* r6 = 60 __ __ __ */ + movq_m2r(*(input_bytes+40), r5); + movq_r2r(r1, r3); /* r3 = 47 46 45 44 */ + pmullw_m2r(*(dequant_matrix_bytes+40), r5); /* r5 = 27 26 25 24 */ + psrlq_i2r(16, r1); /* r1 = __ 47 46 45 */ + pand_m2r(*M(1), r1); /* r1 = __ __ 46 __ */ + por_r2r(r4, r0); /* r0 = 61 57 __ __ */ + pand_r2r(r7, r2); /* r2 = __ __ __ 40 */ + por_r2r(r1, r0); /* r0 = 61 57 46 __ */ + por_r2r(r2, r0); /* r0 = 61 57 46 40 = R11 */ + psllq_i2r(16, r3); /* r3 = 46 45 44 __ */ + movq_r2r(r3, r4); /* r4 = 46 45 44 __ */ + 
movq_r2r(r5, r2); /* r2 = 27 26 25 24 */ + movq_r2m(r0, *(output_data_bytes+112)); /* write R11 = r0 */ + psrlq_i2r(48, r2); /* r2 = __ __ __ 27 */ + pand_m2r(*M(2), r4); /* r4 = __ 45 __ __ */ + por_r2r(r2, r6); /* r6 = 60 __ __ 27 */ + movq_m2r(*M(1), r2); /* r2 = __ __ FF __ */ + por_r2r(r4, r6); /* r6 = 60 45 __ 27 */ + pand_r2r(r7, r2); /* r2 = __ __ 41 __ */ + psllq_i2r(32, r3); /* r3 = 44 __ __ __ */ + por_m2r(*(output_data_bytes+80), r3); /* r3 = 44 __ __ 23 */ + por_r2r(r2, r6); /* r6 = 60 45 41 27 = R10 */ + movq_m2r(*M(3), r2); /* r2 = FF __ __ __ */ + psllq_i2r(16, r5); /* r5 = 26 25 24 __ */ + movq_r2m(r6, *(output_data_bytes+96)); /* store R10 = r6 */ + pand_r2r(r5, r2); /* r2 = 26 __ __ __ */ + movq_m2r(*M(2), r6); /* r6 = __ FF __ __ */ + pxor_r2r(r2, r5); /* r5 = __ 25 24 __ */ + pand_r2r(r7, r6); /* r6 = __ 42 __ __ */ + psrlq_i2r(32, r2); /* r2 = __ __ 26 __ */ + pand_m2r(*M(3), r7); /* r7 = 43 __ __ __ */ + por_r2r(r2, r3); /* r3 = 44 __ 26 23 */ + por_m2r(*(output_data_bytes+64), r7); /* r7 = 43 __ __ 12 */ + por_r2r(r3, r6); /* r6 = 44 42 26 23 = R9 */ + por_r2r(r5, r7); /* r7 = 43 25 24 12 = R8 */ + movq_r2m(r6, *(output_data_bytes+80)); /* store R9 = r6 */ + movq_r2m(r7, *(output_data_bytes+64)); /* store R8 = r7 */ #undef M - /* at this point, function has completed dequantization + dezigzag + + /* at this point, function has completed dequantization + dezigzag + * partial transposition; now do the idct itself */ #define I(K) (output_data + K * 8) diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c new file mode 100644 index 000000000..c8f9158af --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c @@ -0,0 +1,890 @@ +/* + * Copyright (C) 2004 the ffmpeg project + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/** + * @file vp3dsp_sse2.c + * SSE2-optimized functions cribbed from the original VP3 source code. + */ + +#include "../dsputil.h" +#include "mmx.h" +/* Seven 128-bit word-select masks (eight 16-bit lanes per row); the trailing comment on each row lists the lanes from most- to least-significant. */ +static unsigned short __align16 SSE2_dequant_const[] = +{ + 0,65535,65535,0,0,0,0,0, // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000 + 0,0,0,0,65535,65535,0,0, // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000 + 65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF + 0,0,0,65535,0,0,0,0, // 0x0000 0000 0000 0000 FFFF 0000 0000 0000 + 0,0,0,65535,65535,0,0,0, // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000 + 65535,0,0,0,0,65535,0,0, // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF + 0,0,65535,65535, 0,0,0,0 // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000 +}; +/* The 16-bit constant 8 in all eight lanes of one 128-bit vector (each 32-bit entry packs two lanes). */ +static unsigned int __align16 eight_data[] = +{ + 0x00080008, + 0x00080008, + 0x00080008, + 0x00080008 +}; +/* Seven rows, each a single 16-bit constant broadcast to eight lanes; row k-1 holds round(65536 * cos(k*pi/16)) for k = 1..7. */ +static unsigned short __align16 SSE2_idct_data[7 * 8] = +{ + 64277,64277,64277,64277,64277,64277,64277,64277, + 60547,60547,60547,60547,60547,60547,60547,60547, + 54491,54491,54491,54491,54491,54491,54491,54491, + 46341,46341,46341,46341,46341,46341,46341,46341, + 36410,36410,36410,36410,36410,36410,36410,36410, + 25080,25080,25080,25080,25080,25080,25080,25080, + 12785,12785,12785,12785,12785,12785,12785,12785 +}; + +/* One 1-D IDCT pass over the eight 16-bit vectors I(0)..I(7): each result has *Eight added and is arithmetically shifted right by 4 before being stored to O(0)..O(7). I(1) and I(2) are overwritten as scratch for C. and D. Where a pmulhw result is annotated 'cN * x - x', the following paddw adds x back. NOTE(review): presumably C(N) holds a constant above 32767 that the signed multiply treats as cN - 65536 -- confirm against the C() definition, which is outside this chunk. */ +#define SSE2_Column_IDCT() { \ + \ + movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \ + movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \ + \ + movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \ + movdqu_m2r(*I(5), xmm7); /* xmm7
= i5 */ \ + \ + pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \ + movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \ + \ + pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \ + movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \ + \ + pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \ + movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \ + \ + pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \ + movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \ + \ + /* all registers are in use */ \ + \ + paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \ + paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \ + \ + paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \ + movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \ + \ + paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \ + movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \ + \ + pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \ + paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \ + \ + pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \ + movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \ + \ + psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \ + paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \ + \ + pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \ + movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \ + \ + pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \ + paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \ + \ + movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \ + pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \ + \ + psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \ + movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \ + \ + paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \ + movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \ + \ + psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \ + pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \ + \ + paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \ + pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \ + \ + paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \ + paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C.
*/ \ + \ + psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \ + paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \ + \ + paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \ + pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \ + \ + paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \ + movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \ + \ + psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \ + movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \ + \ + movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \ + pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ + \ + paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \ + movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \ + \ + movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \ + movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \ + \ + pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ + paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \ + \ + movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \ + psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \ + \ + paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \ + psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \ + \ + movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \ + pmulhw_r2r(xmm4, xmm6); /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \ + \ + paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \ + paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \ + \ + paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \ + paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \ + \ + pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ + paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \ + \ + psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \ + paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \ + \ + movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \ + paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \ + \ + paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \ + psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H.
= R2 */ \ + \ + paddsw_m2r(*Eight, xmm2); /* Adjust R2 and R1 before shifting */ \ + paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \ + \ + paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \ + psraw_i2r(4, xmm2); /* xmm2 = op2 */ \ + \ + psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \ + psraw_i2r(4, xmm1); /* xmm1 = op1 */ \ + \ + movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \ + paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \ + \ + movdqu_r2m(xmm2, *O(2)); /* Write out op2 */ \ + paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \ + \ + movdqu_r2m(xmm1, *O(1)); /* Write out op1 */ \ + psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \ + \ + paddsw_m2r(*Eight, xmm4); /* Adjust R4 and R3 before shifting */ \ + paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \ + \ + paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \ + psraw_i2r(4, xmm4); /* xmm4 = op4 */ \ + \ + psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \ + psraw_i2r(4, xmm3); /* xmm3 = op3 */ \ + \ + paddsw_m2r(*Eight, xmm6); /* Adjust R6 and R5 before shifting */ \ + paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \ + \ + paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \ + psraw_i2r(4, xmm6); /* xmm6 = op6 */ \ + \ + movdqu_r2m(xmm4, *O(4)); /* Write out op4 */ \ + psraw_i2r(4, xmm5); /* xmm5 = op5 */ \ + \ + movdqu_r2m(xmm3, *O(3)); /* Write out op3 */ \ + psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \ + \ + paddsw_m2r(*Eight, xmm7); /* Adjust R7 and R0 before shifting */ \ + paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \ + \ + paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C.
*/ \ + psraw_i2r(4, xmm7); /* xmm7 = op7 */ \ + \ + movdqu_r2m(xmm6, *O(6)); /* Write out op6 */ \ + psraw_i2r(4, xmm0); /* xmm0 = op0 */ \ + \ + movdqu_r2m(xmm5, *O(5)); /* Write out op5 */ \ + movdqu_r2m(xmm7, *O(7)); /* Write out op7 */ \ + \ + movdqu_r2m(xmm0, *O(0)); /* Write out op0 */ \ + \ +} /* End of SSE2_Column_IDCT macro */ + +/* Same instruction schedule as SSE2_Column_IDCT, but with no *Eight bias and no right shift: the eight results are written back in place to I(0)..I(7). */ +#define SSE2_Row_IDCT() { \ + \ + movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \ + movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \ + \ + movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \ + movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \ + \ + pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \ + movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \ + \ + pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \ + movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \ + \ + pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \ + movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \ + \ + pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \ + movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \ + \ + /* all registers are in use */ \ + \ + paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \ + paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \ + \ + paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \ + movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \ + \ + paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \ + movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \ + \ + pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \ + paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \ + \ + pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \ + movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \ + \ + psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \ + paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \ + \ + pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \ + movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \ + \ + pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \ + paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \ + \ + movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \ + pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \ + \ + psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7
= B */ \ + movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \ + \ + paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \ + movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \ + \ + psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \ + pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \ + \ + paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \ + pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \ + \ + paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \ + paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \ + \ + psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \ + paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \ + \ + paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \ + pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \ + \ + paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \ + movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \ + \ + psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \ + movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \ + \ + movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \ + pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ + \ + paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \ + movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \ + \ + movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \ + movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \ + \ + pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ + paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \ + \ + movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \ + psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \ + \ + paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \ + psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \ + \ + movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \ + pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \ + \ + paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \ + paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \ + \ + paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \ + paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H.
*/ \ + \ + pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ + paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \ + \ + psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \ + paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \ + \ + movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \ + paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \ + \ + paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \ + psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \ + \ + paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \ + paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \ + \ + psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \ + \ + movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \ + paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \ + \ + movdqu_r2m(xmm2, *I(2)); /* Write out op2 */ \ + paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \ + \ + movdqu_r2m(xmm1, *I(1)); /* Write out op1 */ \ + psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \ + \ + paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \ + \ + paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \ + \ + psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \ + \ + paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \ + \ + paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \ + \ + movdqu_r2m(xmm4, *I(4)); /* Write out op4 */ \ + \ + movdqu_r2m(xmm3, *I(3)); /* Write out op3 */ \ + psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \ + \ + paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \ + \ + paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C.
*/ \ + \ + movdqu_r2m(xmm6, *I(6)); /* Write out op6 */ \ + \ + movdqu_r2m(xmm5, *I(5)); /* Write out op5 */ \ + movdqu_r2m(xmm7, *I(7)); /* Write out op7 */ \ + \ + movdqu_r2m(xmm0, *I(0)); /* Write out op0 */ \ + \ +} /* End of SSE2_Row_IDCT macro */ + +/* In-place transpose of the 8x8 block of 16-bit words held in I(0)..I(7) (rows a..h in the lane comments), built from word, dword and qword unpacks; I(6) doubles as a spill slot part-way through. */ +#define SSE2_Transpose() { \ + \ + movdqu_m2r(*I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */ \ + movdqu_m2r(*I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */ \ + \ + movdqu_r2r(xmm4, xmm5); /* make a copy */ \ + punpcklwd_r2r(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */ \ + \ + punpckhwd_r2r(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */ \ + movdqu_m2r(*I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */ \ + \ + movdqu_m2r(*I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */ \ + movdqu_r2r(xmm6, xmm7); /* make a copy */ \ + \ + punpcklwd_r2r(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */ \ + punpckhwd_r2r(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */ \ + \ + movdqu_r2r(xmm4, xmm3); /* make a copy */ \ + punpckldq_r2r(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */ \ + \ + punpckhdq_r2r(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */ \ + movdqu_r2m(xmm3, *I(6)); /* save h3g3f3e3h2g2f2e2 */ \ + /* Free xmm6 */ \ + movdqu_r2r(xmm5, xmm6); /* make a copy */ \ + punpckldq_r2r(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */ \ + \ + punpckhdq_r2r(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */ \ + movdqu_m2r(*I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */ \ + /* Free xmm7 */ \ + movdqu_m2r(*I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */ \ + movdqu_r2r(xmm0, xmm7); /* make a copy */ \ + \ + punpcklwd_r2r(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */ \ + punpckhwd_r2r(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */ \ + /* Free xmm1 */ \ + movdqu_m2r(*I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */ \ + movdqu_m2r(*I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */ \ + \ + movdqu_r2r(xmm2, xmm1); /* make a copy */ \ + punpcklwd_r2r(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */ \ + \ + punpckhwd_r2r(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */ \ + movdqu_r2r(xmm0, xmm3); /* make a copy */ \ + \ + punpckldq_r2r(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */ \ +
punpckhdq_r2r(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */ \ + /* Free xmm2 */ \ + movdqu_r2r(xmm7, xmm2); /* make a copy */ \ + punpckldq_r2r(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */ \ + \ + punpckhdq_r2r(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */ \ + movdqu_r2r(xmm0, xmm1); /* make a copy */ \ + \ + punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \ + punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \ + \ + movdqu_r2m(xmm0, *I(0)); /* save I(0) */ \ + movdqu_r2m(xmm1, *I(1)); /* save I(1) */ \ + \ + movdqu_m2r(*I(6), xmm0); /* load h3g3f3e3h2g2f2e2 */ \ + movdqu_r2r(xmm3, xmm1); /* make a copy */ \ + \ + punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \ + punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \ + \ + movdqu_r2r(xmm2, xmm4); /* make a copy */ \ + punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \ + \ + punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \ + movdqu_r2m(xmm1, *I(2)); /* save I(2) */ \ + \ + movdqu_r2m(xmm3, *I(3)); /* save I(3) */ \ + movdqu_r2m(xmm4, *I(4)); /* save I(4) */ \ + \ + movdqu_r2m(xmm2, *I(5)); /* save I(5) */ \ + movdqu_r2r(xmm7, xmm5); /* make a copy */ \ + \ + punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \ + punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */ \ + \ + movdqu_r2m(xmm5, *I(6)); /* save I(6) */ \ + movdqu_r2m(xmm7, *I(7)); /* save I(7) */ \ + \ +} /* End of Transpose Macro */ + + +#define SSE2_Dequantize() { \ + movdqu_m2r(*(eax), xmm0); \ + \ + pmullw_m2r(*(ebx), xmm0); /* xmm0 = 07 06 05 04 03 02 01 00 */ \ + movdqu_m2r(*(eax + 16), xmm1); \ + \ + pmullw_m2r(*(ebx + 16), xmm1); /* xmm1 = 17 16 15 14 13 12 11 10 */ \ + pshuflw_r2r(xmm0, xmm3, 0x078); /* xmm3 = 07 06 05 04 01 03 02 00 */ \ + \ + movdqu_r2r(xmm1, xmm2); /* xmm2 = 17 16 15 14 13 12 11 10 */ \ + movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ + \ + movdqu_m2r(*(eax + 32), xmm4); \ + movdqu_m2r(*(eax + 64), xmm5); \ + \ + pmullw_m2r(*(ebx + 32), xmm4); /* xmm4 = 27 26 25 24 23 22 21
20 */ \ + pmullw_m2r(*(ebx + 64), xmm5); /* xmm5 = 47 46 45 44 43 42 41 40 */ \ + \ + movdqu_m2r(*(ecx + 16), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \ + pand_r2r(xmm2, xmm7); /* xmm7 = -- -- -- -- -- 12 11 -- */ \ + \ + pand_r2r(xmm4, xmm6); /* xmm6 = -- -- 25 24 -- -- -- -- */ \ + pxor_r2r(xmm7, xmm2); /* xmm2 = 17 16 15 14 13 -- -- 10 */ \ + \ + pxor_r2r(xmm6, xmm4); /* xmm4 = 27 26 -- -- 23 22 21 20 */ \ + pslldq_i2r(4, xmm7); /* xmm7 = -- -- -- 12 11 -- -- -- */ \ + \ + pslldq_i2r(2, xmm6); /* xmm6 = -- 25 24 -- -- -- -- -- */ \ + por_r2r(xmm6, xmm7); /* xmm7 = -- 25 24 12 11 -- -- -- */ \ + \ + movdqu_m2r(*(ecx + 32), xmm0); /* xmm0 = -- -- -- -- -- FF FF FF */ \ + movdqu_m2r(*(ecx + 48), xmm6); /* xmm6 = -- -- -- -- FF -- -- -- */ \ + \ + pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- -- -- 03 02 00 */ \ + pand_r2r(xmm5, xmm6); /* xmm6 = -- -- -- -- 43 -- -- -- */ \ + \ + pxor_r2r(xmm0, xmm3); /* xmm3 = 07 06 05 04 01 -- -- -- */ \ + pxor_r2r(xmm6, xmm5); /* xmm5 = 47 46 45 44 -- 42 41 40 */ \ + \ + por_r2r(xmm7, xmm0); /* xmm0 = -- 25 24 12 11 03 02 00 */ \ + pslldq_i2r(8, xmm6); /* xmm6 = 43 -- -- -- -- -- -- -- */ \ + \ + por_r2r(xmm6, xmm0); /* xmm0 = 43 25 24 12 11 03 02 00 */ \ + /* 02345 in use */ \ + \ + movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ + pshuflw_r2r(xmm5, xmm5, 0x0B4); /* xmm5 = 47 46 45 44 42 -- 41 40 */ \ + \ + movdqu_r2r(xmm1, xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ + movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ + \ + movdqu_r2m(xmm0, *(eax)); /* write 43 25 24 12 11 03 02 00 */ \ + pshufhw_r2r(xmm4, xmm4, 0x0C2); /* xmm4 = 27 -- -- 26 23 22 21 20 */ \ + \ + pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- 26 23 -- -- -- */ \ + pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 44 42 -- -- -- */ \ + \ + pxor_r2r(xmm7, xmm4); /* xmm4 = 27 -- -- -- -- 22 21 20 */ \ + pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 45 -- -- -- 41 40 */ \ + \ + pshuflw_r2r(xmm2, xmm2, 0x0C6); /* xmm2 = 17 16 15 14 13 10 -- -- */ \ + 
movdqu_r2r(xmm6, xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ + \ + pslldq_i2r(2, xmm7); /* xmm7 = -- -- 26 23 -- -- -- -- */ \ + pslldq_i2r(6, xmm1); /* xmm1 = 44 42 -- -- -- -- -- -- */ \ + \ + psrldq_i2r(2, xmm0); /* xmm0 = -- -- -- -- FF FF -- -- */ \ + pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- 04 01 -- -- -- */ \ + \ + pand_r2r(xmm2, xmm0); /* xmm0 = -- -- -- -- 13 10 -- -- */ \ + pxor_r2r(xmm6, xmm3); /* xmm3 = 07 06 05 -- -- -- -- -- */ \ + \ + pxor_r2r(xmm0, xmm2); /* xmm2 = 17 16 15 14 -- -- -- -- */ \ + psrldq_i2r(6, xmm6); /* xmm0 = -- -- -- -- -- -- 04 01 */ \ + \ + por_r2r(xmm7, xmm1); /* xmm1 = 44 42 26 23 -- -- -- -- */ \ + por_r2r(xmm6, xmm0); /* xmm1 = -- -- -- -- 13 10 04 01 */ \ + /* 12345 in use */ \ + por_r2r(xmm0, xmm1); /* xmm1 = 44 42 26 23 13 10 04 01 */ \ + pshuflw_r2r(xmm4, xmm4, 0x093); /* xmm4 = 27 -- -- -- 22 21 20 -- */ \ + \ + pshufhw_r2r(xmm4, xmm4, 0x093); /* xmm4 = -- -- -- 27 22 21 20 -- */ \ + movdqu_r2m(xmm1, *(eax + 16)); /* write 44 42 26 23 13 10 04 01 */ \ + \ + pshufhw_r2r(xmm3, xmm3, 0x0D2); /* xmm3 = 07 05 -- 06 -- -- -- -- */ \ + movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ + \ + pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 06 -- -- -- -- */ \ + psrldq_i2r(12, xmm3); /* xmm3 = -- -- -- -- -- -- 07 05 */ \ + \ + psrldq_i2r(8, xmm0); /* xmm0 = -- -- -- -- -- -- -- 06 */ \ + \ + movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ + movdqu_m2r(*(ecx + 96), xmm7); /* xmm7 = -- -- -- -- FF FF -- -- */ \ + \ + pand_r2r(xmm4, xmm6); /* xmm6 = -- -- -- 27 22 -- -- -- */ \ + pxor_r2r(xmm6, xmm4); /* xmm4 = -- -- -- -- -- 21 20 -- */ \ + \ + por_r2r(xmm6, xmm3); /* xmm3 = -- -- -- 27 22 -- 07 05 */ \ + pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- -- -- 21 -- -- */ \ + \ + por_r2r(xmm7, xmm0); /* xmm0 = -- -- -- -- -- 21 -- 06 */ \ + pxor_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- -- 20 -- */ \ + \ + movdqu_m2r(*(ecx + 16 ), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \ + movdqu_m2r(*(ecx + 64 ), 
xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ + \ + pand_r2r(xmm2, xmm6); /* xmm6 = -- -- 15 14 -- -- -- -- */ \ + pand_r2r(xmm6, xmm1); /* xmm1 = -- -- -- 14 -- -- -- -- */ \ + \ + pxor_r2r(xmm6, xmm2); /* xmm2 = 17 16 -- -- -- -- -- -- */ \ + pxor_r2r(xmm1, xmm6); /* xmm6 = -- -- 15 -- -- -- -- -- */ \ + \ + psrldq_i2r(4, xmm1); /* xmm1 = -- -- -- -- -- 14 -- -- */ \ + \ + psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- 15 -- */ \ + por_r2r(xmm1, xmm3); /* xmm3 = -- -- -- 27 22 14 07 05 */ \ + \ + por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- -- 21 15 06 */ \ + pshufhw_r2r(xmm5, xmm5, 0x0E1); /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \ + \ + movdqu_m2r(*(ecx + 64), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ + pshuflw_r2r(xmm5, xmm5, 0x072); /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \ + \ + movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ + pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 45 41 -- -- -- */ \ + \ + pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 -- -- -- -- 40 -- */ \ + pslldq_i2r(4, xmm1); /* xmm1 = -- 45 41 -- -- -- -- -- */ \ + \ + pshufd_r2r(xmm5, xmm5, 0x09C); /* xmm5 = -- -- -- -- 47 46 40 -- */ \ + por_r2r(xmm1, xmm3); /* xmm3 = -- 45 41 27 22 14 07 05 */ \ + \ + movdqu_m2r(*(eax + 96), xmm1); /* xmm1 = 67 66 65 64 63 62 61 60 */ \ + pmullw_m2r(*(ebx + 96), xmm1); \ + \ + movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ + \ + psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ + pand_r2r(xmm5, xmm7); /* xmm7 = -- -- -- -- -- 46 40 -- */ \ + \ + pand_r2r(xmm1, xmm6); /* xmm6 = -- -- -- -- -- -- -- 60 */ \ + pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- -- -- 47 -- -- -- */ \ + \ + pxor_r2r(xmm6, xmm1); /* xmm1 = 67 66 65 64 63 62 61 -- */ \ + pslldq_i2r(2, xmm5); /* xmm5 = -- -- -- 47 -- -- -- -- */ \ + \ + pslldq_i2r(14, xmm6); /* xmm6 = 60 -- -- -- -- -- -- -- */ \ + por_r2r(xmm5, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 -- */ \ + \ + por_r2r(xmm6, xmm3); /* xmm3 = 60 45 41 27 22 14 07 05 */ \ + pslldq_i2r(6, xmm7); /* xmm7 = -- -- 46 40 -- 
-- -- -- */ \ + \ + movdqu_r2m(xmm3, *(eax+32)); /* write 60 45 41 27 22 14 07 05 */ \ + por_r2r(xmm7, xmm0); /* xmm0 = -- -- 46 40 -- 21 15 06 */ \ + /* 0, 1, 2, 4 in use */ \ + movdqu_m2r(*(eax + 48), xmm3); /* xmm3 = 37 36 35 34 33 32 31 30 */ \ + movdqu_m2r(*(eax + 80), xmm5); /* xmm5 = 57 56 55 54 53 52 51 50 */ \ + \ + pmullw_m2r(*(ebx + 48), xmm3); \ + pmullw_m2r(*(ebx + 80), xmm5); \ + \ + movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ + movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ + \ + psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ + pslldq_i2r(8, xmm7); /* xmm7 = FF -- -- -- -- -- -- -- */ \ + \ + pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- -- 30 */ \ + pand_r2r(xmm5, xmm7); /* xmm7 = 57 -- -- -- -- -- -- -- */ \ + \ + pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 31 -- */ \ + pxor_r2r(xmm7, xmm5); /* xmm5 = __ 56 55 54 53 52 51 50 */ \ + \ + pslldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- 30 -- -- -- */ \ + psrldq_i2r(2, xmm7); /* xmm7 = -- 57 -- -- -- -- -- -- */ \ + \ + por_r2r(xmm7, xmm6); /* xmm6 = -- 57 -- -- 30 -- -- -- */ \ + movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ + \ + por_r2r(xmm6, xmm0); /* xmm0 = -- 57 46 40 30 21 15 06 */ \ + psrldq_i2r(2, xmm7); /* xmm7 = -- -- -- -- -- -- FF FF */ \ + \ + movdqu_r2r(xmm2, xmm6); /* xmm6 = 17 16 -- -- -- -- -- -- */ \ + pand_r2r(xmm1, xmm7); /* xmm7 = -- -- -- -- -- -- 61 -- */ \ + \ + pslldq_i2r(2, xmm6); /* xmm6 = 16 -- -- -- -- -- -- -- */ \ + psrldq_i2r(14, xmm2); /* xmm2 = -- -- -- -- -- -- -- 17 */ \ + \ + pxor_r2r(xmm7, xmm1); /* xmm1 = 67 66 65 64 63 62 -- -- */ \ + pslldq_i2r(12, xmm7); /* xmm7 = 61 -- -- -- -- -- -- -- */ \ + \ + psrldq_i2r(14, xmm6); /* xmm6 = -- -- -- -- -- -- -- 16 */ \ + por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 16 */ \ + \ + por_r2r(xmm7, xmm0); /* xmm0 = 61 57 46 40 30 21 15 06 */ \ + movdqu_m2r(*(ecx), xmm6); /* xmm6 = -- -- -- -- -- FF FF -- */ \ + \ + psrldq_i2r(2, xmm6); 
/* xmm6 = -- -- -- -- -- -- FF FF */ \ + movdqu_r2m(xmm0, *(eax+48)); /* write 61 57 46 40 30 21 15 06 */ \ + /* 1, 2, 3, 4, 5 in use */\ + movdqu_m2r(*(ecx), xmm0); /* xmm0 = -- -- -- -- -- FF FF -- */ \ + pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- 31 -- */ \ + \ + movdqu_r2r(xmm3, xmm7); /* xmm7 = 37 36 35 34 33 32 31 -- */ \ + pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 -- -- */ \ + \ + pslldq_i2r(2, xmm3); /* xmm3 = 36 35 34 33 32 -- -- -- */ \ + pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- -- 62 -- -- */ \ + \ + psrldq_i2r(14, xmm7); /* xmm7 = -- -- -- -- -- -- -- 37 */ \ + pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 63 -- -- -- */ \ + \ + por_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- 31 37 */ \ + movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ + \ + pshuflw_r2r(xmm6, xmm6, 0x01E); /* xmm6 = -- -- -- -- 37 31 -- -- */ \ + pslldq_i2r(6, xmm7); /* xmm7 = FF FF -- -- -- -- -- -- */ \ + \ + por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 37 31 20 16 */ \ + pand_r2r(xmm5, xmm7); /* xmm7 = -- 56 -- -- -- -- -- -- */ \ + \ + pslldq_i2r(8, xmm0); /* xmm0 = -- 62 -- -- -- -- -- -- */ \ + pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- 55 54 53 52 51 50 */ \ + \ + psrldq_i2r(2, xmm7); /* xmm7 = -- -- 56 -- -- -- -- -- */ \ + \ + pshufhw_r2r(xmm3, xmm3, 0x087); /* xmm3 = 35 33 34 36 32 -- -- -- */ \ + por_r2r(xmm7, xmm0); /* xmm0 = -- 62 56 -- -- -- -- -- */ \ + \ + movdqu_m2r(*(eax + 112), xmm7); /* xmm7 = 77 76 75 74 73 72 71 70 */ \ + pmullw_m2r(*(ebx + 112), xmm7); \ + \ + movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ + por_r2r(xmm0, xmm4); /* xmm4 = -- 62 56 47 37 31 20 16 */ \ + \ + pshuflw_r2r(xmm7, xmm7, 0x0E1); /* xmm7 = 77 76 75 74 73 72 70 71 */ \ + psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ + \ + movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ + pand_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- -- 71 */ \ + \ + pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 36 32 -- -- -- */ \ + 
pxor_r2r(xmm6, xmm7); /* xmm7 = 77 76 75 74 73 72 70 -- */ \ + \ + pxor_r2r(xmm0, xmm3); /* xmm3 = 35 33 34 -- -- -- -- -- */ \ + pslldq_i2r(14, xmm6); /* xmm6 = 71 -- -- -- -- -- -- -- */ \ + \ + psrldq_i2r(4, xmm0); /* xmm0 = -- -- -- -- -- 36 32 -- */ \ + por_r2r(xmm6, xmm4); /* xmm4 = 71 62 56 47 37 31 20 16 */ \ + \ + por_r2r(xmm0, xmm2); /* xmm2 = -- -- -- -- -- 36 32 17 */ \ + movdqu_r2m(xmm4, *(eax + 64)); /* write 71 62 56 47 37 31 20 16 */ \ + /* 1, 2, 3, 5, 7 in use */ \ + movdqu_m2r(*(ecx + 80), xmm6); /* xmm6 = -- -- FF -- -- -- -- FF */ \ + pshufhw_r2r(xmm7, xmm7, 0x0D2); /* xmm7 = 77 75 74 76 73 72 70 __ */ \ + \ + movdqu_m2r(*(ecx), xmm4); /* xmm4 = -- -- -- -- -- FF FF -- */ \ + movdqu_m2r(*(ecx+48), xmm0); /* xmm0 = -- -- -- -- FF -- -- -- */ \ + \ + pand_r2r(xmm5, xmm6); /* xmm6 = -- -- 55 -- -- -- -- 50 */ \ + pand_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- 72 70 -- */ \ + \ + pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- 63 -- -- -- */ \ + pxor_r2r(xmm6, xmm5); /* xmm5 = -- -- -- 54 53 52 51 -- */ \ + \ + pxor_r2r(xmm4, xmm7); /* xmm7 = 77 75 74 76 73 -- -- -- */ \ + pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 -- -- -- -- */ \ + \ + pshuflw_r2r(xmm6, xmm6, 0x02B); /* xmm6 = -- -- 55 -- 50 -- -- -- */ \ + pslldq_i2r(10, xmm4); /* xmm4 = 72 20 -- -- -- -- -- -- */ \ + \ + pshufhw_r2r(xmm6, xmm6, 0x0B1); /* xmm6 = -- -- -- 55 50 -- -- -- */ \ + pslldq_i2r(4, xmm0); /* xmm0 = -- -- 63 -- -- -- -- -- */ \ + \ + por_r2r(xmm4, xmm6); /* xmm6 = 72 70 -- 55 50 -- -- -- */ \ + por_r2r(xmm0, xmm2); /* xmm2 = -- -- 63 -- -- 36 32 17 */ \ + \ + por_r2r(xmm6, xmm2); /* xmm2 = 72 70 64 55 50 36 32 17 */ \ + pshufhw_r2r(xmm1, xmm1, 0x0C9); /* xmm1 = 67 64 66 65 -- -- -- -- */ \ + \ + movdqu_r2r(xmm3, xmm6); /* xmm6 = 35 33 34 -- -- -- -- -- */ \ + movdqu_r2m(xmm2, *(eax+80)); /* write 72 70 64 55 50 36 32 17 */ \ + \ + psrldq_i2r(12, xmm6); /* xmm6 = -- -- -- -- -- -- 35 33 */ \ + pslldq_i2r(4, xmm3); /* xmm3 = 34 -- -- -- -- -- -- -- */ \ + \ + pshuflw_r2r(xmm5, 
xmm5, 0x04E); /* xmm5 = -- -- -- 54 51 -- 53 52 */ \ + movdqu_r2r(xmm7, xmm4); /* xmm4 = 77 75 74 76 73 -- -- -- */ \ + \ + movdqu_r2r(xmm5, xmm2); /* xmm2 = -- -- -- 54 51 -- 53 52 */ \ + psrldq_i2r(10, xmm7); /* xmm7 = -- -- -- -- -- 77 75 74 */ \ + \ + pslldq_i2r(6, xmm4); /* xmm4 = 76 73 -- -- -- -- -- -- */ \ + pslldq_i2r(12, xmm2); /* xmm2 = 53 52 -- -- -- -- -- -- */ \ + \ + movdqu_r2r(xmm1, xmm0); /* xmm0 = 67 64 66 65 -- -- -- -- */ \ + psrldq_i2r(12, xmm1); /* xmm1 = -- -- -- -- -- -- 67 64 */ \ + \ + psrldq_i2r(6, xmm5); /* xmm5 = -- -- -- -- -- -- 54 51 */ \ + psrldq_i2r(14, xmm3); /* xmm3 = -- -- -- -- -- -- -- 34 */ \ + \ + pslldq_i2r(10, xmm7); /* xmm7 = 77 75 74 -- -- -- -- -- */ \ + por_r2r(xmm6, xmm4); /* xmm4 = 76 73 -- -- -- -- 35 33 */ \ + \ + psrldq_i2r(10, xmm2); /* xmm2 = -- -- -- -- -- 53 52 -- */ \ + pslldq_i2r(4, xmm0); /* xmm0 = 66 65 -- -- -- -- -- -- */ \ + \ + pslldq_i2r(8, xmm1); /* xmm1 = -- -- 67 64 -- -- -- -- */ \ + por_r2r(xmm7, xmm3); /* xmm3 = 77 75 74 -- -- -- -- 34 */ \ + \ + psrldq_i2r(6, xmm0); /* xmm0 = -- -- -- 66 65 -- -- -- */ \ + pslldq_i2r(4, xmm5); /* xmm5 = -- -- -- -- 54 51 -- -- */ \ + \ + por_r2r(xmm1, xmm4); /* xmm4 = 76 73 67 64 -- -- 35 33 */ \ + por_r2r(xmm2, xmm3); /* xmm3 = 77 75 74 -- -- 53 52 34 */ \ + \ + por_r2r(xmm5, xmm4); /* xmm4 = 76 73 67 64 54 51 35 33 */ \ + por_r2r(xmm0, xmm3); /* xmm3 = 77 75 74 66 65 53 52 34 */ \ + \ + movdqu_r2m(xmm4, *(eax+96)); /* write 76 73 67 64 54 51 35 33 */ \ + movdqu_r2m(xmm3, *(eax+112)); /* write 77 75 74 66 65 53 52 34 */ \ + \ +} /* end of SSE2_Dequantize Macro */ + + +void vp3_dsp_init_sse2(void) +{ + /* nop */ +} + + +static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, + int16_t *output_data) +{ + unsigned char *input_bytes = (unsigned char *)input_data; + unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix; + unsigned char *dequant_const_bytes = (unsigned char *)SSE2_dequant_const; + unsigned char *output_data_bytes = 
(unsigned char *)output_data; + unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data; + unsigned char *Eight = (unsigned char *)eight_data; + +#define eax input_bytes +#define ebx dequant_matrix_bytes +#define ecx dequant_const_bytes +#define edx idct_data_bytes + +#define I(i) (eax + 16 * i) +#define O(i) (ebx + 16 * i) +#define C(i) (edx + 16 * (i-1)) + + SSE2_Dequantize(); + +#undef ebx +#define ebx output_data_bytes + + SSE2_Row_IDCT(); + + SSE2_Transpose(); + + SSE2_Column_IDCT(); +} + + +void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, uint8_t *dest, int stride) +{ + int16_t transformed_data[64]; + int16_t *op; + int i, j; + + vp3_idct_sse2(input_data, dequant_matrix, transformed_data); + + /* place in final output */ + op = transformed_data; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + if (*op < -128) + *dest = 0; + else if (*op > 127) + *dest = 255; + else + *dest = (uint8_t)(*op + 128); + op++; + dest++; + } + dest += (stride - 8); + } +} + + +void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, uint8_t *dest, int stride) +{ + int16_t transformed_data[64]; + int16_t *op; + int i, j; + int16_t sample; + + vp3_idct_sse2(input_data, dequant_matrix, transformed_data); + + /* place in final output */ + op = transformed_data; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + sample = *dest + *op; + if (sample < 0) + *dest = 0; + else if (sample > 255) + *dest = 255; + else + *dest = (uint8_t)(sample & 0xFF); + op++; + dest++; + } + dest += (stride - 8); + } +} diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c index a18645e33..14fdb1059 100644 --- a/src/libffmpeg/libavcodec/imgresample.c +++ b/src/libffmpeg/libavcodec/imgresample.c @@ -45,7 +45,10 @@ #define LINE_BUF_HEIGHT (NB_TAPS * 4) struct ImgReSampleContext { - int iwidth, iheight, owidth, oheight, topBand, bottomBand, leftBand, rightBand; + int iwidth, 
iheight, owidth, oheight; + int topBand, bottomBand, leftBand, rightBand; + int padtop, padbottom, padleft, padright; + int pad_owidth, pad_oheight; int h_incr, v_incr; int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */ int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */ @@ -532,6 +535,7 @@ static void component_resample(ImgReSampleContext *s, &s->v_filters[phase_y][0]); src_y += s->v_incr; + output += owrap; } } @@ -572,13 +576,16 @@ static void build_filter(int16_t *filter, float factor) ImgReSampleContext *img_resample_init(int owidth, int oheight, int iwidth, int iheight) { - return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0); + return img_resample_full_init(owidth, oheight, iwidth, iheight, + 0, 0, 0, 0, 0, 0, 0, 0); } ImgReSampleContext *img_resample_full_init(int owidth, int oheight, int iwidth, int iheight, int topBand, int bottomBand, - int leftBand, int rightBand) + int leftBand, int rightBand, + int padtop, int padbottom, + int padleft, int padright) { ImgReSampleContext *s; @@ -593,19 +600,30 @@ ImgReSampleContext *img_resample_full_init(int owidth, int oheight, s->oheight = oheight; s->iwidth = iwidth; s->iheight = iheight; + s->topBand = topBand; s->bottomBand = bottomBand; s->leftBand = leftBand; s->rightBand = rightBand; - s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth; - s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight; - - build_filter(&s->h_filters[0][0], (float) owidth / (float) (iwidth - leftBand - rightBand)); - build_filter(&s->v_filters[0][0], (float) oheight / (float) (iheight - topBand - bottomBand)); + s->padtop = padtop; + s->padbottom = padbottom; + s->padleft = padleft; + s->padright = padright; + + s->pad_owidth = owidth - (padleft + padright); + s->pad_oheight = oheight - (padtop + padbottom); + + s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth; + s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / 
s->pad_oheight; + + build_filter(&s->h_filters[0][0], (float) s->pad_owidth / + (float) (iwidth - leftBand - rightBand)); + build_filter(&s->v_filters[0][0], (float) s->pad_oheight / + (float) (iheight - topBand - bottomBand)); return s; - fail: +fail: av_free(s); return NULL; } @@ -614,13 +632,20 @@ void img_resample(ImgReSampleContext *s, AVPicture *output, const AVPicture *input) { int i, shift; + uint8_t* optr; - for(i=0;i<3;i++) { + for (i=0;i<3;i++) { shift = (i == 0) ? 0 : 1; - component_resample(s, output->data[i], output->linesize[i], - s->owidth >> shift, s->oheight >> shift, - input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift), - input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift), + + optr = output->data[i] + (((output->linesize[i] * + s->padtop) + s->padleft) >> shift); + + component_resample(s, optr, output->linesize[i], + s->pad_owidth >> shift, s->pad_oheight >> shift, + input->data[i] + (input->linesize[i] * + (s->topBand >> shift)) + (s->leftBand >> shift), + input->linesize[i], ((s->iwidth - s->leftBand - + s->rightBand) >> shift), (s->iheight - s->topBand - s->bottomBand) >> shift); } } diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c index 30029d40c..255a82d2c 100644 --- a/src/libffmpeg/libavcodec/mjpeg.c +++ b/src/libffmpeg/libavcodec/mjpeg.c @@ -862,13 +862,11 @@ static int mjpeg_decode_init(AVCodecContext *avctx) memset(&s2, 0, sizeof(MpegEncContext)); s2.avctx= avctx; // s2->out_format = FMT_MJPEG; - s2.width = 8; - s2.height = 8; - if (MPV_common_init(&s2) < 0) - return -1; + dsputil_init(&s2.dsp, avctx); + DCT_common_init(&s2); + s->scantable= s2.intra_scantable; s->idct_put= s2.dsp.idct_put; - MPV_common_end(&s2); s->mpeg_enc_ctx_allocated = 0; s->buffer_size = 102400; /* smaller buffer should be enough, @@ -1532,15 +1530,22 @@ static int mjpeg_decode_app(MJpegDecodeContext *s) if (id == ff_get_fourcc("JFIF")) { - int t_w, t_h; + int t_w, t_h, v1, v2; 
skip_bits(&s->gb, 8); /* the trailing zero-byte */ - av_log(s->avctx, AV_LOG_INFO, "mjpeg: JFIF header found (version: %x.%x)\n", - get_bits(&s->gb, 8), get_bits(&s->gb, 8)); + v1= get_bits(&s->gb, 8); + v2= get_bits(&s->gb, 8); skip_bits(&s->gb, 8); s->avctx->sample_aspect_ratio.num= get_bits(&s->gb, 16); s->avctx->sample_aspect_ratio.den= get_bits(&s->gb, 16); + if (s->avctx->debug & FF_DEBUG_PICT_INFO) + av_log(s->avctx, AV_LOG_INFO, "mjpeg: JFIF header found (version: %x.%x) SAR=%d/%d\n", + v1, v2, + s->avctx->sample_aspect_ratio.num, + s->avctx->sample_aspect_ratio.den + ); + t_w = get_bits(&s->gb, 8); t_h = get_bits(&s->gb, 8); if (t_w && t_h) @@ -1555,7 +1560,8 @@ static int mjpeg_decode_app(MJpegDecodeContext *s) if (id == ff_get_fourcc("Adob") && (get_bits(&s->gb, 8) == 'e')) { - av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n"); + if (s->avctx->debug & FF_DEBUG_PICT_INFO) + av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n"); skip_bits(&s->gb, 16); /* version */ skip_bits(&s->gb, 16); /* flags0 */ skip_bits(&s->gb, 16); /* flags1 */ @@ -1565,7 +1571,8 @@ static int mjpeg_decode_app(MJpegDecodeContext *s) } if (id == ff_get_fourcc("LJIF")){ - av_log(s->avctx, AV_LOG_INFO, "Pegasus lossless jpeg header found\n"); + if (s->avctx->debug & FF_DEBUG_PICT_INFO) + av_log(s->avctx, AV_LOG_INFO, "Pegasus lossless jpeg header found\n"); skip_bits(&s->gb, 16); /* version ? */ skip_bits(&s->gb, 16); /* unknwon always 0? */ skip_bits(&s->gb, 16); /* unknwon always 0? 
*/ @@ -1604,7 +1611,7 @@ static int mjpeg_decode_app(MJpegDecodeContext *s) skip_bits(&s->gb, 32); /* scan off */ skip_bits(&s->gb, 32); /* data off */ #endif - if (s->first_picture) + if (s->avctx->debug & FF_DEBUG_PICT_INFO) av_log(s->avctx, AV_LOG_INFO, "mjpeg: Apple MJPEG-A header found\n"); } } @@ -1635,7 +1642,8 @@ static int mjpeg_decode_com(MJpegDecodeContext *s) else cbuf[i] = 0; - av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf); + if(s->avctx->debug & FF_DEBUG_PICT_INFO) + av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf); /* buggy avid, it puts EOI only at every 10th frame */ if (!strcmp(cbuf, "AVID")) @@ -1781,13 +1789,12 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, /* process markers */ if (start_code >= 0xd0 && start_code <= 0xd7) { dprintf("restart marker: %d\n", start_code&0x0f); - } else if (s->first_picture) { /* APP fields */ - if (start_code >= 0xe0 && start_code <= 0xef) - mjpeg_decode_app(s); + } else if (start_code >= APP0 && start_code <= APP15) { + mjpeg_decode_app(s); /* Comment */ - else if (start_code == COM) - mjpeg_decode_com(s); + } else if (start_code == COM){ + mjpeg_decode_com(s); } switch(start_code) { diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c index 5132487cf..f194a4d60 100644 --- a/src/libffmpeg/libavcodec/motion_est.c +++ b/src/libffmpeg/libavcodec/motion_est.c @@ -33,8 +33,8 @@ #include "dsputil.h" #include "mpegvideo.h" -//#undef NDEBUG -//#include <assert.h> +#undef NDEBUG +#include <assert.h> #define SQ(a) ((a)*(a)) @@ -46,9 +46,8 @@ static inline int sad_hpel_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, - int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[6], int stride, int uvstride, - int size, int h, uint8_t * const mv_penalty); + int src_index, int ref_index, + int size, int h); static inline int update_map_generation(MpegEncContext * s) { @@ -73,209 +72,153 @@ static int minima_cmp(const void *a, 
const void *b){ return da->height - db->height; } - -/* SIMPLE */ -#define RENAME(a) simple_ ## a -#define CMP(d, x, y, size)\ -d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride, h); +#define FLAG_QPEL 1 //must be 1 +#define FLAG_CHROMA 2 +#define FLAG_DIRECT 4 -#define CMP_HPEL(d, dx, dy, x, y, size)\ -{\ - const int dxy= (dx) + 2*(dy);\ - hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, h);\ - d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\ -} - - -#define CMP_QPEL(d, dx, dy, x, y, size)\ -{\ - const int dxy= (dx) + 4*(dy);\ - qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\ - d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\ +static inline void init_ref(MpegEncContext *s, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){ + MotionEstContext * const c= &s->me; + const int offset[3]= { + y*c-> stride + x, + ((y*c->uvstride + x)>>1), + ((y*c->uvstride + x)>>1), + }; + int i; + for(i=0; i<3; i++){ + c->src[0][i]= src [i] + offset[i]; + c->ref[0][i]= ref [i] + offset[i]; + } + if(ref_index){ + for(i=0; i<3; i++){ + c->ref[ref_index][i]= ref2[i] + offset[i]; + } + } } -#include "motion_est_template.c" -#undef RENAME -#undef CMP -#undef CMP_HPEL -#undef CMP_QPEL -#undef INIT - -/* SIMPLE CHROMA */ -#define RENAME(a) simple_chroma_ ## a - -#define CMP(d, x, y, size)\ -d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride, h);\ -if(chroma_cmp){\ - int dxy= ((x)&1) + 2*((y)&1);\ - int c= ((x)>>1) + ((y)>>1)*uvstride;\ -\ - chroma_hpel_put[0][dxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\ - d += chroma_cmp(s, s->me.scratchpad, src_u, uvstride, h>>1);\ - chroma_hpel_put[0][dxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\ - d += chroma_cmp(s, s->me.scratchpad, src_v, uvstride, h>>1);\ +static int get_flags(MpegEncContext *s, int direct, int chroma){ + return ((s->flags&CODEC_FLAG_QPEL) ? FLAG_QPEL : 0) + + (direct ? FLAG_DIRECT : 0) + + (chroma ? 
FLAG_CHROMA : 0); } -#define CMP_HPEL(d, dx, dy, x, y, size)\ -{\ - const int dxy= (dx) + 2*(dy);\ - hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, h);\ - d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\ - if(chroma_cmp_sub){\ - int cxy= (dxy) | ((x)&1) | (2*((y)&1));\ - int c= ((x)>>1) + ((y)>>1)*uvstride;\ - chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\ - d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride, h>>1);\ - chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\ - d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride, h>>1);\ - }\ -} +static always_inline int cmp(MpegEncContext *s, const int x, const int y, const int subx, const int suby, + const int size, const int h, int ref_index, int src_index, + me_cmp_func cmp_func, me_cmp_func chroma_cmp_func, const int flags){ + MotionEstContext * const c= &s->me; + const int stride= c->stride; + const int uvstride= c->uvstride; + const int qpel= flags&FLAG_QPEL; + const int chroma= flags&FLAG_CHROMA; + const int dxy= subx + (suby<<(1+qpel)); //FIXME log2_subpel? + const int hx= subx + (x<<(1+qpel)); + const int hy= suby + (y<<(1+qpel)); + uint8_t * const * const ref= c->ref[ref_index]; + uint8_t * const * const src= c->src[src_index]; + int d; + //FIXME check chroma 4mv, (no crashes ...) + if(flags&FLAG_DIRECT){ + if(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1)){ + const int time_pp= s->pp_time; + const int time_pb= s->pb_time; + const int mask= 2*qpel+1; + if(s->mv_type==MV_TYPE_8X8){ + int i; + for(i=0; i<4; i++){ + int fx = c->direct_basis_mv[i][0] + hx; + int fy = c->direct_basis_mv[i][1] + hy; + int bx = hx ? fx - c->co_located_mv[i][0] : c->co_located_mv[i][0]*(time_pb - time_pp)/time_pp + ((i &1)<<(qpel+4)); + int by = hy ? 
fy - c->co_located_mv[i][1] : c->co_located_mv[i][1]*(time_pb - time_pp)/time_pp + ((i>>1)<<(qpel+4)); + int fxy= (fx&mask) + ((fy&mask)<<(qpel+1)); + int bxy= (bx&mask) + ((by&mask)<<(qpel+1)); + + uint8_t *dst= c->temp + 8*(i&1) + 8*stride*(i>>1); + if(qpel){ + c->qpel_put[1][fxy](dst, ref[0] + (fx>>2) + (fy>>2)*stride, stride); + c->qpel_avg[1][bxy](dst, ref[8] + (bx>>2) + (by>>2)*stride, stride); + }else{ + c->hpel_put[1][fxy](dst, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 8); + c->hpel_avg[1][bxy](dst, ref[8] + (bx>>1) + (by>>1)*stride, stride, 8); + } + } + }else{ + int fx = c->direct_basis_mv[0][0] + hx; + int fy = c->direct_basis_mv[0][1] + hy; + int bx = hx ? fx - c->co_located_mv[0][0] : (c->co_located_mv[0][0]*(time_pb - time_pp)/time_pp); + int by = hy ? fy - c->co_located_mv[0][1] : (c->co_located_mv[0][1]*(time_pb - time_pp)/time_pp); + int fxy= (fx&mask) + ((fy&mask)<<(qpel+1)); + int bxy= (bx&mask) + ((by&mask)<<(qpel+1)); + + if(qpel){ + c->qpel_put[1][fxy](c->temp , ref[0] + (fx>>2) + (fy>>2)*stride , stride); + c->qpel_put[1][fxy](c->temp + 8 , ref[0] + (fx>>2) + (fy>>2)*stride + 8 , stride); + c->qpel_put[1][fxy](c->temp + 8*stride, ref[0] + (fx>>2) + (fy>>2)*stride + 8*stride, stride); + c->qpel_put[1][fxy](c->temp + 8 + 8*stride, ref[0] + (fx>>2) + (fy>>2)*stride + 8 + 8*stride, stride); + c->qpel_avg[1][bxy](c->temp , ref[8] + (bx>>2) + (by>>2)*stride , stride); + c->qpel_avg[1][bxy](c->temp + 8 , ref[8] + (bx>>2) + (by>>2)*stride + 8 , stride); + c->qpel_avg[1][bxy](c->temp + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8*stride, stride); + c->qpel_avg[1][bxy](c->temp + 8 + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8 + 8*stride, stride); + }else{ + assert((fx>>1) + 16*s->mb_x >= -16); + assert((fy>>1) + 16*s->mb_y >= -16); + assert((fx>>1) + 16*s->mb_x <= s->width); + assert((fy>>1) + 16*s->mb_y <= s->height); + assert((bx>>1) + 16*s->mb_x >= -16); + assert((by>>1) + 16*s->mb_y >= -16); + assert((bx>>1) + 16*s->mb_x <= s->width); + 
assert((by>>1) + 16*s->mb_y <= s->height); + + c->hpel_put[0][fxy](c->temp, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 16); + c->hpel_avg[0][bxy](c->temp, ref[8] + (bx>>1) + (by>>1)*stride, stride, 16); + } + } + d = cmp_func(s, c->temp, src[0], stride, 16); + }else + d= 256*256*256*32; + }else{ + int uvdxy; + if(dxy){ + if(qpel){ + c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h) + if(chroma){ + int cx= hx/2; + int cy= hy/2; + cx= (cx>>1)|(cx&1); + cy= (cy>>1)|(cy&1); + uvdxy= (cx&1) + 2*(cy&1); + //FIXME x/y wrong, but mpeg4 qpel is sick anyway, we should drop as much of it as possible in favor for h264 + } + }else{ + c->hpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride, h); + if(chroma) + uvdxy= dxy | (x&1) | (2*(y&1)); + } + d = cmp_func(s, c->temp, src[0], stride, h); + }else{ + d = cmp_func(s, src[0], ref[0] + x + y*stride, stride, h); + if(chroma) + uvdxy= (x&1) + 2*(y&1); + } + if(chroma){ + uint8_t * const uvtemp= c->temp + 16*stride; + c->hpel_put[size+1][uvdxy](uvtemp , ref[1] + (x>>1) + (y>>1)*uvstride, uvstride, h>>1); + c->hpel_put[size+1][uvdxy](uvtemp+8, ref[2] + (x>>1) + (y>>1)*uvstride, uvstride, h>>1); + d += chroma_cmp_func(s, uvtemp , src[1], uvstride, h>>1); + d += chroma_cmp_func(s, uvtemp+8, src[2], uvstride, h>>1); + } + } +#if 0 + if(full_pel){ + const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1); + score_map[index]= d; + } -#define CMP_QPEL(d, dx, dy, x, y, size)\ -{\ - const int dxy= (dx) + 4*(dy);\ - qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\ - d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\ - if(chroma_cmp_sub){\ - int cxy, c;\ - int cx= (4*(x) + (dx))/2;\ - int cy= (4*(y) + (dy))/2;\ - cx= (cx>>1)|(cx&1);\ - cy= (cy>>1)|(cy&1);\ - cxy= (cx&1) + 2*(cy&1);\ - c= ((cx)>>1) + ((cy)>>1)*uvstride;\ - chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\ - d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride, h>>1);\ - 
chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\ - d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride, h>>1);\ - }\ + d += (c->mv_penalty[hx - c->pred_x] + c->mv_penalty[hy - c->pred_y])*c->penalty_factor; +#endif + return d; } #include "motion_est_template.c" -#undef RENAME -#undef CMP -#undef CMP_HPEL -#undef CMP_QPEL -#undef INIT - -/* SIMPLE DIRECT HPEL */ -#define RENAME(a) simple_direct_hpel_ ## a -//FIXME precalc divisions stuff - -#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\ -if((x) >= xmin && 2*(x) + (dx) <= 2*xmax && (y) >= ymin && 2*(y) + (dy) <= 2*ymax){\ - const int hx= 2*(x) + (dx);\ - const int hy= 2*(y) + (dy);\ - if(s->mv_type==MV_TYPE_8X8){\ - int i;\ - for(i=0; i<4; i++){\ - int fx = s->me.direct_basis_mv[i][0] + hx;\ - int fy = s->me.direct_basis_mv[i][1] + hy;\ - int bx = hx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\ - int by = hy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\ - int fxy= (fx&1) + 2*(fy&1);\ - int bxy= (bx&1) + 2*(by&1);\ -\ - uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\ - hpel_put[1][fxy](dst, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 8);\ - hpel_avg[1][bxy](dst, (ref_data[3]) + (bx>>1) + (by>>1)*(stride), stride, 8);\ - }\ - }else{\ - int fx = s->me.direct_basis_mv[0][0] + hx;\ - int fy = s->me.direct_basis_mv[0][1] + hy;\ - int bx = hx ? fx - s->me.co_located_mv[0][0] : (s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp);\ - int by = hy ? 
fy - s->me.co_located_mv[0][1] : (s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp);\ - int fxy= (fx&1) + 2*(fy&1);\ - int bxy= (bx&1) + 2*(by&1);\ - \ - assert((fx>>1) + 16*s->mb_x >= -16);\ - assert((fy>>1) + 16*s->mb_y >= -16);\ - assert((fx>>1) + 16*s->mb_x <= s->width);\ - assert((fy>>1) + 16*s->mb_y <= s->height);\ - assert((bx>>1) + 16*s->mb_x >= -16);\ - assert((by>>1) + 16*s->mb_y >= -16);\ - assert((bx>>1) + 16*s->mb_x <= s->width);\ - assert((by>>1) + 16*s->mb_y <= s->height);\ -\ - hpel_put[0][fxy](s->me.scratchpad, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 16);\ - hpel_avg[0][bxy](s->me.scratchpad, (ref_data[3]) + (bx>>1) + (by>>1)*(stride), stride, 16);\ - }\ - d = cmp_func(s, s->me.scratchpad, src_y, stride, 16);\ -}else\ - d= 256*256*256*32; - - -#define CMP_HPEL(d, dx, dy, x, y, size)\ - CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub) - -#define CMP(d, x, y, size)\ - CMP_DIRECT(d, 0, 0, x, y, size, cmp) - -#include "motion_est_template.c" -#undef RENAME -#undef CMP -#undef CMP_HPEL -#undef CMP_QPEL -#undef INIT -#undef CMP_DIRECT - -/* SIMPLE DIRECT QPEL */ -#define RENAME(a) simple_direct_qpel_ ## a - -#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\ -if((x) >= xmin && 4*(x) + (dx) <= 4*xmax && (y) >= ymin && 4*(y) + (dy) <= 4*ymax){\ - const int qx= 4*(x) + (dx);\ - const int qy= 4*(y) + (dy);\ - if(s->mv_type==MV_TYPE_8X8){\ - int i;\ - for(i=0; i<4; i++){\ - int fx = s->me.direct_basis_mv[i][0] + qx;\ - int fy = s->me.direct_basis_mv[i][1] + qy;\ - int bx = qx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\ - int by = qy ? 
fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\ - int fxy= (fx&3) + 4*(fy&3);\ - int bxy= (bx&3) + 4*(by&3);\ -\ - uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\ - qpel_put[1][fxy](dst, (ref_y ) + (fx>>2) + (fy>>2)*(stride), stride);\ - qpel_avg[1][bxy](dst, (ref_data[3]) + (bx>>2) + (by>>2)*(stride), stride);\ - }\ - }else{\ - int fx = s->me.direct_basis_mv[0][0] + qx;\ - int fy = s->me.direct_basis_mv[0][1] + qy;\ - int bx = qx ? fx - s->me.co_located_mv[0][0] : s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp;\ - int by = qy ? fy - s->me.co_located_mv[0][1] : s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp;\ - int fxy= (fx&3) + 4*(fy&3);\ - int bxy= (bx&3) + 4*(by&3);\ -\ - qpel_put[1][fxy](s->me.scratchpad , (ref_y ) + (fx>>2) + (fy>>2)*(stride) , stride);\ - qpel_put[1][fxy](s->me.scratchpad + 8 , (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8 , stride);\ - qpel_put[1][fxy](s->me.scratchpad + 8*stride, (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8*stride, stride);\ - qpel_put[1][fxy](s->me.scratchpad + 8 + 8*stride, (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8 + 8*stride, stride);\ - qpel_avg[1][bxy](s->me.scratchpad , (ref_data[3]) + (bx>>2) + (by>>2)*(stride) , stride);\ - qpel_avg[1][bxy](s->me.scratchpad + 8 , (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8 , stride);\ - qpel_avg[1][bxy](s->me.scratchpad + 8*stride, (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8*stride, stride);\ - qpel_avg[1][bxy](s->me.scratchpad + 8 + 8*stride, (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8 + 8*stride, stride);\ - }\ - d = cmp_func(s, s->me.scratchpad, src_y, stride, 16);\ -}else\ - d= 256*256*256*32; - - -#define CMP_QPEL(d, dx, dy, x, y, size)\ - CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub) - -#define CMP(d, x, y, size)\ - CMP_DIRECT(d, 0, 0, x, y, size, cmp) - -#include "motion_est_template.c" -#undef RENAME -#undef CMP -#undef CMP_HPEL -#undef CMP_QPEL -#undef INIT -#undef CMP__DIRECT static inline int 
get_penalty_factor(MpegEncContext *s, int type){ switch(type&0xFF){ @@ -297,54 +240,45 @@ static inline int get_penalty_factor(MpegEncContext *s, int type){ } void ff_init_me(MpegEncContext *s){ + MotionEstContext * const c= &s->me; + ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, s->avctx->me_pre_cmp); ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp); ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp); ff_set_cmp(&s->dsp, s->dsp.mb_cmp, s->avctx->mb_cmp); + + s->me.flags = get_flags(s, 0, s->avctx->me_cmp &FF_CMP_CHROMA); + s->me.sub_flags= get_flags(s, 0, s->avctx->me_sub_cmp&FF_CMP_CHROMA); + s->me.mb_flags = get_flags(s, 0, s->avctx->mb_cmp &FF_CMP_CHROMA); +/*FIXME s->no_rounding b_type*/ if(s->flags&CODEC_FLAG_QPEL){ - if(s->avctx->me_sub_cmp&FF_CMP_CHROMA) - s->me.sub_motion_search= simple_chroma_qpel_motion_search; - else - s->me.sub_motion_search= simple_qpel_motion_search; + s->me.sub_motion_search= qpel_motion_search; + c->qpel_avg= s->dsp.avg_qpel_pixels_tab; + if(s->no_rounding) c->qpel_put= s->dsp.put_no_rnd_qpel_pixels_tab; + else c->qpel_put= s->dsp.put_qpel_pixels_tab; }else{ if(s->avctx->me_sub_cmp&FF_CMP_CHROMA) - s->me.sub_motion_search= simple_chroma_hpel_motion_search; + s->me.sub_motion_search= hpel_motion_search; else if( s->avctx->me_sub_cmp == FF_CMP_SAD && s->avctx-> me_cmp == FF_CMP_SAD && s->avctx-> mb_cmp == FF_CMP_SAD) s->me.sub_motion_search= sad_hpel_motion_search; // 2050 vs. 
2450 cycles else - s->me.sub_motion_search= simple_hpel_motion_search; + s->me.sub_motion_search= hpel_motion_search; + c->hpel_avg= s->dsp.avg_pixels_tab; + if(s->no_rounding) c->hpel_put= s->dsp.put_no_rnd_pixels_tab; + else c->hpel_put= s->dsp.put_pixels_tab; } - - if(s->avctx->me_cmp&FF_CMP_CHROMA){ - s->me.motion_search[0]= simple_chroma_epzs_motion_search; - s->me.motion_search[1]= simple_chroma_epzs_motion_search4; - s->me.motion_search[4]= simple_chroma_epzs_motion_search2; + if(s->linesize){ + s->me.stride = s->linesize; + s->me.uvstride= s->uvlinesize; }else{ - s->me.motion_search[0]= simple_epzs_motion_search; - s->me.motion_search[1]= simple_epzs_motion_search4; - s->me.motion_search[4]= simple_epzs_motion_search2; - } - - if(s->avctx->me_pre_cmp&FF_CMP_CHROMA){ - s->me.pre_motion_search= simple_chroma_epzs_motion_search; - }else{ - s->me.pre_motion_search= simple_epzs_motion_search; - } - - if(s->flags&CODEC_FLAG_QPEL){ - if(s->avctx->mb_cmp&FF_CMP_CHROMA) - s->me.get_mb_score= simple_chroma_qpel_get_mb_score; - else - s->me.get_mb_score= simple_qpel_get_mb_score; - }else{ - if(s->avctx->mb_cmp&FF_CMP_CHROMA) - s->me.get_mb_score= simple_chroma_hpel_get_mb_score; - else - s->me.get_mb_score= simple_hpel_get_mb_score; + s->me.stride = 16*s->mb_width + 32; + s->me.uvstride= 8*s->mb_width + 16; } + + c->temp= c->scratchpad; } #if 0 @@ -611,18 +545,17 @@ static int phods_motion_search(MpegEncContext * s, static inline int sad_hpel_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, - int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[6], int stride, int uvstride, - int size, int h, uint8_t * const mv_penalty) + int src_index, int ref_index, + int size, int h) { - uint32_t *score_map= s->me.score_map; const int penalty_factor= s->me.sub_penalty_factor; int mx, my, dminh; uint8_t *pix, *ptr; - const int xmin= s->me.xmin; - const int ymin= s->me.ymin; - const int xmax= s->me.xmax; - const int ymax= s->me.ymax; + int stride= 
s->me.stride; + const int flags= s->me.sub_flags; + LOAD_COMMON + + assert(flags == 0); if(s->me.skip){ // printf("S"); @@ -632,11 +565,11 @@ static inline int sad_hpel_motion_search(MpegEncContext * s, } // printf("N"); - pix = src_data[0]; + pix = s->me.src[src_index][0]; mx = *mx_ptr; my = *my_ptr; - ptr = ref_data[0] + (my * stride) + mx; + ptr = s->me.ref[ref_index][0] + (my * stride) + mx; dminh = dmin; @@ -733,7 +666,7 @@ static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my, int mv4) s->current_picture.motion_val[0][mot_xy+1][0]= mx; s->current_picture.motion_val[0][mot_xy+1][1]= my; - mot_xy += s->block_wrap[0]; + mot_xy += s->b8_stride; s->current_picture.motion_val[0][mot_xy ][0]= mx; s->current_picture.motion_val[0][mot_xy ][1]= my; s->current_picture.motion_val[0][mot_xy+1][0]= mx; @@ -763,41 +696,40 @@ static inline void get_limits(MpegEncContext *s, int x, int y) } } +static inline void init_mv4_ref(MpegEncContext *s){ + MotionEstContext * const c= &s->me; + const int stride= s->linesize; + + c->ref[1][0] = c->ref[0][0] + 8; + c->ref[2][0] = c->ref[0][0] + 8*stride; + c->ref[3][0] = c->ref[2][0] + 8; + c->src[1][0] = c->src[0][0] + 8; + c->src[2][0] = c->src[0][0] + 8*stride; + c->src[3][0] = c->src[2][0] + 8; +} + static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) { + MotionEstContext * const c= &s->me; const int size= 1; const int h=8; int block; int P[10][2]; int dmin_sum=0, mx4_sum=0, my4_sum=0; - uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; int same=1; const int stride= s->linesize; const int uvstride= s->uvlinesize; - const int xmin= s->me.xmin; - const int ymin= s->me.ymin; - const int xmax= s->me.xmax; - const int ymax= s->me.ymax; + uint8_t *mv_penalty= s->me.current_mv_penalty; + init_mv4_ref(s); + for(block=0; block<4; block++){ int mx4, my4; int pred_x4, pred_y4; int dmin4; static const int off[4]= {2, 1, 1, -1}; - const int mot_stride = s->block_wrap[0]; + const int 
mot_stride = s->b8_stride; const int mot_xy = s->block_index[block]; - const int block_x= (block&1); - const int block_y= (block>>1); - uint8_t *src_data[3]= { - s->new_picture.data[0] + 8*(2*s->mb_x + block_x) + stride *8*(2*s->mb_y + block_y), //FIXME chroma? - s->new_picture.data[1] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y), - s->new_picture.data[2] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y) - }; - uint8_t *ref_data[3]= { - s->last_picture.data[0] + 8*(2*s->mb_x + block_x) + stride *8*(2*s->mb_y + block_y), //FIXME chroma? - s->last_picture.data[1] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y), - s->last_picture.data[2] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y) - }; P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0]; P_LEFT[1] = s->current_picture.motion_val[0][mot_xy - 1][1]; @@ -806,8 +738,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) /* special case for first line */ if (s->first_slice_line && block<2) { - pred_x4= P_LEFT[0]; - pred_y4= P_LEFT[1]; + s->me.pred_x= pred_x4= P_LEFT[0]; + s->me.pred_y= pred_y4= P_LEFT[1]; } else { P_TOP[0] = s->current_picture.motion_val[0][mot_xy - mot_stride ][0]; P_TOP[1] = s->current_picture.motion_val[0][mot_xy - mot_stride ][1]; @@ -821,32 +753,22 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); -// if(s->out_format == FMT_H263){ - pred_x4 = P_MEDIAN[0]; - pred_y4 = P_MEDIAN[1]; -#if 0 - }else { /* mpeg1 at least */ - pred_x4= P_LEFT[0]; - pred_y4= P_LEFT[1]; - } -#endif + s->me.pred_x= pred_x4 = P_MEDIAN[0]; + s->me.pred_y= pred_y4 = P_MEDIAN[1]; } P_MV1[0]= mx; P_MV1[1]= my; - dmin4 = s->me.motion_search[1](s, &mx4, &my4, P, pred_x4, pred_y4, - src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty); + dmin4 = 
epzs_motion_search4(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift); - dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, - pred_x4, pred_y4, src_data, ref_data, stride, uvstride, size, h, mv_penalty); + dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h); - if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0] - && s->avctx->mb_decision == FF_MB_DECISION_SIMPLE){ + if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ int dxy; const int offset= ((block&1) + (block>>1)*stride)*8; uint8_t *dest_y = s->me.scratchpad + offset; if(s->quarter_sample){ - uint8_t *ref= ref_data[0] + (mx4>>2) + (my4>>2)*stride; + uint8_t *ref= c->ref[block][0] + (mx4>>2) + (my4>>2)*stride; dxy = ((my4 & 3) << 2) | (mx4 & 3); if(s->no_rounding) @@ -854,7 +776,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) else s->dsp.put_qpel_pixels_tab [1][dxy](dest_y , ref , stride); }else{ - uint8_t *ref= ref_data[0] + (mx4>>1) + (my4>>1)*stride; + uint8_t *ref= c->ref[block][0] + (mx4>>1) + (my4>>1)*stride; dxy = ((my4 & 1) << 1) | (mx4 & 1); if(s->no_rounding) @@ -909,6 +831,9 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad , s->uvlinesize, 8); dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad+8, s->uvlinesize, 8); } + + s->me.pred_x= mx; + s->me.pred_y= my; switch(s->avctx->mb_cmp&0xFF){ /*case FF_CMP_SSE: @@ -920,14 +845,28 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) } } -static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint8_t *frame_ref_data[3], - int16_t (*mv_tables[2][2])[2], uint8_t *field_select_tables[2], int f_code, int mx, int my) +static inline void init_interlaced_ref(MpegEncContext *s, int ref_index){ + MotionEstContext * const c= &s->me; + + 
c->ref[1+ref_index][0] = c->ref[0+ref_index][0] + s->linesize; + c->src[1][0] = c->src[0][0] + s->linesize; + if(c->flags & FLAG_CHROMA){ + c->ref[1+ref_index][1] = c->ref[0+ref_index][1] + s->uvlinesize; + c->ref[1+ref_index][2] = c->ref[0+ref_index][2] + s->uvlinesize; + c->src[1][1] = c->src[0][1] + s->uvlinesize; + c->src[1][2] = c->src[0][2] + s->uvlinesize; + } +} + +static int interlaced_search(MpegEncContext *s, int ref_index, + int16_t (*mv_tables[2][2])[2], uint8_t *field_select_tables[2], int mx, int my, int user_field_select) { + MotionEstContext * const c= &s->me; const int size=0; const int h=8; int block; int P[10][2]; - uint8_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV; + uint8_t * const mv_penalty= c->current_mv_penalty; int same=1; const int stride= 2*s->linesize; const int uvstride= 2*s->uvlinesize; @@ -935,45 +874,42 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint const int mot_stride= s->mb_stride; const int xy= s->mb_x + s->mb_y*mot_stride; - s->me.ymin>>=1; - s->me.ymax>>=1; + c->ymin>>=1; + c->ymax>>=1; + c->stride<<=1; + c->uvstride<<=1; + init_interlaced_ref(s, ref_index); for(block=0; block<2; block++){ int field_select; int best_dmin= INT_MAX; int best_field= -1; - uint8_t *src_data[3]= { - frame_src_data[0] + s-> linesize*block, - frame_src_data[1] + s->uvlinesize*block, - frame_src_data[2] + s->uvlinesize*block - }; - for(field_select=0; field_select<2; field_select++){ - int dmin, mx_i, my_i, pred_x, pred_y; - uint8_t *ref_data[3]= { - frame_ref_data[0] + s-> linesize*field_select, - frame_ref_data[1] + s->uvlinesize*field_select, - frame_ref_data[2] + s->uvlinesize*field_select - }; + int dmin, mx_i, my_i; int16_t (*mv_table)[2]= mv_tables[block][field_select]; + if(user_field_select){ + if(field_select_tables[block][xy] != field_select) + continue; + } + P_LEFT[0] = mv_table[xy - 1][0]; P_LEFT[1] = mv_table[xy - 1][1]; - if(P_LEFT[0] > (s->me.xmax<<1)) P_LEFT[0] = (s->me.xmax<<1); + 
if(P_LEFT[0] > (c->xmax<<1)) P_LEFT[0] = (c->xmax<<1); - pred_x= P_LEFT[0]; - pred_y= P_LEFT[1]; + s->me.pred_x= P_LEFT[0]; + s->me.pred_y= P_LEFT[1]; if(!s->first_slice_line){ P_TOP[0] = mv_table[xy - mot_stride][0]; P_TOP[1] = mv_table[xy - mot_stride][1]; P_TOPRIGHT[0] = mv_table[xy - mot_stride + 1][0]; P_TOPRIGHT[1] = mv_table[xy - mot_stride + 1][1]; - if(P_TOP[1] > (s->me.ymax<<1)) P_TOP[1] = (s->me.ymax<<1); - if(P_TOPRIGHT[0] < (s->me.xmin<<1)) P_TOPRIGHT[0]= (s->me.xmin<<1); - if(P_TOPRIGHT[0] > (s->me.xmax<<1)) P_TOPRIGHT[0]= (s->me.xmax<<1); - if(P_TOPRIGHT[1] > (s->me.ymax<<1)) P_TOPRIGHT[1]= (s->me.ymax<<1); + if(P_TOP[1] > (c->ymax<<1)) P_TOP[1] = (c->ymax<<1); + if(P_TOPRIGHT[0] < (c->xmin<<1)) P_TOPRIGHT[0]= (c->xmin<<1); + if(P_TOPRIGHT[0] > (c->xmax<<1)) P_TOPRIGHT[0]= (c->xmax<<1); + if(P_TOPRIGHT[1] > (c->ymax<<1)) P_TOPRIGHT[1]= (c->ymax<<1); P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); @@ -981,32 +917,29 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint P_MV1[0]= mx; //FIXME not correct if block != field_select P_MV1[1]= my / 2; - dmin = s->me.motion_search[4](s, &mx_i, &my_i, P, pred_x, pred_y, - src_data, ref_data, stride, uvstride, mv_table, (1<<16)>>1, mv_penalty); + dmin = epzs_motion_search2(s, &mx_i, &my_i, P, block, field_select+ref_index, mv_table, (1<<16)>>1); - dmin= s->me.sub_motion_search(s, &mx_i, &my_i, dmin, - pred_x, pred_y, src_data, ref_data, stride, uvstride, size, h, mv_penalty); + dmin= c->sub_motion_search(s, &mx_i, &my_i, dmin, block, field_select+ref_index, size, h); mv_table[xy][0]= mx_i; mv_table[xy][1]= my_i; - if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0] - && s->avctx->mb_decision == FF_MB_DECISION_SIMPLE){ + if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ int dxy; //FIXME chroma ME - uint8_t *ref= ref_data[0] + (mx_i>>1) + (my_i>>1)*stride; + uint8_t *ref= c->ref[field_select+ref_index][0] + (mx_i>>1) + 
(my_i>>1)*stride; dxy = ((my_i & 1) << 1) | (mx_i & 1); if(s->no_rounding){ - s->dsp.put_no_rnd_pixels_tab[size][dxy](s->me.scratchpad, ref , stride, h); + s->dsp.put_no_rnd_pixels_tab[size][dxy](c->scratchpad, ref , stride, h); }else{ - s->dsp.put_pixels_tab [size][dxy](s->me.scratchpad, ref , stride, h); + s->dsp.put_pixels_tab [size][dxy](c->scratchpad, ref , stride, h); } - dmin= s->dsp.mb_cmp[size](s, src_data[0], s->me.scratchpad, stride, h); - dmin+= (mv_penalty[mx_i-pred_x] + mv_penalty[my_i-pred_y] + 1)*s->me.mb_penalty_factor; + dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); + dmin+= (mv_penalty[mx_i-s->me.pred_x] + mv_penalty[my_i-s->me.pred_y] + 1)*c->mb_penalty_factor; }else - dmin+= s->me.mb_penalty_factor; //field_select bits + dmin+= c->mb_penalty_factor; //field_select bits dmin += field_select != block; //slightly prefer same field @@ -1028,8 +961,10 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint dmin_sum += best_dmin; } - s->me.ymin<<=1; - s->me.ymax<<=1; + c->ymin<<=1; + c->ymax<<=1; + c->stride>>=1; + c->uvstride>>=1; if(same) return INT_MAX; @@ -1040,44 +975,182 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint case FF_CMP_RD: return dmin_sum; default: - return dmin_sum+ 11*s->me.mb_penalty_factor; + return dmin_sum+ 11*c->mb_penalty_factor; } } +static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int p_type){ + MotionEstContext * const c= &s->me; + Picture *p= s->current_picture_ptr; + int mb_xy= mb_x + mb_y*s->mb_stride; + int xy= 2*mb_x + 2*mb_y*s->b8_stride; + int mb_type= s->current_picture.mb_type[mb_xy]; + int flags= c->flags; + int shift= (flags&FLAG_QPEL) + 1; + int mask= (1<<shift)-1; + int x, y, i; + int d=0; + me_cmp_func cmpf= s->dsp.sse[0]; + me_cmp_func chroma_cmpf= s->dsp.sse[1]; + + assert(p_type==0 || !USES_LIST(mb_type, 1)); + assert(IS_INTRA(mb_type) || USES_LIST(mb_type,0) || USES_LIST(mb_type,1)); + + 
if(IS_INTERLACED(mb_type)){ + int xy2= xy + s->b8_stride; + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA; + c->stride<<=1; + c->uvstride<<=1; + + assert(s->flags & CODEC_FLAG_INTERLACED_ME); + + if(USES_LIST(mb_type, 0)){ + int field_select0= p->ref_index[0][xy ]; + int field_select1= p->ref_index[0][xy2]; + assert(field_select0==0 ||field_select0==1); + assert(field_select1==0 ||field_select1==1); + init_interlaced_ref(s, 0); + + if(p_type){ + s->p_field_select_table[0][mb_xy]= field_select0; + s->p_field_select_table[1][mb_xy]= field_select1; + *(uint32_t*)s->p_field_mv_table[0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[0][xy ]; + *(uint32_t*)s->p_field_mv_table[1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[0][xy2]; + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER_I; + }else{ + s->b_field_select_table[0][0][mb_xy]= field_select0; + s->b_field_select_table[0][1][mb_xy]= field_select1; + *(uint32_t*)s->b_field_mv_table[0][0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[0][xy ]; + *(uint32_t*)s->b_field_mv_table[0][1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[0][xy2]; + s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_FORWARD_I; + } + + x= p->motion_val[0][xy ][0]; + y= p->motion_val[0][xy ][1]; + d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select0, 0, cmpf, chroma_cmpf, flags); + x= p->motion_val[0][xy2][0]; + y= p->motion_val[0][xy2][1]; + d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select1, 1, cmpf, chroma_cmpf, flags); + } + if(USES_LIST(mb_type, 1)){ + int field_select0= p->ref_index[1][xy ]; + int field_select1= p->ref_index[1][xy2]; + assert(field_select0==0 ||field_select0==1); + assert(field_select1==0 ||field_select1==1); + init_interlaced_ref(s, 2); + + s->b_field_select_table[1][0][mb_xy]= field_select0; + s->b_field_select_table[1][1][mb_xy]= field_select1; + *(uint32_t*)s->b_field_mv_table[1][0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[1][xy ]; + *(uint32_t*)s->b_field_mv_table[1][1][field_select1][mb_xy]= 
*(uint32_t*)p->motion_val[1][xy2]; + if(USES_LIST(mb_type, 0)){ + s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_BIDIR_I; + }else{ + s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_BACKWARD_I; + } + + x= p->motion_val[1][xy ][0]; + y= p->motion_val[1][xy ][1]; + d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select0+2, 0, cmpf, chroma_cmpf, flags); + x= p->motion_val[1][xy2][0]; + y= p->motion_val[1][xy2][1]; + d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select1+2, 1, cmpf, chroma_cmpf, flags); + //FIXME bidir scores + } + c->stride>>=1; + c->uvstride>>=1; + }else if(IS_8X8(mb_type)){ + assert(s->flags & CODEC_FLAG_4MV); + cmpf= s->dsp.sse[1]; + chroma_cmpf= s->dsp.sse[1]; + init_mv4_ref(s); + for(i=0; i<4; i++){ + xy= s->block_index[i]; + x= p->motion_val[0][xy][0]; + y= p->motion_val[0][xy][1]; + d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 1, 8, i, i, cmpf, chroma_cmpf, flags); + } + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER4V; + }else{ + if(USES_LIST(mb_type, 0)){ + if(p_type){ + *(uint32_t*)s->p_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy]; + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER; + }else if(USES_LIST(mb_type, 1)){ + *(uint32_t*)s->b_bidir_forw_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy]; + *(uint32_t*)s->b_bidir_back_mv_table[mb_xy]= *(uint32_t*)p->motion_val[1][xy]; + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_BIDIR; + }else{ + *(uint32_t*)s->b_forw_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy]; + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_FORWARD; + } + x= p->motion_val[0][xy][0]; + y= p->motion_val[0][xy][1]; + d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 16, 0, 0, cmpf, chroma_cmpf, flags); + }else if(USES_LIST(mb_type, 1)){ + *(uint32_t*)s->b_back_mv_table[mb_xy]= *(uint32_t*)p->motion_val[1][xy]; + s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_BACKWARD; + + x= p->motion_val[1][xy][0]; + y= p->motion_val[1][xy][1]; + d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 16, 2, 0, cmpf, chroma_cmpf, flags); + }else + 
s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA; + } + return d; +} + void ff_estimate_p_frame_motion(MpegEncContext * s, int mb_x, int mb_y) { + MotionEstContext * const c= &s->me; uint8_t *pix, *ppix; - int sum, varc, vard, mx, my, dmin, xx, yy; - int pred_x=0, pred_y=0; + int sum, varc, vard, mx, my, dmin; int P[10][2]; const int shift= 1+s->quarter_sample; int mb_type=0; - uint8_t *ref_picture= s->last_picture.data[0]; Picture * const pic= &s->current_picture; - uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; - const int stride= s->linesize; - const int uvstride= s->uvlinesize; - uint8_t *src_data[3]= { - s->new_picture.data[0] + 16*(mb_x + stride*mb_y), - s->new_picture.data[1] + 8*(mb_x + uvstride*mb_y), - s->new_picture.data[2] + 8*(mb_x + uvstride*mb_y) - }; - uint8_t *ref_data[3]= { - s->last_picture.data[0] + 16*(mb_x + stride*mb_y), - s->last_picture.data[1] + 8*(mb_x + uvstride*mb_y), - s->last_picture.data[2] + 8*(mb_x + uvstride*mb_y) - }; + + init_ref(s, s->new_picture.data, s->last_picture.data, NULL, 16*mb_x, 16*mb_y, 0); assert(s->quarter_sample==0 || s->quarter_sample==1); + assert(s->linesize == s->me.stride); + assert(s->uvlinesize == s->me.uvstride); s->me.penalty_factor = get_penalty_factor(s, s->avctx->me_cmp); s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp); s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp); + s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; get_limits(s, 16*mb_x, 16*mb_y); s->me.skip=0; + /* intra / predictive decision */ + pix = c->src[0][0]; + sum = s->dsp.pix_sum(pix, s->linesize); + varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8; + + pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; + pic->mb_var [s->mb_stride * mb_y + mb_x] = varc; + s->mb_var_sum_temp += varc; + + if(s->avctx->me_threshold){ + vard= (check_input_motion(s, mb_x, mb_y, 1)+128)>>8; + + if(vard<s->avctx->me_threshold){ + 
pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard; + s->mc_mb_var_sum_temp += vard; + if (vard <= 64 || vard < varc) { //FIXME + s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc); + }else{ + s->scene_change_score+= s->qscale; + } + return; + } + if(vard<s->avctx->mb_threshold) + mb_type= s->mb_type[mb_x + mb_y*s->mb_stride]; + } + switch(s->me_method) { case ME_ZERO: default: @@ -1106,7 +1179,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, case ME_X1: case ME_EPZS: { - const int mot_stride = s->block_wrap[0]; + const int mot_stride = s->b8_stride; const int mot_xy = s->block_index[0]; P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0]; @@ -1127,51 +1200,58 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); if(s->out_format == FMT_H263){ - pred_x = P_MEDIAN[0]; - pred_y = P_MEDIAN[1]; + c->pred_x = P_MEDIAN[0]; + c->pred_y = P_MEDIAN[1]; }else { /* mpeg1 at least */ - pred_x= P_LEFT[0]; - pred_y= P_LEFT[1]; + c->pred_x= P_LEFT[0]; + c->pred_y= P_LEFT[1]; } }else{ - pred_x= P_LEFT[0]; - pred_y= P_LEFT[1]; + c->pred_x= P_LEFT[0]; + c->pred_y= P_LEFT[1]; } } - dmin = s->me.motion_search[0](s, &mx, &my, P, pred_x, pred_y, - src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty); - + dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift); + break; } - /* intra / predictive decision */ - xx = mb_x * 16; - yy = mb_y * 16; - - pix = src_data[0]; /* At this point (mx,my) are full-pell and the relative displacement */ - ppix = ref_data[0] + (my * s->linesize) + mx; - - sum = s->dsp.pix_sum(pix, s->linesize); - - varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8; + ppix = c->ref[0][0] + (my * s->linesize) + mx; + vard = (s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16)+128)>>8; -//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout); - pic->mb_var 
[s->mb_stride * mb_y + mb_x] = varc; pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard; - pic->mb_mean [s->mb_stride * mb_y + mb_x] = (sum+128)>>8; // pic->mb_cmp_score[s->mb_stride * mb_y + mb_x] = dmin; - s->mb_var_sum_temp += varc; s->mc_mb_var_sum_temp += vard; -//printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout); #if 0 printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n", varc, s->avg_mb_var, sum, vard, mx - xx, my - yy); #endif - if(s->avctx->mb_decision > FF_MB_DECISION_SIMPLE){ + if(mb_type){ + if (vard <= 64 || vard < varc) + s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc); + else + s->scene_change_score+= s->qscale; + + if(mb_type == CANDIDATE_MB_TYPE_INTER){ + s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16); + set_p_mv_tables(s, mx, my, 1); + }else{ + mx <<=shift; + my <<=shift; + } + if(mb_type == CANDIDATE_MB_TYPE_INTER4V){ + h263_mv4_search(s, mx, my, shift); + + set_p_mv_tables(s, mx, my, 0); + } + if(mb_type == CANDIDATE_MB_TYPE_INTER_I){ + interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 1); + } + }else if(s->avctx->mb_decision > FF_MB_DECISION_SIMPLE){ if (vard <= 64 || vard < varc) s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc); else @@ -1181,8 +1261,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, mb_type|= CANDIDATE_MB_TYPE_INTRA; if (varc*2 + 200 > vard){ mb_type|= CANDIDATE_MB_TYPE_INTER; - s->me.sub_motion_search(s, &mx, &my, dmin, - pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty); + s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16); if(s->flags&CODEC_FLAG_MV0) if(mx || my) mb_type |= CANDIDATE_MB_TYPE_SKIPED; //FIXME check difference @@ -1200,17 +1279,16 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, set_p_mv_tables(s, mx, my, 1); if((s->flags&CODEC_FLAG_INTERLACED_ME) && !s->me.skip){ //FIXME varc/d checks - if(interlaced_search(s, src_data, ref_data, 
s->p_field_mv_table, s->p_field_select_table, s->f_code, mx, my) < INT_MAX) + if(interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 0) < INT_MAX) mb_type |= CANDIDATE_MB_TYPE_INTER_I; } }else{ int intra_score, i; mb_type= CANDIDATE_MB_TYPE_INTER; - dmin= s->me.sub_motion_search(s, &mx, &my, dmin, - pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty); + dmin= s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16); if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) - dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, src_data, ref_data, stride, uvstride, mv_penalty); + dmin= get_mb_score(s, mx, my, 0, 0); if((s->flags&CODEC_FLAG_4MV) && !s->me.skip && varc>50 && vard>10){ @@ -1222,7 +1300,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, } if((s->flags&CODEC_FLAG_INTERLACED_ME) && !s->me.skip){ //FIXME varc/d checks - int dmin_i= interlaced_search(s, src_data, ref_data, s->p_field_mv_table, s->p_field_select_table, s->f_code, mx, my); + int dmin_i= interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 0); if(dmin_i < dmin){ mb_type = CANDIDATE_MB_TYPE_INTER_I; dmin= dmin_i; @@ -1256,7 +1334,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, int mean; if(s->out_format == FMT_H263){ - mean= (s->dc_val[i][mb_x + (mb_y+1)*(s->mb_width+2)] + 4)>>3; //FIXME not exact but simple ;) + mean= (s->dc_val[i][mb_x + mb_y*s->b8_stride] + 4)>>3; //FIXME not exact but simple ;) }else{ mean= (s->last_dc[i] + 4)>>3; } @@ -1293,28 +1371,17 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, int ff_pre_estimate_p_frame_motion(MpegEncContext * s, int mb_x, int mb_y) { + MotionEstContext * const c= &s->me; int mx, my, dmin; - int pred_x=0, pred_y=0; int P[10][2]; const int shift= 1+s->quarter_sample; - uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; const int xy= mb_x + mb_y*s->mb_stride; - const int stride= s->linesize; - const int uvstride= s->uvlinesize; - uint8_t 
*src_data[3]= { - s->new_picture.data[0] + 16*(mb_x + stride*mb_y), - s->new_picture.data[1] + 8*(mb_x + uvstride*mb_y), - s->new_picture.data[2] + 8*(mb_x + uvstride*mb_y) - }; - uint8_t *ref_data[3]= { - s->last_picture.data[0] + 16*(mb_x + stride*mb_y), - s->last_picture.data[1] + 8*(mb_x + uvstride*mb_y), - s->last_picture.data[2] + 8*(mb_x + uvstride*mb_y) - }; + init_ref(s, s->new_picture.data, s->last_picture.data, NULL, 16*mb_x, 16*mb_y, 0); assert(s->quarter_sample==0 || s->quarter_sample==1); s->me.pre_penalty_factor = get_penalty_factor(s, s->avctx->me_pre_cmp); + s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; get_limits(s, 16*mb_x, 16*mb_y); s->me.skip=0; @@ -1326,8 +1393,8 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s, /* special case for first line */ if (s->first_slice_line) { - pred_x= P_LEFT[0]; - pred_y= P_LEFT[1]; + c->pred_x= P_LEFT[0]; + c->pred_y= P_LEFT[1]; P_TOP[0]= P_TOPRIGHT[0]= P_MEDIAN[0]= P_TOP[1]= P_TOPRIGHT[1]= P_MEDIAN[1]= 0; //FIXME } else { @@ -1342,11 +1409,11 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s, P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); - pred_x = P_MEDIAN[0]; - pred_y = P_MEDIAN[1]; + c->pred_x = P_MEDIAN[0]; + c->pred_y = P_MEDIAN[1]; } - dmin = s->me.pre_motion_search(s, &mx, &my, P, pred_x, pred_y, - src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty); + + dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift); s->p_mv_table[xy][0] = mx<<shift; s->p_mv_table[xy][1] = my<<shift; @@ -1355,22 +1422,20 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s, } static int ff_estimate_motion_b(MpegEncContext * s, - int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, int f_code) + int mb_x, int mb_y, int16_t (*mv_table)[2], int ref_index, int f_code) { int mx, my, dmin; - int pred_x=0, 
pred_y=0; int P[10][2]; const int shift= 1+s->quarter_sample; const int mot_stride = s->mb_stride; const int mot_xy = mb_y*mot_stride + mb_x; - uint8_t * const ref_picture= ref_data[0] - 16*s->mb_x - 16*s->mb_y*s->linesize; //FIXME ugly uint8_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV; int mv_scale; s->me.penalty_factor = get_penalty_factor(s, s->avctx->me_cmp); s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp); s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp); + s->me.current_mv_penalty= mv_penalty; get_limits(s, 16*mb_x, 16*mb_y); @@ -1420,8 +1485,8 @@ static int ff_estimate_motion_b(MpegEncContext * s, P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); } - pred_x= P_LEFT[0]; - pred_y= P_LEFT[1]; + s->me.pred_x= P_LEFT[0]; + s->me.pred_y= P_LEFT[1]; } if(mv_table == s->b_forw_mv_table){ @@ -1430,17 +1495,15 @@ static int ff_estimate_motion_b(MpegEncContext * s, mv_scale= ((s->pb_time - s->pp_time)<<16) / (s->pp_time<<shift); } - dmin = s->me.motion_search[0](s, &mx, &my, P, pred_x, pred_y, - src_data, ref_data, stride, uvstride, s->p_mv_table, mv_scale, mv_penalty); + dmin = epzs_motion_search(s, &mx, &my, P, 0, ref_index, s->p_mv_table, mv_scale); break; } - dmin= s->me.sub_motion_search(s, &mx, &my, dmin, - pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty); + dmin= s->me.sub_motion_search(s, &mx, &my, dmin, 0, ref_index, 0, 16); if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) - dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, src_data, ref_data, stride, uvstride, mv_penalty); + dmin= get_mb_score(s, mx, my, 0, ref_index); //printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my); // s->mb_type[mb_y*s->mb_width + mb_x]= mb_type; @@ -1450,8 +1513,7 @@ static int ff_estimate_motion_b(MpegEncContext * s, return dmin; } -static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6], - 
int stride, int uvstride, +static inline int check_bidir_mv(MpegEncContext * s, int motion_fx, int motion_fy, int motion_bx, int motion_by, int pred_fx, int pred_fy, @@ -1459,15 +1521,20 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8 int size, int h) { //FIXME optimize? - //FIXME move into template? //FIXME better f_code prediction (max mv & distance) //FIXME pointers + MotionEstContext * const c= &s->me; uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame + int stride= s->me.stride; + int uvstride= s->me.uvstride; uint8_t *dest_y = s->me.scratchpad; uint8_t *ptr; int dxy; int src_x, src_y; int fbmin; + uint8_t **src_data= c->src[0]; + uint8_t **ref_data= c->ref[0]; + uint8_t **ref2_data= c->ref[2]; if(s->quarter_sample){ dxy = ((motion_fy & 3) << 2) | (motion_fx & 3); @@ -1481,7 +1548,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8 src_x = motion_bx >> 2; src_y = motion_by >> 2; - ptr = ref_data[3] + (src_y * stride) + src_x; + ptr = ref2_data[0] + (src_y * stride) + src_x; s->dsp.avg_qpel_pixels_tab[size][dxy](dest_y , ptr , stride); }else{ dxy = ((motion_fy & 1) << 1) | (motion_fx & 1); @@ -1495,7 +1562,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8 src_x = motion_bx >> 1; src_y = motion_by >> 1; - ptr = ref_data[3] + (src_y * stride) + src_x; + ptr = ref2_data[0] + (src_y * stride) + src_x; s->dsp.avg_pixels_tab[size][dxy](dest_y , ptr , stride, h); } @@ -1511,9 +1578,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8 } /* refine the bidir vectors in hq mode and return the score in both lq & hq mode*/ -static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6], - int stride, int uvstride, - int mb_x, int mb_y) +static inline int bidir_refine(MpegEncContext * s, int mb_x, int mb_y) { const int mot_stride = s->mb_stride; const int xy = mb_y 
*mot_stride + mb_x; @@ -1529,8 +1594,7 @@ static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t //FIXME do refinement and add flag - fbmin= check_bidir_mv(s, src_data, ref_data, stride, uvstride, - motion_fx, motion_fy, + fbmin= check_bidir_mv(s, motion_fx, motion_fy, motion_bx, motion_by, pred_fx, pred_fy, pred_bx, pred_by, @@ -1539,9 +1603,7 @@ static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t return fbmin; } -static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6], - int stride, int uvstride, - int mb_x, int mb_y) +static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y) { int P[10][2]; const int mot_stride = s->mb_stride; @@ -1552,8 +1614,8 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_ const int time_pb= s->pb_time; int mx, my, xmin, xmax, ymin, ymax; int16_t (*mv_table)[2]= s->b_direct_mv_table; - uint8_t * const mv_penalty= s->me.mv_penalty[1] + MAX_MV; + s->me.current_mv_penalty= s->me.mv_penalty[1] + MAX_MV; ymin= xmin=(-32)>>shift; ymax= xmax= 31>>shift; @@ -1604,6 +1666,10 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_ s->me.ymin= ymin; s->me.xmax= xmax; s->me.ymax= ymax; + s->me.flags |= FLAG_DIRECT; + s->me.sub_flags |= FLAG_DIRECT; + s->me.pred_x=0; + s->me.pred_y=0; P_LEFT[0] = clip(mv_table[mot_xy - 1][0], xmin<<shift, xmax<<shift); P_LEFT[1] = clip(mv_table[mot_xy - 1][1], ymin<<shift, ymax<<shift); @@ -1619,29 +1685,22 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_ P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); } - //FIXME direct_search ptr in context!!! 
(needed for chroma anyway or this will get messy) - if(s->flags&CODEC_FLAG_QPEL){ - dmin = simple_direct_qpel_epzs_motion_search(s, &mx, &my, P, 0, 0, - src_data, ref_data, stride, uvstride, mv_table, 1<<14, mv_penalty); - dmin = simple_direct_qpel_qpel_motion_search(s, &mx, &my, dmin, - 0, 0, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty); - - if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) - dmin= simple_direct_qpel_qpel_get_mb_score(s, mx, my, 0, 0, src_data, ref_data, stride, uvstride, mv_penalty); - }else{ - dmin = simple_direct_hpel_epzs_motion_search(s, &mx, &my, P, 0, 0, - src_data, ref_data, stride, uvstride, mv_table, 1<<15, mv_penalty); - dmin = simple_direct_hpel_hpel_motion_search(s, &mx, &my, dmin, - 0, 0, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty); - - if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) - dmin= simple_direct_hpel_hpel_get_mb_score(s, mx, my, 0, 0, src_data, ref_data, stride, uvstride, mv_penalty); - } + dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, mv_table, 1<<(16-shift)); + if(s->me.sub_flags&FLAG_QPEL) + dmin = qpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16); + else + dmin = hpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16); + + if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) + dmin= get_mb_score(s, mx, my, 0, 0); get_limits(s, 16*mb_x, 16*mb_y); //restore s->me.?min/max, maybe not needed s->b_direct_mv_table[mot_xy][0]= mx; s->b_direct_mv_table[mot_xy][1]= my; + s->me.flags &= ~FLAG_DIRECT; + s->me.sub_flags &= ~FLAG_DIRECT; + return dmin; } @@ -1651,52 +1710,89 @@ void ff_estimate_b_frame_motion(MpegEncContext * s, const int penalty_factor= s->me.mb_penalty_factor; int fmin, bmin, dmin, fbmin, bimin, fimin; int type=0; - const int stride= s->linesize; - const int uvstride= s->uvlinesize; - uint8_t *src_data[3]= { - s->new_picture.data[0] + 16*(s->mb_x + stride*s->mb_y), - s->new_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y), - s->new_picture.data[2] + 8*(s->mb_x + 
uvstride*s->mb_y) - }; - uint8_t *ref_data[6]= { - s->last_picture.data[0] + 16*(s->mb_x + stride*s->mb_y), - s->last_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y), - s->last_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y), - s->next_picture.data[0] + 16*(s->mb_x + stride*s->mb_y), - s->next_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y), - s->next_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y) - }; + const int xy = mb_y*s->mb_stride + mb_x; + init_ref(s, s->new_picture.data, s->last_picture.data, s->next_picture.data, 16*mb_x, 16*mb_y, 2); + s->me.skip=0; + if(s->avctx->me_threshold){ + int vard= (check_input_motion(s, mb_x, mb_y, 0)+128)>>8; + + if(vard<s->avctx->me_threshold){ +// pix = c->src[0][0]; +// sum = s->dsp.pix_sum(pix, s->linesize); +// varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8; + +// pic->mb_var [s->mb_stride * mb_y + mb_x] = varc; + s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] = vard; +/* pic->mb_mean [s->mb_stride * mb_y + mb_x] = (sum+128)>>8; + s->mb_var_sum_temp += varc;*/ + s->mc_mb_var_sum_temp += vard; +/* if (vard <= 64 || vard < varc) { + s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc); + }else{ + s->scene_change_score+= s->qscale; + }*/ + return; + } + if(vard<s->avctx->mb_threshold){ + type= s->mb_type[mb_y*s->mb_stride + mb_x]; + if(type == CANDIDATE_MB_TYPE_DIRECT){ + direct_search(s, mb_x, mb_y); + } + if(type == CANDIDATE_MB_TYPE_FORWARD || type == CANDIDATE_MB_TYPE_BIDIR){ + s->me.skip=0; + ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, 0, s->f_code); + } + if(type == CANDIDATE_MB_TYPE_BACKWARD || type == CANDIDATE_MB_TYPE_BIDIR){ + s->me.skip=0; + ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, 2, s->b_code); + } + if(type == CANDIDATE_MB_TYPE_FORWARD_I || type == CANDIDATE_MB_TYPE_BIDIR_I){ + s->me.skip=0; + s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; + interlaced_search(s, 0, + s->b_field_mv_table[0], 
s->b_field_select_table[0], + s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 1); + } + if(type == CANDIDATE_MB_TYPE_BACKWARD_I || type == CANDIDATE_MB_TYPE_BIDIR_I){ + s->me.skip=0; + s->me.current_mv_penalty= s->me.mv_penalty[s->b_code] + MAX_MV; + interlaced_search(s, 2, + s->b_field_mv_table[1], s->b_field_select_table[1], + s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 1); + } + return; + } + } + if (s->codec_id == CODEC_ID_MPEG4) - dmin= direct_search(s, src_data, ref_data, stride, uvstride, mb_x, mb_y); + dmin= direct_search(s, mb_x, mb_y); else dmin= INT_MAX; //FIXME penalty stuff for non mpeg4 s->me.skip=0; - fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, src_data, - ref_data, stride, uvstride, s->f_code) + 3*penalty_factor; + fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, 0, s->f_code) + 3*penalty_factor; s->me.skip=0; - bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, src_data, - ref_data+3, stride, uvstride, s->b_code) + 2*penalty_factor; + bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, 2, s->b_code) + 2*penalty_factor; //printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]); s->me.skip=0; - fbmin= bidir_refine(s, src_data, ref_data, stride, uvstride, mb_x, mb_y) + penalty_factor; + fbmin= bidir_refine(s, mb_x, mb_y) + penalty_factor; //printf("%d %d %d %d\n", dmin, fmin, bmin, fbmin); if(s->flags & CODEC_FLAG_INTERLACED_ME){ - const int xy = mb_y*s->mb_stride + mb_x; - //FIXME mb type penalty s->me.skip=0; - fimin= interlaced_search(s, src_data, ref_data , - s->b_field_mv_table[0], s->b_field_select_table[0], s->f_code, - s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]); - bimin= interlaced_search(s, src_data, ref_data+3, - s->b_field_mv_table[1], s->b_field_select_table[1], s->b_code, - s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1]); + s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; + fimin= interlaced_search(s, 0, + 
s->b_field_mv_table[0], s->b_field_select_table[0], + s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 0); + s->me.current_mv_penalty= s->me.mv_penalty[s->b_code] + MAX_MV; + bimin= interlaced_search(s, 2, + s->b_field_mv_table[1], s->b_field_select_table[1], + s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 0); }else fimin= bimin= INT_MAX; @@ -1813,11 +1909,11 @@ void ff_fix_long_p_mvs(MpegEncContext * s) //printf("%d no:%d %d//\n", clip, noclip, f_code); if(s->flags&CODEC_FLAG_4MV){ - const int wrap= 2+ s->mb_width*2; + const int wrap= s->b8_stride; /* clip / convert to intra 8x8 type MVs */ for(y=0; y<s->mb_height; y++){ - int xy= (y*2 + 1)*wrap + 1; + int xy= y*2*wrap; int i= y*s->mb_stride; int x; diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c index 49c2e57b5..8ab6c7be4 100644 --- a/src/libffmpeg/libavcodec/motion_est_template.c +++ b/src/libffmpeg/libavcodec/motion_est_template.c @@ -22,58 +22,32 @@ * @file motion_est_template.c * Motion estimation template. */ -//FIXME ref2_y next_pic? + //lets hope gcc will remove the unused vars ...(gcc 3.2.2 seems to do it ...) 
-//Note, the last line is there to kill these ugly unused var warnings #define LOAD_COMMON\ uint32_t * const score_map= s->me.score_map;\ - const int time_pp= s->pp_time;\ - const int time_pb= s->pb_time;\ const int xmin= s->me.xmin;\ const int ymin= s->me.ymin;\ const int xmax= s->me.xmax;\ const int ymax= s->me.ymax;\ - uint8_t * const src_y= src_data[0];\ - uint8_t * const src_u= src_data[1];\ - uint8_t * const src_v= src_data[2];\ - uint8_t * const ref_y= ref_data[0];\ - uint8_t * const ref_u= ref_data[1];\ - uint8_t * const ref_v= ref_data[2];\ - op_pixels_func (*hpel_put)[4];\ - op_pixels_func (*hpel_avg)[4]= &s->dsp.avg_pixels_tab[size];\ - op_pixels_func (*chroma_hpel_put)[4];\ - qpel_mc_func (*qpel_put)[16];\ - qpel_mc_func (*qpel_avg)[16]= &s->dsp.avg_qpel_pixels_tab[size];\ - const __attribute__((unused)) int unu= time_pp + time_pb + (size_t)src_u + (size_t)src_v + (size_t)ref_u + (size_t)ref_v\ - + (size_t)hpel_avg + (size_t)qpel_avg + (size_t)score_map\ - + xmin + xmax + ymin + ymax;\ - if(s->no_rounding /*FIXME b_type*/){\ - hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];\ - chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];\ - qpel_put= &s->dsp.put_no_rnd_qpel_pixels_tab[size];\ - }else{\ - hpel_put=& s->dsp.put_pixels_tab[size];\ - chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];\ - qpel_put= &s->dsp.put_qpel_pixels_tab[size];\ - } + uint8_t *mv_penalty= s->me.current_mv_penalty;\ + const int pred_x= s->me.pred_x;\ + const int pred_y= s->me.pred_y;\ - -#ifdef CMP_HPEL - #define CHECK_HALF_MV(dx, dy, x, y)\ {\ const int hx= 2*(x)+(dx);\ const int hy= 2*(y)+(dy);\ - CMP_HPEL(d, dx, dy, x, y, size);\ + d= cmp(s, x, y, dx, dy, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);\ d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\ COPY3_IF_LT(dmin, d, bx, hx, by, hy)\ } #if 0 -static int RENAME(hpel_motion_search)(MpegEncContext * s, +static int hpel_motion_search)(MpegEncContext * s, int *mx_ptr, int *my_ptr, int 
dmin, - int pred_x, int pred_y, uint8_t *ref_data[3], - int size, uint8_t * const mv_penalty) + uint8_t *ref_data[3], + int size) { const int xx = 16 * s->mb_x + 8*(n&1); const int yy = 16 * s->mb_y + 8*(n>>1); @@ -94,8 +68,8 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, hpel_put=& s->dsp.put_pixels_tab[size]; chroma_hpel_put= &s->dsp.put_pixels_tab[size+1]; } - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; cmp_sub= s->dsp.me_sub_cmp[size]; chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; @@ -138,11 +112,10 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, } #else -static int RENAME(hpel_motion_search)(MpegEncContext * s, +static int hpel_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, - int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - int size, int h, uint8_t * const mv_penalty) + int src_index, int ref_index, + int size, int h) { const int mx = *mx_ptr; const int my = *my_ptr; @@ -151,6 +124,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, int bx=2*mx, by=2*my; LOAD_COMMON + int flags= s->me.sub_flags; //FIXME factorize @@ -164,7 +138,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, } if(s->avctx->me_cmp != s->avctx->me_sub_cmp){ - CMP_HPEL(dmin, 0, 0, mx, my, size); + dmin= cmp(s, mx, my, 0, 0, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags); if(mx || my || size>0) dmin += (mv_penalty[2*mx - pred_x] + mv_penalty[2*my - pred_y])*penalty_factor; } @@ -246,14 +220,16 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, } #endif -static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - uint8_t * const mv_penalty) +static int inline get_mb_score(MpegEncContext * s, int mx, int my, int src_index, + int ref_index) { // const int 
check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp; const int size= 0; const int h= 16; const int penalty_factor= s->me.mb_penalty_factor; + const int flags= s->me.mb_flags; + const int qpel= flags & FLAG_QPEL; + const int mask= 1+2*qpel; me_cmp_func cmp_sub, chroma_cmp_sub; int d; @@ -267,7 +243,7 @@ static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre assert(!s->me.skip); assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp); - CMP_HPEL(d, mx&1, my&1, mx>>1, my>>1, size); + d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags); //FIXME check cbp before adding penalty for (0,0) vector if(mx || my || size>0) d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor; @@ -275,26 +251,19 @@ static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre return d; } -#endif /* CMP_HPEL */ - - - -#ifdef CMP_QPEL - #define CHECK_QUARTER_MV(dx, dy, x, y)\ {\ const int hx= 4*(x)+(dx);\ const int hy= 4*(y)+(dy);\ - CMP_QPEL(d, dx, dy, x, y, size);\ + d= cmp(s, x, y, dx, dy, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\ d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\ COPY3_IF_LT(dmin, d, bx, hx, by, hy)\ } -static int RENAME(qpel_motion_search)(MpegEncContext * s, +static int qpel_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, - int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - int size, int h, uint8_t * const mv_penalty) + int src_index, int ref_index, + int size, int h) { const int mx = *mx_ptr; const int my = *my_ptr; @@ -302,13 +271,14 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s, const int map_generation= s->me.map_generation; const int subpel_quality= s->avctx->me_subpel_quality; uint32_t *map= s->me.map; - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; me_cmp_func cmp_sub, chroma_cmp_sub; LOAD_COMMON + int flags= 
s->me.sub_flags; - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; //factorize FIXME + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME //FIXME factorize cmp_sub= s->dsp.me_sub_cmp[size]; @@ -321,7 +291,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s, } if(s->avctx->me_cmp != s->avctx->me_sub_cmp){ - CMP_QPEL(dmin, 0, 0, mx, my, size); + dmin= cmp(s, mx, my, 0, 0, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags); if(mx || my || size>0) dmin += (mv_penalty[4*mx - pred_x] + mv_penalty[4*my - pred_y])*penalty_factor; } @@ -386,7 +356,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s, if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == (my<<ME_MAP_MV_BITS) + mx + map_generation && 0){ //FIXME tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)]; }else{ - CMP(tl, mx-1, my-1, size); //FIXME wrong if chroma me is different + tl= cmp(s, mx-1, my-1, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);//FIXME wrong if chroma me is different } cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c; @@ -509,36 +479,6 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s, return dmin; } -static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - uint8_t * const mv_penalty) -{ - const int size= 0; - const int h= 16; - const int penalty_factor= s->me.mb_penalty_factor; - me_cmp_func cmp_sub, chroma_cmp_sub; - int d; - - LOAD_COMMON - - //FIXME factorize - - cmp_sub= s->dsp.mb_cmp[size]; - chroma_cmp_sub= s->dsp.mb_cmp[size+1]; - - assert(!s->me.skip); - assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp); - - CMP_QPEL(d, mx&3, my&3, mx>>2, my>>2, size); - //FIXME check cbp before adding penalty for (0,0) vector - if(mx || my || size>0) - d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor; - - return d; -} - - -#endif /* CMP_QPEL */ #define 
CHECK_MV(x,y)\ {\ @@ -546,7 +486,7 @@ static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ /*printf("check_mv %d %d\n", x, y);*/\ if(map[index]!=key){\ - CMP(d, x, y, size);\ + d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\ map[index]= key;\ score_map[index]= d;\ d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\ @@ -570,7 +510,7 @@ static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\ /*printf("check_mv_dir %d %d %d\n", x, y, new_dir);*/\ if(map[index]!=key){\ - CMP(d, x, y, size);\ + d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\ map[index]= key;\ score_map[index]= d;\ d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\ @@ -590,27 +530,29 @@ if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\ if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\ +#define LOAD_COMMON2\ + uint32_t *map= s->me.map;\ + const int qpel= flags&FLAG_QPEL;\ + const int shift= 1+qpel;\ -static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, int dmin, - uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - int const pred_x, int const pred_y, int const penalty_factor, - int const shift, - uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty - ) +static always_inline int small_diamond_search(MpegEncContext * s, int *best, int dmin, + int src_index, int ref_index, int const penalty_factor, + int size, int h, int flags) { - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; int next_dir=-1; LOAD_COMMON + LOAD_COMMON2 + int map_generation= 
s->me.map_generation; - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; { /* ensure that the best point is in the MAP as h/qpel refinement needs it */ const int key= (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation; const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1); if(map[index]!=key){ //this will be executed only very rarey - CMP(score_map[index], best[0], best[1], size); + score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags); map[index]= key; } } @@ -634,20 +576,18 @@ static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, in } } -static inline int RENAME(funny_diamond_search)(MpegEncContext * s, int *best, int dmin, - uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - int const pred_x, int const pred_y, int const penalty_factor, - int const shift, - uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty - ) +static int funny_diamond_search(MpegEncContext * s, int *best, int dmin, + int src_index, int ref_index, int const penalty_factor, + int size, int h, int flags) { - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; int dia_size; LOAD_COMMON + LOAD_COMMON2 + int map_generation= s->me.map_generation; - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; for(dia_size=1; dia_size<=4; dia_size++){ int dir; @@ -702,7 +642,7 @@ if(256*256*256*64 % (stats[0]+1)==0){ const int index= (((ay)<<ME_MAP_SHIFT) + (ax))&(ME_MAP_SIZE-1);\ /*printf("sab check %d %d\n", ax, ay);*/\ if(map[index]!=key){\ - CMP(d, ax, ay, size);\ + d= cmp(s, ax, ay, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\ map[index]= key;\ score_map[index]= d;\ d += (mv_penalty[((ax)<<shift)-pred_x] + mv_penalty[((ay)<<shift)-pred_y])*penalty_factor;\ @@ -726,22 +666,20 @@ 
if(256*256*256*64 % (stats[0]+1)==0){ } #define MAX_SAB_SIZE 16 -static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int dmin, - uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - int const pred_x, int const pred_y, int const penalty_factor, - int const shift, - uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty - ) +static int sab_diamond_search(MpegEncContext * s, int *best, int dmin, + int src_index, int ref_index, int const penalty_factor, + int size, int h, int flags) { - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; Minima minima[MAX_SAB_SIZE]; const int minima_count= ABS(s->me.dia_size); int i, j; LOAD_COMMON + LOAD_COMMON2 + int map_generation= s->me.map_generation; - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; for(j=i=0; i<ME_MAP_SIZE; i++){ uint32_t key= map[i]; @@ -807,20 +745,18 @@ static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int return dmin; } -static inline int RENAME(var_diamond_search)(MpegEncContext * s, int *best, int dmin, - uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, - int const pred_x, int const pred_y, int const penalty_factor, - int const shift, - uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty - ) +static int var_diamond_search(MpegEncContext * s, int *best, int dmin, + int src_index, int ref_index, int const penalty_factor, + int size, int h, int flags) { - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; int dia_size; LOAD_COMMON + LOAD_COMMON2 + int map_generation= s->me.map_generation; - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; for(dia_size=1; dia_size<=s->me.dia_size; dia_size++){ int dir, start, end; @@ -885,31 +821,42 @@ if(256*256*256*64 % (stats[0]+1)==0){ return dmin; } 
-static int RENAME(epzs_motion_search)(MpegEncContext * s, - int *mx_ptr, int *my_ptr, - int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2], - int ref_mv_scale, uint8_t * const mv_penalty) +static always_inline int diamond_search(MpegEncContext * s, int *best, int dmin, + int src_index, int ref_index, int const penalty_factor, + int size, int h, int flags){ + if(s->me.dia_size==-1) + return funny_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); + else if(s->me.dia_size<-1) + return sab_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); + else if(s->me.dia_size<2) + return small_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); + else + return var_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); +} + +static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx_ptr, int *my_ptr, + int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], + int ref_mv_scale, int flags) { int best[2]={0, 0}; - int d, dmin; - const int shift= 1+s->quarter_sample; - uint32_t *map= s->me.map; + int d, dmin; int map_generation; const int penalty_factor= s->me.penalty_factor; const int size=0; const int h=16; const int ref_mv_stride= s->mb_stride; //pass as arg FIXME const int ref_mv_xy= s->mb_x + s->mb_y*ref_mv_stride; //add to last_mv beforepassing FIXME - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; + LOAD_COMMON + LOAD_COMMON2 - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; map_generation= update_map_generation(s); - CMP(dmin, 0, 0, size); + dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags); map[0]= map_generation; score_map[0]= dmin; @@ -974,22 +921,7 @@ static int RENAME(epzs_motion_search)(MpegEncContext * 
s, } //check(best[0],best[1],0, b0) - if(s->me.dia_size==-1) - dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else if(s->me.dia_size<-1) - dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else if(s->me.dia_size<2) - dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else - dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); + dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); //check(best[0],best[1],0, b1) *mx_ptr= best[0]; @@ -999,29 +931,42 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s, return dmin; } -#ifndef CMP_DIRECT /* no 4mv search needed in direct mode */ -static int RENAME(epzs_motion_search4)(MpegEncContext * s, - int *mx_ptr, int *my_ptr, - int P[10][2], int pred_x, int pred_y, - uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2], - int ref_mv_scale, uint8_t * const mv_penalty) +//this function is dedicated to the braindamaged gcc +static inline int epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, + int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], + int ref_mv_scale) +{ +//FIXME convert other functions in the same way if faster + switch(s->me.flags){ + case 0: + return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, 0); +// case FLAG_QPEL: +// return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, FLAG_QPEL); + default: + return 
epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, s->me.flags); + } +} + +static int epzs_motion_search4(MpegEncContext * s, + int *mx_ptr, int *my_ptr, int P[10][2], + int src_index, int ref_index, int16_t (*last_mv)[2], + int ref_mv_scale) { int best[2]={0, 0}; int d, dmin; - const int shift= 1+s->quarter_sample; - uint32_t *map= s->me.map; int map_generation; const int penalty_factor= s->me.penalty_factor; const int size=1; const int h=8; const int ref_mv_stride= s->mb_stride; const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride; - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; LOAD_COMMON + int flags= s->me.flags; + LOAD_COMMON2 - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; map_generation= update_map_generation(s); @@ -1053,23 +998,7 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s, (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16) } - if(s->me.dia_size==-1) - dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else if(s->me.dia_size<-1) - dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else if(s->me.dia_size<2) - dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else - dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - + dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); *mx_ptr= best[0]; *my_ptr= best[1]; @@ -1079,28 +1008,26 @@ static int 
RENAME(epzs_motion_search4)(MpegEncContext * s, } //try to merge with above FIXME (needs PSNR test) -static int RENAME(epzs_motion_search2)(MpegEncContext * s, - int *mx_ptr, int *my_ptr, - int P[10][2], int pred_x, int pred_y, - uint8_t *src_data[3], - uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2], - int ref_mv_scale, uint8_t * const mv_penalty) +static int epzs_motion_search2(MpegEncContext * s, + int *mx_ptr, int *my_ptr, int P[10][2], + int src_index, int ref_index, int16_t (*last_mv)[2], + int ref_mv_scale) { int best[2]={0, 0}; int d, dmin; - const int shift= 1+s->quarter_sample; - uint32_t *map= s->me.map; int map_generation; const int penalty_factor= s->me.penalty_factor; const int size=0; //FIXME pass as arg const int h=8; const int ref_mv_stride= s->mb_stride; const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride; - me_cmp_func cmp, chroma_cmp; + me_cmp_func cmpf, chroma_cmpf; LOAD_COMMON + int flags= s->me.flags; + LOAD_COMMON2 - cmp= s->dsp.me_cmp[size]; - chroma_cmp= s->dsp.me_cmp[size+1]; + cmpf= s->dsp.me_cmp[size]; + chroma_cmpf= s->dsp.me_cmp[size+1]; map_generation= update_map_generation(s); @@ -1132,23 +1059,7 @@ static int RENAME(epzs_motion_search2)(MpegEncContext * s, (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16) } - if(s->me.dia_size==-1) - dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else if(s->me.dia_size<-1) - dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else if(s->me.dia_size<2) - dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - else - dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, 
uvstride, - pred_x, pred_y, penalty_factor, - shift, map, map_generation, size, h, mv_penalty); - + dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags); *mx_ptr= best[0]; *my_ptr= best[1]; @@ -1156,4 +1067,3 @@ static int RENAME(epzs_motion_search2)(MpegEncContext * s, // printf("%d %d %d \n", best[0], best[1], dmin); return dmin; } -#endif /* !CMP_DIRECT */ diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c index e39356c9d..493d1a445 100644 --- a/src/libffmpeg/libavcodec/mpeg12.c +++ b/src/libffmpeg/libavcodec/mpeg12.c @@ -249,7 +249,7 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s) { unsigned int vbv_buffer_size; unsigned int fps, v; - int n, i; + int i; uint64_t time_code; float best_aspect_error= 1E10; float aspect_ratio= av_q2d(s->avctx->sample_aspect_ratio); @@ -365,8 +365,14 @@ static inline void encode_mb_skip_run(MpegEncContext *s, int run){ static void common_init(MpegEncContext *s) { +int i; + s->y_dc_scale_table= s->c_dc_scale_table= ff_mpeg1_dc_scale_table; + + if(!s->encoding) + for(i=0;i<64;i++) + s->dsp.idct_permutation[i]=i; } void ff_mpeg1_clean_buffers(MpegEncContext *s){ @@ -500,8 +506,9 @@ void mpeg1_encode_mb(MpegEncContext *s, cbp |= 1 << (5 - i); } - if (cbp == 0 && !first_mb && (mb_x != s->mb_width - 1 || (mb_y != s->mb_height - 1 && s->codec_id == CODEC_ID_MPEG1VIDEO)) && - ((s->pict_type == P_TYPE && s->mv_type == MV_TYPE_16X16 && (motion_x | motion_y) == 0) || + if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 && + (mb_x != s->mb_width - 1 || (mb_y != s->mb_height - 1 && s->codec_id == CODEC_ID_MPEG1VIDEO)) && + ((s->pict_type == P_TYPE && (motion_x | motion_y) == 0) || (s->pict_type == B_TYPE && s->mv_dir == s->last_mv_dir && (((s->mv_dir & MV_DIR_FORWARD) ? ((s->mv[0][0][0] - s->last_mv[0][0][0])|(s->mv[0][0][1] - s->last_mv[0][0][1])) : 0) | ((s->mv_dir & MV_DIR_BACKWARD) ? 
((s->mv[1][0][0] - s->last_mv[1][0][0])|(s->mv[1][0][1] - s->last_mv[1][0][1])) : 0)) == 0))) { s->mb_skip_run++; @@ -798,7 +805,7 @@ void ff_mpeg1_encode_init(MpegEncContext *s) else{ int val, bit_size, range, code; - bit_size = s->f_code - 1; + bit_size = f_code - 1; range = 1 << bit_size; val=mv; @@ -955,7 +962,7 @@ static VLC mb_ptype_vlc; static VLC mb_btype_vlc; static VLC mb_pat_vlc; -static void init_vlcs() +static void init_vlcs(void) { static int done = 0; @@ -1754,11 +1761,17 @@ typedef struct Mpeg1Context { int repeat_field; /* true if we must repeat the field */ AVPanScan pan_scan; /** some temporary storage for the panscan */ int slice_count; + int swap_uv;//indicate VCR2 + int save_aspect_info; + } Mpeg1Context; static int mpeg_decode_init(AVCodecContext *avctx) { Mpeg1Context *s = avctx->priv_data; + MpegEncContext *s2 = &s->mpeg_enc_ctx; + + MPV_decode_defaults(s2); s->mpeg_enc_ctx.avctx= avctx; s->mpeg_enc_ctx.flags= avctx->flags; @@ -1773,6 +1786,122 @@ static int mpeg_decode_init(AVCodecContext *avctx) return 0; } +static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm, + const uint8_t *new_perm){ +uint16_t temp_matrix[64]; +int i; + + memcpy(temp_matrix,matrix,64*sizeof(uint16_t)); + + for(i=0;i<64;i++){ + matrix[new_perm[i]] = temp_matrix[old_perm[i]]; + } +} + +//Call this function when we know all parameters +//it may be called in different places for mpeg1 and mpeg2 +static int mpeg_decode_postinit(AVCodecContext *avctx){ +Mpeg1Context *s1 = avctx->priv_data; +MpegEncContext *s = &s1->mpeg_enc_ctx; +uint8_t old_permutation[64]; + + + if ( + (s1->mpeg_enc_ctx_allocated == 0)|| + avctx->width != s->width || + avctx->height != s->height|| +// s1->save_aspect_info != avctx->aspect_ratio_info|| + 0) + { + + if (s1->mpeg_enc_ctx_allocated) { + MPV_common_end(s); + } + + if( (s->width == 0 )||(s->height == 0)) + return -2; + + avctx->width = s->width; + avctx->height = s->height; + avctx->bit_rate = s->bit_rate; + 
s1->save_aspect_info = s->aspect_ratio_info; + + //low_delay may be forced, in this case we will have B frames + //that behave like P frames + avctx->has_b_frames = !(s->low_delay); + + if(avctx->sub_id==1){//s->codec_id==avctx->codec_id==CODEC_ID + //mpeg1 fps + avctx->frame_rate = frame_rate_tab[s->frame_rate_index].num; + avctx->frame_rate_base= frame_rate_tab[s->frame_rate_index].den; + //mpeg1 aspect + avctx->sample_aspect_ratio= av_d2q( + 1.0/mpeg1_aspect[s->aspect_ratio_info], 255); + + }else{//mpeg2 + //mpeg2 fps + av_reduce( + &s->avctx->frame_rate, + &s->avctx->frame_rate_base, + frame_rate_tab[s->frame_rate_index].num * (s->frame_rate_ext_n+1), + frame_rate_tab[s->frame_rate_index].den * (s->frame_rate_ext_d+1), + 1<<30); + //mpeg2 aspect + if(s->aspect_ratio_info > 1){ + if( (s1->pan_scan.width == 0 )||(s1->pan_scan.height == 0) ){ + s->avctx->sample_aspect_ratio= + av_div_q( + mpeg2_aspect[s->aspect_ratio_info], + (AVRational){s->width, s->height} + ); + }else{ + s->avctx->sample_aspect_ratio= + av_div_q( + mpeg2_aspect[s->aspect_ratio_info], + (AVRational){s1->pan_scan.width, s1->pan_scan.height} + ); + } + }else{ + s->avctx->sample_aspect_ratio= + mpeg2_aspect[s->aspect_ratio_info]; + } + }//mpeg2 + + if(avctx->xvmc_acceleration){ + avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420); + }else{ + if(s->chroma_format < 2){ + avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_420); + }else + if(s->chroma_format == 2){ + avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_422); + }else + if(s->chroma_format > 2){ + avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_444); + } + } + //until then pix_fmt may be changed right after codec init + if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT ) + if( avctx->idct_algo == FF_IDCT_AUTO ) + avctx->idct_algo = FF_IDCT_SIMPLE; + + //quantization matrixes may need reordering + //if dct permutation is changed + memcpy(old_permutation,s->dsp.idct_permutation,64*sizeof(uint8_t)); + + if (MPV_common_init(s) < 
0) + return -2; + + quant_matrix_rebuild(s->intra_matrix, old_permutation,s->dsp.idct_permutation); + quant_matrix_rebuild(s->inter_matrix, old_permutation,s->dsp.idct_permutation); + quant_matrix_rebuild(s->chroma_intra_matrix,old_permutation,s->dsp.idct_permutation); + quant_matrix_rebuild(s->chroma_inter_matrix,old_permutation,s->dsp.idct_permutation); + + s1->mpeg_enc_ctx_allocated = 1; + } + return 0; +} + /* return the 8 bit start code value and update the search state. Return -1 if no start code found */ static int find_start_code(const uint8_t **pbuf_ptr, const uint8_t *buf_end) @@ -1807,6 +1936,9 @@ static int mpeg1_decode_picture(AVCodecContext *avctx, MpegEncContext *s = &s1->mpeg_enc_ctx; int ref, f_code, vbv_delay; + if(mpeg_decode_postinit(s->avctx) < 0) + return -2; + init_get_bits(&s->gb, buf, buf_size*8); ref = get_bits(&s->gb, 10); /* temporal ref */ @@ -1845,7 +1977,6 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s) { int horiz_size_ext, vert_size_ext; int bit_rate_ext; - int frame_rate_ext_n, frame_rate_ext_d; int level, profile; skip_bits(&s->gb, 1); /* profil and level esc*/ @@ -1865,32 +1996,17 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s) s->low_delay = get_bits1(&s->gb); if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1; - frame_rate_ext_n = get_bits(&s->gb, 2); - frame_rate_ext_d = get_bits(&s->gb, 5); - av_reduce( - &s->avctx->frame_rate, - &s->avctx->frame_rate_base, - frame_rate_tab[s->frame_rate_index].num * (frame_rate_ext_n+1), - frame_rate_tab[s->frame_rate_index].den * (frame_rate_ext_d+1), - 1<<30); + s->frame_rate_ext_n = get_bits(&s->gb, 2); + s->frame_rate_ext_d = get_bits(&s->gb, 5); dprintf("sequence extension\n"); s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG2VIDEO; s->avctx->sub_id = 2; /* indicates mpeg2 found */ - if(s->aspect_ratio_info <= 1) - s->avctx->sample_aspect_ratio= mpeg2_aspect[s->aspect_ratio_info]; - else{ - s->avctx->sample_aspect_ratio= - av_div_q( - 
mpeg2_aspect[s->aspect_ratio_info], - (AVRational){s->width, s->height} - ); - } - if(s->avctx->debug & FF_DEBUG_PICT_INFO) av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n", profile, level, s->avctx->rc_buffer_size, s->bit_rate); + } static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1) @@ -1912,14 +2028,7 @@ static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1) s1->pan_scan.width= 16*w; s1->pan_scan.height=16*h; - - if(s->aspect_ratio_info > 1) - s->avctx->sample_aspect_ratio= - av_div_q( - mpeg2_aspect[s->aspect_ratio_info], - (AVRational){w, h} - ); - + if(s->avctx->debug & FF_DEBUG_PICT_INFO) av_log(s->avctx, AV_LOG_DEBUG, "sde w:%d, h:%d\n", w, h); } @@ -1927,9 +2036,23 @@ static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1) static void mpeg_decode_picture_display_extension(Mpeg1Context *s1) { MpegEncContext *s= &s1->mpeg_enc_ctx; - int i; - - for(i=0; i<1; i++){ //FIXME count + int i,nofco; + + nofco = 1; + if(s->progressive_sequence){ + if(s->repeat_first_field){ + nofco++; + if(s->top_field_first) + nofco++; + } + }else{ + if(s->picture_structure == PICT_FRAME){ + nofco++; + if(s->repeat_first_field) + nofco++; + } + } + for(i=0; i<nofco; i++){ s1->pan_scan.position[i][0]= get_sbits(&s->gb, 16); skip_bits(&s->gb, 1); //marker s1->pan_scan.position[i][1]= get_sbits(&s->gb, 16); @@ -2134,8 +2257,8 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y, s->resync_mb_x= s->resync_mb_y= -1; - if (mb_y >= s->mb_height){ - av_log(s->avctx, AV_LOG_ERROR, "slice below image (%d >= %d)\n", s->mb_y, s->mb_height); + if (mb_y<<field_pic >= s->mb_height){ + av_log(s->avctx, AV_LOG_ERROR, "slice below image (%d >= %d)\n", mb_y, s->mb_height); return -1; } @@ -2208,8 +2331,8 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y, return -1; if(s->current_picture.motion_val[0] && !s->encoding){ //note motion_val is normally NULL unless we want to extract the MVs - const int wrap = 
field_pic ? 2*s->block_wrap[0] : s->block_wrap[0]; - int xy = s->mb_x*2 + 1 + (s->mb_y*2 +1)*wrap; + const int wrap = field_pic ? 2*s->b8_stride : s->b8_stride; + int xy = s->mb_x*2 + s->mb_y*2*wrap; int motion_x, motion_y, dir, i; if(field_pic && !s->first_field) xy += wrap/2; @@ -2218,18 +2341,20 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y, for(dir=0; dir<2; dir++){ if (s->mb_intra || (dir==1 && s->pict_type != B_TYPE)) { motion_x = motion_y = 0; - }else if (s->mv_type == MV_TYPE_16X16){ + }else if (s->mv_type == MV_TYPE_16X16 || (s->mv_type == MV_TYPE_FIELD && field_pic)){ motion_x = s->mv[dir][0][0]; motion_y = s->mv[dir][0][1]; } else /*if ((s->mv_type == MV_TYPE_FIELD) || (s->mv_type == MV_TYPE_16X8))*/ { motion_x = s->mv[dir][i][0]; motion_y = s->mv[dir][i][1]; } - + s->current_picture.motion_val[dir][xy ][0] = motion_x; s->current_picture.motion_val[dir][xy ][1] = motion_y; s->current_picture.motion_val[dir][xy + 1][0] = motion_x; s->current_picture.motion_val[dir][xy + 1][1] = motion_y; + s->current_picture.ref_index [dir][xy ]= + s->current_picture.ref_index [dir][xy + 1]= s->field_select[dir][i]; } xy += wrap; } @@ -2379,59 +2504,27 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, { Mpeg1Context *s1 = avctx->priv_data; MpegEncContext *s = &s1->mpeg_enc_ctx; - int width, height, i, v, j; - float aspect; + int width,height; + int i, v, j; init_get_bits(&s->gb, buf, buf_size*8); width = get_bits(&s->gb, 12); height = get_bits(&s->gb, 12); + if (width <= 0 || height <= 0 || + (width % 2) != 0 || (height % 2) != 0) + return -1; s->aspect_ratio_info= get_bits(&s->gb, 4); if (s->aspect_ratio_info == 0) return -1; - aspect= 1.0/mpeg1_aspect[s->aspect_ratio_info]; - avctx->sample_aspect_ratio= av_d2q(aspect, 255); - s->frame_rate_index = get_bits(&s->gb, 4); if (s->frame_rate_index == 0 || s->frame_rate_index > 13) return -1; s->bit_rate = get_bits(&s->gb, 18) * 400; if (get_bits1(&s->gb) == 0) /* marker */ return -1; - if (width <= 0 || 
height <= 0 || - (width % 2) != 0 || (height % 2) != 0) - return -1; - if (width != s->width || - height != s->height) { - /* start new mpeg1 context decoding */ - s->out_format = FMT_MPEG1; - if (s1->mpeg_enc_ctx_allocated) { - MPV_common_end(s); - } - s->width = width; - s->height = height; - avctx->has_b_frames= 1; - avctx->width = width; - avctx->height = height; - avctx->frame_rate = frame_rate_tab[s->frame_rate_index].num; - avctx->frame_rate_base= frame_rate_tab[s->frame_rate_index].den; - avctx->bit_rate = s->bit_rate; - - if(avctx->xvmc_acceleration){ - avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420); - }else{ - avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_420); - } - - if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT ) - if( avctx->idct_algo == FF_IDCT_AUTO ) - avctx->idct_algo = FF_IDCT_SIMPLE; - - if (MPV_common_init(s) < 0) - return -1; - s1->mpeg_enc_ctx_allocated = 1; - s->swap_uv = 0;//just in case vcr2 and mpeg2 stream have been concatinated - } + s->width = width; + s->height = height; s->avctx->rc_buffer_size= get_bits(&s->gb, 10) * 1024*16; skip_bits(&s->gb, 1); @@ -2444,19 +2537,21 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, av_log(s->avctx, AV_LOG_ERROR, "intra matrix damaged\n"); return -1; } - j = s->intra_scantable.permutated[i]; + j = s->dsp.idct_permutation[ ff_zigzag_direct[i] ]; s->intra_matrix[j] = v; s->chroma_intra_matrix[j] = v; } #ifdef DEBUG +/* dprintf("intra matrix present\n"); for(i=0;i<64;i++) - dprintf(" %d", s->intra_matrix[s->intra_scantable.permutated[i]]); + dprintf(" %d", s->intra_matrix[s->dsp.idct_permutation[i]); printf("\n"); +*/ #endif } else { for(i=0;i<64;i++) { - int j= s->dsp.idct_permutation[i]; + j = s->dsp.idct_permutation[i]; v = ff_mpeg1_default_intra_matrix[i]; s->intra_matrix[j] = v; s->chroma_intra_matrix[j] = v; @@ -2469,15 +2564,17 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, av_log(s->avctx, AV_LOG_ERROR, "inter matrix damaged\n"); return -1; } - j = 
s->intra_scantable.permutated[i]; + j = s->dsp.idct_permutation[ ff_zigzag_direct[i] ]; s->inter_matrix[j] = v; s->chroma_inter_matrix[j] = v; } #ifdef DEBUG +/* dprintf("non intra matrix present\n"); for(i=0;i<64;i++) - dprintf(" %d", s->inter_matrix[s->intra_scantable.permutated[i]]); + dprintf(" %d", s->inter_matrix[s->dsp.idct_permutation[i]); printf("\n"); +*/ #endif } else { for(i=0;i<64;i++) { @@ -2501,6 +2598,8 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, s->chroma_format = 1; s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG1VIDEO; avctx->sub_id = 1; /* indicates mpeg1 */ + s->out_format = FMT_MPEG1; + s->swap_uv = 0;//AFAIK VCR2 don't have SEQ_HEADER if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1; if(s->avctx->debug & FF_DEBUG_PICT_INFO) @@ -2593,6 +2692,36 @@ static void mpeg_decode_user_data(AVCodecContext *avctx, } } +static void mpeg_decode_gop(AVCodecContext *avctx, + const uint8_t *buf, int buf_size){ + Mpeg1Context *s1 = avctx->priv_data; + MpegEncContext *s = &s1->mpeg_enc_ctx; + + int drop_frame_flag; + int time_code_hours, time_code_minutes; + int time_code_seconds, time_code_pictures; + int broken_link; + + init_get_bits(&s->gb, buf, buf_size*8); + + drop_frame_flag = get_bits1(&s->gb); + + time_code_hours=get_bits(&s->gb,5); + time_code_minutes = get_bits(&s->gb,6); + skip_bits1(&s->gb);//marker bit + time_code_seconds = get_bits(&s->gb,6); + time_code_pictures = get_bits(&s->gb,6); + + /*broken_link indicate that after editing the + reference frames of the first B-Frames after GOP I-Frame + are missing (open gop)*/ + broken_link = get_bits1(&s->gb); + + if(s->avctx->debug & FF_DEBUG_PICT_INFO) + av_log(s->avctx, AV_LOG_DEBUG, "GOP (%2d:%02d:%02d.[%02d]) broken_link=%d\n", + time_code_hours, time_code_minutes, time_code_seconds, + time_code_pictures, broken_link); +} /** * finds the end of the current frame in the bitstream. 
* @return the position of the first byte of the next frame, or -1 @@ -2706,7 +2835,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx, input_size = buf_end - buf_ptr; if(avctx->debug & FF_DEBUG_STARTCODE){ - av_log(avctx, AV_LOG_DEBUG, "%3X at %d left %d\n", start_code, buf_ptr-buf, input_size); + av_log(avctx, AV_LOG_DEBUG, "%3X at %zd left %d\n", start_code, buf_ptr-buf, input_size); } /* prepare data for next start code */ @@ -2731,6 +2860,8 @@ static int mpeg_decode_frame(AVCodecContext *avctx, break; case GOP_START_CODE: s2->first_field=0; + mpeg_decode_gop(avctx, + buf_ptr, input_size); break; default: if (start_code >= SLICE_MIN_START_CODE && diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c index d01405f54..a9eed4e36 100644 --- a/src/libffmpeg/libavcodec/mpegaudiodec.c +++ b/src/libffmpeg/libavcodec/mpegaudiodec.c @@ -23,7 +23,6 @@ */ //#define DEBUG -#include <math.h> #include "avcodec.h" #include "mpegaudio.h" #include "dsputil.h" @@ -401,11 +400,11 @@ static int decode_init(AVCodecContext * avctx) } /* compute n ^ (4/3) and store it in mantissa/exp format */ - if (!av_mallocz_static(&table_4_3_exp, - TABLE_4_3_SIZE * sizeof(table_4_3_exp[0]))) + table_4_3_exp= av_mallocz_static(TABLE_4_3_SIZE * sizeof(table_4_3_exp[0])); + if(!table_4_3_exp) return -1; - if (!av_mallocz_static(&table_4_3_value, - TABLE_4_3_SIZE * sizeof(table_4_3_value[0]))) + table_4_3_value= av_mallocz_static(TABLE_4_3_SIZE * sizeof(table_4_3_value[0])); + if(!table_4_3_value) return -1; int_pow_init(); diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c index 32a92917c..bef088a41 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.c +++ b/src/libffmpeg/libavcodec/mpegvideo.c @@ -283,7 +283,9 @@ static void copy_picture(Picture *dst, Picture *src){ dst->type= FF_BUFFER_TYPE_COPY; } -static void copy_picture_attributes(AVFrame *dst, AVFrame *src){ +static void copy_picture_attributes(MpegEncContext *s, 
AVFrame *dst, AVFrame *src){ + int i; + dst->pict_type = src->pict_type; dst->quality = src->quality; dst->coded_picture_number = src->coded_picture_number; @@ -292,6 +294,32 @@ static void copy_picture_attributes(AVFrame *dst, AVFrame *src){ dst->pts = src->pts; dst->interlaced_frame = src->interlaced_frame; dst->top_field_first = src->top_field_first; + + if(s->avctx->me_threshold){ + if(!src->motion_val[0]) + av_log(s->avctx, AV_LOG_ERROR, "AVFrame.motion_val not set!\n"); + if(!src->mb_type) + av_log(s->avctx, AV_LOG_ERROR, "AVFrame.mb_type not set!\n"); + if(!src->ref_index[0]) + av_log(s->avctx, AV_LOG_ERROR, "AVFrame.ref_index not set!\n"); + if(src->motion_subsample_log2 != dst->motion_subsample_log2) + av_log(s->avctx, AV_LOG_ERROR, "AVFrame.motion_subsample_log2 doesnt match! (%d!=%d)\n", + src->motion_subsample_log2, dst->motion_subsample_log2); + + memcpy(dst->mb_type, src->mb_type, s->mb_stride * s->mb_height * sizeof(dst->mb_type[0])); + + for(i=0; i<2; i++){ + int stride= ((16*s->mb_width )>>src->motion_subsample_log2) + 1; + int height= ((16*s->mb_height)>>src->motion_subsample_log2); + + if(src->motion_val[i] && src->motion_val[i] != dst->motion_val[i]){ + memcpy(dst->motion_val[i], src->motion_val[i], 2*stride*height*sizeof(int16_t)); + } + if(src->ref_index[i] && src->ref_index[i] != dst->ref_index[i]){ + memcpy(dst->ref_index[i], src->ref_index[i], s->b8_stride*2*s->mb_height*sizeof(int8_t)); + } + } + } } /** @@ -350,13 +378,14 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){ for(i=0; i<2; i++){ CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b4_array_size+2) * sizeof(int16_t)) pic->motion_val[i]= pic->motion_val_base[i]+2; - CHECKED_ALLOCZ(pic->ref_index[i] , b8_array_size * sizeof(uint8_t)) + CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t)) } pic->motion_subsample_log2= 2; }else if(s->out_format == FMT_H263 || s->encoding || (s->avctx->debug&FF_DEBUG_MV) || (s->avctx->debug_mv)){ for(i=0; i<2; i++){ - 
CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t)*2) //FIXME + CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t)) pic->motion_val[i]= pic->motion_val_base[i]+2; + CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t)) } pic->motion_subsample_log2= 3; } @@ -510,7 +539,68 @@ static void update_duplicate_context_after_me(MpegEncContext *dst, MpegEncContex #undef COPY } -/* init common structure for both encoder and decoder */ +/** + * sets the given MpegEncContext to common defaults (same for encoding and decoding). + * the changed fields will not depend upon the prior state of the MpegEncContext. + */ +static void MPV_common_defaults(MpegEncContext *s){ + s->y_dc_scale_table= + s->c_dc_scale_table= ff_mpeg1_dc_scale_table; + s->chroma_qscale_table= ff_default_chroma_qscale_table; + s->progressive_frame= 1; + s->progressive_sequence= 1; + s->picture_structure= PICT_FRAME; + + s->coded_picture_number = 0; + s->picture_number = 0; + s->input_picture_number = 0; + + s->picture_in_gop_number = 0; + + s->f_code = 1; + s->b_code = 1; +} + +/** + * sets the given MpegEncContext to defaults for decoding. + * the changed fields will not depend upon the prior state of the MpegEncContext. + */ +void MPV_decode_defaults(MpegEncContext *s){ + MPV_common_defaults(s); +} + +/** + * sets the given MpegEncContext to defaults for encoding. + * the changed fields will not depend upon the prior state of the MpegEncContext. 
+ */ + +#ifdef CONFIG_ENCODERS +void MPV_encode_defaults(MpegEncContext *s){ + static int done=0; + + MPV_common_defaults(s); + + if(!done){ + int i; + done=1; + + default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) ); + memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1)); + memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1)); + + for(i=-16; i<16; i++){ + default_fcode_tab[i + MAX_MV]= 1; + } + } + s->me.mv_penalty= default_mv_penalty; + s->fcode_tab= default_fcode_tab; +} +#endif //CONFIG_ENCODERS + +/** + * init common structure for both encoder and decoder. + * this assumes that some variables like width/height are already set + */ int MPV_common_init(MpegEncContext *s) { int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y; @@ -538,31 +628,14 @@ int MPV_common_init(MpegEncContext *s) s->block_wrap[0]= s->block_wrap[1]= s->block_wrap[2]= - s->block_wrap[3]= s->mb_width*2 + 2; + s->block_wrap[3]= s->b8_stride; s->block_wrap[4]= - s->block_wrap[5]= s->mb_width + 2; - - s->y_dc_scale_table= - s->c_dc_scale_table= ff_mpeg1_dc_scale_table; - s->chroma_qscale_table= ff_default_chroma_qscale_table; - if( s->codec_id != CODEC_ID_MPEG1VIDEO && - s->codec_id != CODEC_ID_MPEG2VIDEO) - { - /* default structure is frame */ - s->progressive_frame= 1; - s->picture_structure= PICT_FRAME; - - s->y_dc_scale_table= - s->c_dc_scale_table= ff_mpeg1_dc_scale_table; - if (!s->encoding) - s->progressive_sequence= 1; - } - s->coded_picture_number = 0; - - y_size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2); - c_size = (s->mb_width + 2) * (s->mb_height + 2); + s->block_wrap[5]= s->mb_stride; + + y_size = s->b8_stride * (2 * s->mb_height + 1); + c_size = s->mb_stride * (s->mb_height + 1); yc_size = y_size + 2 * c_size; - + /* convert fourcc to upper case */ s->avctx->codec_tag= toupper( s->avctx->codec_tag &0xFF) + (toupper((s->avctx->codec_tag>>8 )&0xFF)<<8 ) @@ -642,12 +715,14 @@ int MPV_common_init(MpegEncContext 
*s) } if (s->out_format == FMT_H263) { /* ac values */ - CHECKED_ALLOCZ(s->ac_val[0], yc_size * sizeof(int16_t) * 16); - s->ac_val[1] = s->ac_val[0] + y_size; + CHECKED_ALLOCZ(s->ac_val_base, yc_size * sizeof(int16_t) * 16); + s->ac_val[0] = s->ac_val_base + s->b8_stride + 1; + s->ac_val[1] = s->ac_val_base + y_size + s->mb_stride + 1; s->ac_val[2] = s->ac_val[1] + c_size; /* cbp values */ - CHECKED_ALLOCZ(s->coded_block, y_size); + CHECKED_ALLOCZ(s->coded_block_base, y_size); + s->coded_block= s->coded_block_base + s->b8_stride + 1; /* divx501 bitstream reorder buffer */ CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE); @@ -660,20 +735,18 @@ int MPV_common_init(MpegEncContext *s) if (s->h263_pred || s->h263_plus || !s->encoding) { /* dc values */ //MN: we need these for error resilience of intra-frames - CHECKED_ALLOCZ(s->dc_val[0], yc_size * sizeof(int16_t)); - s->dc_val[1] = s->dc_val[0] + y_size; + CHECKED_ALLOCZ(s->dc_val_base, yc_size * sizeof(int16_t)); + s->dc_val[0] = s->dc_val_base + s->b8_stride + 1; + s->dc_val[1] = s->dc_val_base + y_size + s->mb_stride + 1; s->dc_val[2] = s->dc_val[1] + c_size; for(i=0;i<yc_size;i++) - s->dc_val[0][i] = 1024; + s->dc_val_base[i] = 1024; } /* which mb is a intra block */ CHECKED_ALLOCZ(s->mbintra_table, mb_array_size); memset(s->mbintra_table, 1, mb_array_size); - /* default structure is frame */ - s->picture_structure = PICT_FRAME; - /* init macroblock skip table */ CHECKED_ALLOCZ(s->mbskip_table, mb_array_size+2); //Note the +1 is for a quicker mpeg4 slice_end detection @@ -748,9 +821,9 @@ void MPV_common_end(MpegEncContext *s) av_freep(&s->p_field_select_table[i]); } - av_freep(&s->dc_val[0]); - av_freep(&s->ac_val[0]); - av_freep(&s->coded_block); + av_freep(&s->dc_val_base); + av_freep(&s->ac_val_base); + av_freep(&s->coded_block_base); av_freep(&s->mbintra_table); av_freep(&s->cbp_table); av_freep(&s->pred_dir_table); @@ -782,9 +855,9 @@ void MPV_common_end(MpegEncContext *s) s->last_picture_ptr= 
s->next_picture_ptr= s->current_picture_ptr= NULL; + for(i=0; i<3; i++) - if (s->visualization_buffer[i]) - av_free(s->visualization_buffer[i]); + av_freep(&s->visualization_buffer[i]); } #ifdef CONFIG_ENCODERS @@ -795,6 +868,8 @@ int MPV_encode_init(AVCodecContext *avctx) MpegEncContext *s = avctx->priv_data; int i, dummy; int chroma_h_shift, chroma_v_shift; + + MPV_encode_defaults(s); avctx->pix_fmt = PIX_FMT_YUV420P; // FIXME @@ -850,8 +925,15 @@ int MPV_encode_init(AVCodecContext *avctx) if(avctx->rc_min_rate && avctx->rc_max_rate != avctx->rc_min_rate){ av_log(avctx, AV_LOG_INFO, "Warning min_rate > 0 but min_rate != max_rate isnt recommanded!\n"); - } + } + + if( s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate + && (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO) + && 90000LL * (avctx->rc_buffer_size-1) > s->avctx->rc_max_rate*0xFFFFLL){ + av_log(avctx, AV_LOG_INFO, "Warning vbv_delay will be set to 0xFFFF (=VBR) as the specified vbv buffer is too large for the given bitrate!\n"); + } + if((s->flags & CODEC_FLAG_4MV) && s->codec_id != CODEC_ID_MPEG4 && s->codec_id != CODEC_ID_H263 && s->codec_id != CODEC_ID_H263P && s->codec_id != CODEC_ID_FLV1){ av_log(avctx, AV_LOG_ERROR, "4MV not supported by codec\n"); @@ -882,7 +964,13 @@ int MPV_encode_init(AVCodecContext *avctx) av_log(avctx, AV_LOG_ERROR, "b frames not supported by codec\n"); return -1; } - + + if((s->flags & (CODEC_FLAG_INTERLACED_DCT|CODEC_FLAG_INTERLACED_ME|CODEC_FLAG_ALT_SCAN)) + && s->codec_id != CODEC_ID_MPEG4 && s->codec_id != CODEC_ID_MPEG2VIDEO){ + av_log(avctx, AV_LOG_ERROR, "interlacing not supported by codec\n"); + return -1; + } + if(s->mpeg_quant && s->codec_id != CODEC_ID_MPEG4){ //FIXME mpeg2 uses that too av_log(avctx, AV_LOG_ERROR, "mpeg2 style quantization not supporetd by codec\n"); return -1; @@ -1081,28 +1169,6 @@ int MPV_encode_init(AVCodecContext *avctx) return -1; } - { /* set up some save defaults, some codecs might override 
them later */ - static int done=0; - if(!done){ - int i; - done=1; - - default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) ); - memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1)); - memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1)); - - for(i=-16; i<16; i++){ - default_fcode_tab[i + MAX_MV]= 1; - } - } - } - s->me.mv_penalty= default_mv_penalty; - s->fcode_tab= default_fcode_tab; - - /* dont use mv_penalty table for crap MV as it would be confused */ - //FIXME remove after fixing / removing old ME - if (s->me_method < ME_EPZS) s->me.mv_penalty = default_mv_penalty; - s->encoding = 1; /* init */ @@ -1122,22 +1188,22 @@ int MPV_encode_init(AVCodecContext *avctx) ff_init_me(s); #endif /* #if 0 */ +#ifdef CONFIG_ENCODERS /* xine: do not need this for decode or MPEG-1 encoding modes */ #if 0 -#ifdef CONFIG_ENCODERS #ifdef CONFIG_RISKY if (s->out_format == FMT_H263) h263_encode_init(s); if(s->msmpeg4_version) ff_msmpeg4_encode_init(s); #endif -#endif #endif /* #if 0 */ /* xine: we do want this for MPEG-1 encoding */ if (s->out_format == FMT_MPEG1) ff_mpeg1_encode_init(s); +#endif - /* init default q matrix */ + /* init q matrix */ for(i=0;i<64;i++) { int j= s->dsp.idct_permutation[i]; #ifdef CONFIG_RISKY @@ -1170,14 +1236,7 @@ int MPV_encode_init(AVCodecContext *avctx) if(ff_rate_control_init(s) < 0) return -1; - - s->picture_number = 0; - s->input_picture_number = 0; - s->picture_in_gop_number = 0; - /* motion detector init */ - s->f_code = 1; - s->b_code = 1; - + return 0; } @@ -1321,7 +1380,7 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx) assert(s->last_picture_ptr==NULL || s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3); /* mark&release old frames */ - if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr->data[0]) { + if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr != s->next_picture_ptr && s->last_picture_ptr->data[0]) { 
avctx->release_buffer(avctx, (AVFrame*)s->last_picture_ptr); /* release forgotten pictures */ @@ -1351,7 +1410,7 @@ alloc: pic= (AVFrame*)&s->picture[i]; } - pic->reference= s->pict_type != B_TYPE ? 3 : 0; + pic->reference= s->pict_type != B_TYPE && !s->dropable ? 3 : 0; pic->coded_picture_number= s->coded_picture_number++; @@ -1373,8 +1432,14 @@ alloc: if(s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3){ if (s->pict_type != B_TYPE) { s->last_picture_ptr= s->next_picture_ptr; - s->next_picture_ptr= s->current_picture_ptr; + if(!s->dropable) + s->next_picture_ptr= s->current_picture_ptr; } +/* av_log(s->avctx, AV_LOG_DEBUG, "L%p N%p C%p L%p N%p C%p type:%d drop:%d\n", s->last_picture_ptr, s->next_picture_ptr,s->current_picture_ptr, + s->last_picture_ptr ? s->last_picture_ptr->data[0] : NULL, + s->next_picture_ptr ? s->next_picture_ptr->data[0] : NULL, + s->current_picture_ptr ? s->current_picture_ptr->data[0] : NULL, + s->pict_type, s->dropable);*/ if(s->last_picture_ptr) copy_picture(&s->last_picture, s->last_picture_ptr); if(s->next_picture_ptr) copy_picture(&s->next_picture, s->next_picture_ptr); @@ -1486,7 +1551,7 @@ void MPV_frame_end(MpegEncContext *s) * @param color color of the arrow */ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){ - int t, x, y, f; + int t, x, y, fr, f; sx= clip(sx, 0, w-1); sy= clip(sy, 0, h-1); @@ -1504,8 +1569,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h ex-= sx; f= ((ey-sy)<<16)/ex; for(x= 0; x <= ex; x++){ - y= ((x*f) + (1<<15))>>16; - buf[y*stride + x]+= color; + y = (x*f)>>16; + fr= (x*f)&0xFFFF; + buf[ y *stride + x]+= (color*(0x10000-fr))>>16; + buf[(y+1)*stride + x]+= (color* fr )>>16; } }else{ if(sy > ey){ @@ -1517,8 +1584,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h if(ey) f= ((ex-sx)<<16)/ey; else f= 0; for(y= 0; y <= ey; y++){ - x= ((y*f) + (1<<15))>>16; - buf[y*stride + x]+= 
color; + x = (y*f)>>16; + fr= (y*f)&0xFFFF; + buf[y*stride + x ]+= (color*(0x10000-fr))>>16;; + buf[y*stride + x+1]+= (color* fr )>>16;; } } } @@ -1680,12 +1749,13 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){ if(!USES_LIST(pict->mb_type[mb_index], direction)) continue; + //FIXME for h264 if(IS_8X8(pict->mb_type[mb_index])){ int i; for(i=0; i<4; i++){ int sx= mb_x*16 + 4 + 8*(i&1); int sy= mb_y*16 + 4 + 8*(i>>1); - int xy= 1 + mb_x*2 + (i&1) + (mb_y*2 + 1 + (i>>1))*(s->mb_width*2 + 2); + int xy= mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride; int mx= (pict->motion_val[direction][xy][0]>>shift) + sx; int my= (pict->motion_val[direction][xy][1]>>shift) + sy; draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100); @@ -1695,15 +1765,19 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){ for(i=0; i<2; i++){ int sx=mb_x*16 + 8; int sy=mb_y*16 + 4 + 8*i; - int xy=1 + mb_x*2 + (mb_y*2 + 1 + i)*(s->mb_width*2 + 2); - int mx=(pict->motion_val[direction][xy][0]>>shift) + sx; - int my=(pict->motion_val[direction][xy][1]>>shift) + sy; - draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100); + int xy= mb_x*2 + (mb_y*2 + i)*s->b8_stride; + int mx=(pict->motion_val[direction][xy][0]>>shift); + int my=(pict->motion_val[direction][xy][1]>>shift); + + if(IS_INTERLACED(pict->mb_type[mb_index])) + my*=2; + + draw_arrow(ptr, sx, sy, mx+sx, my+sy, s->width, s->height, s->linesize, 100); } }else{ int sx= mb_x*16 + 8; int sy= mb_y*16 + 8; - int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2); + int xy= mb_x*2 + mb_y*2*s->b8_stride; int mx= (pict->motion_val[direction][xy][0]>>shift) + sx; int my= (pict->motion_val[direction][xy][1]>>shift) + sy; draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100); @@ -1880,7 +1954,7 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){ } } } - copy_picture_attributes(pic, pic_arg); + copy_picture_attributes(s, pic, pic_arg); pic->display_picture_number= 
s->input_picture_number++; if(pic->pts != AV_NOPTS_VALUE){ @@ -2009,11 +2083,12 @@ static void select_input_picture(MpegEncContext *s){ s->reordered_input_picture[0]->data[i]= NULL; s->reordered_input_picture[0]->type= 0; - copy_picture_attributes((AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]); pic->reference = s->reordered_input_picture[0]->reference; alloc_picture(s, pic, 0); + copy_picture_attributes(s, (AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]); + s->current_picture_ptr= pic; }else{ // input is not a shared pix -> reuse buffer for current_pix @@ -2125,7 +2200,8 @@ int MPV_encode_picture(AVCodecContext *avctx, } /* update mpeg1/2 vbv_delay for CBR */ - if(s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate && s->out_format == FMT_MPEG1){ + if(s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate && s->out_format == FMT_MPEG1 + && 90000LL * (avctx->rc_buffer_size-1) <= s->avctx->rc_max_rate*0xFFFFLL){ int vbv_delay; assert(s->repeat_first_field==0); @@ -2432,9 +2508,17 @@ if(s->quarter_sample) src_y = s->mb_y*(16>>field_based) + (motion_y >> 1); if (s->out_format == FMT_H263) { - uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1); - uvsrc_x = src_x>>1; - uvsrc_y = src_y>>1; + if((s->workaround_bugs & FF_BUG_HPEL_CHROMA) && field_based){ + mx = (motion_x>>1)|(motion_x&1); + my = motion_y >>1; + uvdxy = ((my & 1) << 1) | (mx & 1); + uvsrc_x = s->mb_x* 8 + (mx >> 1); + uvsrc_y = s->mb_y*(8>>field_based) + (my >> 1); + }else{ + uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1); + uvsrc_x = src_x>>1; + uvsrc_y = src_y>>1; + } } else { mx = motion_x / 2; my = motion_y / 2; @@ -2736,8 +2820,8 @@ static inline void MPV_motion(MpegEncContext *s, if(s->obmc && s->pict_type != B_TYPE){ int16_t mv_cache[4][4][2]; const int xy= s->mb_x + s->mb_y*s->mb_stride; - const int mot_stride= s->mb_width*2 + 2; - const int mot_xy= 1 + mb_x*2 + (mb_y*2 + 1)*mot_stride; + const int mot_stride= s->b8_stride; + const int 
mot_xy= mb_x*2 + mb_y*2*mot_stride; assert(!s->mb_skiped); @@ -2993,7 +3077,7 @@ static inline void add_dequant_dct(MpegEncContext *s, */ void ff_clean_intra_table_entries(MpegEncContext *s) { - int wrap = s->block_wrap[0]; + int wrap = s->b8_stride; int xy = s->block_index[0]; s->dc_val[0][xy ] = @@ -3010,15 +3094,15 @@ void ff_clean_intra_table_entries(MpegEncContext *s) s->coded_block[xy + 1 + wrap] = 0; } /* chroma */ - wrap = s->block_wrap[4]; - xy = s->mb_x + 1 + (s->mb_y + 1) * wrap; + wrap = s->mb_stride; + xy = s->mb_x + s->mb_y * wrap; s->dc_val[1][xy] = s->dc_val[2][xy] = 1024; /* ac pred */ memset(s->ac_val[1][xy], 0, 16 * sizeof(int16_t)); memset(s->ac_val[2][xy], 0, 16 * sizeof(int16_t)); - s->mbintra_table[s->mb_x + s->mb_y*s->mb_stride]= 0; + s->mbintra_table[xy]= 0; } /* generic function called after a macroblock has been parsed by the @@ -3338,12 +3422,12 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename const int linesize= s->current_picture.linesize[0]; //not s->linesize as this woulnd be wrong for field pics const int uvlinesize= s->current_picture.linesize[1]; - s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2; - s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1) + s->mb_x*2; - s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2; - s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2) + s->mb_x*2; - s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x; - s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x; + s->block_index[0]= s->b8_stride*(s->mb_y*2 ) - 2 + s->mb_x*2; + s->block_index[1]= s->b8_stride*(s->mb_y*2 ) - 1 + s->mb_x*2; + s->block_index[2]= s->b8_stride*(s->mb_y*2 + 1) - 2 + s->mb_x*2; + s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) - 1 + s->mb_x*2; + s->block_index[4]= s->mb_stride*(s->mb_y + 1) + s->b8_stride*s->mb_height*2 + s->mb_x - 1; + s->block_index[5]= 
s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1; if(s->pict_type==B_TYPE && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME){ s->dest[0] = s->current_picture.data[0] + s->mb_x * 16 - 16; @@ -3392,7 +3476,6 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y) int dct_offset = s->linesize*8; //default for progressive frames uint8_t *ptr_y, *ptr_cb, *ptr_cr; int wrap_y, wrap_c; - int emu=0; for(i=0; i<6; i++) skip_dct[i]=0; @@ -4065,8 +4148,6 @@ static int encode_thread(AVCodecContext *c, void *arg){ ff_update_block_index(s); /* write gob / video packet header */ -/* xine: do not need this for decode or MPEG-1 encoding modes */ -#if 0 #ifdef CONFIG_RISKY if(s->rtp_mode){ int current_packet_size, is_gob_start; @@ -4121,19 +4202,25 @@ static int encode_thread(AVCodecContext *c, void *arg){ s->avctx->rtp_callback(s->ptr_lastgob, current_packet_size, 0); switch(s->codec_id){ +/* xine: do not need this for decode or MPEG-1 encoding modes */ +#if 0 case CODEC_ID_MPEG4: ff_mpeg4_encode_video_packet_header(s); ff_mpeg4_clean_buffers(s); break; +#endif /* #if 0 */ case CODEC_ID_MPEG1VIDEO: case CODEC_ID_MPEG2VIDEO: ff_mpeg1_encode_slice_header(s); ff_mpeg1_clean_buffers(s); break; +/* xine: do not need this for decode or MPEG-1 encoding modes */ +#if 0 case CODEC_ID_H263: case CODEC_ID_H263P: h263_encode_gob_header(s, mb_y); break; +#endif /* #if 0 */ } if(s->flags&CODEC_FLAG_PASS1){ @@ -4149,8 +4236,6 @@ static int encode_thread(AVCodecContext *c, void *arg){ } } #endif -#endif /* #if 0 */ - if( (s->resync_mb_x == s->mb_x) && s->resync_mb_y+1 == s->mb_y){ @@ -4615,7 +4700,6 @@ static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src) static void encode_picture(MpegEncContext *s, int picture_number) { - int mb_x, mb_y; int i, j; int bits; @@ -4651,19 +4735,18 @@ static void encode_picture(MpegEncContext *s, int picture_number) for(i=1; i<s->avctx->thread_count; i++){ 
ff_update_duplicate_context(s->thread_context[i], s); } - + + ff_init_me(s); + /* Estimate motion for every MB */ if(s->pict_type != I_TYPE){ -/* xine: do not need this for decode or MPEG-1 encoding modes */ -#if 0 - if(s->pict_type != B_TYPE){ + if(s->pict_type != B_TYPE && s->avctx->me_threshold==0){ if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){ s->avctx->execute(s->avctx, pre_estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count); } } s->avctx->execute(s->avctx, estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count); -#endif /* #if 0 */ }else /* if(s->pict_type == I_TYPE) */{ /* I-Frame */ for(i=0; i<s->mb_stride*s->mb_height; i++) @@ -5194,7 +5277,6 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise? int prev_run=0; int prev_level=0; int qmul, qadd, start_i, last_non_zero, i, dc; - const int esc_length= s->ac_esc_length; uint8_t * length; uint8_t * last_length; int lambda; @@ -5302,7 +5384,6 @@ STOP_TIMER("init rem[]") #endif for(;;){ int best_score=s->dsp.try_8x8basis(rem, weight, basis[0], 0); - int nochange_score= best_score; int best_coeff=0; int best_change=0; int run2, best_unquant_change, analyze_gradient; diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h index 171d66d83..cd42177f5 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.h +++ b/src/libffmpeg/libavcodec/mpegvideo.h @@ -138,7 +138,6 @@ typedef struct Picture{ */ uint8_t *interpolated[3]; int16_t (*motion_val_base[2])[2]; - int8_t *ref_index[2]; uint32_t *mb_type_base; #define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if theres just one type #define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) @@ -201,6 +200,10 @@ typedef struct MotionEstContext{ int co_located_mv[4][2]; ///< mv from last p frame for direct mode ME int direct_basis_mv[4][2]; uint8_t *scratchpad; ///< data area for the me algo, so that the ME doesnt need to malloc/free + 
uint8_t *best_mb; + uint8_t *temp_mb[2]; + uint8_t *temp; + int best_bits; uint32_t *map; ///< map to avoid duplicate evaluations uint32_t *score_map; ///< map to store the scores int map_generation; @@ -208,31 +211,32 @@ typedef struct MotionEstContext{ int penalty_factor; int sub_penalty_factor; int mb_penalty_factor; + int flags; + int sub_flags; + int mb_flags; int pre_pass; ///< = 1 for the pre pass int dia_size; int xmin; int xmax; int ymin; int ymax; + int pred_x; + int pred_y; + uint8_t *src[4][4]; + uint8_t *ref[4][4]; + int stride; + int uvstride; +/* cmp, chroma_cmp;*/ + op_pixels_func (*hpel_put)[4]; + op_pixels_func (*hpel_avg)[4]; + qpel_mc_func (*qpel_put)[16]; + qpel_mc_func (*qpel_avg)[16]; uint8_t (*mv_penalty)[MAX_MV*2+1]; ///< amount of bits needed to encode a MV + uint8_t *current_mv_penalty; int (*sub_motion_search)(struct MpegEncContext * s, int *mx_ptr, int *my_ptr, int dmin, - int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[6], int stride, int uvstride, - int size, int h, uint8_t * const mv_penalty); - int (*motion_search[7])(struct MpegEncContext * s, - int *mx_ptr, int *my_ptr, - int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[6], int stride, int uvstride, int16_t (*last_mv)[2], - int ref_mv_scale, uint8_t * const mv_penalty); - int (*pre_motion_search)(struct MpegEncContext * s, - int *mx_ptr, int *my_ptr, - int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[6], int stride, int uvstride, int16_t (*last_mv)[2], - int ref_mv_scale, uint8_t * const mv_penalty); - int (*get_mb_score)(struct MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3], - uint8_t *ref_data[6], int stride, int uvstride, - uint8_t * const mv_penalty); + int src_index, int ref_index, + int size, int h); }MotionEstContext; /** @@ -321,13 +325,16 @@ typedef struct MpegEncContext { Picture *current_picture_ptr; ///< pointer to the current picture uint8_t 
*visualization_buffer[3]; //< temporary buffer vor MV visualization int last_dc[3]; ///< last DC values for MPEG1 + int16_t *dc_val_base; int16_t *dc_val[3]; ///< used for mpeg4 DC prediction, all 3 arrays must be continuous int16_t dc_cache[4*5]; int y_dc_scale, c_dc_scale; uint8_t *y_dc_scale_table; ///< qscale -> y_dc_scale table uint8_t *c_dc_scale_table; ///< qscale -> c_dc_scale table const uint8_t *chroma_qscale_table; ///< qscale -> chroma_qscale (h263) + uint8_t *coded_block_base; uint8_t *coded_block; ///< used for coded block pattern prediction (msmpeg4v3, wmv1) + int16_t (*ac_val_base)[16]; int16_t (*ac_val[3])[16]; ///< used for for mpeg4 AC prediction, all 3 arrays must be continuous int ac_pred; uint8_t *prev_pict_types; ///< previous picture types in bitstream order, used for mb skip @@ -352,8 +359,9 @@ typedef struct MpegEncContext { int adaptive_quant; ///< use adaptive quantization int dquant; ///< qscale difference to prev qscale int pict_type; ///< I_TYPE, P_TYPE, B_TYPE, ... 
- int last_pict_type; + int last_pict_type; //FIXME removes int last_non_b_pict_type; ///< used for mpeg4 gmc b-frames & ratecontrol + int dropable; int frame_rate_index; int frame_rate_ext_n; ///< MPEG-2 specific framerate modificators (numerator) int frame_rate_ext_d; ///< MPEG-2 specific framerate modificators (denominator) @@ -706,6 +714,7 @@ typedef struct MpegEncContext { int DCT_common_init(MpegEncContext *s); +void MPV_decode_defaults(MpegEncContext *s); int MPV_common_init(MpegEncContext *s); void MPV_common_end(MpegEncContext *s); void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]); @@ -856,7 +865,7 @@ void mpeg4_encode_mb(MpegEncContext *s, void h263_encode_picture_header(MpegEncContext *s, int picture_number); void ff_flv_encode_picture_header(MpegEncContext *s, int picture_number); void h263_encode_gob_header(MpegEncContext * s, int mb_line); -int16_t *h263_pred_motion(MpegEncContext * s, int block, +int16_t *h263_pred_motion(MpegEncContext * s, int block, int dir, int *px, int *py); void mpeg4_pred_ac(MpegEncContext * s, DCTELEM *block, int n, int dir); diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c index b7b88c38f..c6cfebe16 100644 --- a/src/libffmpeg/libavcodec/msmpeg4.c +++ b/src/libffmpeg/libavcodec/msmpeg4.c @@ -425,7 +425,9 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number) #ifdef DEBUG intra_count = 0; +/* printf("*****frame %d:\n", frame_count++); +*/ #endif } @@ -449,7 +451,7 @@ static inline int coded_block_pred(MpegEncContext * s, int n, uint8_t **coded_bl int xy, wrap, pred, a, b, c; xy = s->block_index[n]; - wrap = s->block_wrap[0]; + wrap = s->b8_stride; /* B C * A X @@ -567,7 +569,7 @@ void msmpeg4_encode_mb(MpegEncContext * s, s->misc_bits += get_bits_diff(s); - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); msmpeg4v2_encode_motion(s, motion_x - pred_x); msmpeg4v2_encode_motion(s, motion_y - pred_y); }else{ @@ -578,7 
+580,7 @@ void msmpeg4_encode_mb(MpegEncContext * s, s->misc_bits += get_bits_diff(s); /* motion vector */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); msmpeg4_encode_motion(s, motion_x - pred_x, motion_y - pred_y); } @@ -1549,7 +1551,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) cbp|= cbpy<<2; if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C; - h263_pred_motion(s, 0, &mx, &my); + h263_pred_motion(s, 0, 0, &mx, &my); mx= msmpeg4v2_decode_motion(s, mx, 1); my= msmpeg4v2_decode_motion(s, my, 1); @@ -1637,7 +1639,7 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) s->rl_chroma_table_index = s->rl_table_index; } set_stat(ST_MV); - h263_pred_motion(s, 0, &mx, &my); + h263_pred_motion(s, 0, 0, &mx, &my); if (msmpeg4_decode_motion(s, &mx, &my) < 0) return -1; s->mv_dir = MV_DIR_FORWARD; diff --git a/src/libffmpeg/libavcodec/ppc/Makefile.am b/src/libffmpeg/libavcodec/ppc/Makefile.am index fbd734c29..50b9d802e 100644 --- a/src/libffmpeg/libavcodec/ppc/Makefile.am +++ b/src/libffmpeg/libavcodec/ppc/Makefile.am @@ -11,6 +11,7 @@ noinst_LTLIBRARIES = libavcodec_ppc.la libavcodec_ppc_src = dsputil_altivec.c \ dsputil_ppc.c \ + fdct_altivec.c \ fft_altivec.c \ idct_altivec.c \ gmc_altivec.c \ diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c index 633cae68b..1bc6fb009 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2002 Brian Foley * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -1302,6 +1302,357 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); #endif /* 
ALTIVEC_USE_REFERENCE_C_CODE */ } +int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ +POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); + int sum; +POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); + register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0); + register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; +#ifdef CONFIG_DARWIN + { + register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); + register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); + register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); + register const_vector unsigned char perm1 = (const_vector unsigned char) + (0x02, 0x03, 0x00, 0x01, + 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, + 0x0E, 0x0F, 0x0C, 0x0D); + register const_vector unsigned char perm2 = (const_vector unsigned char) + (0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B); + register const_vector unsigned char perm3 = (const_vector unsigned char) + (0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07); +#else + register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; + register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; + register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; + register const_vector unsigned char perm1 = (const_vector unsigned char) + {0x02, 0x03, 0x00, 0x01, + 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, + 0x0E, 0x0F, 0x0C, 0x0D}; + register const_vector unsigned char perm2 = (const_vector unsigned char) + {0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B}; + register 
const_vector unsigned char perm3 = (const_vector unsigned char) + {0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07}; +#endif + +#define ONEITERBUTTERFLY(i, res) \ + { \ + register vector unsigned char src1, src2, srcO; \ + register vector unsigned char dst1, dst2, dstO; \ + src1 = vec_ld(stride * i, src); \ + if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ + src2 = vec_ld((stride * i) + 16, src); \ + srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + dst1 = vec_ld(stride * i, dst); \ + if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \ + dst2 = vec_ld((stride * i) + 16, dst); \ + dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + /* promote the unsigned chars to signed shorts */ \ + /* we're in the 8x8 function, we only care for the first 8 */ \ + register vector signed short srcV = \ + (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \ + register vector signed short dstV = \ + (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ + /* substractions inside the first butterfly */ \ + register vector signed short but0 = vec_sub(srcV, dstV); \ + register vector signed short op1 = vec_perm(but0, but0, perm1); \ + register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ + register vector signed short op2 = vec_perm(but1, but1, perm2); \ + register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ + register vector signed short op3 = vec_perm(but2, but2, perm3); \ + res = vec_mladd(but2, vprod3, op3); \ + } + ONEITERBUTTERFLY(0, temp0); + ONEITERBUTTERFLY(1, temp1); + ONEITERBUTTERFLY(2, temp2); + ONEITERBUTTERFLY(3, temp3); + ONEITERBUTTERFLY(4, temp4); + ONEITERBUTTERFLY(5, temp5); + ONEITERBUTTERFLY(6, temp6); + ONEITERBUTTERFLY(7, temp7); + } +#undef ONEITERBUTTERFLY + { + register vector signed int vsum; + register vector signed short line0 = vec_add(temp0, temp1); + register vector 
signed short line1 = vec_sub(temp0, temp1); + register vector signed short line2 = vec_add(temp2, temp3); + register vector signed short line3 = vec_sub(temp2, temp3); + register vector signed short line4 = vec_add(temp4, temp5); + register vector signed short line5 = vec_sub(temp4, temp5); + register vector signed short line6 = vec_add(temp6, temp7); + register vector signed short line7 = vec_sub(temp6, temp7); + + register vector signed short line0B = vec_add(line0, line2); + register vector signed short line2B = vec_sub(line0, line2); + register vector signed short line1B = vec_add(line1, line3); + register vector signed short line3B = vec_sub(line1, line3); + register vector signed short line4B = vec_add(line4, line6); + register vector signed short line6B = vec_sub(line4, line6); + register vector signed short line5B = vec_add(line5, line7); + register vector signed short line7B = vec_sub(line5, line7); + + register vector signed short line0C = vec_add(line0B, line4B); + register vector signed short line4C = vec_sub(line0B, line4B); + register vector signed short line1C = vec_add(line1B, line5B); + register vector signed short line5C = vec_sub(line1B, line5B); + register vector signed short line2C = vec_add(line2B, line6B); + register vector signed short line6C = vec_sub(line2B, line6B); + register vector signed short line3C = vec_add(line3B, line7B); + register vector signed short line7C = vec_sub(line3B, line7B); + + vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); + vsum = vec_sum4s(vec_abs(line1C), vsum); + vsum = vec_sum4s(vec_abs(line2C), vsum); + vsum = vec_sum4s(vec_abs(line3C), vsum); + vsum = vec_sum4s(vec_abs(line4C), vsum); + vsum = vec_sum4s(vec_abs(line5C), vsum); + vsum = vec_sum4s(vec_abs(line6C), vsum); + vsum = vec_sum4s(vec_abs(line7C), vsum); + vsum = vec_sums(vsum, (vector signed int)vzero); + vsum = vec_splat(vsum, 3); + vec_ste(vsum, 0, &sum); + } +POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); + return sum; +} + +/* + 
16x8 works with 16 elements ; it allows to avoid replicating + loads, and give the compiler more rooms for scheduling. + It's only used from inside hadamard8_diff16_altivec. + + Unfortunately, it seems gcc-3.3 is a bit dumb, and + the compiled code has a LOT of spill code, it seems + gcc (unlike xlc) cannot keep everything in registers + by itself. The following code include hand-made + registers allocation. It's not clean, but on + a 7450 the resulting code is much faster (best case + fall from 700+ cycles to 550). + + xlc doesn't add spill code, but it doesn't know how to + schedule for the 7450, and its code isn't much faster than + gcc-3.3 on the 7450 (but uses 25% less instructions...) + + On the 970, the hand-made RA is still a win (arount 690 + vs. around 780), but xlc goes to around 660 on the + regular C code... +*/ + +static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { + int sum; + register vector signed short + temp0 asm ("v0"), + temp1 asm ("v1"), + temp2 asm ("v2"), + temp3 asm ("v3"), + temp4 asm ("v4"), + temp5 asm ("v5"), + temp6 asm ("v6"), + temp7 asm ("v7"); + register vector signed short + temp0S asm ("v8"), + temp1S asm ("v9"), + temp2S asm ("v10"), + temp3S asm ("v11"), + temp4S asm ("v12"), + temp5S asm ("v13"), + temp6S asm ("v14"), + temp7S asm ("v15"); + register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0); + { +#ifdef CONFIG_DARWIN + register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1); + register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1); + register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1); + register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char) + (0x02, 0x03, 0x00, 0x01, + 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, + 0x0E, 
0x0F, 0x0C, 0x0D); + register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char) + (0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B); + register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char) + (0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07); +#else + register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1}; + register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1}; + register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1}; + register const_vector unsigned char perm1 = (const_vector unsigned char) + {0x02, 0x03, 0x00, 0x01, + 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, + 0x0E, 0x0F, 0x0C, 0x0D}; + register const_vector unsigned char perm2 = (const_vector unsigned char) + {0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x08, 0x09, 0x0A, 0x0B}; + register const_vector unsigned char perm3 = (const_vector unsigned char) + {0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07}; +#endif +#define ONEITERBUTTERFLY(i, res1, res2) \ + { \ + register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \ + register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \ + src1 = vec_ld(stride * i, src); \ + src2 = vec_ld((stride * i) + 16, src); \ + register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + dst1 = vec_ld(stride * i, dst); \ + dst2 = vec_ld((stride * i) + 16, dst); \ + register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + /* promote the unsigned chars to signed shorts */ \ + register vector signed short srcV asm ("v24") = \ + (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed 
char)srcO); \ + register vector signed short dstV asm ("v25") = \ + (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \ + register vector signed short srcW asm ("v26") = \ + (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \ + register vector signed short dstW asm ("v27") = \ + (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \ + /* substractions inside the first butterfly */ \ + register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \ + register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \ + register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \ + register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \ + register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \ + register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \ + register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \ + register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \ + register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \ + register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \ + register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \ + res1 = vec_mladd(but2, vprod3, op3); \ + register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \ + res2 = vec_mladd(but2S, vprod3, op3S); \ + } + ONEITERBUTTERFLY(0, temp0, temp0S); + ONEITERBUTTERFLY(1, temp1, temp1S); + ONEITERBUTTERFLY(2, temp2, temp2S); + ONEITERBUTTERFLY(3, temp3, temp3S); + ONEITERBUTTERFLY(4, temp4, temp4S); + ONEITERBUTTERFLY(5, temp5, temp5S); + ONEITERBUTTERFLY(6, temp6, temp6S); + ONEITERBUTTERFLY(7, temp7, temp7S); + } +#undef ONEITERBUTTERFLY + { + register vector signed int vsum; + register vector signed short line0 = vec_add(temp0, temp1); + register 
vector signed short line1 = vec_sub(temp0, temp1); + register vector signed short line2 = vec_add(temp2, temp3); + register vector signed short line3 = vec_sub(temp2, temp3); + register vector signed short line4 = vec_add(temp4, temp5); + register vector signed short line5 = vec_sub(temp4, temp5); + register vector signed short line6 = vec_add(temp6, temp7); + register vector signed short line7 = vec_sub(temp6, temp7); + + register vector signed short line0B = vec_add(line0, line2); + register vector signed short line2B = vec_sub(line0, line2); + register vector signed short line1B = vec_add(line1, line3); + register vector signed short line3B = vec_sub(line1, line3); + register vector signed short line4B = vec_add(line4, line6); + register vector signed short line6B = vec_sub(line4, line6); + register vector signed short line5B = vec_add(line5, line7); + register vector signed short line7B = vec_sub(line5, line7); + + register vector signed short line0C = vec_add(line0B, line4B); + register vector signed short line4C = vec_sub(line0B, line4B); + register vector signed short line1C = vec_add(line1B, line5B); + register vector signed short line5C = vec_sub(line1B, line5B); + register vector signed short line2C = vec_add(line2B, line6B); + register vector signed short line6C = vec_sub(line2B, line6B); + register vector signed short line3C = vec_add(line3B, line7B); + register vector signed short line7C = vec_sub(line3B, line7B); + + vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); + vsum = vec_sum4s(vec_abs(line1C), vsum); + vsum = vec_sum4s(vec_abs(line2C), vsum); + vsum = vec_sum4s(vec_abs(line3C), vsum); + vsum = vec_sum4s(vec_abs(line4C), vsum); + vsum = vec_sum4s(vec_abs(line5C), vsum); + vsum = vec_sum4s(vec_abs(line6C), vsum); + vsum = vec_sum4s(vec_abs(line7C), vsum); + + register vector signed short line0S = vec_add(temp0S, temp1S); + register vector signed short line1S = vec_sub(temp0S, temp1S); + register vector signed short line2S = vec_add(temp2S, 
temp3S); + register vector signed short line3S = vec_sub(temp2S, temp3S); + register vector signed short line4S = vec_add(temp4S, temp5S); + register vector signed short line5S = vec_sub(temp4S, temp5S); + register vector signed short line6S = vec_add(temp6S, temp7S); + register vector signed short line7S = vec_sub(temp6S, temp7S); + + register vector signed short line0BS = vec_add(line0S, line2S); + register vector signed short line2BS = vec_sub(line0S, line2S); + register vector signed short line1BS = vec_add(line1S, line3S); + register vector signed short line3BS = vec_sub(line1S, line3S); + register vector signed short line4BS = vec_add(line4S, line6S); + register vector signed short line6BS = vec_sub(line4S, line6S); + register vector signed short line5BS = vec_add(line5S, line7S); + register vector signed short line7BS = vec_sub(line5S, line7S); + + register vector signed short line0CS = vec_add(line0BS, line4BS); + register vector signed short line4CS = vec_sub(line0BS, line4BS); + register vector signed short line1CS = vec_add(line1BS, line5BS); + register vector signed short line5CS = vec_sub(line1BS, line5BS); + register vector signed short line2CS = vec_add(line2BS, line6BS); + register vector signed short line6CS = vec_sub(line2BS, line6BS); + register vector signed short line3CS = vec_add(line3BS, line7BS); + register vector signed short line7CS = vec_sub(line3BS, line7BS); + + vsum = vec_sum4s(vec_abs(line0CS), vsum); + vsum = vec_sum4s(vec_abs(line1CS), vsum); + vsum = vec_sum4s(vec_abs(line2CS), vsum); + vsum = vec_sum4s(vec_abs(line3CS), vsum); + vsum = vec_sum4s(vec_abs(line4CS), vsum); + vsum = vec_sum4s(vec_abs(line5CS), vsum); + vsum = vec_sum4s(vec_abs(line6CS), vsum); + vsum = vec_sum4s(vec_abs(line7CS), vsum); + vsum = vec_sums(vsum, (vector signed int)vzero); + vsum = vec_splat(vsum, 3); + vec_ste(vsum, 0, &sum); + } + return sum; +} + +int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); + int score; +POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); + score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + if (h==16) { + dst += 8*stride; + src += 8*stride; + score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + } +POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); + return score; +} + int has_altivec(void) { #ifdef CONFIG_DARWIN diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h index 93448a1ad..e2729ab22 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2002 Brian Foley * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -45,6 +46,8 @@ extern void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); +extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h); +extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h); extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder); diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c index b8372e51e..b70de7328 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2002 Brian Foley * 
Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -59,6 +60,8 @@ static unsigned char* perfname[] = { "put_no_rnd_pixels8_xy2_altivec", "put_pixels16_xy2_altivec", "put_no_rnd_pixels16_xy2_altivec", + "hadamard8_diff8x8_altivec", + "hadamard8_diff16_altivec", "clear_blocks_dcbz32_ppc", "clear_blocks_dcbz128_ppc" }; @@ -262,7 +265,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) c->add_bytes= add_bytes_altivec; #endif /* 0 */ c->put_pixels_tab[0][0] = put_pixels16_altivec; - /* the tow functions do the same thing, so use the same code */ + /* the two functions do the same thing, so use the same code */ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; c->avg_pixels_tab[0][0] = avg_pixels16_altivec; // next one disabled as it's untested. @@ -276,6 +279,9 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) c->gmc1 = gmc1_altivec; + c->hadamard8_diff[0] = hadamard8_diff16_altivec; + c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; + #ifdef CONFIG_ENCODERS if (avctx->dct_algo == FF_DCT_AUTO || avctx->dct_algo == FF_DCT_ALTIVEC) diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h index d672edfcb..8b34c6b45 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h +++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -50,6 +50,8 @@ enum powerpc_perf_index { altivec_put_no_rnd_pixels8_xy2_num, altivec_put_pixels16_xy2_num, altivec_put_no_rnd_pixels16_xy2_num, + altivec_hadamard8_diff8x8_num, + altivec_hadamard8_diff16_num, powerpc_clear_blocks_dcbz32, 
powerpc_clear_blocks_dcbz128, powerpc_perf_total @@ -63,6 +65,8 @@ enum powerpc_data_index { }; extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; +#ifndef POWERPC_MODE_64BITS +#define POWERP_PMC_DATATYPE unsigned long #define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a)) #define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a)) #if (POWERPC_NUM_PMC_ENABLED > 2) @@ -79,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ #define POWERPC_GET_PMC5(a) do {} while (0) #define POWERPC_GET_PMC6(a) do {} while (0) #endif -#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index; +#else /* POWERPC_MODE_64BITS */ +#define POWERP_PMC_DATATYPE unsigned long long +#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a)) +#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a)) +#if (POWERPC_NUM_PMC_ENABLED > 2) +#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a)) +#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a)) +#else +#define POWERPC_GET_PMC3(a) do {} while (0) +#define POWERPC_GET_PMC4(a) do {} while (0) +#endif +#if (POWERPC_NUM_PMC_ENABLED > 4) +#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a)) +#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a)) +#else +#define POWERPC_GET_PMC5(a) do {} while (0) +#define POWERPC_GET_PMC6(a) do {} while (0) +#endif +#endif /* POWERPC_MODE_64BITS */ +#define POWERPC_PERF_DECLARE(a, cond) \ + POWERP_PMC_DATATYPE \ + pmc_start[POWERPC_NUM_PMC_ENABLED], \ + pmc_stop[POWERPC_NUM_PMC_ENABLED], \ + pmc_loop_index; #define POWERPC_PERF_START_COUNT(a, cond) do { \ POWERPC_GET_PMC6(pmc_start[5]); \ POWERPC_GET_PMC5(pmc_start[4]); \ @@ -101,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ pmc_loop_index < 
POWERPC_NUM_PMC_ENABLED; \ pmc_loop_index++) \ { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ - { \ - unsigned long diff = \ + if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ + { \ + POWERP_PMC_DATATYPE diff = \ pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ diff --git a/src/libffmpeg/libavcodec/ppc/fdct_altivec.c b/src/libffmpeg/libavcodec/ppc/fdct_altivec.c new file mode 100644 index 000000000..99df5ced3 --- /dev/null +++ b/src/libffmpeg/libavcodec/ppc/fdct_altivec.c @@ -0,0 +1,498 @@ +/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the + * AltiVec optimized library for the FFMPEG Multimedia System + * Copyright (C) 2003 James Klicman <james@klicman.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include "../common.h" +#include "../dsputil.h" +#include "dsputil_altivec.h" +#include "gcc_fixes.h" + + +#define vs16(v) ((vector signed short)(v)) +#define vs32(v) ((vector signed int)(v)) +#define vu8(v) ((vector unsigned char)(v)) +#define vu16(v) ((vector unsigned short)(v)) +#define vu32(v) ((vector unsigned int)(v)) + + +#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */ +#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */ +#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */ +#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */ +#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */ +#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */ +#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */ +#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */ + + +#define W0 -(2 * C2) +#define W1 (2 * C6) +#define W2 (SQRT_2 * C6) +#define W3 (SQRT_2 * C3) +#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7)) +#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7)) +#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7)) +#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7)) +#define W8 (SQRT_2 * ( C7 - C3)) +#define W9 (SQRT_2 * (-C1 - C3)) +#define WA (SQRT_2 * (-C3 - C5)) +#define WB (SQRT_2 * ( C5 - C3)) + + +static vector float fdctconsts[3] = { + (vector float)AVV( W0, W1, W2, W3 ), + (vector float)AVV( W4, W5, W6, W7 ), + (vector float)AVV( W8, W9, WA, WB ) +}; + +#define LD_W0 vec_splat(cnsts0, 0) +#define LD_W1 vec_splat(cnsts0, 1) +#define LD_W2 vec_splat(cnsts0, 2) +#define LD_W3 vec_splat(cnsts0, 3) +#define LD_W4 vec_splat(cnsts1, 0) +#define LD_W5 vec_splat(cnsts1, 1) +#define LD_W6 vec_splat(cnsts1, 2) +#define LD_W7 vec_splat(cnsts1, 3) +#define LD_W8 vec_splat(cnsts2, 0) +#define LD_W9 vec_splat(cnsts2, 1) +#define LD_WA 
vec_splat(cnsts2, 2) +#define LD_WB vec_splat(cnsts2, 3) + + +#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ + x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ + x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ + x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ + x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ + x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ + x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ + x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ + x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ + \ + b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ + b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ + b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ + b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ + \ + b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ + b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ + b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ + cnst = LD_W2; \ + b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ + cnst = LD_W1; \ + b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ + cnst = LD_W0; \ + b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ + \ + x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ + x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ + x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ + x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ + x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ + cnst = LD_W3; \ + x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ + \ + cnst = LD_W8; \ + x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ + cnst = LD_W9; \ + x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ + cnst = LD_WA; \ + x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ + cnst = LD_WB; \ + x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ + \ + cnst = LD_W4; \ + b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ + cnst = LD_W5; \ + b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ + cnst = LD_W6; \ + b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ + cnst = LD_W7; \ + b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ + \ + b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ 
\ + b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \ + b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \ + b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \ + /* }}} */ + +#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ + x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ + x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ + x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ + x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ + x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ + x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ + x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ + x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ + \ + b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ + b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ + b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ + b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ + \ + b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ + b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ + b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ + cnst = LD_W2; \ + b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ + cnst = LD_W1; \ + b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ + cnst = LD_W0; \ + b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ + \ + x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ + x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ + x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ + x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ + x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ + cnst = LD_W3; \ + x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ + \ + cnst = LD_W8; \ + x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ + cnst = LD_W9; \ + x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ + cnst = LD_WA; \ + x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ + cnst = LD_WB; \ + x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ + \ + cnst = LD_W4; \ + b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ + cnst = LD_W5; \ + b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ + cnst = LD_W6; \ + b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ + cnst = LD_W7; \ + b1 = 
vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ + \ + b7 = vec_add(b7, x2); /* b7 += x2; */ \ + b5 = vec_add(b5, x3); /* b5 += x3; */ \ + b3 = vec_add(b3, x2); /* b3 += x2; */ \ + b1 = vec_add(b1, x3); /* b1 += x3; */ \ + /* }}} */ + + + +/* two dimensional discrete cosine transform */ + +void fdct_altivec(int16_t *block) +{ +POWERPC_PERF_DECLARE(altivec_fdct, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +POWERPC_PERF_START_COUNT(altivec_fdct, 1); + void ff_jpeg_fdct_islow(int16_t *block); + ff_jpeg_fdct_islow(block); +POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + vector signed short *bp; + vector float *cp; + vector float b00, b10, b20, b30, b40, b50, b60, b70; + vector float b01, b11, b21, b31, b41, b51, b61, b71; + vector float mzero, cnst, cnsts0, cnsts1, cnsts2; + vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; + + POWERPC_PERF_START_COUNT(altivec_fdct, 1); + + + /* setup constants {{{ */ + /* mzero = -0.0 */ + vu32(mzero) = vec_splat_u32(-1); + vu32(mzero) = vec_sl(vu32(mzero), vu32(mzero)); + cp = fdctconsts; + cnsts0 = vec_ld(0, cp); cp++; + cnsts1 = vec_ld(0, cp); cp++; + cnsts2 = vec_ld(0, cp); + /* }}} */ + + + /* 8x8 matrix transpose (vector short[8]) {{{ */ +#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) + + bp = (vector signed short*)block; + vs16(b00) = vec_ld(0, bp); + vs16(b40) = vec_ld(16*4, bp); + vs16(b01) = MERGE_S16(h, b00, b40); + vs16(b11) = MERGE_S16(l, b00, b40); + bp++; + vs16(b10) = vec_ld(0, bp); + vs16(b50) = vec_ld(16*4, bp); + vs16(b21) = MERGE_S16(h, b10, b50); + vs16(b31) = MERGE_S16(l, b10, b50); + bp++; + vs16(b20) = vec_ld(0, bp); + vs16(b60) = vec_ld(16*4, bp); + vs16(b41) = MERGE_S16(h, b20, b60); + vs16(b51) = MERGE_S16(l, b20, b60); + bp++; + vs16(b30) = vec_ld(0, bp); + vs16(b70) = vec_ld(16*4, bp); + vs16(b61) = MERGE_S16(h, b30, b70); + vs16(b71) = MERGE_S16(l, b30, b70); + + vs16(x0) = MERGE_S16(h, b01, b41); + vs16(x1) = MERGE_S16(l, b01, b41); + vs16(x2) = MERGE_S16(h, 
b11, b51); + vs16(x3) = MERGE_S16(l, b11, b51); + vs16(x4) = MERGE_S16(h, b21, b61); + vs16(x5) = MERGE_S16(l, b21, b61); + vs16(x6) = MERGE_S16(h, b31, b71); + vs16(x7) = MERGE_S16(l, b31, b71); + + vs16(b00) = MERGE_S16(h, x0, x4); + vs16(b10) = MERGE_S16(l, x0, x4); + vs16(b20) = MERGE_S16(h, x1, x5); + vs16(b30) = MERGE_S16(l, x1, x5); + vs16(b40) = MERGE_S16(h, x2, x6); + vs16(b50) = MERGE_S16(l, x2, x6); + vs16(b60) = MERGE_S16(h, x3, x7); + vs16(b70) = MERGE_S16(l, x3, x7); + +#undef MERGE_S16 + /* }}} */ + + +/* Some of the initial calculations can be done as vector short before + * conversion to vector float. The following code section takes advantage + * of this. + */ +#if 1 + /* fdct rows {{{ */ + vs16(x0) = vec_add(vs16(b00), vs16(b70)); + vs16(x7) = vec_sub(vs16(b00), vs16(b70)); + vs16(x1) = vec_add(vs16(b10), vs16(b60)); + vs16(x6) = vec_sub(vs16(b10), vs16(b60)); + vs16(x2) = vec_add(vs16(b20), vs16(b50)); + vs16(x5) = vec_sub(vs16(b20), vs16(b50)); + vs16(x3) = vec_add(vs16(b30), vs16(b40)); + vs16(x4) = vec_sub(vs16(b30), vs16(b40)); + + vs16(b70) = vec_add(vs16(x0), vs16(x3)); + vs16(b10) = vec_add(vs16(x1), vs16(x2)); + + vs16(b00) = vec_add(vs16(b70), vs16(b10)); + vs16(b40) = vec_sub(vs16(b70), vs16(b10)); + +#define CTF0(n) \ + vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \ + vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \ + b##n##1 = vec_ctf(vs32(b##n##1), 0); \ + b##n##0 = vec_ctf(vs32(b##n##0), 0); + + CTF0(0); + CTF0(4); + + vs16(b20) = vec_sub(vs16(x0), vs16(x3)); + vs16(b60) = vec_sub(vs16(x1), vs16(x2)); + + CTF0(2); + CTF0(6); + +#undef CTF0 + + x0 = vec_add(b60, b20); + x1 = vec_add(b61, b21); + + cnst = LD_W2; + x0 = vec_madd(cnst, x0, mzero); + x1 = vec_madd(cnst, x1, mzero); + cnst = LD_W1; + b20 = vec_madd(cnst, b20, x0); + b21 = vec_madd(cnst, b21, x1); + cnst = LD_W0; + b60 = vec_madd(cnst, b60, x0); + b61 = vec_madd(cnst, b61, x1); + +#define CTFX(x,b) \ + vs32(b##0) = vec_unpackh(vs16(x)); \ + vs32(b##1) = vec_unpackl(vs16(x)); 
\ + b##0 = vec_ctf(vs32(b##0), 0); \ + b##1 = vec_ctf(vs32(b##1), 0); \ + + CTFX(x4, b7); + CTFX(x5, b5); + CTFX(x6, b3); + CTFX(x7, b1); + +#undef CTFX + + + x0 = vec_add(b70, b10); + x1 = vec_add(b50, b30); + x2 = vec_add(b70, b30); + x3 = vec_add(b50, b10); + x8 = vec_add(x2, x3); + cnst = LD_W3; + x8 = vec_madd(cnst, x8, mzero); + + cnst = LD_W8; + x0 = vec_madd(cnst, x0, mzero); + cnst = LD_W9; + x1 = vec_madd(cnst, x1, mzero); + cnst = LD_WA; + x2 = vec_madd(cnst, x2, x8); + cnst = LD_WB; + x3 = vec_madd(cnst, x3, x8); + + cnst = LD_W4; + b70 = vec_madd(cnst, b70, x0); + cnst = LD_W5; + b50 = vec_madd(cnst, b50, x1); + cnst = LD_W6; + b30 = vec_madd(cnst, b30, x1); + cnst = LD_W7; + b10 = vec_madd(cnst, b10, x0); + + b70 = vec_add(b70, x2); + b50 = vec_add(b50, x3); + b30 = vec_add(b30, x2); + b10 = vec_add(b10, x3); + + + x0 = vec_add(b71, b11); + x1 = vec_add(b51, b31); + x2 = vec_add(b71, b31); + x3 = vec_add(b51, b11); + x8 = vec_add(x2, x3); + cnst = LD_W3; + x8 = vec_madd(cnst, x8, mzero); + + cnst = LD_W8; + x0 = vec_madd(cnst, x0, mzero); + cnst = LD_W9; + x1 = vec_madd(cnst, x1, mzero); + cnst = LD_WA; + x2 = vec_madd(cnst, x2, x8); + cnst = LD_WB; + x3 = vec_madd(cnst, x3, x8); + + cnst = LD_W4; + b71 = vec_madd(cnst, b71, x0); + cnst = LD_W5; + b51 = vec_madd(cnst, b51, x1); + cnst = LD_W6; + b31 = vec_madd(cnst, b31, x1); + cnst = LD_W7; + b11 = vec_madd(cnst, b11, x0); + + b71 = vec_add(b71, x2); + b51 = vec_add(b51, x3); + b31 = vec_add(b31, x2); + b11 = vec_add(b11, x3); + /* }}} */ +#else + /* convert to float {{{ */ +#define CTF(n) \ + vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \ + vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \ + b##n##1 = vec_ctf(vs32(b##n##1), 0); \ + b##n##0 = vec_ctf(vs32(b##n##0), 0); \ + + CTF(0); + CTF(1); + CTF(2); + CTF(3); + CTF(4); + CTF(5); + CTF(6); + CTF(7); + +#undef CTF + /* }}} */ + + FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70); + FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71); +#endif + + + /* 8x8 
matrix transpose (vector float[8][2]) {{{ */ + x0 = vec_mergel(b00, b20); + x1 = vec_mergeh(b00, b20); + x2 = vec_mergel(b10, b30); + x3 = vec_mergeh(b10, b30); + + b00 = vec_mergeh(x1, x3); + b10 = vec_mergel(x1, x3); + b20 = vec_mergeh(x0, x2); + b30 = vec_mergel(x0, x2); + + x4 = vec_mergel(b41, b61); + x5 = vec_mergeh(b41, b61); + x6 = vec_mergel(b51, b71); + x7 = vec_mergeh(b51, b71); + + b41 = vec_mergeh(x5, x7); + b51 = vec_mergel(x5, x7); + b61 = vec_mergeh(x4, x6); + b71 = vec_mergel(x4, x6); + + x0 = vec_mergel(b01, b21); + x1 = vec_mergeh(b01, b21); + x2 = vec_mergel(b11, b31); + x3 = vec_mergeh(b11, b31); + + x4 = vec_mergel(b40, b60); + x5 = vec_mergeh(b40, b60); + x6 = vec_mergel(b50, b70); + x7 = vec_mergeh(b50, b70); + + b40 = vec_mergeh(x1, x3); + b50 = vec_mergel(x1, x3); + b60 = vec_mergeh(x0, x2); + b70 = vec_mergel(x0, x2); + + b01 = vec_mergeh(x5, x7); + b11 = vec_mergel(x5, x7); + b21 = vec_mergeh(x4, x6); + b31 = vec_mergel(x4, x6); + /* }}} */ + + + FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); + FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); + + + /* round, convert back to short {{{ */ +#define CTS(n) \ + b##n##0 = vec_round(b##n##0); \ + b##n##1 = vec_round(b##n##1); \ + vs32(b##n##0) = vec_cts(b##n##0, 0); \ + vs32(b##n##1) = vec_cts(b##n##1, 0); \ + vs16(b##n##0) = vec_pack(vs32(b##n##0), vs32(b##n##1)); \ + vec_st(vs16(b##n##0), 0, bp); + + bp = (vector signed short*)block; + CTS(0); bp++; + CTS(1); bp++; + CTS(2); bp++; + CTS(3); bp++; + CTS(4); bp++; + CTS(5); bp++; + CTS(6); bp++; + CTS(7); + +#undef CTS + /* }}} */ + +POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* vim:set foldmethod=marker foldlevel=0: */ diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c index 51b387792..91e744af9 100644 --- a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c @@ -1,6 +1,9 @@ /* * 
Copyright (c) 2002 Dieter Shirley * + * dct_unquantize_h263_altivec: + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c index ce4bf8a47..c8269eb9a 100644 --- a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c +++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c @@ -1,84 +1,86 @@ -/*
- * Copyright (c) 2002 Dieter Shirley
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include "../dsputil.h"
-#include "../mpegvideo.h"
-#include <time.h>
-
-#ifdef HAVE_ALTIVEC
-#include "dsputil_altivec.h"
-#endif
-
-extern int dct_quantize_altivec(MpegEncContext *s,
- DCTELEM *block, int n,
- int qscale, int *overflow);
+/* + * Copyright (c) 2002 Dieter Shirley + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "../dsputil.h" +#include "../mpegvideo.h" +#include <time.h> + +#ifdef HAVE_ALTIVEC +#include "dsputil_altivec.h" +#endif + +extern int dct_quantize_altivec(MpegEncContext *s, + DCTELEM *block, int n, + int qscale, int *overflow); extern void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale); -
-extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
-
-
-void MPV_common_init_ppc(MpegEncContext *s)
-{
+ +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); + + +void MPV_common_init_ppc(MpegEncContext *s) +{ #ifdef HAVE_ALTIVEC - if (has_altivec())
- {
- if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
- (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
- {
- s->dsp.idct_put = idct_put_altivec;
- s->dsp.idct_add = idct_add_altivec;
+ if (has_altivec()) + { + if ((s->avctx->idct_algo == FF_IDCT_AUTO) || + (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) + { + s->dsp.idct_put = idct_put_altivec; + s->dsp.idct_add = idct_add_altivec; #ifndef ALTIVEC_USE_REFERENCE_C_CODE - s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; #else /* ALTIVEC_USE_REFERENCE_C_CODE */ s->dsp.idct_permutation_type = FF_NO_IDCT_PERM; #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ - }
-
- // Test to make sure that the dct required alignments are met.
- if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
- (((long)(s->q_inter_matrix) & 0x0f) != 0))
- {
- av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
- "to use Altivec DCT. Reverting to non-altivec version.\n");
- return;
- }
-
- if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
- {
- av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
- "to use Altivec DCT. Reverting to non-altivec version.\n");
- return;
- }
-
-
- if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
- (s->avctx->dct_algo == FF_DCT_ALTIVEC))
- {
- s->dct_quantize = dct_quantize_altivec;
+ } + + // Test to make sure that the dct required alignments are met. + if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || + (((long)(s->q_inter_matrix) & 0x0f) != 0)) + { + av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " + "to use Altivec DCT. Reverting to non-altivec version.\n"); + return; + } + + if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) + { + av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " + "to use Altivec DCT. Reverting to non-altivec version.\n"); + return; + } + + + if ((s->avctx->dct_algo == FF_DCT_AUTO) || + (s->avctx->dct_algo == FF_DCT_ALTIVEC)) + { +#if 0 /* seems to cause trouble under some circumstances */ + s->dct_quantize = dct_quantize_altivec; +#endif s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; - }
- } else
-#endif
- {
- /* Non-AltiVec PPC optimisations here */
- }
-}
-
+ } + } else +#endif + { + /* Non-AltiVec PPC optimisations here */ + } +} + diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c index 11c9734fc..b67ec3974 100644 --- a/src/libffmpeg/libavcodec/rv10.c +++ b/src/libffmpeg/libavcodec/rv10.c @@ -388,6 +388,10 @@ static int rv20_decode_picture_header(MpegEncContext *s) // return -1; } seq= get_bits(&s->gb, 15); + if (s->avctx->sub_id == 0x20201002 && get_bits(&s->gb, 1)){ + av_log(s->avctx, AV_LOG_ERROR, "unknown bit4 set\n"); +// return -1; + } mb_pos= get_bits(&s->gb, av_log2(s->mb_num-1)+1); s->mb_x= mb_pos % s->mb_width; s->mb_y= mb_pos / s->mb_width; @@ -395,7 +399,7 @@ static int rv20_decode_picture_header(MpegEncContext *s) seq= get_bits(&s->gb, 8)*128; mb_pos= ff_h263_decode_mba(s); } -//printf("%d\n", seq); +//av_log(s->avctx, AV_LOG_DEBUG, "%d\n", seq); seq |= s->time &~0x7FFF; if(seq - s->time > 0x4000) seq -= 0x8000; if(seq - s->time < -0x4000) seq += 0x8000; @@ -414,7 +418,10 @@ static int rv20_decode_picture_header(MpegEncContext *s) } } // printf("%d %d %d %d %d\n", seq, (int)s->time, (int)s->last_non_b_time, s->pp_time, s->pb_time); - +/*for(i=0; i<32; i++){ + av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb)); +} +av_log(s->avctx, AV_LOG_DEBUG, "\n");*/ s->no_rounding= get_bits1(&s->gb); s->f_code = 1; @@ -441,6 +448,8 @@ static int rv10_decode_init(AVCodecContext *avctx) MpegEncContext *s = avctx->priv_data; static int done=0; + MPV_decode_defaults(s); + s->avctx= avctx; s->out_format = FMT_H263; s->codec_id= avctx->codec_id; @@ -476,6 +485,7 @@ static int rv10_decode_init(AVCodecContext *avctx) s->low_delay=1; break; case 0x20200002: + case 0x20201002: case 0x30202002: case 0x30203002: s->low_delay=0; @@ -490,8 +500,6 @@ static int rv10_decode_init(AVCodecContext *avctx) h263_decode_init_vlc(s); - s->progressive_sequence=1; - /* init rv vlc */ if (!done) { init_vlc(&rv_dc_lum, DC_VLC_BITS, 256, @@ -556,10 +564,6 @@ static int rv10_decode_packet(AVCodecContext *avctx, 
return -1; } - if(s->pict_type == B_TYPE){ //FIXME remove after cleaning mottion_val indexing - memset(s->current_picture.motion_val[0], 0, sizeof(int16_t)*2*(s->mb_width*2+2)*(s->mb_height*2+2)); - } - #ifdef DEBUG printf("qscale=%d\n", s->qscale); #endif @@ -592,9 +596,9 @@ static int rv10_decode_packet(AVCodecContext *avctx, s->block_wrap[0]= s->block_wrap[1]= s->block_wrap[2]= - s->block_wrap[3]= s->mb_width*2 + 2; + s->block_wrap[3]= s->b8_stride; s->block_wrap[4]= - s->block_wrap[5]= s->mb_width + 2; + s->block_wrap[5]= s->mb_stride; ff_init_block_index(s); /* decode each macroblock */ @@ -669,10 +673,6 @@ static int rv10_decode_frame(AVCodecContext *avctx, return -1; } - if(s->pict_type == B_TYPE){ //FIXME remove after cleaning mottion_val indexing - memset(s->current_picture.motion_val[0], 0, sizeof(int16_t)*2*(s->mb_width*2+2)*(s->mb_height*2+2)); - } - if(s->mb_y>=s->mb_height){ MPV_frame_end(s); diff --git a/src/libffmpeg/libavcodec/smc.c b/src/libffmpeg/libavcodec/smc.c index 87db50005..e937b03c8 100644 --- a/src/libffmpeg/libavcodec/smc.c +++ b/src/libffmpeg/libavcodec/smc.c @@ -36,9 +36,6 @@ #include "avcodec.h" #include "dsputil.h" -#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log -#define fprintf(...) 
{} - #define CPAIR 2 #define CQUAD 4 #define COCTET 8 @@ -75,7 +72,7 @@ typedef struct SmcContext { total_blocks--; \ if (total_blocks < 0) \ { \ - printf("warning: block counter just went negative (this should not happen)\n"); \ + av_log(s->avctx, AV_LOG_INFO, "warning: block counter just went negative (this should not happen)\n"); \ return; \ } \ } @@ -124,7 +121,7 @@ static void smc_decode_stream(SmcContext *s) chunk_size = BE_32(&s->buf[stream_ptr]) & 0x00FFFFFF; stream_ptr += 4; if (chunk_size != s->size) - printf("warning: MOV chunk size != encoded chunk size (%d != %d); using MOV chunk size\n", + av_log(s->avctx, AV_LOG_INFO, "warning: MOV chunk size != encoded chunk size (%d != %d); using MOV chunk size\n", chunk_size, s->size); chunk_size = s->size; @@ -135,13 +132,13 @@ static void smc_decode_stream(SmcContext *s) /* sanity checks */ /* make sure stream ptr hasn't gone out of bounds */ if (stream_ptr > chunk_size) { - printf("SMC decoder just went out of bounds (stream ptr = %d, chunk size = %d)\n", + av_log(s->avctx, AV_LOG_INFO, "SMC decoder just went out of bounds (stream ptr = %d, chunk size = %d)\n", stream_ptr, chunk_size); return; } /* make sure the row pointer hasn't gone wild */ if (row_ptr >= image_size) { - printf("SMC decoder just went out of bounds (row ptr = %d, height = %d)\n", + av_log(s->avctx, AV_LOG_INFO, "SMC decoder just went out of bounds (row ptr = %d, height = %d)\n", row_ptr, image_size); return; } @@ -164,7 +161,7 @@ static void smc_decode_stream(SmcContext *s) /* sanity check */ if ((row_ptr == 0) && (pixel_ptr == 0)) { - printf("encountered repeat block opcode (%02X) but no blocks rendered yet\n", + av_log(s->avctx, AV_LOG_INFO, "encountered repeat block opcode (%02X) but no blocks rendered yet\n", opcode & 0xF0); break; } @@ -198,7 +195,7 @@ static void smc_decode_stream(SmcContext *s) /* sanity check */ if ((row_ptr == 0) && (pixel_ptr < 2 * 4)) { - printf("encountered repeat block opcode (%02X) but not enough blocks rendered 
yet\n", + av_log(s->avctx, AV_LOG_INFO, "encountered repeat block opcode (%02X) but not enough blocks rendered yet\n", opcode & 0xF0); break; } @@ -425,7 +422,7 @@ static void smc_decode_stream(SmcContext *s) break; case 0xF0: - printf("0xF0 opcode seen in SMC chunk (xine developers would like to know)\n"); + av_log(s->avctx, AV_LOG_INFO, "0xF0 opcode seen in SMC chunk (contact the developers)\n"); break; } } @@ -462,7 +459,7 @@ static int smc_decode_frame(AVCodecContext *avctx, s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE | FF_BUFFER_HINTS_REUSABLE | FF_BUFFER_HINTS_READABLE; if (avctx->reget_buffer(avctx, &s->frame)) { - printf ("reget_buffer() failed\n"); + av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n"); return -1; } diff --git a/src/libffmpeg/libavcodec/sparc/Makefile.am b/src/libffmpeg/libavcodec/sparc/Makefile.am new file mode 100644 index 000000000..cdf16e3ad --- /dev/null +++ b/src/libffmpeg/libavcodec/sparc/Makefile.am @@ -0,0 +1,15 @@ +include $(top_srcdir)/misc/Makefile.common + +AM_CFLAGS = $(LIBFFMPEG_CFLAGS) +ASFLAGS = + +noinst_LTLIBRARIES = libavcodec_sparc.la + +libavcodec_sparc_src = dsputil_vis.c +libavcodec_sparc_dummy = libavcodec_sparc_dummy.c + +EXTRA_DIST = $(libavcodec_sparc_src) $(libavcodec_sparc_dummy) vis.h + +sparc_modules = $(libavcodec_sparc_src) + +libavcodec_sparc_la_SOURCES = $(sparc_modules) $(libavcodec_sparc_dummy) diff --git a/src/libffmpeg/libavcodec/sparc/dsputil_vis.c b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c new file mode 100644 index 000000000..434cf74ac --- /dev/null +++ b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c @@ -0,0 +1,4107 @@ +/* + * dsputil_vis.c + * Copyright (C) 2003 David S. Miller <davem@redhat.com> + * + * This file is part of ffmpeg, a free MPEG-4 video stream decoder. + * See http://ffmpeg.sourceforge.net/ for updates. 
+ * + * ffmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * ffmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the Lesser GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* The *no_round* functions have been added by James A. Morrison, 2003,2004. + The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison. + */ + +#include "config.h" + +#ifdef ARCH_SPARC + +#include <inttypes.h> +#include <signal.h> +#include <setjmp.h> + +#include "../dsputil.h" + +#include "vis.h" + +/* The trick used in some of this file is the formula from the MMX + * motion comp code, which is: + * + * (x+y+1)>>1 == (x|y)-((x^y)>>1) + * + * This allows us to average 8 bytes at a time in a 64-bit FPU reg. + * We avoid overflows by masking before we do the shift, and we + * implement the shift by multiplying by 1/2 using mul8x16. 
So in + * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask + * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and + * the value 0x80808080 is in f8): + * + * fxor f0, f2, f10 + * fand f10, f4, f10 + * fmul8x16 f8, f10, f10 + * fand f10, f6, f10 + * for f0, f2, f12 + * fpsub16 f12, f10, f10 + */ + +#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd))) + +#define DUP4(x) {x, x, x, x} +#define DUP8(x) {x, x, x, x, x, x, x, x} +static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1); +static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2); +static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3); +static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6); +static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe); +static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f); +static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128); +static const int16_t constants256_512[] ATTR_ALIGN(8) = + {256, 512, 256, 512}; +static const int16_t constants256_1024[] ATTR_ALIGN(8) = + {256, 1024, 256, 1024}; + +#define REF_0 0 +#define REF_0_1 1 +#define REF_2 2 +#define REF_2_1 3 +#define REF_4 4 +#define REF_4_1 5 +#define REF_6 6 +#define REF_6_1 7 +#define REF_S0 8 +#define REF_S0_1 9 +#define REF_S2 10 +#define REF_S2_1 11 +#define REF_S4 12 +#define REF_S4_1 13 +#define REF_S6 14 +#define REF_S6_1 15 +#define DST_0 16 +#define DST_1 17 +#define DST_2 18 +#define DST_3 19 +#define CONST_1 20 +#define CONST_2 20 +#define CONST_3 20 +#define CONST_6 20 +#define MASK_fe 20 +#define CONST_128 22 +#define CONST_256 22 +#define CONST_512 22 +#define CONST_1024 22 +#define TMP0 24 +#define TMP1 25 +#define TMP2 26 +#define TMP3 27 +#define TMP4 28 +#define TMP5 29 +#define ZERO 30 +#define MASK_7f 30 + +#define TMP6 32 +#define TMP8 34 +#define TMP10 36 +#define TMP12 38 +#define TMP14 40 +#define TMP16 42 +#define TMP18 44 +#define TMP20 46 +#define TMP22 48 +#define TMP24 50 +#define TMP26 52 +#define TMP28 54 +#define TMP30 
56 +#define TMP32 58 + +static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + do { /* 5 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + + vis_faligndata(TMP0, TMP2, REF_0); + vis_st64(REF_0, dest[0]); + + vis_faligndata(TMP2, TMP4, REF_2); + vis_st64_2(REF_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + do { /* 4 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + ref += stride; + + /* stall */ + + vis_faligndata(TMP0, TMP2, REF_0); + vis_st64(REF_0, dest[0]); + dest += stride; + } while (--height); +} + + +static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + int stride_8 = stride + 8; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(dest[0], DST_0); + + vis_ld64(dest[8], DST_2); + + vis_ld64(constants_fe[0], MASK_fe); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP2, TMP4, REF_2); + + vis_ld64(constants128[0], CONST_128); + + ref += stride; + height = (height >> 1) - 1; + + do { /* 24 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP6, MASK_fe, TMP6); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_xor(DST_2, REF_2, TMP8); + + vis_and(TMP8, MASK_fe, TMP8); + + vis_or(DST_0, REF_0, TMP10); + vis_ld64_2(dest, stride, DST_0); + vis_mul8x16(CONST_128, TMP8, TMP8); + + vis_or(DST_2, REF_2, TMP12); + vis_ld64_2(dest, stride_8, DST_2); + + vis_ld64(ref[0], TMP14); + vis_and(TMP6, MASK_7f, 
TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_psub16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_psub16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + + dest += stride; + vis_ld64_2(ref, 8, TMP16); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 16, TMP18); + vis_faligndata(TMP2, TMP4, REF_2); + ref += stride; + + vis_xor(DST_0, REF_0, TMP20); + + vis_and(TMP20, MASK_fe, TMP20); + + vis_xor(DST_2, REF_2, TMP22); + vis_mul8x16(CONST_128, TMP20, TMP20); + + vis_and(TMP22, MASK_fe, TMP22); + + vis_or(DST_0, REF_0, TMP24); + vis_mul8x16(CONST_128, TMP22, TMP22); + + vis_or(DST_2, REF_2, TMP26); + + vis_ld64_2(dest, stride, DST_0); + vis_faligndata(TMP14, TMP16, REF_0); + + vis_ld64_2(dest, stride_8, DST_2); + vis_faligndata(TMP16, TMP18, REF_2); + + vis_and(TMP20, MASK_7f, TMP20); + + vis_and(TMP22, MASK_7f, TMP22); + + vis_psub16(TMP24, TMP20, TMP20); + vis_st64(TMP20, dest[0]); + + vis_psub16(TMP26, TMP22, TMP22); + vis_st64_2(TMP22, dest, 8); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP6, MASK_fe, TMP6); + + vis_ld64_2(ref, 16, TMP4); + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_xor(DST_2, REF_2, TMP8); + + vis_and(TMP8, MASK_fe, TMP8); + + vis_or(DST_0, REF_0, TMP10); + vis_ld64_2(dest, stride, DST_0); + vis_mul8x16(CONST_128, TMP8, TMP8); + + vis_or(DST_2, REF_2, TMP12); + vis_ld64_2(dest, stride_8, DST_2); + + vis_ld64(ref[0], TMP14); + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_psub16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_psub16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + + dest += stride; + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_2); + + vis_xor(DST_0, REF_0, TMP20); + + vis_and(TMP20, MASK_fe, TMP20); + + vis_xor(DST_2, REF_2, TMP22); + vis_mul8x16(CONST_128, TMP20, TMP20); + + vis_and(TMP22, MASK_fe, TMP22); + + vis_or(DST_0, REF_0, TMP24); + 
vis_mul8x16(CONST_128, TMP22, TMP22); + + vis_or(DST_2, REF_2, TMP26); + + vis_and(TMP20, MASK_7f, TMP20); + + vis_and(TMP22, MASK_7f, TMP22); + + vis_psub16(TMP24, TMP20, TMP20); + vis_st64(TMP20, dest[0]); + + vis_psub16(TMP26, TMP22, TMP22); + vis_st64_2(TMP22, dest, 8); +} + +static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_ld64(dest[0], DST_0); + + vis_ld64(constants_fe[0], MASK_fe); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants128[0], CONST_128); + + ref += stride; + height = (height >> 1) - 1; + + do { /* 12 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP4); + + vis_ld64(ref[8], TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_or(DST_0, REF_0, TMP6); + vis_ld64_2(dest, stride, DST_0); + ref += stride; + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_ld64(ref[0], TMP12); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(ref[8], TMP2); + vis_xor(DST_0, REF_0, TMP0); + ref += stride; + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_psub16(TMP6, TMP4, TMP4); + vis_st64(TMP4, dest[0]); + dest += stride; + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_or(DST_0, REF_0, TMP6); + vis_ld64_2(dest, stride, DST_0); + + vis_faligndata(TMP12, TMP2, REF_0); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_psub16(TMP6, TMP0, TMP4); + vis_st64(TMP4, dest[0]); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP4); + + vis_ld64(ref[8], TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_or(DST_0, REF_0, TMP6); + vis_ld64_2(dest, stride, DST_0); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_xor(DST_0, REF_0, TMP0); + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_psub16(TMP6, TMP4, TMP4); + vis_st64(TMP4, dest[0]); 
+ dest += stride; + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_or(DST_0, REF_0, TMP6); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_psub16(TMP6, TMP0, TMP4); + vis_st64(TMP4, dest[0]); +} + +static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64_2(ref, 16, TMP4); + + vis_ld64(constants_fe[0], MASK_fe); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants128[0], CONST_128); + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + ref += stride; + height = (height >> 1) - 1; + + do { /* 34 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP8); + + vis_ld64_2(ref, 16, TMP4); + vis_and(TMP6, MASK_fe, TMP6); + ref += stride; + + vis_ld64(ref[0], TMP14); + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_ld64_2(ref, 8, TMP16); + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_or(REF_0, REF_2, TMP10); + + vis_ld64_2(ref, 16, TMP18); + ref += stride; + vis_or(REF_4, REF_6, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_psub16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_psub16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + dest += 
stride; + + vis_xor(REF_0, REF_2, TMP6); + + vis_xor(REF_4, REF_6, TMP8); + + vis_and(TMP6, MASK_fe, TMP6); + + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_or(REF_0, REF_2, TMP10); + + vis_or(REF_4, REF_6, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP14, TMP16, REF_0); + + vis_faligndata(TMP16, TMP18, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP14, TMP16, REF_2); + vis_faligndata(TMP16, TMP18, REF_6); + } else { + vis_src1(TMP16, REF_2); + vis_src1(TMP18, REF_6); + } + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_psub16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_psub16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP8); + + vis_ld64_2(ref, 16, TMP4); + vis_and(TMP6, MASK_fe, TMP6); + + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_or(REF_0, REF_2, TMP10); + + vis_or(REF_4, REF_6, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_psub16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_psub16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + dest += stride; + + vis_xor(REF_0, REF_2, TMP6); + + vis_xor(REF_4, REF_6, TMP8); + + vis_and(TMP6, MASK_fe, TMP6); + + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_or(REF_0, REF_2, TMP10); + + vis_or(REF_4, REF_6, 
TMP12); + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_psub16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_psub16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); +} + +static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_ld64(constants_fe[0], MASK_fe); + + vis_ld64(constants_7f[0], MASK_7f); + + vis_ld64(constants128[0], CONST_128); + vis_faligndata(TMP0, TMP2, REF_0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + } else { + vis_src1(TMP2, REF_2); + } + + ref += stride; + height = (height >> 1) - 1; + + do { /* 20 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP4, MASK_fe, TMP4); + ref += stride; + + vis_ld64(ref[0], TMP8); + vis_or(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, 8, TMP10); + ref += stride; + vis_faligndata(TMP0, TMP2, REF_0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + } else { + vis_src1(TMP2, REF_2); + } + + vis_and(TMP4, MASK_7f, TMP4); + + vis_psub16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_or(REF_0, REF_2, TMP14); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_alignaddr_g0((void *)off); + vis_faligndata(TMP8, TMP10, REF_0); + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP8, TMP10, REF_2); + } else { + vis_src1(TMP10, REF_2); + } + + vis_and(TMP12, MASK_7f, TMP12); + + vis_psub16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + } while (--height); + + 
vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_or(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP0, TMP2, REF_0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + } else { + vis_src1(TMP2, REF_2); + } + + vis_and(TMP4, MASK_7f, TMP4); + + vis_psub16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_or(REF_0, REF_2, TMP14); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_psub16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; +} + +static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + vis_ld64(constants3[0], CONST_3); + vis_fzero(ZERO); + vis_ld64(constants256_512[0], CONST_256); + + ref = vis_alignaddr(ref); + do { /* 26 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_alignaddr_g0((void *)off); + + vis_ld64(ref[16], TMP4); + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(dest[8], DST_2); + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + vis_mul8x16au(REF_0, CONST_256, TMP0); + + vis_pmerge(ZERO, REF_2, TMP4); + vis_mul8x16au(REF_0_1, CONST_256, TMP2); + + vis_pmerge(ZERO, REF_2_1, TMP6); + + vis_padd16(TMP0, TMP4, TMP0); + + vis_mul8x16al(DST_0, CONST_512, TMP4); + vis_padd16(TMP2, TMP6, TMP2); + + vis_mul8x16al(DST_1, CONST_512, TMP6); + + vis_mul8x16au(REF_6, 
CONST_256, TMP12); + + vis_padd16(TMP0, TMP4, TMP0); + vis_mul8x16au(REF_6_1, CONST_256, TMP14); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_4, CONST_256, TMP16); + + vis_padd16(TMP0, CONST_3, TMP8); + vis_mul8x16au(REF_4_1, CONST_256, TMP18); + + vis_padd16(TMP2, CONST_3, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_padd16(TMP16, TMP12, TMP0); + + vis_st64(DST_0, dest[0]); + vis_mul8x16al(DST_2, CONST_512, TMP4); + vis_padd16(TMP18, TMP14, TMP2); + + vis_mul8x16al(DST_3, CONST_512, TMP6); + vis_padd16(TMP0, CONST_3, TMP0); + + vis_padd16(TMP2, CONST_3, TMP2); + + vis_padd16(TMP0, TMP4, TMP0); + + vis_padd16(TMP2, TMP6, TMP2); + vis_pack16(TMP0, DST_2); + + vis_pack16(TMP2, DST_3); + vis_st64(DST_2, dest[8]); + + ref += stride; + dest += stride; + } while (--height); +} + +static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_times_2 = stride << 1; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + vis_ld64(constants3[0], CONST_3); + vis_fzero(ZERO); + vis_ld64(constants256_512[0], CONST_256); + + ref = vis_alignaddr(ref); + height >>= 2; + do { /* 47 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + + vis_alignaddr_g0((void *)off); + + vis_ld64(ref[0], TMP4); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 8, TMP6); + ref += stride; + + vis_ld64(ref[0], TMP8); + + vis_ld64_2(ref, 8, TMP10); + ref += stride; + vis_faligndata(TMP4, TMP6, REF_4); + + vis_ld64(ref[0], TMP12); + + vis_ld64_2(ref, 8, TMP14); + ref += stride; + vis_faligndata(TMP8, TMP10, REF_S0); + + vis_faligndata(TMP12, TMP14, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP0, TMP2, REF_2); + + vis_ld64_2(dest, stride, DST_2); + vis_faligndata(TMP4, TMP6, REF_6); + + 
vis_faligndata(TMP8, TMP10, REF_S2); + + vis_faligndata(TMP12, TMP14, REF_S6); + } else { + vis_ld64(dest[0], DST_0); + vis_src1(TMP2, REF_2); + + vis_ld64_2(dest, stride, DST_2); + vis_src1(TMP6, REF_6); + + vis_src1(TMP10, REF_S2); + + vis_src1(TMP14, REF_S6); + } + + vis_pmerge(ZERO, REF_0, TMP0); + vis_mul8x16au(REF_0_1, CONST_256, TMP2); + + vis_pmerge(ZERO, REF_2, TMP4); + vis_mul8x16au(REF_2_1, CONST_256, TMP6); + + vis_padd16(TMP0, CONST_3, TMP0); + vis_mul8x16al(DST_0, CONST_512, TMP16); + + vis_padd16(TMP2, CONST_3, TMP2); + vis_mul8x16al(DST_1, CONST_512, TMP18); + + vis_padd16(TMP0, TMP4, TMP0); + vis_mul8x16au(REF_4, CONST_256, TMP8); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_4_1, CONST_256, TMP10); + + vis_padd16(TMP0, TMP16, TMP0); + vis_mul8x16au(REF_6, CONST_256, TMP12); + + vis_padd16(TMP2, TMP18, TMP2); + vis_mul8x16au(REF_6_1, CONST_256, TMP14); + + vis_padd16(TMP8, CONST_3, TMP8); + vis_mul8x16al(DST_2, CONST_512, TMP16); + + vis_padd16(TMP8, TMP12, TMP8); + vis_mul8x16al(DST_3, CONST_512, TMP18); + + vis_padd16(TMP10, TMP14, TMP10); + vis_pack16(TMP0, DST_0); + + vis_pack16(TMP2, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + vis_padd16(TMP10, CONST_3, TMP10); + + vis_ld64_2(dest, stride, DST_0); + vis_padd16(TMP8, TMP16, TMP8); + + vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); + vis_padd16(TMP10, TMP18, TMP10); + vis_pack16(TMP8, DST_2); + + vis_pack16(TMP10, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + + vis_mul8x16au(REF_S0_1, CONST_256, TMP2); + vis_pmerge(ZERO, REF_S0, TMP0); + + vis_pmerge(ZERO, REF_S2, TMP24); + vis_mul8x16au(REF_S2_1, CONST_256, TMP6); + + vis_padd16(TMP0, CONST_3, TMP0); + vis_mul8x16au(REF_S4, CONST_256, TMP8); + + vis_padd16(TMP2, CONST_3, TMP2); + vis_mul8x16au(REF_S4_1, CONST_256, TMP10); + + vis_padd16(TMP0, TMP24, TMP0); + vis_mul8x16au(REF_S6, CONST_256, TMP12); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_S6_1, CONST_256, TMP14); + + vis_padd16(TMP8, CONST_3, TMP8); 
+ vis_mul8x16al(DST_0, CONST_512, TMP16); + + vis_padd16(TMP10, CONST_3, TMP10); + vis_mul8x16al(DST_1, CONST_512, TMP18); + + vis_padd16(TMP8, TMP12, TMP8); + vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); + + vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); + vis_padd16(TMP0, TMP16, TMP0); + + vis_padd16(TMP2, TMP18, TMP2); + vis_pack16(TMP0, DST_0); + + vis_padd16(TMP10, TMP14, TMP10); + vis_pack16(TMP2, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_padd16(TMP8, TMP20, TMP8); + + vis_padd16(TMP10, TMP22, TMP10); + vis_pack16(TMP8, DST_2); + + vis_pack16(TMP10, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + + vis_ld64(ref[0], TMP6); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 8, TMP8); + vis_faligndata(TMP2, TMP4, REF_4); + + vis_ld64_2(ref, 16, TMP10); + ref += stride; + + vis_ld64(constants_fe[0], MASK_fe); + vis_faligndata(TMP6, TMP8, REF_2); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP8, TMP10, REF_6); + + vis_ld64(constants128[0], CONST_128); + height = (height >> 1) - 1; + do { /* 24 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP12); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP16); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + vis_or(REF_0, REF_2, TMP14); + + vis_ld64(ref[0], TMP6); + vis_or(REF_4, REF_6, TMP18); + + vis_ld64_2(ref, 8, TMP8); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 16, TMP10); + ref += stride; + vis_faligndata(TMP2, TMP4, REF_4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_and(TMP16, MASK_fe, TMP16); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_mul8x16(CONST_128, TMP16, TMP16); + vis_xor(REF_0, REF_2, TMP0); + + vis_xor(REF_4, REF_6, TMP2); 
+ + vis_or(REF_0, REF_2, TMP20); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_and(TMP16, MASK_7f, TMP16); + + vis_psub16(TMP14, TMP12, TMP12); + vis_st64(TMP12, dest[0]); + + vis_psub16(TMP18, TMP16, TMP16); + vis_st64_2(TMP16, dest, 8); + dest += stride; + + vis_or(REF_4, REF_6, TMP18); + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP2, MASK_fe, TMP2); + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_faligndata(TMP6, TMP8, REF_2); + vis_mul8x16(CONST_128, TMP2, TMP2); + + vis_faligndata(TMP8, TMP10, REF_6); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_and(TMP2, MASK_7f, TMP2); + + vis_psub16(TMP20, TMP0, TMP0); + vis_st64(TMP0, dest[0]); + + vis_psub16(TMP18, TMP2, TMP2); + vis_st64_2(TMP2, dest, 8); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP12); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP16); + + vis_ld64_2(ref, 16, TMP4); + vis_or(REF_0, REF_2, TMP14); + + vis_or(REF_4, REF_6, TMP18); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_and(TMP16, MASK_fe, TMP16); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_mul8x16(CONST_128, TMP16, TMP16); + vis_xor(REF_0, REF_2, TMP0); + + vis_xor(REF_4, REF_6, TMP2); + + vis_or(REF_0, REF_2, TMP20); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_and(TMP16, MASK_7f, TMP16); + + vis_psub16(TMP14, TMP12, TMP12); + vis_st64(TMP12, dest[0]); + + vis_psub16(TMP18, TMP16, TMP16); + vis_st64_2(TMP16, dest, 8); + dest += stride; + + vis_or(REF_4, REF_6, TMP18); + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP2, MASK_fe, TMP2); + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_mul8x16(CONST_128, TMP2, TMP2); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_and(TMP2, MASK_7f, TMP2); + + vis_psub16(TMP20, TMP0, TMP0); + vis_st64(TMP0, dest[0]); + + vis_psub16(TMP18, TMP2, TMP2); + vis_st64_2(TMP2, dest, 8); +} + +static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + 
uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + + vis_ld64(ref[0], TMP4); + + vis_ld64_2(ref, 8, TMP6); + ref += stride; + + vis_ld64(constants_fe[0], MASK_fe); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP4, TMP6, REF_2); + + vis_ld64(constants128[0], CONST_128); + height = (height >> 1) - 1; + do { /* 12 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + vis_and(TMP4, MASK_fe, TMP4); + + vis_or(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_faligndata(TMP0, TMP2, REF_0); + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_mul8x16(CONST_128, TMP12, TMP12); + vis_or(REF_0, REF_2, TMP14); + + vis_psub16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_faligndata(TMP0, TMP2, REF_2); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_psub16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_or(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_mul8x16(CONST_128, TMP12, TMP12); + vis_or(REF_0, REF_2, TMP14); + + vis_psub16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_and(TMP12, MASK_7f, TMP12); + + vis_psub16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); +} + +static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + int stride_8 = stride + 8; + int stride_16 = stride 
+ 16; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(constants3[0], CONST_3); + vis_faligndata(TMP0, TMP2, REF_2); + + vis_ld64(constants256_512[0], CONST_256); + vis_faligndata(TMP2, TMP4, REF_6); + height >>= 1; + + do { /* 31 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_pmerge(ZERO, REF_2, TMP12); + vis_mul8x16au(REF_2_1, CONST_256, TMP14); + + vis_ld64_2(ref, stride_8, TMP2); + vis_pmerge(ZERO, REF_6, TMP16); + vis_mul8x16au(REF_6_1, CONST_256, TMP18); + + vis_ld64_2(ref, stride_16, TMP4); + ref += stride; + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(dest, 8, DST_2); + vis_faligndata(TMP2, TMP4, REF_4); + + vis_ld64_2(ref, stride, TMP6); + vis_pmerge(ZERO, REF_0, TMP0); + vis_mul8x16au(REF_0_1, CONST_256, TMP2); + + vis_ld64_2(ref, stride_8, TMP8); + vis_pmerge(ZERO, REF_4, TMP4); + + vis_ld64_2(ref, stride_16, TMP10); + ref += stride; + + vis_ld64_2(dest, stride, REF_S0/*DST_4*/); + vis_faligndata(TMP6, TMP8, REF_2); + vis_mul8x16au(REF_4_1, CONST_256, TMP6); + + vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/); + vis_faligndata(TMP8, TMP10, REF_6); + vis_mul8x16al(DST_0, CONST_512, TMP20); + + vis_padd16(TMP0, CONST_3, TMP0); + vis_mul8x16al(DST_1, CONST_512, TMP22); + + vis_padd16(TMP2, CONST_3, TMP2); + vis_mul8x16al(DST_2, CONST_512, TMP24); + + vis_padd16(TMP4, CONST_3, TMP4); + vis_mul8x16al(DST_3, CONST_512, TMP26); + + vis_padd16(TMP6, CONST_3, TMP6); + + vis_padd16(TMP12, TMP20, TMP12); + vis_mul8x16al(REF_S0, CONST_512, TMP20); + + vis_padd16(TMP14, TMP22, TMP14); + vis_mul8x16al(REF_S0_1, CONST_512, TMP22); + + vis_padd16(TMP16, TMP24, TMP16); + vis_mul8x16al(REF_S2, CONST_512, TMP24); + + vis_padd16(TMP18, TMP26, TMP18); + vis_mul8x16al(REF_S2_1, CONST_512, TMP26); + + vis_padd16(TMP12, TMP0, TMP12); + vis_mul8x16au(REF_2, CONST_256, TMP28); + + vis_padd16(TMP14, 
TMP2, TMP14); + vis_mul8x16au(REF_2_1, CONST_256, TMP30); + + vis_padd16(TMP16, TMP4, TMP16); + vis_mul8x16au(REF_6, CONST_256, REF_S4); + + vis_padd16(TMP18, TMP6, TMP18); + vis_mul8x16au(REF_6_1, CONST_256, REF_S6); + + vis_pack16(TMP12, DST_0); + vis_padd16(TMP28, TMP0, TMP12); + + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + vis_padd16(TMP30, TMP2, TMP14); + + vis_pack16(TMP16, DST_2); + vis_padd16(REF_S4, TMP4, TMP16); + + vis_pack16(TMP18, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + vis_padd16(REF_S6, TMP6, TMP18); + + vis_padd16(TMP12, TMP20, TMP12); + + vis_padd16(TMP14, TMP22, TMP14); + vis_pack16(TMP12, DST_0); + + vis_padd16(TMP16, TMP24, TMP16); + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + + vis_padd16(TMP18, TMP26, TMP18); + vis_pack16(TMP16, DST_2); + + vis_pack16(TMP18, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + int stride_8 = stride + 8; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(constants3[0], CONST_3); + vis_faligndata(TMP0, TMP2, REF_2); + + vis_ld64(constants256_512[0], CONST_256); + + height >>= 1; + do { /* 20 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_pmerge(ZERO, REF_2, TMP8); + vis_mul8x16au(REF_2_1, CONST_256, TMP10); + + vis_ld64_2(ref, stride_8, TMP2); + ref += stride; + + vis_ld64(dest[0], DST_0); + + vis_ld64_2(dest, stride, DST_2); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, stride, TMP4); + vis_mul8x16al(DST_0, CONST_512, TMP16); + vis_pmerge(ZERO, REF_0, TMP12); + + vis_ld64_2(ref, stride_8, TMP6); + ref += stride; + vis_mul8x16al(DST_1, CONST_512, TMP18); + vis_pmerge(ZERO, REF_0_1, TMP14); + + vis_padd16(TMP12, CONST_3, TMP12); + vis_mul8x16al(DST_2, CONST_512, TMP24); + + 
vis_padd16(TMP14, CONST_3, TMP14); + vis_mul8x16al(DST_3, CONST_512, TMP26); + + vis_faligndata(TMP4, TMP6, REF_2); + + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + vis_mul8x16au(REF_2, CONST_256, TMP20); + + vis_padd16(TMP8, TMP16, TMP0); + vis_mul8x16au(REF_2_1, CONST_256, TMP22); + + vis_padd16(TMP10, TMP18, TMP2); + vis_pack16(TMP0, DST_0); + + vis_pack16(TMP2, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + vis_padd16(TMP12, TMP20, TMP12); + + vis_padd16(TMP14, TMP22, TMP14); + + vis_padd16(TMP12, TMP24, TMP0); + + vis_padd16(TMP14, TMP26, TMP2); + vis_pack16(TMP0, DST_2); + + vis_pack16(TMP2, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + int stride_16 = stride + 16; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(constants2[0], CONST_2); + vis_faligndata(TMP0, TMP2, REF_S0); + + vis_ld64(constants256_512[0], CONST_256); + vis_faligndata(TMP2, TMP4, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + vis_faligndata(TMP2, TMP4, REF_S6); + } else { + vis_src1(TMP2, REF_S2); + vis_src1(TMP4, REF_S6); + } + + height >>= 1; + do { + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP12); + vis_pmerge(ZERO, REF_S0_1, TMP14); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride_8, TMP2); + vis_mul8x16au(REF_S2, CONST_256, TMP16); + vis_pmerge(ZERO, REF_S2_1, TMP18); + + vis_ld64_2(ref, stride_16, TMP4); + ref += stride; + vis_mul8x16au(REF_S4, CONST_256, TMP20); + vis_pmerge(ZERO, REF_S4_1, TMP22); + + vis_ld64_2(ref, 
stride, TMP6); + vis_mul8x16au(REF_S6, CONST_256, TMP24); + vis_pmerge(ZERO, REF_S6_1, TMP26); + + vis_ld64_2(ref, stride_8, TMP8); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, stride_16, TMP10); + ref += stride; + vis_faligndata(TMP2, TMP4, REF_4); + + vis_faligndata(TMP6, TMP8, REF_S0); + + vis_faligndata(TMP8, TMP10, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + vis_faligndata(TMP6, TMP8, REF_S2); + vis_faligndata(TMP8, TMP10, REF_S6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + vis_src1(TMP8, REF_S2); + vis_src1(TMP10, REF_S6); + } + + vis_mul8x16au(REF_0, CONST_256, TMP0); + vis_pmerge(ZERO, REF_0_1, TMP2); + + vis_mul8x16au(REF_2, CONST_256, TMP4); + vis_pmerge(ZERO, REF_2_1, TMP6); + + vis_padd16(TMP0, CONST_2, TMP8); + vis_mul8x16au(REF_4, CONST_256, TMP0); + + vis_padd16(TMP2, CONST_2, TMP10); + vis_mul8x16au(REF_4_1, CONST_256, TMP2); + + vis_padd16(TMP8, TMP4, TMP8); + vis_mul8x16au(REF_6, CONST_256, TMP4); + + vis_padd16(TMP10, TMP6, TMP10); + vis_mul8x16au(REF_6_1, CONST_256, TMP6); + + vis_padd16(TMP12, TMP8, TMP12); + + vis_padd16(TMP14, TMP10, TMP14); + + vis_padd16(TMP12, TMP16, TMP12); + + vis_padd16(TMP14, TMP18, TMP14); + vis_pack16(TMP12, DST_0); + + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + vis_padd16(TMP0, CONST_2, TMP12); + + vis_mul8x16au(REF_S0, CONST_256, TMP0); + vis_padd16(TMP2, CONST_2, TMP14); + + vis_mul8x16au(REF_S0_1, CONST_256, TMP2); + vis_padd16(TMP12, TMP4, TMP12); + + vis_mul8x16au(REF_S2, CONST_256, TMP4); + vis_padd16(TMP14, TMP6, TMP14); + + vis_mul8x16au(REF_S2_1, CONST_256, TMP6); + vis_padd16(TMP20, TMP12, TMP20); + + vis_padd16(TMP22, TMP14, TMP22); + + vis_padd16(TMP20, TMP24, TMP20); + + vis_padd16(TMP22, TMP26, TMP22); + vis_pack16(TMP20, DST_2); + + vis_pack16(TMP22, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + vis_padd16(TMP0, TMP4, TMP24); + + 
vis_mul8x16au(REF_S4, CONST_256, TMP0); + vis_padd16(TMP2, TMP6, TMP26); + + vis_mul8x16au(REF_S4_1, CONST_256, TMP2); + vis_padd16(TMP24, TMP8, TMP24); + + vis_padd16(TMP26, TMP10, TMP26); + vis_pack16(TMP24, DST_0); + + vis_pack16(TMP26, DST_1); + vis_st64(DST_0, dest[0]); + vis_pmerge(ZERO, REF_S6, TMP4); + + vis_pmerge(ZERO, REF_S6_1, TMP6); + + vis_padd16(TMP0, TMP4, TMP0); + + vis_padd16(TMP2, TMP6, TMP2); + + vis_padd16(TMP0, TMP12, TMP0); + + vis_padd16(TMP2, TMP14, TMP2); + vis_pack16(TMP0, DST_2); + + vis_pack16(TMP2, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(constants2[0], CONST_2); + + vis_ld64(constants256_512[0], CONST_256); + vis_faligndata(TMP0, TMP2, REF_S0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + } else { + vis_src1(TMP2, REF_S2); + } + + height >>= 1; + do { /* 26 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP8); + vis_pmerge(ZERO, REF_S2, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride_8, TMP2); + ref += stride; + vis_mul8x16au(REF_S0_1, CONST_256, TMP10); + vis_pmerge(ZERO, REF_S2_1, TMP14); + + vis_ld64_2(ref, stride, TMP4); + + vis_ld64_2(ref, stride_8, TMP6); + ref += stride; + vis_faligndata(TMP0, TMP2, REF_S4); + + vis_pmerge(ZERO, REF_S4, TMP18); + + vis_pmerge(ZERO, REF_S4_1, TMP20); + + vis_faligndata(TMP4, TMP6, REF_S0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S6); + vis_faligndata(TMP4, TMP6, REF_S2); + } else 
{ + vis_src1(TMP2, REF_S6); + vis_src1(TMP6, REF_S2); + } + + vis_padd16(TMP18, CONST_2, TMP18); + vis_mul8x16au(REF_S6, CONST_256, TMP22); + + vis_padd16(TMP20, CONST_2, TMP20); + vis_mul8x16au(REF_S6_1, CONST_256, TMP24); + + vis_mul8x16au(REF_S0, CONST_256, TMP26); + vis_pmerge(ZERO, REF_S0_1, TMP28); + + vis_mul8x16au(REF_S2, CONST_256, TMP30); + vis_padd16(TMP18, TMP22, TMP18); + + vis_mul8x16au(REF_S2_1, CONST_256, TMP32); + vis_padd16(TMP20, TMP24, TMP20); + + vis_padd16(TMP8, TMP18, TMP8); + + vis_padd16(TMP10, TMP20, TMP10); + + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + vis_padd16(TMP18, TMP26, TMP18); + + vis_padd16(TMP20, TMP28, TMP20); + + vis_padd16(TMP18, TMP30, TMP18); + + vis_padd16(TMP20, TMP32, TMP20); + vis_pack16(TMP18, DST_2); + + vis_pack16(TMP20, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + int stride_16 = stride + 16; + + vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(constants6[0], CONST_6); + vis_faligndata(TMP0, TMP2, REF_S0); + + vis_ld64(constants256_1024[0], CONST_256); + vis_faligndata(TMP2, TMP4, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + vis_faligndata(TMP2, TMP4, REF_S6); + } else { + vis_src1(TMP2, REF_S2); + vis_src1(TMP4, REF_S6); + } + + height >>= 1; + do { /* 55 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP12); + vis_pmerge(ZERO, REF_S0_1, TMP14); + + 
vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride_8, TMP2); + vis_mul8x16au(REF_S2, CONST_256, TMP16); + vis_pmerge(ZERO, REF_S2_1, TMP18); + + vis_ld64_2(ref, stride_16, TMP4); + ref += stride; + vis_mul8x16au(REF_S4, CONST_256, TMP20); + vis_pmerge(ZERO, REF_S4_1, TMP22); + + vis_ld64_2(ref, stride, TMP6); + vis_mul8x16au(REF_S6, CONST_256, TMP24); + vis_pmerge(ZERO, REF_S6_1, TMP26); + + vis_ld64_2(ref, stride_8, TMP8); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, stride_16, TMP10); + ref += stride; + vis_faligndata(TMP2, TMP4, REF_4); + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP6, TMP8, REF_S0); + + vis_ld64_2(dest, 8, DST_2); + vis_faligndata(TMP8, TMP10, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + vis_faligndata(TMP6, TMP8, REF_S2); + vis_faligndata(TMP8, TMP10, REF_S6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + vis_src1(TMP8, REF_S2); + vis_src1(TMP10, REF_S6); + } + + vis_mul8x16al(DST_0, CONST_1024, TMP30); + vis_pmerge(ZERO, REF_0, TMP0); + + vis_mul8x16al(DST_1, CONST_1024, TMP32); + vis_pmerge(ZERO, REF_0_1, TMP2); + + vis_mul8x16au(REF_2, CONST_256, TMP4); + vis_pmerge(ZERO, REF_2_1, TMP6); + + vis_mul8x16al(DST_2, CONST_1024, REF_0); + vis_padd16(TMP0, CONST_6, TMP0); + + vis_mul8x16al(DST_3, CONST_1024, REF_2); + vis_padd16(TMP2, CONST_6, TMP2); + + vis_padd16(TMP0, TMP4, TMP0); + vis_mul8x16au(REF_4, CONST_256, TMP4); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_4_1, CONST_256, TMP6); + + vis_padd16(TMP12, TMP0, TMP12); + vis_mul8x16au(REF_6, CONST_256, TMP8); + + vis_padd16(TMP14, TMP2, TMP14); + vis_mul8x16au(REF_6_1, CONST_256, TMP10); + + vis_padd16(TMP12, TMP16, TMP12); + vis_mul8x16au(REF_S0, CONST_256, REF_4); + + vis_padd16(TMP14, TMP18, TMP14); + vis_mul8x16au(REF_S0_1, CONST_256, REF_6); + + vis_padd16(TMP12, TMP30, TMP12); + + vis_padd16(TMP14, TMP32, TMP14); + vis_pack16(TMP12, 
DST_0); + + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + vis_padd16(TMP4, CONST_6, TMP4); + + vis_ld64_2(dest, stride, DST_0); + vis_padd16(TMP6, CONST_6, TMP6); + vis_mul8x16au(REF_S2, CONST_256, TMP12); + + vis_padd16(TMP4, TMP8, TMP4); + vis_mul8x16au(REF_S2_1, CONST_256, TMP14); + + vis_padd16(TMP6, TMP10, TMP6); + + vis_padd16(TMP20, TMP4, TMP20); + + vis_padd16(TMP22, TMP6, TMP22); + + vis_padd16(TMP20, TMP24, TMP20); + + vis_padd16(TMP22, TMP26, TMP22); + + vis_padd16(TMP20, REF_0, TMP20); + vis_mul8x16au(REF_S4, CONST_256, REF_0); + + vis_padd16(TMP22, REF_2, TMP22); + vis_pack16(TMP20, DST_2); + + vis_pack16(TMP22, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + + vis_ld64_2(dest, 8, DST_2); + vis_mul8x16al(DST_0, CONST_1024, TMP30); + vis_pmerge(ZERO, REF_S4_1, REF_2); + + vis_mul8x16al(DST_1, CONST_1024, TMP32); + vis_padd16(REF_4, TMP0, TMP8); + + vis_mul8x16au(REF_S6, CONST_256, REF_4); + vis_padd16(REF_6, TMP2, TMP10); + + vis_mul8x16au(REF_S6_1, CONST_256, REF_6); + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + + vis_padd16(TMP8, TMP30, TMP8); + + vis_padd16(TMP10, TMP32, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_st64(DST_0, dest[0]); + + vis_padd16(REF_0, TMP4, REF_0); + + vis_mul8x16al(DST_2, CONST_1024, TMP30); + vis_padd16(REF_2, TMP6, REF_2); + + vis_mul8x16al(DST_3, CONST_1024, TMP32); + vis_padd16(REF_0, REF_4, REF_0); + + vis_padd16(REF_2, REF_6, REF_2); + + vis_padd16(REF_0, TMP30, REF_0); + + /* stall */ + + vis_padd16(REF_2, TMP32, REF_2); + vis_pack16(REF_0, DST_2); + + vis_pack16(REF_2, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + + vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); + 
+ ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + vis_fzero(ZERO); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64(constants6[0], CONST_6); + + vis_ld64(constants256_1024[0], CONST_256); + vis_faligndata(TMP0, TMP2, REF_S0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + } else { + vis_src1(TMP2, REF_S2); + } + + height >>= 1; + do { /* 31 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP8); + vis_pmerge(ZERO, REF_S0_1, TMP10); + + vis_ld64_2(ref, stride_8, TMP2); + ref += stride; + vis_mul8x16au(REF_S2, CONST_256, TMP12); + vis_pmerge(ZERO, REF_S2_1, TMP14); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride, TMP4); + vis_faligndata(TMP0, TMP2, REF_S4); + + vis_ld64_2(ref, stride_8, TMP6); + ref += stride; + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP4, TMP6, REF_S0); + + vis_ld64_2(dest, stride, DST_2); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S6); + vis_faligndata(TMP4, TMP6, REF_S2); + } else { + vis_src1(TMP2, REF_S6); + vis_src1(TMP6, REF_S2); + } + + vis_mul8x16al(DST_0, CONST_1024, TMP30); + vis_pmerge(ZERO, REF_S4, TMP22); + + vis_mul8x16al(DST_1, CONST_1024, TMP32); + vis_pmerge(ZERO, REF_S4_1, TMP24); + + vis_mul8x16au(REF_S6, CONST_256, TMP26); + vis_pmerge(ZERO, REF_S6_1, TMP28); + + vis_mul8x16au(REF_S0, CONST_256, REF_S4); + vis_padd16(TMP22, CONST_6, TMP22); + + vis_mul8x16au(REF_S0_1, CONST_256, REF_S6); + vis_padd16(TMP24, CONST_6, TMP24); + + vis_mul8x16al(DST_2, CONST_1024, REF_0); + vis_padd16(TMP22, TMP26, TMP22); + + vis_mul8x16al(DST_3, CONST_1024, REF_2); + vis_padd16(TMP24, TMP28, TMP24); + + vis_mul8x16au(REF_S2, CONST_256, TMP26); + vis_padd16(TMP8, TMP22, TMP8); + + vis_mul8x16au(REF_S2_1, CONST_256, TMP28); + vis_padd16(TMP10, TMP24, TMP10); + + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + + vis_padd16(TMP8, TMP30, TMP8); + + vis_padd16(TMP10, TMP32, 
TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_padd16(REF_S4, TMP22, TMP12); + + vis_padd16(REF_S6, TMP24, TMP14); + + vis_padd16(TMP12, TMP26, TMP12); + + vis_padd16(TMP14, TMP28, TMP14); + + vis_padd16(TMP12, REF_0, TMP12); + + vis_padd16(TMP14, REF_2, TMP14); + vis_pack16(TMP12, DST_2); + + vis_pack16(TMP14, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +/* End of rounding code */ + +/* Start of no rounding code */ +/* The trick used in some of this file is the formula from the MMX + * motion comp code, which is: + * + * (x+y)>>1 == (x&y)+((x^y)>>1) + * + * This allows us to average 8 bytes at a time in a 64-bit FPU reg. + * We avoid overflows by masking before we do the shift, and we + * implement the shift by multiplying by 1/2 using mul8x16. So in + * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask + * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and + * the value 0x80808080 is in f8): + * + * fxor f0, f2, f10 + * fand f10, f4, f10 + * fmul8x16 f8, f10, f10 + * fand f10, f6, f10 + * fand f0, f2, f12 + * fpadd16 f12, f10, f10 + */ + +static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + do { /* 5 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + + vis_faligndata(TMP0, TMP2, REF_0); + vis_st64(REF_0, dest[0]); + + vis_faligndata(TMP2, TMP4, REF_2); + vis_st64_2(REF_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + do { /* 4 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + ref += stride; + + /* stall */ + + vis_faligndata(TMP0, TMP2, REF_0); + 
vis_st64(REF_0, dest[0]); + dest += stride; + } while (--height); +} + + +static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + int stride_8 = stride + 8; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(dest[0], DST_0); + + vis_ld64(dest[8], DST_2); + + vis_ld64(constants_fe[0], MASK_fe); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP2, TMP4, REF_2); + + vis_ld64(constants128[0], CONST_128); + + ref += stride; + height = (height >> 1) - 1; + + do { /* 24 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP6, MASK_fe, TMP6); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_xor(DST_2, REF_2, TMP8); + + vis_and(TMP8, MASK_fe, TMP8); + + vis_and(DST_0, REF_0, TMP10); + vis_ld64_2(dest, stride, DST_0); + vis_mul8x16(CONST_128, TMP8, TMP8); + + vis_and(DST_2, REF_2, TMP12); + vis_ld64_2(dest, stride_8, DST_2); + + vis_ld64(ref[0], TMP14); + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_padd16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_padd16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + + dest += stride; + vis_ld64_2(ref, 8, TMP16); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 16, TMP18); + vis_faligndata(TMP2, TMP4, REF_2); + ref += stride; + + vis_xor(DST_0, REF_0, TMP20); + + vis_and(TMP20, MASK_fe, TMP20); + + vis_xor(DST_2, REF_2, TMP22); + vis_mul8x16(CONST_128, TMP20, TMP20); + + vis_and(TMP22, MASK_fe, TMP22); + + vis_and(DST_0, REF_0, TMP24); + vis_mul8x16(CONST_128, TMP22, TMP22); + + vis_and(DST_2, REF_2, TMP26); + + vis_ld64_2(dest, stride, DST_0); + vis_faligndata(TMP14, TMP16, REF_0); + + vis_ld64_2(dest, stride_8, DST_2); + vis_faligndata(TMP16, TMP18, REF_2); + + vis_and(TMP20, 
MASK_7f, TMP20); + + vis_and(TMP22, MASK_7f, TMP22); + + vis_padd16(TMP24, TMP20, TMP20); + vis_st64(TMP20, dest[0]); + + vis_padd16(TMP26, TMP22, TMP22); + vis_st64_2(TMP22, dest, 8); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP6, MASK_fe, TMP6); + + vis_ld64_2(ref, 16, TMP4); + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_xor(DST_2, REF_2, TMP8); + + vis_and(TMP8, MASK_fe, TMP8); + + vis_and(DST_0, REF_0, TMP10); + vis_ld64_2(dest, stride, DST_0); + vis_mul8x16(CONST_128, TMP8, TMP8); + + vis_and(DST_2, REF_2, TMP12); + vis_ld64_2(dest, stride_8, DST_2); + + vis_ld64(ref[0], TMP14); + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_padd16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_padd16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + + dest += stride; + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_2); + + vis_xor(DST_0, REF_0, TMP20); + + vis_and(TMP20, MASK_fe, TMP20); + + vis_xor(DST_2, REF_2, TMP22); + vis_mul8x16(CONST_128, TMP20, TMP20); + + vis_and(TMP22, MASK_fe, TMP22); + + vis_and(DST_0, REF_0, TMP24); + vis_mul8x16(CONST_128, TMP22, TMP22); + + vis_and(DST_2, REF_2, TMP26); + + vis_and(TMP20, MASK_7f, TMP20); + + vis_and(TMP22, MASK_7f, TMP22); + + vis_padd16(TMP24, TMP20, TMP20); + vis_st64(TMP20, dest[0]); + + vis_padd16(TMP26, TMP22, TMP22); + vis_st64_2(TMP22, dest, 8); +} + +static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_ld64(dest[0], DST_0); + + vis_ld64(constants_fe[0], MASK_fe); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants128[0], CONST_128); + + ref += stride; + height = (height >> 1) - 1; + + do { /* 12 cycles */ + vis_ld64(ref[0], TMP0); + 
vis_xor(DST_0, REF_0, TMP4); + + vis_ld64(ref[8], TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_and(DST_0, REF_0, TMP6); + vis_ld64_2(dest, stride, DST_0); + ref += stride; + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_ld64(ref[0], TMP12); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(ref[8], TMP2); + vis_xor(DST_0, REF_0, TMP0); + ref += stride; + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_padd16(TMP6, TMP4, TMP4); + vis_st64(TMP4, dest[0]); + dest += stride; + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_and(DST_0, REF_0, TMP6); + vis_ld64_2(dest, stride, DST_0); + + vis_faligndata(TMP12, TMP2, REF_0); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_padd16(TMP6, TMP0, TMP4); + vis_st64(TMP4, dest[0]); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(DST_0, REF_0, TMP4); + + vis_ld64(ref[8], TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_and(DST_0, REF_0, TMP6); + vis_ld64_2(dest, stride, DST_0); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_xor(DST_0, REF_0, TMP0); + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_padd16(TMP6, TMP4, TMP4); + vis_st64(TMP4, dest[0]); + dest += stride; + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_and(DST_0, REF_0, TMP6); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_padd16(TMP6, TMP0, TMP4); + vis_st64(TMP4, dest[0]); +} + +static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64_2(ref, 16, TMP4); + + vis_ld64(constants_fe[0], MASK_fe); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants128[0], CONST_128); + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void 
*)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + ref += stride; + height = (height >> 1) - 1; + + do { /* 34 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP8); + + vis_ld64_2(ref, 16, TMP4); + vis_and(TMP6, MASK_fe, TMP6); + ref += stride; + + vis_ld64(ref[0], TMP14); + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_ld64_2(ref, 8, TMP16); + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_and(REF_0, REF_2, TMP10); + + vis_ld64_2(ref, 16, TMP18); + ref += stride; + vis_and(REF_4, REF_6, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_padd16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_padd16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + dest += stride; + + vis_xor(REF_0, REF_2, TMP6); + + vis_xor(REF_4, REF_6, TMP8); + + vis_and(TMP6, MASK_fe, TMP6); + + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_and(REF_0, REF_2, TMP10); + + vis_and(REF_4, REF_6, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP14, TMP16, REF_0); + + vis_faligndata(TMP16, TMP18, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP14, TMP16, REF_2); + vis_faligndata(TMP16, TMP18, REF_6); + } else { + vis_src1(TMP16, REF_2); + vis_src1(TMP18, REF_6); + } + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_padd16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_padd16(TMP12, TMP8, TMP8); 
+ vis_st64_2(TMP8, dest, 8); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP6); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP8); + + vis_ld64_2(ref, 16, TMP4); + vis_and(TMP6, MASK_fe, TMP6); + + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_and(REF_0, REF_2, TMP10); + + vis_and(REF_4, REF_6, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_padd16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_padd16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); + dest += stride; + + vis_xor(REF_0, REF_2, TMP6); + + vis_xor(REF_4, REF_6, TMP8); + + vis_and(TMP6, MASK_fe, TMP6); + + vis_mul8x16(CONST_128, TMP6, TMP6); + vis_and(TMP8, MASK_fe, TMP8); + + vis_mul8x16(CONST_128, TMP8, TMP8); + vis_and(REF_0, REF_2, TMP10); + + vis_and(REF_4, REF_6, TMP12); + + vis_and(TMP6, MASK_7f, TMP6); + + vis_and(TMP8, MASK_7f, TMP8); + + vis_padd16(TMP10, TMP6, TMP6); + vis_st64(TMP6, dest[0]); + + vis_padd16(TMP12, TMP8, TMP8); + vis_st64_2(TMP8, dest, 8); +} + +static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_ld64(constants_fe[0], MASK_fe); + + vis_ld64(constants_7f[0], MASK_7f); + + vis_ld64(constants128[0], CONST_128); + vis_faligndata(TMP0, TMP2, REF_0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, 
TMP2, REF_2); + } else { + vis_src1(TMP2, REF_2); + } + + ref += stride; + height = (height >> 1) - 1; + + do { /* 20 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP4, MASK_fe, TMP4); + ref += stride; + + vis_ld64(ref[0], TMP8); + vis_and(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, 8, TMP10); + ref += stride; + vis_faligndata(TMP0, TMP2, REF_0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + } else { + vis_src1(TMP2, REF_2); + } + + vis_and(TMP4, MASK_7f, TMP4); + + vis_padd16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_and(REF_0, REF_2, TMP14); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_alignaddr_g0((void *)off); + vis_faligndata(TMP8, TMP10, REF_0); + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP8, TMP10, REF_2); + } else { + vis_src1(TMP10, REF_2); + } + + vis_and(TMP12, MASK_7f, TMP12); + + vis_padd16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_and(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_alignaddr_g0((void *)off); + + vis_faligndata(TMP0, TMP2, REF_0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + } else { + vis_src1(TMP2, REF_2); + } + + vis_and(TMP4, MASK_7f, TMP4); + + vis_padd16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_and(REF_0, REF_2, TMP14); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_padd16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); + 
dest += stride; +} + +static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + vis_ld64(constants3[0], CONST_3); + vis_fzero(ZERO); + vis_ld64(constants256_512[0], CONST_256); + + ref = vis_alignaddr(ref); + do { /* 26 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64(ref[8], TMP2); + + vis_alignaddr_g0((void *)off); + + vis_ld64(ref[16], TMP4); + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(dest[8], DST_2); + vis_faligndata(TMP2, TMP4, REF_4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + } + + vis_mul8x16au(REF_0, CONST_256, TMP0); + + vis_pmerge(ZERO, REF_2, TMP4); + vis_mul8x16au(REF_0_1, CONST_256, TMP2); + + vis_pmerge(ZERO, REF_2_1, TMP6); + + vis_padd16(TMP0, TMP4, TMP0); + + vis_mul8x16al(DST_0, CONST_512, TMP4); + vis_padd16(TMP2, TMP6, TMP2); + + vis_mul8x16al(DST_1, CONST_512, TMP6); + + vis_mul8x16au(REF_6, CONST_256, TMP12); + + vis_padd16(TMP0, TMP4, TMP0); + vis_mul8x16au(REF_6_1, CONST_256, TMP14); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_4, CONST_256, TMP16); + + vis_padd16(TMP0, CONST_3, TMP8); + vis_mul8x16au(REF_4_1, CONST_256, TMP18); + + vis_padd16(TMP2, CONST_3, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_padd16(TMP16, TMP12, TMP0); + + vis_st64(DST_0, dest[0]); + vis_mul8x16al(DST_2, CONST_512, TMP4); + vis_padd16(TMP18, TMP14, TMP2); + + vis_mul8x16al(DST_3, CONST_512, TMP6); + vis_padd16(TMP0, CONST_3, TMP0); + + vis_padd16(TMP2, CONST_3, TMP2); + + vis_padd16(TMP0, TMP4, TMP0); + + vis_padd16(TMP2, TMP6, TMP2); + vis_pack16(TMP0, DST_2); + + vis_pack16(TMP2, DST_3); + vis_st64(DST_2, dest[8]); 
+ + ref += stride; + dest += stride; + } while (--height); +} + +static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_times_2 = stride << 1; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + vis_ld64(constants3[0], CONST_3); + vis_fzero(ZERO); + vis_ld64(constants256_512[0], CONST_256); + + ref = vis_alignaddr(ref); + height >>= 2; + do { /* 47 cycles */ + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + + vis_alignaddr_g0((void *)off); + + vis_ld64(ref[0], TMP4); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 8, TMP6); + ref += stride; + + vis_ld64(ref[0], TMP8); + + vis_ld64_2(ref, 8, TMP10); + ref += stride; + vis_faligndata(TMP4, TMP6, REF_4); + + vis_ld64(ref[0], TMP12); + + vis_ld64_2(ref, 8, TMP14); + ref += stride; + vis_faligndata(TMP8, TMP10, REF_S0); + + vis_faligndata(TMP12, TMP14, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP0, TMP2, REF_2); + + vis_ld64_2(dest, stride, DST_2); + vis_faligndata(TMP4, TMP6, REF_6); + + vis_faligndata(TMP8, TMP10, REF_S2); + + vis_faligndata(TMP12, TMP14, REF_S6); + } else { + vis_ld64(dest[0], DST_0); + vis_src1(TMP2, REF_2); + + vis_ld64_2(dest, stride, DST_2); + vis_src1(TMP6, REF_6); + + vis_src1(TMP10, REF_S2); + + vis_src1(TMP14, REF_S6); + } + + vis_pmerge(ZERO, REF_0, TMP0); + vis_mul8x16au(REF_0_1, CONST_256, TMP2); + + vis_pmerge(ZERO, REF_2, TMP4); + vis_mul8x16au(REF_2_1, CONST_256, TMP6); + + vis_padd16(TMP0, CONST_3, TMP0); + vis_mul8x16al(DST_0, CONST_512, TMP16); + + vis_padd16(TMP2, CONST_3, TMP2); + vis_mul8x16al(DST_1, CONST_512, TMP18); + + vis_padd16(TMP0, TMP4, TMP0); + vis_mul8x16au(REF_4, CONST_256, TMP8); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_4_1, CONST_256, TMP10); + + 
vis_padd16(TMP0, TMP16, TMP0); + vis_mul8x16au(REF_6, CONST_256, TMP12); + + vis_padd16(TMP2, TMP18, TMP2); + vis_mul8x16au(REF_6_1, CONST_256, TMP14); + + vis_padd16(TMP8, CONST_3, TMP8); + vis_mul8x16al(DST_2, CONST_512, TMP16); + + vis_padd16(TMP8, TMP12, TMP8); + vis_mul8x16al(DST_3, CONST_512, TMP18); + + vis_padd16(TMP10, TMP14, TMP10); + vis_pack16(TMP0, DST_0); + + vis_pack16(TMP2, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + vis_padd16(TMP10, CONST_3, TMP10); + + vis_ld64_2(dest, stride, DST_0); + vis_padd16(TMP8, TMP16, TMP8); + + vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); + vis_padd16(TMP10, TMP18, TMP10); + vis_pack16(TMP8, DST_2); + + vis_pack16(TMP10, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + + vis_mul8x16au(REF_S0_1, CONST_256, TMP2); + vis_pmerge(ZERO, REF_S0, TMP0); + + vis_pmerge(ZERO, REF_S2, TMP24); + vis_mul8x16au(REF_S2_1, CONST_256, TMP6); + + vis_padd16(TMP0, CONST_3, TMP0); + vis_mul8x16au(REF_S4, CONST_256, TMP8); + + vis_padd16(TMP2, CONST_3, TMP2); + vis_mul8x16au(REF_S4_1, CONST_256, TMP10); + + vis_padd16(TMP0, TMP24, TMP0); + vis_mul8x16au(REF_S6, CONST_256, TMP12); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_S6_1, CONST_256, TMP14); + + vis_padd16(TMP8, CONST_3, TMP8); + vis_mul8x16al(DST_0, CONST_512, TMP16); + + vis_padd16(TMP10, CONST_3, TMP10); + vis_mul8x16al(DST_1, CONST_512, TMP18); + + vis_padd16(TMP8, TMP12, TMP8); + vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); + + vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); + vis_padd16(TMP0, TMP16, TMP0); + + vis_padd16(TMP2, TMP18, TMP2); + vis_pack16(TMP0, DST_0); + + vis_padd16(TMP10, TMP14, TMP10); + vis_pack16(TMP2, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_padd16(TMP8, TMP20, TMP8); + + vis_padd16(TMP10, TMP22, TMP10); + vis_pack16(TMP8, DST_2); + + vis_pack16(TMP10, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * 
_ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + + vis_ld64(ref[0], TMP6); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 8, TMP8); + vis_faligndata(TMP2, TMP4, REF_4); + + vis_ld64_2(ref, 16, TMP10); + ref += stride; + + vis_ld64(constants_fe[0], MASK_fe); + vis_faligndata(TMP6, TMP8, REF_2); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP8, TMP10, REF_6); + + vis_ld64(constants128[0], CONST_128); + height = (height >> 1) - 1; + do { /* 24 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP12); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP16); + + vis_ld64_2(ref, 16, TMP4); + ref += stride; + vis_and(REF_0, REF_2, TMP14); + + vis_ld64(ref[0], TMP6); + vis_and(REF_4, REF_6, TMP18); + + vis_ld64_2(ref, 8, TMP8); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, 16, TMP10); + ref += stride; + vis_faligndata(TMP2, TMP4, REF_4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_and(TMP16, MASK_fe, TMP16); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_mul8x16(CONST_128, TMP16, TMP16); + vis_xor(REF_0, REF_2, TMP0); + + vis_xor(REF_4, REF_6, TMP2); + + vis_and(REF_0, REF_2, TMP20); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_and(TMP16, MASK_7f, TMP16); + + vis_padd16(TMP14, TMP12, TMP12); + vis_st64(TMP12, dest[0]); + + vis_padd16(TMP18, TMP16, TMP16); + vis_st64_2(TMP16, dest, 8); + dest += stride; + + vis_and(REF_4, REF_6, TMP18); + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP2, MASK_fe, TMP2); + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_faligndata(TMP6, TMP8, REF_2); + vis_mul8x16(CONST_128, TMP2, TMP2); + + vis_faligndata(TMP8, TMP10, REF_6); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_and(TMP2, MASK_7f, TMP2); + + vis_padd16(TMP20, TMP0, TMP0); + vis_st64(TMP0, dest[0]); + + vis_padd16(TMP18, TMP2, TMP2); + vis_st64_2(TMP2, dest, 8); + dest += stride; + 
} while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP12); + + vis_ld64_2(ref, 8, TMP2); + vis_xor(REF_4, REF_6, TMP16); + + vis_ld64_2(ref, 16, TMP4); + vis_and(REF_0, REF_2, TMP14); + + vis_and(REF_4, REF_6, TMP18); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_faligndata(TMP2, TMP4, REF_4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_and(TMP16, MASK_fe, TMP16); + vis_mul8x16(CONST_128, TMP12, TMP12); + + vis_mul8x16(CONST_128, TMP16, TMP16); + vis_xor(REF_0, REF_2, TMP0); + + vis_xor(REF_4, REF_6, TMP2); + + vis_and(REF_0, REF_2, TMP20); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_and(TMP16, MASK_7f, TMP16); + + vis_padd16(TMP14, TMP12, TMP12); + vis_st64(TMP12, dest[0]); + + vis_padd16(TMP18, TMP16, TMP16); + vis_st64_2(TMP16, dest, 8); + dest += stride; + + vis_and(REF_4, REF_6, TMP18); + + vis_and(TMP0, MASK_fe, TMP0); + + vis_and(TMP2, MASK_fe, TMP2); + vis_mul8x16(CONST_128, TMP0, TMP0); + + vis_mul8x16(CONST_128, TMP2, TMP2); + + vis_and(TMP0, MASK_7f, TMP0); + + vis_and(TMP2, MASK_7f, TMP2); + + vis_padd16(TMP20, TMP0, TMP0); + vis_st64(TMP0, dest[0]); + + vis_padd16(TMP18, TMP2, TMP2); + vis_st64_2(TMP2, dest, 8); +} + +static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + + ref = vis_alignaddr(ref); + vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + + vis_ld64(ref[0], TMP4); + + vis_ld64_2(ref, 8, TMP6); + ref += stride; + + vis_ld64(constants_fe[0], MASK_fe); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64(constants_7f[0], MASK_7f); + vis_faligndata(TMP4, TMP6, REF_2); + + vis_ld64(constants128[0], CONST_128); + height = (height >> 1) - 1; + do { /* 12 cycles */ + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + vis_and(TMP4, MASK_fe, TMP4); + + vis_and(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_faligndata(TMP0, TMP2, REF_0); + 
vis_ld64(ref[0], TMP0); + + vis_ld64_2(ref, 8, TMP2); + ref += stride; + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_mul8x16(CONST_128, TMP12, TMP12); + vis_and(REF_0, REF_2, TMP14); + + vis_padd16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_faligndata(TMP0, TMP2, REF_2); + + vis_and(TMP12, MASK_7f, TMP12); + + vis_padd16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + } while (--height); + + vis_ld64(ref[0], TMP0); + vis_xor(REF_0, REF_2, TMP4); + + vis_ld64_2(ref, 8, TMP2); + vis_and(TMP4, MASK_fe, TMP4); + + vis_and(REF_0, REF_2, TMP6); + vis_mul8x16(CONST_128, TMP4, TMP4); + + vis_faligndata(TMP0, TMP2, REF_0); + + vis_xor(REF_0, REF_2, TMP12); + + vis_and(TMP4, MASK_7f, TMP4); + + vis_and(TMP12, MASK_fe, TMP12); + + vis_mul8x16(CONST_128, TMP12, TMP12); + vis_and(REF_0, REF_2, TMP14); + + vis_padd16(TMP6, TMP4, DST_0); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_and(TMP12, MASK_7f, TMP12); + + vis_padd16(TMP14, TMP12, DST_0); + vis_st64(DST_0, dest[0]); +} + +static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + int stride_8 = stride + 8; + int stride_16 = stride + 16; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(constants3[0], CONST_3); + vis_faligndata(TMP0, TMP2, REF_2); + + vis_ld64(constants256_512[0], CONST_256); + vis_faligndata(TMP2, TMP4, REF_6); + height >>= 1; + + do { /* 31 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_pmerge(ZERO, REF_2, TMP12); + vis_mul8x16au(REF_2_1, CONST_256, TMP14); + + vis_ld64_2(ref, stride_8, TMP2); + vis_pmerge(ZERO, REF_6, TMP16); + vis_mul8x16au(REF_6_1, CONST_256, TMP18); + + vis_ld64_2(ref, stride_16, TMP4); + ref += stride; + + vis_ld64(dest[0], 
DST_0); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(dest, 8, DST_2); + vis_faligndata(TMP2, TMP4, REF_4); + + vis_ld64_2(ref, stride, TMP6); + vis_pmerge(ZERO, REF_0, TMP0); + vis_mul8x16au(REF_0_1, CONST_256, TMP2); + + vis_ld64_2(ref, stride_8, TMP8); + vis_pmerge(ZERO, REF_4, TMP4); + + vis_ld64_2(ref, stride_16, TMP10); + ref += stride; + + vis_ld64_2(dest, stride, REF_S0/*DST_4*/); + vis_faligndata(TMP6, TMP8, REF_2); + vis_mul8x16au(REF_4_1, CONST_256, TMP6); + + vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/); + vis_faligndata(TMP8, TMP10, REF_6); + vis_mul8x16al(DST_0, CONST_512, TMP20); + + vis_padd16(TMP0, CONST_3, TMP0); + vis_mul8x16al(DST_1, CONST_512, TMP22); + + vis_padd16(TMP2, CONST_3, TMP2); + vis_mul8x16al(DST_2, CONST_512, TMP24); + + vis_padd16(TMP4, CONST_3, TMP4); + vis_mul8x16al(DST_3, CONST_512, TMP26); + + vis_padd16(TMP6, CONST_3, TMP6); + + vis_padd16(TMP12, TMP20, TMP12); + vis_mul8x16al(REF_S0, CONST_512, TMP20); + + vis_padd16(TMP14, TMP22, TMP14); + vis_mul8x16al(REF_S0_1, CONST_512, TMP22); + + vis_padd16(TMP16, TMP24, TMP16); + vis_mul8x16al(REF_S2, CONST_512, TMP24); + + vis_padd16(TMP18, TMP26, TMP18); + vis_mul8x16al(REF_S2_1, CONST_512, TMP26); + + vis_padd16(TMP12, TMP0, TMP12); + vis_mul8x16au(REF_2, CONST_256, TMP28); + + vis_padd16(TMP14, TMP2, TMP14); + vis_mul8x16au(REF_2_1, CONST_256, TMP30); + + vis_padd16(TMP16, TMP4, TMP16); + vis_mul8x16au(REF_6, CONST_256, REF_S4); + + vis_padd16(TMP18, TMP6, TMP18); + vis_mul8x16au(REF_6_1, CONST_256, REF_S6); + + vis_pack16(TMP12, DST_0); + vis_padd16(TMP28, TMP0, TMP12); + + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + vis_padd16(TMP30, TMP2, TMP14); + + vis_pack16(TMP16, DST_2); + vis_padd16(REF_S4, TMP4, TMP16); + + vis_pack16(TMP18, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + vis_padd16(REF_S6, TMP6, TMP18); + + vis_padd16(TMP12, TMP20, TMP12); + + vis_padd16(TMP14, TMP22, TMP14); + vis_pack16(TMP12, DST_0); + + vis_padd16(TMP16, TMP24, TMP16); + 
vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + + vis_padd16(TMP18, TMP26, TMP18); + vis_pack16(TMP16, DST_2); + + vis_pack16(TMP18, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_avg_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + int stride_8 = stride + 8; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(constants3[0], CONST_3); + vis_faligndata(TMP0, TMP2, REF_2); + + vis_ld64(constants256_512[0], CONST_256); + + height >>= 1; + do { /* 20 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_pmerge(ZERO, REF_2, TMP8); + vis_mul8x16au(REF_2_1, CONST_256, TMP10); + + vis_ld64_2(ref, stride_8, TMP2); + ref += stride; + + vis_ld64(dest[0], DST_0); + + vis_ld64_2(dest, stride, DST_2); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, stride, TMP4); + vis_mul8x16al(DST_0, CONST_512, TMP16); + vis_pmerge(ZERO, REF_0, TMP12); + + vis_ld64_2(ref, stride_8, TMP6); + ref += stride; + vis_mul8x16al(DST_1, CONST_512, TMP18); + vis_pmerge(ZERO, REF_0_1, TMP14); + + vis_padd16(TMP12, CONST_3, TMP12); + vis_mul8x16al(DST_2, CONST_512, TMP24); + + vis_padd16(TMP14, CONST_3, TMP14); + vis_mul8x16al(DST_3, CONST_512, TMP26); + + vis_faligndata(TMP4, TMP6, REF_2); + + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + vis_mul8x16au(REF_2, CONST_256, TMP20); + + vis_padd16(TMP8, TMP16, TMP0); + vis_mul8x16au(REF_2_1, CONST_256, TMP22); + + vis_padd16(TMP10, TMP18, TMP2); + vis_pack16(TMP0, DST_0); + + vis_pack16(TMP2, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + vis_padd16(TMP12, TMP20, TMP12); + + vis_padd16(TMP14, TMP22, TMP14); + + vis_padd16(TMP12, TMP24, TMP0); + + vis_padd16(TMP14, TMP26, TMP2); + vis_pack16(TMP0, DST_2); + + vis_pack16(TMP2, DST_3); + vis_st64(DST_2, dest[0]); + dest += 
stride; + } while (--height); +} + +static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + int stride_16 = stride + 16; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(constants1[0], CONST_1); + vis_faligndata(TMP0, TMP2, REF_S0); + + vis_ld64(constants256_512[0], CONST_256); + vis_faligndata(TMP2, TMP4, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + vis_faligndata(TMP2, TMP4, REF_S6); + } else { + vis_src1(TMP2, REF_S2); + vis_src1(TMP4, REF_S6); + } + + height >>= 1; + do { + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP12); + vis_pmerge(ZERO, REF_S0_1, TMP14); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride_8, TMP2); + vis_mul8x16au(REF_S2, CONST_256, TMP16); + vis_pmerge(ZERO, REF_S2_1, TMP18); + + vis_ld64_2(ref, stride_16, TMP4); + ref += stride; + vis_mul8x16au(REF_S4, CONST_256, TMP20); + vis_pmerge(ZERO, REF_S4_1, TMP22); + + vis_ld64_2(ref, stride, TMP6); + vis_mul8x16au(REF_S6, CONST_256, TMP24); + vis_pmerge(ZERO, REF_S6_1, TMP26); + + vis_ld64_2(ref, stride_8, TMP8); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, stride_16, TMP10); + ref += stride; + vis_faligndata(TMP2, TMP4, REF_4); + + vis_faligndata(TMP6, TMP8, REF_S0); + + vis_faligndata(TMP8, TMP10, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + vis_faligndata(TMP6, TMP8, REF_S2); + vis_faligndata(TMP8, TMP10, REF_S6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + vis_src1(TMP8, REF_S2); + vis_src1(TMP10, REF_S6); + } + 
+ vis_mul8x16au(REF_0, CONST_256, TMP0); + vis_pmerge(ZERO, REF_0_1, TMP2); + + vis_mul8x16au(REF_2, CONST_256, TMP4); + vis_pmerge(ZERO, REF_2_1, TMP6); + + vis_padd16(TMP0, CONST_2, TMP8); + vis_mul8x16au(REF_4, CONST_256, TMP0); + + vis_padd16(TMP2, CONST_1, TMP10); + vis_mul8x16au(REF_4_1, CONST_256, TMP2); + + vis_padd16(TMP8, TMP4, TMP8); + vis_mul8x16au(REF_6, CONST_256, TMP4); + + vis_padd16(TMP10, TMP6, TMP10); + vis_mul8x16au(REF_6_1, CONST_256, TMP6); + + vis_padd16(TMP12, TMP8, TMP12); + + vis_padd16(TMP14, TMP10, TMP14); + + vis_padd16(TMP12, TMP16, TMP12); + + vis_padd16(TMP14, TMP18, TMP14); + vis_pack16(TMP12, DST_0); + + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + vis_padd16(TMP0, CONST_1, TMP12); + + vis_mul8x16au(REF_S0, CONST_256, TMP0); + vis_padd16(TMP2, CONST_1, TMP14); + + vis_mul8x16au(REF_S0_1, CONST_256, TMP2); + vis_padd16(TMP12, TMP4, TMP12); + + vis_mul8x16au(REF_S2, CONST_256, TMP4); + vis_padd16(TMP14, TMP6, TMP14); + + vis_mul8x16au(REF_S2_1, CONST_256, TMP6); + vis_padd16(TMP20, TMP12, TMP20); + + vis_padd16(TMP22, TMP14, TMP22); + + vis_padd16(TMP20, TMP24, TMP20); + + vis_padd16(TMP22, TMP26, TMP22); + vis_pack16(TMP20, DST_2); + + vis_pack16(TMP22, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + vis_padd16(TMP0, TMP4, TMP24); + + vis_mul8x16au(REF_S4, CONST_256, TMP0); + vis_padd16(TMP2, TMP6, TMP26); + + vis_mul8x16au(REF_S4_1, CONST_256, TMP2); + vis_padd16(TMP24, TMP8, TMP24); + + vis_padd16(TMP26, TMP10, TMP26); + vis_pack16(TMP24, DST_0); + + vis_pack16(TMP26, DST_1); + vis_st64(DST_0, dest[0]); + vis_pmerge(ZERO, REF_S6, TMP4); + + vis_pmerge(ZERO, REF_S6_1, TMP6); + + vis_padd16(TMP0, TMP4, TMP0); + + vis_padd16(TMP2, TMP6, TMP2); + + vis_padd16(TMP0, TMP12, TMP0); + + vis_padd16(TMP2, TMP14, TMP2); + vis_pack16(TMP0, DST_2); + + vis_pack16(TMP2, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * 
_ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + + vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(constants1[0], CONST_1); + + vis_ld64(constants256_512[0], CONST_256); + vis_faligndata(TMP0, TMP2, REF_S0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + } else { + vis_src1(TMP2, REF_S2); + } + + height >>= 1; + do { /* 26 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP8); + vis_pmerge(ZERO, REF_S2, TMP12); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride_8, TMP2); + ref += stride; + vis_mul8x16au(REF_S0_1, CONST_256, TMP10); + vis_pmerge(ZERO, REF_S2_1, TMP14); + + vis_ld64_2(ref, stride, TMP4); + + vis_ld64_2(ref, stride_8, TMP6); + ref += stride; + vis_faligndata(TMP0, TMP2, REF_S4); + + vis_pmerge(ZERO, REF_S4, TMP18); + + vis_pmerge(ZERO, REF_S4_1, TMP20); + + vis_faligndata(TMP4, TMP6, REF_S0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S6); + vis_faligndata(TMP4, TMP6, REF_S2); + } else { + vis_src1(TMP2, REF_S6); + vis_src1(TMP6, REF_S2); + } + + vis_padd16(TMP18, CONST_1, TMP18); + vis_mul8x16au(REF_S6, CONST_256, TMP22); + + vis_padd16(TMP20, CONST_1, TMP20); + vis_mul8x16au(REF_S6_1, CONST_256, TMP24); + + vis_mul8x16au(REF_S0, CONST_256, TMP26); + vis_pmerge(ZERO, REF_S0_1, TMP28); + + vis_mul8x16au(REF_S2, CONST_256, TMP30); + vis_padd16(TMP18, TMP22, TMP18); + + vis_mul8x16au(REF_S2_1, CONST_256, TMP32); + vis_padd16(TMP20, TMP24, TMP20); + + vis_padd16(TMP8, TMP18, TMP8); + + vis_padd16(TMP10, TMP20, TMP10); + + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + 
vis_st64(DST_0, dest[0]); + dest += stride; + vis_padd16(TMP18, TMP26, TMP18); + + vis_padd16(TMP20, TMP28, TMP20); + + vis_padd16(TMP18, TMP30, TMP18); + + vis_padd16(TMP20, TMP32, TMP20); + vis_pack16(TMP18, DST_2); + + vis_pack16(TMP20, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + int stride_16 = stride + 16; + + vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[ 0], TMP0); + vis_fzero(ZERO); + + vis_ld64(ref[ 8], TMP2); + + vis_ld64(ref[16], TMP4); + + vis_ld64(constants6[0], CONST_6); + vis_faligndata(TMP0, TMP2, REF_S0); + + vis_ld64(constants256_1024[0], CONST_256); + vis_faligndata(TMP2, TMP4, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + vis_faligndata(TMP2, TMP4, REF_S6); + } else { + vis_src1(TMP2, REF_S2); + vis_src1(TMP4, REF_S6); + } + + height >>= 1; + do { /* 55 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP12); + vis_pmerge(ZERO, REF_S0_1, TMP14); + + vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride_8, TMP2); + vis_mul8x16au(REF_S2, CONST_256, TMP16); + vis_pmerge(ZERO, REF_S2_1, TMP18); + + vis_ld64_2(ref, stride_16, TMP4); + ref += stride; + vis_mul8x16au(REF_S4, CONST_256, TMP20); + vis_pmerge(ZERO, REF_S4_1, TMP22); + + vis_ld64_2(ref, stride, TMP6); + vis_mul8x16au(REF_S6, CONST_256, TMP24); + vis_pmerge(ZERO, REF_S6_1, TMP26); + + vis_ld64_2(ref, stride_8, TMP8); + vis_faligndata(TMP0, TMP2, REF_0); + + vis_ld64_2(ref, stride_16, TMP10); + ref += stride; + vis_faligndata(TMP2, TMP4, REF_4); + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP6, TMP8, REF_S0); + + vis_ld64_2(dest, 8, DST_2); + 
vis_faligndata(TMP8, TMP10, REF_S4); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_2); + vis_faligndata(TMP2, TMP4, REF_6); + vis_faligndata(TMP6, TMP8, REF_S2); + vis_faligndata(TMP8, TMP10, REF_S6); + } else { + vis_src1(TMP2, REF_2); + vis_src1(TMP4, REF_6); + vis_src1(TMP8, REF_S2); + vis_src1(TMP10, REF_S6); + } + + vis_mul8x16al(DST_0, CONST_1024, TMP30); + vis_pmerge(ZERO, REF_0, TMP0); + + vis_mul8x16al(DST_1, CONST_1024, TMP32); + vis_pmerge(ZERO, REF_0_1, TMP2); + + vis_mul8x16au(REF_2, CONST_256, TMP4); + vis_pmerge(ZERO, REF_2_1, TMP6); + + vis_mul8x16al(DST_2, CONST_1024, REF_0); + vis_padd16(TMP0, CONST_6, TMP0); + + vis_mul8x16al(DST_3, CONST_1024, REF_2); + vis_padd16(TMP2, CONST_6, TMP2); + + vis_padd16(TMP0, TMP4, TMP0); + vis_mul8x16au(REF_4, CONST_256, TMP4); + + vis_padd16(TMP2, TMP6, TMP2); + vis_mul8x16au(REF_4_1, CONST_256, TMP6); + + vis_padd16(TMP12, TMP0, TMP12); + vis_mul8x16au(REF_6, CONST_256, TMP8); + + vis_padd16(TMP14, TMP2, TMP14); + vis_mul8x16au(REF_6_1, CONST_256, TMP10); + + vis_padd16(TMP12, TMP16, TMP12); + vis_mul8x16au(REF_S0, CONST_256, REF_4); + + vis_padd16(TMP14, TMP18, TMP14); + vis_mul8x16au(REF_S0_1, CONST_256, REF_6); + + vis_padd16(TMP12, TMP30, TMP12); + + vis_padd16(TMP14, TMP32, TMP14); + vis_pack16(TMP12, DST_0); + + vis_pack16(TMP14, DST_1); + vis_st64(DST_0, dest[0]); + vis_padd16(TMP4, CONST_6, TMP4); + + vis_ld64_2(dest, stride, DST_0); + vis_padd16(TMP6, CONST_6, TMP6); + vis_mul8x16au(REF_S2, CONST_256, TMP12); + + vis_padd16(TMP4, TMP8, TMP4); + vis_mul8x16au(REF_S2_1, CONST_256, TMP14); + + vis_padd16(TMP6, TMP10, TMP6); + + vis_padd16(TMP20, TMP4, TMP20); + + vis_padd16(TMP22, TMP6, TMP22); + + vis_padd16(TMP20, TMP24, TMP20); + + vis_padd16(TMP22, TMP26, TMP22); + + vis_padd16(TMP20, REF_0, TMP20); + vis_mul8x16au(REF_S4, CONST_256, REF_0); + + vis_padd16(TMP22, REF_2, TMP22); + vis_pack16(TMP20, DST_2); + + vis_pack16(TMP22, DST_3); + vis_st64_2(DST_2, 
dest, 8); + dest += stride; + + vis_ld64_2(dest, 8, DST_2); + vis_mul8x16al(DST_0, CONST_1024, TMP30); + vis_pmerge(ZERO, REF_S4_1, REF_2); + + vis_mul8x16al(DST_1, CONST_1024, TMP32); + vis_padd16(REF_4, TMP0, TMP8); + + vis_mul8x16au(REF_S6, CONST_256, REF_4); + vis_padd16(REF_6, TMP2, TMP10); + + vis_mul8x16au(REF_S6_1, CONST_256, REF_6); + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + + vis_padd16(TMP8, TMP30, TMP8); + + vis_padd16(TMP10, TMP32, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_st64(DST_0, dest[0]); + + vis_padd16(REF_0, TMP4, REF_0); + + vis_mul8x16al(DST_2, CONST_1024, TMP30); + vis_padd16(REF_2, TMP6, REF_2); + + vis_mul8x16al(DST_3, CONST_1024, TMP32); + vis_padd16(REF_0, REF_4, REF_0); + + vis_padd16(REF_2, REF_6, REF_2); + + vis_padd16(REF_0, TMP30, REF_0); + + /* stall */ + + vis_padd16(REF_2, TMP32, REF_2); + vis_pack16(REF_0, DST_2); + + vis_pack16(REF_2, DST_3); + vis_st64_2(DST_2, dest, 8); + dest += stride; + } while (--height); +} + +static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref, + const int stride, int height) +{ + uint8_t *ref = (uint8_t *) _ref; + unsigned long off = (unsigned long) ref & 0x7; + unsigned long off_plus_1 = off + 1; + int stride_8 = stride + 8; + + vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT); + + ref = vis_alignaddr(ref); + + vis_ld64(ref[0], TMP0); + vis_fzero(ZERO); + + vis_ld64_2(ref, 8, TMP2); + + vis_ld64(constants6[0], CONST_6); + + vis_ld64(constants256_1024[0], CONST_256); + vis_faligndata(TMP0, TMP2, REF_S0); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S2); + } else { + vis_src1(TMP2, REF_S2); + } + + height >>= 1; + do { /* 31 cycles */ + vis_ld64_2(ref, stride, TMP0); + vis_mul8x16au(REF_S0, CONST_256, TMP8); + vis_pmerge(ZERO, REF_S0_1, TMP10); + + vis_ld64_2(ref, stride_8, TMP2); + ref += stride; + vis_mul8x16au(REF_S2, CONST_256, TMP12); + vis_pmerge(ZERO, REF_S2_1, TMP14); + + 
vis_alignaddr_g0((void *)off); + + vis_ld64_2(ref, stride, TMP4); + vis_faligndata(TMP0, TMP2, REF_S4); + + vis_ld64_2(ref, stride_8, TMP6); + ref += stride; + + vis_ld64(dest[0], DST_0); + vis_faligndata(TMP4, TMP6, REF_S0); + + vis_ld64_2(dest, stride, DST_2); + + if (off != 0x7) { + vis_alignaddr_g0((void *)off_plus_1); + vis_faligndata(TMP0, TMP2, REF_S6); + vis_faligndata(TMP4, TMP6, REF_S2); + } else { + vis_src1(TMP2, REF_S6); + vis_src1(TMP6, REF_S2); + } + + vis_mul8x16al(DST_0, CONST_1024, TMP30); + vis_pmerge(ZERO, REF_S4, TMP22); + + vis_mul8x16al(DST_1, CONST_1024, TMP32); + vis_pmerge(ZERO, REF_S4_1, TMP24); + + vis_mul8x16au(REF_S6, CONST_256, TMP26); + vis_pmerge(ZERO, REF_S6_1, TMP28); + + vis_mul8x16au(REF_S0, CONST_256, REF_S4); + vis_padd16(TMP22, CONST_6, TMP22); + + vis_mul8x16au(REF_S0_1, CONST_256, REF_S6); + vis_padd16(TMP24, CONST_6, TMP24); + + vis_mul8x16al(DST_2, CONST_1024, REF_0); + vis_padd16(TMP22, TMP26, TMP22); + + vis_mul8x16al(DST_3, CONST_1024, REF_2); + vis_padd16(TMP24, TMP28, TMP24); + + vis_mul8x16au(REF_S2, CONST_256, TMP26); + vis_padd16(TMP8, TMP22, TMP8); + + vis_mul8x16au(REF_S2_1, CONST_256, TMP28); + vis_padd16(TMP10, TMP24, TMP10); + + vis_padd16(TMP8, TMP12, TMP8); + + vis_padd16(TMP10, TMP14, TMP10); + + vis_padd16(TMP8, TMP30, TMP8); + + vis_padd16(TMP10, TMP32, TMP10); + vis_pack16(TMP8, DST_0); + + vis_pack16(TMP10, DST_1); + vis_st64(DST_0, dest[0]); + dest += stride; + + vis_padd16(REF_S4, TMP22, TMP12); + + vis_padd16(REF_S6, TMP24, TMP14); + + vis_padd16(TMP12, TMP26, TMP12); + + vis_padd16(TMP14, TMP28, TMP14); + + vis_padd16(TMP12, REF_0, TMP12); + + vis_padd16(TMP14, REF_2, TMP14); + vis_pack16(TMP12, DST_2); + + vis_pack16(TMP14, DST_3); + vis_st64(DST_2, dest[0]); + dest += stride; + } while (--height); +} + +/* End of no rounding code */ + +void get_pixels_vis(uint8_t *restrict dest, const uint8_t *_ref, int stride) +{ + int i; + uint8_t *ref = (uint8_t*)_ref; + ref = vis_alignaddr(ref); + + for (i = 
0; i < 8; i++) + { + vis_ld64(ref[0], TMP0); + vis_st64(TMP0, dest[0]); + dest += 8; + ref += stride; + } +} + +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void sigill_handler (int sig) +{ + if (!canjump) { + signal (sig, SIG_DFL); + raise (sig); + } + + canjump = 0; + siglongjmp (jmpbuf, 1); +} + +#define ACCEL_SPARC_VIS 1 +#define ACCEL_SPARC_VIS2 2 + +static int vis_level () +{ + int accel = 0; + + signal (SIGILL, sigill_handler); + if (sigsetjmp (jmpbuf, 1)) { + signal (SIGILL, SIG_DFL); + return accel; + } + + canjump = 1; + + /* pdist %f0, %f0, %f0 */ + __asm__ __volatile__(".word\t0x81b007c0"); + + canjump = 0; + accel |= ACCEL_SPARC_VIS; + + if (sigsetjmp (jmpbuf, 1)) { + signal (SIGILL, SIG_DFL); + return accel; + } + + canjump = 1; + + /* edge8n %g0, %g0, %g0 */ + __asm__ __volatile__(".word\t0x81b00020"); + + canjump = 0; + accel |= ACCEL_SPARC_VIS2; + + signal (SIGILL, SIG_DFL); + + return accel; +} + +/* libavcodec initialization code */ +void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx) +{ + /* VIS specific optimisations */ + int accel = vis_level (); + + if (accel & ACCEL_SPARC_VIS) { + c->get_pixels = get_pixels_vis; + c->put_pixels_tab[0][0] = MC_put_o_16_vis; + c->put_pixels_tab[0][1] = MC_put_x_16_vis; + c->put_pixels_tab[0][2] = MC_put_y_16_vis; + c->put_pixels_tab[0][3] = MC_put_xy_16_vis; + + c->put_pixels_tab[1][0] = MC_put_o_8_vis; + c->put_pixels_tab[1][1] = MC_put_x_8_vis; + c->put_pixels_tab[1][2] = MC_put_y_8_vis; + c->put_pixels_tab[1][3] = MC_put_xy_8_vis; + + c->avg_pixels_tab[0][0] = MC_avg_o_16_vis; + c->avg_pixels_tab[0][1] = MC_avg_x_16_vis; + c->avg_pixels_tab[0][2] = MC_avg_y_16_vis; + c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis; + + c->avg_pixels_tab[1][0] = MC_avg_o_8_vis; + c->avg_pixels_tab[1][1] = MC_avg_x_8_vis; + c->avg_pixels_tab[1][2] = MC_avg_y_8_vis; + c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis; + + c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis; + 
c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis; + c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis; + c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis; + + c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis; + c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis; + c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis; + c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis; + + c->avg_no_rnd_pixels_tab[0][0] = MC_avg_no_round_o_16_vis; + c->avg_no_rnd_pixels_tab[0][1] = MC_avg_no_round_x_16_vis; + c->avg_no_rnd_pixels_tab[0][2] = MC_avg_no_round_y_16_vis; + c->avg_no_rnd_pixels_tab[0][3] = MC_avg_no_round_xy_16_vis; + + c->avg_no_rnd_pixels_tab[1][0] = MC_avg_no_round_o_8_vis; + c->avg_no_rnd_pixels_tab[1][1] = MC_avg_no_round_x_8_vis; + c->avg_no_rnd_pixels_tab[1][2] = MC_avg_no_round_y_8_vis; + c->avg_no_rnd_pixels_tab[1][3] = MC_avg_no_round_xy_8_vis; + } +} + +#endif /* !(ARCH_SPARC) */ diff --git a/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c b/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c new file mode 100644 index 000000000..a09ee4e28 --- /dev/null +++ b/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c @@ -0,0 +1,2 @@ + +char libavcodec_mlib_dummy; diff --git a/src/libffmpeg/libavcodec/sparc/vis.h b/src/libffmpeg/libavcodec/sparc/vis.h new file mode 100644 index 000000000..07dda2949 --- /dev/null +++ b/src/libffmpeg/libavcodec/sparc/vis.h @@ -0,0 +1,328 @@ +/* + * vis.h + * Copyright (C) 2003 David S. Miller <davem@redhat.com> + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * See http://libmpeg2.sourceforge.net/ for updates. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* You may be asking why I hard-code the instruction opcodes and don't + * use the normal VIS assembler mnemonics for the VIS instructions. + * + * The reason is that Sun, in their infinite wisdom, decided that a binary + * using a VIS instruction will cause it to be marked (in the ELF headers) + * as doing so, and this prevents the OS from loading such binaries if the + * current cpu doesn't have VIS. There is no way to easily override this + * behavior of the assembler that I am aware of. + * + * This totally defeats what libmpeg2 is trying to do which is to allow a + * single binary to be created, and then detect the availability of VIS + * at runtime. + * + * I'm not saying that tainting the binary by default is bad, rather I'm + * saying that not providing a way to override this easily unnecessarily + * ties people's hands. + * + * Thus, we do the opcode encoding by hand and output 32-bit words in + * the assembler to keep the binary from becoming tainted. 
+ */ + +#define vis_opc_base ((0x1 << 31) | (0x36 << 19)) +#define vis_opf(X) ((X) << 5) +#define vis_sreg(X) (X) +#define vis_dreg(X) (((X)&0x1f)|((X)>>5)) +#define vis_rs1_s(X) (vis_sreg(X) << 14) +#define vis_rs1_d(X) (vis_dreg(X) << 14) +#define vis_rs2_s(X) (vis_sreg(X) << 0) +#define vis_rs2_d(X) (vis_dreg(X) << 0) +#define vis_rd_s(X) (vis_sreg(X) << 25) +#define vis_rd_d(X) (vis_dreg(X) << 25) + +#define vis_ss2s(opf,rs1,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs1_s(rs1) | \ + vis_rs2_s(rs2) | \ + vis_rd_s(rd))) + +#define vis_dd2d(opf,rs1,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs1_d(rs1) | \ + vis_rs2_d(rs2) | \ + vis_rd_d(rd))) + +#define vis_ss2d(opf,rs1,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs1_s(rs1) | \ + vis_rs2_s(rs2) | \ + vis_rd_d(rd))) + +#define vis_sd2d(opf,rs1,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs1_s(rs1) | \ + vis_rs2_d(rs2) | \ + vis_rd_d(rd))) + +#define vis_d2s(opf,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs2_d(rs2) | \ + vis_rd_s(rd))) + +#define vis_s2d(opf,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs2_s(rs2) | \ + vis_rd_d(rd))) + +#define vis_d12d(opf,rs1,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs1_d(rs1) | \ + vis_rd_d(rd))) + +#define vis_d22d(opf,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs2_d(rs2) | \ + vis_rd_d(rd))) + +#define vis_s12s(opf,rs1,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rs1_s(rs1) | \ + vis_rd_s(rd))) + +#define vis_s22s(opf,rs2,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + 
vis_rs2_s(rs2) | \ + vis_rd_s(rd))) + +#define vis_s(opf,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rd_s(rd))) + +#define vis_d(opf,rd) \ + __asm__ __volatile__ (".word %0" \ + : : "i" (vis_opc_base | vis_opf(opf) | \ + vis_rd_d(rd))) + +#define vis_r2m(op,rd,mem) \ + __asm__ __volatile__ (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) ) + +#define vis_r2m_2(op,rd,mem1,mem2) \ + __asm__ __volatile__ (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) ) + +#define vis_m2r(op,mem,rd) \ + __asm__ __volatile__ (#op "\t[%0], %%f" #rd : : "r" (&(mem)) ) + +#define vis_m2r_2(op,mem1,mem2,rd) \ + __asm__ __volatile__ (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) ) + +static inline void vis_set_gsr(unsigned int _val) +{ + register unsigned int val asm("g1"); + + val = _val; + __asm__ __volatile__(".word 0xa7804000" + : : "r" (val)); +} + +#define VIS_GSR_ALIGNADDR_MASK 0x0000007 +#define VIS_GSR_ALIGNADDR_SHIFT 0 +#define VIS_GSR_SCALEFACT_MASK 0x0000078 +#define VIS_GSR_SCALEFACT_SHIFT 3 + +#define vis_ld32(mem,rs1) vis_m2r(ld, mem, rs1) +#define vis_ld32_2(mem1,mem2,rs1) vis_m2r_2(ld, mem1, mem2, rs1) +#define vis_st32(rs1,mem) vis_r2m(st, rs1, mem) +#define vis_st32_2(rs1,mem1,mem2) vis_r2m_2(st, rs1, mem1, mem2) +#define vis_ld64(mem,rs1) vis_m2r(ldd, mem, rs1) +#define vis_ld64_2(mem1,mem2,rs1) vis_m2r_2(ldd, mem1, mem2, rs1) +#define vis_st64(rs1,mem) vis_r2m(std, rs1, mem) +#define vis_st64_2(rs1,mem1,mem2) vis_r2m_2(std, rs1, mem1, mem2) + +#define vis_ldblk(mem, rd) \ +do { register void *__mem asm("g1"); \ + __mem = &(mem); \ + __asm__ __volatile__(".word 0xc1985e00 | %1" \ + : \ + : "r" (__mem), \ + "i" (vis_rd_d(rd)) \ + : "memory"); \ +} while (0) + +#define vis_stblk(rd, mem) \ +do { register void *__mem asm("g1"); \ + __mem = &(mem); \ + __asm__ __volatile__(".word 0xc1b85e00 | %1" \ + : \ + : "r" (__mem), \ + "i" (vis_rd_d(rd)) \ + : "memory"); \ +} while (0) + +#define vis_membar_storestore() \ + 
__asm__ __volatile__(".word 0x8143e008" : : : "memory") + +#define vis_membar_sync() \ + __asm__ __volatile__(".word 0x8143e040" : : : "memory") + +/* 16 and 32 bit partitioned addition and subtraction. The normal + * versions perform 4 16-bit or 2 32-bit additions or subtractions. + * The 's' versions perform 2 16-bit or 1 32-bit additions or + * subtractions. + */ + +#define vis_padd16(rs1,rs2,rd) vis_dd2d(0x50, rs1, rs2, rd) +#define vis_padd16s(rs1,rs2,rd) vis_ss2s(0x51, rs1, rs2, rd) +#define vis_padd32(rs1,rs2,rd) vis_dd2d(0x52, rs1, rs2, rd) +#define vis_padd32s(rs1,rs2,rd) vis_ss2s(0x53, rs1, rs2, rd) +#define vis_psub16(rs1,rs2,rd) vis_dd2d(0x54, rs1, rs2, rd) +#define vis_psub16s(rs1,rs2,rd) vis_ss2s(0x55, rs1, rs2, rd) +#define vis_psub32(rs1,rs2,rd) vis_dd2d(0x56, rs1, rs2, rd) +#define vis_psub32s(rs1,rs2,rd) vis_ss2s(0x57, rs1, rs2, rd) + +/* Pixel formatting instructions. */ + +#define vis_pack16(rs2,rd) vis_d2s( 0x3b, rs2, rd) +#define vis_pack32(rs1,rs2,rd) vis_dd2d(0x3a, rs1, rs2, rd) +#define vis_packfix(rs2,rd) vis_d2s( 0x3d, rs2, rd) +#define vis_expand(rs2,rd) vis_s2d( 0x4d, rs2, rd) +#define vis_pmerge(rs1,rs2,rd) vis_ss2d(0x4b, rs1, rs2, rd) + +/* Partitioned multiply instructions. */ + +#define vis_mul8x16(rs1,rs2,rd) vis_sd2d(0x31, rs1, rs2, rd) +#define vis_mul8x16au(rs1,rs2,rd) vis_ss2d(0x33, rs1, rs2, rd) +#define vis_mul8x16al(rs1,rs2,rd) vis_ss2d(0x35, rs1, rs2, rd) +#define vis_mul8sux16(rs1,rs2,rd) vis_dd2d(0x36, rs1, rs2, rd) +#define vis_mul8ulx16(rs1,rs2,rd) vis_dd2d(0x37, rs1, rs2, rd) +#define vis_muld8sux16(rs1,rs2,rd) vis_ss2d(0x38, rs1, rs2, rd) +#define vis_muld8ulx16(rs1,rs2,rd) vis_ss2d(0x39, rs1, rs2, rd) + +/* Alignment instructions. 
*/ + +static inline void *vis_alignaddr(void *_ptr) +{ + register void *ptr asm("g1"); + + ptr = _ptr; + + __asm__ __volatile__(".word %2" + : "=&r" (ptr) + : "0" (ptr), + "i" (vis_opc_base | vis_opf(0x18) | + vis_rs1_s(1) | + vis_rs2_s(0) | + vis_rd_s(1))); + + return ptr; +} + +static inline void vis_alignaddr_g0(void *_ptr) +{ + register void *ptr asm("g1"); + + ptr = _ptr; + + __asm__ __volatile__(".word %2" + : "=&r" (ptr) + : "0" (ptr), + "i" (vis_opc_base | vis_opf(0x18) | + vis_rs1_s(1) | + vis_rs2_s(0) | + vis_rd_s(0))); +} + +static inline void *vis_alignaddrl(void *_ptr) +{ + register void *ptr asm("g1"); + + ptr = _ptr; + + __asm__ __volatile__(".word %2" + : "=&r" (ptr) + : "0" (ptr), + "i" (vis_opc_base | vis_opf(0x19) | + vis_rs1_s(1) | + vis_rs2_s(0) | + vis_rd_s(1))); + + return ptr; +} + +static inline void vis_alignaddrl_g0(void *_ptr) +{ + register void *ptr asm("g1"); + + ptr = _ptr; + + __asm__ __volatile__(".word %2" + : "=&r" (ptr) + : "0" (ptr), + "i" (vis_opc_base | vis_opf(0x19) | + vis_rs1_s(1) | + vis_rs2_s(0) | + vis_rd_s(0))); +} + +#define vis_faligndata(rs1,rs2,rd) vis_dd2d(0x48, rs1, rs2, rd) + +/* Logical operate instructions. 
*/ + +#define vis_fzero(rd) vis_d( 0x60, rd) +#define vis_fzeros(rd) vis_s( 0x61, rd) +#define vis_fone(rd) vis_d( 0x7e, rd) +#define vis_fones(rd) vis_s( 0x7f, rd) +#define vis_src1(rs1,rd) vis_d12d(0x74, rs1, rd) +#define vis_src1s(rs1,rd) vis_s12s(0x75, rs1, rd) +#define vis_src2(rs2,rd) vis_d22d(0x78, rs2, rd) +#define vis_src2s(rs2,rd) vis_s22s(0x79, rs2, rd) +#define vis_not1(rs1,rd) vis_d12d(0x6a, rs1, rd) +#define vis_not1s(rs1,rd) vis_s12s(0x6b, rs1, rd) +#define vis_not2(rs2,rd) vis_d22d(0x66, rs2, rd) +#define vis_not2s(rs2,rd) vis_s22s(0x67, rs2, rd) +#define vis_or(rs1,rs2,rd) vis_dd2d(0x7c, rs1, rs2, rd) +#define vis_ors(rs1,rs2,rd) vis_ss2s(0x7d, rs1, rs2, rd) +#define vis_nor(rs1,rs2,rd) vis_dd2d(0x62, rs1, rs2, rd) +#define vis_nors(rs1,rs2,rd) vis_ss2s(0x63, rs1, rs2, rd) +#define vis_and(rs1,rs2,rd) vis_dd2d(0x70, rs1, rs2, rd) +#define vis_ands(rs1,rs2,rd) vis_ss2s(0x71, rs1, rs2, rd) +#define vis_nand(rs1,rs2,rd) vis_dd2d(0x6e, rs1, rs2, rd) +#define vis_nands(rs1,rs2,rd) vis_ss2s(0x6f, rs1, rs2, rd) +#define vis_xor(rs1,rs2,rd) vis_dd2d(0x6c, rs1, rs2, rd) +#define vis_xors(rs1,rs2,rd) vis_ss2s(0x6d, rs1, rs2, rd) +#define vis_xnor(rs1,rs2,rd) vis_dd2d(0x72, rs1, rs2, rd) +#define vis_xnors(rs1,rs2,rd) vis_ss2s(0x73, rs1, rs2, rd) +#define vis_ornot1(rs1,rs2,rd) vis_dd2d(0x7a, rs1, rs2, rd) +#define vis_ornot1s(rs1,rs2,rd) vis_ss2s(0x7b, rs1, rs2, rd) +#define vis_ornot2(rs1,rs2,rd) vis_dd2d(0x76, rs1, rs2, rd) +#define vis_ornot2s(rs1,rs2,rd) vis_ss2s(0x77, rs1, rs2, rd) +#define vis_andnot1(rs1,rs2,rd) vis_dd2d(0x68, rs1, rs2, rd) +#define vis_andnot1s(rs1,rs2,rd) vis_ss2s(0x69, rs1, rs2, rd) +#define vis_andnot2(rs1,rs2,rd) vis_dd2d(0x64, rs1, rs2, rd) +#define vis_andnot2s(rs1,rs2,rd) vis_ss2s(0x65, rs1, rs2, rd) + +/* Pixel component distance. 
*/ + +#define vis_pdist(rs1,rs2,rd) vis_dd2d(0x3e, rs1, rs2, rd) diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c index 6a15270b7..781194f03 100644 --- a/src/libffmpeg/libavcodec/svq1.c +++ b/src/libffmpeg/libavcodec/svq1.c @@ -783,6 +783,8 @@ static int svq1_decode_init(AVCodecContext *avctx) MpegEncContext *s = avctx->priv_data; int i; + MPV_decode_defaults(s); + s->avctx = avctx; s->width = (avctx->width+3)&~3; s->height = (avctx->height+3)&~3; diff --git a/src/libffmpeg/libavcodec/truemotion1.c b/src/libffmpeg/libavcodec/truemotion1.c index 35bf3a788..2f6310192 100644 --- a/src/libffmpeg/libavcodec/truemotion1.c +++ b/src/libffmpeg/libavcodec/truemotion1.c @@ -36,9 +36,6 @@ #include "avcodec.h" #include "dsputil.h" -#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log -#define fprintf(...) {} - #include "truemotion1data.h" typedef struct TrueMotion1Context { @@ -232,7 +229,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s) header.header_size = ((s->buf[0] >> 5) | (s->buf[0] << 3)) & 0x7f; if (s->buf[0] < 0x10) { - printf("invalid header size\n"); + av_log(s->avctx, AV_LOG_ERROR, "invalid header size\n"); return -1; } @@ -282,7 +279,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s) } if (header.compression > 17) { - printf("invalid compression type (%d)\n", header.compression); + av_log(s->avctx, AV_LOG_ERROR, "invalid compression type (%d)\n", header.compression); return -1; } @@ -296,7 +293,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s) if (header.vectable < 4) sel_vector_table = tables[header.vectable - 1]; else { - printf("invalid vector table id (%d)\n", header.vectable); + av_log(s->avctx, AV_LOG_ERROR, "invalid vector table id (%d)\n", header.vectable); return -1; } } @@ -305,7 +302,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s) { if (compression_types[header.compression].algorithm == ALGO_RGB24H) { - printf("24bit compression not 
yet supported\n"); + av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n"); } else gen_vector_table(s, sel_vector_table); @@ -354,7 +351,7 @@ static int truemotion1_decode_init(AVCodecContext *avctx) #define GET_NEXT_INDEX() \ {\ if (index_stream_index >= s->index_stream_size) { \ - printf (" help! truemotion1 decoder went out of bounds\n"); \ + av_log(s->avctx, AV_LOG_INFO, " help! truemotion1 decoder went out of bounds\n"); \ return; \ } \ index = s->index_stream[index_stream_index++] * 4; \ @@ -542,7 +539,7 @@ static int truemotion1_decode_frame(AVCodecContext *avctx, s->frame.reference = 1; if (avctx->get_buffer(avctx, &s->frame) < 0) { - fprintf(stderr, "truemotion1: get_buffer() failed\n"); + av_log(s->avctx, AV_LOG_ERROR, "truemotion1: get_buffer() failed\n"); return -1; } @@ -561,7 +558,7 @@ static int truemotion1_decode_frame(AVCodecContext *avctx, memcpy(s->frame.data[0], s->prev_frame.data[0], s->frame.linesize[0] * s->avctx->height); } else if (compression_types[s->compression].algorithm == ALGO_RGB24H) { - printf (" 24-bit Duck TrueMotion decoding not yet implemented\n"); + av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n"); } else { truemotion1_decode_16bit(s); } diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c index 145f9df65..ffa0cb855 100644 --- a/src/libffmpeg/libavcodec/utils.c +++ b/src/libffmpeg/libavcodec/utils.c @@ -60,47 +60,40 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size) if(min_size < *size) return ptr; - *size= min_size + 10*1024; + *size= 17*min_size/16 + 32; return av_realloc(ptr, *size); } -/* allocation of static arrays - do not use for normal allocation */ static unsigned int last_static = 0; -static char*** array_static = NULL; -static const unsigned int grow_static = 64; // ^2 -void *__av_mallocz_static(void** location, unsigned int size) +static unsigned int allocated_static = 0; +static void** array_static = NULL; + +/** + * 
allocation of static arrays - do not use for normal allocation. + */ +void *av_mallocz_static(unsigned int size) { - unsigned int l = (last_static + grow_static) & ~(grow_static - 1); void *ptr = av_mallocz(size); - if (!ptr) - return NULL; - - if (location) - { - if (l > last_static) - array_static = av_realloc(array_static, l); - array_static[last_static++] = (char**) location; - *location = ptr; + + if(ptr){ + array_static =av_fast_realloc(array_static, &allocated_static, sizeof(void*)*(last_static+1)); + array_static[last_static++] = ptr; } + return ptr; } -/* free all static arrays and reset pointers to 0 */ + +/** + * free all static arrays and reset pointers to 0. + */ void av_free_static(void) { - if (array_static) - { - unsigned i; - for (i = 0; i < last_static; i++) - { - av_free(*array_static[i]); - *array_static[i] = NULL; - } - av_free(array_static); - array_static = 0; + while(last_static){ + av_freep(&array_static[--last_static]); } - last_static = 0; + av_freep(&array_static); } /** diff --git a/src/libffmpeg/libavcodec/vmdav.c b/src/libffmpeg/libavcodec/vmdav.c index 47c77513d..c09af1369 100644 --- a/src/libffmpeg/libavcodec/vmdav.c +++ b/src/libffmpeg/libavcodec/vmdav.c @@ -47,9 +47,6 @@ #include "avcodec.h" #include "dsputil.h" -#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log -#define fprintf(...) 
{} - #define VMD_HEADER_SIZE 0x330 #define PALETTE_COUNT 256 @@ -245,7 +242,7 @@ static void vmd_decode(VmdVideoContext *s) } } while (ofs < frame_width); if (ofs > frame_width) { - printf (" VMD video: offset > width (%d > %d)\n", + av_log(s->avctx, AV_LOG_ERROR, "VMD video: offset > width (%d > %d)\n", ofs, frame_width); break; } @@ -283,7 +280,7 @@ static void vmd_decode(VmdVideoContext *s) } } while (ofs < frame_width); if (ofs > frame_width) { - printf (" VMD video: offset > width (%d > %d)\n", + av_log(s->avctx, AV_LOG_ERROR, "VMD video: offset > width (%d > %d)\n", ofs, frame_width); } dp += s->frame.linesize[0]; @@ -311,7 +308,7 @@ static int vmdvideo_decode_init(AVCodecContext *avctx) /* make sure the VMD header made it */ if (s->avctx->extradata_size != VMD_HEADER_SIZE) { - printf(" VMD video: expected extradata size of %d\n", + av_log(s->avctx, AV_LOG_ERROR, "VMD video: expected extradata size of %d\n", VMD_HEADER_SIZE); return -1; } @@ -350,7 +347,7 @@ static int vmdvideo_decode_frame(AVCodecContext *avctx, s->frame.reference = 1; if (avctx->get_buffer(avctx, &s->frame)) { - printf (" VMD Video: get_buffer() failed\n"); + av_log(s->avctx, AV_LOG_ERROR, "VMD Video: get_buffer() failed\n"); return -1; } @@ -389,6 +386,7 @@ static int vmdvideo_decode_end(AVCodecContext *avctx) */ typedef struct VmdAudioContext { + AVCodecContext *avctx; int channels; int bits; int block_align; @@ -403,12 +401,13 @@ static int vmdaudio_decode_init(AVCodecContext *avctx) VmdAudioContext *s = (VmdAudioContext *)avctx->priv_data; int i; + s->avctx = avctx; s->channels = avctx->channels; s->bits = avctx->bits_per_sample; s->block_align = avctx->block_align; -printf (" %d channels, %d bits/sample, block align = %d, sample rate = %d\n", - s->channels, s->bits, s->block_align, avctx->sample_rate); + av_log(s->avctx, AV_LOG_DEBUG, "%d channels, %d bits/sample, block align = %d, sample rate = %d\n", + s->channels, s->bits, s->block_align, avctx->sample_rate); /* set up the steps8 
and steps16 tables */ for (i = 0; i < 8; i++) { @@ -465,8 +464,8 @@ static int vmdaudio_loadsound(VmdAudioContext *s, unsigned char *data, int bytes_decoded = 0; int i; -if (silence) - printf (" silent block!\n"); + if (silence) + av_log(s->avctx, AV_LOG_INFO, "silent block!\n"); if (s->channels == 2) { /* stereo handling */ @@ -520,7 +519,6 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx, unsigned char *p = buf + 16; unsigned char *p_end = buf + buf_size; -printf (" processing audio frame with %d bytes\n", buf_size); if (buf_size < 16) return buf_size; @@ -529,7 +527,6 @@ printf (" processing audio frame with %d bytes\n", buf_size); /* the chunk contains audio */ *data_size = vmdaudio_loadsound(s, output_samples, p, 0); } else if (buf[6] == 2) { -printf (" hey! audio case #2\n"); /* the chunk contains audio and silence mixed together */ sound_flags = LE_32(p); p += 4; @@ -549,13 +546,10 @@ printf (" hey! audio case #2\n"); sound_flags >>= 1; } } else if (buf[6] == 3) { -printf (" hey! audio case #3\n"); /* silent chunk */ *data_size = vmdaudio_loadsound(s, output_samples, p, 1); } -printf (" final sample count = %d, byte count = %d\n", (*data_size) / 2, - *data_size); return buf_size; } diff --git a/src/libffmpeg/libavcodec/vp3.c b/src/libffmpeg/libavcodec/vp3.c index eadfd39b9..0667d99eb 100644 --- a/src/libffmpeg/libavcodec/vp3.c +++ b/src/libffmpeg/libavcodec/vp3.c @@ -268,9 +268,11 @@ typedef struct Vp3DecodeContext { VLC ac_vlc_3[16]; VLC ac_vlc_4[16]; - int16_t intra_y_dequant[64]; - int16_t intra_c_dequant[64]; - int16_t inter_dequant[64]; + /* these arrays need to be on 16-byte boundaries since SSE2 operations + * index into them */ + int16_t __align16 intra_y_dequant[64]; + int16_t __align16 intra_c_dequant[64]; + int16_t __align16 inter_dequant[64]; /* This table contains superblock_count * 16 entries. Each set of 16 * numbers corresponds to the fragment indices 0..15 of the superblock. 
diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c index 25498c4d2..cf2db1494 100644 --- a/src/libffmpeg/libavcodec/wmadec.c +++ b/src/libffmpeg/libavcodec/wmadec.c @@ -20,6 +20,15 @@ /** * @file wmadec.c * WMA compatible decoder. + * This decoder handles Microsoft Windows Media Audio data, versions 1 & 2. + * WMA v1 is identified by audio format 0x160 in Microsoft media files + * (ASF/AVI/WAV). WMA v2 is identified by audio format 0x161. + * + * To use this decoder, a calling application must supply the extra data + * bytes provided with the WMA data. These are the extra, codec-specific + * bytes at the end of a WAVEFORMATEX data structure. Transmit these bytes + * to the decoder using the extradata[_size] fields in AVCodecContext. There + * should be 4 extra bytes for v1 data and 6 extra bytes for v2 data. */ #include "avcodec.h" diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c index 130a7f89d..376f0706e 100644 --- a/src/libffmpeg/libavcodec/wmv2.c +++ b/src/libffmpeg/libavcodec/wmv2.c @@ -181,7 +181,7 @@ int ff_wmv2_encode_picture_header(MpegEncContext * s, int picture_number) put_bits(&s->pb, 1, s->dc_table_index); put_bits(&s->pb, 1, s->mv_table_index); - s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE); + s->inter_intra_pred= 0;//(s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE); } s->esc3_level_length= 0; s->esc3_run_length= 0; @@ -216,7 +216,7 @@ void ff_wmv2_encode_mb(MpegEncContext * s, wmv2_inter_table[w->cbp_table_index][cbp + 64][0]); /* motion vector */ - h263_pred_motion(s, 0, &pred_x, &pred_y); + h263_pred_motion(s, 0, 0, &pred_x, &pred_y); msmpeg4_encode_motion(s, motion_x - pred_x, motion_y - pred_y); } else { @@ -443,7 +443,7 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext * s) s->dc_table_index = get_bits1(&s->gb); s->mv_table_index = get_bits1(&s->gb); - s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE); 
+ s->inter_intra_pred= 0;//(s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE); s->no_rounding ^= 1; if(s->avctx->debug&FF_DEBUG_PICT_INFO){ @@ -504,7 +504,7 @@ static int16_t *wmv2_pred_motion(Wmv2Context *w, int *px, int *py){ int xy, wrap, diff, type; int16_t *A, *B, *C, *mot_val; - wrap = s->block_wrap[0]; + wrap = s->b8_stride; xy = s->block_index[0]; mot_val = s->current_picture.motion_val[0][xy]; |