summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMiguel Freitas <miguelfreitas@users.sourceforge.net>2004-04-25 18:57:04 +0000
committerMiguel Freitas <miguelfreitas@users.sourceforge.net>2004-04-25 18:57:04 +0000
commita2a44876712f079610f0396fb9a682ea47e05b6e (patch)
tree1f3e328dfe6a5f9fa7c79e7a23bf6310be2827fd
parentacb7dc0f256afc24e875a168da989ef25d86b7b7 (diff)
downloadxine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.gz
xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.bz2
ffmpeg sync
CVS patchset: 6437 CVS date: 2004/04/25 18:57:04
-rw-r--r--CREDITS2
-rw-r--r--configure.ac1
-rw-r--r--src/libffmpeg/libavcodec/Makefile.am3
-rw-r--r--src/libffmpeg/libavcodec/avcodec.h43
-rw-r--r--src/libffmpeg/libavcodec/cabac.c2
-rw-r--r--src/libffmpeg/libavcodec/common.h61
-rw-r--r--src/libffmpeg/libavcodec/dsputil.c3
-rw-r--r--src/libffmpeg/libavcodec/dsputil.h16
-rw-r--r--src/libffmpeg/libavcodec/dv.c463
-rw-r--r--src/libffmpeg/libavcodec/dvdata.h2
-rw-r--r--src/libffmpeg/libavcodec/error_resilience.c53
-rw-r--r--src/libffmpeg/libavcodec/h263.c201
-rw-r--r--src/libffmpeg/libavcodec/h263dec.c7
-rw-r--r--src/libffmpeg/libavcodec/h264.c1732
-rw-r--r--src/libffmpeg/libavcodec/h264data.h595
-rw-r--r--src/libffmpeg/libavcodec/i386/Makefile.am3
-rw-r--r--src/libffmpeg/libavcodec/i386/dsputil_mmx.c12
-rw-r--r--src/libffmpeg/libavcodec/i386/fdct_mmx.c21
-rw-r--r--src/libffmpeg/libavcodec/i386/mmx.h24
-rw-r--r--src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c803
-rw-r--r--src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c890
-rw-r--r--src/libffmpeg/libavcodec/imgresample.c53
-rw-r--r--src/libffmpeg/libavcodec/mjpeg.c41
-rw-r--r--src/libffmpeg/libavcodec/motion_est.c1018
-rw-r--r--src/libffmpeg/libavcodec/motion_est_template.c352
-rw-r--r--src/libffmpeg/libavcodec/mpeg12.c303
-rw-r--r--src/libffmpeg/libavcodec/mpegaudiodec.c9
-rw-r--r--src/libffmpeg/libavcodec/mpegvideo.c323
-rw-r--r--src/libffmpeg/libavcodec/mpegvideo.h47
-rw-r--r--src/libffmpeg/libavcodec/msmpeg4.c12
-rw-r--r--src/libffmpeg/libavcodec/ppc/Makefile.am1
-rw-r--r--src/libffmpeg/libavcodec/ppc/dsputil_altivec.c353
-rw-r--r--src/libffmpeg/libavcodec/ppc/dsputil_altivec.h3
-rw-r--r--src/libffmpeg/libavcodec/ppc/dsputil_ppc.c8
-rw-r--r--src/libffmpeg/libavcodec/ppc/dsputil_ppc.h37
-rw-r--r--src/libffmpeg/libavcodec/ppc/fdct_altivec.c498
-rw-r--r--src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c3
-rw-r--r--src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c152
-rw-r--r--src/libffmpeg/libavcodec/rv10.c28
-rw-r--r--src/libffmpeg/libavcodec/smc.c19
-rw-r--r--src/libffmpeg/libavcodec/sparc/Makefile.am15
-rw-r--r--src/libffmpeg/libavcodec/sparc/dsputil_vis.c4107
-rw-r--r--src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c2
-rw-r--r--src/libffmpeg/libavcodec/sparc/vis.h328
-rw-r--r--src/libffmpeg/libavcodec/svq1.c2
-rw-r--r--src/libffmpeg/libavcodec/truemotion1.c17
-rw-r--r--src/libffmpeg/libavcodec/utils.c47
-rw-r--r--src/libffmpeg/libavcodec/vmdav.c26
-rw-r--r--src/libffmpeg/libavcodec/vp3.c8
-rw-r--r--src/libffmpeg/libavcodec/wmadec.c9
-rw-r--r--src/libffmpeg/libavcodec/wmv2.c8
51 files changed, 10694 insertions, 2072 deletions
diff --git a/CREDITS b/CREDITS
index 0893a09d6..3bcd3b3e0 100644
--- a/CREDITS
+++ b/CREDITS
@@ -12,7 +12,7 @@ updates (the word 'maintainer' is intentionally avoided here).
project version mediator
-----------------------------------------------------------------------
-ffmpeg build 4707 Mike Melanson
+ffmpeg build 4710 Mike Melanson
goom 1.9dev5
gsm610 1.0.10 Mike Melanson
liba52 0.7.4
diff --git a/configure.ac b/configure.ac
index 6ae201efb..10f01a955 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1962,6 +1962,7 @@ src/libffmpeg/libavcodec/i386/Makefile
src/libffmpeg/libavcodec/mlib/Makefile
src/libffmpeg/libavcodec/alpha/Makefile
src/libffmpeg/libavcodec/ppc/Makefile
+src/libffmpeg/libavcodec/sparc/Makefile
src/libffmpeg/libavcodec/libpostproc/Makefile
src/libflac/Makefile
src/liblpcm/Makefile
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index 8e5d53df3..7bbcbd281 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -1,6 +1,6 @@
include $(top_srcdir)/misc/Makefile.common
-SUBDIRS = armv4l i386 mlib alpha ppc libpostproc
+SUBDIRS = armv4l i386 mlib alpha ppc sparc libpostproc
## some files here are #included by others... go figure.
EXTRA_DIST = fdctref.c motion_est_template.c svq3.c wmv2.c \
@@ -80,6 +80,7 @@ libavcodec_la_LDFLAGS = \
$(top_builddir)/src/libffmpeg/libavcodec/i386/libavcodec_mmx.la \
$(top_builddir)/src/libffmpeg/libavcodec/mlib/libavcodec_mlib.la \
$(top_builddir)/src/libffmpeg/libavcodec/ppc/libavcodec_ppc.la \
+ $(top_builddir)/src/libffmpeg/libavcodec/sparc/libavcodec_sparc.la \
-avoid-version -module
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 510bd41d2..731bcd375 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -24,7 +24,7 @@ extern "C" {
#define FFMPEG_VERSION_INT 0x000408
#define FFMPEG_VERSION "0.4.8"
-#define LIBAVCODEC_BUILD 4707
+#define LIBAVCODEC_BUILD 4710
#define LIBAVCODEC_VERSION_INT FFMPEG_VERSION_INT
#define LIBAVCODEC_VERSION FFMPEG_VERSION
@@ -449,7 +449,7 @@ typedef struct AVPanScan{
\
/**\
* Motion vector table\
- * - encoding: unused\
+ * - encoding: set by user\
* - decoding: set by lavc\
*/\
int16_t (*motion_val[2])[2];\
@@ -457,7 +457,7 @@ typedef struct AVPanScan{
/**\
* Macroblock type table\
* mb_type_base + mb_width + 2\
- * - encoding: unused\
+ * - encoding: set by user\
* - decoding: set by lavc\
*/\
uint32_t *mb_type;\
@@ -545,13 +545,20 @@ typedef struct AVPanScan{
* - decoding: set by lavc\
*/\
short *dct_coeff;\
+\
+ /**\
+ * Motion referece frame index\
+ * - encoding: set by user\
+ * - decoding: set by lavc\
+ */\
+ int8_t *ref_index[2];
#define FF_QSCALE_TYPE_MPEG1 0
#define FF_QSCALE_TYPE_MPEG2 1
#define FF_BUFFER_TYPE_INTERNAL 1
#define FF_BUFFER_TYPE_USER 2 ///< Direct rendering buffers (image is (de)allocated by user)
-#define FF_BUFFER_TYPE_SHARED 4 ///< buffer from somewher else, dont dealloc image (data/base)
+#define FF_BUFFER_TYPE_SHARED 4 ///< buffer from somewher else, dont dealloc image (data/base), all other tables are not shared
#define FF_BUFFER_TYPE_COPY 8 ///< just a (modified) copy of some other buffer, dont dealloc anything
@@ -847,6 +854,7 @@ typedef struct AVCodecContext {
#define FF_BUG_QPEL_CHROMA2 256
#define FF_BUG_DIRECT_BLOCKSIZE 512
#define FF_BUG_EDGE 1024
+#define FF_BUG_HPEL_CHROMA 2048
//#define FF_BUG_FAKE_SCALABILITY 16 //autodetection should work 100%
/**
@@ -1567,6 +1575,22 @@ typedef struct AVCodecContext {
* - decoding: set by execute()
*/
void *thread_opaque;
+
+ /**
+ * Motion estimation threshold. under which no motion estimation is
+ * performed, but instead the user specified motion vectors are used
+ *
+ * - encoding: set by user
+ * - decoding: unused
+ */
+ int me_threshold;
+
+ /**
+ * Macroblock threshold. under which the user specified macroblock types will be used
+ * - encoding: set by user
+ * - decoding: unused
+ */
+ int mb_threshold;
} AVCodecContext;
@@ -1676,6 +1700,7 @@ extern AVCodec h263p_encoder;
extern AVCodec flv_encoder;
extern AVCodec rv10_encoder;
extern AVCodec rv20_encoder;
+extern AVCodec dvvideo_encoder;
extern AVCodec mjpeg_encoder;
extern AVCodec ljpeg_encoder;
extern AVCodec mpeg4_encoder;
@@ -1826,7 +1851,10 @@ ImgReSampleContext *img_resample_init(int output_width, int output_height,
ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
int iwidth, int iheight,
int topBand, int bottomBand,
- int leftBand, int rightBand);
+ int leftBand, int rightBand,
+ int padtop, int padbottom,
+ int padleft, int padright);
+
void img_resample(ImgReSampleContext *s,
AVPicture *output, const AVPicture *input);
@@ -1901,6 +1929,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode);
void avcodec_get_context_defaults(AVCodecContext *s);
AVCodecContext *avcodec_alloc_context(void);
+void avcodec_get_frame_defaults(AVFrame *pic);
AVFrame *avcodec_alloc_frame(void);
int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic);
@@ -2102,8 +2131,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
/* for static data only */
/* call av_free_static to release all staticaly allocated tables */
void av_free_static(void);
-void *__av_mallocz_static(void** location, unsigned int size);
-#define av_mallocz_static(p, s) __av_mallocz_static((void **)(p), s)
+void *av_mallocz_static(unsigned int size);
/* add by bero : in adx.c */
int is_adx(const unsigned char *buf,size_t bufsize);
@@ -2115,6 +2143,7 @@ void img_copy(AVPicture *dst, const AVPicture *src,
#include <stdarg.h>
+#define AV_LOG_QUIET -1
#define AV_LOG_ERROR 0
#define AV_LOG_INFO 1
#define AV_LOG_DEBUG 2
diff --git a/src/libffmpeg/libavcodec/cabac.c b/src/libffmpeg/libavcodec/cabac.c
index 27e63045b..0e3e14f56 100644
--- a/src/libffmpeg/libavcodec/cabac.c
+++ b/src/libffmpeg/libavcodec/cabac.c
@@ -113,7 +113,7 @@ void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4],
c->mps_state[2*i+0]= 2*mps_state[i];
c->mps_state[2*i+1]= 2*mps_state[i]+1;
- if(lps_state[i]){
+ if( i ){
c->lps_state[2*i+0]= 2*lps_state[i];
c->lps_state[2*i+1]= 2*lps_state[i]+1;
}else{
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 59b128cef..de9382a13 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -6,6 +6,11 @@
#ifndef COMMON_H
#define COMMON_H
+// xine: disable DEBUG for ffmpeg (too noisy)
+#ifdef DEBUG
+#undef DEBUG
+#endif
+
#if defined(WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
# define CONFIG_WIN32
#endif
@@ -131,7 +136,7 @@ static inline float floorf(float f) {
/* windows */
-# ifndef __MINGW32__
+# if !defined(__MINGW32__) && !defined(__CYGWIN__)
# define int64_t_C(c) (c ## i64)
# define uint64_t_C(c) (c ## i64)
@@ -204,40 +209,30 @@ static inline float floorf(float f) {
/* debug stuff */
-# ifndef DEBUG
-# ifndef NDEBUG
+# if !defined(DEBUG) && !defined(NDEBUG)
# define NDEBUG
-# endif
# endif
# include <assert.h>
/* dprintf macros */
-# if defined(CONFIG_WIN32) && !defined(__MINGW32__)
+# if defined(CONFIG_WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
inline void dprintf(const char* fmt,...) {}
# else
-#if __GNUC__
-#ifdef DEBUG
-#define dprintf(fmt,args...) printf(fmt, ## args)
-#else
-#define dprintf(fmt,args...)
-#endif
-#else
-#ifdef DEBUG
-#define dprintf(...) printf(__VA_ARGS__)
-#else
-#define dprintf(...)
-#endif
-#endif
+# ifdef DEBUG
+# define dprintf(fmt,...) av_log(NULL, AV_LOG_DEBUG, fmt, __VA_ARGS__)
+# else
+# define dprintf(fmt,...)
+# endif
# endif /* !CONFIG_WIN32 */
# define av_abort() do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)
//rounded divison & shift
-#define RSHIFT(a,b) ((a) > 0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b))
+#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
/* assume b>0 */
#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
#define ABS(a) ((a) >= 0 ? (a) : (-(a)))
@@ -291,6 +286,7 @@ struct PutBitContext;
typedef void (*WriteDataFunc)(void *, uint8_t *, int);
+/* buf and buf_end must be present and used by every alternative writer. */
typedef struct PutBitContext {
#ifdef ALT_BITSTREAM_WRITER
uint8_t *buf, *buf_end;
@@ -327,11 +323,6 @@ static inline int put_bits_count(PutBitContext *s)
#endif
}
-static inline int put_bits_left(PutBitContext* s)
-{
- return (s->buf_end - s->buf) * 8 - put_bits_count(s);
-}
-
/* pad the end of the output stream with zeros */
static inline void flush_put_bits(PutBitContext *s)
{
@@ -354,7 +345,7 @@ void align_put_bits(PutBitContext *s);
void put_string(PutBitContext * pbc, char *s, int put_zero);
/* bit input */
-
+/* buffer, buffer_end and size_in_bits must be present and used by every reader */
typedef struct GetBitContext {
const uint8_t *buffer, *buffer_end;
#ifdef ALT_BITSTREAM_READER
@@ -386,7 +377,7 @@ typedef struct RL_VLC_ELEM {
uint8_t run;
} RL_VLC_ELEM;
-#ifdef ARCH_SPARC64
+#ifdef ARCH_SPARC
#define UNALIGNED_STORES_ARE_BAD
#endif
@@ -437,7 +428,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
bit_buf<<=bit_left;
bit_buf |= value >> (n - bit_left);
#ifdef UNALIGNED_STORES_ARE_BAD
- if (3 & (int) s->buf_ptr) {
+ if (3 & (intptr_t) s->buf_ptr) {
s->buf_ptr[0] = bit_buf >> 24;
s->buf_ptr[1] = bit_buf >> 16;
s->buf_ptr[2] = bit_buf >> 8;
@@ -924,11 +915,6 @@ static inline void init_get_bits(GetBitContext *s,
#endif
}
-static inline int get_bits_left(GetBitContext *s)
-{
- return s->size_in_bits - get_bits_count(s);
-}
-
int check_marker(GetBitContext *s, const char *msg);
void align_get_bits(GetBitContext *s);
int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
@@ -1080,7 +1066,7 @@ static inline int get_xbits_trace(GetBitContext *s, int n, char *file, char *fun
#define get_vlc(s, vlc) get_vlc_trace(s, (vlc)->table, (vlc)->bits, 3, __FILE__, __PRETTY_FUNCTION__, __LINE__)
#define get_vlc2(s, tab, bits, max) get_vlc_trace(s, tab, bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define tprintf printf
+#define tprintf(...) av_log(NULL, AV_LOG_DEBUG, __VA_ARGS__)
#else //TRACE
#define tprintf(...) {}
@@ -1182,6 +1168,12 @@ static inline int clip(int a, int amin, int amax)
return a;
}
+static inline int clip_uint8(int a)
+{
+ if (a&(~255)) return (-a)>>31;
+ else return a;
+}
+
/* math */
extern const uint8_t ff_sqrt_tab[128];
@@ -1290,6 +1282,9 @@ tend= rdtsc();\
#define malloc please_use_av_malloc
#define free please_use_av_free
#define realloc please_use_av_realloc
+#define time time_is_forbidden_due_to_security_issues
+#define rand rand_is_forbidden_due_to_state_trashing
+#define srand srand_is_forbidden_due_to_state_trashing
#if !(defined(LIBAVFORMAT_BUILD) || defined(_FRAMEHOOK_H))
#define printf please_use_av_log
#define fprintf please_use_av_log
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index 7f26bd98a..fce0b8163 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -3286,6 +3286,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
#ifdef HAVE_MLIB
dsputil_init_mlib(c, avctx);
#endif
+#ifdef ARCH_SPARC
+ dsputil_init_vis(c,avctx);
+#endif
#ifdef ARCH_ALPHA
dsputil_init_alpha(c, avctx);
#endif
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index 35e965db0..730e1489d 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -76,6 +76,12 @@ void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
int coeff_count, uint8_t *dest, int stride);
+void vp3_dsp_init_sse2(void);
+void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
+ int coeff_count, uint8_t *dest, int stride);
+void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
+ int coeff_count, uint8_t *dest, int stride);
+
/* minimum alignment rules ;)
if u notice errors in the align stuff, need more alignment for some asm code for some cpu
@@ -378,6 +384,8 @@ static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
one or more MultiMedia extension */
int mm_support(void);
+#define __align16 __attribute__ ((aligned (16)))
+
#if defined(HAVE_MMX)
#undef emms_c
@@ -413,7 +421,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
#elif defined(ARCH_ARMV4L)
/* This is to use 4 bytes read to the IDCT pointers for some 'zero'
- line ptimizations */
+ line optimizations */
#define __align8 __attribute__ ((aligned (4)))
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
@@ -425,6 +433,12 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
+#elif defined(ARCH_SPARC)
+
+/* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
+#define __align8 __attribute__ ((aligned (8)))
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
+
#elif defined(ARCH_ALPHA)
#define __align8 __attribute__ ((aligned (8)))
diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c
index 08be11d45..5f1eaaa3b 100644
--- a/src/libffmpeg/libavcodec/dv.c
+++ b/src/libffmpeg/libavcodec/dv.c
@@ -1,6 +1,7 @@
/*
* DV decoder
* Copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2004 Roman Shaposhnik.
*
* DV encoder
* Copyright (c) 2003 Roman Shaposhnik.
@@ -33,20 +34,18 @@
#include "simple_idct.h"
#include "dvdata.h"
-typedef struct DVVideoDecodeContext {
+typedef struct DVVideoContext {
const DVprofile* sys;
AVFrame picture;
+ uint8_t *buf;
uint8_t dv_zigzag[2][64];
- uint8_t dv_idct_shift[2][22][64];
+ uint8_t dv_idct_shift[2][2][22][64];
void (*get_pixels)(DCTELEM *block, const uint8_t *pixels, int line_size);
void (*fdct[2])(DCTELEM *block);
void (*idct_put[2])(uint8_t *dest, int line_size, DCTELEM *block);
-
- GetBitContext gb;
- DCTELEM block[5*6][64] __align8;
-} DVVideoDecodeContext;
+} DVVideoContext;
#define TEX_VLC_BITS 9
@@ -58,15 +57,18 @@ typedef struct DVVideoDecodeContext {
#define DV_VLC_MAP_LEV_SIZE 512
#endif
+/* MultiThreading */
+static uint8_t** dv_anchor;
+
/* XXX: also include quantization */
-static RL_VLC_ELEM *dv_rl_vlc[1];
+static RL_VLC_ELEM *dv_rl_vlc;
/* VLC encoding lookup table */
static struct dv_vlc_pair {
uint32_t vlc;
uint8_t size;
} (*dv_vlc_map)[DV_VLC_MAP_LEV_SIZE] = NULL;
-static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm)
+static void dv_build_unquantize_tables(DVVideoContext *s, uint8_t* perm)
{
int i, q, j;
@@ -76,29 +78,34 @@ static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm)
for(i = 1; i < 64; i++) {
/* 88 table */
j = perm[i];
- s->dv_idct_shift[0][q][j] =
+ s->dv_idct_shift[0][0][q][j] =
dv_quant_shifts[q][dv_88_areas[i]] + 1;
+ s->dv_idct_shift[1][0][q][j] = s->dv_idct_shift[0][0][q][j] + 1;
}
/* 248DCT */
for(i = 1; i < 64; i++) {
/* 248 table */
- s->dv_idct_shift[1][q][i] =
+ s->dv_idct_shift[0][1][q][i] =
dv_quant_shifts[q][dv_248_areas[i]] + 1;
+ s->dv_idct_shift[1][1][q][i] = s->dv_idct_shift[0][1][q][i] + 1;
}
}
}
static int dvvideo_init(AVCodecContext *avctx)
{
- DVVideoDecodeContext *s = avctx->priv_data;
+ DVVideoContext *s = avctx->priv_data;
DSPContext dsp;
static int done=0;
int i, j;
if (!done) {
- int i;
VLC dv_vlc;
+ uint16_t new_dv_vlc_bits[NB_DV_VLC*2];
+ uint8_t new_dv_vlc_len[NB_DV_VLC*2];
+ uint8_t new_dv_vlc_run[NB_DV_VLC*2];
+ int16_t new_dv_vlc_level[NB_DV_VLC*2];
done = 1;
@@ -106,13 +113,42 @@ static int dvvideo_init(AVCodecContext *avctx)
if (!dv_vlc_map)
return -ENOMEM;
+ /* dv_anchor lets each thread know its Id */
+ dv_anchor = av_malloc(12*27*sizeof(void*));
+ if (!dv_anchor) {
+ av_free(dv_vlc_map);
+ return -ENOMEM;
+ }
+ for (i=0; i<12*27; i++)
+ dv_anchor[i] = (void*)(size_t)i;
+
+ /* it's faster to include sign bit in a generic VLC parsing scheme */
+ for (i=0, j=0; i<NB_DV_VLC; i++, j++) {
+ new_dv_vlc_bits[j] = dv_vlc_bits[i];
+ new_dv_vlc_len[j] = dv_vlc_len[i];
+ new_dv_vlc_run[j] = dv_vlc_run[i];
+ new_dv_vlc_level[j] = dv_vlc_level[i];
+
+ if (dv_vlc_level[i]) {
+ new_dv_vlc_bits[j] <<= 1;
+ new_dv_vlc_len[j]++;
+
+ j++;
+ new_dv_vlc_bits[j] = (dv_vlc_bits[i] << 1) | 1;
+ new_dv_vlc_len[j] = dv_vlc_len[i] + 1;
+ new_dv_vlc_run[j] = dv_vlc_run[i];
+ new_dv_vlc_level[j] = -dv_vlc_level[i];
+ }
+ }
+
/* NOTE: as a trick, we use the fact the no codes are unused
to accelerate the parsing of partial codes */
- init_vlc(&dv_vlc, TEX_VLC_BITS, NB_DV_VLC,
- dv_vlc_len, 1, 1, dv_vlc_bits, 2, 2);
+ init_vlc(&dv_vlc, TEX_VLC_BITS, j,
+ new_dv_vlc_len, 1, 1, new_dv_vlc_bits, 2, 2);
- dv_rl_vlc[0] = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM));
- if (!dv_rl_vlc[0]) {
+ dv_rl_vlc = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM));
+ if (!dv_rl_vlc) {
+ av_free(dv_anchor);
av_free(dv_vlc_map);
return -ENOMEM;
}
@@ -124,18 +160,15 @@ static int dvvideo_init(AVCodecContext *avctx)
if(len<0){ //more bits needed
run= 0;
level= code;
- } else if (code == (NB_DV_VLC - 1)) {
- /* EOB */
- run = 0;
- level = 256;
} else {
- run= dv_vlc_run[code] + 1;
- level= dv_vlc_level[code];
+ run= new_dv_vlc_run[code] + 1;
+ level= new_dv_vlc_level[code];
}
- dv_rl_vlc[0][i].len = len;
- dv_rl_vlc[0][i].level = level;
- dv_rl_vlc[0][i].run = run;
+ dv_rl_vlc[i].len = len;
+ dv_rl_vlc[i].level = level;
+ dv_rl_vlc[i].run = run;
}
+ free_vlc(&dv_vlc);
for (i = 0; i < NB_DV_VLC - 1; i++) {
if (dv_vlc_run[i] >= DV_VLC_MAP_RUN_SIZE || dv_vlc_level[i] >= DV_VLC_MAP_LEV_SIZE)
@@ -202,13 +235,19 @@ static int dvvideo_init(AVCodecContext *avctx)
return 0;
}
+static int dvvideo_end(AVCodecContext *avctx)
+{
+ avcodec_default_free_buffers(avctx);
+ return 0;
+}
+
// #define VLC_DEBUG
+// #define printf(...) av_log(NULL, AV_LOG_ERROR, __VA_ARGS__)
typedef struct BlockInfo {
const uint8_t *shift_table;
const uint8_t *scan_table;
uint8_t pos; /* position in block */
- uint8_t eob_reached; /* true if EOB has been reached */
uint8_t dct_mode;
uint8_t partial_bit_count;
uint16_t partial_bit_buffer;
@@ -228,141 +267,88 @@ static const int mb_area_start[5] = { 1, 6, 21, 43, 64 };
#warning only works with ALT_BITSTREAM_READER
#endif
+static inline int get_bits_left(GetBitContext *s)
+{
+ return s->size_in_bits - get_bits_count(s);
+}
+
+static inline int get_bits_size(GetBitContext *s)
+{
+ return s->size_in_bits;
+}
+
+static inline int put_bits_left(PutBitContext* s)
+{
+ return (s->buf_end - s->buf) * 8 - put_bits_count(s);
+}
+
/* decode ac coefs */
-static void dv_decode_ac(DVVideoDecodeContext *s,
- BlockInfo *mb, DCTELEM *block, int last_index)
+static void dv_decode_ac(GetBitContext *gb, BlockInfo *mb, DCTELEM *block)
{
- int last_re_index;
- int shift_offset = mb->shift_offset;
+ int last_index = get_bits_size(gb);
const uint8_t *scan_table = mb->scan_table;
const uint8_t *shift_table = mb->shift_table;
int pos = mb->pos;
- int level, pos1, sign, run;
- int partial_bit_count;
-#ifndef ALT_BITSTREAM_READER //FIXME
- int re_index=0;
- int re1_index=0;
-#endif
- OPEN_READER(re, &s->gb);
+ int partial_bit_count = mb->partial_bit_count;
+ int level, pos1, run, vlc_len, index;
+
+ OPEN_READER(re, gb);
+ UPDATE_CACHE(re, gb);
-#ifdef VLC_DEBUG
- printf("start\n");
-#endif
-
/* if we must parse a partial vlc, we do it here */
- partial_bit_count = mb->partial_bit_count;
if (partial_bit_count > 0) {
- uint8_t buf[4];
- uint32_t v;
- int l, l1;
- GetBitContext gb1;
-
- /* build the dummy bit buffer */
- l = 16 - partial_bit_count;
- UPDATE_CACHE(re, &s->gb);
-#ifdef VLC_DEBUG
- printf("show=%04x\n", SHOW_UBITS(re, &s->gb, 16));
-#endif
- v = (mb->partial_bit_buffer << l) | SHOW_UBITS(re, &s->gb, l);
- buf[0] = v >> 8;
- buf[1] = v;
-#ifdef VLC_DEBUG
- printf("v=%04x cnt=%d %04x\n",
- v, partial_bit_count, (mb->partial_bit_buffer << l));
-#endif
- /* try to read the codeword */
- init_get_bits(&gb1, buf, 4*8);
- {
- OPEN_READER(re1, &gb1);
- UPDATE_CACHE(re1, &gb1);
- GET_RL_VLC(level, run, re1, &gb1, dv_rl_vlc[0],
- TEX_VLC_BITS, 2);
- l = re1_index;
- CLOSE_READER(re1, &gb1);
- }
-#ifdef VLC_DEBUG
- printf("****run=%d level=%d size=%d\n", run, level, l);
-#endif
- /* compute codeword length */
- l1 = (level != 256 && level != 0);
- /* if too long, we cannot parse */
- l -= partial_bit_count;
- if ((re_index + l + l1) > last_index)
- return;
- /* skip read bits */
- last_re_index = 0; /* avoid warning */
- re_index += l;
- /* by definition, if we can read the vlc, all partial bits
- will be read (otherwise we could have read the vlc before) */
- mb->partial_bit_count = 0;
- UPDATE_CACHE(re, &s->gb);
- goto handle_vlc;
+ re_cache = ((unsigned)re_cache >> partial_bit_count) |
+ (mb->partial_bit_buffer << (sizeof(re_cache)*8 - partial_bit_count));
+ re_index -= partial_bit_count;
+ mb->partial_bit_count = 0;
}
/* get the AC coefficients until last_index is reached */
for(;;) {
- UPDATE_CACHE(re, &s->gb);
#ifdef VLC_DEBUG
- printf("%2d: bits=%04x index=%d\n",
- pos, SHOW_UBITS(re, &s->gb, 16), re_index);
+ printf("%2d: bits=%04x index=%d\n", pos, SHOW_UBITS(re, gb, 16), re_index);
#endif
- last_re_index = re_index;
- GET_RL_VLC(level, run, re, &s->gb, dv_rl_vlc[0],
- TEX_VLC_BITS, 2);
- handle_vlc:
+ /* our own optimized GET_RL_VLC */
+ index = NEG_USR32(re_cache, TEX_VLC_BITS);
+ vlc_len = dv_rl_vlc[index].len;
+ if (vlc_len < 0) {
+ index = NEG_USR32((unsigned)re_cache << TEX_VLC_BITS, -vlc_len) + dv_rl_vlc[index].level;
+ vlc_len = TEX_VLC_BITS - vlc_len;
+ }
+ level = dv_rl_vlc[index].level;
+ run = dv_rl_vlc[index].run;
+
+ /* gotta check if we're still within gb boundaries */
+ if (re_index + vlc_len > last_index) {
+ /* should be < 16 bits otherwise a codeword could have been parsed */
+ mb->partial_bit_count = last_index - re_index;
+ mb->partial_bit_buffer = NEG_USR32(re_cache, mb->partial_bit_count);
+ re_index = last_index;
+ break;
+ }
+ re_index += vlc_len;
+
#ifdef VLC_DEBUG
- printf("run=%d level=%d\n", run, level);
+ printf("run=%d level=%d\n", run, level);
#endif
- if (level == 256) {
- if (re_index > last_index) {
- cannot_read:
- /* put position before read code */
- re_index = last_re_index;
- mb->eob_reached = 0;
- break;
- }
- /* EOB */
- mb->eob_reached = 1;
- break;
- } else if (level != 0) {
- if ((re_index + 1) > last_index)
- goto cannot_read;
- sign = SHOW_SBITS(re, &s->gb, 1);
- level = (level ^ sign) - sign;
- LAST_SKIP_BITS(re, &s->gb, 1);
- pos += run;
- /* error */
- if (pos >= 64) {
- goto read_error;
- }
+ pos += run;
+ if (pos >= 64)
+ break;
+
+ if (level) {
pos1 = scan_table[pos];
- level = level << (shift_table[pos1] + shift_offset);
- block[pos1] = level;
- // printf("run=%d level=%d shift=%d\n", run, level, shift_table[pos1]);
- } else {
- if (re_index > last_index)
- goto cannot_read;
- /* level is zero: means run without coding. No
- sign is coded */
- pos += run;
- /* error */
- if (pos >= 64) {
- read_error:
-#if defined(VLC_DEBUG) || 1
- av_log(NULL, AV_LOG_ERROR, "error pos=%d\n", pos);
-#endif
- /* for errors, we consider the eob is reached */
- mb->eob_reached = 1;
- break;
- }
- }
+ block[pos1] = level << shift_table[pos1];
+ }
+
+ UPDATE_CACHE(re, gb);
}
- CLOSE_READER(re, &s->gb);
+ CLOSE_READER(re, gb);
mb->pos = pos;
}
-static inline void bit_copy(PutBitContext *pb, GetBitContext *gb, int bits_left)
+static inline void bit_copy(PutBitContext *pb, GetBitContext *gb)
{
+ int bits_left = get_bits_left(gb);
while (bits_left >= 16) {
put_bits(pb, 16, get_bits(gb, 16));
bits_left -= 16;
@@ -373,60 +359,56 @@ static inline void bit_copy(PutBitContext *pb, GetBitContext *gb, int bits_left)
}
/* mb_x and mb_y are in units of 8 pixels */
-static inline void dv_decode_video_segment(DVVideoDecodeContext *s,
+static inline void dv_decode_video_segment(DVVideoContext *s,
uint8_t *buf_ptr1,
const uint16_t *mb_pos_ptr)
{
int quant, dc, dct_mode, class1, j;
int mb_index, mb_x, mb_y, v, last_index;
DCTELEM *block, *block1;
- int c_offset, bits_left;
+ int c_offset;
uint8_t *y_ptr;
- BlockInfo mb_data[5 * 6], *mb, *mb1;
void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
uint8_t *buf_ptr;
PutBitContext pb, vs_pb;
+ GetBitContext gb;
+ BlockInfo mb_data[5 * 6], *mb, *mb1;
+ DCTELEM sblock[5*6][64] __align8;
uint8_t mb_bit_buffer[80 + 4]; /* allow some slack */
- int mb_bit_count;
uint8_t vs_bit_buffer[5 * 80 + 4]; /* allow some slack */
- int vs_bit_count;
-
- memset(s->block, 0, sizeof(s->block));
+
+ memset(sblock, 0, sizeof(sblock));
/* pass 1 : read DC and AC coefficients in blocks */
buf_ptr = buf_ptr1;
- block1 = &s->block[0][0];
+ block1 = &sblock[0][0];
mb1 = mb_data;
init_put_bits(&vs_pb, vs_bit_buffer, 5 * 80);
- vs_bit_count = 0;
- for(mb_index = 0; mb_index < 5; mb_index++) {
+ for(mb_index = 0; mb_index < 5; mb_index++, mb1 += 6, block1 += 6 * 64) {
/* skip header */
quant = buf_ptr[3] & 0x0f;
buf_ptr += 4;
init_put_bits(&pb, mb_bit_buffer, 80);
- mb_bit_count = 0;
mb = mb1;
block = block1;
for(j = 0;j < 6; j++) {
- /* NOTE: size is not important here */
- init_get_bits(&s->gb, buf_ptr, 14*8);
+ last_index = block_sizes[j];
+ init_get_bits(&gb, buf_ptr, last_index);
/* get the dc */
- dc = get_bits(&s->gb, 9);
+ dc = get_bits(&gb, 9);
dc = (dc << (32 - 9)) >> (32 - 9);
- dct_mode = get_bits1(&s->gb);
+ dct_mode = get_bits1(&gb);
mb->dct_mode = dct_mode;
mb->scan_table = s->dv_zigzag[dct_mode];
- class1 = get_bits(&s->gb, 2);
- mb->shift_offset = (class1 == 3);
- mb->shift_table = s->dv_idct_shift[dct_mode]
+ class1 = get_bits(&gb, 2);
+ mb->shift_table = s->dv_idct_shift[class1 == 3][dct_mode]
[quant + dv_quant_offset[class1]];
dc = dc << 2;
/* convert to unsigned because 128 is not added in the
standard IDCT */
dc += 1024;
block[0] = dc;
- last_index = block_sizes[j];
buf_ptr += last_index >> 3;
mb->pos = 0;
mb->partial_bit_count = 0;
@@ -434,88 +416,64 @@ static inline void dv_decode_video_segment(DVVideoDecodeContext *s,
#ifdef VLC_DEBUG
printf("MB block: %d, %d ", mb_index, j);
#endif
- dv_decode_ac(s, mb, block, last_index);
+ dv_decode_ac(&gb, mb, block);
/* write the remaining bits in a new buffer only if the
block is finished */
- bits_left = last_index - get_bits_count(&s->gb);
- if (mb->eob_reached) {
- mb->partial_bit_count = 0;
- mb_bit_count += bits_left;
- bit_copy(&pb, &s->gb, bits_left);
- } else {
- /* should be < 16 bits otherwise a codeword could have
- been parsed */
- mb->partial_bit_count = bits_left;
- mb->partial_bit_buffer = get_bits(&s->gb, bits_left);
- }
+ if (mb->pos >= 64)
+ bit_copy(&pb, &gb);
+
block += 64;
mb++;
}
- flush_put_bits(&pb);
-
/* pass 2 : we can do it just after */
#ifdef VLC_DEBUG
- printf("***pass 2 size=%d MB#=%d\n", mb_bit_count, mb_index);
+ printf("***pass 2 size=%d MB#=%d\n", put_bits_count(&pb), mb_index);
#endif
block = block1;
mb = mb1;
- init_get_bits(&s->gb, mb_bit_buffer, 80*8);
- for(j = 0;j < 6; j++) {
- if (!mb->eob_reached && get_bits_count(&s->gb) < mb_bit_count) {
- dv_decode_ac(s, mb, block, mb_bit_count);
+ init_get_bits(&gb, mb_bit_buffer, put_bits_count(&pb));
+ flush_put_bits(&pb);
+ for(j = 0;j < 6; j++, block += 64, mb++) {
+ if (mb->pos < 64 && get_bits_left(&gb) > 0) {
+ dv_decode_ac(&gb, mb, block);
/* if still not finished, no need to parse other blocks */
- if (!mb->eob_reached) {
- /* we could not parse the current AC coefficient,
- so we add the remaining bytes */
- bits_left = mb_bit_count - get_bits_count(&s->gb);
- if (bits_left > 0) {
- mb->partial_bit_count += bits_left;
- mb->partial_bit_buffer =
- (mb->partial_bit_buffer << bits_left) |
- get_bits(&s->gb, bits_left);
- }
- goto next_mb;
- }
+ if (mb->pos < 64)
+ break;
}
- block += 64;
- mb++;
}
/* all blocks are finished, so the extra bytes can be used at
the video segment level */
- bits_left = mb_bit_count - get_bits_count(&s->gb);
- vs_bit_count += bits_left;
- bit_copy(&vs_pb, &s->gb, bits_left);
- next_mb:
- mb1 += 6;
- block1 += 6 * 64;
+ if (j >= 6)
+ bit_copy(&vs_pb, &gb);
}
/* we need a pass other the whole video segment */
- flush_put_bits(&vs_pb);
-
#ifdef VLC_DEBUG
- printf("***pass 3 size=%d\n", vs_bit_count);
+ printf("***pass 3 size=%d\n", put_bits_count(&vs_pb));
#endif
- block = &s->block[0][0];
+ block = &sblock[0][0];
mb = mb_data;
- init_get_bits(&s->gb, vs_bit_buffer, 5 * 80*8);
+ init_get_bits(&gb, vs_bit_buffer, put_bits_count(&vs_pb));
+ flush_put_bits(&vs_pb);
for(mb_index = 0; mb_index < 5; mb_index++) {
for(j = 0;j < 6; j++) {
- if (!mb->eob_reached) {
+ if (mb->pos < 64) {
#ifdef VLC_DEBUG
printf("start %d:%d\n", mb_index, j);
#endif
- dv_decode_ac(s, mb, block, vs_bit_count);
+ dv_decode_ac(&gb, mb, block);
}
+ if (mb->pos >= 64 && mb->pos < 127)
+ av_log(NULL, AV_LOG_ERROR, "AC EOB marker is absent pos=%d\n", mb->pos);
block += 64;
mb++;
}
}
/* compute idct and place blocks */
- block = &s->block[0][0];
+ block = &sblock[0][0];
mb = mb_data;
for(mb_index = 0; mb_index < 5; mb_index++) {
v = *mb_pos_ptr++;
@@ -790,7 +748,7 @@ static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos)
* horrible and the weighting is missing. But it's missing from the
* decoding step also -- so at least we're on the same page with decoder ;-)
*/
-static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
+static inline void dv_encode_video_segment(DVVideoContext *s,
uint8_t *dif,
const uint16_t *mb_pos_ptr)
{
@@ -801,6 +759,7 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
uint8_t* ptr;
int do_edge_wrap;
DCTELEM block[64] __align8;
+ DCTELEM sblock[5*6][64] __align8;
EncBlockInfo enc_blks[5*6];
PutBitContext pbs[5*6];
PutBitContext* pb;
@@ -854,7 +813,7 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
}
enc_blk->dct_mode = dv_guess_dct_mode(block);
- enc_blk->mb = &s->block[mb_index*6+j][0];
+ enc_blk->mb = &sblock[mb_index*6+j][0];
enc_blk->area_q[0] = enc_blk->area_q[1] = enc_blk->area_q[2] = enc_blk->area_q[3] = 0;
enc_blk->partial_bit_count = 0;
enc_blk->partial_bit_buffer = 0;
@@ -906,15 +865,31 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
flush_put_bits(&pbs[j]);
}
+static int dv_decode_mt(AVCodecContext *avctx, void* sl)
+{
+ DVVideoContext *s = avctx->priv_data;
+ int slice = (size_t)sl;
+ dv_decode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80],
+ &s->sys->video_place[slice*5]);
+ return 0;
+}
+
+static int dv_encode_mt(AVCodecContext *avctx, void* sl)
+{
+ DVVideoContext *s = avctx->priv_data;
+ int slice = (size_t)sl;
+ dv_encode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80],
+ &s->sys->video_place[slice*5]);
+ return 0;
+}
+
/* NOTE: exactly one frame must be given (120000 bytes for NTSC,
144000 bytes for PAL) */
static int dvvideo_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
uint8_t *buf, int buf_size)
{
- DVVideoDecodeContext *s = avctx->priv_data;
- int ds, vs;
- const uint16_t *mb_pos_ptr;
+ DVVideoContext *s = avctx->priv_data;
*data_size=0;
/* special case for last picture */
@@ -925,7 +900,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
if (!s->sys || buf_size < s->sys->frame_size)
return -1; /* NOTE: we only accept several full frames */
-
if(s->picture.data[0])
avctx->release_buffer(avctx, &s->picture);
@@ -940,24 +914,10 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
s->picture.interlaced_frame = 1;
s->picture.top_field_first = 0;
- /* for each DIF segment */
- mb_pos_ptr = s->sys->video_place;
- for (ds = 0; ds < s->sys->difseg_size; ds++) {
- buf += 6 * 80; /* skip DIF segment header */
-
- for(vs = 0; vs < 27; vs++) {
- if ((vs % 3) == 0)
- buf += 80; /* skip audio block */
-
-#ifdef VLC_DEBUG
- printf("********************* %d, %d **********************\n", ds, vs);
-#endif
- dv_decode_video_segment(s, buf, mb_pos_ptr);
- buf += 5 * 80;
- mb_pos_ptr += 5;
- }
- }
-
+ s->buf = buf;
+ avctx->execute(avctx, dv_decode_mt, (void**)&dv_anchor[0], NULL,
+ s->sys->difseg_size * 27);
+
emms_c();
/* return image */
@@ -970,9 +930,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size,
void *data)
{
- DVVideoDecodeContext *s = c->priv_data;
- const uint16_t *mb_pos_ptr;
- int ds, vs;
+ DVVideoContext *s = c->priv_data;
s->sys = dv_codec_profile(c);
if (!s->sys)
@@ -981,41 +939,34 @@ static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size,
c->pix_fmt = s->sys->pix_fmt;
s->picture = *((AVFrame *)data);
- /* for each DIF segment */
- mb_pos_ptr = s->sys->video_place;
- for (ds = 0; ds < s->sys->difseg_size; ds++) {
- buf += 6 * 80; /* skip DIF segment header */
-
- for(vs = 0; vs < 27; vs++) {
- if ((vs % 3) == 0)
- buf += 80; /* skip audio block */
-
-#ifdef VLC_DEBUG
- printf("********************* %d, %d **********************\n", ds, vs);
-#endif
- dv_encode_video_segment(s, buf, mb_pos_ptr);
- buf += 5 * 80;
- mb_pos_ptr += 5;
- }
- }
+ s->buf = buf;
+ c->execute(c, dv_encode_mt, (void**)&dv_anchor[0], NULL,
+ s->sys->difseg_size * 27);
emms_c();
return s->sys->frame_size;
}
-static int dvvideo_end(AVCodecContext *avctx)
-{
- avcodec_default_free_buffers(avctx);
- return 0;
-}
+AVCodec dvvideo_encoder = {
+ "dvvideo",
+ CODEC_TYPE_VIDEO,
+ CODEC_ID_DVVIDEO,
+ sizeof(DVVideoContext),
+ dvvideo_init,
+ dvvideo_encode_frame,
+ dvvideo_end,
+ NULL,
+ CODEC_CAP_DR1,
+ NULL
+};
AVCodec dvvideo_decoder = {
"dvvideo",
CODEC_TYPE_VIDEO,
CODEC_ID_DVVIDEO,
- sizeof(DVVideoDecodeContext),
+ sizeof(DVVideoContext),
dvvideo_init,
- dvvideo_encode_frame,
+ NULL,
dvvideo_end,
dvvideo_decode_frame,
CODEC_CAP_DR1,
diff --git a/src/libffmpeg/libavcodec/dvdata.h b/src/libffmpeg/libavcodec/dvdata.h
index e6e0986ba..e60d99448 100644
--- a/src/libffmpeg/libavcodec/dvdata.h
+++ b/src/libffmpeg/libavcodec/dvdata.h
@@ -218,7 +218,7 @@ static const uint8_t dv_vlc_run[409] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0,
+127,
};
static const uint8_t dv_vlc_level[409] = {
diff --git a/src/libffmpeg/libavcodec/error_resilience.c b/src/libffmpeg/libavcodec/error_resilience.c
index c6b10a79c..b7aeebddf 100644
--- a/src/libffmpeg/libavcodec/error_resilience.c
+++ b/src/libffmpeg/libavcodec/error_resilience.c
@@ -45,7 +45,7 @@ static void put_dc(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
{
int dc, dcu, dcv, y, i;
for(i=0; i<4; i++){
- dc= s->dc_val[0][mb_x*2+1 + (i&1) + (mb_y*2+1 + (i>>1))*(s->mb_width*2+2)];
+ dc= s->dc_val[0][mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride];
if(dc<0) dc=0;
else if(dc>2040) dc=2040;
for(y=0; y<8; y++){
@@ -55,8 +55,8 @@ static void put_dc(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
}
}
}
- dcu = s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)];
- dcv = s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)];
+ dcu = s->dc_val[1][mb_x + mb_y*s->mb_stride];
+ dcv = s->dc_val[2][mb_x + mb_y*s->mb_stride];
if (dcu<0 ) dcu=0;
else if(dcu>2040) dcu=2040;
if (dcv<0 ) dcv=0;
@@ -209,8 +209,8 @@ static void h_block_filter(MpegEncContext *s, uint8_t *dst, int w, int h, int st
int left_damage = left_status&(DC_ERROR|AC_ERROR|MV_ERROR);
int right_damage= right_status&(DC_ERROR|AC_ERROR|MV_ERROR);
int offset= b_x*8 + b_y*stride*8;
- int16_t *left_mv= s->current_picture.motion_val[0][s->block_wrap[0]*((b_y<<(1-is_luma)) + 1) + ( b_x <<(1-is_luma))];
- int16_t *right_mv= s->current_picture.motion_val[0][s->block_wrap[0]*((b_y<<(1-is_luma)) + 1) + ((b_x+1)<<(1-is_luma))];
+ int16_t *left_mv= s->current_picture.motion_val[0][s->b8_stride*(b_y<<(1-is_luma)) + ( b_x <<(1-is_luma))];
+ int16_t *right_mv= s->current_picture.motion_val[0][s->b8_stride*(b_y<<(1-is_luma)) + ((b_x+1)<<(1-is_luma))];
if(!(left_damage||right_damage)) continue; // both undamaged
@@ -269,8 +269,8 @@ static void v_block_filter(MpegEncContext *s, uint8_t *dst, int w, int h, int st
int top_damage = top_status&(DC_ERROR|AC_ERROR|MV_ERROR);
int bottom_damage= bottom_status&(DC_ERROR|AC_ERROR|MV_ERROR);
int offset= b_x*8 + b_y*stride*8;
- int16_t *top_mv= s->current_picture.motion_val[0][s->block_wrap[0]*(( b_y <<(1-is_luma)) + 1) + (b_x<<(1-is_luma))];
- int16_t *bottom_mv= s->current_picture.motion_val[0][s->block_wrap[0]*(((b_y+1)<<(1-is_luma)) + 1) + (b_x<<(1-is_luma))];
+ int16_t *top_mv= s->current_picture.motion_val[0][s->b8_stride*( b_y <<(1-is_luma)) + (b_x<<(1-is_luma))];
+ int16_t *bottom_mv= s->current_picture.motion_val[0][s->b8_stride*((b_y+1)<<(1-is_luma)) + (b_x<<(1-is_luma))];
if(!(top_damage||bottom_damage)) continue; // both undamaged
@@ -378,8 +378,8 @@ int score_sum=0;
int j;
int best_score=256*256*256*64;
int best_pred=0;
- const int mot_stride= mb_width*2+2;
- const int mot_index= mb_x*2 + 1 + (mb_y*2+1)*mot_stride;
+ const int mot_stride= s->b8_stride;
+ const int mot_index= mb_x*2 + mb_y*2*mot_stride;
int prev_x= s->current_picture.motion_val[0][mot_index][0];
int prev_y= s->current_picture.motion_val[0][mot_index][1];
@@ -672,14 +672,15 @@ void ff_er_frame_end(MpegEncContext *s){
av_log(s->avctx, AV_LOG_INFO, "concealing errors\n");
if(s->current_picture.motion_val[0] == NULL){
- int size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
+ int size = s->b8_stride * 2 * s->mb_height;
Picture *pic= s->current_picture_ptr;
av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
for(i=0; i<2; i++){
- pic->motion_val_base[i]= av_mallocz((size+1) * 2 * sizeof(uint16_t)); //FIXME size
- pic->motion_val[i]= pic->motion_val_base[i]+1;
+ pic->ref_index[i]= av_mallocz(size * sizeof(uint8_t));
+ pic->motion_val_base[i]= av_mallocz((size+2) * 2 * sizeof(uint16_t));
+ pic->motion_val[i]= pic->motion_val_base[i]+2;
}
pic->motion_subsample_log2= 3;
s->current_picture= *s->current_picture_ptr;
@@ -845,17 +846,17 @@ void ff_er_frame_end(MpegEncContext *s){
s->mb_intra=0;
s->mb_skiped=0;
if(IS_8X8(mb_type)){
- int mb_index= mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0];
+ int mb_index= mb_x*2 + mb_y*2*s->b8_stride;
int j;
s->mv_type = MV_TYPE_8X8;
for(j=0; j<4; j++){
- s->mv[0][j][0] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->block_wrap[0] ][0];
- s->mv[0][j][1] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->block_wrap[0] ][1];
+ s->mv[0][j][0] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->b8_stride ][0];
+ s->mv[0][j][1] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->b8_stride ][1];
}
}else{
s->mv_type = MV_TYPE_16X16;
- s->mv[0][0][0] = s->current_picture.motion_val[0][ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][0];
- s->mv[0][0][1] = s->current_picture.motion_val[0][ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][1];
+ s->mv[0][0][0] = s->current_picture.motion_val[0][ mb_x*2 + mb_y*2*s->b8_stride ][0];
+ s->mv[0][0][1] = s->current_picture.motion_val[0][ mb_x*2 + mb_y*2*s->b8_stride ][1];
}
s->dsp.clear_blocks(s->block[0]);
@@ -870,7 +871,7 @@ void ff_er_frame_end(MpegEncContext *s){
if(s->pict_type==B_TYPE){
for(mb_y=0; mb_y<s->mb_height; mb_y++){
for(mb_x=0; mb_x<s->mb_width; mb_x++){
- int xy= mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0];
+ int xy= mb_x*2 + mb_y*2*s->b8_stride;
const int mb_xy= mb_x + mb_y * s->mb_stride;
const int mb_type= s->current_picture.mb_type[mb_xy];
error= s->error_status_table[mb_xy];
@@ -930,7 +931,7 @@ void ff_er_frame_end(MpegEncContext *s){
dest_cb= s->current_picture.data[1] + mb_x*8 + mb_y*8 *s->uvlinesize;
dest_cr= s->current_picture.data[2] + mb_x*8 + mb_y*8 *s->uvlinesize;
- dc_ptr= &s->dc_val[0][mb_x*2+1 + (mb_y*2+1)*(s->mb_width*2+2)];
+ dc_ptr= &s->dc_val[0][mb_x*2 + mb_y*2*s->b8_stride];
for(n=0; n<4; n++){
dc=0;
for(y=0; y<8; y++){
@@ -939,7 +940,7 @@ void ff_er_frame_end(MpegEncContext *s){
dc+= dest_y[x + (n&1)*8 + (y + (n>>1)*8)*s->linesize];
}
}
- dc_ptr[(n&1) + (n>>1)*(s->mb_width*2+2)]= (dc+4)>>3;
+ dc_ptr[(n&1) + (n>>1)*s->b8_stride]= (dc+4)>>3;
}
dcu=dcv=0;
@@ -950,18 +951,18 @@ void ff_er_frame_end(MpegEncContext *s){
dcv+=dest_cr[x + y*(s->uvlinesize)];
}
}
- s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)]= (dcu+4)>>3;
- s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)]= (dcv+4)>>3;
+ s->dc_val[1][mb_x + mb_y*s->mb_stride]= (dcu+4)>>3;
+ s->dc_val[2][mb_x + mb_y*s->mb_stride]= (dcv+4)>>3;
}
}
#if 1
/* guess DC for damaged blocks */
- guess_dc(s, s->dc_val[0] + s->mb_width*2+3, s->mb_width*2, s->mb_height*2, s->mb_width*2+2, 1);
- guess_dc(s, s->dc_val[1] + s->mb_width +3, s->mb_width , s->mb_height , s->mb_width +2, 0);
- guess_dc(s, s->dc_val[2] + s->mb_width +3, s->mb_width , s->mb_height , s->mb_width +2, 0);
+ guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1);
+ guess_dc(s, s->dc_val[1], s->mb_width , s->mb_height , s->mb_stride, 0);
+ guess_dc(s, s->dc_val[2], s->mb_width , s->mb_height , s->mb_stride, 0);
#endif
/* filter luma DC */
- filter181(s->dc_val[0] + s->mb_width*2+3, s->mb_width*2, s->mb_height*2, s->mb_width*2+2);
+ filter181(s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride);
#if 1
/* render DC only intra */
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index 8a60ff08b..ec776eb98 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -75,7 +75,7 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr);
static void mpeg4_encode_visual_object_header(MpegEncContext * s);
static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_number);
#endif //CONFIG_ENCODERS
-static void mpeg4_decode_sprite_trajectory(MpegEncContext * s);
+static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb);
static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr, int *dir_ptr);
#ifdef CONFIG_ENCODERS
@@ -577,12 +577,13 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){
} else if(IS_INTERLACED(colocated_mb_type)){
s->mv_type = MV_TYPE_FIELD;
for(i=0; i<2; i++){
+ int field_select= s->next_picture.ref_index[0][s->block_index[2*i]];
if(s->top_field_first){
- time_pp= s->pp_field_time - s->p_field_select_table[i][mb_index] + i;
- time_pb= s->pb_field_time - s->p_field_select_table[i][mb_index] + i;
+ time_pp= s->pp_field_time - field_select + i;
+ time_pb= s->pb_field_time - field_select + i;
}else{
- time_pp= s->pp_field_time + s->p_field_select_table[i][mb_index] - i;
- time_pb= s->pb_field_time + s->p_field_select_table[i][mb_index] - i;
+ time_pp= s->pp_field_time + field_select - i;
+ time_pb= s->pb_field_time + field_select - i;
}
s->mv[0][i][0] = s->p_field_mv_table[i][0][mb_index][0]*time_pb/time_pp + mx;
s->mv[0][i][1] = s->p_field_mv_table[i][0][mb_index][1]*time_pb/time_pp + my;
@@ -610,7 +611,7 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){
void ff_h263_update_motion_val(MpegEncContext * s){
const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
//FIXME a lot of thet is only needed for !low_delay
- const int wrap = s->block_wrap[0];
+ const int wrap = s->b8_stride;
const int xy = s->block_index[0];
s->current_picture.mbskip_table[mb_xy]= s->mb_skiped;
@@ -631,10 +632,13 @@ void ff_h263_update_motion_val(MpegEncContext * s){
for(i=0; i<2; i++){
s->p_field_mv_table[i][0][mb_xy][0]= s->mv[0][i][0];
s->p_field_mv_table[i][0][mb_xy][1]= s->mv[0][i][1];
- s->p_field_select_table[i][mb_xy]= s->field_select[0][i];
}
+ s->current_picture.ref_index[0][xy ]=
+ s->current_picture.ref_index[0][xy + 1]= s->field_select[0][0];
+ s->current_picture.ref_index[0][xy + wrap ]=
+ s->current_picture.ref_index[0][xy + wrap + 1]= s->field_select[0][1];
}
-
+
/* no update if 8X8 because it has been done during parsing */
s->current_picture.motion_val[0][xy][0] = motion_x;
s->current_picture.motion_val[0][xy][1] = motion_y;
@@ -985,7 +989,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
}
/* motion vectors: 16x16 mode */
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
h263_encode_motion(s, motion_x - pred_x, s->f_code);
h263_encode_motion(s, motion_y - pred_y, s->f_code);
@@ -1009,7 +1013,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
}
/* motion vectors: 16x8 interlaced mode */
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
pred_y /=2;
put_bits(&s->pb, 1, s->field_select[0][0]);
@@ -1037,7 +1041,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
for(i=0; i<4; i++){
/* motion vectors: 8x8 mode*/
- h263_pred_motion(s, i, &pred_x, &pred_y);
+ h263_pred_motion(s, i, 0, &pred_x, &pred_y);
h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][0] - pred_x, s->f_code);
h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][1] - pred_y, s->f_code);
@@ -1185,7 +1189,7 @@ void h263_encode_mb(MpegEncContext * s,
}
/* motion vectors: 16x16 mode */
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
if (!s->umvplus) {
h263_encode_motion(s, motion_x - pred_x, 1);
@@ -1212,7 +1216,7 @@ void h263_encode_mb(MpegEncContext * s,
for(i=0; i<4; i++){
/* motion vectors: 8x8 mode*/
- h263_pred_motion(s, i, &pred_x, &pred_y);
+ h263_pred_motion(s, i, 0, &pred_x, &pred_y);
motion_x= s->current_picture.motion_val[0][ s->block_index[i] ][0];
motion_y= s->current_picture.motion_val[0][ s->block_index[i] ][1];
@@ -1435,16 +1439,16 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr)
/* find prediction */
if (n < 4) {
- x = 2 * s->mb_x + 1 + (n & 1);
- y = 2 * s->mb_y + 1 + ((n & 2) >> 1);
- wrap = s->mb_width * 2 + 2;
+ x = 2 * s->mb_x + (n & 1);
+ y = 2 * s->mb_y + ((n & 2) >> 1);
+ wrap = s->b8_stride;
dc_val = s->dc_val[0];
ac_val = s->ac_val[0][0];
scale = s->y_dc_scale;
} else {
- x = s->mb_x + 1;
- y = s->mb_y + 1;
- wrap = s->mb_width + 2;
+ x = s->mb_x;
+ y = s->mb_y;
+ wrap = s->mb_stride;
dc_val = s->dc_val[n - 4 + 1];
ac_val = s->ac_val[n - 4 + 1][0];
scale = s->c_dc_scale;
@@ -1456,8 +1460,10 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr)
c = dc_val[(x) + (y - 1) * wrap];
/* No prediction outside GOB boundary */
- if (s->first_slice_line && ((n < 2) || (n > 3)))
- c = 1024;
+ if(s->first_slice_line && n!=3){
+ if(n!=2) c= 1024;
+ if(n!=1 && s->mb_x == s->resync_mb_x) a= 1024;
+ }
pred_dc = 1024;
/* just DC prediction */
if (a != 1024 && c != 1024)
@@ -1480,16 +1486,16 @@ static void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n)
/* find prediction */
if (n < 4) {
- x = 2 * s->mb_x + 1 + (n & 1);
- y = 2 * s->mb_y + 1 + (n>> 1);
- wrap = s->mb_width * 2 + 2;
+ x = 2 * s->mb_x + (n & 1);
+ y = 2 * s->mb_y + (n>> 1);
+ wrap = s->b8_stride;
dc_val = s->dc_val[0];
ac_val = s->ac_val[0][0];
scale = s->y_dc_scale;
} else {
- x = s->mb_x + 1;
- y = s->mb_y + 1;
- wrap = s->mb_width + 2;
+ x = s->mb_x;
+ y = s->mb_y;
+ wrap = s->mb_stride;
dc_val = s->dc_val[n - 4 + 1];
ac_val = s->ac_val[n - 4 + 1][0];
scale = s->c_dc_scale;
@@ -1560,78 +1566,15 @@ static void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n)
ac_val1[8 + i] = block[s->dsp.idct_permutation[i ]];
}
-int16_t *h263_pred_motion(MpegEncContext * s, int block,
+int16_t *h263_pred_motion(MpegEncContext * s, int block, int dir,
int *px, int *py)
{
- int xy, wrap;
- int16_t *A, *B, *C, *mot_val;
- static const int off[4]= {2, 1, 1, -1};
-
- wrap = s->block_wrap[0];
- xy = s->block_index[block];
-
- mot_val = s->current_picture.motion_val[0][xy];
-
- A = s->current_picture.motion_val[0][xy - 1];
- /* special case for first (slice) line */
- if (s->first_slice_line && block<3) {
- // we cant just change some MVs to simulate that as we need them for the B frames (and ME)
- // and if we ever support non rectangular objects than we need to do a few ifs here anyway :(
- if(block==0){ //most common case
- if(s->mb_x == s->resync_mb_x){ //rare
- *px= *py = 0;
- }else if(s->mb_x + 1 == s->resync_mb_x && s->h263_pred){ //rare
- C = s->current_picture.motion_val[0][xy + off[block] - wrap];
- if(s->mb_x==0){
- *px = C[0];
- *py = C[1];
- }else{
- *px = mid_pred(A[0], 0, C[0]);
- *py = mid_pred(A[1], 0, C[1]);
- }
- }else{
- *px = A[0];
- *py = A[1];
- }
- }else if(block==1){
- if(s->mb_x + 1 == s->resync_mb_x && s->h263_pred){ //rare
- C = s->current_picture.motion_val[0][xy + off[block] - wrap];
- *px = mid_pred(A[0], 0, C[0]);
- *py = mid_pred(A[1], 0, C[1]);
- }else{
- *px = A[0];
- *py = A[1];
- }
- }else{ /* block==2*/
- B = s->current_picture.motion_val[0][xy - wrap];
- C = s->current_picture.motion_val[0][xy + off[block] - wrap];
- if(s->mb_x == s->resync_mb_x) //rare
- A[0]=A[1]=0;
-
- *px = mid_pred(A[0], B[0], C[0]);
- *py = mid_pred(A[1], B[1], C[1]);
- }
- } else {
- B = s->current_picture.motion_val[0][xy - wrap];
- C = s->current_picture.motion_val[0][xy + off[block] - wrap];
- *px = mid_pred(A[0], B[0], C[0]);
- *py = mid_pred(A[1], B[1], C[1]);
- }
- return mot_val;
-}
-
-// identical to above but with s->current_picture->motion_val, the above one will be removed, and this renamed to it
-int16_t *h263_pred_motion2(MpegEncContext * s, int block, int dir,
- int *px, int *py)
-{
- int xy, wrap;
+ int wrap;
int16_t *A, *B, *C, (*mot_val)[2];
static const int off[4]= {2, 1, 1, -1};
wrap = s->b8_stride;
- xy = 2*(s->mb_x + s->mb_y * wrap);
-
- mot_val = s->current_picture.motion_val[dir] + xy;
+ mot_val = s->current_picture.motion_val[dir] + s->block_index[block];
A = mot_val[ - 1];
/* special case for first (slice) line */
@@ -1785,7 +1728,7 @@ static void init_mv_penalty_and_fcode(MpegEncContext *s)
else{
int val, bit_size, range, code;
- bit_size = s->f_code - 1;
+ bit_size = f_code - 1;
range = 1 << bit_size;
val=mv;
@@ -2386,6 +2329,7 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
time_div= s->time/s->time_increment_resolution;
time_mod= s->time%s->time_increment_resolution;
time_incr= time_div - s->last_time_base;
+ assert(time_incr >= 0);
while(time_incr--)
put_bits(&s->pb, 1, 1);
@@ -2994,13 +2938,12 @@ void ff_mpeg4_init_partitions(MpegEncContext *s)
uint8_t *start= pbBufPtr(&s->pb);
uint8_t *end= s->pb.buf_end;
int size= end - start;
- int pb_size = size/3;
- int pb2_size= size/3;
- int tex_size= size - pb_size - pb2_size;
+ int pb_size = (((int)start + size/3)&(~3)) - (int)start;
+ int tex_size= (size - 2*pb_size)&(~3);
set_put_bits_buffer_size(&s->pb, pb_size);
init_put_bits(&s->tex_pb, start + pb_size , tex_size);
- init_put_bits(&s->pb2 , start + pb_size + tex_size, pb2_size);
+ init_put_bits(&s->pb2 , start + pb_size + tex_size, pb_size);
}
void ff_mpeg4_merge_partitions(MpegEncContext *s)
@@ -3165,7 +3108,7 @@ static int mpeg4_decode_video_packet_header(MpegEncContext *s)
skip_bits(&s->gb, 3); /* intra dc vlc threshold */
//FIXME dont just ignore everything
if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE){
- mpeg4_decode_sprite_trajectory(s);
+ mpeg4_decode_sprite_trajectory(s, &s->gb);
av_log(s->avctx, AV_LOG_ERROR, "untested\n");
}
@@ -3196,10 +3139,10 @@ void ff_mpeg4_clean_buffers(MpegEncContext *s)
{
int c_wrap, c_xy, l_wrap, l_xy;
- l_wrap= s->block_wrap[0];
- l_xy= s->mb_y*l_wrap*2 + s->mb_x*2;
- c_wrap= s->block_wrap[4];
- c_xy= s->mb_y*c_wrap + s->mb_x;
+ l_wrap= s->b8_stride;
+ l_xy= (2*s->mb_y-1)*l_wrap + s->mb_x*2 - 1;
+ c_wrap= s->mb_stride;
+ c_xy= (s->mb_y-1)*c_wrap + s->mb_x - 1;
#if 0
/* clean DC */
@@ -3372,7 +3315,7 @@ static int mpeg4_decode_partition_a(MpegEncContext *s){
}else{ /* P/S_TYPE */
int mx, my, pred_x, pred_y, bits;
int16_t * const mot_val= s->current_picture.motion_val[0][s->block_index[0]];
- const int stride= s->block_wrap[0]*2;
+ const int stride= s->b8_stride*2;
try_again:
bits= show_bits(&s->gb, 17);
@@ -3430,7 +3373,7 @@ try_again:
if ((cbpc & 16) == 0) {
/* 16x16 motion prediction */
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
if(!s->mcsel){
mx = h263_decode_motion(s, pred_x, s->f_code);
if (mx >= 0xffff)
@@ -3454,7 +3397,7 @@ try_again:
int i;
s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0;
for(i=0;i<4;i++) {
- int16_t *mot_val= h263_pred_motion(s, i, &pred_x, &pred_y);
+ int16_t *mot_val= h263_pred_motion(s, i, 0, &pred_x, &pred_y);
mx = h263_decode_motion(s, pred_x, s->f_code);
if (mx >= 0xffff)
return -1;
@@ -3708,7 +3651,7 @@ static void preview_obmc(MpegEncContext *s){
int cbpc, i, pred_x, pred_y, mx, my;
int16_t *mot_val;
const int xy= s->mb_x + 1 + s->mb_y * s->mb_stride;
- const int stride= s->block_wrap[0]*2;
+ const int stride= s->b8_stride*2;
for(i=0; i<4; i++)
s->block_index[i]+= 2;
@@ -3748,7 +3691,7 @@ static void preview_obmc(MpegEncContext *s){
if ((cbpc & 16) == 0) {
s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0;
/* 16x16 motion prediction */
- mot_val= h263_pred_motion(s, 0, &pred_x, &pred_y);
+ mot_val= h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
if (s->umvplus)
mx = h263p_decode_umotion(s, pred_x);
else
@@ -3766,7 +3709,7 @@ static void preview_obmc(MpegEncContext *s){
} else {
s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0;
for(i=0;i<4;i++) {
- mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
+ mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y);
if (s->umvplus)
mx = h263p_decode_umotion(s, pred_x);
else
@@ -3858,7 +3801,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0;
/* 16x16 motion prediction */
s->mv_type = MV_TYPE_16X16;
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
if (s->umvplus)
mx = h263p_decode_umotion(s, pred_x);
else
@@ -3883,7 +3826,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0;
s->mv_type = MV_TYPE_8X8;
for(i=0;i<4;i++) {
- mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
+ mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y);
if (s->umvplus)
mx = h263p_decode_umotion(s, pred_x);
else
@@ -3977,7 +3920,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
//FIXME UMV
if(USES_LIST(mb_type, 0)){
- int16_t *mot_val= h263_pred_motion2(s, 0, 0, &mx, &my);
+ int16_t *mot_val= h263_pred_motion(s, 0, 0, &mx, &my);
s->mv_dir = MV_DIR_FORWARD;
mx = h263_decode_motion(s, mx, 1);
@@ -3990,7 +3933,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
}
if(USES_LIST(mb_type, 1)){
- int16_t *mot_val= h263_pred_motion2(s, 0, 1, &mx, &my);
+ int16_t *mot_val= h263_pred_motion(s, 0, 1, &mx, &my);
s->mv_dir |= MV_DIR_BACKWARD;
mx = h263_decode_motion(s, mx, 1);
@@ -4145,7 +4088,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s,
s->field_select[0][0]= get_bits1(&s->gb);
s->field_select[0][1]= get_bits1(&s->gb);
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
for(i=0; i<2; i++){
mx = h263_decode_motion(s, pred_x, s->f_code);
@@ -4163,7 +4106,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s,
s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0;
/* 16x16 motion prediction */
s->mv_type = MV_TYPE_16X16;
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
mx = h263_decode_motion(s, pred_x, s->f_code);
if (mx >= 0xffff)
@@ -4180,7 +4123,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s,
s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0;
s->mv_type = MV_TYPE_8X8;
for(i=0;i<4;i++) {
- mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
+ mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y);
mx = h263_decode_motion(s, pred_x, s->f_code);
if (mx >= 0xffff)
return -1;
@@ -5117,11 +5060,15 @@ int h263_decode_picture_header(MpegEncContext *s)
s->qscale = get_bits(&s->gb, 5);
}
+ s->mb_width = (s->width + 15) / 16;
+ s->mb_height = (s->height + 15) / 16;
+ s->mb_num = s->mb_width * s->mb_height;
+
/* PEI */
while (get_bits1(&s->gb) != 0) {
skip_bits(&s->gb, 8);
}
-
+
if(s->h263_slice_structured){
if (get_bits1(&s->gb) != 1) {
av_log(s->avctx, AV_LOG_ERROR, "SEPB1 marker missing\n");
@@ -5181,7 +5128,7 @@ int h263_decode_picture_header(MpegEncContext *s)
return 0;
}
-static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
+static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb)
{
int i;
int a= 2<<s->sprite_warping_accuracy;
@@ -5201,17 +5148,17 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
int length;
int x=0, y=0;
- length= get_vlc(&s->gb, &sprite_trajectory);
+ length= get_vlc(gb, &sprite_trajectory);
if(length){
- x= get_xbits(&s->gb, length);
+ x= get_xbits(gb, length);
}
- if(!(s->divx_version==500 && s->divx_build==413)) skip_bits1(&s->gb); /* marker bit */
+ if(!(s->divx_version==500 && s->divx_build==413)) skip_bits1(gb); /* marker bit */
- length= get_vlc(&s->gb, &sprite_trajectory);
+ length= get_vlc(gb, &sprite_trajectory);
if(length){
- y=get_xbits(&s->gb, length);
+ y=get_xbits(gb, length);
}
- skip_bits1(&s->gb); /* marker bit */
+ skip_bits1(gb); /* marker bit */
//printf("%d %d %d %d\n", x, y, i, s->sprite_warping_accuracy);
d[i][0]= x;
d[i][1]= y;
@@ -5840,7 +5787,7 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
}
if(s->pict_type == S_TYPE && (s->vol_sprite_usage==STATIC_SPRITE || s->vol_sprite_usage==GMC_SPRITE)){
- mpeg4_decode_sprite_trajectory(s);
+ mpeg4_decode_sprite_trajectory(s, gb);
if(s->sprite_brightness_change) av_log(s->avctx, AV_LOG_ERROR, "sprite_brightness_change not supported\n");
if(s->vol_sprite_usage==STATIC_SPRITE) av_log(s->avctx, AV_LOG_ERROR, "static sprite not supported\n");
}
@@ -6111,8 +6058,10 @@ int flv_h263_decode_picture_header(MpegEncContext *s)
s->height = height;
s->pict_type = I_TYPE + get_bits(&s->gb, 2);
- if (s->pict_type > P_TYPE)
+ s->dropable= s->pict_type > P_TYPE;
+ if (s->dropable)
s->pict_type = P_TYPE;
+
skip_bits1(&s->gb); /* deblocking flag */
s->chroma_qscale= s->qscale = get_bits(&s->gb, 5);
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 88db359fe..aaf38b172 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -42,8 +42,8 @@ int ff_h263_decode_init(AVCodecContext *avctx)
s->workaround_bugs= avctx->workaround_bugs;
// set defaults
+ MPV_decode_defaults(s);
s->quant_precision=5;
- s->progressive_sequence=1;
s->decode_mb= ff_h263_decode_mb;
s->low_delay= 1;
avctx->pix_fmt= PIX_FMT_YUV420P;
@@ -551,6 +551,8 @@ retry:
s->workaround_bugs|= FF_BUG_EDGE;
}
+ if(s->divx_version)
+ s->workaround_bugs|= FF_BUG_HPEL_CHROMA;
#if 0
if(s->divx_version==500)
s->padding_bug_score= 256*256*256*64;
@@ -714,7 +716,8 @@ assert(s->current_picture.pict_type == s->pict_type);
ff_print_debug_info(s, pict);
} else {
*pict= *(AVFrame*)&s->last_picture;
- ff_print_debug_info(s, pict);
+ if(pict)
+ ff_print_debug_info(s, pict);
}
/* Return the Picture timestamp as the frame number */
diff --git a/src/libffmpeg/libavcodec/h264.c b/src/libffmpeg/libavcodec/h264.c
index 3f60e35e8..fa254e93b 100644
--- a/src/libffmpeg/libavcodec/h264.c
+++ b/src/libffmpeg/libavcodec/h264.c
@@ -31,6 +31,8 @@
#include "h264data.h"
#include "golomb.h"
+#include "cabac.h"
+
#undef NDEBUG
#include <assert.h>
@@ -162,6 +164,8 @@ typedef struct H264Context{
unsigned int top_samples_available;
unsigned int topright_samples_available;
unsigned int left_samples_available;
+ uint8_t (*top_border)[16+2*8];
+ uint8_t left_border[17+2*9];
/**
* non zero coeff count cache.
@@ -248,9 +252,9 @@ typedef struct H264Context{
int chroma_offset[2][16][2];
//deblock
- int disable_deblocking_filter_idc;
- int slice_alpha_c0_offset_div2;
- int slice_beta_offset_div2;
+ int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0
+ int slice_alpha_c0_offset;
+ int slice_beta_offset;
int redundant_pic_count;
@@ -282,6 +286,22 @@ typedef struct H264Context{
GetBitContext *inter_gb_ptr;
DCTELEM mb[16*24] __align8;
+
+ /**
+ * Cabac
+ */
+ CABACContext cabac;
+ uint8_t cabac_state[399];
+ int cabac_init_idc;
+
+ /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
+ uint16_t *cbp_table;
+ /* chroma_pred_mode for i4x4 or i16x16, else 0 */
+ uint8_t *chroma_pred_mode_table;
+ int last_qscale_diff;
+ int16_t (*mvd_table[2])[2];
+ int16_t mvd_cache[2][5*8][2];
+
}H264Context;
static VLC coeff_token_vlc[4];
@@ -295,6 +315,7 @@ static VLC run7_vlc;
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
+static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr);
static inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
@@ -610,9 +631,52 @@ static inline void fill_caches(H264Context *h, int mb_type){
*(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
*(uint32_t*)h->mv_cache [list][scan8[4 ]]=
*(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
+
+ if( h->pps.cabac ) {
+ /* XXX beurk, Load mvd */
+ if(IS_INTER(topleft_type)){
+ const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
+ *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
+ }else{
+ *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
+ }
+
+ if(IS_INTER(top_type)){
+ const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
+ *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
+ *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
+ *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
+ *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
+ }else{
+ *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
+ *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
+ *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
+ *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
+ }
+ if(IS_INTER(left_type[0])){
+ const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
+ *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
+ *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
+ }else{
+ *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
+ *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
+ }
+ if(IS_INTER(left_type[1])){
+ const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
+ *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
+ *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
+ }else{
+ *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
+ *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
+ }
+ *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
+ *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
+ *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
+ *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
+ *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
+ }
}
//FIXME
-
}
#endif
}
@@ -920,6 +984,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
}
+ if( h->pps.cabac ) {
+ /* FIXME needed ? */
+ for(y=0; y<4; y++){
+ *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
+ *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
+ }
+ }
for(y=0; y<2; y++){
*(uint16_t*)s->current_picture.motion_val[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
}
@@ -931,6 +1002,12 @@ static inline void write_back_motion(H264Context *h, int mb_type){
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
*(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
}
+ if( h->pps.cabac ) {
+ for(y=0; y<4; y++){
+ *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
+ *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
+ }
+ }
for(y=0; y<2; y++){
s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
@@ -988,6 +1065,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c
dst[di++]= 0;
dst[di++]= 0;
si+=3;
+ continue;
}else //next start code
break;
}
@@ -1001,6 +1079,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c
return dst;
}
+#if 0
/**
* @param src the data which should be escaped
* @param dst the target buffer, dst+1 == src is allowed as a special case
@@ -1073,6 +1152,7 @@ static void encode_rbsp_trailing(PutBitContext *pb){
length= (-put_bits_count(pb))&7;
if(length) put_bits(pb, length, 0);
}
+#endif
/**
* identifies the exact end of the bitstream
@@ -1132,6 +1212,7 @@ static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
}
}
+#if 0
/**
* dct tranforms the 16 dc values.
* @param qp quantization parameter ??? FIXME
@@ -1169,6 +1250,8 @@ static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
block[stride*10+offset]= (z0 - z3)>>1;
}
}
+#endif
+
#undef xStride
#undef stride
@@ -1194,6 +1277,7 @@ static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
}
+#if 0
static void chroma_dc_dct_c(DCTELEM *block){
const int stride= 16*2;
const int xStride= 16;
@@ -1214,6 +1298,7 @@ static void chroma_dc_dct_c(DCTELEM *block){
block[stride*1 + xStride*0]= (a-c);
block[stride*1 + xStride*1]= (e-b);
}
+#endif
/**
* gets the chroma qp.
@@ -1232,18 +1317,6 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
uint8_t *cm = cropTbl + MAX_NEG_CROP;
block[0] += 32;
-#if 1
- for(i=0; i<4; i++){
- const int z0= block[i + 4*0] + block[i + 4*2];
- const int z1= block[i + 4*0] - block[i + 4*2];
- const int z2= (block[i + 4*1]>>1) - block[i + 4*3];
- const int z3= block[i + 4*1] + (block[i + 4*3]>>1);
-
- block[i + 4*0]= z0 + z3;
- block[i + 4*1]= z1 + z2;
- block[i + 4*2]= z1 - z2;
- block[i + 4*3]= z0 - z3;
- }
for(i=0; i<4; i++){
const int z0= block[0 + 4*i] + block[2 + 4*i];
@@ -1251,18 +1324,6 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i];
const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1);
- dst[0 + i*stride]= cm[ dst[0 + i*stride] + ((z0 + z3) >> 6) ];
- dst[1 + i*stride]= cm[ dst[1 + i*stride] + ((z1 + z2) >> 6) ];
- dst[2 + i*stride]= cm[ dst[2 + i*stride] + ((z1 - z2) >> 6) ];
- dst[3 + i*stride]= cm[ dst[3 + i*stride] + ((z0 - z3) >> 6) ];
- }
-#else
- for(i=0; i<4; i++){
- const int z0= block[0 + 4*i] + block[2 + 4*i];
- const int z1= block[0 + 4*i] - block[2 + 4*i];
- const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i];
- const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1);
-
block[0 + 4*i]= z0 + z3;
block[1 + 4*i]= z1 + z2;
block[2 + 4*i]= z1 - z2;
@@ -1280,9 +1341,9 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
}
-#endif
}
+#if 0
static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
int i;
//FIXME try int temp instead of block
@@ -1315,6 +1376,7 @@ static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int st
block[3*4 + i]= z3 - 2*z2;
}
}
+#endif
//FIXME need to check that this doesnt overflow signed 32 bit for low qp, iam not sure, its very close
//FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
@@ -2106,10 +2168,15 @@ static void init_pred_ptrs(H264Context *h){
static void free_tables(H264Context *h){
av_freep(&h->intra4x4_pred_mode);
+ av_freep(&h->chroma_pred_mode_table);
+ av_freep(&h->cbp_table);
+ av_freep(&h->mvd_table[0]);
+ av_freep(&h->mvd_table[1]);
av_freep(&h->non_zero_count);
av_freep(&h->slice_table_base);
+ av_freep(&h->top_border);
h->slice_table= NULL;
-
+
av_freep(&h->mb2b_xy);
av_freep(&h->mb2b8_xy);
}
@@ -2124,8 +2191,17 @@ static int alloc_tables(H264Context *h){
int x,y;
CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
+
CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
CHECKED_ALLOCZ(h->slice_table_base , big_mb_num * sizeof(uint8_t))
+ CHECKED_ALLOCZ(h->top_border , s->mb_width * (16+8+8) * sizeof(uint8_t))
+
+ if( h->pps.cabac ) {
+ CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
+ CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
+ CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
+ CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
+ }
memset(h->slice_table_base, -1, big_mb_num * sizeof(uint8_t));
h->slice_table= h->slice_table_base + s->mb_stride + 1;
@@ -2166,6 +2242,8 @@ static int decode_init(AVCodecContext *avctx){
H264Context *h= avctx->priv_data;
MpegEncContext * const s = &h->s;
+ MPV_decode_defaults(s);
+
s->avctx = avctx;
common_init(h);
@@ -2173,7 +2251,6 @@ static int decode_init(AVCodecContext *avctx){
s->workaround_bugs= avctx->workaround_bugs;
// set defaults
- s->progressive_sequence=1;
// s->decode_mb= ff_h263_decode_mb;
s->low_delay= 1;
avctx->pix_fmt= PIX_FMT_YUV420P;
@@ -2205,6 +2282,66 @@ static void frame_start(H264Context *h){
// s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
}
+static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+ MpegEncContext * const s = &h->s;
+ int i;
+
+ src_y -= linesize;
+ src_cb -= uvlinesize;
+ src_cr -= uvlinesize;
+
+ h->left_border[0]= h->top_border[s->mb_x][15];
+ for(i=1; i<17; i++){
+ h->left_border[i]= src_y[15+i* linesize];
+ }
+
+ *(uint64_t*)(h->top_border[s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
+ *(uint64_t*)(h->top_border[s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
+
+ if(!(s->flags&CODEC_FLAG_GRAY)){
+ h->left_border[17 ]= h->top_border[s->mb_x][16+7];
+ h->left_border[17+9]= h->top_border[s->mb_x][24+7];
+ for(i=1; i<9; i++){
+ h->left_border[i+17 ]= src_cb[7+i*uvlinesize];
+ h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
+ }
+ *(uint64_t*)(h->top_border[s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
+ *(uint64_t*)(h->top_border[s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
+ }
+}
+
+static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+ MpegEncContext * const s = &h->s;
+ int temp8, i;
+ uint64_t temp64;
+
+ src_y -= linesize + 1;
+ src_cb -= uvlinesize + 1;
+ src_cr -= uvlinesize + 1;
+
+#define XCHG(a,b,t,xchg)\
+t= a;\
+if(xchg)\
+ a= b;\
+b= t;
+
+ for(i=0; i<17; i++){
+ XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
+ }
+
+ XCHG(*(uint64_t*)(h->top_border[s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
+ XCHG(*(uint64_t*)(h->top_border[s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
+
+ if(!(s->flags&CODEC_FLAG_GRAY)){
+ for(i=0; i<9; i++){
+ XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg);
+ XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
+ }
+ XCHG(*(uint64_t*)(h->top_border[s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
+ XCHG(*(uint64_t*)(h->top_border[s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
+ }
+}
+
static void hl_decode_mb(H264Context *h){
MpegEncContext * const s = &h->s;
const int mb_x= s->mb_x;
@@ -2240,6 +2377,9 @@ static void hl_decode_mb(H264Context *h){
}
if(IS_INTRA(mb_type)){
+ if(h->deblocking_filter)
+ xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+
if(!(s->flags&CODEC_FLAG_GRAY)){
h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
@@ -2257,6 +2397,9 @@ static void hl_decode_mb(H264Context *h){
if(!topright_avail){
tr= ptr[3 - linesize]*0x01010101;
topright= (uint8_t*) &tr;
+ }else if(i==5 && h->deblocking_filter){
+ tr= *(uint32_t*)h->top_border[mb_x+1];
+ topright= (uint8_t*) &tr;
}
h->pred4x4[ dir ](ptr, topright, linesize);
@@ -2275,6 +2418,8 @@ static void hl_decode_mb(H264Context *h){
else
svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
}
+ if(h->deblocking_filter)
+ xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
}else if(s->codec_id == CODEC_ID_H264){
hl_motion(h, dest_y, dest_cb, dest_cr,
s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
@@ -2331,10 +2476,10 @@ static void hl_decode_mb(H264Context *h){
}
}
}
-}
-
-static void decode_mb_cabac(H264Context *h){
-// MpegEncContext * const s = &h->s;
+ if(h->deblocking_filter) {
+ backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+ filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr);
+ }
}
/**
@@ -2979,9 +3124,16 @@ static int decode_slice_header(H264Context *h){
if(s->current_picture.reference)
decode_ref_pic_marking(h);
- //FIXME CABAC stuff
- s->qscale = h->pps.init_qp + get_se_golomb(&s->gb); //slice_qp_delta
+ if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
+ h->cabac_init_idc = get_ue_golomb(&s->gb);
+
+ h->last_qscale_diff = 0;
+ s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
+ if(s->qscale<0 || s->qscale>51){
+ av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
+ return -1;
+ }
//FIXME qscale / qp ... stuff
if(h->slice_type == SP_TYPE){
get_bits1(&s->gb); /* sp_for_switch_flag */
@@ -2990,14 +3142,19 @@ static int decode_slice_header(H264Context *h){
get_se_golomb(&s->gb); /* slice_qs_delta */
}
+ h->deblocking_filter = 1;
+ h->slice_alpha_c0_offset = 0;
+ h->slice_beta_offset = 0;
if( h->pps.deblocking_filter_parameters_present ) {
- h->disable_deblocking_filter_idc= get_ue_golomb(&s->gb);
- if( h->disable_deblocking_filter_idc != 1 ) {
- h->slice_alpha_c0_offset_div2= get_se_golomb(&s->gb);
- h->slice_beta_offset_div2= get_se_golomb(&s->gb);
+ h->deblocking_filter= get_ue_golomb(&s->gb);
+ if(h->deblocking_filter < 2)
+ h->deblocking_filter^= 1; // 1<->0
+
+ if( h->deblocking_filter ) {
+ h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
+ h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
}
- }else
- h->disable_deblocking_filter_idc= 0;
+ }
#if 0 //FMO
if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
@@ -3012,7 +3169,7 @@ static int decode_slice_header(H264Context *h){
s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
h->ref_count[0], h->ref_count[1],
s->qscale,
- h->disable_deblocking_filter_idc
+ h->deblocking_filter
);
}
@@ -3122,7 +3279,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
#else
if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
- ? == prefix > 2 or sth
+ /* ? == prefix > 2 or sth */
#endif
tprintf("level: %d suffix_length:%d\n", level[i], suffix_length);
}
@@ -3186,7 +3343,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
* decodes a macroblock
* @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
*/
-static int decode_mb(H264Context *h){
+static int decode_mb_cavlc(H264Context *h){
MpegEncContext * const s = &h->s;
const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
int mb_type, partition_count, cbp;
@@ -3223,6 +3380,7 @@ static int decode_mb(H264Context *h){
write_back_motion(h, mb_type);
s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
+ s->current_picture.qscale_table[mb_xy]= s->qscale;
h->slice_table[ mb_xy ]= h->slice_num;
h->prev_mb_skiped= 1;
@@ -3303,7 +3461,9 @@ decode_intra_mb:
skip_bits(&s->gb, 384); //FIXME check /fix the bitstream readers
+ //FIXME deblock filter, non_zero_count_cache init ...
memset(h->non_zero_count[mb_xy], 16, 16);
+ s->current_picture.qscale_table[mb_xy]= s->qscale;
return 0;
}
@@ -3607,71 +3767,1470 @@ decode_intra_mb:
nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
}
}else{
- memset(&h->non_zero_count_cache[8], 0, 8*5);
+ uint8_t * const nnz= &h->non_zero_count_cache[0];
+ fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+ nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+ nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
}
+ s->current_picture.qscale_table[mb_xy]= s->qscale;
write_back_non_zero_count(h);
return 0;
}
-static int decode_slice(H264Context *h){
+static int decode_cabac_mb_type( H264Context *h ) {
MpegEncContext * const s = &h->s;
- const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
- s->mb_skip_run= -1;
-
-#if 1
- for(;;){
- int ret= decode_mb(h);
-
- hl_decode_mb(h);
-
- if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ?
- s->mb_y++;
- ret= decode_mb(h);
-
- hl_decode_mb(h);
- s->mb_y--;
+ if( h->slice_type == I_TYPE ) {
+ const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
+ int ctx = 0;
+ int mb_type;
+
+ if( s->mb_x > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-1] ) )
+ ctx++;
+ if( s->mb_y > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-s->mb_stride] ) )
+ ctx++;
+
+ if( get_cabac( &h->cabac, &h->cabac_state[3+ctx] ) == 0 )
+ return 0; /* I4x4 */
+
+ if( get_cabac_terminate( &h->cabac ) )
+ return 25; /* PCM */
+
+ mb_type = 1; /* I16x16 */
+ if( get_cabac( &h->cabac, &h->cabac_state[3+3] ) )
+ mb_type += 12; /* cbp_luma != 0 */
+
+ if( get_cabac( &h->cabac, &h->cabac_state[3+4] ) ) {
+ if( get_cabac( &h->cabac, &h->cabac_state[3+5] ) )
+ mb_type += 4 * 2; /* cbp_chroma == 2 */
+ else
+ mb_type += 4 * 1; /* cbp_chroma == 1 */
+ }
+ if( get_cabac( &h->cabac, &h->cabac_state[3+6] ) )
+ mb_type += 2;
+ if( get_cabac( &h->cabac, &h->cabac_state[3+7] ) )
+ mb_type += 1;
+ return mb_type;
+
+ } else if( h->slice_type == P_TYPE ) {
+ if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
+ /* P-type */
+ if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
+ if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
+ return 0; /* P_L0_D16x16; */
+ else
+ return 3; /* P_8x8; */
+ } else {
+ if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
+ return 2; /* P_L0_D8x16; */
+ else
+ return 1; /* P_L0_D16x8; */
+ }
+ } else {
+ int mb_type;
+ /* I-type */
+ if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
+ return 5+0; /* I_4x4 */
+ if( get_cabac_terminate( &h->cabac ) )
+ return 5+25; /*I_PCM */
+ mb_type = 5+1; /* I16x16 */
+ if( get_cabac( &h->cabac, &h->cabac_state[17+1] ) )
+ mb_type += 12; /* cbp_luma != 0 */
+
+ if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) ) {
+ if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) )
+ mb_type += 4 * 2; /* cbp_chroma == 2 */
+ else
+ mb_type += 4 * 1; /* cbp_chroma == 1 */
+ }
+ if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) )
+ mb_type += 2;
+ if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) )
+ mb_type += 1;
+
+ return mb_type;
}
+ } else {
+ /* TODO do others frames types */
+ return -1;
+ }
+}
- if(ret<0){
- av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
- ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+static int decode_cabac_mb_skip( H264Context *h) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+ const int mba_xy = mb_xy - 1;
+ const int mbb_xy = mb_xy - s->mb_stride;
+ int ctx = 0;
+
+ if( s->mb_x > 0 && !IS_SKIP( s->current_picture.mb_type[mba_xy] ) )
+ ctx++;
+ if( s->mb_y > 0 && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ) )
+ ctx++;
+
+ if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
+ return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
+ else /* B-frame */
+ return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
+}
- return -1;
+static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
+ int mode = 0;
+
+ if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
+ return pred_mode;
+
+ if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
+ mode += 1;
+ if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
+ mode += 2;
+ if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
+ mode += 4;
+ if( mode >= pred_mode )
+ return mode + 1;
+ else
+ return mode;
+}
+
+static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+ const int mba_xy = mb_xy - 1;
+ const int mbb_xy = mb_xy - s->mb_stride;
+
+ int ctx = 0;
+
+ /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
+ if( s->mb_x > 0 && h->chroma_pred_mode_table[mba_xy] != 0 )
+ ctx++;
+
+ if( s->mb_y > 0 && h->chroma_pred_mode_table[mbb_xy] != 0 )
+ ctx++;
+
+ if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
+ return 0;
+
+ if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
+ return 1;
+ if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
+ return 2;
+ else
+ return 3;
+}
+
+static const uint8_t block_idx_x[16] = {
+ 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] = {
+ 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] = {
+ { 0, 2, 8, 10},
+ { 1, 3, 9, 11},
+ { 4, 6, 12, 14},
+ { 5, 7, 13, 15}
+};
+
+static int decode_cabac_mb_cbp_luma( H264Context *h) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+
+ int cbp = 0;
+ int i8x8;
+
+ h->cbp_table[mb_xy] = 0; /* FIXME aaahahahah beurk */
+
+ for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+ int mba_xy = -1;
+ int mbb_xy = -1;
+ int x, y;
+ int ctx = 0;
+
+ x = block_idx_x[4*i8x8];
+ y = block_idx_y[4*i8x8];
+
+ if( x > 0 )
+ mba_xy = mb_xy;
+ else if( s->mb_x > 0 )
+ mba_xy = mb_xy - 1;
+
+ if( y > 0 )
+ mbb_xy = mb_xy;
+ else if( s->mb_y > 0 )
+ mbb_xy = mb_xy - s->mb_stride;
+
+ /* No need to test for skip as we put 0 for skip block */
+ if( mba_xy >= 0 ) {
+ int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+ if( ((h->cbp_table[mba_xy] >> i8x8a)&0x01) == 0 )
+ ctx++;
}
-
- if(++s->mb_x >= s->mb_width){
- s->mb_x=0;
- ff_draw_horiz_band(s, 16*s->mb_y, 16);
- if(++s->mb_y >= s->mb_height){
- tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
- if(get_bits_count(&s->gb) == s->gb.size_in_bits){
- ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+ if( mbb_xy >= 0 ) {
+ int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+ if( ((h->cbp_table[mbb_xy] >> i8x8b)&0x01) == 0 )
+ ctx += 2;
+ }
- return 0;
+ if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
+ cbp |= 1 << i8x8;
+ h->cbp_table[mb_xy] = cbp; /* FIXME aaahahahah beurk */
+ }
+ }
+ return cbp;
+}
+static int decode_cabac_mb_cbp_chroma( H264Context *h) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+ int ctx;
+ int cbp_a, cbp_b;
+
+ /* No need to test for skip */
+ if( s->mb_x > 0 )
+ cbp_a = (h->cbp_table[mb_xy-1]>>4)&0x03;
+ else
+ cbp_a = -1;
+
+ if( s->mb_y > 0 )
+ cbp_b = (h->cbp_table[mb_xy-s->mb_stride]>>4)&0x03;
+ else
+ cbp_b = -1;
+
+ ctx = 0;
+ if( cbp_a > 0 ) ctx++;
+ if( cbp_b > 0 ) ctx += 2;
+ if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
+ return 0;
+
+ ctx = 4;
+ if( cbp_a == 2 ) ctx++;
+ if( cbp_b == 2 ) ctx += 2;
+ if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) )
+ return 2;
+ else
+ return 1;
+}
+static int decode_cabac_mb_dqp( H264Context *h) {
+ MpegEncContext * const s = &h->s;
+ int mbn_xy;
+ int ctx = 0;
+ int val = 0;
+
+ if( s->mb_x > 0 )
+ mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
+ else
+ mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
+
+ if( mbn_xy >= 0 && h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
+ ctx++;
+
+ while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
+ if( ctx < 2 )
+ ctx = 2;
+ else
+ ctx = 3;
+ val++;
+ }
+
+ if( val&0x01 )
+ return (val + 1)/2;
+ else
+ return -(val + 1)/2;
+}
+static int decode_cabac_mb_sub_type( H264Context *h ) {
+ if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
+ return 0; /* 8x8 */
+ if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
+ return 1; /* 8x4 */
+ if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
+ return 2; /* 4x8 */
+ return 3; /* 4x4 */
+}
+
+static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
+ int refa = h->ref_cache[list][scan8[n] - 1];
+ int refb = h->ref_cache[list][scan8[n] - 8];
+ int ref = 0;
+ int ctx = 0;
+
+ if( refa > 0 )
+ ctx++;
+ if( refb > 0 )
+ ctx += 2;
+
+ while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
+ ref++;
+ if( ctx < 4 )
+ ctx = 4;
+ else
+ ctx = 5;
+ }
+ return ref;
+}
+
+static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
+ int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
+ abs( h->mvd_cache[list][scan8[n] - 8][l] );
+ int ctxbase = (l == 0) ? 40 : 47;
+ int ctx;
+ int mvd = 0;
+
+ if( amvd < 3 )
+ ctx = 0;
+ else if( amvd > 32 )
+ ctx = 2;
+ else
+ ctx = 1;
+
+ while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
+ mvd++;
+ if( ctx < 3 )
+ ctx = 3;
+ else if( ctx < 6 )
+ ctx++;
+ }
+
+ if( mvd >= 9 ) {
+ int k = 3;
+ while( get_cabac_bypass( &h->cabac ) ) {
+ mvd += 1 << k;
+ k++;
+ }
+ while( k-- ) {
+ if( get_cabac_bypass( &h->cabac ) )
+ mvd += 1 << k;
+ }
+ }
+ if( mvd != 0 && get_cabac_bypass( &h->cabac ) )
+ return -mvd;
+ return mvd;
+}
+
+
+static int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+ int mba_xy = -1;
+ int mbb_xy = -1;
+
+ int nza = -1;
+ int nzb = -1;
+ int ctx = 0;
+
+ if( cat == 0 ) {
+ if( s->mb_x > 0 ) {
+ mba_xy = mb_xy - 1;
+ if( IS_INTRA16x16(s->current_picture.mb_type[mba_xy] ) )
+ nza = h->cbp_table[mba_xy]&0x100;
+ }
+ if( s->mb_y > 0 ) {
+ mbb_xy = mb_xy - s->mb_stride;
+ if( IS_INTRA16x16(s->current_picture.mb_type[mbb_xy] ) )
+ nzb = h->cbp_table[mbb_xy]&0x100;
+ }
+ } else if( cat == 1 || cat == 2 ) {
+ int i8x8a, i8x8b;
+ int x, y;
+
+ x = block_idx_x[idx];
+ y = block_idx_y[idx];
+
+ if( x > 0 )
+ mba_xy = mb_xy;
+ else if( s->mb_x > 0 )
+ mba_xy = mb_xy - 1;
+
+ if( y > 0 )
+ mbb_xy = mb_xy;
+ else if( s->mb_y > 0 )
+ mbb_xy = mb_xy - s->mb_stride;
+
+ /* No need to test for skip */
+ if( mba_xy >= 0 ) {
+ i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+
+ if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
+ ((h->cbp_table[mba_xy]&0x0f)>>i8x8a))
+ nza = h->non_zero_count_cache[scan8[idx] - 1];
+ }
+
+ if( mbb_xy >= 0 ) {
+ i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+ if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
+ ((h->cbp_table[mbb_xy]&0x0f)>>i8x8b))
+ nzb = h->non_zero_count_cache[scan8[idx] - 8];
+ }
+ } else if( cat == 3 ) {
+ if( s->mb_x > 0 ) {
+ mba_xy = mb_xy - 1;
+
+ if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
+ (h->cbp_table[mba_xy]&0x30) )
+ nza = (h->cbp_table[mba_xy]>>(6+idx))&0x01;
+ }
+ if( s->mb_y > 0 ) {
+ mbb_xy = mb_xy - s->mb_stride;
+
+ if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
+ (h->cbp_table[mbb_xy]&0x30) )
+ nzb = (h->cbp_table[mbb_xy]>>(6+idx))&0x01;
+ }
+ } else if( cat == 4 ) {
+ int idxc = idx % 4 ;
+ if( idxc == 1 || idxc == 3 )
+ mba_xy = mb_xy;
+ else if( s->mb_x > 0 )
+ mba_xy = mb_xy -1;
+
+ if( idxc == 2 || idxc == 3 )
+ mbb_xy = mb_xy;
+ else if( s->mb_y > 0 )
+ mbb_xy = mb_xy - s->mb_stride;
+
+ if( mba_xy >= 0 &&
+ !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
+ (h->cbp_table[mba_xy]&0x30) == 0x20 )
+ nza = h->non_zero_count_cache[scan8[16+idx] - 1];
+
+ if( mbb_xy >= 0 &&
+ !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
+ (h->cbp_table[mbb_xy]&0x30) == 0x20 )
+ nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
+ }
+
+ if( ( mba_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) ||
+ ( mba_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) ) ||
+ nza > 0 )
+ ctx++;
+
+ if( ( mbb_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) ||
+ ( mbb_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) ) ||
+ nzb > 0 )
+ ctx += 2;
+
+ return ctx + 4 * cat;
+}
+
+static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) {
+ const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
+ const uint16_t *qmul= dequant_coeff[qp];
+ static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+ static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+ static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
+
+ int coeff[16];
+
+ int last = 0;
+ int coeff_count = 0;
+ int nz[16] = {0};
+ int i;
+
+ int abslevel1 = 0;
+ int abslevelgt1 = 0;
+
+ /* cat: 0-> DC 16x16 n = 0
+ * 1-> AC 16x16 n = luma4x4idx
+ * 2-> Luma4x4 n = luma4x4idx
+ * 3-> DC Chroma n = iCbCr
+ * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
+ */
+
+ /* read coded block flag */
+ if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
+ if( cat == 1 || cat == 2 )
+ h->non_zero_count_cache[scan8[n]] = 0;
+ else if( cat == 4 )
+ h->non_zero_count_cache[scan8[16+n]] = 0;
+
+ return 0;
+ }
+
+ while( last < max_coeff - 1 ) {
+ int ctx = FFMIN( last, max_coeff - 2 );
+
+ if( get_cabac( &h->cabac, &h->cabac_state[105+significant_coeff_flag_offset[cat]+ctx] ) == 0 ) {
+ nz[last++] = 0;
+ }
+ else {
+ nz[last++] = 1;
+ coeff_count++;
+ if( get_cabac( &h->cabac, &h->cabac_state[166+last_significant_coeff_flag_offset[cat]+ctx] ) ) {
+ while( last < max_coeff ) {
+ nz[last++] = 0;
+ }
+ break;
+ }
+ }
+ }
+ if( last == max_coeff -1 ) {
+ nz[last++] = 1;
+ coeff_count++;
+ }
+
+ if( cat == 0 && coeff_count > 0 )
+ h->cbp_table[mb_xy] |= 0x100;
+ else if( cat == 1 || cat == 2 )
+ h->non_zero_count_cache[scan8[n]] = coeff_count;
+ else if( cat == 3 && coeff_count > 0 )
+ h->cbp_table[mb_xy] |= 0x40 << n;
+ else if( cat == 4 )
+ h->non_zero_count_cache[scan8[16+n]] = coeff_count;
+
+ for( i = coeff_count - 1; i >= 0; i-- ) {
+ int coeff_abs_m1;
+
+ int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 + 1 )) + coeff_abs_level_m1_offset[cat];
+
+ if( get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) == 0 ) {
+ coeff_abs_m1 = 0;
+ } else {
+ coeff_abs_m1 = 1;
+ ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat];
+ while( coeff_abs_m1 < 14 && get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) ) {
+ coeff_abs_m1++;
+ }
+ }
+
+ if( coeff_abs_m1 >= 14 ) {
+ int j = 0;
+ while( get_cabac_bypass( &h->cabac ) ) {
+ coeff_abs_m1 += 1 << j;
+ j++;
+ }
+
+ while( j-- ) {
+ if( get_cabac_bypass( &h->cabac ) )
+ coeff_abs_m1 += 1 << j ;
+ }
+ }
+ if( get_cabac_bypass( &h->cabac ) )
+ coeff[i] = -1 *( coeff_abs_m1 + 1 );
+ else
+ coeff[i] = coeff_abs_m1 + 1;
+
+ if( coeff_abs_m1 == 0 )
+ abslevel1++;
+ else
+ abslevelgt1++;
+ }
+
+ if( cat == 0 || cat == 3 ) { /* DC */
+ int j;
+ for( i = 0, j = 0; j < coeff_count; i++ ) {
+ if( nz[i] ) {
+ block[scantable[i]] = coeff[j];
+
+ j++;
+ }
+ }
+
+ } else { /* AC */
+ int j;
+ for( i = 0, j = 0; j < coeff_count; i++ ) {
+ if( nz[i] ) {
+ block[scantable[i]] = coeff[j] * qmul[scantable[i]];
+
+ j++;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * decodes a macroblock
+ * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
+ */
+static int decode_mb_cabac(H264Context *h) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
+ int mb_type, partition_count, cbp = 0;
+
+ s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?)
+
+ if( h->slice_type == B_TYPE ) {
+ av_log( h->s.avctx, AV_LOG_ERROR, "B-frame not supported with CABAC\n" );
+ return -1;
+ }
+ if( h->sps.mb_aff ) {
+ av_log( h->s.avctx, AV_LOG_ERROR, "Fields not supported with CABAC\n" );
+ return -1;
+ }
+
+ if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
+ /* read skip flags */
+ if( decode_cabac_mb_skip( h ) ) {
+ int mx, my;
+
+ /* skip mb */
+ mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
+
+ memset(h->non_zero_count[mb_xy], 0, 16);
+ memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+#if 0
+ if(h->sps.mb_aff && s->mb_skip_run==0 && (s->mb_y&1)==0){
+ h->mb_field_decoding_flag= get_bits1(&s->gb);
+ }
+ if(h->mb_field_decoding_flag)
+ mb_type|= MB_TYPE_INTERLACED;
+#endif
+
+ fill_caches(h, mb_type); //FIXME check what is needed and what not ...
+ pred_pskip_motion(h, &mx, &my);
+ fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+ fill_rectangle( h->mvd_cache[0][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
+ fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
+ write_back_motion(h, mb_type);
+
+ s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
+ s->current_picture.qscale_table[mb_xy]= s->qscale;
+ h->slice_table[ mb_xy ]= h->slice_num;
+ h->cbp_table[mb_xy] = 0;
+ h->chroma_pred_mode_table[mb_xy] = 0;
+ h->last_qscale_diff = 0;
+
+ h->prev_mb_skiped= 1;
+
+ return 0;
+
+ }
+ }
+ h->prev_mb_skiped = 0;
+
+ if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
+ av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
+ return -1;
+ }
+
+ if( h->slice_type == P_TYPE ) {
+ if( mb_type < 5) {
+ partition_count= p_mb_type_info[mb_type].partition_count;
+ mb_type= p_mb_type_info[mb_type].type;
+ } else {
+ mb_type -= 5;
+ goto decode_intra_mb;
+ }
+ } else {
+ assert(h->slice_type == I_TYPE);
+decode_intra_mb:
+ partition_count = 0;
+ cbp= i_mb_type_info[mb_type].cbp;
+ h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
+ mb_type= i_mb_type_info[mb_type].type;
+ }
+#if 0
+ if(h->mb_field_decoding_flag)
+ mb_type |= MB_TYPE_INTERLACED;
+#endif
+
+ s->current_picture.mb_type[mb_xy]= mb_type;
+ h->slice_table[ mb_xy ]= h->slice_num;
+
+ if(IS_INTRA_PCM(mb_type)) {
+ /* TODO */
+ h->cbp_table[mb_xy] = 0xf +4*2;
+ h->chroma_pred_mode_table[mb_xy] = 0;
+ s->current_picture.qscale_table[mb_xy]= s->qscale;
+ return -1;
+ }
+
+ fill_caches(h, mb_type);
+
+ if( IS_INTRA( mb_type ) ) {
+ if( IS_INTRA4x4( mb_type ) ) {
+ int i;
+ for( i = 0; i < 16; i++ ) {
+ int pred = pred_intra_mode( h, i );
+ h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
+
+ //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
+ }
+ write_back_intra_pred_mode(h);
+ if( check_intra4x4_pred_mode(h) < 0 ) return -1;
+ } else {
+ h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
+ if( h->intra16x16_pred_mode < 0 ) return -1;
+ }
+ h->chroma_pred_mode_table[mb_xy] =
+ h->chroma_pred_mode = decode_cabac_mb_chroma_pre_mode( h );
+
+ h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
+ if( h->chroma_pred_mode < 0 ) return -1;
+ } else if( partition_count == 4 ) {
+ int i, j, sub_partition_count[4], list, ref[2][4];
+
+ /* Only P-frame */
+ for( i = 0; i < 4; i++ ) {
+ h->sub_mb_type[i] = decode_cabac_mb_sub_type( h );
+ sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
+ h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+ }
+
+ for( list = 0; list < 2; list++ ) {
+ if( h->ref_count[list] > 0 ) {
+ for( i = 0; i < 4; i++ ) {
+ if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+ if( h->ref_count[list] > 1 )
+ ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
+ else
+ ref[list][i] = 0;
+ } else {
+ ref[list][i] = -1;
+ }
+ h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
+ h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
+ }
+ }
+ }
+
+ for(list=0; list<2; list++){
+
+ for(i=0; i<4; i++){
+ //h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
+ //h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
+
+ if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+ const int sub_mb_type= h->sub_mb_type[i];
+ const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
+ for(j=0; j<sub_partition_count[i]; j++){
+ int mpx, mpy;
+ int mx, my;
+ const int index= 4*i + block_width*j;
+ int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
+ int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
+ pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
+
+ mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
+ my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
+ tprintf("final mv:%d %d\n", mx, my);
+
+ if(IS_SUB_8X8(sub_mb_type)){
+ mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
+ mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
+ mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
+ mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
+
+ mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
+ mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
+ mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
+ mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
+ }else if(IS_SUB_8X4(sub_mb_type)){
+ mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
+ mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
+
+ mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
+ mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
+ }else if(IS_SUB_4X8(sub_mb_type)){
+ mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
+ mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
+
+ mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
+ mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
+ }else{
+ assert(IS_SUB_4X4(sub_mb_type));
+ mv_cache[ 0 ][0]= mx;
+ mv_cache[ 0 ][1]= my;
+
+ mvd_cache[ 0 ][0]= mx - mpx;
+ mvd_cache[ 0 ][1]= my - mpy;
+ }
+ }
}else{
- ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+ uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
+ uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
+ p[0] = p[1] = p[8] = p[9] = 0;
+ pd[0]= pd[1]= pd[8]= pd[9]= 0;
+ }
+ }
+ }
+ } else if( !IS_DIRECT(mb_type) ) {
+ int list, mx, my, i, mpx, mpy;
+ if(IS_16X16(mb_type)){
+ for(list=0; list<2; list++){
+ if(IS_DIR(mb_type, 0, list)){
+ if(h->ref_count[list] > 0 ){
+ const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
+ fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
+ }
+ }
+ }
+ for(list=0; list<2; list++){
+ if(IS_DIR(mb_type, 0, list)){
+ pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
+
+ mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
+ my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
+ tprintf("final mv:%d %d\n", mx, my);
+
+ fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
+ fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
+ }
+ }
+ }
+ else if(IS_16X8(mb_type)){
+ for(list=0; list<2; list++){
+ if(h->ref_count[list]>0){
+ for(i=0; i<2; i++){
+ if(IS_DIR(mb_type, i, list)){
+ const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
+ fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
+ }
+ }
+ }
+ }
+ for(list=0; list<2; list++){
+ for(i=0; i<2; i++){
+ if(IS_DIR(mb_type, i, list)){
+ pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
+ mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
+ my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
+ tprintf("final mv:%d %d\n", mx, my);
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
+ fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
+ }
+ }
+ }
+ }else{
+ assert(IS_8X16(mb_type));
+ for(list=0; list<2; list++){
+ if(h->ref_count[list]>0){
+ for(i=0; i<2; i++){
+ if(IS_DIR(mb_type, i, list)){ //FIXME optimize
+ const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
+ fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
+ }
+ }
+ }
+ }
+ for(list=0; list<2; list++){
+ for(i=0; i<2; i++){
+ if(IS_DIR(mb_type, i, list)){
+ pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
+ mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
+ my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
+
+ tprintf("final mv:%d %d\n", mx, my);
+ fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
+ fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
+ }
+ }
+ }
+ }
+ }
+
+ if( IS_INTER( mb_type ) ) {
+ h->chroma_pred_mode_table[mb_xy] = 0;
+ write_back_motion( h, mb_type );
+ }
+
+ if( !IS_INTRA16x16( mb_type ) ) {
+ cbp = decode_cabac_mb_cbp_luma( h );
+ cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
+ }
+
+ h->cbp_table[mb_xy] = cbp;
+
+ if( cbp || IS_INTRA16x16( mb_type ) ) {
+ const uint8_t *scan, *dc_scan;
+ int dqp;
+
+ if(IS_INTERLACED(mb_type)){
+ scan= field_scan;
+ dc_scan= luma_dc_field_scan;
+ }else{
+ scan= zigzag_scan;
+ dc_scan= luma_dc_zigzag_scan;
+ }
+
+ h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
+ s->qscale += dqp;
+ if(((unsigned)s->qscale) > 51){
+ if(s->qscale<0) s->qscale+= 52;
+ else s->qscale-= 52;
+ }
+ h->chroma_qp = get_chroma_qp(h, s->qscale);
+
+ if( IS_INTRA16x16( mb_type ) ) {
+ int i;
+ //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
+ if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, s->qscale, 16) < 0)
+ return -1;
+ if( cbp&15 ) {
+ for( i = 0; i < 16; i++ ) {
+ //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
+ if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, s->qscale, 15) < 0 )
+ return -1;
+ }
+ } else {
+ fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+ }
+ } else {
+ int i8x8, i4x4;
+ for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+ if( cbp & (1<<i8x8) ) {
+ for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
+ const int index = 4*i8x8 + i4x4;
+ //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
+ if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, s->qscale, 16) < 0 )
+ return -1;
+ }
+ } else {
+ uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+ nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+ }
+ }
+ }
+
+ if( cbp&0x30 ){
+ int c;
+ for( c = 0; c < 2; c++ ) {
+ //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+ if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->chroma_qp, 4) < 0)
return -1;
+ }
+ }
+
+ if( cbp&0x20 ) {
+ int c, i;
+ for( c = 0; c < 2; c++ ) {
+ for( i = 0; i < 4; i++ ) {
+ const int index = 16 + 4 * c + i;
+ //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+ if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->chroma_qp, 15) < 0)
+ return -1;
}
}
+ } else {
+ uint8_t * const nnz= &h->non_zero_count_cache[0];
+ nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+ nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
}
-
- if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
- if(get_bits_count(&s->gb) == s->gb.size_in_bits){
- ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+ } else {
+ memset( &h->non_zero_count_cache[8], 0, 8*5 );
+ }
+
+ s->current_picture.qscale_table[mb_xy]= s->qscale;
+ write_back_non_zero_count(h);
+
+ return 0;
+}
+
+
+static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
+ int i, d;
+ const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
+ const int alpha = alpha_table[index_a];
+ const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+
+ for( i = 0; i < 4; i++ ) {
+ if( bS[i] == 0 ) {
+ pix += 4 * stride;
+ continue;
+ }
+
+ if( bS[i] < 4 ) {
+ const int tc0 = tc0_table[index_a][bS[i] - 1];
+ /* 4px edge length */
+ for( d = 0; d < 4; d++ ) {
+ const int p0 = pix[-1];
+ const int p1 = pix[-2];
+ const int p2 = pix[-3];
+ const int q0 = pix[0];
+ const int q1 = pix[1];
+ const int q2 = pix[2];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+ int tc = tc0;
+ int i_delta;
+
+ if( ABS( p2 - p0 ) < beta ) {
+ pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+ tc++;
+ }
+ if( ABS( q2 - q0 ) < beta ) {
+ pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+ tc++;
+ }
+
+ i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
+ pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
+ }
+ pix += stride;
+ }
+ }else{
+ /* 4px edge length */
+ for( d = 0; d < 4; d++ ) {
+ const int p0 = pix[-1];
+ const int p1 = pix[-2];
+ const int p2 = pix[-3];
+
+ const int q0 = pix[0];
+ const int q1 = pix[1];
+ const int q2 = pix[2];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+
+ if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
+ if( ABS( p2 - p0 ) < beta)
+ {
+ const int p3 = pix[-4];
+ /* p0', p1', p2' */
+ pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+ pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+ pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+ } else {
+ /* p0' */
+ pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ }
+ if( ABS( q2 - q0 ) < beta)
+ {
+ const int q3 = pix[3];
+ /* q0', q1', q2' */
+ pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+ pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+ pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+ } else {
+ /* q0' */
+ pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ }else{
+ /* p0', q0' */
+ pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ }
+ pix += stride;
+ }
+ }
+ }
+}
+static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
+ int i, d;
+ const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
+ const int alpha = alpha_table[index_a];
+ const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+
+ for( i = 0; i < 4; i++ ) {
+ if( bS[i] == 0 ) {
+ pix += 2 * stride;
+ continue;
+ }
+
+ if( bS[i] < 4 ) {
+ const int tc = tc0_table[index_a][bS[i] - 1] + 1;
+            /* 2px edge length (because we use the same bS as the one for luma) */
+ for( d = 0; d < 2; d++ ){
+ const int p0 = pix[-1];
+ const int p1 = pix[-2];
+ const int q0 = pix[0];
+ const int q1 = pix[1];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+ const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+ pix[-1] = clip_uint8( p0 + i_delta ); /* p0' */
+ pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
+ }
+ pix += stride;
+ }
+ }else{
+            /* 2px edge length (because we use the same bS as the one for luma) */
+ for( d = 0; d < 2; d++ ){
+ const int p0 = pix[-1];
+ const int p1 = pix[-2];
+ const int q0 = pix[0];
+ const int q1 = pix[1];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+
+ pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
+ pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
+ }
+ pix += stride;
+ }
+ }
+ }
+}
+
+static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
+ int i, d;
+ const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
+ const int alpha = alpha_table[index_a];
+ const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+ const int pix_next = stride;
+
+ for( i = 0; i < 4; i++ ) {
+ if( bS[i] == 0 ) {
+ pix += 4;
+ continue;
+ }
+
+ if( bS[i] < 4 ) {
+ const int tc0 = tc0_table[index_a][bS[i] - 1];
+ /* 4px edge length */
+ for( d = 0; d < 4; d++ ) {
+ const int p0 = pix[-1*pix_next];
+ const int p1 = pix[-2*pix_next];
+ const int p2 = pix[-3*pix_next];
+ const int q0 = pix[0];
+ const int q1 = pix[1*pix_next];
+ const int q2 = pix[2*pix_next];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+
+ int tc = tc0;
+ int i_delta;
+
+ if( ABS( p2 - p0 ) < beta ) {
+ pix[-2*pix_next] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+ tc++;
+ }
+ if( ABS( q2 - q0 ) < beta ) {
+ pix[pix_next] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+ tc++;
+ }
+
+ i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+ pix[-pix_next] = clip_uint8( p0 + i_delta ); /* p0' */
+ pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
+ }
+ pix++;
+ }
+ }else{
+ /* 4px edge length */
+ for( d = 0; d < 4; d++ ) {
+ const int p0 = pix[-1*pix_next];
+ const int p1 = pix[-2*pix_next];
+ const int p2 = pix[-3*pix_next];
+ const int q0 = pix[0];
+ const int q1 = pix[1*pix_next];
+ const int q2 = pix[2*pix_next];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+
+ const int p3 = pix[-4*pix_next];
+ const int q3 = pix[ 3*pix_next];
+
+ if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
+ if( ABS( p2 - p0 ) < beta) {
+ /* p0', p1', p2' */
+ pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+ pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+ pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+ } else {
+ /* p0' */
+ pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ }
+ if( ABS( q2 - q0 ) < beta) {
+ /* q0', q1', q2' */
+ pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+ pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+ pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+ } else {
+ /* q0' */
+ pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ }else{
+ /* p0', q0' */
+ pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ }
+ pix++;
+ }
+ }
+ }
+}
+
+static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
+ int i, d;
+ const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
+ const int alpha = alpha_table[index_a];
+ const int beta = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+ const int pix_next = stride;
+
+ for( i = 0; i < 4; i++ )
+ {
+ if( bS[i] == 0 ) {
+ pix += 2;
+ continue;
+ }
+
+ if( bS[i] < 4 ) {
+ int tc = tc0_table[index_a][bS[i] - 1] + 1;
+ /* 2px edge length (see deblocking_filter_edgecv) */
+ for( d = 0; d < 2; d++ ) {
+ const int p0 = pix[-1*pix_next];
+ const int p1 = pix[-2*pix_next];
+ const int q0 = pix[0];
+ const int q1 = pix[1*pix_next];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+
+ int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+ pix[-pix_next] = clip_uint8( p0 + i_delta ); /* p0' */
+ pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
+ }
+ pix++;
+ }
+ }else{
+ /* 2px edge length (see deblocking_filter_edgecv) */
+ for( d = 0; d < 2; d++ ) {
+ const int p0 = pix[-1*pix_next];
+ const int p1 = pix[-2*pix_next];
+ const int q0 = pix[0];
+ const int q1 = pix[1*pix_next];
+
+ if( ABS( p0 - q0 ) < alpha &&
+ ABS( p1 - p0 ) < beta &&
+ ABS( q1 - q0 ) < beta ) {
+
+ pix[-pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
+ pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
+ }
+ pix++;
+ }
+ }
+ }
+}
+
+static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) {
+ MpegEncContext * const s = &h->s;
+ const int mb_xy= mb_x + mb_y*s->mb_stride;
+ int linesize, uvlinesize;
+ int dir;
+
+ /* FIXME Implement deblocking filter for field MB */
+ if( h->sps.mb_aff ) {
+ return;
+ }
+ linesize = s->linesize;
+ uvlinesize = s->uvlinesize;
+
+ /* dir : 0 -> vertical edge, 1 -> horizontal edge */
+ for( dir = 0; dir < 2; dir++ )
+ {
+ int start = 0;
+ int edge;
+
+ /* test picture boundary */
+ if( ( dir == 0 && mb_x == 0 ) || ( dir == 1 && mb_y == 0 ) ) {
+ start = 1;
+ }
+ /* FIXME test slice boundary */
+ if( h->deblocking_filter == 2 ) {
+ }
+
+ /* Calculate bS */
+ for( edge = start; edge < 4; edge++ ) {
+            /* mbn_xy: neighbour macroblock (how does that work for field MBs?) */
+ int mbn_xy = edge > 0 ? mb_xy : ( dir == 0 ? mb_xy -1 : mb_xy - s->mb_stride );
+ int bS[4];
+ int qp;
+
+ if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
+ IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
+ bS[0] = bS[1] = bS[2] = bS[3] = ( edge == 0 ? 4 : 3 );
+ } else {
+ int i;
+ for( i = 0; i < 4; i++ ) {
+ int x = dir == 0 ? edge : i;
+ int y = dir == 0 ? i : edge;
+ int b_idx= 8 + 4 + x + 8*y;
+ int bn_idx= b_idx - (dir ? 8:1);
+
+ if( h->non_zero_count_cache[b_idx] != 0 ||
+ h->non_zero_count_cache[bn_idx] != 0 ) {
+ bS[i] = 2;
+ }
+ else if( h->slice_type == P_TYPE ) {
+ if( h->ref_cache[0][b_idx] != h->ref_cache[0][bn_idx] ||
+ ABS( h->mv_cache[0][b_idx][0] - h->mv_cache[0][bn_idx][0] ) >= 4 ||
+ ABS( h->mv_cache[0][b_idx][1] - h->mv_cache[0][bn_idx][1] ) >= 4 )
+ bS[i] = 1;
+ else
+ bS[i] = 0;
+ }
+ else {
+ /* FIXME Add support for B frame */
+ return;
+ }
+ }
+
+ if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
+ continue;
+ }
+ /* Filter edge */
+ qp = ( s->qscale + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+ if( dir == 0 ) {
+ filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
+ if( (edge&1) == 0 ) {
+ int chroma_qp = ( h->chroma_qp +
+ get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+ filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
+ filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
+ }
+ } else {
+ filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
+ if( (edge&1) == 0 ) {
+ int chroma_qp = ( h->chroma_qp +
+ get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+ filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
+ filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
+ }
+ }
+ }
+ }
+}
+
+static int decode_slice(H264Context *h){
+ MpegEncContext * const s = &h->s;
+ const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
+
+ s->mb_skip_run= -1;
+
+ if( h->pps.cabac ) {
+ int i;
+
+ /* realign */
+ align_get_bits( &s->gb );
+
+ /* init cabac */
+ ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
+ ff_init_cabac_decoder( &h->cabac,
+ s->gb.buffer + get_bits_count(&s->gb)/8,
+ ( s->gb.size_in_bits - get_bits_count(&s->gb) ) );
+ /* calculate pre-state */
+ for( i= 0; i < 399; i++ ) {
+ int pre;
+ if( h->slice_type == I_TYPE )
+ pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
+ else
+ pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
+
+ if( pre <= 63 )
+ h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
+ else
+ h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
+ }
+
+ for(;;){
+ int ret = decode_mb_cabac(h);
+ int eos = get_cabac_terminate( &h->cabac ); /* End of Slice flag */
+
+ hl_decode_mb(h);
+
+            /* XXX: useless as decode_mb_cabac doesn't support that ... */
+ if( ret >= 0 && h->sps.mb_aff ) { //FIXME optimal? or let mb_decode decode 16x32 ?
+ s->mb_y++;
+
+ ret = decode_mb_cabac(h);
+ eos = get_cabac_terminate( &h->cabac );
+
+ hl_decode_mb(h);
+ s->mb_y--;
+ }
+
+ if( ret < 0 ) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+ return -1;
+ }
+
+ if( ++s->mb_x >= s->mb_width ) {
+ s->mb_x = 0;
+ ff_draw_horiz_band(s, 16*s->mb_y, 16);
+ if( ++s->mb_y >= s->mb_height ) {
+ tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
+ }
+ }
+
+ if( eos || s->mb_y >= s->mb_height ) {
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
return 0;
- }else{
+ }
+#if 0
+ /* TODO test over-reading in cabac code */
+ else if( read too much in h->cabac ) {
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+ return -1;
+ }
+#endif
+ }
+
+ } else {
+ for(;;){
+ int ret = decode_mb_cavlc(h);
+
+ hl_decode_mb(h);
+
+ if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ?
+ s->mb_y++;
+ ret = decode_mb_cavlc(h);
+
+ hl_decode_mb(h);
+ s->mb_y--;
+ }
+
+ if(ret<0){
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
return -1;
}
+
+ if(++s->mb_x >= s->mb_width){
+ s->mb_x=0;
+ ff_draw_horiz_band(s, 16*s->mb_y, 16);
+ if(++s->mb_y >= s->mb_height){
+ tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
+
+ if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+
+ return 0;
+ }else{
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+
+ return -1;
+ }
+ }
+ }
+
+ if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
+ if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+
+ return 0;
+ }else{
+ ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+
+ return -1;
+ }
+ }
}
}
-#endif
+
#if 0
for(;s->mb_y < s->mb_height; s->mb_y++){
for(;s->mb_x < s->mb_width; s->mb_x++){
@@ -4022,7 +5581,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
buf_index += consumed;
- if(h->nal_ref_idc < s->hurry_up)
+ if( s->hurry_up == 1 && h->nal_ref_idc == 0 )
continue;
switch(h->nal_unit_type){
@@ -4035,7 +5594,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
s->data_partitioning = 0;
if(decode_slice_header(h) < 0) return -1;
- if(h->redundant_pic_count==0)
+ if(h->redundant_pic_count==0 && s->hurry_up < 5 )
decode_slice(h);
break;
case NAL_DPA:
@@ -4054,7 +5613,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
init_get_bits(&h->inter_gb, ptr, bit_length);
h->inter_gb_ptr= &h->inter_gb;
- if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning)
+ if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning && s->hurry_up < 5 )
decode_slice(h);
break;
case NAL_SEI:
@@ -4099,6 +5658,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
assert(h->mmco_index==0);
ff_er_frame_end(s);
+
MPV_frame_end(s);
return buf_index;
diff --git a/src/libffmpeg/libavcodec/h264data.h b/src/libffmpeg/libavcodec/h264data.h
index 40a252253..5480becd4 100644
--- a/src/libffmpeg/libavcodec/h264data.h
+++ b/src/libffmpeg/libavcodec/h264data.h
@@ -528,3 +528,598 @@ static const int quant_coeff[52][16]={
{ 1260, 819, 1260, 819, 819, 524, 819, 524, 1260, 819, 1260, 819, 819, 524, 819, 524,},
{ 1170, 728, 1170, 728, 728, 456, 728, 456, 1170, 728, 1170, 728, 728, 456, 728, 456,},
};
+
+
+/* Deblocking filter (p153) */
+static const int alpha_table[52] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
+ 7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
+ 25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+ 80, 90,101,113,127,144,162,182,203,226,
+ 255, 255
+};
+static const int beta_table[52] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
+ 3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+ 18, 18
+};
+static const int tc0_table[52][3] = {
+ { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+ { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+ { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
+ { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
+ { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
+ { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
+ { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
+ { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
+ { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+};
+
+/* Cabac pre state table */
+
+static const int cabac_context_init_I[399][2] =
+{
+ /* 0 - 10 */
+ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 },
+ { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 },
+ { -6, 53 }, { -1, 54 }, { 7, 51 },
+
+ /* 11 - 23 unused for I */
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 },
+
+ /* 24- 39 */
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+
+ /* 40 - 53 */
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 },
+
+ /* 54 - 59 */
+ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 },
+
+ /* 60 - 69 */
+ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 },
+ { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 },
+ { 13, 41 }, { 3, 62 },
+
+ /* 70 -> 87 */
+ { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 },
+ { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 },
+ { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 },
+ { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 },
+ { -12, 115 },{ -16, 122 },
+
+ /* 88 -> 104 */
+ { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 },
+ { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 },
+ { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 },
+ { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 },
+ { -22, 125 },
+
+ /* 105 -> 135 */
+ { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 },
+ { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 },
+ { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 },
+ { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 },
+ { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 },
+ { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 },
+ { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 },
+ { 14, 62 }, { -13, 108 },{ -15, 100 },
+
+ /* 136 -> 165 */
+ { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 },
+ { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 },
+ { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 },
+ { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 },
+ { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 },
+ { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 },
+ { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 },
+ { 0, 62 }, { 12, 72 },
+
+ /* 166 -> 196 */
+ { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 },
+ { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 },
+ { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 },
+ { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 },
+ { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 },
+ { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 },
+ { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 },
+ { 0, 89 }, { 26, -19 }, { 22, -17 },
+
+ /* 197 -> 226 */
+ { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 },
+ { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 },
+ { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 },
+ { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 },
+ { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 },
+ { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 },
+ { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 },
+ { 12, 68 }, { 2, 97 },
+
+ /* 227 -> 251 */
+ { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 },
+ { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 },
+ { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 },
+ { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 },
+ { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 },
+ { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 },
+ { -4, 65 },
+
+ /* 252 -> 275 */
+ { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 },
+ { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 },
+ { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 },
+ { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 },
+ { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 },
+ { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 },
+
+ /* 276 a bit special (not used, bypass is used instead) */
+ { 0, 0 },
+
+ /* 277 -> 307 */
+ { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 },
+ { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 },
+ { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 },
+ { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 },
+ { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 },
+ { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 },
+ { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 },
+ { 9, 64 }, { -12, 104 },{ -11, 97 },
+
+ /* 308 -> 337 */
+ { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 },
+ { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 },
+ { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 },
+ { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 },
+ { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 },
+ { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 },
+ { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 },
+ { 5, 64 }, { 12, 70 },
+
+ /* 338 -> 368 */
+ { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 },
+ { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 },
+ { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 },
+ { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 },
+ { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 },
+ { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 },
+ { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 },
+ { -12, 109 },{ 36, -35 }, { 36, -34 },
+
+ /* 369 -> 398 */
+ { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 },
+ { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 },
+ { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 },
+ { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 },
+ { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 },
+ { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 },
+ { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 },
+ { 29, 39 }, { 19, 66 }
+};
+
+static const int cabac_context_init_PB[3][399][2] =
+{
+ /* i_cabac_init_idc == 0 */
+ {
+ /* 0 - 10 */
+ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 },
+ { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 },
+ { -6, 53 }, { -1, 54 }, { 7, 51 },
+
+ /* 11 - 23 */
+ { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 },
+ { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 },
+ { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 },
+ { 17, 50 },
+
+ /* 24 - 39 */
+ { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 },
+ { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 },
+ { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 },
+ { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 },
+
+ /* 40 - 53 */
+ { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 },
+ { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 },
+ { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 },
+ { -3, 81 }, { 0, 88 },
+
+ /* 54 - 59 */
+ { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 },
+ { -7, 72 }, { 1, 58 },
+
+ /* 60 - 69 */
+ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 },
+ { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 },
+ { 13, 41 }, { 3, 62 },
+
+ /* 70 - 104 */
+ { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 },
+ { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 },
+ { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 },
+ { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 },
+ { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 },
+ { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 },
+ { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 },
+ { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 },
+ { 0, 68 }, { -4, 69 }, { -8, 88 },
+
+ /* 105 -> 165 */
+ { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 },
+ { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 },
+ { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 },
+ { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 },
+ { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 },
+ { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 },
+ { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 },
+ { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 },
+ { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 },
+ { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 },
+ { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 },
+ { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 },
+ { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 },
+ { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 },
+ { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 },
+ { 9, 69 },
+
+ /* 166 - 226 */
+ { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 },
+ { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 },
+ { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 },
+ { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 },
+ { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 },
+ { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 },
+ { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 },
+ { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 },
+ { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 },
+ { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 },
+ { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 },
+ { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 },
+ { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 },
+ { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 },
+ { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 },
+ { -9, 108 },
+
+ /* 227 - 275 */
+ { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 },
+ { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 },
+ { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 },
+ { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 },
+ { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 },
+ { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 },
+ { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 },
+ { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 },
+ { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 },
+ { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 },
+ { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 },
+ { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 },
+ { -8, 85 },
+
+ /* 276 a bit special (not used, bypass is used instead) */
+ { 0, 0 },
+
+ /* 277 - 337 */
+ { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 },
+ { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 },
+ { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+ { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 },
+ { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 },
+ { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 },
+ { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 },
+ { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 },
+ { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 },
+ { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 },
+ { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 },
+ { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 },
+ { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 },
+ { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 },
+ { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 },
+ { 26, 43 },
+
+ /* 338 - 398 */
+ { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 },
+ { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 },
+ { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 },
+ { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 },
+ { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 },
+ { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 },
+ { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 },
+ { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 },
+ { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 },
+ { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 },
+ { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 },
+ { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 },
+ { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 },
+ { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 },
+ { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 },
+ { 11, 86 },
+
+
+ },
+
+ /* i_cabac_init_idc == 1 */
+ {
+ /* 0 - 10 */
+ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 },
+ { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 },
+ { -6, 53 }, { -1, 54 }, { 7, 51 },
+
+ /* 11 - 23 */
+ { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 },
+ { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 },
+ { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 },
+ { 10, 54 },
+
+ /* 24 - 39 */
+ { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 },
+ { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 },
+ { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 },
+ { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 },
+
+ /* 40 - 53 */
+ { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 },
+ { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 },
+ { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 },
+ { -7, 86 },{ -5, 95 },
+
+ /* 54 - 59 */
+ { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 },
+ { -5, 72 },{ 0, 61 },
+
+ /* 60 - 69 */
+ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 },
+ { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 },
+ { 13, 41 }, { 3, 62 },
+
+ /* 70 - 104 */
+ { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 },
+ { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 },
+ { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 },
+ { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 },
+ { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 },
+ { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 },
+ { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 },
+ { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 },
+ { 0, 68 }, { -7, 74 }, { -9, 88 },
+
+ /* 105 -> 165 */
+ { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 },
+ { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 },
+ { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 },
+ { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 },
+ { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 },
+ { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 },
+ { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 },
+ { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 },
+ { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 },
+ { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 },
+ { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 },
+ { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 },
+ { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 },
+ { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 },
+ { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 },
+ { 0, 89 },
+
+ /* 166 - 226 */
+ { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 },
+ { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 },
+ { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 },
+ { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 },
+ { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 },
+ { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 },
+ { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 },
+ { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 },
+ { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 },
+ { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 },
+ { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 },
+ { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 },
+ { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 },
+ { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 },
+ { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 },
+ { -10, 116 },
+
+ /* 227 - 275 */
+ { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 },
+ { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 },
+ { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 },
+ { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 },
+ { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+ { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 },
+ { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 },
+ { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 },
+ { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 },
+ { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 },
+ { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 },
+ { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 },
+ { -4, 78 },
+
+ /* 276 a bit special (not used, bypass is used instead) */
+ { 0, 0 },
+
+ /* 277 - 337 */
+ { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+ { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+ { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+ { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 },
+ { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 },
+ { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 },
+ { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 },
+ { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 },
+ { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 },
+ { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 },
+ { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 },
+ { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 },
+ { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 },
+ { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 },
+ { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 },
+ { 18, 50 },
+
+ /* 338 - 398 */
+ { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 },
+ { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 },
+ { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 },
+ { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 },
+ { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 },
+ { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 },
+ { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 },
+ { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 },
+ { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 },
+ { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 },
+ { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 },
+ { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 },
+ { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 },
+ { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 },
+ { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 },
+ { 11, 83 },
+
+ },
+
+ /* i_cabac_init_idc == 2 */
+ {
+ /* 0 - 10 */
+ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 },
+ { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 },
+ { -6, 53 }, { -1, 54 }, { 7, 51 },
+
+ /* 11 - 23 */
+ { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 },
+ { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 },
+ { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 },
+ { 14, 57 },
+
+ /* 24 - 39 */
+ { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 },
+ { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 },
+ { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 },
+ { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 },
+
+ /* 40 - 53 */
+ { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 },
+ { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 },
+ { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 },
+ { -3, 90 },{ -1, 101 },
+
+ /* 54 - 59 */
+ { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 },
+ { -7, 50 },{ 1, 60 },
+
+ /* 60 - 69 */
+ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 },
+ { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 },
+ { 13, 41 }, { 3, 62 },
+
+ /* 70 - 104 */
+ { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 },
+ { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 },
+ { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 },
+ { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 },
+ { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 },
+ { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 },
+ { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 },
+ { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 },
+ { 3, 68 }, { -8, 71 }, { -13, 98 },
+
+ /* 105 -> 165 */
+ { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 },
+ { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 },
+ { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 },
+ { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 },
+ { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 },
+ { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 },
+ { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 },
+ { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 },
+ { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 },
+ { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 },
+ { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 },
+ { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 },
+ { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 },
+ { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 },
+ { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 },
+ { -22, 127 },
+
+ /* 166 - 226 */
+ { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 },
+ { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 },
+ { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 },
+ { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 },
+ { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 },
+ { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 },
+ { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 },
+ { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 },
+ { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 },
+ { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 },
+ { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 },
+ { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 },
+ { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 },
+ { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 },
+ { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 },
+ { -24, 127 },
+
+ /* 227 - 275 */
+ { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 },
+ { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 },
+ { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 },
+ { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 },
+ { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 },
+ { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 },
+ { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 },
+ { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 },
+ { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 },
+ { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 },
+ { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 },
+ { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 },
+ { -10, 87 },
+
+ /* 276 a bit special (not used, bypass is used instead) */
+ { 0, 0 },
+
+ /* 277 - 337 */
+ { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+ { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+ { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+ { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 },
+ { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 },
+ { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 },
+ { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 },
+ { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 },
+ { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 },
+ { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 },
+ { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 },
+ { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 },
+ { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 },
+ { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 },
+ { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 },
+ { 25, 42 },
+
+ /* 338 - 398 */
+ { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 },
+ { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 },
+ { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 },
+ { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 },
+ { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 },
+ { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 },
+ { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 },
+ { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 },
+ { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 },
+ { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 },
+ { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 },
+ { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 },
+ { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 },
+ { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 },
+ { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 },
+ { 25, 61 },
+ }
+};
diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am
index d7b2bb4f8..eaa8d0f75 100644
--- a/src/libffmpeg/libavcodec/i386/Makefile.am
+++ b/src/libffmpeg/libavcodec/i386/Makefile.am
@@ -18,7 +18,8 @@ libavcodec_mmx_src = \
motion_est_mmx.c \
mpegvideo_mmx.c \
simple_idct_mmx.c \
- vp3dsp_mmx.c
+ vp3dsp_mmx.c \
+ vp3dsp_sse2.c
libavcodec_mmx_dummy = libavcodec_mmx_dummy.c
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 15dc8eec2..772c9c1f0 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -2147,9 +2147,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
/* VP3 optimized DSP functions */
- c->vp3_dsp_init = vp3_dsp_init_mmx;
- c->vp3_idct_put = vp3_idct_put_mmx;
- c->vp3_idct_add = vp3_idct_add_mmx;
+ if (mm_flags & MM_SSE2) {
+ c->vp3_dsp_init = vp3_dsp_init_sse2;
+ c->vp3_idct_put = vp3_idct_put_sse2;
+ c->vp3_idct_add = vp3_idct_add_sse2;
+ } else {
+ c->vp3_dsp_init = vp3_dsp_init_mmx;
+ c->vp3_idct_put = vp3_idct_put_mmx;
+ c->vp3_idct_add = vp3_idct_add_mmx;
+ }
#ifdef CONFIG_ENCODERS
c->get_pixels = get_pixels_mmx;
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index 877160773..7af576971 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -50,7 +50,14 @@ static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
-static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
+struct
+{
+ const long fdct_r_row_sse2[4] ATTR_ALIGN(16);
+} fdct_r_row_sse2 ATTR_ALIGN(16)=
+{{
+ RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
+}};
+//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
16384, 16384, -8867, -21407,
@@ -126,7 +133,12 @@ static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff
6270, 26722, 6270, -17855,
};
-static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
+struct
+{
+ const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
+} tab_frw_01234567_sse2 ATTR_ALIGN(16) =
+{{
+//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
C4, C4, C5, C7, C2, C6, C3, -C7, \
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
@@ -252,7 +264,8 @@ TABLE_SSE2
#define C6 12299
#define C7 6270
TABLE_SSE2
-};
+}};
+
static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
{
@@ -392,7 +405,7 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
"FDCT_ROW_SSE2_H2 80 192 \n\t"
"FDCT_ROW_SSE2 80 \n\t"
:
- : "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+ : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
);
}
diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h
index 7e94cfd9b..ad684bc5a 100644
--- a/src/libffmpeg/libavcodec/i386/mmx.h
+++ b/src/libffmpeg/libavcodec/i386/mmx.h
@@ -240,4 +240,28 @@ typedef union {
#define sfence() __asm__ __volatile__ ("sfence\n\t")
+/* SSE2 */
+#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm)
+#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm)
+#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm)
+#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm)
+
+#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm)
+
+#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg)
+#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var)
+#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd)
+#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg)
+#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var)
+#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd)
+
+#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var)
+
+#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg)
+#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg)
+
+#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
+#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
+
+
#endif /* AVCODEC_I386MMX_H */
diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
index 59020466f..76007a1d1 100644
--- a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
@@ -46,213 +46,216 @@ static uint16_t idct_cosine_table[7] = {
#define r7 mm7
/* from original comments: The Macro does IDct on 4 1-D Dcts */
-#define BeginIDCT() \
+#define BeginIDCT() { \
movq_m2r(*I(3), r2); \
movq_m2r(*C(3), r6); \
movq_r2r(r2, r4); \
movq_m2r(*J(5), r7); \
- pmulhw_r2r(r6, r4); \
+ pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
movq_m2r(*C(5), r1); \
- pmulhw_r2r(r7, r6); \
+ pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
movq_r2r(r1, r5); \
- pmulhw_r2r(r2, r1); \
+ pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
movq_m2r(*I(1), r3); \
- pmulhw_r2r(r7, r5); \
- movq_m2r(*C(1), r0); \
- paddw_r2r(r2, r4); \
- paddw_r2r(r7, r6); \
- paddw_r2r(r1, r2); \
+ pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
+ movq_m2r(*C(1), r0); /* (all registers are in use) */ \
+ paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
+ paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
+ paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
movq_m2r(*J(7), r1); \
- paddw_r2r(r5, r7); \
- movq_r2r(r0, r5); \
- pmulhw_r2r(r3, r0); \
- paddsw_r2r(r7, r4); \
- pmulhw_r2r(r1, r5); \
+ paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
+ movq_r2r(r0, r5); /* r5 = c1 */ \
+ pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
+ paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
+ pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
movq_m2r(*C(7), r7); \
- psubsw_r2r(r2, r6); \
- paddw_r2r(r3, r0); \
- pmulhw_r2r(r7, r3); \
+ psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
+ paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
+ pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
movq_m2r(*I(2), r2); \
- pmulhw_r2r(r1, r7); \
- paddw_r2r(r1, r5); \
- movq_r2r(r2, r1); \
- pmulhw_m2r(*C(2), r2); \
- psubsw_r2r(r5, r3); \
+ pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
+ paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
+ movq_r2r(r2, r1); /* r1 = i2 */ \
+ pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
+ psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
movq_m2r(*J(6), r5); \
- paddsw_r2r(r7, r0); \
- movq_r2r(r5, r7); \
- psubsw_r2r(r4, r0); \
- pmulhw_m2r(*C(2), r5); \
- paddw_r2r(r1, r2); \
- pmulhw_m2r(*C(6), r1); \
- paddsw_r2r(r4, r4); \
- paddsw_r2r(r0, r4); \
- psubsw_r2r(r6, r3); \
- paddw_r2r(r7, r5); \
- paddsw_r2r(r6, r6); \
- pmulhw_m2r(*C(6), r7); \
- paddsw_r2r(r3, r6); \
- movq_r2m(r4, *I(1)); \
- psubsw_r2r(r5, r1); \
+ paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
+ movq_r2r(r5, r7); /* r7 = i6 */ \
+ psubsw_r2r(r4, r0); /* r0 = A - C */ \
+ pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
+ paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
+ pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
+ paddsw_r2r(r4, r4); /* r4 = C + C */ \
+ paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
+ psubsw_r2r(r6, r3); /* r3 = B - D */ \
+ paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
+ paddsw_r2r(r6, r6); /* r6 = D + D */ \
+ pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
+ paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
+ movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
+ psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
movq_m2r(*C(4), r4); \
- movq_r2r(r3, r5); \
- pmulhw_r2r(r4, r3); \
- paddsw_r2r(r2, r7); \
- movq_r2m(r6, *I(2)); \
- movq_r2r(r0, r2); \
+ movq_r2r(r3, r5); /* r5 = B - D */ \
+ pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
+ paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
+ movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
+ movq_r2r(r0, r2); /* r2 = A - C */ \
movq_m2r(*I(0), r6); \
- pmulhw_r2r(r4, r0); \
- paddw_r2r(r3, r5); \
+ pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
+ paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
movq_m2r(*J(4), r3); \
- psubsw_r2r(r1, r5); \
- paddw_r2r(r0, r2); \
- psubsw_r2r(r3, r6); \
+ psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
+ paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
+ psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
movq_r2r(r6, r0); \
- pmulhw_r2r(r4, r6); \
- paddsw_r2r(r3, r3); \
- paddsw_r2r(r1, r1); \
- paddsw_r2r(r0, r3); \
- paddsw_r2r(r5, r1); \
- pmulhw_r2r(r3, r4); \
- paddsw_r2r(r0, r6); \
- psubsw_r2r(r2, r6); \
- paddsw_r2r(r2, r2); \
- movq_m2r(*I(1), r0); \
- paddsw_r2r(r6, r2); \
- paddw_r2r(r3, r4); \
- psubsw_r2r(r1, r2);
+ pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
+ paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
+ paddsw_r2r(r1, r1); /* r1 = H + H */ \
+ paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
+ paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
+ pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
+ paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
+ psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
+ paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
+ movq_m2r(*I(1), r0); /* r0 = C. */ \
+ paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
+ paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
+ psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
+}
/* RowIDCT gets ready to transpose */
-#define RowIDCT() \
+#define RowIDCT() { \
\
- BeginIDCT() \
+ BeginIDCT(); \
\
- movq_m2r(*I(2), r3); \
- psubsw_r2r(r7, r4); \
- paddsw_r2r(r1, r1); \
- paddsw_r2r(r7, r7); \
- paddsw_r2r(r2, r1); \
- paddsw_r2r(r4, r7); \
- psubsw_r2r(r3, r4); \
- psubsw_r2r(r5, r6); \
+ movq_m2r(*I(2), r3); /* r3 = D. */ \
+ psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
+ paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
+ paddsw_r2r(r7, r7); /* r7 = G + G */ \
+ paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
+ paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
+ psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
+ paddsw_r2r(r3, r3); \
+ psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
paddsw_r2r(r5, r5); \
- paddsw_r2r(r4, r3); \
- paddsw_r2r(r6, r5); \
- psubsw_r2r(r0, r7); \
+ paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
+ paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
+ psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
paddsw_r2r(r0, r0); \
- movq_r2m(r1, *I(1)); \
- paddsw_r2r(r7, r0);
+ movq_r2m(r1, *I(1)); /* save R1 */ \
+ paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
+}
/* Column IDCT normalizes and stores final results */
-#define ColumnIDCT() \
+#define ColumnIDCT() { \
\
- BeginIDCT() \
+ BeginIDCT(); \
\
- paddsw_m2r(*Eight, r2); \
- paddsw_r2r(r1, r1); \
- paddsw_r2r(r2, r1); \
- psraw_i2r(4, r2); \
- psubsw_r2r(r7, r4); \
- psraw_i2r(4, r1); \
- movq_m2r(*I(2), r3); \
- paddsw_r2r(r7, r7); \
- movq_r2m(r2, *I(2)); \
- paddsw_r2r(r4, r7); \
- movq_r2m(r1, *I(1)); \
- psubsw_r2r(r3, r4); \
- paddsw_m2r(*Eight, r4); \
- paddsw_r2r(r3, r3); \
- paddsw_r2r(r4, r3); \
- psraw_i2r(4, r4); \
- psubsw_r2r(r5, r6); \
- psraw_i2r(4, r3); \
- paddsw_m2r(*Eight, r6); \
- paddsw_r2r(r5, r5); \
- paddsw_r2r(r6, r5); \
- psraw_i2r(4, r6); \
- movq_r2m(r4, *J(4)); \
- psraw_i2r(4, r5); \
- movq_r2m(r3, *I(3)); \
- psubsw_r2r(r0, r7); \
- paddsw_m2r(*Eight, r7); \
- paddsw_r2r(r0, r0); \
- paddsw_r2r(r7, r0); \
- psraw_i2r(4, r7); \
- movq_r2m(r6, *J(6)); \
- psraw_i2r(4, r0); \
- movq_r2m(r5, *J(5)); \
- movq_r2m(r7, *J(7)); \
- movq_r2m(r0, *I(0));
-
+ paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
+ paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
+ paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
+ psraw_i2r(4, r2); /* r2 = NR2 */ \
+ psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
+ psraw_i2r(4, r1); /* r1 = NR1 */ \
+ movq_m2r(*I(2), r3); /* r3 = D. */ \
+ paddsw_r2r(r7, r7); /* r7 = G + G */ \
+ movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
+ paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
+ movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
+ psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
+ paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
+ paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
+ paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
+ psraw_i2r(4, r4); /* r4 = NR4 */ \
+ psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
+ psraw_i2r(4, r3); /* r3 = NR3 */ \
+ paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
+ paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
+ paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
+ psraw_i2r(4, r6); /* r6 = NR6 */ \
+ movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
+ psraw_i2r(4, r5); /* r5 = NR5 */ \
+ movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
+ psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
+ paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
+ paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
+ paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
+ psraw_i2r(4, r7); /* r7 = NR7 */ \
+ movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
+ psraw_i2r(4, r0); /* r0 = NR0 */ \
+ movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
+ movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
+ movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
+}
/* Following macro does two 4x4 transposes in place.
At entry (we assume):
- r0 = a3 a2 a1 a0
- I(1) = b3 b2 b1 b0
- r2 = c3 c2 c1 c0
- r3 = d3 d2 d1 d0
-
- r4 = e3 e2 e1 e0
- r5 = f3 f2 f1 f0
- r6 = g3 g2 g1 g0
- r7 = h3 h2 h1 h0
+ r0 = a3 a2 a1 a0
+ I(1) = b3 b2 b1 b0
+ r2 = c3 c2 c1 c0
+ r3 = d3 d2 d1 d0
- At exit, we have:
+ r4 = e3 e2 e1 e0
+ r5 = f3 f2 f1 f0
+ r6 = g3 g2 g1 g0
+ r7 = h3 h2 h1 h0
- I(0) = d0 c0 b0 a0
- I(1) = d1 c1 b1 a1
- I(2) = d2 c2 b2 a2
- I(3) = d3 c3 b3 a3
+ At exit, we have:
- J(4) = h0 g0 f0 e0
- J(5) = h1 g1 f1 e1
- J(6) = h2 g2 f2 e2
- J(7) = h3 g3 f3 e3
+ I(0) = d0 c0 b0 a0
+ I(1) = d1 c1 b1 a1
+ I(2) = d2 c2 b2 a2
+ I(3) = d3 c3 b3 a3
+
+ J(4) = h0 g0 f0 e0
+ J(5) = h1 g1 f1 e1
+ J(6) = h2 g2 f2 e2
+ J(7) = h3 g3 f3 e3
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
Since r1 is free at entry, we calculate the Js first. */
-#define Transpose() \
- movq_r2r(r4, r1); \
- punpcklwd_r2r(r5, r4); \
- movq_r2m(r0, *I(0)); \
- punpckhwd_r2r(r5, r1); \
- movq_r2r(r6, r0); \
- punpcklwd_r2r(r7, r6); \
- movq_r2r(r4, r5); \
- punpckldq_r2r(r6, r4); \
- punpckhdq_r2r(r6, r5); \
- movq_r2r(r1, r6); \
+#define Transpose() { \
+ movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
+ punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
+ movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
+ punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
+ movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
+ punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
+ movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
+ punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
+ punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
+ movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
movq_r2m(r4, *J(4)); \
- punpckhwd_r2r(r7, r0); \
+ punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
movq_r2m(r5, *J(5)); \
- punpckhdq_r2r(r0, r6); \
- movq_m2r(*I(0), r4); \
- punpckldq_r2r(r0, r1); \
- movq_m2r(*I(1), r5); \
- movq_r2r(r4, r0); \
+ punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
+ movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
+ punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
+ movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
+ movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
movq_r2m(r6, *J(7)); \
- punpcklwd_r2r(r5, r0); \
+ punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
movq_r2m(r1, *J(6)); \
- punpckhwd_r2r(r5, r4); \
- movq_r2r(r2, r5); \
- punpcklwd_r2r(r3, r2); \
- movq_r2r(r0, r1); \
- punpckldq_r2r(r2, r0); \
- punpckhdq_r2r(r2, r1); \
- movq_r2r(r4, r2); \
+ punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
+ movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
+ punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
+ movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
+ punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
+ punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
+ movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
movq_r2m(r0, *I(0)); \
- punpckhwd_r2r(r3, r5); \
+ punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
movq_r2m(r1, *I(1)); \
- punpckhdq_r2r(r5, r4); \
- punpckldq_r2r(r5, r2); \
+ punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
+ punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
movq_r2m(r4, *I(3)); \
- movq_r2m(r2, *I(2));
-
+ movq_r2m(r2, *I(2)); \
+}
void vp3_dsp_init_mmx(void)
{
@@ -263,7 +266,7 @@ void vp3_dsp_init_mmx(void)
idct_constants[--j] = 0;
} while (j);
- idct_constants[0] = idct_constants[5] =
+ idct_constants[0] = idct_constants[5] =
idct_constants[10] = idct_constants[15] = 65535;
j = 1;
@@ -272,7 +275,7 @@ void vp3_dsp_init_mmx(void)
p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
} while (++j <= 7);
- idct_constants[44] = idct_constants[45] =
+ idct_constants[44] = idct_constants[45] =
idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
}
@@ -292,254 +295,240 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
#define C(x) (idct_constants + 16 + (x - 1) * 4)
#define Eight (idct_constants + 44)
- movq_m2r(*input_data, r0);
- pmullw_m2r(*dequant_matrix, r0);
- movq_m2r(*(input_data + 8), r1);
- pmullw_m2r(*(dequant_matrix + 8), r1);
- movq_m2r(*M(0), r2);
- movq_r2r(r0, r3);
- movq_m2r(*(input_data + 4), r4);
- psrlq_i2r(16, r0);
- pmullw_m2r(*(dequant_matrix + 4), r4);
- pand_r2r(r2, r3);
- movq_r2r(r0, r5);
- movq_r2r(r1, r6);
- pand_r2r(r2, r5);
- psllq_i2r(32, r6);
- movq_m2r(*M(3), r7);
- pxor_r2r(r5, r0);
- pand_r2r(r6, r7);
- por_r2r(r3, r0);
- pxor_r2r(r7, r6);
- por_r2r(r7, r0);
- movq_m2r(*M(3), r7);
- movq_r2r(r4, r3);
- movq_r2m(r0, *output_data);
-
- pand_r2r(r2, r3);
- movq_m2r(*(input_data + 16), r0);
- psllq_i2r(16, r3);
- pmullw_m2r(*(dequant_matrix + 16), r0);
- pand_r2r(r1, r7);
- por_r2r(r3, r5);
- por_r2r(r6, r7);
- movq_m2r(*(input_data + 12), r3);
- por_r2r(r5, r7);
- pmullw_m2r(*(dequant_matrix + 12), r3);
- psrlq_i2r(16, r4);
- movq_r2m(r7, *(output_data + 8));
-
- movq_r2r(r4, r5);
- movq_r2r(r0, r7);
- psrlq_i2r(16, r4);
- psrlq_i2r(48, r7);
- movq_r2r(r2, r6);
- pand_r2r(r2, r5);
- pand_r2r(r4, r6);
- movq_r2m(r7, *(output_data + 40));
-
- pxor_r2r(r6, r4);
- psrlq_i2r(32, r1);
- por_r2r(r5, r4);
- movq_m2r(*M(3), r7);
- pand_r2r(r2, r1);
- movq_m2r(*(input_data + 24), r5);
- psllq_i2r(16, r0);
- pmullw_m2r(*(dequant_matrix + 24), r5);
- pand_r2r(r0, r7);
- movq_r2m(r1, *(output_data + 32));
-
- por_r2r(r4, r7);
- movq_r2r(r3, r4);
- pand_r2r(r2, r3);
- movq_m2r(*M(2), r1);
- psllq_i2r(32, r3);
- por_r2r(r3, r7);
- movq_r2r(r5, r3);
- psllq_i2r(48, r3);
- pand_r2r(r0, r1);
- movq_r2m(r7, *(output_data + 16));
-
- por_r2r(r3, r6);
- movq_m2r(*M(1), r7);
- por_r2r(r1, r6);
- movq_m2r(*(input_data + 28), r1);
- pand_r2r(r4, r7);
- pmullw_m2r(*(dequant_matrix + 28), r1);
- por_r2r(r6, r7);
- pand_m2r(*M(1), r0);
- psrlq_i2r(32, r4);
- movq_r2m(r7, *(output_data + 24));
-
- movq_r2r(r4, r6);
- movq_m2r(*M(3), r7);
- pand_r2r(r2, r4);
- movq_m2r(*M(1), r3);
- pand_r2r(r1, r7);
- pand_r2r(r5, r3);
- por_r2r(r4, r0);
- psllq_i2r(16, r3);
- por_r2r(r0, r7);
- movq_m2r(*M(2), r4);
- por_r2r(r3, r7);
- movq_m2r(*(input_data + 40), r0);
- movq_r2r(r4, r3);
- pmullw_m2r(*(dequant_matrix + 40), r0);
- pand_r2r(r5, r4);
- movq_r2m(r7, *(output_data + 4));
-
- por_r2r(r4, r6);
- movq_r2r(r3, r4);
- psrlq_i2r(16, r6);
- movq_r2r(r0, r7);
- pand_r2r(r1, r4);
- psllq_i2r(48, r7);
- por_r2r(r4, r6);
- movq_m2r(*(input_data + 44), r4);
- por_r2r(r6, r7);
- pmullw_m2r(*(dequant_matrix + 44), r4);
- psrlq_i2r(16, r3);
- movq_r2m(r7, *(output_data + 12));
-
- pand_r2r(r1, r3);
- psrlq_i2r(48, r5);
- pand_r2r(r2, r1);
- movq_m2r(*(input_data + 52), r6);
- por_r2r(r3, r5);
- pmullw_m2r(*(input_data + 52), r6);
- psrlq_i2r(16, r0);
- movq_r2r(r4, r7);
- movq_r2r(r2, r3);
- psllq_i2r(48, r7);
- pand_r2r(r0, r3);
- pxor_r2r(r3, r0);
- psllq_i2r(32, r3);
- por_r2r(r5, r7);
- movq_r2r(r6, r5);
- pand_m2r(*M(1), r6);
- por_r2r(r3, r7);
- psllq_i2r(32, r6);
- por_r2r(r1, r0);
- movq_r2m(r7, *(output_data + 20));
-
- por_r2r(r6, r0);
- movq_m2r(*(input_data + 60), r7);
- movq_r2r(r5, r6);
- pmullw_m2r(*(input_data + 60), r7);
- psrlq_i2r(32, r5);
- pand_r2r(r2, r6);
- movq_r2r(r5, r1);
- movq_r2m(r0, *(output_data + 28));
-
- pand_r2r(r2, r1);
- movq_m2r(*(input_data + 56), r0);
- movq_r2r(r7, r3);
- pmullw_m2r(*(dequant_matrix + 56), r0);
- psllq_i2r(16, r3);
- pand_m2r(*M(3), r7);
- pxor_r2r(r1, r5);
- por_r2r(r5, r6);
- movq_r2r(r3, r5);
- pand_m2r(*M(3), r5);
- por_r2r(r1, r7);
- movq_m2r(*(input_data + 48), r1);
- pxor_r2r(r5, r3);
- pmullw_m2r(*(dequant_matrix + 48), r1);
- por_r2r(r3, r7);
- por_r2r(r5, r6);
- movq_r2r(r0, r5);
- movq_r2m(r7, *(output_data + 60));
-
- psrlq_i2r(16, r5);
- pand_m2r(*M(2), r5);
- movq_r2r(r0, r7);
- por_r2r(r5, r6);
- pand_r2r(r2, r0);
- pxor_r2r(r0, r7);
- psllq_i2r(32, r0);
- movq_r2m(r6, *(output_data + 52));
-
- psrlq_i2r(16, r4);
- movq_m2r(*(input_data + 36), r5);
- psllq_i2r(16, r7);
- pmullw_m2r(*(dequant_matrix + 36), r5);
- movq_r2r(r7, r6);
- movq_m2r(*M(2), r3);
- psllq_i2r(16, r6);
- pand_m2r(*M(3), r7);
- pand_r2r(r1, r3);
- por_r2r(r0, r7);
- movq_r2r(r1, r0);
- pand_m2r(*M(3), r1);
- por_r2r(r3, r6);
- movq_r2r(r4, r3);
- psrlq_i2r(32, r1);
- pand_r2r(r2, r3);
- por_r2r(r1, r7);
- por_r2r(r3, r7);
- movq_r2r(r4, r3);
- pand_m2r(*M(1), r3);
- movq_r2r(r5, r1);
- movq_r2m(r7, *(output_data + 44));
-
- psrlq_i2r(48, r5);
- movq_m2r(*(input_data + 32), r7);
- por_r2r(r3, r6);
- pmullw_m2r(*(dequant_matrix + 32), r7);
- por_r2r(r5, r6);
- pand_m2r(*M(2), r4);
- psllq_i2r(32, r0);
- movq_r2m(r6, *(output_data + 36));
-
- movq_r2r(r0, r6);
- pand_m2r(*M(3), r0);
- psllq_i2r(16, r6);
- movq_m2r(*(input_data + 20), r5);
- movq_r2r(r1, r3);
- pmullw_m2r(*(dequant_matrix + 40), r5);
- psrlq_i2r(16, r1);
- pand_m2r(*M(1), r1);
- por_r2r(r4, r0);
- pand_r2r(r7, r2);
- por_r2r(r1, r0);
- por_r2r(r2, r0);
- psllq_i2r(16, r3);
- movq_r2r(r3, r4);
- movq_r2r(r5, r2);
- movq_r2m(r0, *(output_data + 56));
-
- psrlq_i2r(48, r2);
- pand_m2r(*M(2), r4);
- por_r2r(r2, r6);
- movq_m2r(*M(1), r2);
- por_r2r(r4, r6);
- pand_r2r(r7, r2);
- psllq_i2r(32, r3);
- por_m2r(*(output_data + 40), r3);
-
- por_r2r(r2, r6);
- movq_m2r(*M(3), r2);
- psllq_i2r(16, r5);
- movq_r2m(r6, *(output_data + 48));
-
- pand_r2r(r5, r2);
- movq_m2r(*M(2), r6);
- pxor_r2r(r2, r5);
- pand_r2r(r7, r6);
- psrlq_i2r(32, r2);
- pand_m2r(*M(3), r7);
- por_r2r(r2, r3);
- por_m2r(*(output_data + 32), r7);
-
- por_r2r(r3, r6);
- por_r2r(r5, r7);
- movq_r2m(r6, *(output_data + 40));
- movq_r2m(r7, *(output_data + 32));
+ unsigned char *input_bytes = (unsigned char *)input_data;
+ unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
+ unsigned char *output_data_bytes = (unsigned char *)output_data;
+
+ movq_m2r(*(input_bytes), r0);
+ pmullw_m2r(*(dequant_matrix_bytes), r0); /* r0 = 03 02 01 00 */
+ movq_m2r(*(input_bytes+16), r1);
+ pmullw_m2r(*(dequant_matrix_bytes+16), r1); /* r1 = 13 12 11 10 */
+ movq_m2r(*M(0), r2); /* r2 = __ __ __ FF */
+ movq_r2r(r0, r3); /* r3 = 03 02 01 00 */
+ movq_m2r(*(input_bytes+8), r4);
+ psrlq_i2r(16, r0); /* r0 = __ 03 02 01 */
+ pmullw_m2r(*(dequant_matrix_bytes+8), r4); /* r4 = 07 06 05 04 */
+ pand_r2r(r2, r3); /* r3 = __ __ __ 00 */
+ movq_r2r(r0, r5); /* r5 = __ 03 02 01 */
+ movq_r2r(r1, r6); /* r6 = 13 12 11 10 */
+ pand_r2r(r2, r5); /* r5 = __ __ __ 01 */
+ psllq_i2r(32, r6); /* r6 = 11 10 __ __ */
+ movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
+ pxor_r2r(r5, r0); /* r0 = __ 03 02 __ */
+ pand_r2r(r6, r7); /* r7 = 11 __ __ __ */
+ por_r2r(r3, r0); /* r0 = __ 03 02 00 */
+ pxor_r2r(r7, r6); /* r6 = __ 10 __ __ */
+ por_r2r(r7, r0); /* r0 = 11 03 02 00 = R0 */
+ movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
+ movq_r2r(r4, r3); /* r3 = 07 06 05 04 */
+ movq_r2m(r0, *(output_data_bytes)); /* write R0 = r0 */
+ pand_r2r(r2, r3); /* r3 = __ __ __ 04 */
+ movq_m2r(*(input_bytes+32), r0);
+ psllq_i2r(16, r3); /* r3 = __ __ 04 __ */
+ pmullw_m2r(*(dequant_matrix_bytes+32), r0); /* r0 = 23 22 21 20 */
+ pand_r2r(r1, r7); /* r7 = 13 __ __ __ */
+ por_r2r(r3, r5); /* r5 = __ __ 04 01 */
+ por_r2r(r6, r7); /* r7 = 13 10 __ __ */
+ movq_m2r(*(input_bytes+24), r3);
+ por_r2r(r5, r7); /* r7 = 13 10 04 01 = R1 */
+ pmullw_m2r(*(dequant_matrix_bytes+24), r3); /* r3 = 17 16 15 14 */
+ psrlq_i2r(16, r4); /* r4 = __ 07 06 05 */
+ movq_r2m(r7, *(output_data_bytes+16)); /* write R1 = r7 */
+ movq_r2r(r4, r5); /* r5 = __ 07 06 05 */
+ movq_r2r(r0, r7); /* r7 = 23 22 21 20 */
+ psrlq_i2r(16, r4); /* r4 = __ __ 07 06 */
+ psrlq_i2r(48, r7); /* r7 = __ __ __ 23 */
+ movq_r2r(r2, r6); /* r6 = __ __ __ FF */
+ pand_r2r(r2, r5); /* r5 = __ __ __ 05 */
+ pand_r2r(r4, r6); /* r6 = __ __ __ 06 */
+ movq_r2m(r7, *(output_data_bytes+80)); /* partial R9 = __ __ __ 23 */
+ pxor_r2r(r6, r4); /* r4 = __ __ 07 __ */
+ psrlq_i2r(32, r1); /* r1 = __ __ 13 12 */
+ por_r2r(r5, r4); /* r4 = __ __ 07 05 */
+ movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
+ pand_r2r(r2, r1); /* r1 = __ __ __ 12 */
+ movq_m2r(*(input_bytes+48), r5);
+ psllq_i2r(16, r0); /* r0 = 22 21 20 __ */
+ pmullw_m2r(*(dequant_matrix_bytes+48), r5); /* r5 = 33 32 31 30 */
+ pand_r2r(r0, r7); /* r7 = 22 __ __ __ */
+ movq_r2m(r1, *(output_data_bytes+64)); /* partial R8 = __ __ __ 12 */
+ por_r2r(r4, r7); /* r7 = 22 __ 07 05 */
+ movq_r2r(r3, r4); /* r4 = 17 16 15 14 */
+ pand_r2r(r2, r3); /* r3 = __ __ __ 14 */
+ movq_m2r(*M(2), r1); /* r1 = __ FF __ __ */
+ psllq_i2r(32, r3); /* r3 = __ 14 __ __ */
+ por_r2r(r3, r7); /* r7 = 22 14 07 05 = R2 */
+ movq_r2r(r5, r3); /* r3 = 33 32 31 30 */
+ psllq_i2r(48, r3); /* r3 = 30 __ __ __ */
+ pand_r2r(r0, r1); /* r1 = __ 21 __ __ */
+ movq_r2m(r7, *(output_data_bytes+32)); /* write R2 = r7 */
+ por_r2r(r3, r6); /* r6 = 30 __ __ 06 */
+ movq_m2r(*M(1), r7); /* r7 = __ __ FF __ */
+ por_r2r(r1, r6); /* r6 = 30 21 __ 06 */
+ movq_m2r(*(input_bytes+56), r1);
+ pand_r2r(r4, r7); /* r7 = __ __ 15 __ */
+ pmullw_m2r(*(dequant_matrix_bytes+56), r1); /* r1 = 37 36 35 34 */
+ por_r2r(r6, r7); /* r7 = 30 21 15 06 = R3 */
+ pand_m2r(*M(1), r0); /* r0 = __ __ 20 __ */
+ psrlq_i2r(32, r4); /* r4 = __ __ 17 16 */
+ movq_r2m(r7, *(output_data_bytes+48)); /* write R3 = r7 */
+ movq_r2r(r4, r6); /* r6 = __ __ 17 16 */
+ movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
+ pand_r2r(r2, r4); /* r4 = __ __ __ 16 */
+ movq_m2r(*M(1), r3); /* r3 = __ __ FF __ */
+ pand_r2r(r1, r7); /* r7 = 37 __ __ __ */
+ pand_r2r(r5, r3); /* r3 = __ __ 31 __ */
+ por_r2r(r4, r0); /* r0 = __ __ 20 16 */
+ psllq_i2r(16, r3); /* r3 = __ 31 __ __ */
+ por_r2r(r0, r7); /* r7 = 37 __ 20 16 */
+ movq_m2r(*M(2), r4); /* r4 = __ FF __ __ */
+ por_r2r(r3, r7); /* r7 = 37 31 20 16 = R4 */
+ movq_m2r(*(input_bytes+80), r0);
+ movq_r2r(r4, r3); /* r3 = __ __ FF __ */
+ pmullw_m2r(*(dequant_matrix_bytes+80), r0); /* r0 = 53 52 51 50 */
+ pand_r2r(r5, r4); /* r4 = __ 32 __ __ */
+ movq_r2m(r7, *(output_data_bytes+8)); /* write R4 = r7 */
+ por_r2r(r4, r6); /* r6 = __ 32 17 16 */
+ movq_r2r(r3, r4); /* r4 = __ FF __ __ */
+ psrlq_i2r(16, r6); /* r6 = __ __ 32 17 */
+ movq_r2r(r0, r7); /* r7 = 53 52 51 50 */
+ pand_r2r(r1, r4); /* r4 = __ 36 __ __ */
+ psllq_i2r(48, r7); /* r7 = 50 __ __ __ */
+ por_r2r(r4, r6); /* r6 = __ 36 32 17 */
+ movq_m2r(*(input_bytes+88), r4);
+ por_r2r(r6, r7); /* r7 = 50 36 32 17 = R5 */
+ pmullw_m2r(*(dequant_matrix_bytes+88), r4); /* r4 = 57 56 55 54 */
+ psrlq_i2r(16, r3); /* r3 = __ __ FF __ */
+ movq_r2m(r7, *(output_data_bytes+24)); /* write R5 = r7 */
+ pand_r2r(r1, r3); /* r3 = __ __ 35 __ */
+ psrlq_i2r(48, r5); /* r5 = __ __ __ 33 */
+ pand_r2r(r2, r1); /* r1 = __ __ __ 34 */
+ movq_m2r(*(input_bytes+104), r6);
+ por_r2r(r3, r5); /* r5 = __ __ 35 33 */
+ pmullw_m2r(*(dequant_matrix_bytes+104), r6); /* r6 = 67 66 65 64 */
+ psrlq_i2r(16, r0); /* r0 = __ 53 52 51 */
+ movq_r2r(r4, r7); /* r7 = 57 56 55 54 */
+ movq_r2r(r2, r3); /* r3 = __ __ __ FF */
+ psllq_i2r(48, r7); /* r7 = 54 __ __ __ */
+ pand_r2r(r0, r3); /* r3 = __ __ __ 51 */
+ pxor_r2r(r3, r0); /* r0 = __ 53 52 __ */
+ psllq_i2r(32, r3); /* r3 = __ 51 __ __ */
+ por_r2r(r5, r7); /* r7 = 54 __ 35 33 */
+ movq_r2r(r6, r5); /* r5 = 67 66 65 64 */
+ pand_m2r(*M(1), r6); /* r6 = __ __ 65 __ */
+ por_r2r(r3, r7); /* r7 = 54 51 35 33 = R6 */
+ psllq_i2r(32, r6); /* r6 = 65 __ __ __ */
+ por_r2r(r1, r0); /* r0 = __ 53 52 34 */
+ movq_r2m(r7, *(output_data_bytes+40)); /* write R6 = r7 */
+ por_r2r(r6, r0); /* r0 = 65 53 52 34 = R7 */
+ movq_m2r(*(input_bytes+120), r7);
+ movq_r2r(r5, r6); /* r6 = 67 66 65 64 */
+ pmullw_m2r(*(dequant_matrix_bytes+120), r7); /* r7 = 77 76 75 74 */
+ psrlq_i2r(32, r5); /* r5 = __ __ 67 66 */
+ pand_r2r(r2, r6); /* r6 = __ __ __ 64 */
+ movq_r2r(r5, r1); /* r1 = __ __ 67 66 */
+ movq_r2m(r0, *(output_data_bytes+56)); /* write R7 = r0 */
+ pand_r2r(r2, r1); /* r1 = __ __ __ 66 */
+ movq_m2r(*(input_bytes+112), r0);
+ movq_r2r(r7, r3); /* r3 = 77 76 75 74 */
+ pmullw_m2r(*(dequant_matrix_bytes+112), r0); /* r0 = 73 72 71 70 */
+ psllq_i2r(16, r3); /* r3 = 76 75 74 __ */
+ pand_m2r(*M(3), r7); /* r7 = 77 __ __ __ */
+ pxor_r2r(r1, r5); /* r5 = __ __ 67 __ */
+ por_r2r(r5, r6); /* r6 = __ __ 67 64 */
+ movq_r2r(r3, r5); /* r5 = 76 75 74 __ */
+ pand_m2r(*M(3), r5); /* r5 = 76 __ __ __ */
+ por_r2r(r1, r7); /* r7 = 77 __ __ 66 */
+ movq_m2r(*(input_bytes+96), r1);
+ pxor_r2r(r5, r3); /* r3 = __ 75 74 __ */
+ pmullw_m2r(*(dequant_matrix_bytes+96), r1); /* r1 = 63 62 61 60 */
+ por_r2r(r3, r7); /* r7 = 77 75 74 66 = R15 */
+ por_r2r(r5, r6); /* r6 = 76 __ 67 64 */
+ movq_r2r(r0, r5); /* r5 = 73 72 71 70 */
+ movq_r2m(r7, *(output_data_bytes+120)); /* store R15 = r7 */
+ psrlq_i2r(16, r5); /* r5 = __ 73 72 71 */
+ pand_m2r(*M(2), r5); /* r5 = __ 73 __ __ */
+ movq_r2r(r0, r7); /* r7 = 73 72 71 70 */
+ por_r2r(r5, r6); /* r6 = 76 73 67 64 = R14 */
+ pand_r2r(r2, r0); /* r0 = __ __ __ 70 */
+ pxor_r2r(r0, r7); /* r7 = 73 72 71 __ */
+ psllq_i2r(32, r0); /* r0 = __ 70 __ __ */
+ movq_r2m(r6, *(output_data_bytes+104)); /* write R14 = r6 */
+ psrlq_i2r(16, r4); /* r4 = __ 57 56 55 */
+ movq_m2r(*(input_bytes+72), r5);
+ psllq_i2r(16, r7); /* r7 = 72 71 __ __ */
+ pmullw_m2r(*(dequant_matrix_bytes+72), r5); /* r5 = 47 46 45 44 */
+ movq_r2r(r7, r6); /* r6 = 72 71 __ __ */
+ movq_m2r(*M(2), r3); /* r3 = __ FF __ __ */
+ psllq_i2r(16, r6); /* r6 = 71 __ __ __ */
+ pand_m2r(*M(3), r7); /* r7 = 72 __ __ __ */
+ pand_r2r(r1, r3); /* r3 = __ 62 __ __ */
+ por_r2r(r0, r7); /* r7 = 72 70 __ __ */
+ movq_r2r(r1, r0); /* r0 = 63 62 61 60 */
+ pand_m2r(*M(3), r1); /* r1 = 63 __ __ __ */
+ por_r2r(r3, r6); /* r6 = 71 62 __ __ */
+ movq_r2r(r4, r3); /* r3 = __ 57 56 55 */
+ psrlq_i2r(32, r1); /* r1 = __ __ 63 __ */
+ pand_r2r(r2, r3); /* r3 = __ __ __ 55 */
+ por_r2r(r1, r7); /* r7 = 72 70 63 __ */
+ por_r2r(r3, r7); /* r7 = 72 70 63 55 = R13 */
+ movq_r2r(r4, r3); /* r3 = __ 57 56 55 */
+ pand_m2r(*M(1), r3); /* r3 = __ __ 56 __ */
+ movq_r2r(r5, r1); /* r1 = 47 46 45 44 */
+ movq_r2m(r7, *(output_data_bytes+88)); /* write R13 = r7 */
+ psrlq_i2r(48, r5); /* r5 = __ __ __ 47 */
+ movq_m2r(*(input_bytes+64), r7);
+ por_r2r(r3, r6); /* r6 = 71 62 56 __ */
+ pmullw_m2r(*(dequant_matrix_bytes+64), r7); /* r7 = 43 42 41 40 */
+ por_r2r(r5, r6); /* r6 = 71 62 56 47 = R12 */
+ pand_m2r(*M(2), r4); /* r4 = __ 57 __ __ */
+ psllq_i2r(32, r0); /* r0 = 61 60 __ __ */
+ movq_r2m(r6, *(output_data_bytes+72)); /* write R12 = r6 */
+ movq_r2r(r0, r6); /* r6 = 61 60 __ __ */
+ pand_m2r(*M(3), r0); /* r0 = 61 __ __ __ */
+ psllq_i2r(16, r6); /* r6 = 60 __ __ __ */
+ movq_m2r(*(input_bytes+40), r5);
+ movq_r2r(r1, r3); /* r3 = 47 46 45 44 */
+ pmullw_m2r(*(dequant_matrix_bytes+40), r5); /* r5 = 27 26 25 24 */
+ psrlq_i2r(16, r1); /* r1 = __ 47 46 45 */
+ pand_m2r(*M(1), r1); /* r1 = __ __ 46 __ */
+ por_r2r(r4, r0); /* r0 = 61 57 __ __ */
+ pand_r2r(r7, r2); /* r2 = __ __ __ 40 */
+ por_r2r(r1, r0); /* r0 = 61 57 46 __ */
+ por_r2r(r2, r0); /* r0 = 61 57 46 40 = R11 */
+ psllq_i2r(16, r3); /* r3 = 46 45 44 __ */
+ movq_r2r(r3, r4); /* r4 = 46 45 44 __ */
+ movq_r2r(r5, r2); /* r2 = 27 26 25 24 */
+ movq_r2m(r0, *(output_data_bytes+112)); /* write R11 = r0 */
+ psrlq_i2r(48, r2); /* r2 = __ __ __ 27 */
+ pand_m2r(*M(2), r4); /* r4 = __ 45 __ __ */
+ por_r2r(r2, r6); /* r6 = 60 __ __ 27 */
+ movq_m2r(*M(1), r2); /* r2 = __ __ FF __ */
+ por_r2r(r4, r6); /* r6 = 60 45 __ 27 */
+ pand_r2r(r7, r2); /* r2 = __ __ 41 __ */
+ psllq_i2r(32, r3); /* r3 = 44 __ __ __ */
+ por_m2r(*(output_data_bytes+80), r3); /* r3 = 44 __ __ 23 */
+ por_r2r(r2, r6); /* r6 = 60 45 41 27 = R10 */
+ movq_m2r(*M(3), r2); /* r2 = FF __ __ __ */
+ psllq_i2r(16, r5); /* r5 = 26 25 24 __ */
+ movq_r2m(r6, *(output_data_bytes+96)); /* store R10 = r6 */
+ pand_r2r(r5, r2); /* r2 = 26 __ __ __ */
+ movq_m2r(*M(2), r6); /* r6 = __ FF __ __ */
+ pxor_r2r(r2, r5); /* r5 = __ 25 24 __ */
+ pand_r2r(r7, r6); /* r6 = __ 42 __ __ */
+ psrlq_i2r(32, r2); /* r2 = __ __ 26 __ */
+ pand_m2r(*M(3), r7); /* r7 = 43 __ __ __ */
+ por_r2r(r2, r3); /* r3 = 44 __ 26 23 */
+ por_m2r(*(output_data_bytes+64), r7); /* r7 = 43 __ __ 12 */
+ por_r2r(r3, r6); /* r6 = 44 42 26 23 = R9 */
+ por_r2r(r5, r7); /* r7 = 43 25 24 12 = R8 */
+ movq_r2m(r6, *(output_data_bytes+80)); /* store R9 = r6 */
+ movq_r2m(r7, *(output_data_bytes+64)); /* store R8 = r7 */
#undef M
- /* at this point, function has completed dequantization + dezigzag +
+ /* at this point, function has completed dequantization + dezigzag +
* partial transposition; now do the idct itself */
#define I(K) (output_data + K * 8)
diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
new file mode 100644
index 000000000..c8f9158af
--- /dev/null
+++ b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
@@ -0,0 +1,890 @@
+/*
+ * Copyright (C) 2004 the ffmpeg project
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/**
+ * @file vp3dsp_sse2.c
+ * SSE2-optimized functions cribbed from the original VP3 source code.
+ */
+
+#include "../dsputil.h"
+#include "mmx.h"
+
+static unsigned short __align16 SSE2_dequant_const[] =
+{
+ 0,65535,65535,0,0,0,0,0, // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000
+ 0,0,0,0,65535,65535,0,0, // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000
+ 65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF
+ 0,0,0,65535,0,0,0,0, // 0x0000 0000 0000 0000 FFFF 0000 0000 0000
+ 0,0,0,65535,65535,0,0,0, // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000
+ 65535,0,0,0,0,65535,0,0, // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF
+ 0,0,65535,65535, 0,0,0,0 // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000
+};
+
+static unsigned int __align16 eight_data[] =
+{
+ 0x00080008,
+ 0x00080008,
+ 0x00080008,
+ 0x00080008
+};
+
+static unsigned short __align16 SSE2_idct_data[7 * 8] =
+{
+ 64277,64277,64277,64277,64277,64277,64277,64277,
+ 60547,60547,60547,60547,60547,60547,60547,60547,
+ 54491,54491,54491,54491,54491,54491,54491,54491,
+ 46341,46341,46341,46341,46341,46341,46341,46341,
+ 36410,36410,36410,36410,36410,36410,36410,36410,
+ 25080,25080,25080,25080,25080,25080,25080,25080,
+ 12785,12785,12785,12785,12785,12785,12785,12785
+};
+
+
+#define SSE2_Column_IDCT() { \
+ \
+ movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \
+ movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \
+ \
+ movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \
+ movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \
+ \
+ pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \
+ movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \
+ \
+ pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \
+ movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \
+ \
+ pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \
+ movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \
+ \
+ pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \
+ movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \
+ \
+ /* all registers are in use */ \
+ \
+ paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \
+ paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \
+ \
+ paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \
+ movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \
+ \
+ paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \
+ movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \
+ \
+ pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \
+ paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \
+ \
+ pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \
+ movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \
+ \
+ psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \
+ paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \
+ \
+ pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \
+ movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \
+ \
+ pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \
+ paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \
+ \
+ movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \
+ pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \
+ \
+ psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \
+ movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \
+ \
+ paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \
+ movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \
+ \
+ psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \
+ pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \
+ \
+ paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \
+ pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \
+ \
+ paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \
+ paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \
+ \
+ psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \
+ paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \
+ \
+ paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \
+ pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \
+ \
+ paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \
+ movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \
+ \
+ psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \
+ movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \
+ \
+ movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \
+ pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
+ \
+ paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \
+ movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \
+ \
+ movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \
+ movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \
+ \
+ pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
+ paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \
+ \
+ movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \
+ psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \
+ \
+ paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \
+ psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \
+ \
+ movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \
+ pmulhw_r2r(xmm4, xmm6); /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \
+ \
+ paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \
+ paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \
+ \
+ paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \
+ paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \
+ \
+ pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
+ paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \
+ \
+ psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \
+ paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \
+ \
+ movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \
+ paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \
+ \
+ paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \
+ psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \
+ \
+ paddsw_m2r(*Eight, xmm2); /* Adjust R2 and R1 before shifting */ \
+ paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \
+ \
+ paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \
+ psraw_i2r(4, xmm2); /* xmm2 = op2 */ \
+ \
+ psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \
+ psraw_i2r(4, xmm1); /* xmm1 = op1 */ \
+ \
+ movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \
+ paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \
+ \
+ movdqu_r2m(xmm2, *O(2)); /* Write out op2 */ \
+ paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \
+ \
+ movdqu_r2m(xmm1, *O(1)); /* Write out op1 */ \
+ psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \
+ \
+ paddsw_m2r(*Eight, xmm4); /* Adjust R4 and R3 before shifting */ \
+ paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \
+ \
+ paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \
+ psraw_i2r(4, xmm4); /* xmm4 = op4 */ \
+ \
+ psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \
+ psraw_i2r(4, xmm3); /* xmm3 = op3 */ \
+ \
+ paddsw_m2r(*Eight, xmm6); /* Adjust R6 and R5 before shifting */ \
+ paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \
+ \
+ paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \
+ psraw_i2r(4, xmm6); /* xmm6 = op6 */ \
+ \
+ movdqu_r2m(xmm4, *O(4)); /* Write out op4 */ \
+ psraw_i2r(4, xmm5); /* xmm5 = op5 */ \
+ \
+ movdqu_r2m(xmm3, *O(3)); /* Write out op3 */ \
+ psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \
+ \
+ paddsw_m2r(*Eight, xmm7); /* Adjust R7 and R0 before shifting */ \
+ paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \
+ \
+ paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. */ \
+ psraw_i2r(4, xmm7); /* xmm7 = op7 */ \
+ \
+ movdqu_r2m(xmm6, *O(6)); /* Write out op6 */ \
+ psraw_i2r(4, xmm0); /* xmm0 = op0 */ \
+ \
+ movdqu_r2m(xmm5, *O(5)); /* Write out op5 */ \
+ movdqu_r2m(xmm7, *O(7)); /* Write out op7 */ \
+ \
+ movdqu_r2m(xmm0, *O(0)); /* Write out op0 */ \
+ \
+} /* End of SSE2_Column_IDCT macro */
+
+
+#define SSE2_Row_IDCT() { \
+ \
+ movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \
+ movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \
+ \
+ movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \
+ movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \
+ \
+ pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \
+ movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \
+ \
+ pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \
+ movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \
+ \
+ pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \
+ movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \
+ \
+ pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \
+ movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \
+ \
+ /* all registers are in use */ \
+ \
+ paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \
+ paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \
+ \
+ paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \
+ movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \
+ \
+ paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \
+ movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \
+ \
+ pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \
+ paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \
+ \
+ pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \
+ movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \
+ \
+ psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \
+ paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \
+ \
+ pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \
+ movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \
+ \
+ pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \
+ paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \
+ \
+ movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \
+ pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \
+ \
+ psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \
+ movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \
+ \
+ paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \
+ movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \
+ \
+ psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \
+ pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \
+ \
+ paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \
+ pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \
+ \
+ paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \
+ paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \
+ \
+ psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \
+ paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \
+ \
+ paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \
+ pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \
+ \
+ paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \
+ movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \
+ \
+ psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \
+ movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \
+ \
+ movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \
+ pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
+ \
+ paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \
+ movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \
+ \
+ movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \
+ movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \
+ \
+ pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
+ paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \
+ \
+ movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \
+ psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \
+ \
+ paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \
+ psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \
+ \
+ movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \
+ pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \
+ \
+ paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \
+ paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \
+ \
+ paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \
+ paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \
+ \
+ pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
+ paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \
+ \
+ psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \
+ paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \
+ \
+ movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \
+ paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \
+ \
+ paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \
+ psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \
+ \
+ paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \
+ paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \
+ \
+ psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \
+ \
+ movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \
+ paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \
+ \
+ movdqu_r2m(xmm2, *I(2)); /* Write out op2 */ \
+ paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \
+ \
+ movdqu_r2m(xmm1, *I(1)); /* Write out op1 */ \
+ psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \
+ \
+ paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \
+ \
+ paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \
+ \
+ psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \
+ \
+ paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \
+ \
+ paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \
+ \
+ movdqu_r2m(xmm4, *I(4)); /* Write out op4 */ \
+ \
+ movdqu_r2m(xmm3, *I(3)); /* Write out op3 */ \
+ psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \
+ \
+ paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \
+ \
+ paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. */ \
+ \
+ movdqu_r2m(xmm6, *I(6)); /* Write out op6 */ \
+ \
+ movdqu_r2m(xmm5, *I(5)); /* Write out op5 */ \
+ movdqu_r2m(xmm7, *I(7)); /* Write out op7 */ \
+ \
+ movdqu_r2m(xmm0, *I(0)); /* Write out op0 */ \
+ \
+} /* End of SSE2_Row_IDCT macro */
+
+
+#define SSE2_Transpose() { \
+ \
+ movdqu_m2r(*I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */ \
+ movdqu_m2r(*I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */ \
+ \
+ movdqu_r2r(xmm4, xmm5); /* make a copy */ \
+ punpcklwd_r2r(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */ \
+ \
+ punpckhwd_r2r(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */ \
+ movdqu_m2r(*I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */ \
+ \
+ movdqu_m2r(*I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */ \
+ movdqu_r2r(xmm6, xmm7); /* make a copy */ \
+ \
+ punpcklwd_r2r(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */ \
+ punpckhwd_r2r(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */ \
+ \
+ movdqu_r2r(xmm4, xmm3); /* make a copy */ \
+ punpckldq_r2r(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */ \
+ \
+ punpckhdq_r2r(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */ \
+ movdqu_r2m(xmm3, *I(6)); /* save h3g3f3e3h2g2f2e2 */ \
+ /* Free xmm6 */ \
+ movdqu_r2r(xmm5, xmm6); /* make a copy */ \
+ punpckldq_r2r(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */ \
+ \
+ punpckhdq_r2r(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */ \
+ movdqu_m2r(*I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */ \
+ /* Free xmm7 */ \
+ movdqu_m2r(*I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */ \
+ movdqu_r2r(xmm0, xmm7); /* make a copy */ \
+ \
+ punpcklwd_r2r(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */ \
+ punpckhwd_r2r(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */ \
+ /* Free xmm1 */ \
+ movdqu_m2r(*I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */ \
+ movdqu_m2r(*I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */ \
+ \
+ movdqu_r2r(xmm2, xmm1); /* make a copy */ \
+ punpcklwd_r2r(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */ \
+ \
+ punpckhwd_r2r(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */ \
+ movdqu_r2r(xmm0, xmm3); /* make a copy */ \
+ \
+ punpckldq_r2r(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */ \
+ punpckhdq_r2r(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */ \
+ /* Free xmm2 */ \
+ movdqu_r2r(xmm7, xmm2); /* make a copy */ \
+ punpckldq_r2r(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */ \
+ \
+ punpckhdq_r2r(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */ \
+ movdqu_r2r(xmm0, xmm1); /* make a copy */ \
+ \
+ punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \
+ punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \
+ \
+ movdqu_r2m(xmm0, *I(0)); /* save I(0) */ \
+ movdqu_r2m(xmm1, *I(1)); /* save I(1) */ \
+ \
+ movdqu_m2r(*I(6), xmm0); /* load h3g3f3e3h2g2f2e2 */ \
+ movdqu_r2r(xmm3, xmm1); /* make a copy */ \
+ \
+ punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \
+ punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \
+ \
+ movdqu_r2r(xmm2, xmm4); /* make a copy */ \
+ punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \
+ \
+ punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \
+ movdqu_r2m(xmm1, *I(2)); /* save I(2) */ \
+ \
+ movdqu_r2m(xmm3, *I(3)); /* save I(3) */ \
+ movdqu_r2m(xmm4, *I(4)); /* save I(4) */ \
+ \
+ movdqu_r2m(xmm2, *I(5)); /* save I(5) */ \
+ movdqu_r2r(xmm7, xmm5); /* make a copy */ \
+ \
+ punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \
+ punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */ \
+ \
+ movdqu_r2m(xmm5, *I(6)); /* save I(6) */ \
+ movdqu_r2m(xmm7, *I(7)); /* save I(7) */ \
+ \
+} /* End of Transpose Macro */
+
+
+#define SSE2_Dequantize() { \
+ movdqu_m2r(*(eax), xmm0); \
+ \
+ pmullw_m2r(*(ebx), xmm0); /* xmm0 = 07 06 05 04 03 02 01 00 */ \
+ movdqu_m2r(*(eax + 16), xmm1); \
+ \
+ pmullw_m2r(*(ebx + 16), xmm1); /* xmm1 = 17 16 15 14 13 12 11 10 */ \
+ pshuflw_r2r(xmm0, xmm3, 0x078); /* xmm3 = 07 06 05 04 01 03 02 00 */ \
+ \
+ movdqu_r2r(xmm1, xmm2); /* xmm2 = 17 16 15 14 13 12 11 10 */ \
+ movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \
+ \
+ movdqu_m2r(*(eax + 32), xmm4); \
+ movdqu_m2r(*(eax + 64), xmm5); \
+ \
+ pmullw_m2r(*(ebx + 32), xmm4); /* xmm4 = 27 26 25 24 23 22 21 20 */ \
+ pmullw_m2r(*(ebx + 64), xmm5); /* xmm5 = 47 46 45 44 43 42 41 40 */ \
+ \
+ movdqu_m2r(*(ecx + 16), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \
+ pand_r2r(xmm2, xmm7); /* xmm7 = -- -- -- -- -- 12 11 -- */ \
+ \
+ pand_r2r(xmm4, xmm6); /* xmm6 = -- -- 25 24 -- -- -- -- */ \
+ pxor_r2r(xmm7, xmm2); /* xmm2 = 17 16 15 14 13 -- -- 10 */ \
+ \
+ pxor_r2r(xmm6, xmm4); /* xmm4 = 27 26 -- -- 23 22 21 20 */ \
+ pslldq_i2r(4, xmm7); /* xmm7 = -- -- -- 12 11 -- -- -- */ \
+ \
+ pslldq_i2r(2, xmm6); /* xmm6 = -- 25 24 -- -- -- -- -- */ \
+ por_r2r(xmm6, xmm7); /* xmm7 = -- 25 24 12 11 -- -- -- */ \
+ \
+ movdqu_m2r(*(ecx + 32), xmm0); /* xmm0 = -- -- -- -- -- FF FF FF */ \
+ movdqu_m2r(*(ecx + 48), xmm6); /* xmm6 = -- -- -- -- FF -- -- -- */ \
+ \
+ pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- -- -- 03 02 00 */ \
+ pand_r2r(xmm5, xmm6); /* xmm6 = -- -- -- -- 43 -- -- -- */ \
+ \
+ pxor_r2r(xmm0, xmm3); /* xmm3 = 07 06 05 04 01 -- -- -- */ \
+ pxor_r2r(xmm6, xmm5); /* xmm5 = 47 46 45 44 -- 42 41 40 */ \
+ \
+ por_r2r(xmm7, xmm0); /* xmm0 = -- 25 24 12 11 03 02 00 */ \
+ pslldq_i2r(8, xmm6); /* xmm6 = 43 -- -- -- -- -- -- -- */ \
+ \
+ por_r2r(xmm6, xmm0); /* xmm0 = 43 25 24 12 11 03 02 00 */ \
+ /* 02345 in use */ \
+ \
+ movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \
+ pshuflw_r2r(xmm5, xmm5, 0x0B4); /* xmm5 = 47 46 45 44 42 -- 41 40 */ \
+ \
+ movdqu_r2r(xmm1, xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \
+ movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
+ \
+ movdqu_r2m(xmm0, *(eax)); /* write 43 25 24 12 11 03 02 00 */ \
+ pshufhw_r2r(xmm4, xmm4, 0x0C2); /* xmm4 = 27 -- -- 26 23 22 21 20 */ \
+ \
+ pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- 26 23 -- -- -- */ \
+ pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 44 42 -- -- -- */ \
+ \
+ pxor_r2r(xmm7, xmm4); /* xmm4 = 27 -- -- -- -- 22 21 20 */ \
+ pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 45 -- -- -- 41 40 */ \
+ \
+ pshuflw_r2r(xmm2, xmm2, 0x0C6); /* xmm2 = 17 16 15 14 13 10 -- -- */ \
+ movdqu_r2r(xmm6, xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \
+ \
+ pslldq_i2r(2, xmm7); /* xmm7 = -- -- 26 23 -- -- -- -- */ \
+ pslldq_i2r(6, xmm1); /* xmm1 = 44 42 -- -- -- -- -- -- */ \
+ \
+ psrldq_i2r(2, xmm0); /* xmm0 = -- -- -- -- FF FF -- -- */ \
+ pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- 04 01 -- -- -- */ \
+ \
+ pand_r2r(xmm2, xmm0); /* xmm0 = -- -- -- -- 13 10 -- -- */ \
+ pxor_r2r(xmm6, xmm3); /* xmm3 = 07 06 05 -- -- -- -- -- */ \
+ \
+ pxor_r2r(xmm0, xmm2); /* xmm2 = 17 16 15 14 -- -- -- -- */ \
+ psrldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- -- -- 04 01 */ \
+ \
+ por_r2r(xmm7, xmm1); /* xmm1 = 44 42 26 23 -- -- -- -- */ \
+ por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- 13 10 04 01 */ \
+ /* 12345 in use */ \
+ por_r2r(xmm0, xmm1); /* xmm1 = 44 42 26 23 13 10 04 01 */ \
+ pshuflw_r2r(xmm4, xmm4, 0x093); /* xmm4 = 27 -- -- -- 22 21 20 -- */ \
+ \
+ pshufhw_r2r(xmm4, xmm4, 0x093); /* xmm4 = -- -- -- 27 22 21 20 -- */ \
+ movdqu_r2m(xmm1, *(eax + 16)); /* write 44 42 26 23 13 10 04 01 */ \
+ \
+ pshufhw_r2r(xmm3, xmm3, 0x0D2); /* xmm3 = 07 05 -- 06 -- -- -- -- */ \
+ movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \
+ \
+ pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 06 -- -- -- -- */ \
+ psrldq_i2r(12, xmm3); /* xmm3 = -- -- -- -- -- -- 07 05 */ \
+ \
+ psrldq_i2r(8, xmm0); /* xmm0 = -- -- -- -- -- -- -- 06 */ \
+ \
+ movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
+ movdqu_m2r(*(ecx + 96), xmm7); /* xmm7 = -- -- -- -- FF FF -- -- */ \
+ \
+ pand_r2r(xmm4, xmm6); /* xmm6 = -- -- -- 27 22 -- -- -- */ \
+ pxor_r2r(xmm6, xmm4); /* xmm4 = -- -- -- -- -- 21 20 -- */ \
+ \
+ por_r2r(xmm6, xmm3); /* xmm3 = -- -- -- 27 22 -- 07 05 */ \
+ pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- -- -- 21 -- -- */ \
+ \
+ por_r2r(xmm7, xmm0); /* xmm0 = -- -- -- -- -- 21 -- 06 */ \
+ pxor_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- -- 20 -- */ \
+ \
+ movdqu_m2r(*(ecx + 16 ), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \
+ movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \
+ \
+ pand_r2r(xmm2, xmm6); /* xmm6 = -- -- 15 14 -- -- -- -- */ \
+ pand_r2r(xmm6, xmm1); /* xmm1 = -- -- -- 14 -- -- -- -- */ \
+ \
+ pxor_r2r(xmm6, xmm2); /* xmm2 = 17 16 -- -- -- -- -- -- */ \
+ pxor_r2r(xmm1, xmm6); /* xmm6 = -- -- 15 -- -- -- -- -- */ \
+ \
+ psrldq_i2r(4, xmm1); /* xmm1 = -- -- -- -- -- 14 -- -- */ \
+ \
+ psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- 15 -- */ \
+ por_r2r(xmm1, xmm3); /* xmm3 = -- -- -- 27 22 14 07 05 */ \
+ \
+ por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- -- 21 15 06 */ \
+ pshufhw_r2r(xmm5, xmm5, 0x0E1); /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \
+ \
+ movdqu_m2r(*(ecx + 64), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \
+ pshuflw_r2r(xmm5, xmm5, 0x072); /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \
+ \
+ movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
+ pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 45 41 -- -- -- */ \
+ \
+ pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 -- -- -- -- 40 -- */ \
+ pslldq_i2r(4, xmm1); /* xmm1 = -- 45 41 -- -- -- -- -- */ \
+ \
+ pshufd_r2r(xmm5, xmm5, 0x09C); /* xmm5 = -- -- -- -- 47 46 40 -- */ \
+ por_r2r(xmm1, xmm3); /* xmm3 = -- 45 41 27 22 14 07 05 */ \
+ \
+ movdqu_m2r(*(eax + 96), xmm1); /* xmm1 = 67 66 65 64 63 62 61 60 */ \
+ pmullw_m2r(*(ebx + 96), xmm1); \
+ \
+ movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \
+ \
+ psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \
+ pand_r2r(xmm5, xmm7); /* xmm7 = -- -- -- -- -- 46 40 -- */ \
+ \
+ pand_r2r(xmm1, xmm6); /* xmm6 = -- -- -- -- -- -- -- 60 */ \
+ pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- -- -- 47 -- -- -- */ \
+ \
+ pxor_r2r(xmm6, xmm1); /* xmm1 = 67 66 65 64 63 62 61 -- */ \
+ pslldq_i2r(2, xmm5); /* xmm5 = -- -- -- 47 -- -- -- -- */ \
+ \
+ pslldq_i2r(14, xmm6); /* xmm6 = 60 -- -- -- -- -- -- -- */ \
+ por_r2r(xmm5, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 -- */ \
+ \
+ por_r2r(xmm6, xmm3); /* xmm3 = 60 45 41 27 22 14 07 05 */ \
+ pslldq_i2r(6, xmm7); /* xmm7 = -- -- 46 40 -- -- -- -- */ \
+ \
+ movdqu_r2m(xmm3, *(eax+32)); /* write 60 45 41 27 22 14 07 05 */ \
+ por_r2r(xmm7, xmm0); /* xmm0 = -- -- 46 40 -- 21 15 06 */ \
+ /* 0, 1, 2, 4 in use */ \
+ movdqu_m2r(*(eax + 48), xmm3); /* xmm3 = 37 36 35 34 33 32 31 30 */ \
+ movdqu_m2r(*(eax + 80), xmm5); /* xmm5 = 57 56 55 54 53 52 51 50 */ \
+ \
+ pmullw_m2r(*(ebx + 48), xmm3); \
+ pmullw_m2r(*(ebx + 80), xmm5); \
+ \
+ movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
+ movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \
+ \
+ psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \
+ pslldq_i2r(8, xmm7); /* xmm7 = FF -- -- -- -- -- -- -- */ \
+ \
+ pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- -- 30 */ \
+ pand_r2r(xmm5, xmm7); /* xmm7 = 57 -- -- -- -- -- -- -- */ \
+ \
+ pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 31 -- */ \
+ pxor_r2r(xmm7, xmm5); /* xmm5 = __ 56 55 54 53 52 51 50 */ \
+ \
+ pslldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- 30 -- -- -- */ \
+ psrldq_i2r(2, xmm7); /* xmm7 = -- 57 -- -- -- -- -- -- */ \
+ \
+ por_r2r(xmm7, xmm6); /* xmm6 = -- 57 -- -- 30 -- -- -- */ \
+ movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \
+ \
+ por_r2r(xmm6, xmm0); /* xmm0 = -- 57 46 40 30 21 15 06 */ \
+ psrldq_i2r(2, xmm7); /* xmm7 = -- -- -- -- -- -- FF FF */ \
+ \
+ movdqu_r2r(xmm2, xmm6); /* xmm6 = 17 16 -- -- -- -- -- -- */ \
+ pand_r2r(xmm1, xmm7); /* xmm7 = -- -- -- -- -- -- 61 -- */ \
+ \
+ pslldq_i2r(2, xmm6); /* xmm6 = 16 -- -- -- -- -- -- -- */ \
+ psrldq_i2r(14, xmm2); /* xmm2 = -- -- -- -- -- -- -- 17 */ \
+ \
+ pxor_r2r(xmm7, xmm1); /* xmm1 = 67 66 65 64 63 62 -- -- */ \
+ pslldq_i2r(12, xmm7); /* xmm7 = 61 -- -- -- -- -- -- -- */ \
+ \
+ psrldq_i2r(14, xmm6); /* xmm6 = -- -- -- -- -- -- -- 16 */ \
+ por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 16 */ \
+ \
+ por_r2r(xmm7, xmm0); /* xmm0 = 61 57 46 40 30 21 15 06 */ \
+ movdqu_m2r(*(ecx), xmm6); /* xmm6 = -- -- -- -- -- FF FF -- */ \
+ \
+ psrldq_i2r(2, xmm6); /* xmm6 = -- -- -- -- -- -- FF FF */ \
+ movdqu_r2m(xmm0, *(eax+48)); /* write 61 57 46 40 30 21 15 06 */ \
+ /* 1, 2, 3, 4, 5 in use */\
+ movdqu_m2r(*(ecx), xmm0); /* xmm0 = -- -- -- -- -- FF FF -- */ \
+ pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- 31 -- */ \
+ \
+ movdqu_r2r(xmm3, xmm7); /* xmm7 = 37 36 35 34 33 32 31 -- */ \
+ pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 -- -- */ \
+ \
+ pslldq_i2r(2, xmm3); /* xmm3 = 36 35 34 33 32 -- -- -- */ \
+ pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- -- 62 -- -- */ \
+ \
+ psrldq_i2r(14, xmm7); /* xmm7 = -- -- -- -- -- -- -- 37 */ \
+ pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 63 -- -- -- */ \
+ \
+ por_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- 31 37 */ \
+ movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \
+ \
+ pshuflw_r2r(xmm6, xmm6, 0x01E); /* xmm6 = -- -- -- -- 37 31 -- -- */ \
+ pslldq_i2r(6, xmm7); /* xmm7 = FF FF -- -- -- -- -- -- */ \
+ \
+ por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 37 31 20 16 */ \
+ pand_r2r(xmm5, xmm7); /* xmm7 = -- 56 -- -- -- -- -- -- */ \
+ \
+ pslldq_i2r(8, xmm0); /* xmm0 = -- 62 -- -- -- -- -- -- */ \
+ pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- 55 54 53 52 51 50 */ \
+ \
+ psrldq_i2r(2, xmm7); /* xmm7 = -- -- 56 -- -- -- -- -- */ \
+ \
+ pshufhw_r2r(xmm3, xmm3, 0x087); /* xmm3 = 35 33 34 36 32 -- -- -- */ \
+ por_r2r(xmm7, xmm0); /* xmm0 = -- 62 56 -- -- -- -- -- */ \
+ \
+ movdqu_m2r(*(eax + 112), xmm7); /* xmm7 = 77 76 75 74 73 72 71 70 */ \
+ pmullw_m2r(*(ebx + 112), xmm7); \
+ \
+ movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
+ por_r2r(xmm0, xmm4); /* xmm4 = -- 62 56 47 37 31 20 16 */ \
+ \
+ pshuflw_r2r(xmm7, xmm7, 0x0E1); /* xmm7 = 77 76 75 74 73 72 70 71 */ \
+ psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \
+ \
+ movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \
+ pand_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- -- 71 */ \
+ \
+ pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 36 32 -- -- -- */ \
+ pxor_r2r(xmm6, xmm7); /* xmm7 = 77 76 75 74 73 72 70 -- */ \
+ \
+ pxor_r2r(xmm0, xmm3); /* xmm3 = 35 33 34 -- -- -- -- -- */ \
+ pslldq_i2r(14, xmm6); /* xmm6 = 71 -- -- -- -- -- -- -- */ \
+ \
+ psrldq_i2r(4, xmm0); /* xmm0 = -- -- -- -- -- 36 32 -- */ \
+ por_r2r(xmm6, xmm4); /* xmm4 = 71 62 56 47 37 31 20 16 */ \
+ \
+ por_r2r(xmm0, xmm2); /* xmm2 = -- -- -- -- -- 36 32 17 */ \
+ movdqu_r2m(xmm4, *(eax + 64)); /* write 71 62 56 47 37 31 20 16 */ \
+ /* 1, 2, 3, 5, 7 in use */ \
+ movdqu_m2r(*(ecx + 80), xmm6); /* xmm6 = -- -- FF -- -- -- -- FF */ \
+ pshufhw_r2r(xmm7, xmm7, 0x0D2); /* xmm7 = 77 75 74 76 73 72 70 __ */ \
+ \
+ movdqu_m2r(*(ecx), xmm4); /* xmm4 = -- -- -- -- -- FF FF -- */ \
+ movdqu_m2r(*(ecx+48), xmm0); /* xmm0 = -- -- -- -- FF -- -- -- */ \
+ \
+ pand_r2r(xmm5, xmm6); /* xmm6 = -- -- 55 -- -- -- -- 50 */ \
+ pand_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- 72 70 -- */ \
+ \
+ pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- 63 -- -- -- */ \
+ pxor_r2r(xmm6, xmm5); /* xmm5 = -- -- -- 54 53 52 51 -- */ \
+ \
+ pxor_r2r(xmm4, xmm7); /* xmm7 = 77 75 74 76 73 -- -- -- */ \
+ pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 -- -- -- -- */ \
+ \
+ pshuflw_r2r(xmm6, xmm6, 0x02B); /* xmm6 = -- -- 55 -- 50 -- -- -- */ \
+ pslldq_i2r(10, xmm4); /* xmm4 = 72 70 -- -- -- -- -- -- */ \
+ \
+ pshufhw_r2r(xmm6, xmm6, 0x0B1); /* xmm6 = -- -- -- 55 50 -- -- -- */ \
+ pslldq_i2r(4, xmm0); /* xmm0 = -- -- 63 -- -- -- -- -- */ \
+ \
+ por_r2r(xmm4, xmm6); /* xmm6 = 72 70 -- 55 50 -- -- -- */ \
+ por_r2r(xmm0, xmm2); /* xmm2 = -- -- 63 -- -- 36 32 17 */ \
+ \
+ por_r2r(xmm6, xmm2); /* xmm2 = 72 70 63 55 50 36 32 17 */ \
+ pshufhw_r2r(xmm1, xmm1, 0x0C9); /* xmm1 = 67 64 66 65 -- -- -- -- */ \
+ \
+ movdqu_r2r(xmm3, xmm6); /* xmm6 = 35 33 34 -- -- -- -- -- */ \
+ movdqu_r2m(xmm2, *(eax+80)); /* write 72 70 63 55 50 36 32 17 */ \
+ \
+ psrldq_i2r(12, xmm6); /* xmm6 = -- -- -- -- -- -- 35 33 */ \
+ pslldq_i2r(4, xmm3); /* xmm3 = 34 -- -- -- -- -- -- -- */ \
+ \
+ pshuflw_r2r(xmm5, xmm5, 0x04E); /* xmm5 = -- -- -- 54 51 -- 53 52 */ \
+ movdqu_r2r(xmm7, xmm4); /* xmm4 = 77 75 74 76 73 -- -- -- */ \
+ \
+ movdqu_r2r(xmm5, xmm2); /* xmm2 = -- -- -- 54 51 -- 53 52 */ \
+ psrldq_i2r(10, xmm7); /* xmm7 = -- -- -- -- -- 77 75 74 */ \
+ \
+ pslldq_i2r(6, xmm4); /* xmm4 = 76 73 -- -- -- -- -- -- */ \
+ pslldq_i2r(12, xmm2); /* xmm2 = 53 52 -- -- -- -- -- -- */ \
+ \
+ movdqu_r2r(xmm1, xmm0); /* xmm0 = 67 64 66 65 -- -- -- -- */ \
+ psrldq_i2r(12, xmm1); /* xmm1 = -- -- -- -- -- -- 67 64 */ \
+ \
+ psrldq_i2r(6, xmm5); /* xmm5 = -- -- -- -- -- -- 54 51 */ \
+ psrldq_i2r(14, xmm3); /* xmm3 = -- -- -- -- -- -- -- 34 */ \
+ \
+ pslldq_i2r(10, xmm7); /* xmm7 = 77 75 74 -- -- -- -- -- */ \
+ por_r2r(xmm6, xmm4); /* xmm4 = 76 73 -- -- -- -- 35 33 */ \
+ \
+ psrldq_i2r(10, xmm2); /* xmm2 = -- -- -- -- -- 53 52 -- */ \
+ pslldq_i2r(4, xmm0); /* xmm0 = 66 65 -- -- -- -- -- -- */ \
+ \
+ pslldq_i2r(8, xmm1); /* xmm1 = -- -- 67 64 -- -- -- -- */ \
+ por_r2r(xmm7, xmm3); /* xmm3 = 77 75 74 -- -- -- -- 34 */ \
+ \
+ psrldq_i2r(6, xmm0); /* xmm0 = -- -- -- 66 65 -- -- -- */ \
+ pslldq_i2r(4, xmm5); /* xmm5 = -- -- -- -- 54 51 -- -- */ \
+ \
+ por_r2r(xmm1, xmm4); /* xmm4 = 76 73 67 64 -- -- 35 33 */ \
+ por_r2r(xmm2, xmm3); /* xmm3 = 77 75 74 -- -- 53 52 34 */ \
+ \
+ por_r2r(xmm5, xmm4); /* xmm4 = 76 73 67 64 54 51 35 33 */ \
+ por_r2r(xmm0, xmm3); /* xmm3 = 77 75 74 66 65 53 52 34 */ \
+ \
+ movdqu_r2m(xmm4, *(eax+96)); /* write 76 73 67 64 54 51 35 33 */ \
+ movdqu_r2m(xmm3, *(eax+112)); /* write 77 75 74 66 65 53 52 34 */ \
+ \
+} /* end of SSE2_Dequantize Macro */
+
+
+void vp3_dsp_init_sse2(void)
+{
+ /* nop */
+}
+
+
+static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
+ int16_t *output_data)
+{
+ unsigned char *input_bytes = (unsigned char *)input_data;
+ unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
+ unsigned char *dequant_const_bytes = (unsigned char *)SSE2_dequant_const;
+ unsigned char *output_data_bytes = (unsigned char *)output_data;
+ unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data;
+ unsigned char *Eight = (unsigned char *)eight_data;
+
+#define eax input_bytes
+#define ebx dequant_matrix_bytes
+#define ecx dequant_const_bytes
+#define edx idct_data_bytes
+
+#define I(i) (eax + 16 * i)
+#define O(i) (ebx + 16 * i)
+#define C(i) (edx + 16 * (i-1))
+
+ SSE2_Dequantize();
+
+#undef ebx
+#define ebx output_data_bytes
+
+ SSE2_Row_IDCT();
+
+ SSE2_Transpose();
+
+ SSE2_Column_IDCT();
+}
+
+
+void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
+ int coeff_count, uint8_t *dest, int stride)
+{
+ int16_t transformed_data[64];
+ int16_t *op;
+ int i, j;
+
+ vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
+
+ /* place in final output */
+ op = transformed_data;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ if (*op < -128)
+ *dest = 0;
+ else if (*op > 127)
+ *dest = 255;
+ else
+ *dest = (uint8_t)(*op + 128);
+ op++;
+ dest++;
+ }
+ dest += (stride - 8);
+ }
+}
+
+
+void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
+ int coeff_count, uint8_t *dest, int stride)
+{
+ int16_t transformed_data[64];
+ int16_t *op;
+ int i, j;
+ int16_t sample;
+
+ vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
+
+ /* place in final output */
+ op = transformed_data;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ sample = *dest + *op;
+ if (sample < 0)
+ *dest = 0;
+ else if (sample > 255)
+ *dest = 255;
+ else
+ *dest = (uint8_t)(sample & 0xFF);
+ op++;
+ dest++;
+ }
+ dest += (stride - 8);
+ }
+}
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index a18645e33..14fdb1059 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -45,7 +45,10 @@
#define LINE_BUF_HEIGHT (NB_TAPS * 4)
struct ImgReSampleContext {
- int iwidth, iheight, owidth, oheight, topBand, bottomBand, leftBand, rightBand;
+ int iwidth, iheight, owidth, oheight;
+ int topBand, bottomBand, leftBand, rightBand;
+ int padtop, padbottom, padleft, padright;
+ int pad_owidth, pad_oheight;
int h_incr, v_incr;
int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
@@ -532,6 +535,7 @@ static void component_resample(ImgReSampleContext *s,
&s->v_filters[phase_y][0]);
src_y += s->v_incr;
+
output += owrap;
}
}
@@ -572,13 +576,16 @@ static void build_filter(int16_t *filter, float factor)
ImgReSampleContext *img_resample_init(int owidth, int oheight,
int iwidth, int iheight)
{
- return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0);
+ return img_resample_full_init(owidth, oheight, iwidth, iheight,
+ 0, 0, 0, 0, 0, 0, 0, 0);
}
ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
int iwidth, int iheight,
int topBand, int bottomBand,
- int leftBand, int rightBand)
+ int leftBand, int rightBand,
+ int padtop, int padbottom,
+ int padleft, int padright)
{
ImgReSampleContext *s;
@@ -593,19 +600,30 @@ ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
s->oheight = oheight;
s->iwidth = iwidth;
s->iheight = iheight;
+
s->topBand = topBand;
s->bottomBand = bottomBand;
s->leftBand = leftBand;
s->rightBand = rightBand;
- s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth;
- s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight;
-
- build_filter(&s->h_filters[0][0], (float) owidth / (float) (iwidth - leftBand - rightBand));
- build_filter(&s->v_filters[0][0], (float) oheight / (float) (iheight - topBand - bottomBand));
+ s->padtop = padtop;
+ s->padbottom = padbottom;
+ s->padleft = padleft;
+ s->padright = padright;
+
+ s->pad_owidth = owidth - (padleft + padright);
+ s->pad_oheight = oheight - (padtop + padbottom);
+
+ s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth;
+ s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / s->pad_oheight;
+
+ build_filter(&s->h_filters[0][0], (float) s->pad_owidth /
+ (float) (iwidth - leftBand - rightBand));
+ build_filter(&s->v_filters[0][0], (float) s->pad_oheight /
+ (float) (iheight - topBand - bottomBand));
return s;
- fail:
+fail:
av_free(s);
return NULL;
}
@@ -614,13 +632,20 @@ void img_resample(ImgReSampleContext *s,
AVPicture *output, const AVPicture *input)
{
int i, shift;
+ uint8_t* optr;
- for(i=0;i<3;i++) {
+ for (i=0;i<3;i++) {
shift = (i == 0) ? 0 : 1;
- component_resample(s, output->data[i], output->linesize[i],
- s->owidth >> shift, s->oheight >> shift,
- input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift),
- input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift),
+
+ optr = output->data[i] + (((output->linesize[i] *
+ s->padtop) + s->padleft) >> shift);
+
+ component_resample(s, optr, output->linesize[i],
+ s->pad_owidth >> shift, s->pad_oheight >> shift,
+ input->data[i] + (input->linesize[i] *
+ (s->topBand >> shift)) + (s->leftBand >> shift),
+ input->linesize[i], ((s->iwidth - s->leftBand -
+ s->rightBand) >> shift),
(s->iheight - s->topBand - s->bottomBand) >> shift);
}
}
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index 30029d40c..255a82d2c 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -862,13 +862,11 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
memset(&s2, 0, sizeof(MpegEncContext));
s2.avctx= avctx;
// s2->out_format = FMT_MJPEG;
- s2.width = 8;
- s2.height = 8;
- if (MPV_common_init(&s2) < 0)
- return -1;
+ dsputil_init(&s2.dsp, avctx);
+ DCT_common_init(&s2);
+
s->scantable= s2.intra_scantable;
s->idct_put= s2.dsp.idct_put;
- MPV_common_end(&s2);
s->mpeg_enc_ctx_allocated = 0;
s->buffer_size = 102400; /* smaller buffer should be enough,
@@ -1532,15 +1530,22 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
if (id == ff_get_fourcc("JFIF"))
{
- int t_w, t_h;
+ int t_w, t_h, v1, v2;
skip_bits(&s->gb, 8); /* the trailing zero-byte */
- av_log(s->avctx, AV_LOG_INFO, "mjpeg: JFIF header found (version: %x.%x)\n",
- get_bits(&s->gb, 8), get_bits(&s->gb, 8));
+ v1= get_bits(&s->gb, 8);
+ v2= get_bits(&s->gb, 8);
skip_bits(&s->gb, 8);
s->avctx->sample_aspect_ratio.num= get_bits(&s->gb, 16);
s->avctx->sample_aspect_ratio.den= get_bits(&s->gb, 16);
+ if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+ av_log(s->avctx, AV_LOG_INFO, "mjpeg: JFIF header found (version: %x.%x) SAR=%d/%d\n",
+ v1, v2,
+ s->avctx->sample_aspect_ratio.num,
+ s->avctx->sample_aspect_ratio.den
+ );
+
t_w = get_bits(&s->gb, 8);
t_h = get_bits(&s->gb, 8);
if (t_w && t_h)
@@ -1555,7 +1560,8 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
if (id == ff_get_fourcc("Adob") && (get_bits(&s->gb, 8) == 'e'))
{
- av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
+ if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+ av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
skip_bits(&s->gb, 16); /* version */
skip_bits(&s->gb, 16); /* flags0 */
skip_bits(&s->gb, 16); /* flags1 */
@@ -1565,7 +1571,8 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
}
if (id == ff_get_fourcc("LJIF")){
- av_log(s->avctx, AV_LOG_INFO, "Pegasus lossless jpeg header found\n");
+ if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+ av_log(s->avctx, AV_LOG_INFO, "Pegasus lossless jpeg header found\n");
skip_bits(&s->gb, 16); /* version ? */
skip_bits(&s->gb, 16); /* unknwon always 0? */
skip_bits(&s->gb, 16); /* unknwon always 0? */
@@ -1604,7 +1611,7 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
skip_bits(&s->gb, 32); /* scan off */
skip_bits(&s->gb, 32); /* data off */
#endif
- if (s->first_picture)
+ if (s->avctx->debug & FF_DEBUG_PICT_INFO)
av_log(s->avctx, AV_LOG_INFO, "mjpeg: Apple MJPEG-A header found\n");
}
}
@@ -1635,7 +1642,8 @@ static int mjpeg_decode_com(MJpegDecodeContext *s)
else
cbuf[i] = 0;
- av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
+ if(s->avctx->debug & FF_DEBUG_PICT_INFO)
+ av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
/* buggy avid, it puts EOI only at every 10th frame */
if (!strcmp(cbuf, "AVID"))
@@ -1781,13 +1789,12 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
/* process markers */
if (start_code >= 0xd0 && start_code <= 0xd7) {
dprintf("restart marker: %d\n", start_code&0x0f);
- } else if (s->first_picture) {
/* APP fields */
- if (start_code >= 0xe0 && start_code <= 0xef)
- mjpeg_decode_app(s);
+ } else if (start_code >= APP0 && start_code <= APP15) {
+ mjpeg_decode_app(s);
/* Comment */
- else if (start_code == COM)
- mjpeg_decode_com(s);
+ } else if (start_code == COM){
+ mjpeg_decode_com(s);
}
switch(start_code) {
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 5132487cf..f194a4d60 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -33,8 +33,8 @@
#include "dsputil.h"
#include "mpegvideo.h"
-//#undef NDEBUG
-//#include <assert.h>
+#undef NDEBUG
+#include <assert.h>
#define SQ(a) ((a)*(a))
@@ -46,9 +46,8 @@
static inline int sad_hpel_motion_search(MpegEncContext * s,
int *mx_ptr, int *my_ptr, int dmin,
- int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[6], int stride, int uvstride,
- int size, int h, uint8_t * const mv_penalty);
+ int src_index, int ref_index,
+ int size, int h);
static inline int update_map_generation(MpegEncContext * s)
{
@@ -73,209 +72,153 @@ static int minima_cmp(const void *a, const void *b){
return da->height - db->height;
}
-
-/* SIMPLE */
-#define RENAME(a) simple_ ## a
-#define CMP(d, x, y, size)\
-d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride, h);
+#define FLAG_QPEL 1 //must be 1
+#define FLAG_CHROMA 2
+#define FLAG_DIRECT 4
-#define CMP_HPEL(d, dx, dy, x, y, size)\
-{\
- const int dxy= (dx) + 2*(dy);\
- hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, h);\
- d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
-}
-
-
-#define CMP_QPEL(d, dx, dy, x, y, size)\
-{\
- const int dxy= (dx) + 4*(dy);\
- qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\
- d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
+static inline void init_ref(MpegEncContext *s, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
+ MotionEstContext * const c= &s->me;
+ const int offset[3]= {
+ y*c-> stride + x,
+ ((y*c->uvstride + x)>>1),
+ ((y*c->uvstride + x)>>1),
+ };
+ int i;
+ for(i=0; i<3; i++){
+ c->src[0][i]= src [i] + offset[i];
+ c->ref[0][i]= ref [i] + offset[i];
+ }
+ if(ref_index){
+ for(i=0; i<3; i++){
+ c->ref[ref_index][i]= ref2[i] + offset[i];
+ }
+ }
}
-#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-
-/* SIMPLE CHROMA */
-#define RENAME(a) simple_chroma_ ## a
-
-#define CMP(d, x, y, size)\
-d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride, h);\
-if(chroma_cmp){\
- int dxy= ((x)&1) + 2*((y)&1);\
- int c= ((x)>>1) + ((y)>>1)*uvstride;\
-\
- chroma_hpel_put[0][dxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\
- d += chroma_cmp(s, s->me.scratchpad, src_u, uvstride, h>>1);\
- chroma_hpel_put[0][dxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\
- d += chroma_cmp(s, s->me.scratchpad, src_v, uvstride, h>>1);\
+static int get_flags(MpegEncContext *s, int direct, int chroma){
+ return ((s->flags&CODEC_FLAG_QPEL) ? FLAG_QPEL : 0)
+ + (direct ? FLAG_DIRECT : 0)
+ + (chroma ? FLAG_CHROMA : 0);
}
-#define CMP_HPEL(d, dx, dy, x, y, size)\
-{\
- const int dxy= (dx) + 2*(dy);\
- hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, h);\
- d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
- if(chroma_cmp_sub){\
- int cxy= (dxy) | ((x)&1) | (2*((y)&1));\
- int c= ((x)>>1) + ((y)>>1)*uvstride;\
- chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\
- d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride, h>>1);\
- chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\
- d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride, h>>1);\
- }\
-}
+static always_inline int cmp(MpegEncContext *s, const int x, const int y, const int subx, const int suby,
+ const int size, const int h, int ref_index, int src_index,
+ me_cmp_func cmp_func, me_cmp_func chroma_cmp_func, const int flags){
+ MotionEstContext * const c= &s->me;
+ const int stride= c->stride;
+ const int uvstride= c->uvstride;
+ const int qpel= flags&FLAG_QPEL;
+ const int chroma= flags&FLAG_CHROMA;
+ const int dxy= subx + (suby<<(1+qpel)); //FIXME log2_subpel?
+ const int hx= subx + (x<<(1+qpel));
+ const int hy= suby + (y<<(1+qpel));
+ uint8_t * const * const ref= c->ref[ref_index];
+ uint8_t * const * const src= c->src[src_index];
+ int d;
+ //FIXME check chroma 4mv, (no crashes ...)
+ if(flags&FLAG_DIRECT){
+ if(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1)){
+ const int time_pp= s->pp_time;
+ const int time_pb= s->pb_time;
+ const int mask= 2*qpel+1;
+ if(s->mv_type==MV_TYPE_8X8){
+ int i;
+ for(i=0; i<4; i++){
+ int fx = c->direct_basis_mv[i][0] + hx;
+ int fy = c->direct_basis_mv[i][1] + hy;
+ int bx = hx ? fx - c->co_located_mv[i][0] : c->co_located_mv[i][0]*(time_pb - time_pp)/time_pp + ((i &1)<<(qpel+4));
+ int by = hy ? fy - c->co_located_mv[i][1] : c->co_located_mv[i][1]*(time_pb - time_pp)/time_pp + ((i>>1)<<(qpel+4));
+ int fxy= (fx&mask) + ((fy&mask)<<(qpel+1));
+ int bxy= (bx&mask) + ((by&mask)<<(qpel+1));
+
+ uint8_t *dst= c->temp + 8*(i&1) + 8*stride*(i>>1);
+ if(qpel){
+ c->qpel_put[1][fxy](dst, ref[0] + (fx>>2) + (fy>>2)*stride, stride);
+ c->qpel_avg[1][bxy](dst, ref[8] + (bx>>2) + (by>>2)*stride, stride);
+ }else{
+ c->hpel_put[1][fxy](dst, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 8);
+ c->hpel_avg[1][bxy](dst, ref[8] + (bx>>1) + (by>>1)*stride, stride, 8);
+ }
+ }
+ }else{
+ int fx = c->direct_basis_mv[0][0] + hx;
+ int fy = c->direct_basis_mv[0][1] + hy;
+ int bx = hx ? fx - c->co_located_mv[0][0] : (c->co_located_mv[0][0]*(time_pb - time_pp)/time_pp);
+ int by = hy ? fy - c->co_located_mv[0][1] : (c->co_located_mv[0][1]*(time_pb - time_pp)/time_pp);
+ int fxy= (fx&mask) + ((fy&mask)<<(qpel+1));
+ int bxy= (bx&mask) + ((by&mask)<<(qpel+1));
+
+ if(qpel){
+ c->qpel_put[1][fxy](c->temp , ref[0] + (fx>>2) + (fy>>2)*stride , stride);
+ c->qpel_put[1][fxy](c->temp + 8 , ref[0] + (fx>>2) + (fy>>2)*stride + 8 , stride);
+ c->qpel_put[1][fxy](c->temp + 8*stride, ref[0] + (fx>>2) + (fy>>2)*stride + 8*stride, stride);
+ c->qpel_put[1][fxy](c->temp + 8 + 8*stride, ref[0] + (fx>>2) + (fy>>2)*stride + 8 + 8*stride, stride);
+ c->qpel_avg[1][bxy](c->temp , ref[8] + (bx>>2) + (by>>2)*stride , stride);
+ c->qpel_avg[1][bxy](c->temp + 8 , ref[8] + (bx>>2) + (by>>2)*stride + 8 , stride);
+ c->qpel_avg[1][bxy](c->temp + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8*stride, stride);
+ c->qpel_avg[1][bxy](c->temp + 8 + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8 + 8*stride, stride);
+ }else{
+ assert((fx>>1) + 16*s->mb_x >= -16);
+ assert((fy>>1) + 16*s->mb_y >= -16);
+ assert((fx>>1) + 16*s->mb_x <= s->width);
+ assert((fy>>1) + 16*s->mb_y <= s->height);
+ assert((bx>>1) + 16*s->mb_x >= -16);
+ assert((by>>1) + 16*s->mb_y >= -16);
+ assert((bx>>1) + 16*s->mb_x <= s->width);
+ assert((by>>1) + 16*s->mb_y <= s->height);
+
+ c->hpel_put[0][fxy](c->temp, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 16);
+ c->hpel_avg[0][bxy](c->temp, ref[8] + (bx>>1) + (by>>1)*stride, stride, 16);
+ }
+ }
+ d = cmp_func(s, c->temp, src[0], stride, 16);
+ }else
+ d= 256*256*256*32;
+ }else{
+ int uvdxy;
+ if(dxy){
+ if(qpel){
+ c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+ if(chroma){
+ int cx= hx/2;
+ int cy= hy/2;
+ cx= (cx>>1)|(cx&1);
+ cy= (cy>>1)|(cy&1);
+ uvdxy= (cx&1) + 2*(cy&1);
+ //FIXME x/y wrong, but mpeg4 qpel is sick anyway, we should drop as much of it as possible in favor of h264
+ }
+ }else{
+ c->hpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride, h);
+ if(chroma)
+ uvdxy= dxy | (x&1) | (2*(y&1));
+ }
+ d = cmp_func(s, c->temp, src[0], stride, h);
+ }else{
+ d = cmp_func(s, src[0], ref[0] + x + y*stride, stride, h);
+ if(chroma)
+ uvdxy= (x&1) + 2*(y&1);
+ }
+ if(chroma){
+ uint8_t * const uvtemp= c->temp + 16*stride;
+ c->hpel_put[size+1][uvdxy](uvtemp , ref[1] + (x>>1) + (y>>1)*uvstride, uvstride, h>>1);
+ c->hpel_put[size+1][uvdxy](uvtemp+8, ref[2] + (x>>1) + (y>>1)*uvstride, uvstride, h>>1);
+ d += chroma_cmp_func(s, uvtemp , src[1], uvstride, h>>1);
+ d += chroma_cmp_func(s, uvtemp+8, src[2], uvstride, h>>1);
+ }
+ }
+#if 0
+ if(full_pel){
+ const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);
+ score_map[index]= d;
+ }
-#define CMP_QPEL(d, dx, dy, x, y, size)\
-{\
- const int dxy= (dx) + 4*(dy);\
- qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\
- d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
- if(chroma_cmp_sub){\
- int cxy, c;\
- int cx= (4*(x) + (dx))/2;\
- int cy= (4*(y) + (dy))/2;\
- cx= (cx>>1)|(cx&1);\
- cy= (cy>>1)|(cy&1);\
- cxy= (cx&1) + 2*(cy&1);\
- c= ((cx)>>1) + ((cy)>>1)*uvstride;\
- chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\
- d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride, h>>1);\
- chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\
- d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride, h>>1);\
- }\
+ d += (c->mv_penalty[hx - c->pred_x] + c->mv_penalty[hy - c->pred_y])*c->penalty_factor;
+#endif
+ return d;
}
#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-
-/* SIMPLE DIRECT HPEL */
-#define RENAME(a) simple_direct_hpel_ ## a
-//FIXME precalc divisions stuff
-
-#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\
-if((x) >= xmin && 2*(x) + (dx) <= 2*xmax && (y) >= ymin && 2*(y) + (dy) <= 2*ymax){\
- const int hx= 2*(x) + (dx);\
- const int hy= 2*(y) + (dy);\
- if(s->mv_type==MV_TYPE_8X8){\
- int i;\
- for(i=0; i<4; i++){\
- int fx = s->me.direct_basis_mv[i][0] + hx;\
- int fy = s->me.direct_basis_mv[i][1] + hy;\
- int bx = hx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\
- int by = hy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\
- int fxy= (fx&1) + 2*(fy&1);\
- int bxy= (bx&1) + 2*(by&1);\
-\
- uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\
- hpel_put[1][fxy](dst, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 8);\
- hpel_avg[1][bxy](dst, (ref_data[3]) + (bx>>1) + (by>>1)*(stride), stride, 8);\
- }\
- }else{\
- int fx = s->me.direct_basis_mv[0][0] + hx;\
- int fy = s->me.direct_basis_mv[0][1] + hy;\
- int bx = hx ? fx - s->me.co_located_mv[0][0] : (s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp);\
- int by = hy ? fy - s->me.co_located_mv[0][1] : (s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp);\
- int fxy= (fx&1) + 2*(fy&1);\
- int bxy= (bx&1) + 2*(by&1);\
- \
- assert((fx>>1) + 16*s->mb_x >= -16);\
- assert((fy>>1) + 16*s->mb_y >= -16);\
- assert((fx>>1) + 16*s->mb_x <= s->width);\
- assert((fy>>1) + 16*s->mb_y <= s->height);\
- assert((bx>>1) + 16*s->mb_x >= -16);\
- assert((by>>1) + 16*s->mb_y >= -16);\
- assert((bx>>1) + 16*s->mb_x <= s->width);\
- assert((by>>1) + 16*s->mb_y <= s->height);\
-\
- hpel_put[0][fxy](s->me.scratchpad, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 16);\
- hpel_avg[0][bxy](s->me.scratchpad, (ref_data[3]) + (bx>>1) + (by>>1)*(stride), stride, 16);\
- }\
- d = cmp_func(s, s->me.scratchpad, src_y, stride, 16);\
-}else\
- d= 256*256*256*32;
-
-
-#define CMP_HPEL(d, dx, dy, x, y, size)\
- CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub)
-
-#define CMP(d, x, y, size)\
- CMP_DIRECT(d, 0, 0, x, y, size, cmp)
-
-#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-#undef CMP_DIRECT
-
-/* SIMPLE DIRECT QPEL */
-#define RENAME(a) simple_direct_qpel_ ## a
-
-#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\
-if((x) >= xmin && 4*(x) + (dx) <= 4*xmax && (y) >= ymin && 4*(y) + (dy) <= 4*ymax){\
- const int qx= 4*(x) + (dx);\
- const int qy= 4*(y) + (dy);\
- if(s->mv_type==MV_TYPE_8X8){\
- int i;\
- for(i=0; i<4; i++){\
- int fx = s->me.direct_basis_mv[i][0] + qx;\
- int fy = s->me.direct_basis_mv[i][1] + qy;\
- int bx = qx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\
- int by = qy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\
- int fxy= (fx&3) + 4*(fy&3);\
- int bxy= (bx&3) + 4*(by&3);\
-\
- uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\
- qpel_put[1][fxy](dst, (ref_y ) + (fx>>2) + (fy>>2)*(stride), stride);\
- qpel_avg[1][bxy](dst, (ref_data[3]) + (bx>>2) + (by>>2)*(stride), stride);\
- }\
- }else{\
- int fx = s->me.direct_basis_mv[0][0] + qx;\
- int fy = s->me.direct_basis_mv[0][1] + qy;\
- int bx = qx ? fx - s->me.co_located_mv[0][0] : s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp;\
- int by = qy ? fy - s->me.co_located_mv[0][1] : s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp;\
- int fxy= (fx&3) + 4*(fy&3);\
- int bxy= (bx&3) + 4*(by&3);\
-\
- qpel_put[1][fxy](s->me.scratchpad , (ref_y ) + (fx>>2) + (fy>>2)*(stride) , stride);\
- qpel_put[1][fxy](s->me.scratchpad + 8 , (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8 , stride);\
- qpel_put[1][fxy](s->me.scratchpad + 8*stride, (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8*stride, stride);\
- qpel_put[1][fxy](s->me.scratchpad + 8 + 8*stride, (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8 + 8*stride, stride);\
- qpel_avg[1][bxy](s->me.scratchpad , (ref_data[3]) + (bx>>2) + (by>>2)*(stride) , stride);\
- qpel_avg[1][bxy](s->me.scratchpad + 8 , (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8 , stride);\
- qpel_avg[1][bxy](s->me.scratchpad + 8*stride, (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8*stride, stride);\
- qpel_avg[1][bxy](s->me.scratchpad + 8 + 8*stride, (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8 + 8*stride, stride);\
- }\
- d = cmp_func(s, s->me.scratchpad, src_y, stride, 16);\
-}else\
- d= 256*256*256*32;
-
-
-#define CMP_QPEL(d, dx, dy, x, y, size)\
- CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub)
-
-#define CMP(d, x, y, size)\
- CMP_DIRECT(d, 0, 0, x, y, size, cmp)
-
-#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-#undef CMP__DIRECT
static inline int get_penalty_factor(MpegEncContext *s, int type){
switch(type&0xFF){
@@ -297,54 +240,45 @@ static inline int get_penalty_factor(MpegEncContext *s, int type){
}
void ff_init_me(MpegEncContext *s){
+ MotionEstContext * const c= &s->me;
+
ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, s->avctx->me_pre_cmp);
ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
ff_set_cmp(&s->dsp, s->dsp.mb_cmp, s->avctx->mb_cmp);
+
+ s->me.flags = get_flags(s, 0, s->avctx->me_cmp &FF_CMP_CHROMA);
+ s->me.sub_flags= get_flags(s, 0, s->avctx->me_sub_cmp&FF_CMP_CHROMA);
+ s->me.mb_flags = get_flags(s, 0, s->avctx->mb_cmp &FF_CMP_CHROMA);
+/*FIXME s->no_rounding b_type*/
if(s->flags&CODEC_FLAG_QPEL){
- if(s->avctx->me_sub_cmp&FF_CMP_CHROMA)
- s->me.sub_motion_search= simple_chroma_qpel_motion_search;
- else
- s->me.sub_motion_search= simple_qpel_motion_search;
+ s->me.sub_motion_search= qpel_motion_search;
+ c->qpel_avg= s->dsp.avg_qpel_pixels_tab;
+ if(s->no_rounding) c->qpel_put= s->dsp.put_no_rnd_qpel_pixels_tab;
+ else c->qpel_put= s->dsp.put_qpel_pixels_tab;
}else{
if(s->avctx->me_sub_cmp&FF_CMP_CHROMA)
- s->me.sub_motion_search= simple_chroma_hpel_motion_search;
+ s->me.sub_motion_search= hpel_motion_search;
else if( s->avctx->me_sub_cmp == FF_CMP_SAD
&& s->avctx-> me_cmp == FF_CMP_SAD
&& s->avctx-> mb_cmp == FF_CMP_SAD)
s->me.sub_motion_search= sad_hpel_motion_search; // 2050 vs. 2450 cycles
else
- s->me.sub_motion_search= simple_hpel_motion_search;
+ s->me.sub_motion_search= hpel_motion_search;
+ c->hpel_avg= s->dsp.avg_pixels_tab;
+ if(s->no_rounding) c->hpel_put= s->dsp.put_no_rnd_pixels_tab;
+ else c->hpel_put= s->dsp.put_pixels_tab;
}
-
- if(s->avctx->me_cmp&FF_CMP_CHROMA){
- s->me.motion_search[0]= simple_chroma_epzs_motion_search;
- s->me.motion_search[1]= simple_chroma_epzs_motion_search4;
- s->me.motion_search[4]= simple_chroma_epzs_motion_search2;
+ if(s->linesize){
+ s->me.stride = s->linesize;
+ s->me.uvstride= s->uvlinesize;
}else{
- s->me.motion_search[0]= simple_epzs_motion_search;
- s->me.motion_search[1]= simple_epzs_motion_search4;
- s->me.motion_search[4]= simple_epzs_motion_search2;
- }
-
- if(s->avctx->me_pre_cmp&FF_CMP_CHROMA){
- s->me.pre_motion_search= simple_chroma_epzs_motion_search;
- }else{
- s->me.pre_motion_search= simple_epzs_motion_search;
- }
-
- if(s->flags&CODEC_FLAG_QPEL){
- if(s->avctx->mb_cmp&FF_CMP_CHROMA)
- s->me.get_mb_score= simple_chroma_qpel_get_mb_score;
- else
- s->me.get_mb_score= simple_qpel_get_mb_score;
- }else{
- if(s->avctx->mb_cmp&FF_CMP_CHROMA)
- s->me.get_mb_score= simple_chroma_hpel_get_mb_score;
- else
- s->me.get_mb_score= simple_hpel_get_mb_score;
+ s->me.stride = 16*s->mb_width + 32;
+ s->me.uvstride= 8*s->mb_width + 16;
}
+
+ c->temp= c->scratchpad;
}
#if 0
@@ -611,18 +545,17 @@ static int phods_motion_search(MpegEncContext * s,
static inline int sad_hpel_motion_search(MpegEncContext * s,
int *mx_ptr, int *my_ptr, int dmin,
- int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[6], int stride, int uvstride,
- int size, int h, uint8_t * const mv_penalty)
+ int src_index, int ref_index,
+ int size, int h)
{
- uint32_t *score_map= s->me.score_map;
const int penalty_factor= s->me.sub_penalty_factor;
int mx, my, dminh;
uint8_t *pix, *ptr;
- const int xmin= s->me.xmin;
- const int ymin= s->me.ymin;
- const int xmax= s->me.xmax;
- const int ymax= s->me.ymax;
+ int stride= s->me.stride;
+ const int flags= s->me.sub_flags;
+ LOAD_COMMON
+
+ assert(flags == 0);
if(s->me.skip){
// printf("S");
@@ -632,11 +565,11 @@ static inline int sad_hpel_motion_search(MpegEncContext * s,
}
// printf("N");
- pix = src_data[0];
+ pix = s->me.src[src_index][0];
mx = *mx_ptr;
my = *my_ptr;
- ptr = ref_data[0] + (my * stride) + mx;
+ ptr = s->me.ref[ref_index][0] + (my * stride) + mx;
dminh = dmin;
@@ -733,7 +666,7 @@ static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my, int mv4)
s->current_picture.motion_val[0][mot_xy+1][0]= mx;
s->current_picture.motion_val[0][mot_xy+1][1]= my;
- mot_xy += s->block_wrap[0];
+ mot_xy += s->b8_stride;
s->current_picture.motion_val[0][mot_xy ][0]= mx;
s->current_picture.motion_val[0][mot_xy ][1]= my;
s->current_picture.motion_val[0][mot_xy+1][0]= mx;
@@ -763,41 +696,40 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
}
}
+static inline void init_mv4_ref(MpegEncContext *s){
+ MotionEstContext * const c= &s->me;
+ const int stride= s->linesize;
+
+ c->ref[1][0] = c->ref[0][0] + 8;
+ c->ref[2][0] = c->ref[0][0] + 8*stride;
+ c->ref[3][0] = c->ref[2][0] + 8;
+ c->src[1][0] = c->src[0][0] + 8;
+ c->src[2][0] = c->src[0][0] + 8*stride;
+ c->src[3][0] = c->src[2][0] + 8;
+}
+
static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
{
+ MotionEstContext * const c= &s->me;
const int size= 1;
const int h=8;
int block;
int P[10][2];
int dmin_sum=0, mx4_sum=0, my4_sum=0;
- uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
int same=1;
const int stride= s->linesize;
const int uvstride= s->uvlinesize;
- const int xmin= s->me.xmin;
- const int ymin= s->me.ymin;
- const int xmax= s->me.xmax;
- const int ymax= s->me.ymax;
+ uint8_t *mv_penalty= s->me.current_mv_penalty;
+ init_mv4_ref(s);
+
for(block=0; block<4; block++){
int mx4, my4;
int pred_x4, pred_y4;
int dmin4;
static const int off[4]= {2, 1, 1, -1};
- const int mot_stride = s->block_wrap[0];
+ const int mot_stride = s->b8_stride;
const int mot_xy = s->block_index[block];
- const int block_x= (block&1);
- const int block_y= (block>>1);
- uint8_t *src_data[3]= {
- s->new_picture.data[0] + 8*(2*s->mb_x + block_x) + stride *8*(2*s->mb_y + block_y), //FIXME chroma?
- s->new_picture.data[1] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y),
- s->new_picture.data[2] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y)
- };
- uint8_t *ref_data[3]= {
- s->last_picture.data[0] + 8*(2*s->mb_x + block_x) + stride *8*(2*s->mb_y + block_y), //FIXME chroma?
- s->last_picture.data[1] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y),
- s->last_picture.data[2] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y)
- };
P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0];
P_LEFT[1] = s->current_picture.motion_val[0][mot_xy - 1][1];
@@ -806,8 +738,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
/* special case for first line */
if (s->first_slice_line && block<2) {
- pred_x4= P_LEFT[0];
- pred_y4= P_LEFT[1];
+ s->me.pred_x= pred_x4= P_LEFT[0];
+ s->me.pred_y= pred_y4= P_LEFT[1];
} else {
P_TOP[0] = s->current_picture.motion_val[0][mot_xy - mot_stride ][0];
P_TOP[1] = s->current_picture.motion_val[0][mot_xy - mot_stride ][1];
@@ -821,32 +753,22 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
-// if(s->out_format == FMT_H263){
- pred_x4 = P_MEDIAN[0];
- pred_y4 = P_MEDIAN[1];
-#if 0
- }else { /* mpeg1 at least */
- pred_x4= P_LEFT[0];
- pred_y4= P_LEFT[1];
- }
-#endif
+ s->me.pred_x= pred_x4 = P_MEDIAN[0];
+ s->me.pred_y= pred_y4 = P_MEDIAN[1];
}
P_MV1[0]= mx;
P_MV1[1]= my;
- dmin4 = s->me.motion_search[1](s, &mx4, &my4, P, pred_x4, pred_y4,
- src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty);
+ dmin4 = epzs_motion_search4(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift);
- dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4,
- pred_x4, pred_y4, src_data, ref_data, stride, uvstride, size, h, mv_penalty);
+ dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h);
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]
- && s->avctx->mb_decision == FF_MB_DECISION_SIMPLE){
+ if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
int dxy;
const int offset= ((block&1) + (block>>1)*stride)*8;
uint8_t *dest_y = s->me.scratchpad + offset;
if(s->quarter_sample){
- uint8_t *ref= ref_data[0] + (mx4>>2) + (my4>>2)*stride;
+ uint8_t *ref= c->ref[block][0] + (mx4>>2) + (my4>>2)*stride;
dxy = ((my4 & 3) << 2) | (mx4 & 3);
if(s->no_rounding)
@@ -854,7 +776,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
else
s->dsp.put_qpel_pixels_tab [1][dxy](dest_y , ref , stride);
}else{
- uint8_t *ref= ref_data[0] + (mx4>>1) + (my4>>1)*stride;
+ uint8_t *ref= c->ref[block][0] + (mx4>>1) + (my4>>1)*stride;
dxy = ((my4 & 1) << 1) | (mx4 & 1);
if(s->no_rounding)
@@ -909,6 +831,9 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad , s->uvlinesize, 8);
dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad+8, s->uvlinesize, 8);
}
+
+ s->me.pred_x= mx;
+ s->me.pred_y= my;
switch(s->avctx->mb_cmp&0xFF){
/*case FF_CMP_SSE:
@@ -920,14 +845,28 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
}
}
-static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint8_t *frame_ref_data[3],
- int16_t (*mv_tables[2][2])[2], uint8_t *field_select_tables[2], int f_code, int mx, int my)
+static inline void init_interlaced_ref(MpegEncContext *s, int ref_index){
+ MotionEstContext * const c= &s->me;
+
+ c->ref[1+ref_index][0] = c->ref[0+ref_index][0] + s->linesize;
+ c->src[1][0] = c->src[0][0] + s->linesize;
+ if(c->flags & FLAG_CHROMA){
+ c->ref[1+ref_index][1] = c->ref[0+ref_index][1] + s->uvlinesize;
+ c->ref[1+ref_index][2] = c->ref[0+ref_index][2] + s->uvlinesize;
+ c->src[1][1] = c->src[0][1] + s->uvlinesize;
+ c->src[1][2] = c->src[0][2] + s->uvlinesize;
+ }
+}
+
+static int interlaced_search(MpegEncContext *s, int ref_index,
+ int16_t (*mv_tables[2][2])[2], uint8_t *field_select_tables[2], int mx, int my, int user_field_select)
{
+ MotionEstContext * const c= &s->me;
const int size=0;
const int h=8;
int block;
int P[10][2];
- uint8_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV;
+ uint8_t * const mv_penalty= c->current_mv_penalty;
int same=1;
const int stride= 2*s->linesize;
const int uvstride= 2*s->uvlinesize;
@@ -935,45 +874,42 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
const int mot_stride= s->mb_stride;
const int xy= s->mb_x + s->mb_y*mot_stride;
- s->me.ymin>>=1;
- s->me.ymax>>=1;
+ c->ymin>>=1;
+ c->ymax>>=1;
+ c->stride<<=1;
+ c->uvstride<<=1;
+ init_interlaced_ref(s, ref_index);
for(block=0; block<2; block++){
int field_select;
int best_dmin= INT_MAX;
int best_field= -1;
- uint8_t *src_data[3]= {
- frame_src_data[0] + s-> linesize*block,
- frame_src_data[1] + s->uvlinesize*block,
- frame_src_data[2] + s->uvlinesize*block
- };
-
for(field_select=0; field_select<2; field_select++){
- int dmin, mx_i, my_i, pred_x, pred_y;
- uint8_t *ref_data[3]= {
- frame_ref_data[0] + s-> linesize*field_select,
- frame_ref_data[1] + s->uvlinesize*field_select,
- frame_ref_data[2] + s->uvlinesize*field_select
- };
+ int dmin, mx_i, my_i;
int16_t (*mv_table)[2]= mv_tables[block][field_select];
+ if(user_field_select){
+ if(field_select_tables[block][xy] != field_select)
+ continue;
+ }
+
P_LEFT[0] = mv_table[xy - 1][0];
P_LEFT[1] = mv_table[xy - 1][1];
- if(P_LEFT[0] > (s->me.xmax<<1)) P_LEFT[0] = (s->me.xmax<<1);
+ if(P_LEFT[0] > (c->xmax<<1)) P_LEFT[0] = (c->xmax<<1);
- pred_x= P_LEFT[0];
- pred_y= P_LEFT[1];
+ s->me.pred_x= P_LEFT[0];
+ s->me.pred_y= P_LEFT[1];
if(!s->first_slice_line){
P_TOP[0] = mv_table[xy - mot_stride][0];
P_TOP[1] = mv_table[xy - mot_stride][1];
P_TOPRIGHT[0] = mv_table[xy - mot_stride + 1][0];
P_TOPRIGHT[1] = mv_table[xy - mot_stride + 1][1];
- if(P_TOP[1] > (s->me.ymax<<1)) P_TOP[1] = (s->me.ymax<<1);
- if(P_TOPRIGHT[0] < (s->me.xmin<<1)) P_TOPRIGHT[0]= (s->me.xmin<<1);
- if(P_TOPRIGHT[0] > (s->me.xmax<<1)) P_TOPRIGHT[0]= (s->me.xmax<<1);
- if(P_TOPRIGHT[1] > (s->me.ymax<<1)) P_TOPRIGHT[1]= (s->me.ymax<<1);
+ if(P_TOP[1] > (c->ymax<<1)) P_TOP[1] = (c->ymax<<1);
+ if(P_TOPRIGHT[0] < (c->xmin<<1)) P_TOPRIGHT[0]= (c->xmin<<1);
+ if(P_TOPRIGHT[0] > (c->xmax<<1)) P_TOPRIGHT[0]= (c->xmax<<1);
+ if(P_TOPRIGHT[1] > (c->ymax<<1)) P_TOPRIGHT[1]= (c->ymax<<1);
P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
@@ -981,32 +917,29 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
P_MV1[0]= mx; //FIXME not correct if block != field_select
P_MV1[1]= my / 2;
- dmin = s->me.motion_search[4](s, &mx_i, &my_i, P, pred_x, pred_y,
- src_data, ref_data, stride, uvstride, mv_table, (1<<16)>>1, mv_penalty);
+ dmin = epzs_motion_search2(s, &mx_i, &my_i, P, block, field_select+ref_index, mv_table, (1<<16)>>1);
- dmin= s->me.sub_motion_search(s, &mx_i, &my_i, dmin,
- pred_x, pred_y, src_data, ref_data, stride, uvstride, size, h, mv_penalty);
+ dmin= c->sub_motion_search(s, &mx_i, &my_i, dmin, block, field_select+ref_index, size, h);
mv_table[xy][0]= mx_i;
mv_table[xy][1]= my_i;
- if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]
- && s->avctx->mb_decision == FF_MB_DECISION_SIMPLE){
+ if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
int dxy;
//FIXME chroma ME
- uint8_t *ref= ref_data[0] + (mx_i>>1) + (my_i>>1)*stride;
+ uint8_t *ref= c->ref[field_select+ref_index][0] + (mx_i>>1) + (my_i>>1)*stride;
dxy = ((my_i & 1) << 1) | (mx_i & 1);
if(s->no_rounding){
- s->dsp.put_no_rnd_pixels_tab[size][dxy](s->me.scratchpad, ref , stride, h);
+ s->dsp.put_no_rnd_pixels_tab[size][dxy](c->scratchpad, ref , stride, h);
}else{
- s->dsp.put_pixels_tab [size][dxy](s->me.scratchpad, ref , stride, h);
+ s->dsp.put_pixels_tab [size][dxy](c->scratchpad, ref , stride, h);
}
- dmin= s->dsp.mb_cmp[size](s, src_data[0], s->me.scratchpad, stride, h);
- dmin+= (mv_penalty[mx_i-pred_x] + mv_penalty[my_i-pred_y] + 1)*s->me.mb_penalty_factor;
+ dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h);
+ dmin+= (mv_penalty[mx_i-s->me.pred_x] + mv_penalty[my_i-s->me.pred_y] + 1)*c->mb_penalty_factor;
}else
- dmin+= s->me.mb_penalty_factor; //field_select bits
+ dmin+= c->mb_penalty_factor; //field_select bits
dmin += field_select != block; //slightly prefer same field
@@ -1028,8 +961,10 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
dmin_sum += best_dmin;
}
- s->me.ymin<<=1;
- s->me.ymax<<=1;
+ c->ymin<<=1;
+ c->ymax<<=1;
+ c->stride>>=1;
+ c->uvstride>>=1;
if(same)
return INT_MAX;
@@ -1040,44 +975,182 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
case FF_CMP_RD:
return dmin_sum;
default:
- return dmin_sum+ 11*s->me.mb_penalty_factor;
+ return dmin_sum+ 11*c->mb_penalty_factor;
}
}
+static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int p_type){
+ MotionEstContext * const c= &s->me;
+ Picture *p= s->current_picture_ptr;
+ int mb_xy= mb_x + mb_y*s->mb_stride;
+ int xy= 2*mb_x + 2*mb_y*s->b8_stride;
+ int mb_type= s->current_picture.mb_type[mb_xy];
+ int flags= c->flags;
+ int shift= (flags&FLAG_QPEL) + 1;
+ int mask= (1<<shift)-1;
+ int x, y, i;
+ int d=0;
+ me_cmp_func cmpf= s->dsp.sse[0];
+ me_cmp_func chroma_cmpf= s->dsp.sse[1];
+
+ assert(p_type==0 || !USES_LIST(mb_type, 1));
+ assert(IS_INTRA(mb_type) || USES_LIST(mb_type,0) || USES_LIST(mb_type,1));
+
+ if(IS_INTERLACED(mb_type)){
+ int xy2= xy + s->b8_stride;
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA;
+ c->stride<<=1;
+ c->uvstride<<=1;
+
+ assert(s->flags & CODEC_FLAG_INTERLACED_ME);
+
+ if(USES_LIST(mb_type, 0)){
+ int field_select0= p->ref_index[0][xy ];
+ int field_select1= p->ref_index[0][xy2];
+ assert(field_select0==0 ||field_select0==1);
+ assert(field_select1==0 ||field_select1==1);
+ init_interlaced_ref(s, 0);
+
+ if(p_type){
+ s->p_field_select_table[0][mb_xy]= field_select0;
+ s->p_field_select_table[1][mb_xy]= field_select1;
+ *(uint32_t*)s->p_field_mv_table[0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[0][xy ];
+ *(uint32_t*)s->p_field_mv_table[1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[0][xy2];
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER_I;
+ }else{
+ s->b_field_select_table[0][0][mb_xy]= field_select0;
+ s->b_field_select_table[0][1][mb_xy]= field_select1;
+ *(uint32_t*)s->b_field_mv_table[0][0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[0][xy ];
+ *(uint32_t*)s->b_field_mv_table[0][1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[0][xy2];
+ s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_FORWARD_I;
+ }
+
+ x= p->motion_val[0][xy ][0];
+ y= p->motion_val[0][xy ][1];
+ d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select0, 0, cmpf, chroma_cmpf, flags);
+ x= p->motion_val[0][xy2][0];
+ y= p->motion_val[0][xy2][1];
+ d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select1, 1, cmpf, chroma_cmpf, flags);
+ }
+ if(USES_LIST(mb_type, 1)){
+ int field_select0= p->ref_index[1][xy ];
+ int field_select1= p->ref_index[1][xy2];
+ assert(field_select0==0 ||field_select0==1);
+ assert(field_select1==0 ||field_select1==1);
+ init_interlaced_ref(s, 2);
+
+ s->b_field_select_table[1][0][mb_xy]= field_select0;
+ s->b_field_select_table[1][1][mb_xy]= field_select1;
+ *(uint32_t*)s->b_field_mv_table[1][0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[1][xy ];
+ *(uint32_t*)s->b_field_mv_table[1][1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[1][xy2];
+ if(USES_LIST(mb_type, 0)){
+ s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_BIDIR_I;
+ }else{
+ s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_BACKWARD_I;
+ }
+
+ x= p->motion_val[1][xy ][0];
+ y= p->motion_val[1][xy ][1];
+ d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select0+2, 0, cmpf, chroma_cmpf, flags);
+ x= p->motion_val[1][xy2][0];
+ y= p->motion_val[1][xy2][1];
+ d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select1+2, 1, cmpf, chroma_cmpf, flags);
+ //FIXME bidir scores
+ }
+ c->stride>>=1;
+ c->uvstride>>=1;
+ }else if(IS_8X8(mb_type)){
+ assert(s->flags & CODEC_FLAG_4MV);
+ cmpf= s->dsp.sse[1];
+ chroma_cmpf= s->dsp.sse[1];
+ init_mv4_ref(s);
+ for(i=0; i<4; i++){
+ xy= s->block_index[i];
+ x= p->motion_val[0][xy][0];
+ y= p->motion_val[0][xy][1];
+ d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 1, 8, i, i, cmpf, chroma_cmpf, flags);
+ }
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER4V;
+ }else{
+ if(USES_LIST(mb_type, 0)){
+ if(p_type){
+ *(uint32_t*)s->p_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy];
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER;
+ }else if(USES_LIST(mb_type, 1)){
+ *(uint32_t*)s->b_bidir_forw_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy];
+ *(uint32_t*)s->b_bidir_back_mv_table[mb_xy]= *(uint32_t*)p->motion_val[1][xy];
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_BIDIR;
+ }else{
+ *(uint32_t*)s->b_forw_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy];
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_FORWARD;
+ }
+ x= p->motion_val[0][xy][0];
+ y= p->motion_val[0][xy][1];
+ d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 16, 0, 0, cmpf, chroma_cmpf, flags);
+ }else if(USES_LIST(mb_type, 1)){
+ *(uint32_t*)s->b_back_mv_table[mb_xy]= *(uint32_t*)p->motion_val[1][xy];
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_BACKWARD;
+
+ x= p->motion_val[1][xy][0];
+ y= p->motion_val[1][xy][1];
+ d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 16, 2, 0, cmpf, chroma_cmpf, flags);
+ }else
+ s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA;
+ }
+ return d;
+}
+
void ff_estimate_p_frame_motion(MpegEncContext * s,
int mb_x, int mb_y)
{
+ MotionEstContext * const c= &s->me;
uint8_t *pix, *ppix;
- int sum, varc, vard, mx, my, dmin, xx, yy;
- int pred_x=0, pred_y=0;
+ int sum, varc, vard, mx, my, dmin;
int P[10][2];
const int shift= 1+s->quarter_sample;
int mb_type=0;
- uint8_t *ref_picture= s->last_picture.data[0];
Picture * const pic= &s->current_picture;
- uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
- const int stride= s->linesize;
- const int uvstride= s->uvlinesize;
- uint8_t *src_data[3]= {
- s->new_picture.data[0] + 16*(mb_x + stride*mb_y),
- s->new_picture.data[1] + 8*(mb_x + uvstride*mb_y),
- s->new_picture.data[2] + 8*(mb_x + uvstride*mb_y)
- };
- uint8_t *ref_data[3]= {
- s->last_picture.data[0] + 16*(mb_x + stride*mb_y),
- s->last_picture.data[1] + 8*(mb_x + uvstride*mb_y),
- s->last_picture.data[2] + 8*(mb_x + uvstride*mb_y)
- };
+
+ init_ref(s, s->new_picture.data, s->last_picture.data, NULL, 16*mb_x, 16*mb_y, 0);
assert(s->quarter_sample==0 || s->quarter_sample==1);
+ assert(s->linesize == s->me.stride);
+ assert(s->uvlinesize == s->me.uvstride);
s->me.penalty_factor = get_penalty_factor(s, s->avctx->me_cmp);
s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp);
+ s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
get_limits(s, 16*mb_x, 16*mb_y);
s->me.skip=0;
+ /* intra / predictive decision */
+ pix = c->src[0][0];
+ sum = s->dsp.pix_sum(pix, s->linesize);
+ varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+
+ pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
+ pic->mb_var [s->mb_stride * mb_y + mb_x] = varc;
+ s->mb_var_sum_temp += varc;
+
+ if(s->avctx->me_threshold){
+ vard= (check_input_motion(s, mb_x, mb_y, 1)+128)>>8;
+
+ if(vard<s->avctx->me_threshold){
+ pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
+ s->mc_mb_var_sum_temp += vard;
+ if (vard <= 64 || vard < varc) { //FIXME
+ s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+ }else{
+ s->scene_change_score+= s->qscale;
+ }
+ return;
+ }
+ if(vard<s->avctx->mb_threshold)
+ mb_type= s->mb_type[mb_x + mb_y*s->mb_stride];
+ }
+
switch(s->me_method) {
case ME_ZERO:
default:
@@ -1106,7 +1179,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
case ME_X1:
case ME_EPZS:
{
- const int mot_stride = s->block_wrap[0];
+ const int mot_stride = s->b8_stride;
const int mot_xy = s->block_index[0];
P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0];
@@ -1127,51 +1200,58 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
if(s->out_format == FMT_H263){
- pred_x = P_MEDIAN[0];
- pred_y = P_MEDIAN[1];
+ c->pred_x = P_MEDIAN[0];
+ c->pred_y = P_MEDIAN[1];
}else { /* mpeg1 at least */
- pred_x= P_LEFT[0];
- pred_y= P_LEFT[1];
+ c->pred_x= P_LEFT[0];
+ c->pred_y= P_LEFT[1];
}
}else{
- pred_x= P_LEFT[0];
- pred_y= P_LEFT[1];
+ c->pred_x= P_LEFT[0];
+ c->pred_y= P_LEFT[1];
}
}
- dmin = s->me.motion_search[0](s, &mx, &my, P, pred_x, pred_y,
- src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty);
-
+ dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift);
+
break;
}
- /* intra / predictive decision */
- xx = mb_x * 16;
- yy = mb_y * 16;
-
- pix = src_data[0];
/* At this point (mx,my) are full-pell and the relative displacement */
- ppix = ref_data[0] + (my * s->linesize) + mx;
-
- sum = s->dsp.pix_sum(pix, s->linesize);
-
- varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+ ppix = c->ref[0][0] + (my * s->linesize) + mx;
+
vard = (s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16)+128)>>8;
-//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
- pic->mb_var [s->mb_stride * mb_y + mb_x] = varc;
pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
- pic->mb_mean [s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
// pic->mb_cmp_score[s->mb_stride * mb_y + mb_x] = dmin;
- s->mb_var_sum_temp += varc;
s->mc_mb_var_sum_temp += vard;
-//printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
#if 0
printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n",
varc, s->avg_mb_var, sum, vard, mx - xx, my - yy);
#endif
- if(s->avctx->mb_decision > FF_MB_DECISION_SIMPLE){
+ if(mb_type){
+ if (vard <= 64 || vard < varc)
+ s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+ else
+ s->scene_change_score+= s->qscale;
+
+ if(mb_type == CANDIDATE_MB_TYPE_INTER){
+ s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
+ set_p_mv_tables(s, mx, my, 1);
+ }else{
+ mx <<=shift;
+ my <<=shift;
+ }
+ if(mb_type == CANDIDATE_MB_TYPE_INTER4V){
+ h263_mv4_search(s, mx, my, shift);
+
+ set_p_mv_tables(s, mx, my, 0);
+ }
+ if(mb_type == CANDIDATE_MB_TYPE_INTER_I){
+ interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 1);
+ }
+ }else if(s->avctx->mb_decision > FF_MB_DECISION_SIMPLE){
if (vard <= 64 || vard < varc)
s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
else
@@ -1181,8 +1261,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
mb_type|= CANDIDATE_MB_TYPE_INTRA;
if (varc*2 + 200 > vard){
mb_type|= CANDIDATE_MB_TYPE_INTER;
- s->me.sub_motion_search(s, &mx, &my, dmin,
- pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
+ s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
if(s->flags&CODEC_FLAG_MV0)
if(mx || my)
mb_type |= CANDIDATE_MB_TYPE_SKIPED; //FIXME check difference
@@ -1200,17 +1279,16 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
set_p_mv_tables(s, mx, my, 1);
if((s->flags&CODEC_FLAG_INTERLACED_ME)
&& !s->me.skip){ //FIXME varc/d checks
- if(interlaced_search(s, src_data, ref_data, s->p_field_mv_table, s->p_field_select_table, s->f_code, mx, my) < INT_MAX)
+ if(interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 0) < INT_MAX)
mb_type |= CANDIDATE_MB_TYPE_INTER_I;
}
}else{
int intra_score, i;
mb_type= CANDIDATE_MB_TYPE_INTER;
- dmin= s->me.sub_motion_search(s, &mx, &my, dmin,
- pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
+ dmin= s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
- dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, src_data, ref_data, stride, uvstride, mv_penalty);
+ dmin= get_mb_score(s, mx, my, 0, 0);
if((s->flags&CODEC_FLAG_4MV)
&& !s->me.skip && varc>50 && vard>10){
@@ -1222,7 +1300,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
}
if((s->flags&CODEC_FLAG_INTERLACED_ME)
&& !s->me.skip){ //FIXME varc/d checks
- int dmin_i= interlaced_search(s, src_data, ref_data, s->p_field_mv_table, s->p_field_select_table, s->f_code, mx, my);
+ int dmin_i= interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 0);
if(dmin_i < dmin){
mb_type = CANDIDATE_MB_TYPE_INTER_I;
dmin= dmin_i;
@@ -1256,7 +1334,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
int mean;
if(s->out_format == FMT_H263){
- mean= (s->dc_val[i][mb_x + (mb_y+1)*(s->mb_width+2)] + 4)>>3; //FIXME not exact but simple ;)
+ mean= (s->dc_val[i][mb_x + mb_y*s->b8_stride] + 4)>>3; //FIXME not exact but simple ;)
}else{
mean= (s->last_dc[i] + 4)>>3;
}
@@ -1293,28 +1371,17 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
int mb_x, int mb_y)
{
+ MotionEstContext * const c= &s->me;
int mx, my, dmin;
- int pred_x=0, pred_y=0;
int P[10][2];
const int shift= 1+s->quarter_sample;
- uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
const int xy= mb_x + mb_y*s->mb_stride;
- const int stride= s->linesize;
- const int uvstride= s->uvlinesize;
- uint8_t *src_data[3]= {
- s->new_picture.data[0] + 16*(mb_x + stride*mb_y),
- s->new_picture.data[1] + 8*(mb_x + uvstride*mb_y),
- s->new_picture.data[2] + 8*(mb_x + uvstride*mb_y)
- };
- uint8_t *ref_data[3]= {
- s->last_picture.data[0] + 16*(mb_x + stride*mb_y),
- s->last_picture.data[1] + 8*(mb_x + uvstride*mb_y),
- s->last_picture.data[2] + 8*(mb_x + uvstride*mb_y)
- };
+ init_ref(s, s->new_picture.data, s->last_picture.data, NULL, 16*mb_x, 16*mb_y, 0);
assert(s->quarter_sample==0 || s->quarter_sample==1);
s->me.pre_penalty_factor = get_penalty_factor(s, s->avctx->me_pre_cmp);
+ s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
get_limits(s, 16*mb_x, 16*mb_y);
s->me.skip=0;
@@ -1326,8 +1393,8 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
/* special case for first line */
if (s->first_slice_line) {
- pred_x= P_LEFT[0];
- pred_y= P_LEFT[1];
+ c->pred_x= P_LEFT[0];
+ c->pred_y= P_LEFT[1];
P_TOP[0]= P_TOPRIGHT[0]= P_MEDIAN[0]=
P_TOP[1]= P_TOPRIGHT[1]= P_MEDIAN[1]= 0; //FIXME
} else {
@@ -1342,11 +1409,11 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
- pred_x = P_MEDIAN[0];
- pred_y = P_MEDIAN[1];
+ c->pred_x = P_MEDIAN[0];
+ c->pred_y = P_MEDIAN[1];
}
- dmin = s->me.pre_motion_search(s, &mx, &my, P, pred_x, pred_y,
- src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty);
+
+ dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift);
s->p_mv_table[xy][0] = mx<<shift;
s->p_mv_table[xy][1] = my<<shift;
@@ -1355,22 +1422,20 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
}
static int ff_estimate_motion_b(MpegEncContext * s,
- int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride, int f_code)
+ int mb_x, int mb_y, int16_t (*mv_table)[2], int ref_index, int f_code)
{
int mx, my, dmin;
- int pred_x=0, pred_y=0;
int P[10][2];
const int shift= 1+s->quarter_sample;
const int mot_stride = s->mb_stride;
const int mot_xy = mb_y*mot_stride + mb_x;
- uint8_t * const ref_picture= ref_data[0] - 16*s->mb_x - 16*s->mb_y*s->linesize; //FIXME ugly
uint8_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV;
int mv_scale;
s->me.penalty_factor = get_penalty_factor(s, s->avctx->me_cmp);
s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp);
+ s->me.current_mv_penalty= mv_penalty;
get_limits(s, 16*mb_x, 16*mb_y);
@@ -1420,8 +1485,8 @@ static int ff_estimate_motion_b(MpegEncContext * s,
P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
}
- pred_x= P_LEFT[0];
- pred_y= P_LEFT[1];
+ s->me.pred_x= P_LEFT[0];
+ s->me.pred_y= P_LEFT[1];
}
if(mv_table == s->b_forw_mv_table){
@@ -1430,17 +1495,15 @@ static int ff_estimate_motion_b(MpegEncContext * s,
mv_scale= ((s->pb_time - s->pp_time)<<16) / (s->pp_time<<shift);
}
- dmin = s->me.motion_search[0](s, &mx, &my, P, pred_x, pred_y,
- src_data, ref_data, stride, uvstride, s->p_mv_table, mv_scale, mv_penalty);
+ dmin = epzs_motion_search(s, &mx, &my, P, 0, ref_index, s->p_mv_table, mv_scale);
break;
}
- dmin= s->me.sub_motion_search(s, &mx, &my, dmin,
- pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
+ dmin= s->me.sub_motion_search(s, &mx, &my, dmin, 0, ref_index, 0, 16);
if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
- dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, src_data, ref_data, stride, uvstride, mv_penalty);
+ dmin= get_mb_score(s, mx, my, 0, ref_index);
//printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
// s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
@@ -1450,8 +1513,7 @@ static int ff_estimate_motion_b(MpegEncContext * s,
return dmin;
}
-static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6],
- int stride, int uvstride,
+static inline int check_bidir_mv(MpegEncContext * s,
int motion_fx, int motion_fy,
int motion_bx, int motion_by,
int pred_fx, int pred_fy,
@@ -1459,15 +1521,20 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
int size, int h)
{
//FIXME optimize?
- //FIXME move into template?
//FIXME better f_code prediction (max mv & distance)
//FIXME pointers
+ MotionEstContext * const c= &s->me;
uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+ int stride= s->me.stride;
+ int uvstride= s->me.uvstride;
uint8_t *dest_y = s->me.scratchpad;
uint8_t *ptr;
int dxy;
int src_x, src_y;
int fbmin;
+ uint8_t **src_data= c->src[0];
+ uint8_t **ref_data= c->ref[0];
+ uint8_t **ref2_data= c->ref[2];
if(s->quarter_sample){
dxy = ((motion_fy & 3) << 2) | (motion_fx & 3);
@@ -1481,7 +1548,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
src_x = motion_bx >> 2;
src_y = motion_by >> 2;
- ptr = ref_data[3] + (src_y * stride) + src_x;
+ ptr = ref2_data[0] + (src_y * stride) + src_x;
s->dsp.avg_qpel_pixels_tab[size][dxy](dest_y , ptr , stride);
}else{
dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
@@ -1495,7 +1562,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
src_x = motion_bx >> 1;
src_y = motion_by >> 1;
- ptr = ref_data[3] + (src_y * stride) + src_x;
+ ptr = ref2_data[0] + (src_y * stride) + src_x;
s->dsp.avg_pixels_tab[size][dxy](dest_y , ptr , stride, h);
}
@@ -1511,9 +1578,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
}
/* refine the bidir vectors in hq mode and return the score in both lq & hq mode*/
-static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6],
- int stride, int uvstride,
- int mb_x, int mb_y)
+static inline int bidir_refine(MpegEncContext * s, int mb_x, int mb_y)
{
const int mot_stride = s->mb_stride;
const int xy = mb_y *mot_stride + mb_x;
@@ -1529,8 +1594,7 @@ static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t
//FIXME do refinement and add flag
- fbmin= check_bidir_mv(s, src_data, ref_data, stride, uvstride,
- motion_fx, motion_fy,
+ fbmin= check_bidir_mv(s, motion_fx, motion_fy,
motion_bx, motion_by,
pred_fx, pred_fy,
pred_bx, pred_by,
@@ -1539,9 +1603,7 @@ static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t
return fbmin;
}
-static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6],
- int stride, int uvstride,
- int mb_x, int mb_y)
+static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
{
int P[10][2];
const int mot_stride = s->mb_stride;
@@ -1552,8 +1614,8 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_
const int time_pb= s->pb_time;
int mx, my, xmin, xmax, ymin, ymax;
int16_t (*mv_table)[2]= s->b_direct_mv_table;
- uint8_t * const mv_penalty= s->me.mv_penalty[1] + MAX_MV;
+ s->me.current_mv_penalty= s->me.mv_penalty[1] + MAX_MV;
ymin= xmin=(-32)>>shift;
ymax= xmax= 31>>shift;
@@ -1604,6 +1666,10 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_
s->me.ymin= ymin;
s->me.xmax= xmax;
s->me.ymax= ymax;
+ s->me.flags |= FLAG_DIRECT;
+ s->me.sub_flags |= FLAG_DIRECT;
+ s->me.pred_x=0;
+ s->me.pred_y=0;
P_LEFT[0] = clip(mv_table[mot_xy - 1][0], xmin<<shift, xmax<<shift);
P_LEFT[1] = clip(mv_table[mot_xy - 1][1], ymin<<shift, ymax<<shift);
@@ -1619,29 +1685,22 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_
P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
}
- //FIXME direct_search ptr in context!!! (needed for chroma anyway or this will get messy)
- if(s->flags&CODEC_FLAG_QPEL){
- dmin = simple_direct_qpel_epzs_motion_search(s, &mx, &my, P, 0, 0,
- src_data, ref_data, stride, uvstride, mv_table, 1<<14, mv_penalty);
- dmin = simple_direct_qpel_qpel_motion_search(s, &mx, &my, dmin,
- 0, 0, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
-
- if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
- dmin= simple_direct_qpel_qpel_get_mb_score(s, mx, my, 0, 0, src_data, ref_data, stride, uvstride, mv_penalty);
- }else{
- dmin = simple_direct_hpel_epzs_motion_search(s, &mx, &my, P, 0, 0,
- src_data, ref_data, stride, uvstride, mv_table, 1<<15, mv_penalty);
- dmin = simple_direct_hpel_hpel_motion_search(s, &mx, &my, dmin,
- 0, 0, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
-
- if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
- dmin= simple_direct_hpel_hpel_get_mb_score(s, mx, my, 0, 0, src_data, ref_data, stride, uvstride, mv_penalty);
- }
+ dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, mv_table, 1<<(16-shift));
+ if(s->me.sub_flags&FLAG_QPEL)
+ dmin = qpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
+ else
+ dmin = hpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
+
+ if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
+ dmin= get_mb_score(s, mx, my, 0, 0);
get_limits(s, 16*mb_x, 16*mb_y); //restore s->me.?min/max, maybe not needed
s->b_direct_mv_table[mot_xy][0]= mx;
s->b_direct_mv_table[mot_xy][1]= my;
+ s->me.flags &= ~FLAG_DIRECT;
+ s->me.sub_flags &= ~FLAG_DIRECT;
+
return dmin;
}
@@ -1651,52 +1710,89 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
const int penalty_factor= s->me.mb_penalty_factor;
int fmin, bmin, dmin, fbmin, bimin, fimin;
int type=0;
- const int stride= s->linesize;
- const int uvstride= s->uvlinesize;
- uint8_t *src_data[3]= {
- s->new_picture.data[0] + 16*(s->mb_x + stride*s->mb_y),
- s->new_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y),
- s->new_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y)
- };
- uint8_t *ref_data[6]= {
- s->last_picture.data[0] + 16*(s->mb_x + stride*s->mb_y),
- s->last_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y),
- s->last_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y),
- s->next_picture.data[0] + 16*(s->mb_x + stride*s->mb_y),
- s->next_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y),
- s->next_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y)
- };
+ const int xy = mb_y*s->mb_stride + mb_x;
+ init_ref(s, s->new_picture.data, s->last_picture.data, s->next_picture.data, 16*mb_x, 16*mb_y, 2);
+
s->me.skip=0;
+ if(s->avctx->me_threshold){
+ int vard= (check_input_motion(s, mb_x, mb_y, 0)+128)>>8;
+
+ if(vard<s->avctx->me_threshold){
+// pix = c->src[0][0];
+// sum = s->dsp.pix_sum(pix, s->linesize);
+// varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+
+// pic->mb_var [s->mb_stride * mb_y + mb_x] = varc;
+ s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
+/* pic->mb_mean [s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
+ s->mb_var_sum_temp += varc;*/
+ s->mc_mb_var_sum_temp += vard;
+/* if (vard <= 64 || vard < varc) {
+ s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+ }else{
+ s->scene_change_score+= s->qscale;
+ }*/
+ return;
+ }
+ if(vard<s->avctx->mb_threshold){
+ type= s->mb_type[mb_y*s->mb_stride + mb_x];
+ if(type == CANDIDATE_MB_TYPE_DIRECT){
+ direct_search(s, mb_x, mb_y);
+ }
+ if(type == CANDIDATE_MB_TYPE_FORWARD || type == CANDIDATE_MB_TYPE_BIDIR){
+ s->me.skip=0;
+ ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, 0, s->f_code);
+ }
+ if(type == CANDIDATE_MB_TYPE_BACKWARD || type == CANDIDATE_MB_TYPE_BIDIR){
+ s->me.skip=0;
+ ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, 2, s->b_code);
+ }
+ if(type == CANDIDATE_MB_TYPE_FORWARD_I || type == CANDIDATE_MB_TYPE_BIDIR_I){
+ s->me.skip=0;
+ s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
+ interlaced_search(s, 0,
+ s->b_field_mv_table[0], s->b_field_select_table[0],
+ s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 1);
+ }
+ if(type == CANDIDATE_MB_TYPE_BACKWARD_I || type == CANDIDATE_MB_TYPE_BIDIR_I){
+ s->me.skip=0;
+ s->me.current_mv_penalty= s->me.mv_penalty[s->b_code] + MAX_MV;
+ interlaced_search(s, 2,
+ s->b_field_mv_table[1], s->b_field_select_table[1],
+ s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 1);
+ }
+ return;
+ }
+ }
+
if (s->codec_id == CODEC_ID_MPEG4)
- dmin= direct_search(s, src_data, ref_data, stride, uvstride, mb_x, mb_y);
+ dmin= direct_search(s, mb_x, mb_y);
else
dmin= INT_MAX;
//FIXME penalty stuff for non mpeg4
s->me.skip=0;
- fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, src_data,
- ref_data, stride, uvstride, s->f_code) + 3*penalty_factor;
+ fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, 0, s->f_code) + 3*penalty_factor;
s->me.skip=0;
- bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, src_data,
- ref_data+3, stride, uvstride, s->b_code) + 2*penalty_factor;
+ bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, 2, s->b_code) + 2*penalty_factor;
//printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
s->me.skip=0;
- fbmin= bidir_refine(s, src_data, ref_data, stride, uvstride, mb_x, mb_y) + penalty_factor;
+ fbmin= bidir_refine(s, mb_x, mb_y) + penalty_factor;
//printf("%d %d %d %d\n", dmin, fmin, bmin, fbmin);
if(s->flags & CODEC_FLAG_INTERLACED_ME){
- const int xy = mb_y*s->mb_stride + mb_x;
-
//FIXME mb type penalty
s->me.skip=0;
- fimin= interlaced_search(s, src_data, ref_data ,
- s->b_field_mv_table[0], s->b_field_select_table[0], s->f_code,
- s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
- bimin= interlaced_search(s, src_data, ref_data+3,
- s->b_field_mv_table[1], s->b_field_select_table[1], s->b_code,
- s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1]);
+ s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
+ fimin= interlaced_search(s, 0,
+ s->b_field_mv_table[0], s->b_field_select_table[0],
+ s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 0);
+ s->me.current_mv_penalty= s->me.mv_penalty[s->b_code] + MAX_MV;
+ bimin= interlaced_search(s, 2,
+ s->b_field_mv_table[1], s->b_field_select_table[1],
+ s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 0);
}else
fimin= bimin= INT_MAX;
@@ -1813,11 +1909,11 @@ void ff_fix_long_p_mvs(MpegEncContext * s)
//printf("%d no:%d %d//\n", clip, noclip, f_code);
if(s->flags&CODEC_FLAG_4MV){
- const int wrap= 2+ s->mb_width*2;
+ const int wrap= s->b8_stride;
/* clip / convert to intra 8x8 type MVs */
for(y=0; y<s->mb_height; y++){
- int xy= (y*2 + 1)*wrap + 1;
+ int xy= y*2*wrap;
int i= y*s->mb_stride;
int x;
diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c
index 49c2e57b5..8ab6c7be4 100644
--- a/src/libffmpeg/libavcodec/motion_est_template.c
+++ b/src/libffmpeg/libavcodec/motion_est_template.c
@@ -22,58 +22,32 @@
* @file motion_est_template.c
* Motion estimation template.
*/
-//FIXME ref2_y next_pic?
+
//lets hope gcc will remove the unused vars ...(gcc 3.2.2 seems to do it ...)
-//Note, the last line is there to kill these ugly unused var warnings
#define LOAD_COMMON\
uint32_t * const score_map= s->me.score_map;\
- const int time_pp= s->pp_time;\
- const int time_pb= s->pb_time;\
const int xmin= s->me.xmin;\
const int ymin= s->me.ymin;\
const int xmax= s->me.xmax;\
const int ymax= s->me.ymax;\
- uint8_t * const src_y= src_data[0];\
- uint8_t * const src_u= src_data[1];\
- uint8_t * const src_v= src_data[2];\
- uint8_t * const ref_y= ref_data[0];\
- uint8_t * const ref_u= ref_data[1];\
- uint8_t * const ref_v= ref_data[2];\
- op_pixels_func (*hpel_put)[4];\
- op_pixels_func (*hpel_avg)[4]= &s->dsp.avg_pixels_tab[size];\
- op_pixels_func (*chroma_hpel_put)[4];\
- qpel_mc_func (*qpel_put)[16];\
- qpel_mc_func (*qpel_avg)[16]= &s->dsp.avg_qpel_pixels_tab[size];\
- const __attribute__((unused)) int unu= time_pp + time_pb + (size_t)src_u + (size_t)src_v + (size_t)ref_u + (size_t)ref_v\
- + (size_t)hpel_avg + (size_t)qpel_avg + (size_t)score_map\
- + xmin + xmax + ymin + ymax;\
- if(s->no_rounding /*FIXME b_type*/){\
- hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];\
- chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];\
- qpel_put= &s->dsp.put_no_rnd_qpel_pixels_tab[size];\
- }else{\
- hpel_put=& s->dsp.put_pixels_tab[size];\
- chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];\
- qpel_put= &s->dsp.put_qpel_pixels_tab[size];\
- }
+ uint8_t *mv_penalty= s->me.current_mv_penalty;\
+ const int pred_x= s->me.pred_x;\
+ const int pred_y= s->me.pred_y;\
-
-#ifdef CMP_HPEL
-
#define CHECK_HALF_MV(dx, dy, x, y)\
{\
const int hx= 2*(x)+(dx);\
const int hy= 2*(y)+(dy);\
- CMP_HPEL(d, dx, dy, x, y, size);\
+ d= cmp(s, x, y, dx, dy, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);\
d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\
COPY3_IF_LT(dmin, d, bx, hx, by, hy)\
}
#if 0
-static int RENAME(hpel_motion_search)(MpegEncContext * s,
+static int hpel_motion_search)(MpegEncContext * s,
int *mx_ptr, int *my_ptr, int dmin,
- int pred_x, int pred_y, uint8_t *ref_data[3],
- int size, uint8_t * const mv_penalty)
+ uint8_t *ref_data[3],
+ int size)
{
const int xx = 16 * s->mb_x + 8*(n&1);
const int yy = 16 * s->mb_y + 8*(n>>1);
@@ -94,8 +68,8 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
hpel_put=& s->dsp.put_pixels_tab[size];
chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];
}
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
cmp_sub= s->dsp.me_sub_cmp[size];
chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
@@ -138,11 +112,10 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
}
#else
-static int RENAME(hpel_motion_search)(MpegEncContext * s,
+static int hpel_motion_search(MpegEncContext * s,
int *mx_ptr, int *my_ptr, int dmin,
- int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- int size, int h, uint8_t * const mv_penalty)
+ int src_index, int ref_index,
+ int size, int h)
{
const int mx = *mx_ptr;
const int my = *my_ptr;
@@ -151,6 +124,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
int bx=2*mx, by=2*my;
LOAD_COMMON
+ int flags= s->me.sub_flags;
//FIXME factorize
@@ -164,7 +138,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
}
if(s->avctx->me_cmp != s->avctx->me_sub_cmp){
- CMP_HPEL(dmin, 0, 0, mx, my, size);
+ dmin= cmp(s, mx, my, 0, 0, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
if(mx || my || size>0)
dmin += (mv_penalty[2*mx - pred_x] + mv_penalty[2*my - pred_y])*penalty_factor;
}
@@ -246,14 +220,16 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
}
#endif
-static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- uint8_t * const mv_penalty)
+static int inline get_mb_score(MpegEncContext * s, int mx, int my, int src_index,
+ int ref_index)
{
// const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp;
const int size= 0;
const int h= 16;
const int penalty_factor= s->me.mb_penalty_factor;
+ const int flags= s->me.mb_flags;
+ const int qpel= flags & FLAG_QPEL;
+ const int mask= 1+2*qpel;
me_cmp_func cmp_sub, chroma_cmp_sub;
int d;
@@ -267,7 +243,7 @@ static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
assert(!s->me.skip);
assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp);
- CMP_HPEL(d, mx&1, my&1, mx>>1, my>>1, size);
+ d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
//FIXME check cbp before adding penalty for (0,0) vector
if(mx || my || size>0)
d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
@@ -275,26 +251,19 @@ static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
return d;
}
-#endif /* CMP_HPEL */
-
-
-
-#ifdef CMP_QPEL
-
#define CHECK_QUARTER_MV(dx, dy, x, y)\
{\
const int hx= 4*(x)+(dx);\
const int hy= 4*(y)+(dy);\
- CMP_QPEL(d, dx, dy, x, y, size);\
+ d= cmp(s, x, y, dx, dy, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\
COPY3_IF_LT(dmin, d, bx, hx, by, hy)\
}
-static int RENAME(qpel_motion_search)(MpegEncContext * s,
+static int qpel_motion_search(MpegEncContext * s,
int *mx_ptr, int *my_ptr, int dmin,
- int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- int size, int h, uint8_t * const mv_penalty)
+ int src_index, int ref_index,
+ int size, int h)
{
const int mx = *mx_ptr;
const int my = *my_ptr;
@@ -302,13 +271,14 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
const int map_generation= s->me.map_generation;
const int subpel_quality= s->avctx->me_subpel_quality;
uint32_t *map= s->me.map;
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
me_cmp_func cmp_sub, chroma_cmp_sub;
LOAD_COMMON
+ int flags= s->me.sub_flags;
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1]; //factorize FIXME
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME
//FIXME factorize
cmp_sub= s->dsp.me_sub_cmp[size];
@@ -321,7 +291,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
}
if(s->avctx->me_cmp != s->avctx->me_sub_cmp){
- CMP_QPEL(dmin, 0, 0, mx, my, size);
+ dmin= cmp(s, mx, my, 0, 0, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
if(mx || my || size>0)
dmin += (mv_penalty[4*mx - pred_x] + mv_penalty[4*my - pred_y])*penalty_factor;
}
@@ -386,7 +356,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == (my<<ME_MAP_MV_BITS) + mx + map_generation && 0){ //FIXME
tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
}else{
- CMP(tl, mx-1, my-1, size); //FIXME wrong if chroma me is different
+ tl= cmp(s, mx-1, my-1, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);//FIXME wrong if chroma me is different
}
cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c;
@@ -509,36 +479,6 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
return dmin;
}
-static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- uint8_t * const mv_penalty)
-{
- const int size= 0;
- const int h= 16;
- const int penalty_factor= s->me.mb_penalty_factor;
- me_cmp_func cmp_sub, chroma_cmp_sub;
- int d;
-
- LOAD_COMMON
-
- //FIXME factorize
-
- cmp_sub= s->dsp.mb_cmp[size];
- chroma_cmp_sub= s->dsp.mb_cmp[size+1];
-
- assert(!s->me.skip);
- assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp);
-
- CMP_QPEL(d, mx&3, my&3, mx>>2, my>>2, size);
- //FIXME check cbp before adding penalty for (0,0) vector
- if(mx || my || size>0)
- d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
-
- return d;
-}
-
-
-#endif /* CMP_QPEL */
#define CHECK_MV(x,y)\
{\
@@ -546,7 +486,7 @@ static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
/*printf("check_mv %d %d\n", x, y);*/\
if(map[index]!=key){\
- CMP(d, x, y, size);\
+ d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
map[index]= key;\
score_map[index]= d;\
d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
@@ -570,7 +510,7 @@ static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
/*printf("check_mv_dir %d %d %d\n", x, y, new_dir);*/\
if(map[index]!=key){\
- CMP(d, x, y, size);\
+ d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
map[index]= key;\
score_map[index]= d;\
d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
@@ -590,27 +530,29 @@ if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x,
if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
+#define LOAD_COMMON2\
+ uint32_t *map= s->me.map;\
+ const int qpel= flags&FLAG_QPEL;\
+ const int shift= 1+qpel;\
-static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, int dmin,
- uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- int const pred_x, int const pred_y, int const penalty_factor,
- int const shift,
- uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
- )
+static always_inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
+ int src_index, int ref_index, int const penalty_factor,
+ int size, int h, int flags)
{
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
int next_dir=-1;
LOAD_COMMON
+ LOAD_COMMON2
+ int map_generation= s->me.map_generation;
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
{ /* ensure that the best point is in the MAP as h/qpel refinement needs it */
const int key= (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
if(map[index]!=key){ //this will be executed only very rarey
- CMP(score_map[index], best[0], best[1], size);
+ score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
map[index]= key;
}
}
@@ -634,20 +576,18 @@ static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, in
}
}
-static inline int RENAME(funny_diamond_search)(MpegEncContext * s, int *best, int dmin,
- uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- int const pred_x, int const pred_y, int const penalty_factor,
- int const shift,
- uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
- )
+static int funny_diamond_search(MpegEncContext * s, int *best, int dmin,
+ int src_index, int ref_index, int const penalty_factor,
+ int size, int h, int flags)
{
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
int dia_size;
LOAD_COMMON
+ LOAD_COMMON2
+ int map_generation= s->me.map_generation;
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
for(dia_size=1; dia_size<=4; dia_size++){
int dir;
@@ -702,7 +642,7 @@ if(256*256*256*64 % (stats[0]+1)==0){
const int index= (((ay)<<ME_MAP_SHIFT) + (ax))&(ME_MAP_SIZE-1);\
/*printf("sab check %d %d\n", ax, ay);*/\
if(map[index]!=key){\
- CMP(d, ax, ay, size);\
+ d= cmp(s, ax, ay, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
map[index]= key;\
score_map[index]= d;\
d += (mv_penalty[((ax)<<shift)-pred_x] + mv_penalty[((ay)<<shift)-pred_y])*penalty_factor;\
@@ -726,22 +666,20 @@ if(256*256*256*64 % (stats[0]+1)==0){
}
#define MAX_SAB_SIZE 16
-static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int dmin,
- uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- int const pred_x, int const pred_y, int const penalty_factor,
- int const shift,
- uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
- )
+static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
+ int src_index, int ref_index, int const penalty_factor,
+ int size, int h, int flags)
{
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
Minima minima[MAX_SAB_SIZE];
const int minima_count= ABS(s->me.dia_size);
int i, j;
LOAD_COMMON
+ LOAD_COMMON2
+ int map_generation= s->me.map_generation;
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
for(j=i=0; i<ME_MAP_SIZE; i++){
uint32_t key= map[i];
@@ -807,20 +745,18 @@ static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int
return dmin;
}
-static inline int RENAME(var_diamond_search)(MpegEncContext * s, int *best, int dmin,
- uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride,
- int const pred_x, int const pred_y, int const penalty_factor,
- int const shift,
- uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
- )
+static int var_diamond_search(MpegEncContext * s, int *best, int dmin,
+ int src_index, int ref_index, int const penalty_factor,
+ int size, int h, int flags)
{
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
int dia_size;
LOAD_COMMON
+ LOAD_COMMON2
+ int map_generation= s->me.map_generation;
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
for(dia_size=1; dia_size<=s->me.dia_size; dia_size++){
int dir, start, end;
@@ -885,31 +821,42 @@ if(256*256*256*64 % (stats[0]+1)==0){
return dmin;
}
-static int RENAME(epzs_motion_search)(MpegEncContext * s,
- int *mx_ptr, int *my_ptr,
- int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2],
- int ref_mv_scale, uint8_t * const mv_penalty)
+static always_inline int diamond_search(MpegEncContext * s, int *best, int dmin,
+ int src_index, int ref_index, int const penalty_factor,
+ int size, int h, int flags){
+ if(s->me.dia_size==-1)
+ return funny_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+ else if(s->me.dia_size<-1)
+ return sab_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+ else if(s->me.dia_size<2)
+ return small_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+ else
+ return var_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+}
+
+static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx_ptr, int *my_ptr,
+ int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2],
+ int ref_mv_scale, int flags)
{
int best[2]={0, 0};
- int d, dmin;
- const int shift= 1+s->quarter_sample;
- uint32_t *map= s->me.map;
+ int d, dmin;
int map_generation;
const int penalty_factor= s->me.penalty_factor;
const int size=0;
const int h=16;
const int ref_mv_stride= s->mb_stride; //pass as arg FIXME
const int ref_mv_xy= s->mb_x + s->mb_y*ref_mv_stride; //add to last_mv beforepassing FIXME
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
+
LOAD_COMMON
+ LOAD_COMMON2
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
map_generation= update_map_generation(s);
- CMP(dmin, 0, 0, size);
+ dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
map[0]= map_generation;
score_map[0]= dmin;
@@ -974,22 +921,7 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s,
}
//check(best[0],best[1],0, b0)
- if(s->me.dia_size==-1)
- dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else if(s->me.dia_size<-1)
- dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else if(s->me.dia_size<2)
- dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else
- dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
+ dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
//check(best[0],best[1],0, b1)
*mx_ptr= best[0];
@@ -999,29 +931,42 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s,
return dmin;
}
-#ifndef CMP_DIRECT /* no 4mv search needed in direct mode */
-static int RENAME(epzs_motion_search4)(MpegEncContext * s,
- int *mx_ptr, int *my_ptr,
- int P[10][2], int pred_x, int pred_y,
- uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2],
- int ref_mv_scale, uint8_t * const mv_penalty)
+//this function is dedicated to the braindamaged gcc
+static inline int epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr,
+ int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2],
+ int ref_mv_scale)
+{
+//FIXME convert other functions in the same way if faster
+ switch(s->me.flags){
+ case 0:
+ return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, 0);
+// case FLAG_QPEL:
+// return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, FLAG_QPEL);
+ default:
+ return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, s->me.flags);
+ }
+}
+
+static int epzs_motion_search4(MpegEncContext * s,
+ int *mx_ptr, int *my_ptr, int P[10][2],
+ int src_index, int ref_index, int16_t (*last_mv)[2],
+ int ref_mv_scale)
{
int best[2]={0, 0};
int d, dmin;
- const int shift= 1+s->quarter_sample;
- uint32_t *map= s->me.map;
int map_generation;
const int penalty_factor= s->me.penalty_factor;
const int size=1;
const int h=8;
const int ref_mv_stride= s->mb_stride;
const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride;
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
LOAD_COMMON
+ int flags= s->me.flags;
+ LOAD_COMMON2
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
map_generation= update_map_generation(s);
@@ -1053,23 +998,7 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s,
(last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
}
- if(s->me.dia_size==-1)
- dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else if(s->me.dia_size<-1)
- dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else if(s->me.dia_size<2)
- dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else
- dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
-
+ dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
*mx_ptr= best[0];
*my_ptr= best[1];
@@ -1079,28 +1008,26 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s,
}
//try to merge with above FIXME (needs PSNR test)
-static int RENAME(epzs_motion_search2)(MpegEncContext * s,
- int *mx_ptr, int *my_ptr,
- int P[10][2], int pred_x, int pred_y,
- uint8_t *src_data[3],
- uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2],
- int ref_mv_scale, uint8_t * const mv_penalty)
+static int epzs_motion_search2(MpegEncContext * s,
+ int *mx_ptr, int *my_ptr, int P[10][2],
+ int src_index, int ref_index, int16_t (*last_mv)[2],
+ int ref_mv_scale)
{
int best[2]={0, 0};
int d, dmin;
- const int shift= 1+s->quarter_sample;
- uint32_t *map= s->me.map;
int map_generation;
const int penalty_factor= s->me.penalty_factor;
const int size=0; //FIXME pass as arg
const int h=8;
const int ref_mv_stride= s->mb_stride;
const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride;
- me_cmp_func cmp, chroma_cmp;
+ me_cmp_func cmpf, chroma_cmpf;
LOAD_COMMON
+ int flags= s->me.flags;
+ LOAD_COMMON2
- cmp= s->dsp.me_cmp[size];
- chroma_cmp= s->dsp.me_cmp[size+1];
+ cmpf= s->dsp.me_cmp[size];
+ chroma_cmpf= s->dsp.me_cmp[size+1];
map_generation= update_map_generation(s);
@@ -1132,23 +1059,7 @@ static int RENAME(epzs_motion_search2)(MpegEncContext * s,
(last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
}
- if(s->me.dia_size==-1)
- dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else if(s->me.dia_size<-1)
- dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else if(s->me.dia_size<2)
- dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
- else
- dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
- pred_x, pred_y, penalty_factor,
- shift, map, map_generation, size, h, mv_penalty);
-
+ dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
*mx_ptr= best[0];
*my_ptr= best[1];
@@ -1156,4 +1067,3 @@ static int RENAME(epzs_motion_search2)(MpegEncContext * s,
// printf("%d %d %d \n", best[0], best[1], dmin);
return dmin;
}
-#endif /* !CMP_DIRECT */
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index e39356c9d..493d1a445 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -249,7 +249,7 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
{
unsigned int vbv_buffer_size;
unsigned int fps, v;
- int n, i;
+ int i;
uint64_t time_code;
float best_aspect_error= 1E10;
float aspect_ratio= av_q2d(s->avctx->sample_aspect_ratio);
@@ -365,8 +365,14 @@ static inline void encode_mb_skip_run(MpegEncContext *s, int run){
static void common_init(MpegEncContext *s)
{
+int i;
+
s->y_dc_scale_table=
s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
+
+ if(!s->encoding)
+ for(i=0;i<64;i++)
+ s->dsp.idct_permutation[i]=i;
}
void ff_mpeg1_clean_buffers(MpegEncContext *s){
@@ -500,8 +506,9 @@ void mpeg1_encode_mb(MpegEncContext *s,
cbp |= 1 << (5 - i);
}
- if (cbp == 0 && !first_mb && (mb_x != s->mb_width - 1 || (mb_y != s->mb_height - 1 && s->codec_id == CODEC_ID_MPEG1VIDEO)) &&
- ((s->pict_type == P_TYPE && s->mv_type == MV_TYPE_16X16 && (motion_x | motion_y) == 0) ||
+ if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 &&
+ (mb_x != s->mb_width - 1 || (mb_y != s->mb_height - 1 && s->codec_id == CODEC_ID_MPEG1VIDEO)) &&
+ ((s->pict_type == P_TYPE && (motion_x | motion_y) == 0) ||
(s->pict_type == B_TYPE && s->mv_dir == s->last_mv_dir && (((s->mv_dir & MV_DIR_FORWARD) ? ((s->mv[0][0][0] - s->last_mv[0][0][0])|(s->mv[0][0][1] - s->last_mv[0][0][1])) : 0) |
((s->mv_dir & MV_DIR_BACKWARD) ? ((s->mv[1][0][0] - s->last_mv[1][0][0])|(s->mv[1][0][1] - s->last_mv[1][0][1])) : 0)) == 0))) {
s->mb_skip_run++;
@@ -798,7 +805,7 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
else{
int val, bit_size, range, code;
- bit_size = s->f_code - 1;
+ bit_size = f_code - 1;
range = 1 << bit_size;
val=mv;
@@ -955,7 +962,7 @@ static VLC mb_ptype_vlc;
static VLC mb_btype_vlc;
static VLC mb_pat_vlc;
-static void init_vlcs()
+static void init_vlcs(void)
{
static int done = 0;
@@ -1754,11 +1761,17 @@ typedef struct Mpeg1Context {
int repeat_field; /* true if we must repeat the field */
AVPanScan pan_scan; /** some temporary storage for the panscan */
int slice_count;
+ int swap_uv;//indicate VCR2
+ int save_aspect_info;
+
} Mpeg1Context;
static int mpeg_decode_init(AVCodecContext *avctx)
{
Mpeg1Context *s = avctx->priv_data;
+ MpegEncContext *s2 = &s->mpeg_enc_ctx;
+
+ MPV_decode_defaults(s2);
s->mpeg_enc_ctx.avctx= avctx;
s->mpeg_enc_ctx.flags= avctx->flags;
@@ -1773,6 +1786,122 @@ static int mpeg_decode_init(AVCodecContext *avctx)
return 0;
}
+static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
+ const uint8_t *new_perm){
+uint16_t temp_matrix[64];
+int i;
+
+ memcpy(temp_matrix,matrix,64*sizeof(uint16_t));
+
+ for(i=0;i<64;i++){
+ matrix[new_perm[i]] = temp_matrix[old_perm[i]];
+ }
+}
+
+//Call this function when we know all parameters
+//it may be called in different places for mpeg1 and mpeg2
+static int mpeg_decode_postinit(AVCodecContext *avctx){
+Mpeg1Context *s1 = avctx->priv_data;
+MpegEncContext *s = &s1->mpeg_enc_ctx;
+uint8_t old_permutation[64];
+
+
+ if (
+ (s1->mpeg_enc_ctx_allocated == 0)||
+ avctx->width != s->width ||
+ avctx->height != s->height||
+// s1->save_aspect_info != avctx->aspect_ratio_info||
+ 0)
+ {
+
+ if (s1->mpeg_enc_ctx_allocated) {
+ MPV_common_end(s);
+ }
+
+ if( (s->width == 0 )||(s->height == 0))
+ return -2;
+
+ avctx->width = s->width;
+ avctx->height = s->height;
+ avctx->bit_rate = s->bit_rate;
+ s1->save_aspect_info = s->aspect_ratio_info;
+
+ //low_delay may be forced, in this case we will have B frames
+ //that behave like P frames
+ avctx->has_b_frames = !(s->low_delay);
+
+ if(avctx->sub_id==1){//s->codec_id==avctx->codec_id==CODEC_ID
+ //mpeg1 fps
+ avctx->frame_rate = frame_rate_tab[s->frame_rate_index].num;
+ avctx->frame_rate_base= frame_rate_tab[s->frame_rate_index].den;
+ //mpeg1 aspect
+ avctx->sample_aspect_ratio= av_d2q(
+ 1.0/mpeg1_aspect[s->aspect_ratio_info], 255);
+
+ }else{//mpeg2
+ //mpeg2 fps
+ av_reduce(
+ &s->avctx->frame_rate,
+ &s->avctx->frame_rate_base,
+ frame_rate_tab[s->frame_rate_index].num * (s->frame_rate_ext_n+1),
+ frame_rate_tab[s->frame_rate_index].den * (s->frame_rate_ext_d+1),
+ 1<<30);
+ //mpeg2 aspect
+ if(s->aspect_ratio_info > 1){
+ if( (s1->pan_scan.width == 0 )||(s1->pan_scan.height == 0) ){
+ s->avctx->sample_aspect_ratio=
+ av_div_q(
+ mpeg2_aspect[s->aspect_ratio_info],
+ (AVRational){s->width, s->height}
+ );
+ }else{
+ s->avctx->sample_aspect_ratio=
+ av_div_q(
+ mpeg2_aspect[s->aspect_ratio_info],
+ (AVRational){s1->pan_scan.width, s1->pan_scan.height}
+ );
+ }
+ }else{
+ s->avctx->sample_aspect_ratio=
+ mpeg2_aspect[s->aspect_ratio_info];
+ }
+ }//mpeg2
+
+ if(avctx->xvmc_acceleration){
+ avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
+ }else{
+ if(s->chroma_format < 2){
+ avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_420);
+ }else
+ if(s->chroma_format == 2){
+ avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_422);
+ }else
+ if(s->chroma_format > 2){
+ avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_444);
+ }
+ }
+ //until then pix_fmt may be changed right after codec init
+ if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT )
+ if( avctx->idct_algo == FF_IDCT_AUTO )
+ avctx->idct_algo = FF_IDCT_SIMPLE;
+
+ //quantization matrixes may need reordering
+ //if dct permutation is changed
+ memcpy(old_permutation,s->dsp.idct_permutation,64*sizeof(uint8_t));
+
+ if (MPV_common_init(s) < 0)
+ return -2;
+
+ quant_matrix_rebuild(s->intra_matrix, old_permutation,s->dsp.idct_permutation);
+ quant_matrix_rebuild(s->inter_matrix, old_permutation,s->dsp.idct_permutation);
+ quant_matrix_rebuild(s->chroma_intra_matrix,old_permutation,s->dsp.idct_permutation);
+ quant_matrix_rebuild(s->chroma_inter_matrix,old_permutation,s->dsp.idct_permutation);
+
+ s1->mpeg_enc_ctx_allocated = 1;
+ }
+ return 0;
+}
+
/* return the 8 bit start code value and update the search
state. Return -1 if no start code found */
static int find_start_code(const uint8_t **pbuf_ptr, const uint8_t *buf_end)
@@ -1807,6 +1936,9 @@ static int mpeg1_decode_picture(AVCodecContext *avctx,
MpegEncContext *s = &s1->mpeg_enc_ctx;
int ref, f_code, vbv_delay;
+ if(mpeg_decode_postinit(s->avctx) < 0)
+ return -2;
+
init_get_bits(&s->gb, buf, buf_size*8);
ref = get_bits(&s->gb, 10); /* temporal ref */
@@ -1845,7 +1977,6 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
{
int horiz_size_ext, vert_size_ext;
int bit_rate_ext;
- int frame_rate_ext_n, frame_rate_ext_d;
int level, profile;
skip_bits(&s->gb, 1); /* profil and level esc*/
@@ -1865,32 +1996,17 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
s->low_delay = get_bits1(&s->gb);
if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1;
- frame_rate_ext_n = get_bits(&s->gb, 2);
- frame_rate_ext_d = get_bits(&s->gb, 5);
- av_reduce(
- &s->avctx->frame_rate,
- &s->avctx->frame_rate_base,
- frame_rate_tab[s->frame_rate_index].num * (frame_rate_ext_n+1),
- frame_rate_tab[s->frame_rate_index].den * (frame_rate_ext_d+1),
- 1<<30);
+ s->frame_rate_ext_n = get_bits(&s->gb, 2);
+ s->frame_rate_ext_d = get_bits(&s->gb, 5);
dprintf("sequence extension\n");
s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG2VIDEO;
s->avctx->sub_id = 2; /* indicates mpeg2 found */
- if(s->aspect_ratio_info <= 1)
- s->avctx->sample_aspect_ratio= mpeg2_aspect[s->aspect_ratio_info];
- else{
- s->avctx->sample_aspect_ratio=
- av_div_q(
- mpeg2_aspect[s->aspect_ratio_info],
- (AVRational){s->width, s->height}
- );
- }
-
if(s->avctx->debug & FF_DEBUG_PICT_INFO)
av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n",
profile, level, s->avctx->rc_buffer_size, s->bit_rate);
+
}
static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1)
@@ -1912,14 +2028,7 @@ static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1)
s1->pan_scan.width= 16*w;
s1->pan_scan.height=16*h;
-
- if(s->aspect_ratio_info > 1)
- s->avctx->sample_aspect_ratio=
- av_div_q(
- mpeg2_aspect[s->aspect_ratio_info],
- (AVRational){w, h}
- );
-
+
if(s->avctx->debug & FF_DEBUG_PICT_INFO)
av_log(s->avctx, AV_LOG_DEBUG, "sde w:%d, h:%d\n", w, h);
}
@@ -1927,9 +2036,23 @@ static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1)
static void mpeg_decode_picture_display_extension(Mpeg1Context *s1)
{
MpegEncContext *s= &s1->mpeg_enc_ctx;
- int i;
-
- for(i=0; i<1; i++){ //FIXME count
+ int i,nofco;
+
+ nofco = 1;
+ if(s->progressive_sequence){
+ if(s->repeat_first_field){
+ nofco++;
+ if(s->top_field_first)
+ nofco++;
+ }
+ }else{
+ if(s->picture_structure == PICT_FRAME){
+ nofco++;
+ if(s->repeat_first_field)
+ nofco++;
+ }
+ }
+ for(i=0; i<nofco; i++){
s1->pan_scan.position[i][0]= get_sbits(&s->gb, 16);
skip_bits(&s->gb, 1); //marker
s1->pan_scan.position[i][1]= get_sbits(&s->gb, 16);
@@ -2134,8 +2257,8 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
s->resync_mb_x=
s->resync_mb_y= -1;
- if (mb_y >= s->mb_height){
- av_log(s->avctx, AV_LOG_ERROR, "slice below image (%d >= %d)\n", s->mb_y, s->mb_height);
+ if (mb_y<<field_pic >= s->mb_height){
+ av_log(s->avctx, AV_LOG_ERROR, "slice below image (%d >= %d)\n", mb_y, s->mb_height);
return -1;
}
@@ -2208,8 +2331,8 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
return -1;
if(s->current_picture.motion_val[0] && !s->encoding){ //note motion_val is normally NULL unless we want to extract the MVs
- const int wrap = field_pic ? 2*s->block_wrap[0] : s->block_wrap[0];
- int xy = s->mb_x*2 + 1 + (s->mb_y*2 +1)*wrap;
+ const int wrap = field_pic ? 2*s->b8_stride : s->b8_stride;
+ int xy = s->mb_x*2 + s->mb_y*2*wrap;
int motion_x, motion_y, dir, i;
if(field_pic && !s->first_field)
xy += wrap/2;
@@ -2218,18 +2341,20 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
for(dir=0; dir<2; dir++){
if (s->mb_intra || (dir==1 && s->pict_type != B_TYPE)) {
motion_x = motion_y = 0;
- }else if (s->mv_type == MV_TYPE_16X16){
+ }else if (s->mv_type == MV_TYPE_16X16 || (s->mv_type == MV_TYPE_FIELD && field_pic)){
motion_x = s->mv[dir][0][0];
motion_y = s->mv[dir][0][1];
} else /*if ((s->mv_type == MV_TYPE_FIELD) || (s->mv_type == MV_TYPE_16X8))*/ {
motion_x = s->mv[dir][i][0];
motion_y = s->mv[dir][i][1];
}
-
+
s->current_picture.motion_val[dir][xy ][0] = motion_x;
s->current_picture.motion_val[dir][xy ][1] = motion_y;
s->current_picture.motion_val[dir][xy + 1][0] = motion_x;
s->current_picture.motion_val[dir][xy + 1][1] = motion_y;
+ s->current_picture.ref_index [dir][xy ]=
+ s->current_picture.ref_index [dir][xy + 1]= s->field_select[dir][i];
}
xy += wrap;
}
@@ -2379,59 +2504,27 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
{
Mpeg1Context *s1 = avctx->priv_data;
MpegEncContext *s = &s1->mpeg_enc_ctx;
- int width, height, i, v, j;
- float aspect;
+ int width,height;
+ int i, v, j;
init_get_bits(&s->gb, buf, buf_size*8);
width = get_bits(&s->gb, 12);
height = get_bits(&s->gb, 12);
+ if (width <= 0 || height <= 0 ||
+ (width % 2) != 0 || (height % 2) != 0)
+ return -1;
s->aspect_ratio_info= get_bits(&s->gb, 4);
if (s->aspect_ratio_info == 0)
return -1;
- aspect= 1.0/mpeg1_aspect[s->aspect_ratio_info];
- avctx->sample_aspect_ratio= av_d2q(aspect, 255);
-
s->frame_rate_index = get_bits(&s->gb, 4);
if (s->frame_rate_index == 0 || s->frame_rate_index > 13)
return -1;
s->bit_rate = get_bits(&s->gb, 18) * 400;
if (get_bits1(&s->gb) == 0) /* marker */
return -1;
- if (width <= 0 || height <= 0 ||
- (width % 2) != 0 || (height % 2) != 0)
- return -1;
- if (width != s->width ||
- height != s->height) {
- /* start new mpeg1 context decoding */
- s->out_format = FMT_MPEG1;
- if (s1->mpeg_enc_ctx_allocated) {
- MPV_common_end(s);
- }
- s->width = width;
- s->height = height;
- avctx->has_b_frames= 1;
- avctx->width = width;
- avctx->height = height;
- avctx->frame_rate = frame_rate_tab[s->frame_rate_index].num;
- avctx->frame_rate_base= frame_rate_tab[s->frame_rate_index].den;
- avctx->bit_rate = s->bit_rate;
-
- if(avctx->xvmc_acceleration){
- avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
- }else{
- avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_420);
- }
-
- if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT )
- if( avctx->idct_algo == FF_IDCT_AUTO )
- avctx->idct_algo = FF_IDCT_SIMPLE;
-
- if (MPV_common_init(s) < 0)
- return -1;
- s1->mpeg_enc_ctx_allocated = 1;
- s->swap_uv = 0;//just in case vcr2 and mpeg2 stream have been concatinated
- }
+ s->width = width;
+ s->height = height;
s->avctx->rc_buffer_size= get_bits(&s->gb, 10) * 1024*16;
skip_bits(&s->gb, 1);
@@ -2444,19 +2537,21 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
av_log(s->avctx, AV_LOG_ERROR, "intra matrix damaged\n");
return -1;
}
- j = s->intra_scantable.permutated[i];
+ j = s->dsp.idct_permutation[ ff_zigzag_direct[i] ];
s->intra_matrix[j] = v;
s->chroma_intra_matrix[j] = v;
}
#ifdef DEBUG
+/*
dprintf("intra matrix present\n");
for(i=0;i<64;i++)
- dprintf(" %d", s->intra_matrix[s->intra_scantable.permutated[i]]);
+ dprintf(" %d", s->intra_matrix[s->dsp.idct_permutation[i]);
printf("\n");
+*/
#endif
} else {
for(i=0;i<64;i++) {
- int j= s->dsp.idct_permutation[i];
+ j = s->dsp.idct_permutation[i];
v = ff_mpeg1_default_intra_matrix[i];
s->intra_matrix[j] = v;
s->chroma_intra_matrix[j] = v;
@@ -2469,15 +2564,17 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
av_log(s->avctx, AV_LOG_ERROR, "inter matrix damaged\n");
return -1;
}
- j = s->intra_scantable.permutated[i];
+ j = s->dsp.idct_permutation[ ff_zigzag_direct[i] ];
s->inter_matrix[j] = v;
s->chroma_inter_matrix[j] = v;
}
#ifdef DEBUG
+/*
dprintf("non intra matrix present\n");
for(i=0;i<64;i++)
- dprintf(" %d", s->inter_matrix[s->intra_scantable.permutated[i]]);
+ dprintf(" %d", s->inter_matrix[s->dsp.idct_permutation[i]);
printf("\n");
+*/
#endif
} else {
for(i=0;i<64;i++) {
@@ -2501,6 +2598,8 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
s->chroma_format = 1;
s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG1VIDEO;
avctx->sub_id = 1; /* indicates mpeg1 */
+ s->out_format = FMT_MPEG1;
+ s->swap_uv = 0;//AFAIK VCR2 don't have SEQ_HEADER
if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1;
if(s->avctx->debug & FF_DEBUG_PICT_INFO)
@@ -2593,6 +2692,36 @@ static void mpeg_decode_user_data(AVCodecContext *avctx,
}
}
+static void mpeg_decode_gop(AVCodecContext *avctx,
+ const uint8_t *buf, int buf_size){
+ Mpeg1Context *s1 = avctx->priv_data;
+ MpegEncContext *s = &s1->mpeg_enc_ctx;
+
+ int drop_frame_flag;
+ int time_code_hours, time_code_minutes;
+ int time_code_seconds, time_code_pictures;
+ int broken_link;
+
+ init_get_bits(&s->gb, buf, buf_size*8);
+
+ drop_frame_flag = get_bits1(&s->gb);
+
+ time_code_hours=get_bits(&s->gb,5);
+ time_code_minutes = get_bits(&s->gb,6);
+ skip_bits1(&s->gb);//marker bit
+ time_code_seconds = get_bits(&s->gb,6);
+ time_code_pictures = get_bits(&s->gb,6);
+
+ /*broken_link indicate that after editing the
+ reference frames of the first B-Frames after GOP I-Frame
+ are missing (open gop)*/
+ broken_link = get_bits1(&s->gb);
+
+ if(s->avctx->debug & FF_DEBUG_PICT_INFO)
+ av_log(s->avctx, AV_LOG_DEBUG, "GOP (%2d:%02d:%02d.[%02d]) broken_link=%d\n",
+ time_code_hours, time_code_minutes, time_code_seconds,
+ time_code_pictures, broken_link);
+}
/**
* finds the end of the current frame in the bitstream.
* @return the position of the first byte of the next frame, or -1
@@ -2706,7 +2835,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
input_size = buf_end - buf_ptr;
if(avctx->debug & FF_DEBUG_STARTCODE){
- av_log(avctx, AV_LOG_DEBUG, "%3X at %d left %d\n", start_code, buf_ptr-buf, input_size);
+ av_log(avctx, AV_LOG_DEBUG, "%3X at %zd left %d\n", start_code, buf_ptr-buf, input_size);
}
/* prepare data for next start code */
@@ -2731,6 +2860,8 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
break;
case GOP_START_CODE:
s2->first_field=0;
+ mpeg_decode_gop(avctx,
+ buf_ptr, input_size);
break;
default:
if (start_code >= SLICE_MIN_START_CODE &&
diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c
index d01405f54..a9eed4e36 100644
--- a/src/libffmpeg/libavcodec/mpegaudiodec.c
+++ b/src/libffmpeg/libavcodec/mpegaudiodec.c
@@ -23,7 +23,6 @@
*/
//#define DEBUG
-#include <math.h>
#include "avcodec.h"
#include "mpegaudio.h"
#include "dsputil.h"
@@ -401,11 +400,11 @@ static int decode_init(AVCodecContext * avctx)
}
/* compute n ^ (4/3) and store it in mantissa/exp format */
- if (!av_mallocz_static(&table_4_3_exp,
- TABLE_4_3_SIZE * sizeof(table_4_3_exp[0])))
+ table_4_3_exp= av_mallocz_static(TABLE_4_3_SIZE * sizeof(table_4_3_exp[0]));
+ if(!table_4_3_exp)
return -1;
- if (!av_mallocz_static(&table_4_3_value,
- TABLE_4_3_SIZE * sizeof(table_4_3_value[0])))
+ table_4_3_value= av_mallocz_static(TABLE_4_3_SIZE * sizeof(table_4_3_value[0]));
+ if(!table_4_3_value)
return -1;
int_pow_init();
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 32a92917c..bef088a41 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -283,7 +283,9 @@ static void copy_picture(Picture *dst, Picture *src){
dst->type= FF_BUFFER_TYPE_COPY;
}
-static void copy_picture_attributes(AVFrame *dst, AVFrame *src){
+static void copy_picture_attributes(MpegEncContext *s, AVFrame *dst, AVFrame *src){
+ int i;
+
dst->pict_type = src->pict_type;
dst->quality = src->quality;
dst->coded_picture_number = src->coded_picture_number;
@@ -292,6 +294,32 @@ static void copy_picture_attributes(AVFrame *dst, AVFrame *src){
dst->pts = src->pts;
dst->interlaced_frame = src->interlaced_frame;
dst->top_field_first = src->top_field_first;
+
+ if(s->avctx->me_threshold){
+ if(!src->motion_val[0])
+ av_log(s->avctx, AV_LOG_ERROR, "AVFrame.motion_val not set!\n");
+ if(!src->mb_type)
+ av_log(s->avctx, AV_LOG_ERROR, "AVFrame.mb_type not set!\n");
+ if(!src->ref_index[0])
+ av_log(s->avctx, AV_LOG_ERROR, "AVFrame.ref_index not set!\n");
+ if(src->motion_subsample_log2 != dst->motion_subsample_log2)
+ av_log(s->avctx, AV_LOG_ERROR, "AVFrame.motion_subsample_log2 doesnt match! (%d!=%d)\n",
+ src->motion_subsample_log2, dst->motion_subsample_log2);
+
+ memcpy(dst->mb_type, src->mb_type, s->mb_stride * s->mb_height * sizeof(dst->mb_type[0]));
+
+ for(i=0; i<2; i++){
+ int stride= ((16*s->mb_width )>>src->motion_subsample_log2) + 1;
+ int height= ((16*s->mb_height)>>src->motion_subsample_log2);
+
+ if(src->motion_val[i] && src->motion_val[i] != dst->motion_val[i]){
+ memcpy(dst->motion_val[i], src->motion_val[i], 2*stride*height*sizeof(int16_t));
+ }
+ if(src->ref_index[i] && src->ref_index[i] != dst->ref_index[i]){
+ memcpy(dst->ref_index[i], src->ref_index[i], s->b8_stride*2*s->mb_height*sizeof(int8_t));
+ }
+ }
+ }
}
/**
@@ -350,13 +378,14 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){
for(i=0; i<2; i++){
CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b4_array_size+2) * sizeof(int16_t))
pic->motion_val[i]= pic->motion_val_base[i]+2;
- CHECKED_ALLOCZ(pic->ref_index[i] , b8_array_size * sizeof(uint8_t))
+ CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t))
}
pic->motion_subsample_log2= 2;
}else if(s->out_format == FMT_H263 || s->encoding || (s->avctx->debug&FF_DEBUG_MV) || (s->avctx->debug_mv)){
for(i=0; i<2; i++){
- CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t)*2) //FIXME
+ CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t))
pic->motion_val[i]= pic->motion_val_base[i]+2;
+ CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t))
}
pic->motion_subsample_log2= 3;
}
@@ -510,7 +539,68 @@ static void update_duplicate_context_after_me(MpegEncContext *dst, MpegEncContex
#undef COPY
}
-/* init common structure for both encoder and decoder */
+/**
+ * sets the given MpegEncContext to common defaults (same for encoding and decoding).
+ * the changed fields will not depend upon the prior state of the MpegEncContext.
+ */
+static void MPV_common_defaults(MpegEncContext *s){
+ s->y_dc_scale_table=
+ s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
+ s->chroma_qscale_table= ff_default_chroma_qscale_table;
+ s->progressive_frame= 1;
+ s->progressive_sequence= 1;
+ s->picture_structure= PICT_FRAME;
+
+ s->coded_picture_number = 0;
+ s->picture_number = 0;
+ s->input_picture_number = 0;
+
+ s->picture_in_gop_number = 0;
+
+ s->f_code = 1;
+ s->b_code = 1;
+}
+
+/**
+ * sets the given MpegEncContext to defaults for decoding.
+ * the changed fields will not depend upon the prior state of the MpegEncContext.
+ */
+void MPV_decode_defaults(MpegEncContext *s){
+ MPV_common_defaults(s);
+}
+
+/**
+ * sets the given MpegEncContext to defaults for encoding.
+ * the changed fields will not depend upon the prior state of the MpegEncContext.
+ */
+
+#ifdef CONFIG_ENCODERS
+void MPV_encode_defaults(MpegEncContext *s){
+ static int done=0;
+
+ MPV_common_defaults(s);
+
+ if(!done){
+ int i;
+ done=1;
+
+ default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) );
+ memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1));
+ memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1));
+
+ for(i=-16; i<16; i++){
+ default_fcode_tab[i + MAX_MV]= 1;
+ }
+ }
+ s->me.mv_penalty= default_mv_penalty;
+ s->fcode_tab= default_fcode_tab;
+}
+#endif //CONFIG_ENCODERS
+
+/**
+ * init common structure for both encoder and decoder.
+ * this assumes that some variables like width/height are already set
+ */
int MPV_common_init(MpegEncContext *s)
{
int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y;
@@ -538,31 +628,14 @@ int MPV_common_init(MpegEncContext *s)
s->block_wrap[0]=
s->block_wrap[1]=
s->block_wrap[2]=
- s->block_wrap[3]= s->mb_width*2 + 2;
+ s->block_wrap[3]= s->b8_stride;
s->block_wrap[4]=
- s->block_wrap[5]= s->mb_width + 2;
-
- s->y_dc_scale_table=
- s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
- s->chroma_qscale_table= ff_default_chroma_qscale_table;
- if( s->codec_id != CODEC_ID_MPEG1VIDEO &&
- s->codec_id != CODEC_ID_MPEG2VIDEO)
- {
- /* default structure is frame */
- s->progressive_frame= 1;
- s->picture_structure= PICT_FRAME;
-
- s->y_dc_scale_table=
- s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
- if (!s->encoding)
- s->progressive_sequence= 1;
- }
- s->coded_picture_number = 0;
-
- y_size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
- c_size = (s->mb_width + 2) * (s->mb_height + 2);
+ s->block_wrap[5]= s->mb_stride;
+
+ y_size = s->b8_stride * (2 * s->mb_height + 1);
+ c_size = s->mb_stride * (s->mb_height + 1);
yc_size = y_size + 2 * c_size;
-
+
/* convert fourcc to upper case */
s->avctx->codec_tag= toupper( s->avctx->codec_tag &0xFF)
+ (toupper((s->avctx->codec_tag>>8 )&0xFF)<<8 )
@@ -642,12 +715,14 @@ int MPV_common_init(MpegEncContext *s)
}
if (s->out_format == FMT_H263) {
/* ac values */
- CHECKED_ALLOCZ(s->ac_val[0], yc_size * sizeof(int16_t) * 16);
- s->ac_val[1] = s->ac_val[0] + y_size;
+ CHECKED_ALLOCZ(s->ac_val_base, yc_size * sizeof(int16_t) * 16);
+ s->ac_val[0] = s->ac_val_base + s->b8_stride + 1;
+ s->ac_val[1] = s->ac_val_base + y_size + s->mb_stride + 1;
s->ac_val[2] = s->ac_val[1] + c_size;
/* cbp values */
- CHECKED_ALLOCZ(s->coded_block, y_size);
+ CHECKED_ALLOCZ(s->coded_block_base, y_size);
+ s->coded_block= s->coded_block_base + s->b8_stride + 1;
/* divx501 bitstream reorder buffer */
CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE);
@@ -660,20 +735,18 @@ int MPV_common_init(MpegEncContext *s)
if (s->h263_pred || s->h263_plus || !s->encoding) {
/* dc values */
//MN: we need these for error resilience of intra-frames
- CHECKED_ALLOCZ(s->dc_val[0], yc_size * sizeof(int16_t));
- s->dc_val[1] = s->dc_val[0] + y_size;
+ CHECKED_ALLOCZ(s->dc_val_base, yc_size * sizeof(int16_t));
+ s->dc_val[0] = s->dc_val_base + s->b8_stride + 1;
+ s->dc_val[1] = s->dc_val_base + y_size + s->mb_stride + 1;
s->dc_val[2] = s->dc_val[1] + c_size;
for(i=0;i<yc_size;i++)
- s->dc_val[0][i] = 1024;
+ s->dc_val_base[i] = 1024;
}
/* which mb is a intra block */
CHECKED_ALLOCZ(s->mbintra_table, mb_array_size);
memset(s->mbintra_table, 1, mb_array_size);
- /* default structure is frame */
- s->picture_structure = PICT_FRAME;
-
/* init macroblock skip table */
CHECKED_ALLOCZ(s->mbskip_table, mb_array_size+2);
//Note the +1 is for a quicker mpeg4 slice_end detection
@@ -748,9 +821,9 @@ void MPV_common_end(MpegEncContext *s)
av_freep(&s->p_field_select_table[i]);
}
- av_freep(&s->dc_val[0]);
- av_freep(&s->ac_val[0]);
- av_freep(&s->coded_block);
+ av_freep(&s->dc_val_base);
+ av_freep(&s->ac_val_base);
+ av_freep(&s->coded_block_base);
av_freep(&s->mbintra_table);
av_freep(&s->cbp_table);
av_freep(&s->pred_dir_table);
@@ -782,9 +855,9 @@ void MPV_common_end(MpegEncContext *s)
s->last_picture_ptr=
s->next_picture_ptr=
s->current_picture_ptr= NULL;
+
for(i=0; i<3; i++)
- if (s->visualization_buffer[i])
- av_free(s->visualization_buffer[i]);
+ av_freep(&s->visualization_buffer[i]);
}
#ifdef CONFIG_ENCODERS
@@ -795,6 +868,8 @@ int MPV_encode_init(AVCodecContext *avctx)
MpegEncContext *s = avctx->priv_data;
int i, dummy;
int chroma_h_shift, chroma_v_shift;
+
+ MPV_encode_defaults(s);
avctx->pix_fmt = PIX_FMT_YUV420P; // FIXME
@@ -850,8 +925,15 @@ int MPV_encode_init(AVCodecContext *avctx)
if(avctx->rc_min_rate && avctx->rc_max_rate != avctx->rc_min_rate){
av_log(avctx, AV_LOG_INFO, "Warning min_rate > 0 but min_rate != max_rate isnt recommanded!\n");
- }
+ }
+
+ if( s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate
+ && (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO)
+ && 90000LL * (avctx->rc_buffer_size-1) > s->avctx->rc_max_rate*0xFFFFLL){
+ av_log(avctx, AV_LOG_INFO, "Warning vbv_delay will be set to 0xFFFF (=VBR) as the specified vbv buffer is too large for the given bitrate!\n");
+ }
+
if((s->flags & CODEC_FLAG_4MV) && s->codec_id != CODEC_ID_MPEG4
&& s->codec_id != CODEC_ID_H263 && s->codec_id != CODEC_ID_H263P && s->codec_id != CODEC_ID_FLV1){
av_log(avctx, AV_LOG_ERROR, "4MV not supported by codec\n");
@@ -882,7 +964,13 @@ int MPV_encode_init(AVCodecContext *avctx)
av_log(avctx, AV_LOG_ERROR, "b frames not supported by codec\n");
return -1;
}
-
+
+ if((s->flags & (CODEC_FLAG_INTERLACED_DCT|CODEC_FLAG_INTERLACED_ME|CODEC_FLAG_ALT_SCAN))
+ && s->codec_id != CODEC_ID_MPEG4 && s->codec_id != CODEC_ID_MPEG2VIDEO){
+ av_log(avctx, AV_LOG_ERROR, "interlacing not supported by codec\n");
+ return -1;
+ }
+
if(s->mpeg_quant && s->codec_id != CODEC_ID_MPEG4){ //FIXME mpeg2 uses that too
av_log(avctx, AV_LOG_ERROR, "mpeg2 style quantization not supporetd by codec\n");
return -1;
@@ -1081,28 +1169,6 @@ int MPV_encode_init(AVCodecContext *avctx)
return -1;
}
- { /* set up some save defaults, some codecs might override them later */
- static int done=0;
- if(!done){
- int i;
- done=1;
-
- default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) );
- memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1));
- memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1));
-
- for(i=-16; i<16; i++){
- default_fcode_tab[i + MAX_MV]= 1;
- }
- }
- }
- s->me.mv_penalty= default_mv_penalty;
- s->fcode_tab= default_fcode_tab;
-
- /* dont use mv_penalty table for crap MV as it would be confused */
- //FIXME remove after fixing / removing old ME
- if (s->me_method < ME_EPZS) s->me.mv_penalty = default_mv_penalty;
-
s->encoding = 1;
/* init */
@@ -1122,22 +1188,22 @@ int MPV_encode_init(AVCodecContext *avctx)
ff_init_me(s);
#endif /* #if 0 */
+#ifdef CONFIG_ENCODERS
/* xine: do not need this for decode or MPEG-1 encoding modes */
#if 0
-#ifdef CONFIG_ENCODERS
#ifdef CONFIG_RISKY
if (s->out_format == FMT_H263)
h263_encode_init(s);
if(s->msmpeg4_version)
ff_msmpeg4_encode_init(s);
#endif
-#endif
#endif /* #if 0 */
/* xine: we do want this for MPEG-1 encoding */
if (s->out_format == FMT_MPEG1)
ff_mpeg1_encode_init(s);
+#endif
- /* init default q matrix */
+ /* init q matrix */
for(i=0;i<64;i++) {
int j= s->dsp.idct_permutation[i];
#ifdef CONFIG_RISKY
@@ -1170,14 +1236,7 @@ int MPV_encode_init(AVCodecContext *avctx)
if(ff_rate_control_init(s) < 0)
return -1;
-
- s->picture_number = 0;
- s->input_picture_number = 0;
- s->picture_in_gop_number = 0;
- /* motion detector init */
- s->f_code = 1;
- s->b_code = 1;
-
+
return 0;
}
@@ -1321,7 +1380,7 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
assert(s->last_picture_ptr==NULL || s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3);
/* mark&release old frames */
- if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr->data[0]) {
+ if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr != s->next_picture_ptr && s->last_picture_ptr->data[0]) {
avctx->release_buffer(avctx, (AVFrame*)s->last_picture_ptr);
/* release forgotten pictures */
@@ -1351,7 +1410,7 @@ alloc:
pic= (AVFrame*)&s->picture[i];
}
- pic->reference= s->pict_type != B_TYPE ? 3 : 0;
+ pic->reference= s->pict_type != B_TYPE && !s->dropable ? 3 : 0;
pic->coded_picture_number= s->coded_picture_number++;
@@ -1373,8 +1432,14 @@ alloc:
if(s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3){
if (s->pict_type != B_TYPE) {
s->last_picture_ptr= s->next_picture_ptr;
- s->next_picture_ptr= s->current_picture_ptr;
+ if(!s->dropable)
+ s->next_picture_ptr= s->current_picture_ptr;
}
+/* av_log(s->avctx, AV_LOG_DEBUG, "L%p N%p C%p L%p N%p C%p type:%d drop:%d\n", s->last_picture_ptr, s->next_picture_ptr,s->current_picture_ptr,
+ s->last_picture_ptr ? s->last_picture_ptr->data[0] : NULL,
+ s->next_picture_ptr ? s->next_picture_ptr->data[0] : NULL,
+ s->current_picture_ptr ? s->current_picture_ptr->data[0] : NULL,
+ s->pict_type, s->dropable);*/
if(s->last_picture_ptr) copy_picture(&s->last_picture, s->last_picture_ptr);
if(s->next_picture_ptr) copy_picture(&s->next_picture, s->next_picture_ptr);
@@ -1486,7 +1551,7 @@ void MPV_frame_end(MpegEncContext *s)
* @param color color of the arrow
*/
static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){
- int t, x, y, f;
+ int t, x, y, fr, f;
sx= clip(sx, 0, w-1);
sy= clip(sy, 0, h-1);
@@ -1504,8 +1569,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h
ex-= sx;
f= ((ey-sy)<<16)/ex;
for(x= 0; x <= ex; x++){
- y= ((x*f) + (1<<15))>>16;
- buf[y*stride + x]+= color;
+ y = (x*f)>>16;
+ fr= (x*f)&0xFFFF;
+ buf[ y *stride + x]+= (color*(0x10000-fr))>>16;
+ buf[(y+1)*stride + x]+= (color* fr )>>16;
}
}else{
if(sy > ey){
@@ -1517,8 +1584,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h
if(ey) f= ((ex-sx)<<16)/ey;
else f= 0;
for(y= 0; y <= ey; y++){
- x= ((y*f) + (1<<15))>>16;
- buf[y*stride + x]+= color;
+ x = (y*f)>>16;
+ fr= (y*f)&0xFFFF;
+ buf[y*stride + x ]+= (color*(0x10000-fr))>>16;;
+ buf[y*stride + x+1]+= (color* fr )>>16;;
}
}
}
@@ -1680,12 +1749,13 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
if(!USES_LIST(pict->mb_type[mb_index], direction))
continue;
+ //FIXME for h264
if(IS_8X8(pict->mb_type[mb_index])){
int i;
for(i=0; i<4; i++){
int sx= mb_x*16 + 4 + 8*(i&1);
int sy= mb_y*16 + 4 + 8*(i>>1);
- int xy= 1 + mb_x*2 + (i&1) + (mb_y*2 + 1 + (i>>1))*(s->mb_width*2 + 2);
+ int xy= mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride;
int mx= (pict->motion_val[direction][xy][0]>>shift) + sx;
int my= (pict->motion_val[direction][xy][1]>>shift) + sy;
draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
@@ -1695,15 +1765,19 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
for(i=0; i<2; i++){
int sx=mb_x*16 + 8;
int sy=mb_y*16 + 4 + 8*i;
- int xy=1 + mb_x*2 + (mb_y*2 + 1 + i)*(s->mb_width*2 + 2);
- int mx=(pict->motion_val[direction][xy][0]>>shift) + sx;
- int my=(pict->motion_val[direction][xy][1]>>shift) + sy;
- draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
+ int xy= mb_x*2 + (mb_y*2 + i)*s->b8_stride;
+ int mx=(pict->motion_val[direction][xy][0]>>shift);
+ int my=(pict->motion_val[direction][xy][1]>>shift);
+
+ if(IS_INTERLACED(pict->mb_type[mb_index]))
+ my*=2;
+
+ draw_arrow(ptr, sx, sy, mx+sx, my+sy, s->width, s->height, s->linesize, 100);
}
}else{
int sx= mb_x*16 + 8;
int sy= mb_y*16 + 8;
- int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2);
+ int xy= mb_x*2 + mb_y*2*s->b8_stride;
int mx= (pict->motion_val[direction][xy][0]>>shift) + sx;
int my= (pict->motion_val[direction][xy][1]>>shift) + sy;
draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
@@ -1880,7 +1954,7 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
}
}
}
- copy_picture_attributes(pic, pic_arg);
+ copy_picture_attributes(s, pic, pic_arg);
pic->display_picture_number= s->input_picture_number++;
if(pic->pts != AV_NOPTS_VALUE){
@@ -2009,11 +2083,12 @@ static void select_input_picture(MpegEncContext *s){
s->reordered_input_picture[0]->data[i]= NULL;
s->reordered_input_picture[0]->type= 0;
- copy_picture_attributes((AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]);
pic->reference = s->reordered_input_picture[0]->reference;
alloc_picture(s, pic, 0);
+ copy_picture_attributes(s, (AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]);
+
s->current_picture_ptr= pic;
}else{
// input is not a shared pix -> reuse buffer for current_pix
@@ -2125,7 +2200,8 @@ int MPV_encode_picture(AVCodecContext *avctx,
}
/* update mpeg1/2 vbv_delay for CBR */
- if(s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate && s->out_format == FMT_MPEG1){
+ if(s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate && s->out_format == FMT_MPEG1
+ && 90000LL * (avctx->rc_buffer_size-1) <= s->avctx->rc_max_rate*0xFFFFLL){
int vbv_delay;
assert(s->repeat_first_field==0);
@@ -2432,9 +2508,17 @@ if(s->quarter_sample)
src_y = s->mb_y*(16>>field_based) + (motion_y >> 1);
if (s->out_format == FMT_H263) {
- uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1);
- uvsrc_x = src_x>>1;
- uvsrc_y = src_y>>1;
+ if((s->workaround_bugs & FF_BUG_HPEL_CHROMA) && field_based){
+ mx = (motion_x>>1)|(motion_x&1);
+ my = motion_y >>1;
+ uvdxy = ((my & 1) << 1) | (mx & 1);
+ uvsrc_x = s->mb_x* 8 + (mx >> 1);
+ uvsrc_y = s->mb_y*(8>>field_based) + (my >> 1);
+ }else{
+ uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1);
+ uvsrc_x = src_x>>1;
+ uvsrc_y = src_y>>1;
+ }
} else {
mx = motion_x / 2;
my = motion_y / 2;
@@ -2736,8 +2820,8 @@ static inline void MPV_motion(MpegEncContext *s,
if(s->obmc && s->pict_type != B_TYPE){
int16_t mv_cache[4][4][2];
const int xy= s->mb_x + s->mb_y*s->mb_stride;
- const int mot_stride= s->mb_width*2 + 2;
- const int mot_xy= 1 + mb_x*2 + (mb_y*2 + 1)*mot_stride;
+ const int mot_stride= s->b8_stride;
+ const int mot_xy= mb_x*2 + mb_y*2*mot_stride;
assert(!s->mb_skiped);
@@ -2993,7 +3077,7 @@ static inline void add_dequant_dct(MpegEncContext *s,
*/
void ff_clean_intra_table_entries(MpegEncContext *s)
{
- int wrap = s->block_wrap[0];
+ int wrap = s->b8_stride;
int xy = s->block_index[0];
s->dc_val[0][xy ] =
@@ -3010,15 +3094,15 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
s->coded_block[xy + 1 + wrap] = 0;
}
/* chroma */
- wrap = s->block_wrap[4];
- xy = s->mb_x + 1 + (s->mb_y + 1) * wrap;
+ wrap = s->mb_stride;
+ xy = s->mb_x + s->mb_y * wrap;
s->dc_val[1][xy] =
s->dc_val[2][xy] = 1024;
/* ac pred */
memset(s->ac_val[1][xy], 0, 16 * sizeof(int16_t));
memset(s->ac_val[2][xy], 0, 16 * sizeof(int16_t));
- s->mbintra_table[s->mb_x + s->mb_y*s->mb_stride]= 0;
+ s->mbintra_table[xy]= 0;
}
/* generic function called after a macroblock has been parsed by the
@@ -3338,12 +3422,12 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
const int linesize= s->current_picture.linesize[0]; //not s->linesize as this woulnd be wrong for field pics
const int uvlinesize= s->current_picture.linesize[1];
- s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
- s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1) + s->mb_x*2;
- s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2;
- s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2) + s->mb_x*2;
- s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
- s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
+ s->block_index[0]= s->b8_stride*(s->mb_y*2 ) - 2 + s->mb_x*2;
+ s->block_index[1]= s->b8_stride*(s->mb_y*2 ) - 1 + s->mb_x*2;
+ s->block_index[2]= s->b8_stride*(s->mb_y*2 + 1) - 2 + s->mb_x*2;
+ s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
+ s->block_index[4]= s->mb_stride*(s->mb_y + 1) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
+ s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
if(s->pict_type==B_TYPE && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME){
s->dest[0] = s->current_picture.data[0] + s->mb_x * 16 - 16;
@@ -3392,7 +3476,6 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
int dct_offset = s->linesize*8; //default for progressive frames
uint8_t *ptr_y, *ptr_cb, *ptr_cr;
int wrap_y, wrap_c;
- int emu=0;
for(i=0; i<6; i++) skip_dct[i]=0;
@@ -4065,8 +4148,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
ff_update_block_index(s);
/* write gob / video packet header */
-/* xine: do not need this for decode or MPEG-1 encoding modes */
-#if 0
#ifdef CONFIG_RISKY
if(s->rtp_mode){
int current_packet_size, is_gob_start;
@@ -4121,19 +4202,25 @@ static int encode_thread(AVCodecContext *c, void *arg){
s->avctx->rtp_callback(s->ptr_lastgob, current_packet_size, 0);
switch(s->codec_id){
+/* xine: do not need this for decode or MPEG-1 encoding modes */
+#if 0
case CODEC_ID_MPEG4:
ff_mpeg4_encode_video_packet_header(s);
ff_mpeg4_clean_buffers(s);
break;
+#endif /* #if 0 */
case CODEC_ID_MPEG1VIDEO:
case CODEC_ID_MPEG2VIDEO:
ff_mpeg1_encode_slice_header(s);
ff_mpeg1_clean_buffers(s);
break;
+/* xine: do not need this for decode or MPEG-1 encoding modes */
+#if 0
case CODEC_ID_H263:
case CODEC_ID_H263P:
h263_encode_gob_header(s, mb_y);
break;
+#endif /* #if 0 */
}
if(s->flags&CODEC_FLAG_PASS1){
@@ -4149,8 +4236,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
}
}
#endif
-#endif /* #if 0 */
-
if( (s->resync_mb_x == s->mb_x)
&& s->resync_mb_y+1 == s->mb_y){
@@ -4615,7 +4700,6 @@ static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src)
static void encode_picture(MpegEncContext *s, int picture_number)
{
- int mb_x, mb_y;
int i, j;
int bits;
@@ -4651,19 +4735,18 @@ static void encode_picture(MpegEncContext *s, int picture_number)
for(i=1; i<s->avctx->thread_count; i++){
ff_update_duplicate_context(s->thread_context[i], s);
}
-
+
+ ff_init_me(s);
+
/* Estimate motion for every MB */
if(s->pict_type != I_TYPE){
-/* xine: do not need this for decode or MPEG-1 encoding modes */
-#if 0
- if(s->pict_type != B_TYPE){
+ if(s->pict_type != B_TYPE && s->avctx->me_threshold==0){
if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
s->avctx->execute(s->avctx, pre_estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
}
}
s->avctx->execute(s->avctx, estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
-#endif /* #if 0 */
}else /* if(s->pict_type == I_TYPE) */{
/* I-Frame */
for(i=0; i<s->mb_stride*s->mb_height; i++)
@@ -5194,7 +5277,6 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise?
int prev_run=0;
int prev_level=0;
int qmul, qadd, start_i, last_non_zero, i, dc;
- const int esc_length= s->ac_esc_length;
uint8_t * length;
uint8_t * last_length;
int lambda;
@@ -5302,7 +5384,6 @@ STOP_TIMER("init rem[]")
#endif
for(;;){
int best_score=s->dsp.try_8x8basis(rem, weight, basis[0], 0);
- int nochange_score= best_score;
int best_coeff=0;
int best_change=0;
int run2, best_unquant_change, analyze_gradient;
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 171d66d83..cd42177f5 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -138,7 +138,6 @@ typedef struct Picture{
*/
uint8_t *interpolated[3];
int16_t (*motion_val_base[2])[2];
- int8_t *ref_index[2];
uint32_t *mb_type_base;
#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if theres just one type
#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4)
@@ -201,6 +200,10 @@ typedef struct MotionEstContext{
int co_located_mv[4][2]; ///< mv from last p frame for direct mode ME
int direct_basis_mv[4][2];
uint8_t *scratchpad; ///< data area for the me algo, so that the ME doesnt need to malloc/free
+ uint8_t *best_mb;
+ uint8_t *temp_mb[2];
+ uint8_t *temp;
+ int best_bits;
uint32_t *map; ///< map to avoid duplicate evaluations
uint32_t *score_map; ///< map to store the scores
int map_generation;
@@ -208,31 +211,32 @@ typedef struct MotionEstContext{
int penalty_factor;
int sub_penalty_factor;
int mb_penalty_factor;
+ int flags;
+ int sub_flags;
+ int mb_flags;
int pre_pass; ///< = 1 for the pre pass
int dia_size;
int xmin;
int xmax;
int ymin;
int ymax;
+ int pred_x;
+ int pred_y;
+ uint8_t *src[4][4];
+ uint8_t *ref[4][4];
+ int stride;
+ int uvstride;
+/* cmp, chroma_cmp;*/
+ op_pixels_func (*hpel_put)[4];
+ op_pixels_func (*hpel_avg)[4];
+ qpel_mc_func (*qpel_put)[16];
+ qpel_mc_func (*qpel_avg)[16];
uint8_t (*mv_penalty)[MAX_MV*2+1]; ///< amount of bits needed to encode a MV
+ uint8_t *current_mv_penalty;
int (*sub_motion_search)(struct MpegEncContext * s,
int *mx_ptr, int *my_ptr, int dmin,
- int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[6], int stride, int uvstride,
- int size, int h, uint8_t * const mv_penalty);
- int (*motion_search[7])(struct MpegEncContext * s,
- int *mx_ptr, int *my_ptr,
- int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[6], int stride, int uvstride, int16_t (*last_mv)[2],
- int ref_mv_scale, uint8_t * const mv_penalty);
- int (*pre_motion_search)(struct MpegEncContext * s,
- int *mx_ptr, int *my_ptr,
- int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[6], int stride, int uvstride, int16_t (*last_mv)[2],
- int ref_mv_scale, uint8_t * const mv_penalty);
- int (*get_mb_score)(struct MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3],
- uint8_t *ref_data[6], int stride, int uvstride,
- uint8_t * const mv_penalty);
+ int src_index, int ref_index,
+ int size, int h);
}MotionEstContext;
/**
@@ -321,13 +325,16 @@ typedef struct MpegEncContext {
Picture *current_picture_ptr; ///< pointer to the current picture
uint8_t *visualization_buffer[3]; //< temporary buffer vor MV visualization
int last_dc[3]; ///< last DC values for MPEG1
+ int16_t *dc_val_base;
int16_t *dc_val[3]; ///< used for mpeg4 DC prediction, all 3 arrays must be continuous
int16_t dc_cache[4*5];
int y_dc_scale, c_dc_scale;
uint8_t *y_dc_scale_table; ///< qscale -> y_dc_scale table
uint8_t *c_dc_scale_table; ///< qscale -> c_dc_scale table
const uint8_t *chroma_qscale_table; ///< qscale -> chroma_qscale (h263)
+ uint8_t *coded_block_base;
uint8_t *coded_block; ///< used for coded block pattern prediction (msmpeg4v3, wmv1)
+ int16_t (*ac_val_base)[16];
int16_t (*ac_val[3])[16]; ///< used for for mpeg4 AC prediction, all 3 arrays must be continuous
int ac_pred;
uint8_t *prev_pict_types; ///< previous picture types in bitstream order, used for mb skip
@@ -352,8 +359,9 @@ typedef struct MpegEncContext {
int adaptive_quant; ///< use adaptive quantization
int dquant; ///< qscale difference to prev qscale
int pict_type; ///< I_TYPE, P_TYPE, B_TYPE, ...
- int last_pict_type;
+ int last_pict_type; //FIXME removes
int last_non_b_pict_type; ///< used for mpeg4 gmc b-frames & ratecontrol
+ int dropable;
int frame_rate_index;
int frame_rate_ext_n; ///< MPEG-2 specific framerate modificators (numerator)
int frame_rate_ext_d; ///< MPEG-2 specific framerate modificators (denominator)
@@ -706,6 +714,7 @@ typedef struct MpegEncContext {
int DCT_common_init(MpegEncContext *s);
+void MPV_decode_defaults(MpegEncContext *s);
int MPV_common_init(MpegEncContext *s);
void MPV_common_end(MpegEncContext *s);
void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
@@ -856,7 +865,7 @@ void mpeg4_encode_mb(MpegEncContext *s,
void h263_encode_picture_header(MpegEncContext *s, int picture_number);
void ff_flv_encode_picture_header(MpegEncContext *s, int picture_number);
void h263_encode_gob_header(MpegEncContext * s, int mb_line);
-int16_t *h263_pred_motion(MpegEncContext * s, int block,
+int16_t *h263_pred_motion(MpegEncContext * s, int block, int dir,
int *px, int *py);
void mpeg4_pred_ac(MpegEncContext * s, DCTELEM *block, int n,
int dir);
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index b7b88c38f..c6cfebe16 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -425,7 +425,9 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
#ifdef DEBUG
intra_count = 0;
+/*
printf("*****frame %d:\n", frame_count++);
+*/
#endif
}
@@ -449,7 +451,7 @@ static inline int coded_block_pred(MpegEncContext * s, int n, uint8_t **coded_bl
int xy, wrap, pred, a, b, c;
xy = s->block_index[n];
- wrap = s->block_wrap[0];
+ wrap = s->b8_stride;
/* B C
* A X
@@ -567,7 +569,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
s->misc_bits += get_bits_diff(s);
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
msmpeg4v2_encode_motion(s, motion_x - pred_x);
msmpeg4v2_encode_motion(s, motion_y - pred_y);
}else{
@@ -578,7 +580,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
s->misc_bits += get_bits_diff(s);
/* motion vector */
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
msmpeg4_encode_motion(s, motion_x - pred_x,
motion_y - pred_y);
}
@@ -1549,7 +1551,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
cbp|= cbpy<<2;
if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C;
- h263_pred_motion(s, 0, &mx, &my);
+ h263_pred_motion(s, 0, 0, &mx, &my);
mx= msmpeg4v2_decode_motion(s, mx, 1);
my= msmpeg4v2_decode_motion(s, my, 1);
@@ -1637,7 +1639,7 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
s->rl_chroma_table_index = s->rl_table_index;
}
set_stat(ST_MV);
- h263_pred_motion(s, 0, &mx, &my);
+ h263_pred_motion(s, 0, 0, &mx, &my);
if (msmpeg4_decode_motion(s, &mx, &my) < 0)
return -1;
s->mv_dir = MV_DIR_FORWARD;
diff --git a/src/libffmpeg/libavcodec/ppc/Makefile.am b/src/libffmpeg/libavcodec/ppc/Makefile.am
index fbd734c29..50b9d802e 100644
--- a/src/libffmpeg/libavcodec/ppc/Makefile.am
+++ b/src/libffmpeg/libavcodec/ppc/Makefile.am
@@ -11,6 +11,7 @@ noinst_LTLIBRARIES = libavcodec_ppc.la
libavcodec_ppc_src = dsputil_altivec.c \
dsputil_ppc.c \
+ fdct_altivec.c \
fft_altivec.c \
idct_altivec.c \
gmc_altivec.c \
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
index 633cae68b..1bc6fb009 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
- * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -1302,6 +1302,357 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
+int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
+ int sum;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
+ register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
+ register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+#ifdef CONFIG_DARWIN
+ {
+ register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+ register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+ register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+ register const_vector unsigned char perm1 = (const_vector unsigned char)
+ (0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0A, 0x0B, 0x08, 0x09,
+ 0x0E, 0x0F, 0x0C, 0x0D);
+ register const_vector unsigned char perm2 = (const_vector unsigned char)
+ (0x04, 0x05, 0x06, 0x07,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B);
+ register const_vector unsigned char perm3 = (const_vector unsigned char)
+ (0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07);
+#else
+ register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+ register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+ register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+ register const_vector unsigned char perm1 = (const_vector unsigned char)
+ {0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0A, 0x0B, 0x08, 0x09,
+ 0x0E, 0x0F, 0x0C, 0x0D};
+ register const_vector unsigned char perm2 = (const_vector unsigned char)
+ {0x04, 0x05, 0x06, 0x07,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B};
+ register const_vector unsigned char perm3 = (const_vector unsigned char)
+ {0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07};
+#endif
+
+#define ONEITERBUTTERFLY(i, res) \
+ { \
+ register vector unsigned char src1, src2, srcO; \
+ register vector unsigned char dst1, dst2, dstO; \
+ src1 = vec_ld(stride * i, src); \
+ if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
+ src2 = vec_ld((stride * i) + 16, src); \
+ srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+ dst1 = vec_ld(stride * i, dst); \
+ if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
+ dst2 = vec_ld((stride * i) + 16, dst); \
+ dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+ /* promote the unsigned chars to signed shorts */ \
+ /* we're in the 8x8 function, we only care for the first 8 */ \
+ register vector signed short srcV = \
+ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
+ register vector signed short dstV = \
+ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+ /* substractions inside the first butterfly */ \
+ register vector signed short but0 = vec_sub(srcV, dstV); \
+ register vector signed short op1 = vec_perm(but0, but0, perm1); \
+ register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
+ register vector signed short op2 = vec_perm(but1, but1, perm2); \
+ register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
+ register vector signed short op3 = vec_perm(but2, but2, perm3); \
+ res = vec_mladd(but2, vprod3, op3); \
+ }
+ ONEITERBUTTERFLY(0, temp0);
+ ONEITERBUTTERFLY(1, temp1);
+ ONEITERBUTTERFLY(2, temp2);
+ ONEITERBUTTERFLY(3, temp3);
+ ONEITERBUTTERFLY(4, temp4);
+ ONEITERBUTTERFLY(5, temp5);
+ ONEITERBUTTERFLY(6, temp6);
+ ONEITERBUTTERFLY(7, temp7);
+ }
+#undef ONEITERBUTTERFLY
+ {
+ register vector signed int vsum;
+ register vector signed short line0 = vec_add(temp0, temp1);
+ register vector signed short line1 = vec_sub(temp0, temp1);
+ register vector signed short line2 = vec_add(temp2, temp3);
+ register vector signed short line3 = vec_sub(temp2, temp3);
+ register vector signed short line4 = vec_add(temp4, temp5);
+ register vector signed short line5 = vec_sub(temp4, temp5);
+ register vector signed short line6 = vec_add(temp6, temp7);
+ register vector signed short line7 = vec_sub(temp6, temp7);
+
+ register vector signed short line0B = vec_add(line0, line2);
+ register vector signed short line2B = vec_sub(line0, line2);
+ register vector signed short line1B = vec_add(line1, line3);
+ register vector signed short line3B = vec_sub(line1, line3);
+ register vector signed short line4B = vec_add(line4, line6);
+ register vector signed short line6B = vec_sub(line4, line6);
+ register vector signed short line5B = vec_add(line5, line7);
+ register vector signed short line7B = vec_sub(line5, line7);
+
+ register vector signed short line0C = vec_add(line0B, line4B);
+ register vector signed short line4C = vec_sub(line0B, line4B);
+ register vector signed short line1C = vec_add(line1B, line5B);
+ register vector signed short line5C = vec_sub(line1B, line5B);
+ register vector signed short line2C = vec_add(line2B, line6B);
+ register vector signed short line6C = vec_sub(line2B, line6B);
+ register vector signed short line3C = vec_add(line3B, line7B);
+ register vector signed short line7C = vec_sub(line3B, line7B);
+
+ vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+ vsum = vec_sum4s(vec_abs(line1C), vsum);
+ vsum = vec_sum4s(vec_abs(line2C), vsum);
+ vsum = vec_sum4s(vec_abs(line3C), vsum);
+ vsum = vec_sum4s(vec_abs(line4C), vsum);
+ vsum = vec_sum4s(vec_abs(line5C), vsum);
+ vsum = vec_sum4s(vec_abs(line6C), vsum);
+ vsum = vec_sum4s(vec_abs(line7C), vsum);
+ vsum = vec_sums(vsum, (vector signed int)vzero);
+ vsum = vec_splat(vsum, 3);
+ vec_ste(vsum, 0, &sum);
+ }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
+ return sum;
+}
+
+/*
+ 16x8 works with 16 elements ; it allows to avoid replicating
+ loads, and give the compiler more rooms for scheduling.
+ It's only used from inside hadamard8_diff16_altivec.
+
+ Unfortunately, it seems gcc-3.3 is a bit dumb, and
+ the compiled code has a LOT of spill code, it seems
+ gcc (unlike xlc) cannot keep everything in registers
+ by itself. The following code include hand-made
+ registers allocation. It's not clean, but on
+ a 7450 the resulting code is much faster (best case
+ fall from 700+ cycles to 550).
+
+ xlc doesn't add spill code, but it doesn't know how to
+ schedule for the 7450, and its code isn't much faster than
+ gcc-3.3 on the 7450 (but uses 25% less instructions...)
+
+ On the 970, the hand-made RA is still a win (arount 690
+ vs. around 780), but xlc goes to around 660 on the
+ regular C code...
+*/
+
+static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
+ int sum;
+ register vector signed short
+ temp0 asm ("v0"),
+ temp1 asm ("v1"),
+ temp2 asm ("v2"),
+ temp3 asm ("v3"),
+ temp4 asm ("v4"),
+ temp5 asm ("v5"),
+ temp6 asm ("v6"),
+ temp7 asm ("v7");
+ register vector signed short
+ temp0S asm ("v8"),
+ temp1S asm ("v9"),
+ temp2S asm ("v10"),
+ temp3S asm ("v11"),
+ temp4S asm ("v12"),
+ temp5S asm ("v13"),
+ temp6S asm ("v14"),
+ temp7S asm ("v15");
+ register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
+ {
+#ifdef CONFIG_DARWIN
+ register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+ register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+ register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+ register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
+ (0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0A, 0x0B, 0x08, 0x09,
+ 0x0E, 0x0F, 0x0C, 0x0D);
+ register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
+ (0x04, 0x05, 0x06, 0x07,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B);
+ register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
+ (0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07);
+#else
+ register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+ register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+ register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+ register const_vector unsigned char perm1 = (const_vector unsigned char)
+ {0x02, 0x03, 0x00, 0x01,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x0A, 0x0B, 0x08, 0x09,
+ 0x0E, 0x0F, 0x0C, 0x0D};
+ register const_vector unsigned char perm2 = (const_vector unsigned char)
+ {0x04, 0x05, 0x06, 0x07,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x08, 0x09, 0x0A, 0x0B};
+ register const_vector unsigned char perm3 = (const_vector unsigned char)
+ {0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07};
+#endif
+#define ONEITERBUTTERFLY(i, res1, res2) \
+ { \
+ register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
+ register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
+ src1 = vec_ld(stride * i, src); \
+ src2 = vec_ld((stride * i) + 16, src); \
+ register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+ dst1 = vec_ld(stride * i, dst); \
+ dst2 = vec_ld((stride * i) + 16, dst); \
+ register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+ /* promote the unsigned chars to signed shorts */ \
+ register vector signed short srcV asm ("v24") = \
+ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
+ register vector signed short dstV asm ("v25") = \
+ (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+ register vector signed short srcW asm ("v26") = \
+ (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
+ register vector signed short dstW asm ("v27") = \
+ (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
+ /* substractions inside the first butterfly */ \
+ register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
+ register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
+ register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
+ register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
+ register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
+ register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
+ register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
+ register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
+ register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
+ register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
+ register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
+ res1 = vec_mladd(but2, vprod3, op3); \
+ register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
+ res2 = vec_mladd(but2S, vprod3, op3S); \
+ }
+ ONEITERBUTTERFLY(0, temp0, temp0S);
+ ONEITERBUTTERFLY(1, temp1, temp1S);
+ ONEITERBUTTERFLY(2, temp2, temp2S);
+ ONEITERBUTTERFLY(3, temp3, temp3S);
+ ONEITERBUTTERFLY(4, temp4, temp4S);
+ ONEITERBUTTERFLY(5, temp5, temp5S);
+ ONEITERBUTTERFLY(6, temp6, temp6S);
+ ONEITERBUTTERFLY(7, temp7, temp7S);
+ }
+#undef ONEITERBUTTERFLY
+ {
+ register vector signed int vsum;
+ register vector signed short line0 = vec_add(temp0, temp1);
+ register vector signed short line1 = vec_sub(temp0, temp1);
+ register vector signed short line2 = vec_add(temp2, temp3);
+ register vector signed short line3 = vec_sub(temp2, temp3);
+ register vector signed short line4 = vec_add(temp4, temp5);
+ register vector signed short line5 = vec_sub(temp4, temp5);
+ register vector signed short line6 = vec_add(temp6, temp7);
+ register vector signed short line7 = vec_sub(temp6, temp7);
+
+ register vector signed short line0B = vec_add(line0, line2);
+ register vector signed short line2B = vec_sub(line0, line2);
+ register vector signed short line1B = vec_add(line1, line3);
+ register vector signed short line3B = vec_sub(line1, line3);
+ register vector signed short line4B = vec_add(line4, line6);
+ register vector signed short line6B = vec_sub(line4, line6);
+ register vector signed short line5B = vec_add(line5, line7);
+ register vector signed short line7B = vec_sub(line5, line7);
+
+ register vector signed short line0C = vec_add(line0B, line4B);
+ register vector signed short line4C = vec_sub(line0B, line4B);
+ register vector signed short line1C = vec_add(line1B, line5B);
+ register vector signed short line5C = vec_sub(line1B, line5B);
+ register vector signed short line2C = vec_add(line2B, line6B);
+ register vector signed short line6C = vec_sub(line2B, line6B);
+ register vector signed short line3C = vec_add(line3B, line7B);
+ register vector signed short line7C = vec_sub(line3B, line7B);
+
+ vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+ vsum = vec_sum4s(vec_abs(line1C), vsum);
+ vsum = vec_sum4s(vec_abs(line2C), vsum);
+ vsum = vec_sum4s(vec_abs(line3C), vsum);
+ vsum = vec_sum4s(vec_abs(line4C), vsum);
+ vsum = vec_sum4s(vec_abs(line5C), vsum);
+ vsum = vec_sum4s(vec_abs(line6C), vsum);
+ vsum = vec_sum4s(vec_abs(line7C), vsum);
+
+ register vector signed short line0S = vec_add(temp0S, temp1S);
+ register vector signed short line1S = vec_sub(temp0S, temp1S);
+ register vector signed short line2S = vec_add(temp2S, temp3S);
+ register vector signed short line3S = vec_sub(temp2S, temp3S);
+ register vector signed short line4S = vec_add(temp4S, temp5S);
+ register vector signed short line5S = vec_sub(temp4S, temp5S);
+ register vector signed short line6S = vec_add(temp6S, temp7S);
+ register vector signed short line7S = vec_sub(temp6S, temp7S);
+
+ register vector signed short line0BS = vec_add(line0S, line2S);
+ register vector signed short line2BS = vec_sub(line0S, line2S);
+ register vector signed short line1BS = vec_add(line1S, line3S);
+ register vector signed short line3BS = vec_sub(line1S, line3S);
+ register vector signed short line4BS = vec_add(line4S, line6S);
+ register vector signed short line6BS = vec_sub(line4S, line6S);
+ register vector signed short line5BS = vec_add(line5S, line7S);
+ register vector signed short line7BS = vec_sub(line5S, line7S);
+
+ register vector signed short line0CS = vec_add(line0BS, line4BS);
+ register vector signed short line4CS = vec_sub(line0BS, line4BS);
+ register vector signed short line1CS = vec_add(line1BS, line5BS);
+ register vector signed short line5CS = vec_sub(line1BS, line5BS);
+ register vector signed short line2CS = vec_add(line2BS, line6BS);
+ register vector signed short line6CS = vec_sub(line2BS, line6BS);
+ register vector signed short line3CS = vec_add(line3BS, line7BS);
+ register vector signed short line7CS = vec_sub(line3BS, line7BS);
+
+ vsum = vec_sum4s(vec_abs(line0CS), vsum);
+ vsum = vec_sum4s(vec_abs(line1CS), vsum);
+ vsum = vec_sum4s(vec_abs(line2CS), vsum);
+ vsum = vec_sum4s(vec_abs(line3CS), vsum);
+ vsum = vec_sum4s(vec_abs(line4CS), vsum);
+ vsum = vec_sum4s(vec_abs(line5CS), vsum);
+ vsum = vec_sum4s(vec_abs(line6CS), vsum);
+ vsum = vec_sum4s(vec_abs(line7CS), vsum);
+ vsum = vec_sums(vsum, (vector signed int)vzero);
+ vsum = vec_splat(vsum, 3);
+ vec_ste(vsum, 0, &sum);
+ }
+ return sum;
+}
+
+int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
+ int score;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
+ score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+ if (h==16) {
+ dst += 8*stride;
+ src += 8*stride;
+ score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+ }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
+ return score;
+}
+
int has_altivec(void)
{
#ifdef CONFIG_DARWIN
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
index 93448a1ad..e2729ab22 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -45,6 +46,8 @@ extern void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l
extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
+extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
+extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder);
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
index b8372e51e..b70de7328 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -59,6 +60,8 @@ static unsigned char* perfname[] = {
"put_no_rnd_pixels8_xy2_altivec",
"put_pixels16_xy2_altivec",
"put_no_rnd_pixels16_xy2_altivec",
+ "hadamard8_diff8x8_altivec",
+ "hadamard8_diff16_altivec",
"clear_blocks_dcbz32_ppc",
"clear_blocks_dcbz128_ppc"
};
@@ -262,7 +265,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
c->add_bytes= add_bytes_altivec;
#endif /* 0 */
c->put_pixels_tab[0][0] = put_pixels16_altivec;
- /* the tow functions do the same thing, so use the same code */
+ /* the two functions do the same thing, so use the same code */
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
// next one disabled as it's untested.
@@ -276,6 +279,9 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
c->gmc1 = gmc1_altivec;
+ c->hadamard8_diff[0] = hadamard8_diff16_altivec;
+ c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
+
#ifdef CONFIG_ENCODERS
if (avctx->dct_algo == FF_DCT_AUTO ||
avctx->dct_algo == FF_DCT_ALTIVEC)
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h
index d672edfcb..8b34c6b45 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -50,6 +50,8 @@ enum powerpc_perf_index {
altivec_put_no_rnd_pixels8_xy2_num,
altivec_put_pixels16_xy2_num,
altivec_put_no_rnd_pixels16_xy2_num,
+ altivec_hadamard8_diff8x8_num,
+ altivec_hadamard8_diff16_num,
powerpc_clear_blocks_dcbz32,
powerpc_clear_blocks_dcbz128,
powerpc_perf_total
@@ -63,6 +65,8 @@ enum powerpc_data_index {
};
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
+#ifndef POWERPC_MODE_64BITS
+#define POWERP_PMC_DATATYPE unsigned long
#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
@@ -79,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
-#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index;
+#else /* POWERPC_MODE_64BITS */
+#define POWERP_PMC_DATATYPE unsigned long long
+#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a))
+#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#endif /* POWERPC_MODE_64BITS */
+#define POWERPC_PERF_DECLARE(a, cond) \
+ POWERP_PMC_DATATYPE \
+ pmc_start[POWERPC_NUM_PMC_ENABLED], \
+ pmc_stop[POWERPC_NUM_PMC_ENABLED], \
+ pmc_loop_index;
#define POWERPC_PERF_START_COUNT(a, cond) do { \
POWERPC_GET_PMC6(pmc_start[5]); \
POWERPC_GET_PMC5(pmc_start[4]); \
@@ -101,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
pmc_loop_index++) \
{ \
- if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
- { \
- unsigned long diff = \
+ if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
+ { \
+ POWERP_PMC_DATATYPE diff = \
pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
diff --git a/src/libffmpeg/libavcodec/ppc/fdct_altivec.c b/src/libffmpeg/libavcodec/ppc/fdct_altivec.c
new file mode 100644
index 000000000..99df5ced3
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/fdct_altivec.c
@@ -0,0 +1,498 @@
+/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the
+ * AltiVec optimized library for the FFMPEG Multimedia System
+ * Copyright (C) 2003 James Klicman <james@klicman.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+#include "../common.h"
+#include "../dsputil.h"
+#include "dsputil_altivec.h"
+#include "gcc_fixes.h"
+
+
+#define vs16(v) ((vector signed short)(v))
+#define vs32(v) ((vector signed int)(v))
+#define vu8(v) ((vector unsigned char)(v))
+#define vu16(v) ((vector unsigned short)(v))
+#define vu32(v) ((vector unsigned int)(v))
+
+
+#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */
+#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */
+#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */
+#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */
+#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */
+#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */
+#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */
+#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */
+
+
+#define W0 -(2 * C2)
+#define W1 (2 * C6)
+#define W2 (SQRT_2 * C6)
+#define W3 (SQRT_2 * C3)
+#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
+#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
+#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
+#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
+#define W8 (SQRT_2 * ( C7 - C3))
+#define W9 (SQRT_2 * (-C1 - C3))
+#define WA (SQRT_2 * (-C3 - C5))
+#define WB (SQRT_2 * ( C5 - C3))
+
+
+static vector float fdctconsts[3] = {
+ (vector float)AVV( W0, W1, W2, W3 ),
+ (vector float)AVV( W4, W5, W6, W7 ),
+ (vector float)AVV( W8, W9, WA, WB )
+};
+
+#define LD_W0 vec_splat(cnsts0, 0)
+#define LD_W1 vec_splat(cnsts0, 1)
+#define LD_W2 vec_splat(cnsts0, 2)
+#define LD_W3 vec_splat(cnsts0, 3)
+#define LD_W4 vec_splat(cnsts1, 0)
+#define LD_W5 vec_splat(cnsts1, 1)
+#define LD_W6 vec_splat(cnsts1, 2)
+#define LD_W7 vec_splat(cnsts1, 3)
+#define LD_W8 vec_splat(cnsts2, 0)
+#define LD_W9 vec_splat(cnsts2, 1)
+#define LD_WA vec_splat(cnsts2, 2)
+#define LD_WB vec_splat(cnsts2, 3)
+
+
+#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
+ x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
+ x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
+ x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
+ x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
+ x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
+ x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
+ x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
+ x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
+ \
+ b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
+ b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
+ b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
+ b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
+ \
+ b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
+ b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
+ b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
+ cnst = LD_W2; \
+ b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
+ cnst = LD_W1; \
+ b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
+ cnst = LD_W0; \
+ b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
+ \
+ x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
+ x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
+ x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
+ x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
+ x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
+ cnst = LD_W3; \
+ x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
+ \
+ cnst = LD_W8; \
+ x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
+ cnst = LD_W9; \
+ x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
+ cnst = LD_WA; \
+ x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
+ cnst = LD_WB; \
+ x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
+ \
+ cnst = LD_W4; \
+ b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
+ cnst = LD_W5; \
+ b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
+ cnst = LD_W6; \
+ b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
+ cnst = LD_W7; \
+ b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
+ \
+ b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \
+ b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \
+ b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \
+ b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \
+ /* }}} */
+
+#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
+ x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
+ x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
+ x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
+ x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
+ x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
+ x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
+ x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
+ x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
+ \
+ b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
+ b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
+ b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
+ b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
+ \
+ b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
+ b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
+ b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
+ cnst = LD_W2; \
+ b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
+ cnst = LD_W1; \
+ b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
+ cnst = LD_W0; \
+ b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
+ \
+ x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
+ x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
+ x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
+ x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
+ x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
+ cnst = LD_W3; \
+ x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
+ \
+ cnst = LD_W8; \
+ x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
+ cnst = LD_W9; \
+ x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
+ cnst = LD_WA; \
+ x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
+ cnst = LD_WB; \
+ x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
+ \
+ cnst = LD_W4; \
+ b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
+ cnst = LD_W5; \
+ b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
+ cnst = LD_W6; \
+ b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
+ cnst = LD_W7; \
+ b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
+ \
+ b7 = vec_add(b7, x2); /* b7 += x2; */ \
+ b5 = vec_add(b5, x3); /* b5 += x3; */ \
+ b3 = vec_add(b3, x2); /* b3 += x2; */ \
+ b1 = vec_add(b1, x3); /* b1 += x3; */ \
+ /* }}} */
+
+
+
+/* two dimensional discrete cosine transform */
+
+void fdct_altivec(int16_t *block)
+{
+POWERPC_PERF_DECLARE(altivec_fdct, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+POWERPC_PERF_START_COUNT(altivec_fdct, 1);
+ void ff_jpeg_fdct_islow(int16_t *block);
+ ff_jpeg_fdct_islow(block);
+POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+ vector signed short *bp;
+ vector float *cp;
+ vector float b00, b10, b20, b30, b40, b50, b60, b70;
+ vector float b01, b11, b21, b31, b41, b51, b61, b71;
+ vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
+ vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+ POWERPC_PERF_START_COUNT(altivec_fdct, 1);
+
+
+ /* setup constants {{{ */
+ /* mzero = -0.0 */
+ vu32(mzero) = vec_splat_u32(-1);
+ vu32(mzero) = vec_sl(vu32(mzero), vu32(mzero));
+ cp = fdctconsts;
+ cnsts0 = vec_ld(0, cp); cp++;
+ cnsts1 = vec_ld(0, cp); cp++;
+ cnsts2 = vec_ld(0, cp);
+ /* }}} */
+
+
+ /* 8x8 matrix transpose (vector short[8]) {{{ */
+#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))
+
+ bp = (vector signed short*)block;
+ vs16(b00) = vec_ld(0, bp);
+ vs16(b40) = vec_ld(16*4, bp);
+ vs16(b01) = MERGE_S16(h, b00, b40);
+ vs16(b11) = MERGE_S16(l, b00, b40);
+ bp++;
+ vs16(b10) = vec_ld(0, bp);
+ vs16(b50) = vec_ld(16*4, bp);
+ vs16(b21) = MERGE_S16(h, b10, b50);
+ vs16(b31) = MERGE_S16(l, b10, b50);
+ bp++;
+ vs16(b20) = vec_ld(0, bp);
+ vs16(b60) = vec_ld(16*4, bp);
+ vs16(b41) = MERGE_S16(h, b20, b60);
+ vs16(b51) = MERGE_S16(l, b20, b60);
+ bp++;
+ vs16(b30) = vec_ld(0, bp);
+ vs16(b70) = vec_ld(16*4, bp);
+ vs16(b61) = MERGE_S16(h, b30, b70);
+ vs16(b71) = MERGE_S16(l, b30, b70);
+
+ vs16(x0) = MERGE_S16(h, b01, b41);
+ vs16(x1) = MERGE_S16(l, b01, b41);
+ vs16(x2) = MERGE_S16(h, b11, b51);
+ vs16(x3) = MERGE_S16(l, b11, b51);
+ vs16(x4) = MERGE_S16(h, b21, b61);
+ vs16(x5) = MERGE_S16(l, b21, b61);
+ vs16(x6) = MERGE_S16(h, b31, b71);
+ vs16(x7) = MERGE_S16(l, b31, b71);
+
+ vs16(b00) = MERGE_S16(h, x0, x4);
+ vs16(b10) = MERGE_S16(l, x0, x4);
+ vs16(b20) = MERGE_S16(h, x1, x5);
+ vs16(b30) = MERGE_S16(l, x1, x5);
+ vs16(b40) = MERGE_S16(h, x2, x6);
+ vs16(b50) = MERGE_S16(l, x2, x6);
+ vs16(b60) = MERGE_S16(h, x3, x7);
+ vs16(b70) = MERGE_S16(l, x3, x7);
+
+#undef MERGE_S16
+ /* }}} */
+
+
+/* Some of the initial calculations can be done as vector short before
+ * conversion to vector float. The following code section takes advantage
+ * of this.
+ */
+#if 1
+ /* fdct rows {{{ */
+ vs16(x0) = vec_add(vs16(b00), vs16(b70));
+ vs16(x7) = vec_sub(vs16(b00), vs16(b70));
+ vs16(x1) = vec_add(vs16(b10), vs16(b60));
+ vs16(x6) = vec_sub(vs16(b10), vs16(b60));
+ vs16(x2) = vec_add(vs16(b20), vs16(b50));
+ vs16(x5) = vec_sub(vs16(b20), vs16(b50));
+ vs16(x3) = vec_add(vs16(b30), vs16(b40));
+ vs16(x4) = vec_sub(vs16(b30), vs16(b40));
+
+ vs16(b70) = vec_add(vs16(x0), vs16(x3));
+ vs16(b10) = vec_add(vs16(x1), vs16(x2));
+
+ vs16(b00) = vec_add(vs16(b70), vs16(b10));
+ vs16(b40) = vec_sub(vs16(b70), vs16(b10));
+
+#define CTF0(n) \
+ vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
+ vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
+ b##n##1 = vec_ctf(vs32(b##n##1), 0); \
+ b##n##0 = vec_ctf(vs32(b##n##0), 0);
+
+ CTF0(0);
+ CTF0(4);
+
+ vs16(b20) = vec_sub(vs16(x0), vs16(x3));
+ vs16(b60) = vec_sub(vs16(x1), vs16(x2));
+
+ CTF0(2);
+ CTF0(6);
+
+#undef CTF0
+
+ x0 = vec_add(b60, b20);
+ x1 = vec_add(b61, b21);
+
+ cnst = LD_W2;
+ x0 = vec_madd(cnst, x0, mzero);
+ x1 = vec_madd(cnst, x1, mzero);
+ cnst = LD_W1;
+ b20 = vec_madd(cnst, b20, x0);
+ b21 = vec_madd(cnst, b21, x1);
+ cnst = LD_W0;
+ b60 = vec_madd(cnst, b60, x0);
+ b61 = vec_madd(cnst, b61, x1);
+
+#define CTFX(x,b) \
+ vs32(b##0) = vec_unpackh(vs16(x)); \
+ vs32(b##1) = vec_unpackl(vs16(x)); \
+ b##0 = vec_ctf(vs32(b##0), 0); \
+ b##1 = vec_ctf(vs32(b##1), 0); \
+
+ CTFX(x4, b7);
+ CTFX(x5, b5);
+ CTFX(x6, b3);
+ CTFX(x7, b1);
+
+#undef CTFX
+
+
+ x0 = vec_add(b70, b10);
+ x1 = vec_add(b50, b30);
+ x2 = vec_add(b70, b30);
+ x3 = vec_add(b50, b10);
+ x8 = vec_add(x2, x3);
+ cnst = LD_W3;
+ x8 = vec_madd(cnst, x8, mzero);
+
+ cnst = LD_W8;
+ x0 = vec_madd(cnst, x0, mzero);
+ cnst = LD_W9;
+ x1 = vec_madd(cnst, x1, mzero);
+ cnst = LD_WA;
+ x2 = vec_madd(cnst, x2, x8);
+ cnst = LD_WB;
+ x3 = vec_madd(cnst, x3, x8);
+
+ cnst = LD_W4;
+ b70 = vec_madd(cnst, b70, x0);
+ cnst = LD_W5;
+ b50 = vec_madd(cnst, b50, x1);
+ cnst = LD_W6;
+ b30 = vec_madd(cnst, b30, x1);
+ cnst = LD_W7;
+ b10 = vec_madd(cnst, b10, x0);
+
+ b70 = vec_add(b70, x2);
+ b50 = vec_add(b50, x3);
+ b30 = vec_add(b30, x2);
+ b10 = vec_add(b10, x3);
+
+
+ x0 = vec_add(b71, b11);
+ x1 = vec_add(b51, b31);
+ x2 = vec_add(b71, b31);
+ x3 = vec_add(b51, b11);
+ x8 = vec_add(x2, x3);
+ cnst = LD_W3;
+ x8 = vec_madd(cnst, x8, mzero);
+
+ cnst = LD_W8;
+ x0 = vec_madd(cnst, x0, mzero);
+ cnst = LD_W9;
+ x1 = vec_madd(cnst, x1, mzero);
+ cnst = LD_WA;
+ x2 = vec_madd(cnst, x2, x8);
+ cnst = LD_WB;
+ x3 = vec_madd(cnst, x3, x8);
+
+ cnst = LD_W4;
+ b71 = vec_madd(cnst, b71, x0);
+ cnst = LD_W5;
+ b51 = vec_madd(cnst, b51, x1);
+ cnst = LD_W6;
+ b31 = vec_madd(cnst, b31, x1);
+ cnst = LD_W7;
+ b11 = vec_madd(cnst, b11, x0);
+
+ b71 = vec_add(b71, x2);
+ b51 = vec_add(b51, x3);
+ b31 = vec_add(b31, x2);
+ b11 = vec_add(b11, x3);
+ /* }}} */
+#else
+ /* convert to float {{{ */
+#define CTF(n) \
+ vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
+ vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
+ b##n##1 = vec_ctf(vs32(b##n##1), 0); \
+ b##n##0 = vec_ctf(vs32(b##n##0), 0); \
+
+ CTF(0);
+ CTF(1);
+ CTF(2);
+ CTF(3);
+ CTF(4);
+ CTF(5);
+ CTF(6);
+ CTF(7);
+
+#undef CTF
+ /* }}} */
+
+ FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70);
+ FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71);
+#endif
+
+
+ /* 8x8 matrix transpose (vector float[8][2]) {{{ */
+ x0 = vec_mergel(b00, b20);
+ x1 = vec_mergeh(b00, b20);
+ x2 = vec_mergel(b10, b30);
+ x3 = vec_mergeh(b10, b30);
+
+ b00 = vec_mergeh(x1, x3);
+ b10 = vec_mergel(x1, x3);
+ b20 = vec_mergeh(x0, x2);
+ b30 = vec_mergel(x0, x2);
+
+ x4 = vec_mergel(b41, b61);
+ x5 = vec_mergeh(b41, b61);
+ x6 = vec_mergel(b51, b71);
+ x7 = vec_mergeh(b51, b71);
+
+ b41 = vec_mergeh(x5, x7);
+ b51 = vec_mergel(x5, x7);
+ b61 = vec_mergeh(x4, x6);
+ b71 = vec_mergel(x4, x6);
+
+ x0 = vec_mergel(b01, b21);
+ x1 = vec_mergeh(b01, b21);
+ x2 = vec_mergel(b11, b31);
+ x3 = vec_mergeh(b11, b31);
+
+ x4 = vec_mergel(b40, b60);
+ x5 = vec_mergeh(b40, b60);
+ x6 = vec_mergel(b50, b70);
+ x7 = vec_mergeh(b50, b70);
+
+ b40 = vec_mergeh(x1, x3);
+ b50 = vec_mergel(x1, x3);
+ b60 = vec_mergeh(x0, x2);
+ b70 = vec_mergel(x0, x2);
+
+ b01 = vec_mergeh(x5, x7);
+ b11 = vec_mergel(x5, x7);
+ b21 = vec_mergeh(x4, x6);
+ b31 = vec_mergel(x4, x6);
+ /* }}} */
+
+
+ FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
+ FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);
+
+
+ /* round, convert back to short {{{ */
+#define CTS(n) \
+ b##n##0 = vec_round(b##n##0); \
+ b##n##1 = vec_round(b##n##1); \
+ vs32(b##n##0) = vec_cts(b##n##0, 0); \
+ vs32(b##n##1) = vec_cts(b##n##1, 0); \
+ vs16(b##n##0) = vec_pack(vs32(b##n##0), vs32(b##n##1)); \
+ vec_st(vs16(b##n##0), 0, bp);
+
+ bp = (vector signed short*)block;
+ CTS(0); bp++;
+ CTS(1); bp++;
+ CTS(2); bp++;
+ CTS(3); bp++;
+ CTS(4); bp++;
+ CTS(5); bp++;
+ CTS(6); bp++;
+ CTS(7);
+
+#undef CTS
+ /* }}} */
+
+POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
+/* vim:set foldmethod=marker foldlevel=0: */
diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c
index 51b387792..91e744af9 100644
--- a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c
@@ -1,6 +1,9 @@
/*
* Copyright (c) 2002 Dieter Shirley
*
+ * dct_unquantize_h263_altivec:
+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
+ *
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
index ce4bf8a47..c8269eb9a 100644
--- a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
@@ -1,84 +1,86 @@
-/*
- * Copyright (c) 2002 Dieter Shirley
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include "../dsputil.h"
-#include "../mpegvideo.h"
-#include <time.h>
-
-#ifdef HAVE_ALTIVEC
-#include "dsputil_altivec.h"
-#endif
-
-extern int dct_quantize_altivec(MpegEncContext *s,
- DCTELEM *block, int n,
- int qscale, int *overflow);
+/*
+ * Copyright (c) 2002 Dieter Shirley
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+#include <time.h>
+
+#ifdef HAVE_ALTIVEC
+#include "dsputil_altivec.h"
+#endif
+
+extern int dct_quantize_altivec(MpegEncContext *s,
+ DCTELEM *block, int n,
+ int qscale, int *overflow);
extern void dct_unquantize_h263_altivec(MpegEncContext *s,
DCTELEM *block, int n, int qscale);
-
-extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
-
-
-void MPV_common_init_ppc(MpegEncContext *s)
-{
+
+extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+
+
+void MPV_common_init_ppc(MpegEncContext *s)
+{
#ifdef HAVE_ALTIVEC
- if (has_altivec())
- {
- if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
- (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
- {
- s->dsp.idct_put = idct_put_altivec;
- s->dsp.idct_add = idct_add_altivec;
+ if (has_altivec())
+ {
+ if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
+ (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
+ {
+ s->dsp.idct_put = idct_put_altivec;
+ s->dsp.idct_add = idct_add_altivec;
#ifndef ALTIVEC_USE_REFERENCE_C_CODE
- s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
s->dsp.idct_permutation_type = FF_NO_IDCT_PERM;
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
- }
-
- // Test to make sure that the dct required alignments are met.
- if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
- (((long)(s->q_inter_matrix) & 0x0f) != 0))
- {
- av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
- "to use Altivec DCT. Reverting to non-altivec version.\n");
- return;
- }
-
- if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
- {
- av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
- "to use Altivec DCT. Reverting to non-altivec version.\n");
- return;
- }
-
-
- if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
- (s->avctx->dct_algo == FF_DCT_ALTIVEC))
- {
- s->dct_quantize = dct_quantize_altivec;
+ }
+
+ // Test to make sure that the dct required alignments are met.
+ if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
+ (((long)(s->q_inter_matrix) & 0x0f) != 0))
+ {
+ av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
+ "to use Altivec DCT. Reverting to non-altivec version.\n");
+ return;
+ }
+
+ if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
+ {
+ av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
+ "to use Altivec DCT. Reverting to non-altivec version.\n");
+ return;
+ }
+
+
+ if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
+ (s->avctx->dct_algo == FF_DCT_ALTIVEC))
+ {
+#if 0 /* seems to cause trouble under some circumstances */
+ s->dct_quantize = dct_quantize_altivec;
+#endif
s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
- }
- } else
-#endif
- {
- /* Non-AltiVec PPC optimisations here */
- }
-}
-
+ }
+ } else
+#endif
+ {
+ /* Non-AltiVec PPC optimisations here */
+ }
+}
+
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 11c9734fc..b67ec3974 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -388,6 +388,10 @@ static int rv20_decode_picture_header(MpegEncContext *s)
// return -1;
}
seq= get_bits(&s->gb, 15);
+ if (s->avctx->sub_id == 0x20201002 && get_bits(&s->gb, 1)){
+ av_log(s->avctx, AV_LOG_ERROR, "unknown bit4 set\n");
+// return -1;
+ }
mb_pos= get_bits(&s->gb, av_log2(s->mb_num-1)+1);
s->mb_x= mb_pos % s->mb_width;
s->mb_y= mb_pos / s->mb_width;
@@ -395,7 +399,7 @@ static int rv20_decode_picture_header(MpegEncContext *s)
seq= get_bits(&s->gb, 8)*128;
mb_pos= ff_h263_decode_mba(s);
}
-//printf("%d\n", seq);
+//av_log(s->avctx, AV_LOG_DEBUG, "%d\n", seq);
seq |= s->time &~0x7FFF;
if(seq - s->time > 0x4000) seq -= 0x8000;
if(seq - s->time < -0x4000) seq += 0x8000;
@@ -414,7 +418,10 @@ static int rv20_decode_picture_header(MpegEncContext *s)
}
}
// printf("%d %d %d %d %d\n", seq, (int)s->time, (int)s->last_non_b_time, s->pp_time, s->pb_time);
-
+/*for(i=0; i<32; i++){
+ av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb));
+}
+av_log(s->avctx, AV_LOG_DEBUG, "\n");*/
s->no_rounding= get_bits1(&s->gb);
s->f_code = 1;
@@ -441,6 +448,8 @@ static int rv10_decode_init(AVCodecContext *avctx)
MpegEncContext *s = avctx->priv_data;
static int done=0;
+ MPV_decode_defaults(s);
+
s->avctx= avctx;
s->out_format = FMT_H263;
s->codec_id= avctx->codec_id;
@@ -476,6 +485,7 @@ static int rv10_decode_init(AVCodecContext *avctx)
s->low_delay=1;
break;
case 0x20200002:
+ case 0x20201002:
case 0x30202002:
case 0x30203002:
s->low_delay=0;
@@ -490,8 +500,6 @@ static int rv10_decode_init(AVCodecContext *avctx)
h263_decode_init_vlc(s);
- s->progressive_sequence=1;
-
/* init rv vlc */
if (!done) {
init_vlc(&rv_dc_lum, DC_VLC_BITS, 256,
@@ -556,10 +564,6 @@ static int rv10_decode_packet(AVCodecContext *avctx,
return -1;
}
- if(s->pict_type == B_TYPE){ //FIXME remove after cleaning mottion_val indexing
- memset(s->current_picture.motion_val[0], 0, sizeof(int16_t)*2*(s->mb_width*2+2)*(s->mb_height*2+2));
- }
-
#ifdef DEBUG
printf("qscale=%d\n", s->qscale);
#endif
@@ -592,9 +596,9 @@ static int rv10_decode_packet(AVCodecContext *avctx,
s->block_wrap[0]=
s->block_wrap[1]=
s->block_wrap[2]=
- s->block_wrap[3]= s->mb_width*2 + 2;
+ s->block_wrap[3]= s->b8_stride;
s->block_wrap[4]=
- s->block_wrap[5]= s->mb_width + 2;
+ s->block_wrap[5]= s->mb_stride;
ff_init_block_index(s);
/* decode each macroblock */
@@ -669,10 +673,6 @@ static int rv10_decode_frame(AVCodecContext *avctx,
return -1;
}
- if(s->pict_type == B_TYPE){ //FIXME remove after cleaning mottion_val indexing
- memset(s->current_picture.motion_val[0], 0, sizeof(int16_t)*2*(s->mb_width*2+2)*(s->mb_height*2+2));
- }
-
if(s->mb_y>=s->mb_height){
MPV_frame_end(s);
diff --git a/src/libffmpeg/libavcodec/smc.c b/src/libffmpeg/libavcodec/smc.c
index 87db50005..e937b03c8 100644
--- a/src/libffmpeg/libavcodec/smc.c
+++ b/src/libffmpeg/libavcodec/smc.c
@@ -36,9 +36,6 @@
#include "avcodec.h"
#include "dsputil.h"
-#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log
-#define fprintf(...) {}
-
#define CPAIR 2
#define CQUAD 4
#define COCTET 8
@@ -75,7 +72,7 @@ typedef struct SmcContext {
total_blocks--; \
if (total_blocks < 0) \
{ \
- printf("warning: block counter just went negative (this should not happen)\n"); \
+ av_log(s->avctx, AV_LOG_INFO, "warning: block counter just went negative (this should not happen)\n"); \
return; \
} \
}
@@ -124,7 +121,7 @@ static void smc_decode_stream(SmcContext *s)
chunk_size = BE_32(&s->buf[stream_ptr]) & 0x00FFFFFF;
stream_ptr += 4;
if (chunk_size != s->size)
- printf("warning: MOV chunk size != encoded chunk size (%d != %d); using MOV chunk size\n",
+ av_log(s->avctx, AV_LOG_INFO, "warning: MOV chunk size != encoded chunk size (%d != %d); using MOV chunk size\n",
chunk_size, s->size);
chunk_size = s->size;
@@ -135,13 +132,13 @@ static void smc_decode_stream(SmcContext *s)
/* sanity checks */
/* make sure stream ptr hasn't gone out of bounds */
if (stream_ptr > chunk_size) {
- printf("SMC decoder just went out of bounds (stream ptr = %d, chunk size = %d)\n",
+ av_log(s->avctx, AV_LOG_INFO, "SMC decoder just went out of bounds (stream ptr = %d, chunk size = %d)\n",
stream_ptr, chunk_size);
return;
}
/* make sure the row pointer hasn't gone wild */
if (row_ptr >= image_size) {
- printf("SMC decoder just went out of bounds (row ptr = %d, height = %d)\n",
+ av_log(s->avctx, AV_LOG_INFO, "SMC decoder just went out of bounds (row ptr = %d, height = %d)\n",
row_ptr, image_size);
return;
}
@@ -164,7 +161,7 @@ static void smc_decode_stream(SmcContext *s)
/* sanity check */
if ((row_ptr == 0) && (pixel_ptr == 0)) {
- printf("encountered repeat block opcode (%02X) but no blocks rendered yet\n",
+ av_log(s->avctx, AV_LOG_INFO, "encountered repeat block opcode (%02X) but no blocks rendered yet\n",
opcode & 0xF0);
break;
}
@@ -198,7 +195,7 @@ static void smc_decode_stream(SmcContext *s)
/* sanity check */
if ((row_ptr == 0) && (pixel_ptr < 2 * 4)) {
- printf("encountered repeat block opcode (%02X) but not enough blocks rendered yet\n",
+ av_log(s->avctx, AV_LOG_INFO, "encountered repeat block opcode (%02X) but not enough blocks rendered yet\n",
opcode & 0xF0);
break;
}
@@ -425,7 +422,7 @@ static void smc_decode_stream(SmcContext *s)
break;
case 0xF0:
- printf("0xF0 opcode seen in SMC chunk (xine developers would like to know)\n");
+ av_log(s->avctx, AV_LOG_INFO, "0xF0 opcode seen in SMC chunk (contact the developers)\n");
break;
}
}
@@ -462,7 +459,7 @@ static int smc_decode_frame(AVCodecContext *avctx,
s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE |
FF_BUFFER_HINTS_REUSABLE | FF_BUFFER_HINTS_READABLE;
if (avctx->reget_buffer(avctx, &s->frame)) {
- printf ("reget_buffer() failed\n");
+ av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
return -1;
}
diff --git a/src/libffmpeg/libavcodec/sparc/Makefile.am b/src/libffmpeg/libavcodec/sparc/Makefile.am
new file mode 100644
index 000000000..cdf16e3ad
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/Makefile.am
@@ -0,0 +1,15 @@
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS = $(LIBFFMPEG_CFLAGS)
+ASFLAGS =
+
+noinst_LTLIBRARIES = libavcodec_sparc.la
+
+libavcodec_sparc_src = dsputil_vis.c
+libavcodec_sparc_dummy = libavcodec_sparc_dummy.c
+
+EXTRA_DIST = $(libavcodec_sparc_src) $(libavcodec_sparc_dummy) vis.h
+
+sparc_modules = $(libavcodec_sparc_src)
+
+libavcodec_sparc_la_SOURCES = $(sparc_modules) $(libavcodec_sparc_dummy)
diff --git a/src/libffmpeg/libavcodec/sparc/dsputil_vis.c b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
new file mode 100644
index 000000000..434cf74ac
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
@@ -0,0 +1,4107 @@
+/*
+ * dsputil_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of ffmpeg, a free MPEG-4 video stream decoder.
+ * See http://ffmpeg.sourceforge.net/ for updates.
+ *
+ * ffmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * ffmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the Lesser GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
+ The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison.
+ */
+
+#include "config.h"
+
+#ifdef ARCH_SPARC
+
+#include <inttypes.h>
+#include <signal.h>
+#include <setjmp.h>
+
+#include "../dsputil.h"
+
+#include "vis.h"
+
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16. So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ * fxor f0, f2, f10
+ * fand f10, f4, f10
+ * fmul8x16 f8, f10, f10
+ * fand f10, f6, f10
+ * for f0, f2, f12
+ * fpsub16 f12, f10, f10
+ */
+
+#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))
+
+#define DUP4(x) {x, x, x, x}
+#define DUP8(x) {x, x, x, x, x, x, x, x}
+static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
+static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
+static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
+static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
+static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
+static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
+static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
+static const int16_t constants256_512[] ATTR_ALIGN(8) =
+ {256, 512, 256, 512};
+static const int16_t constants256_1024[] ATTR_ALIGN(8) =
+ {256, 1024, 256, 1024};
+
+#define REF_0 0
+#define REF_0_1 1
+#define REF_2 2
+#define REF_2_1 3
+#define REF_4 4
+#define REF_4_1 5
+#define REF_6 6
+#define REF_6_1 7
+#define REF_S0 8
+#define REF_S0_1 9
+#define REF_S2 10
+#define REF_S2_1 11
+#define REF_S4 12
+#define REF_S4_1 13
+#define REF_S6 14
+#define REF_S6_1 15
+#define DST_0 16
+#define DST_1 17
+#define DST_2 18
+#define DST_3 19
+#define CONST_1 20
+#define CONST_2 20
+#define CONST_3 20
+#define CONST_6 20
+#define MASK_fe 20
+#define CONST_128 22
+#define CONST_256 22
+#define CONST_512 22
+#define CONST_1024 22
+#define TMP0 24
+#define TMP1 25
+#define TMP2 26
+#define TMP3 27
+#define TMP4 28
+#define TMP5 29
+#define ZERO 30
+#define MASK_7f 30
+
+#define TMP6 32
+#define TMP8 34
+#define TMP10 36
+#define TMP12 38
+#define TMP14 40
+#define TMP16 42
+#define TMP18 44
+#define TMP20 46
+#define TMP22 48
+#define TMP24 50
+#define TMP26 52
+#define TMP28 54
+#define TMP30 56
+#define TMP32 58
+
+static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+ do { /* 5 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+ vis_st64(REF_0, dest[0]);
+
+ vis_faligndata(TMP2, TMP4, REF_2);
+ vis_st64_2(REF_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+ do { /* 4 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+ ref += stride;
+
+ /* stall */
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+ vis_st64(REF_0, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+
+static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ int stride_8 = stride + 8;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(dest[0], DST_0);
+
+ vis_ld64(dest[8], DST_2);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP2, TMP4, REF_2);
+
+ vis_ld64(constants128[0], CONST_128);
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 24 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_xor(DST_2, REF_2, TMP8);
+
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_or(DST_0, REF_0, TMP10);
+ vis_ld64_2(dest, stride, DST_0);
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+
+ vis_or(DST_2, REF_2, TMP12);
+ vis_ld64_2(dest, stride_8, DST_2);
+
+ vis_ld64(ref[0], TMP14);
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_psub16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_psub16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+
+ dest += stride;
+ vis_ld64_2(ref, 8, TMP16);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 16, TMP18);
+ vis_faligndata(TMP2, TMP4, REF_2);
+ ref += stride;
+
+ vis_xor(DST_0, REF_0, TMP20);
+
+ vis_and(TMP20, MASK_fe, TMP20);
+
+ vis_xor(DST_2, REF_2, TMP22);
+ vis_mul8x16(CONST_128, TMP20, TMP20);
+
+ vis_and(TMP22, MASK_fe, TMP22);
+
+ vis_or(DST_0, REF_0, TMP24);
+ vis_mul8x16(CONST_128, TMP22, TMP22);
+
+ vis_or(DST_2, REF_2, TMP26);
+
+ vis_ld64_2(dest, stride, DST_0);
+ vis_faligndata(TMP14, TMP16, REF_0);
+
+ vis_ld64_2(dest, stride_8, DST_2);
+ vis_faligndata(TMP16, TMP18, REF_2);
+
+ vis_and(TMP20, MASK_7f, TMP20);
+
+ vis_and(TMP22, MASK_7f, TMP22);
+
+ vis_psub16(TMP24, TMP20, TMP20);
+ vis_st64(TMP20, dest[0]);
+
+ vis_psub16(TMP26, TMP22, TMP22);
+ vis_st64_2(TMP22, dest, 8);
+ dest += stride;
+ } while (--height);
+
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_xor(DST_2, REF_2, TMP8);
+
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_or(DST_0, REF_0, TMP10);
+ vis_ld64_2(dest, stride, DST_0);
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+
+ vis_or(DST_2, REF_2, TMP12);
+ vis_ld64_2(dest, stride_8, DST_2);
+
+ vis_ld64(ref[0], TMP14);
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_psub16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_psub16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+
+ dest += stride;
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_2);
+
+ vis_xor(DST_0, REF_0, TMP20);
+
+ vis_and(TMP20, MASK_fe, TMP20);
+
+ vis_xor(DST_2, REF_2, TMP22);
+ vis_mul8x16(CONST_128, TMP20, TMP20);
+
+ vis_and(TMP22, MASK_fe, TMP22);
+
+ vis_or(DST_0, REF_0, TMP24);
+ vis_mul8x16(CONST_128, TMP22, TMP22);
+
+ vis_or(DST_2, REF_2, TMP26);
+
+ vis_and(TMP20, MASK_7f, TMP20);
+
+ vis_and(TMP22, MASK_7f, TMP22);
+
+ vis_psub16(TMP24, TMP20, TMP20);
+ vis_st64(TMP20, dest[0]);
+
+ vis_psub16(TMP26, TMP22, TMP22);
+ vis_st64_2(TMP22, dest, 8);
+}
+
+static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_ld64(dest[0], DST_0);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants128[0], CONST_128);
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 12 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP4);
+
+ vis_ld64(ref[8], TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_or(DST_0, REF_0, TMP6);
+ vis_ld64_2(dest, stride, DST_0);
+ ref += stride;
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_ld64(ref[0], TMP12);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(ref[8], TMP2);
+ vis_xor(DST_0, REF_0, TMP0);
+ ref += stride;
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_psub16(TMP6, TMP4, TMP4);
+ vis_st64(TMP4, dest[0]);
+ dest += stride;
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_or(DST_0, REF_0, TMP6);
+ vis_ld64_2(dest, stride, DST_0);
+
+ vis_faligndata(TMP12, TMP2, REF_0);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_psub16(TMP6, TMP0, TMP4);
+ vis_st64(TMP4, dest[0]);
+ dest += stride;
+ } while (--height);
+
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP4);
+
+ vis_ld64(ref[8], TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_or(DST_0, REF_0, TMP6);
+ vis_ld64_2(dest, stride, DST_0);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_xor(DST_0, REF_0, TMP0);
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_psub16(TMP6, TMP4, TMP4);
+ vis_st64(TMP4, dest[0]);
+ dest += stride;
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_or(DST_0, REF_0, TMP6);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_psub16(TMP6, TMP0, TMP4);
+ vis_st64(TMP4, dest[0]);
+}
+
+static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64_2(ref, 16, TMP4);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants128[0], CONST_128);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 34 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_and(TMP6, MASK_fe, TMP6);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP14);
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_ld64_2(ref, 8, TMP16);
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_or(REF_0, REF_2, TMP10);
+
+ vis_ld64_2(ref, 16, TMP18);
+ ref += stride;
+ vis_or(REF_4, REF_6, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_psub16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_psub16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_or(REF_0, REF_2, TMP10);
+
+ vis_or(REF_4, REF_6, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP14, TMP16, REF_0);
+
+ vis_faligndata(TMP16, TMP18, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP14, TMP16, REF_2);
+ vis_faligndata(TMP16, TMP18, REF_6);
+ } else {
+ vis_src1(TMP16, REF_2);
+ vis_src1(TMP18, REF_6);
+ }
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_psub16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_psub16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+ dest += stride;
+ } while (--height);
+
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_or(REF_0, REF_2, TMP10);
+
+ vis_or(REF_4, REF_6, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_psub16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_psub16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_or(REF_0, REF_2, TMP10);
+
+ vis_or(REF_4, REF_6, TMP12);
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_psub16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_psub16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+}
+
+static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+
+ vis_ld64(constants128[0], CONST_128);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ } else {
+ vis_src1(TMP2, REF_2);
+ }
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 20 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP8);
+ vis_or(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, 8, TMP10);
+ ref += stride;
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ } else {
+ vis_src1(TMP2, REF_2);
+ }
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_psub16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_or(REF_0, REF_2, TMP14);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+ vis_faligndata(TMP8, TMP10, REF_0);
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP8, TMP10, REF_2);
+ } else {
+ vis_src1(TMP10, REF_2);
+ }
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_psub16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ } while (--height);
+
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_or(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ } else {
+ vis_src1(TMP2, REF_2);
+ }
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_psub16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_or(REF_0, REF_2, TMP14);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_psub16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+}
+
+static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_fzero(ZERO);
+ vis_ld64(constants256_512[0], CONST_256);
+
+ ref = vis_alignaddr(ref);
+ do { /* 26 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(dest[8], DST_2);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ vis_mul8x16au(REF_0, CONST_256, TMP0);
+
+ vis_pmerge(ZERO, REF_2, TMP4);
+ vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+ vis_pmerge(ZERO, REF_2_1, TMP6);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+
+ vis_mul8x16al(DST_0, CONST_512, TMP4);
+ vis_padd16(TMP2, TMP6, TMP2);
+
+ vis_mul8x16al(DST_1, CONST_512, TMP6);
+
+ vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_4, CONST_256, TMP16);
+
+ vis_padd16(TMP0, CONST_3, TMP8);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+ vis_padd16(TMP2, CONST_3, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_padd16(TMP16, TMP12, TMP0);
+
+ vis_st64(DST_0, dest[0]);
+ vis_mul8x16al(DST_2, CONST_512, TMP4);
+ vis_padd16(TMP18, TMP14, TMP2);
+
+ vis_mul8x16al(DST_3, CONST_512, TMP6);
+ vis_padd16(TMP0, CONST_3, TMP0);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_pack16(TMP0, DST_2);
+
+ vis_pack16(TMP2, DST_3);
+ vis_st64(DST_2, dest[8]);
+
+ ref += stride;
+ dest += stride;
+ } while (--height);
+}
+
+static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_times_2 = stride << 1;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_fzero(ZERO);
+ vis_ld64(constants256_512[0], CONST_256);
+
+ ref = vis_alignaddr(ref);
+ height >>= 2;
+ do { /* 47 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64(ref[0], TMP4);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 8, TMP6);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP8);
+
+ vis_ld64_2(ref, 8, TMP10);
+ ref += stride;
+ vis_faligndata(TMP4, TMP6, REF_4);
+
+ vis_ld64(ref[0], TMP12);
+
+ vis_ld64_2(ref, 8, TMP14);
+ ref += stride;
+ vis_faligndata(TMP8, TMP10, REF_S0);
+
+ vis_faligndata(TMP12, TMP14, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_ld64_2(dest, stride, DST_2);
+ vis_faligndata(TMP4, TMP6, REF_6);
+
+ vis_faligndata(TMP8, TMP10, REF_S2);
+
+ vis_faligndata(TMP12, TMP14, REF_S6);
+ } else {
+ vis_ld64(dest[0], DST_0);
+ vis_src1(TMP2, REF_2);
+
+ vis_ld64_2(dest, stride, DST_2);
+ vis_src1(TMP6, REF_6);
+
+ vis_src1(TMP10, REF_S2);
+
+ vis_src1(TMP14, REF_S6);
+ }
+
+ vis_pmerge(ZERO, REF_0, TMP0);
+ vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+ vis_pmerge(ZERO, REF_2, TMP4);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+ vis_padd16(TMP0, CONST_3, TMP0);
+ vis_mul8x16al(DST_0, CONST_512, TMP16);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+ vis_mul8x16al(DST_1, CONST_512, TMP18);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+ vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+ vis_padd16(TMP0, TMP16, TMP0);
+ vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+ vis_padd16(TMP2, TMP18, TMP2);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+ vis_padd16(TMP8, CONST_3, TMP8);
+ vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+ vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_pack16(TMP0, DST_0);
+
+ vis_pack16(TMP2, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ vis_padd16(TMP10, CONST_3, TMP10);
+
+ vis_ld64_2(dest, stride, DST_0);
+ vis_padd16(TMP8, TMP16, TMP8);
+
+ vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
+ vis_padd16(TMP10, TMP18, TMP10);
+ vis_pack16(TMP8, DST_2);
+
+ vis_pack16(TMP10, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+
+ vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+ vis_pmerge(ZERO, REF_S0, TMP0);
+
+ vis_pmerge(ZERO, REF_S2, TMP24);
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+ vis_padd16(TMP0, CONST_3, TMP0);
+ vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+ vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+ vis_padd16(TMP0, TMP24, TMP0);
+ vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+ vis_padd16(TMP8, CONST_3, TMP8);
+ vis_mul8x16al(DST_0, CONST_512, TMP16);
+
+ vis_padd16(TMP10, CONST_3, TMP10);
+ vis_mul8x16al(DST_1, CONST_512, TMP18);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+ vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+ vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+ vis_padd16(TMP0, TMP16, TMP0);
+
+ vis_padd16(TMP2, TMP18, TMP2);
+ vis_pack16(TMP0, DST_0);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_pack16(TMP2, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_padd16(TMP8, TMP20, TMP8);
+
+ vis_padd16(TMP10, TMP22, TMP10);
+ vis_pack16(TMP8, DST_2);
+
+ vis_pack16(TMP10, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP6);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 8, TMP8);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_ld64_2(ref, 16, TMP10);
+ ref += stride;
+
+ vis_ld64(constants_fe[0], MASK_fe);
+ vis_faligndata(TMP6, TMP8, REF_2);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP8, TMP10, REF_6);
+
+ vis_ld64(constants128[0], CONST_128);
+ height = (height >> 1) - 1;
+ do { /* 24 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP16);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+ vis_or(REF_0, REF_2, TMP14);
+
+ vis_ld64(ref[0], TMP6);
+ vis_or(REF_4, REF_6, TMP18);
+
+ vis_ld64_2(ref, 8, TMP8);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 16, TMP10);
+ ref += stride;
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_and(TMP16, MASK_fe, TMP16);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_mul8x16(CONST_128, TMP16, TMP16);
+ vis_xor(REF_0, REF_2, TMP0);
+
+ vis_xor(REF_4, REF_6, TMP2);
+
+ vis_or(REF_0, REF_2, TMP20);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_and(TMP16, MASK_7f, TMP16);
+
+ vis_psub16(TMP14, TMP12, TMP12);
+ vis_st64(TMP12, dest[0]);
+
+ vis_psub16(TMP18, TMP16, TMP16);
+ vis_st64_2(TMP16, dest, 8);
+ dest += stride;
+
+ vis_or(REF_4, REF_6, TMP18);
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP2, MASK_fe, TMP2);
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_faligndata(TMP6, TMP8, REF_2);
+ vis_mul8x16(CONST_128, TMP2, TMP2);
+
+ vis_faligndata(TMP8, TMP10, REF_6);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_and(TMP2, MASK_7f, TMP2);
+
+ vis_psub16(TMP20, TMP0, TMP0);
+ vis_st64(TMP0, dest[0]);
+
+ vis_psub16(TMP18, TMP2, TMP2);
+ vis_st64_2(TMP2, dest, 8);
+ dest += stride;
+ } while (--height);
+
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP16);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_or(REF_0, REF_2, TMP14);
+
+ vis_or(REF_4, REF_6, TMP18);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_and(TMP16, MASK_fe, TMP16);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_mul8x16(CONST_128, TMP16, TMP16);
+ vis_xor(REF_0, REF_2, TMP0);
+
+ vis_xor(REF_4, REF_6, TMP2);
+
+ vis_or(REF_0, REF_2, TMP20);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_and(TMP16, MASK_7f, TMP16);
+
+ vis_psub16(TMP14, TMP12, TMP12);
+ vis_st64(TMP12, dest[0]);
+
+ vis_psub16(TMP18, TMP16, TMP16);
+ vis_st64_2(TMP16, dest, 8);
+ dest += stride;
+
+ vis_or(REF_4, REF_6, TMP18);
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP2, MASK_fe, TMP2);
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_mul8x16(CONST_128, TMP2, TMP2);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_and(TMP2, MASK_7f, TMP2);
+
+ vis_psub16(TMP20, TMP0, TMP0);
+ vis_st64(TMP0, dest[0]);
+
+ vis_psub16(TMP18, TMP2, TMP2);
+ vis_st64_2(TMP2, dest, 8);
+}
+
+static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP4);
+
+ vis_ld64_2(ref, 8, TMP6);
+ ref += stride;
+
+ vis_ld64(constants_fe[0], MASK_fe);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP4, TMP6, REF_2);
+
+ vis_ld64(constants128[0], CONST_128);
+ height = (height >> 1) - 1;
+ do { /* 12 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_or(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+ vis_or(REF_0, REF_2, TMP14);
+
+ vis_psub16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_psub16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ } while (--height);
+
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_or(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+ vis_or(REF_0, REF_2, TMP14);
+
+ vis_psub16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_psub16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+}
+
+static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ int stride_8 = stride + 8;
+ int stride_16 = stride + 16;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_ld64(constants256_512[0], CONST_256);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ height >>= 1;
+
+ do { /* 31 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_pmerge(ZERO, REF_2, TMP12);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ vis_pmerge(ZERO, REF_6, TMP16);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+ vis_ld64_2(ref, stride_16, TMP4);
+ ref += stride;
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(dest, 8, DST_2);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_ld64_2(ref, stride, TMP6);
+ vis_pmerge(ZERO, REF_0, TMP0);
+ vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+ vis_ld64_2(ref, stride_8, TMP8);
+ vis_pmerge(ZERO, REF_4, TMP4);
+
+ vis_ld64_2(ref, stride_16, TMP10);
+ ref += stride;
+
+ vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
+ vis_faligndata(TMP6, TMP8, REF_2);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+ vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+ vis_faligndata(TMP8, TMP10, REF_6);
+ vis_mul8x16al(DST_0, CONST_512, TMP20);
+
+ vis_padd16(TMP0, CONST_3, TMP0);
+ vis_mul8x16al(DST_1, CONST_512, TMP22);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+ vis_mul8x16al(DST_2, CONST_512, TMP24);
+
+ vis_padd16(TMP4, CONST_3, TMP4);
+ vis_mul8x16al(DST_3, CONST_512, TMP26);
+
+ vis_padd16(TMP6, CONST_3, TMP6);
+
+ vis_padd16(TMP12, TMP20, TMP12);
+ vis_mul8x16al(REF_S0, CONST_512, TMP20);
+
+ vis_padd16(TMP14, TMP22, TMP14);
+ vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+ vis_padd16(TMP16, TMP24, TMP16);
+ vis_mul8x16al(REF_S2, CONST_512, TMP24);
+
+ vis_padd16(TMP18, TMP26, TMP18);
+ vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+ vis_padd16(TMP12, TMP0, TMP12);
+ vis_mul8x16au(REF_2, CONST_256, TMP28);
+
+ vis_padd16(TMP14, TMP2, TMP14);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+
+ vis_padd16(TMP16, TMP4, TMP16);
+ vis_mul8x16au(REF_6, CONST_256, REF_S4);
+
+ vis_padd16(TMP18, TMP6, TMP18);
+ vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+
+ vis_pack16(TMP12, DST_0);
+ vis_padd16(TMP28, TMP0, TMP12);
+
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_padd16(TMP30, TMP2, TMP14);
+
+ vis_pack16(TMP16, DST_2);
+ vis_padd16(REF_S4, TMP4, TMP16);
+
+ vis_pack16(TMP18, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ vis_padd16(REF_S6, TMP6, TMP18);
+
+ vis_padd16(TMP12, TMP20, TMP12);
+
+ vis_padd16(TMP14, TMP22, TMP14);
+ vis_pack16(TMP12, DST_0);
+
+ vis_padd16(TMP16, TMP24, TMP16);
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+
+ vis_padd16(TMP18, TMP26, TMP18);
+ vis_pack16(TMP16, DST_2);
+
+ vis_pack16(TMP18, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_avg_y_8_vis: 8-pixel-wide half-pel *vertical* interpolation with
+ * rounding, averaged into dest: each output row combines two vertically
+ * adjacent reference rows (REF_0/REF_2, weighted via CONST_256 multiplies
+ * plus rounding constant CONST_3) with the existing dest pixels (DST_*,
+ * weighted via CONST_512), then packs back to bytes.
+ * Two output rows are produced per loop iteration, so height is assumed
+ * even (it is halved below) -- NOTE(review): confirm against callers.
+ * TMP*, REF_*, DST_*, CONST_* are macro aliases for VIS fp registers; the
+ * interleaving of loads and multiplies is deliberate UltraSPARC pipeline
+ * scheduling (see the cycle comment) and must not be reordered.
+ */
+static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ int stride_8 = stride + 8;
+
+ /* pack16 scale factor for the weighted-sum normalisation */
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_ld64(constants256_512[0], CONST_256);
+
+ height >>= 1;
+ do { /* 20 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_pmerge(ZERO, REF_2, TMP8);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ ref += stride;
+
+ vis_ld64(dest[0], DST_0);
+
+ vis_ld64_2(dest, stride, DST_2);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, stride, TMP4);
+ vis_mul8x16al(DST_0, CONST_512, TMP16);
+ vis_pmerge(ZERO, REF_0, TMP12);
+
+ vis_ld64_2(ref, stride_8, TMP6);
+ ref += stride;
+ vis_mul8x16al(DST_1, CONST_512, TMP18);
+ vis_pmerge(ZERO, REF_0_1, TMP14);
+
+ vis_padd16(TMP12, CONST_3, TMP12);
+ vis_mul8x16al(DST_2, CONST_512, TMP24);
+
+ vis_padd16(TMP14, CONST_3, TMP14);
+ vis_mul8x16al(DST_3, CONST_512, TMP26);
+
+ vis_faligndata(TMP4, TMP6, REF_2);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_mul8x16au(REF_2, CONST_256, TMP20);
+
+ vis_padd16(TMP8, TMP16, TMP0);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+
+ vis_padd16(TMP10, TMP18, TMP2);
+ vis_pack16(TMP0, DST_0);
+
+ vis_pack16(TMP2, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ vis_padd16(TMP12, TMP20, TMP12);
+
+ vis_padd16(TMP14, TMP22, TMP14);
+
+ vis_padd16(TMP12, TMP24, TMP0);
+
+ vis_padd16(TMP14, TMP26, TMP2);
+ vis_pack16(TMP0, DST_2);
+
+ vis_pack16(TMP2, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_put_xy_16_vis: 16-pixel-wide half-pel interpolation in *both*
+ * directions ("xy"): each output pixel is built from four neighbouring
+ * reference pixels (current/next row, current/next column) with rounding
+ * constant CONST_2, and the result is stored straight to dest (dest is
+ * never read -- "put" semantics).  Two rows per loop iteration; height
+ * assumed even.
+ * off is the sub-8-byte misalignment of ref; a second faligndata pass at
+ * off+1 extracts the horizontally adjacent pixels, except when off == 7
+ * where off+1 == 8 and vis_src1 is used instead.
+ * Register names are VIS fp-register macros; instruction order is hand
+ * scheduled -- do not reorder.
+ */
+static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+ int stride_16 = stride + 16;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(constants2[0], CONST_2);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ vis_ld64(constants256_512[0], CONST_256);
+ vis_faligndata(TMP2, TMP4, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ vis_faligndata(TMP2, TMP4, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ vis_src1(TMP4, REF_S6);
+ }
+
+ height >>= 1;
+ do {
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP12);
+ vis_pmerge(ZERO, REF_S0_1, TMP14);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ vis_mul8x16au(REF_S2, CONST_256, TMP16);
+ vis_pmerge(ZERO, REF_S2_1, TMP18);
+
+ vis_ld64_2(ref, stride_16, TMP4);
+ ref += stride;
+ vis_mul8x16au(REF_S4, CONST_256, TMP20);
+ vis_pmerge(ZERO, REF_S4_1, TMP22);
+
+ vis_ld64_2(ref, stride, TMP6);
+ vis_mul8x16au(REF_S6, CONST_256, TMP24);
+ vis_pmerge(ZERO, REF_S6_1, TMP26);
+
+ vis_ld64_2(ref, stride_8, TMP8);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, stride_16, TMP10);
+ ref += stride;
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_faligndata(TMP6, TMP8, REF_S0);
+
+ vis_faligndata(TMP8, TMP10, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ vis_faligndata(TMP6, TMP8, REF_S2);
+ vis_faligndata(TMP8, TMP10, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ vis_src1(TMP8, REF_S2);
+ vis_src1(TMP10, REF_S6);
+ }
+
+ vis_mul8x16au(REF_0, CONST_256, TMP0);
+ vis_pmerge(ZERO, REF_0_1, TMP2);
+
+ vis_mul8x16au(REF_2, CONST_256, TMP4);
+ vis_pmerge(ZERO, REF_2_1, TMP6);
+
+ vis_padd16(TMP0, CONST_2, TMP8);
+ vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+ vis_padd16(TMP2, CONST_2, TMP10);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+ vis_padd16(TMP8, TMP4, TMP8);
+ vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+ vis_padd16(TMP10, TMP6, TMP10);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+
+ vis_padd16(TMP12, TMP8, TMP12);
+
+ vis_padd16(TMP14, TMP10, TMP14);
+
+ vis_padd16(TMP12, TMP16, TMP12);
+
+ vis_padd16(TMP14, TMP18, TMP14);
+ vis_pack16(TMP12, DST_0);
+
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_padd16(TMP0, CONST_2, TMP12);
+
+ vis_mul8x16au(REF_S0, CONST_256, TMP0);
+ vis_padd16(TMP2, CONST_2, TMP14);
+
+ vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+ vis_padd16(TMP12, TMP4, TMP12);
+
+ vis_mul8x16au(REF_S2, CONST_256, TMP4);
+ vis_padd16(TMP14, TMP6, TMP14);
+
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+ vis_padd16(TMP20, TMP12, TMP20);
+
+ vis_padd16(TMP22, TMP14, TMP22);
+
+ vis_padd16(TMP20, TMP24, TMP20);
+
+ vis_padd16(TMP22, TMP26, TMP22);
+ vis_pack16(TMP20, DST_2);
+
+ vis_pack16(TMP22, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ vis_padd16(TMP0, TMP4, TMP24);
+
+ vis_mul8x16au(REF_S4, CONST_256, TMP0);
+ vis_padd16(TMP2, TMP6, TMP26);
+
+ vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+ vis_padd16(TMP24, TMP8, TMP24);
+
+ vis_padd16(TMP26, TMP10, TMP26);
+ vis_pack16(TMP24, DST_0);
+
+ vis_pack16(TMP26, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_pmerge(ZERO, REF_S6, TMP4);
+
+ vis_pmerge(ZERO, REF_S6_1, TMP6);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+
+ vis_padd16(TMP0, TMP12, TMP0);
+
+ vis_padd16(TMP2, TMP14, TMP2);
+ vis_pack16(TMP0, DST_2);
+
+ vis_pack16(TMP2, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_put_xy_8_vis: 8-pixel-wide variant of MC_put_xy_16_vis -- half-pel
+ * interpolation in both directions (four-pixel neighbourhood, rounding
+ * constant CONST_2), stored straight to dest without reading it.
+ * Two rows per loop iteration; height assumed even.
+ * off/off_plus_1 select the horizontally adjacent pixels via faligndata,
+ * with vis_src1 covering the off == 7 boundary case.
+ * Hand-scheduled VIS register code; do not reorder statements.
+ */
+static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(constants2[0], CONST_2);
+
+ vis_ld64(constants256_512[0], CONST_256);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ }
+
+ height >>= 1;
+ do { /* 26 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP8);
+ vis_pmerge(ZERO, REF_S2, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ ref += stride;
+ vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+ vis_pmerge(ZERO, REF_S2_1, TMP14);
+
+ vis_ld64_2(ref, stride, TMP4);
+
+ vis_ld64_2(ref, stride_8, TMP6);
+ ref += stride;
+ vis_faligndata(TMP0, TMP2, REF_S4);
+
+ vis_pmerge(ZERO, REF_S4, TMP18);
+
+ vis_pmerge(ZERO, REF_S4_1, TMP20);
+
+ vis_faligndata(TMP4, TMP6, REF_S0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S6);
+ vis_faligndata(TMP4, TMP6, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S6);
+ vis_src1(TMP6, REF_S2);
+ }
+
+ vis_padd16(TMP18, CONST_2, TMP18);
+ vis_mul8x16au(REF_S6, CONST_256, TMP22);
+
+ vis_padd16(TMP20, CONST_2, TMP20);
+ vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+ vis_mul8x16au(REF_S0, CONST_256, TMP26);
+ vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+ vis_mul8x16au(REF_S2, CONST_256, TMP30);
+ vis_padd16(TMP18, TMP22, TMP18);
+
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+ vis_padd16(TMP20, TMP24, TMP20);
+
+ vis_padd16(TMP8, TMP18, TMP8);
+
+ vis_padd16(TMP10, TMP20, TMP10);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ vis_padd16(TMP18, TMP26, TMP18);
+
+ vis_padd16(TMP20, TMP28, TMP20);
+
+ vis_padd16(TMP18, TMP30, TMP18);
+
+ vis_padd16(TMP20, TMP32, TMP20);
+ vis_pack16(TMP18, DST_2);
+
+ vis_pack16(TMP20, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_avg_xy_16_vis: 16-pixel-wide half-pel interpolation in both
+ * directions, averaged into dest: the four-pixel reference neighbourhood
+ * (rounding constant CONST_6) is combined with the existing dest pixels
+ * (DST_* weighted via CONST_1024) before packing back to bytes.
+ * Two rows per loop iteration; height assumed even.
+ * Note the GSR scale factor is 4 here versus 5 in the put variants --
+ * presumably to normalise the extra dest term in the weighted sum;
+ * NOTE(review): confirm against the constants table.
+ * off/off_plus_1 handle horizontal sub-alignment, vis_src1 covers
+ * off == 7.  Hand-scheduled VIS register code; do not reorder.
+ */
+static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+ int stride_16 = stride + 16;
+
+ vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(constants6[0], CONST_6);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ vis_ld64(constants256_1024[0], CONST_256);
+ vis_faligndata(TMP2, TMP4, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ vis_faligndata(TMP2, TMP4, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ vis_src1(TMP4, REF_S6);
+ }
+
+ height >>= 1;
+ do { /* 55 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP12);
+ vis_pmerge(ZERO, REF_S0_1, TMP14);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ vis_mul8x16au(REF_S2, CONST_256, TMP16);
+ vis_pmerge(ZERO, REF_S2_1, TMP18);
+
+ vis_ld64_2(ref, stride_16, TMP4);
+ ref += stride;
+ vis_mul8x16au(REF_S4, CONST_256, TMP20);
+ vis_pmerge(ZERO, REF_S4_1, TMP22);
+
+ vis_ld64_2(ref, stride, TMP6);
+ vis_mul8x16au(REF_S6, CONST_256, TMP24);
+ vis_pmerge(ZERO, REF_S6_1, TMP26);
+
+ vis_ld64_2(ref, stride_8, TMP8);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, stride_16, TMP10);
+ ref += stride;
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP6, TMP8, REF_S0);
+
+ vis_ld64_2(dest, 8, DST_2);
+ vis_faligndata(TMP8, TMP10, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ vis_faligndata(TMP6, TMP8, REF_S2);
+ vis_faligndata(TMP8, TMP10, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ vis_src1(TMP8, REF_S2);
+ vis_src1(TMP10, REF_S6);
+ }
+
+ vis_mul8x16al(DST_0, CONST_1024, TMP30);
+ vis_pmerge(ZERO, REF_0, TMP0);
+
+ vis_mul8x16al(DST_1, CONST_1024, TMP32);
+ vis_pmerge(ZERO, REF_0_1, TMP2);
+
+ vis_mul8x16au(REF_2, CONST_256, TMP4);
+ vis_pmerge(ZERO, REF_2_1, TMP6);
+
+ /* REF_0/REF_2 are re-used below as scratch for dest products --
+ their reference contents have already been consumed */
+ vis_mul8x16al(DST_2, CONST_1024, REF_0);
+ vis_padd16(TMP0, CONST_6, TMP0);
+
+ vis_mul8x16al(DST_3, CONST_1024, REF_2);
+ vis_padd16(TMP2, CONST_6, TMP2);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+ vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+ vis_padd16(TMP12, TMP0, TMP12);
+ vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+ vis_padd16(TMP14, TMP2, TMP14);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+
+ vis_padd16(TMP12, TMP16, TMP12);
+ vis_mul8x16au(REF_S0, CONST_256, REF_4);
+
+ vis_padd16(TMP14, TMP18, TMP14);
+ vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+
+ vis_padd16(TMP12, TMP30, TMP12);
+
+ vis_padd16(TMP14, TMP32, TMP14);
+ vis_pack16(TMP12, DST_0);
+
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_padd16(TMP4, CONST_6, TMP4);
+
+ vis_ld64_2(dest, stride, DST_0);
+ vis_padd16(TMP6, CONST_6, TMP6);
+ vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+ vis_padd16(TMP4, TMP8, TMP4);
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
+
+ vis_padd16(TMP6, TMP10, TMP6);
+
+ vis_padd16(TMP20, TMP4, TMP20);
+
+ vis_padd16(TMP22, TMP6, TMP22);
+
+ vis_padd16(TMP20, TMP24, TMP20);
+
+ vis_padd16(TMP22, TMP26, TMP22);
+
+ vis_padd16(TMP20, REF_0, TMP20);
+ vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+ vis_padd16(TMP22, REF_2, TMP22);
+ vis_pack16(TMP20, DST_2);
+
+ vis_pack16(TMP22, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+
+ vis_ld64_2(dest, 8, DST_2);
+ vis_mul8x16al(DST_0, CONST_1024, TMP30);
+ vis_pmerge(ZERO, REF_S4_1, REF_2);
+
+ vis_mul8x16al(DST_1, CONST_1024, TMP32);
+ vis_padd16(REF_4, TMP0, TMP8);
+
+ vis_mul8x16au(REF_S6, CONST_256, REF_4);
+ vis_padd16(REF_6, TMP2, TMP10);
+
+ vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+
+ vis_padd16(TMP8, TMP30, TMP8);
+
+ vis_padd16(TMP10, TMP32, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_st64(DST_0, dest[0]);
+
+ vis_padd16(REF_0, TMP4, REF_0);
+
+ vis_mul8x16al(DST_2, CONST_1024, TMP30);
+ vis_padd16(REF_2, TMP6, REF_2);
+
+ vis_mul8x16al(DST_3, CONST_1024, TMP32);
+ vis_padd16(REF_0, REF_4, REF_0);
+
+ vis_padd16(REF_2, REF_6, REF_2);
+
+ vis_padd16(REF_0, TMP30, REF_0);
+
+ /* stall */
+
+ vis_padd16(REF_2, TMP32, REF_2);
+ vis_pack16(REF_0, DST_2);
+
+ vis_pack16(REF_2, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_avg_xy_8_vis: 8-pixel-wide variant of MC_avg_xy_16_vis -- half-pel
+ * interpolation in both directions (rounding constant CONST_6), averaged
+ * with the existing dest pixels (DST_* weighted via CONST_1024), two
+ * rows per loop iteration.  height assumed even.
+ * REF_S4/REF_S6 and REF_0/REF_2 are re-used as scratch registers for
+ * intermediate products once their original contents are consumed.
+ * Hand-scheduled VIS register code; do not reorder statements.
+ */
+static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+
+ vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64(constants6[0], CONST_6);
+
+ vis_ld64(constants256_1024[0], CONST_256);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ }
+
+ height >>= 1;
+ do { /* 31 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP8);
+ vis_pmerge(ZERO, REF_S0_1, TMP10);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ ref += stride;
+ vis_mul8x16au(REF_S2, CONST_256, TMP12);
+ vis_pmerge(ZERO, REF_S2_1, TMP14);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride, TMP4);
+ vis_faligndata(TMP0, TMP2, REF_S4);
+
+ vis_ld64_2(ref, stride_8, TMP6);
+ ref += stride;
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP4, TMP6, REF_S0);
+
+ vis_ld64_2(dest, stride, DST_2);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S6);
+ vis_faligndata(TMP4, TMP6, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S6);
+ vis_src1(TMP6, REF_S2);
+ }
+
+ vis_mul8x16al(DST_0, CONST_1024, TMP30);
+ vis_pmerge(ZERO, REF_S4, TMP22);
+
+ vis_mul8x16al(DST_1, CONST_1024, TMP32);
+ vis_pmerge(ZERO, REF_S4_1, TMP24);
+
+ vis_mul8x16au(REF_S6, CONST_256, TMP26);
+ vis_pmerge(ZERO, REF_S6_1, TMP28);
+
+ vis_mul8x16au(REF_S0, CONST_256, REF_S4);
+ vis_padd16(TMP22, CONST_6, TMP22);
+
+ vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+ vis_padd16(TMP24, CONST_6, TMP24);
+
+ vis_mul8x16al(DST_2, CONST_1024, REF_0);
+ vis_padd16(TMP22, TMP26, TMP22);
+
+ vis_mul8x16al(DST_3, CONST_1024, REF_2);
+ vis_padd16(TMP24, TMP28, TMP24);
+
+ vis_mul8x16au(REF_S2, CONST_256, TMP26);
+ vis_padd16(TMP8, TMP22, TMP8);
+
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+ vis_padd16(TMP10, TMP24, TMP10);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+
+ vis_padd16(TMP8, TMP30, TMP8);
+
+ vis_padd16(TMP10, TMP32, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_padd16(REF_S4, TMP22, TMP12);
+
+ vis_padd16(REF_S6, TMP24, TMP14);
+
+ vis_padd16(TMP12, TMP26, TMP12);
+
+ vis_padd16(TMP14, TMP28, TMP14);
+
+ vis_padd16(TMP12, REF_0, TMP12);
+
+ vis_padd16(TMP14, REF_2, TMP14);
+ vis_pack16(TMP12, DST_2);
+
+ vis_pack16(TMP14, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/* End of rounding code */
+
+/* Start of no rounding code */
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y)>>1 == (x&y)+((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16. So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ * fxor f0, f2, f10
+ * fand f10, f4, f10
+ * fmul8x16 f8, f10, f10
+ * fand f10, f6, f10
+ * fand f0, f2, f12
+ * fpadd16 f12, f10, f10
+ */
+
+/*
+ * MC_put_no_round_o_16_vis: copy a 16-pixel-wide block from ref to dest
+ * with no interpolation ("o" = zero half-pel offset).  Since nothing is
+ * averaged, "no rounding" is identical to the rounding version: three
+ * 8-byte loads plus faligndata realign the (possibly unaligned) source
+ * into two aligned 8-byte stores per row.
+ */
+static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+ do { /* 5 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+ vis_st64(REF_0, dest[0]);
+
+ vis_faligndata(TMP2, TMP4, REF_2);
+ vis_st64_2(REF_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_put_no_round_o_8_vis: copy an 8-pixel-wide block from ref to dest
+ * with no interpolation; two 8-byte loads plus faligndata realign the
+ * unaligned source into one aligned 8-byte store per row.
+ */
+static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+ do { /* 4 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+ ref += stride;
+
+ /* stall */
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+ vis_st64(REF_0, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+
+/*
+ * MC_avg_no_round_o_16_vis: truncating (no-rounding) average of a
+ * 16-pixel-wide reference block into dest, no interpolation.  Uses the
+ * identity (x+y)>>1 == (x&y) + ((x^y)>>1) described above: xor, mask
+ * with 0xfe, halve via mul8x16 with CONST_128, mask with 0x7f, then add
+ * the (x&y) term.
+ * The loop runs (height/2 - 1) iterations of two rows each, followed by
+ * a fully unrolled epilogue for the final two rows (so the software
+ * pipeline never loads past the block) -- height is assumed even and
+ * >= 4; NOTE(review): confirm against callers.
+ */
+static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ int stride_8 = stride + 8;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(dest[0], DST_0);
+
+ vis_ld64(dest[8], DST_2);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP2, TMP4, REF_2);
+
+ vis_ld64(constants128[0], CONST_128);
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 24 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_xor(DST_2, REF_2, TMP8);
+
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_and(DST_0, REF_0, TMP10);
+ vis_ld64_2(dest, stride, DST_0);
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+
+ vis_and(DST_2, REF_2, TMP12);
+ vis_ld64_2(dest, stride_8, DST_2);
+
+ vis_ld64(ref[0], TMP14);
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_padd16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_padd16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+
+ dest += stride;
+ vis_ld64_2(ref, 8, TMP16);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 16, TMP18);
+ vis_faligndata(TMP2, TMP4, REF_2);
+ ref += stride;
+
+ vis_xor(DST_0, REF_0, TMP20);
+
+ vis_and(TMP20, MASK_fe, TMP20);
+
+ vis_xor(DST_2, REF_2, TMP22);
+ vis_mul8x16(CONST_128, TMP20, TMP20);
+
+ vis_and(TMP22, MASK_fe, TMP22);
+
+ vis_and(DST_0, REF_0, TMP24);
+ vis_mul8x16(CONST_128, TMP22, TMP22);
+
+ vis_and(DST_2, REF_2, TMP26);
+
+ vis_ld64_2(dest, stride, DST_0);
+ vis_faligndata(TMP14, TMP16, REF_0);
+
+ vis_ld64_2(dest, stride_8, DST_2);
+ vis_faligndata(TMP16, TMP18, REF_2);
+
+ vis_and(TMP20, MASK_7f, TMP20);
+
+ vis_and(TMP22, MASK_7f, TMP22);
+
+ vis_padd16(TMP24, TMP20, TMP20);
+ vis_st64(TMP20, dest[0]);
+
+ vis_padd16(TMP26, TMP22, TMP22);
+ vis_st64_2(TMP22, dest, 8);
+ dest += stride;
+ } while (--height);
+
+ /* epilogue: last two rows, same computation without further
+ read-ahead of ref */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_xor(DST_2, REF_2, TMP8);
+
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_and(DST_0, REF_0, TMP10);
+ vis_ld64_2(dest, stride, DST_0);
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+
+ vis_and(DST_2, REF_2, TMP12);
+ vis_ld64_2(dest, stride_8, DST_2);
+
+ vis_ld64(ref[0], TMP14);
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_padd16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_padd16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+
+ dest += stride;
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_2);
+
+ vis_xor(DST_0, REF_0, TMP20);
+
+ vis_and(TMP20, MASK_fe, TMP20);
+
+ vis_xor(DST_2, REF_2, TMP22);
+ vis_mul8x16(CONST_128, TMP20, TMP20);
+
+ vis_and(TMP22, MASK_fe, TMP22);
+
+ vis_and(DST_0, REF_0, TMP24);
+ vis_mul8x16(CONST_128, TMP22, TMP22);
+
+ vis_and(DST_2, REF_2, TMP26);
+
+ vis_and(TMP20, MASK_7f, TMP20);
+
+ vis_and(TMP22, MASK_7f, TMP22);
+
+ vis_padd16(TMP24, TMP20, TMP20);
+ vis_st64(TMP20, dest[0]);
+
+ vis_padd16(TMP26, TMP22, TMP22);
+ vis_st64_2(TMP22, dest, 8);
+}
+
+/*
+ * MC_avg_no_round_o_8_vis: 8-pixel-wide truncating average of ref into
+ * dest, no interpolation, using the (x&y) + (((x^y)&0xfe)>>1) identity.
+ * Loop handles two rows per iteration for (height/2 - 1) iterations,
+ * then an unrolled epilogue finishes the last two rows -- height is
+ * assumed even and >= 4; NOTE(review): confirm against callers.
+ */
+static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_ld64(dest[0], DST_0);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants128[0], CONST_128);
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 12 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP4);
+
+ vis_ld64(ref[8], TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_and(DST_0, REF_0, TMP6);
+ vis_ld64_2(dest, stride, DST_0);
+ ref += stride;
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_ld64(ref[0], TMP12);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(ref[8], TMP2);
+ vis_xor(DST_0, REF_0, TMP0);
+ ref += stride;
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_padd16(TMP6, TMP4, TMP4);
+ vis_st64(TMP4, dest[0]);
+ dest += stride;
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_and(DST_0, REF_0, TMP6);
+ vis_ld64_2(dest, stride, DST_0);
+
+ vis_faligndata(TMP12, TMP2, REF_0);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_padd16(TMP6, TMP0, TMP4);
+ vis_st64(TMP4, dest[0]);
+ dest += stride;
+ } while (--height);
+
+ /* epilogue: final two rows without reading ref ahead */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(DST_0, REF_0, TMP4);
+
+ vis_ld64(ref[8], TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_and(DST_0, REF_0, TMP6);
+ vis_ld64_2(dest, stride, DST_0);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_xor(DST_0, REF_0, TMP0);
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_padd16(TMP6, TMP4, TMP4);
+ vis_st64(TMP4, dest[0]);
+ dest += stride;
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_and(DST_0, REF_0, TMP6);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_padd16(TMP6, TMP0, TMP4);
+ vis_st64(TMP4, dest[0]);
+}
+
+/*
+ * MC_put_no_round_x_16_vis: 16-pixel-wide half-pel *horizontal*
+ * interpolation with truncating average, stored to dest (dest is not
+ * read).  Each output pixel is (a&b) + (((a^b)&0xfe)>>1) of two
+ * horizontally adjacent reference pixels; the neighbour is obtained by a
+ * second faligndata pass at off+1 (vis_src1 when off == 7).
+ * Loop runs (height/2 - 1) double-row iterations plus a fully unrolled
+ * two-row epilogue -- height assumed even and >= 4; NOTE(review):
+ * confirm against callers.
+ */
+static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64_2(ref, 16, TMP4);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants128[0], CONST_128);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 34 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_and(TMP6, MASK_fe, TMP6);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP14);
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_ld64_2(ref, 8, TMP16);
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_and(REF_0, REF_2, TMP10);
+
+ vis_ld64_2(ref, 16, TMP18);
+ ref += stride;
+ vis_and(REF_4, REF_6, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_padd16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_padd16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_and(REF_0, REF_2, TMP10);
+
+ vis_and(REF_4, REF_6, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP14, TMP16, REF_0);
+
+ vis_faligndata(TMP16, TMP18, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP14, TMP16, REF_2);
+ vis_faligndata(TMP16, TMP18, REF_6);
+ } else {
+ vis_src1(TMP16, REF_2);
+ vis_src1(TMP18, REF_6);
+ }
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_padd16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_padd16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+ dest += stride;
+ } while (--height);
+
+ /* epilogue: final two rows without further ref read-ahead */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_and(REF_0, REF_2, TMP10);
+
+ vis_and(REF_4, REF_6, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_padd16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_padd16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP6);
+
+ vis_xor(REF_4, REF_6, TMP8);
+
+ vis_and(TMP6, MASK_fe, TMP6);
+
+ vis_mul8x16(CONST_128, TMP6, TMP6);
+ vis_and(TMP8, MASK_fe, TMP8);
+
+ vis_mul8x16(CONST_128, TMP8, TMP8);
+ vis_and(REF_0, REF_2, TMP10);
+
+ vis_and(REF_4, REF_6, TMP12);
+
+ vis_and(TMP6, MASK_7f, TMP6);
+
+ vis_and(TMP8, MASK_7f, TMP8);
+
+ vis_padd16(TMP10, TMP6, TMP6);
+ vis_st64(TMP6, dest[0]);
+
+ vis_padd16(TMP12, TMP8, TMP8);
+ vis_st64_2(TMP8, dest, 8);
+}
+
+/*
+ * MC_put_no_round_x_8_vis: 8-pixel-wide half-pel horizontal
+ * interpolation with truncating average ((a&b) + (((a^b)&0xfe)>>1)),
+ * stored to dest.  Same structure as the 16-wide variant: double-row
+ * loop for (height/2 - 1) iterations plus a two-row epilogue; height
+ * assumed even and >= 4 -- NOTE(review): confirm against callers.
+ */
+static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_ld64(constants_fe[0], MASK_fe);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+
+ vis_ld64(constants128[0], CONST_128);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ } else {
+ vis_src1(TMP2, REF_2);
+ }
+
+ ref += stride;
+ height = (height >> 1) - 1;
+
+ do { /* 20 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP8);
+ vis_and(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, 8, TMP10);
+ ref += stride;
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ } else {
+ vis_src1(TMP2, REF_2);
+ }
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_padd16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_and(REF_0, REF_2, TMP14);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+ vis_faligndata(TMP8, TMP10, REF_0);
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP8, TMP10, REF_2);
+ } else {
+ vis_src1(TMP10, REF_2);
+ }
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_padd16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ } while (--height);
+
+ /* epilogue: final two rows without reading ref ahead */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_and(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ } else {
+ vis_src1(TMP2, REF_2);
+ }
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_padd16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_and(REF_0, REF_2, TMP14);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_padd16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+}
+
+/*
+ * MC_avg_no_round_x_16_vis: 16-pixel-wide half-pel horizontal
+ * interpolation averaged into dest.  Unlike the put no-round variants,
+ * this uses weighted-sum arithmetic (CONST_256/CONST_512 multiplies
+ * plus rounding constant CONST_3) identical in structure to the
+ * rounding avg path -- presumably because the final average with dest
+ * rounds regardless; NOTE(review): confirm intent against the MMX/ref
+ * implementations.  One row per loop iteration.
+ */
+static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_fzero(ZERO);
+ vis_ld64(constants256_512[0], CONST_256);
+
+ ref = vis_alignaddr(ref);
+ do { /* 26 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64(ref[8], TMP2);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(dest[8], DST_2);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ }
+
+ vis_mul8x16au(REF_0, CONST_256, TMP0);
+
+ vis_pmerge(ZERO, REF_2, TMP4);
+ vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+ vis_pmerge(ZERO, REF_2_1, TMP6);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+
+ vis_mul8x16al(DST_0, CONST_512, TMP4);
+ vis_padd16(TMP2, TMP6, TMP2);
+
+ vis_mul8x16al(DST_1, CONST_512, TMP6);
+
+ vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_4, CONST_256, TMP16);
+
+ vis_padd16(TMP0, CONST_3, TMP8);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+ vis_padd16(TMP2, CONST_3, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_padd16(TMP16, TMP12, TMP0);
+
+ vis_st64(DST_0, dest[0]);
+ vis_mul8x16al(DST_2, CONST_512, TMP4);
+ vis_padd16(TMP18, TMP14, TMP2);
+
+ vis_mul8x16al(DST_3, CONST_512, TMP6);
+ vis_padd16(TMP0, CONST_3, TMP0);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_pack16(TMP0, DST_2);
+
+ vis_pack16(TMP2, DST_3);
+ vis_st64(DST_2, dest[8]);
+
+ ref += stride;
+ dest += stride;
+ } while (--height);
+}
+
+/*
+ * MC_avg_no_round_x_8_vis: 8-pixel-wide half-pel horizontal
+ * interpolation averaged into dest, four rows per loop iteration
+ * (height assumed to be a multiple of 4 -- it is divided by 4 below;
+ * NOTE(review): confirm against callers).
+ * Like the 16-wide variant this uses the CONST_3/CONST_512 weighted-sum
+ * arithmetic of the rounding avg path.
+ * The TMP4/TMP5 pair doubles as DST_2/DST_3 for rows 3-4: vis_ld64_2
+ * loads a 64-bit value into the even/odd fp-register pair, so TMP5 is
+ * the odd half of the TMP4 load (hence the DST_2/DST_3 comments).
+ */
+static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_times_2 = stride << 1;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_fzero(ZERO);
+ vis_ld64(constants256_512[0], CONST_256);
+
+ ref = vis_alignaddr(ref);
+ height >>= 2;
+ do { /* 47 cycles */
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64(ref[0], TMP4);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 8, TMP6);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP8);
+
+ vis_ld64_2(ref, 8, TMP10);
+ ref += stride;
+ vis_faligndata(TMP4, TMP6, REF_4);
+
+ vis_ld64(ref[0], TMP12);
+
+ vis_ld64_2(ref, 8, TMP14);
+ ref += stride;
+ vis_faligndata(TMP8, TMP10, REF_S0);
+
+ vis_faligndata(TMP12, TMP14, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_ld64_2(dest, stride, DST_2);
+ vis_faligndata(TMP4, TMP6, REF_6);
+
+ vis_faligndata(TMP8, TMP10, REF_S2);
+
+ vis_faligndata(TMP12, TMP14, REF_S6);
+ } else {
+ vis_ld64(dest[0], DST_0);
+ vis_src1(TMP2, REF_2);
+
+ vis_ld64_2(dest, stride, DST_2);
+ vis_src1(TMP6, REF_6);
+
+ vis_src1(TMP10, REF_S2);
+
+ vis_src1(TMP14, REF_S6);
+ }
+
+ vis_pmerge(ZERO, REF_0, TMP0);
+ vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+ vis_pmerge(ZERO, REF_2, TMP4);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+ vis_padd16(TMP0, CONST_3, TMP0);
+ vis_mul8x16al(DST_0, CONST_512, TMP16);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+ vis_mul8x16al(DST_1, CONST_512, TMP18);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+ vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+ vis_padd16(TMP0, TMP16, TMP0);
+ vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+ vis_padd16(TMP2, TMP18, TMP2);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+ vis_padd16(TMP8, CONST_3, TMP8);
+ vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+ vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_pack16(TMP0, DST_0);
+
+ vis_pack16(TMP2, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ vis_padd16(TMP10, CONST_3, TMP10);
+
+ vis_ld64_2(dest, stride, DST_0);
+ vis_padd16(TMP8, TMP16, TMP8);
+
+ vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
+ vis_padd16(TMP10, TMP18, TMP10);
+ vis_pack16(TMP8, DST_2);
+
+ vis_pack16(TMP10, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+
+ vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+ vis_pmerge(ZERO, REF_S0, TMP0);
+
+ vis_pmerge(ZERO, REF_S2, TMP24);
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+ vis_padd16(TMP0, CONST_3, TMP0);
+ vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+ vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+ vis_padd16(TMP0, TMP24, TMP0);
+ vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+ vis_padd16(TMP8, CONST_3, TMP8);
+ vis_mul8x16al(DST_0, CONST_512, TMP16);
+
+ vis_padd16(TMP10, CONST_3, TMP10);
+ vis_mul8x16al(DST_1, CONST_512, TMP18);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+ vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+ vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+ vis_padd16(TMP0, TMP16, TMP0);
+
+ vis_padd16(TMP2, TMP18, TMP2);
+ vis_pack16(TMP0, DST_0);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_pack16(TMP2, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_padd16(TMP8, TMP20, TMP8);
+
+ vis_padd16(TMP10, TMP22, TMP10);
+ vis_pack16(TMP8, DST_2);
+
+ vis_pack16(TMP10, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/* Put, no-rounding, vertical (y) half-pel interpolation, 16 pixels wide,
+ * using SPARC VIS SIMD macros.
+ * Each main-loop pass emits two output rows; the epilogue after the loop
+ * emits the final two rows without issuing further source loads.  The odd
+ * statement interleaving is deliberate hand scheduling (the cycle note on
+ * the loop is the per-iteration budget) -- do not reorder statements.
+ * The and/xor + MASK_fe + mul8x16(CONST_128) (a per-byte >>1) + MASK_7f
+ * sequence implements the classic non-rounding byte average
+ * (a & b) + (((a ^ b) & 0xfe) >> 1) of vertically adjacent rows. */
+static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ /* Prologue: align the source and prime REF_0/REF_4 (row 0) and
+ * REF_2/REF_6 (row 1); also load the bit-mask constants. */
+ ref = vis_alignaddr(ref);
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP6);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 8, TMP8);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_ld64_2(ref, 16, TMP10);
+ ref += stride;
+
+ vis_ld64(constants_fe[0], MASK_fe);
+ vis_faligndata(TMP6, TMP8, REF_2);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP8, TMP10, REF_6);
+
+ vis_ld64(constants128[0], CONST_128);
+ /* One loop pass per two rows; the last pair is handled after the loop. */
+ height = (height >> 1) - 1;
+ do { /* 24 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP16);
+
+ vis_ld64_2(ref, 16, TMP4);
+ ref += stride;
+ vis_and(REF_0, REF_2, TMP14);
+
+ vis_ld64(ref[0], TMP6);
+ vis_and(REF_4, REF_6, TMP18);
+
+ vis_ld64_2(ref, 8, TMP8);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, 16, TMP10);
+ ref += stride;
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_and(TMP16, MASK_fe, TMP16);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_mul8x16(CONST_128, TMP16, TMP16);
+ vis_xor(REF_0, REF_2, TMP0);
+
+ vis_xor(REF_4, REF_6, TMP2);
+
+ vis_and(REF_0, REF_2, TMP20);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_and(TMP16, MASK_7f, TMP16);
+
+ vis_padd16(TMP14, TMP12, TMP12);
+ vis_st64(TMP12, dest[0]);
+
+ vis_padd16(TMP18, TMP16, TMP16);
+ vis_st64_2(TMP16, dest, 8);
+ dest += stride;
+
+ vis_and(REF_4, REF_6, TMP18);
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP2, MASK_fe, TMP2);
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_faligndata(TMP6, TMP8, REF_2);
+ vis_mul8x16(CONST_128, TMP2, TMP2);
+
+ vis_faligndata(TMP8, TMP10, REF_6);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_and(TMP2, MASK_7f, TMP2);
+
+ vis_padd16(TMP20, TMP0, TMP0);
+ vis_st64(TMP0, dest[0]);
+
+ vis_padd16(TMP18, TMP2, TMP2);
+ vis_st64_2(TMP2, dest, 8);
+ dest += stride;
+ } while (--height);
+
+ /* Epilogue: compute the final two output rows (only one extra source
+ * row is read here, so the loop above never over-reads). */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_xor(REF_4, REF_6, TMP16);
+
+ vis_ld64_2(ref, 16, TMP4);
+ vis_and(REF_0, REF_2, TMP14);
+
+ vis_and(REF_4, REF_6, TMP18);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_and(TMP16, MASK_fe, TMP16);
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+
+ vis_mul8x16(CONST_128, TMP16, TMP16);
+ vis_xor(REF_0, REF_2, TMP0);
+
+ vis_xor(REF_4, REF_6, TMP2);
+
+ vis_and(REF_0, REF_2, TMP20);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_and(TMP16, MASK_7f, TMP16);
+
+ vis_padd16(TMP14, TMP12, TMP12);
+ vis_st64(TMP12, dest[0]);
+
+ vis_padd16(TMP18, TMP16, TMP16);
+ vis_st64_2(TMP16, dest, 8);
+ dest += stride;
+
+ vis_and(REF_4, REF_6, TMP18);
+
+ vis_and(TMP0, MASK_fe, TMP0);
+
+ vis_and(TMP2, MASK_fe, TMP2);
+ vis_mul8x16(CONST_128, TMP0, TMP0);
+
+ vis_mul8x16(CONST_128, TMP2, TMP2);
+
+ vis_and(TMP0, MASK_7f, TMP0);
+
+ vis_and(TMP2, MASK_7f, TMP2);
+
+ vis_padd16(TMP20, TMP0, TMP0);
+ vis_st64(TMP0, dest[0]);
+
+ vis_padd16(TMP18, TMP2, TMP2);
+ vis_st64_2(TMP2, dest, 8);
+}
+
+/* Put, no-rounding, vertical (y) half-pel interpolation, 8 pixels wide.
+ * 8-wide sibling of MC_put_no_round_y_16_vis: same non-rounding average
+ * (a & b) + (((a ^ b) & 0xfe) >> 1) of vertically adjacent rows, two
+ * output rows per loop pass plus a final pair in the epilogue.  The
+ * statement order is hand scheduling -- do not reorder. */
+static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+
+ /* Prologue: prime REF_0 (row 0) and REF_2 (row 1), load constants. */
+ ref = vis_alignaddr(ref);
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+
+ vis_ld64(ref[0], TMP4);
+
+ vis_ld64_2(ref, 8, TMP6);
+ ref += stride;
+
+ vis_ld64(constants_fe[0], MASK_fe);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64(constants_7f[0], MASK_7f);
+ vis_faligndata(TMP4, TMP6, REF_2);
+
+ vis_ld64(constants128[0], CONST_128);
+ height = (height >> 1) - 1;
+ do { /* 12 cycles */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_and(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+ vis_ld64(ref[0], TMP0);
+
+ vis_ld64_2(ref, 8, TMP2);
+ ref += stride;
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+ vis_and(REF_0, REF_2, TMP14);
+
+ vis_padd16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_padd16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ } while (--height);
+
+ /* Epilogue: final two rows, reading only one more source row. */
+ vis_ld64(ref[0], TMP0);
+ vis_xor(REF_0, REF_2, TMP4);
+
+ vis_ld64_2(ref, 8, TMP2);
+ vis_and(TMP4, MASK_fe, TMP4);
+
+ vis_and(REF_0, REF_2, TMP6);
+ vis_mul8x16(CONST_128, TMP4, TMP4);
+
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_xor(REF_0, REF_2, TMP12);
+
+ vis_and(TMP4, MASK_7f, TMP4);
+
+ vis_and(TMP12, MASK_fe, TMP12);
+
+ vis_mul8x16(CONST_128, TMP12, TMP12);
+ vis_and(REF_0, REF_2, TMP14);
+
+ vis_padd16(TMP6, TMP4, DST_0);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_and(TMP12, MASK_7f, TMP12);
+
+ vis_padd16(TMP14, TMP12, DST_0);
+ vis_st64(DST_0, dest[0]);
+}
+
+/* Avg, no-rounding, vertical (y) half-pel interpolation, 16 pixels wide.
+ * Combines two vertically adjacent source rows with the bytes already in
+ * dest (loaded into DST_0/DST_2 and the REF_S* "DST_4/DST_6" aliases)
+ * using widening 8x16 multiplies against CONST_256/CONST_512, then packs
+ * with the GSR scale factor set below and stores back.  Two output rows
+ * per loop pass; hand-scheduled instruction order -- do not reorder. */
+static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ int stride_8 = stride + 8;
+ int stride_16 = stride + 16;
+
+ /* Pack-shift amount for vis_pack16 of the weighted 16-bit sums. */
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_ld64(constants256_512[0], CONST_256);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ height >>= 1;
+
+ do { /* 31 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_pmerge(ZERO, REF_2, TMP12);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ vis_pmerge(ZERO, REF_6, TMP16);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+ vis_ld64_2(ref, stride_16, TMP4);
+ ref += stride;
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(dest, 8, DST_2);
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_ld64_2(ref, stride, TMP6);
+ vis_pmerge(ZERO, REF_0, TMP0);
+ vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+ vis_ld64_2(ref, stride_8, TMP8);
+ vis_pmerge(ZERO, REF_4, TMP4);
+
+ vis_ld64_2(ref, stride_16, TMP10);
+ ref += stride;
+
+ /* REF_S0/REF_S2 are reused here as the second row's dest words. */
+ vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
+ vis_faligndata(TMP6, TMP8, REF_2);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+ vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+ vis_faligndata(TMP8, TMP10, REF_6);
+ vis_mul8x16al(DST_0, CONST_512, TMP20);
+
+ vis_padd16(TMP0, CONST_3, TMP0);
+ vis_mul8x16al(DST_1, CONST_512, TMP22);
+
+ vis_padd16(TMP2, CONST_3, TMP2);
+ vis_mul8x16al(DST_2, CONST_512, TMP24);
+
+ vis_padd16(TMP4, CONST_3, TMP4);
+ vis_mul8x16al(DST_3, CONST_512, TMP26);
+
+ vis_padd16(TMP6, CONST_3, TMP6);
+
+ vis_padd16(TMP12, TMP20, TMP12);
+ vis_mul8x16al(REF_S0, CONST_512, TMP20);
+
+ vis_padd16(TMP14, TMP22, TMP14);
+ vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+ vis_padd16(TMP16, TMP24, TMP16);
+ vis_mul8x16al(REF_S2, CONST_512, TMP24);
+
+ vis_padd16(TMP18, TMP26, TMP18);
+ vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+ vis_padd16(TMP12, TMP0, TMP12);
+ vis_mul8x16au(REF_2, CONST_256, TMP28);
+
+ vis_padd16(TMP14, TMP2, TMP14);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+
+ vis_padd16(TMP16, TMP4, TMP16);
+ vis_mul8x16au(REF_6, CONST_256, REF_S4);
+
+ vis_padd16(TMP18, TMP6, TMP18);
+ vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+
+ vis_pack16(TMP12, DST_0);
+ vis_padd16(TMP28, TMP0, TMP12);
+
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_padd16(TMP30, TMP2, TMP14);
+
+ vis_pack16(TMP16, DST_2);
+ vis_padd16(REF_S4, TMP4, TMP16);
+
+ vis_pack16(TMP18, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ vis_padd16(REF_S6, TMP6, TMP18);
+
+ vis_padd16(TMP12, TMP20, TMP12);
+
+ vis_padd16(TMP14, TMP22, TMP14);
+ vis_pack16(TMP12, DST_0);
+
+ vis_padd16(TMP16, TMP24, TMP16);
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+
+ vis_padd16(TMP18, TMP26, TMP18);
+ vis_pack16(TMP16, DST_2);
+
+ vis_pack16(TMP18, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/* Avg, no-rounding, vertical (y) half-pel interpolation, 8 pixels wide.
+ * 8-wide sibling of MC_avg_no_round_y_16_vis: blends two vertically
+ * adjacent source rows with the existing dest bytes via CONST_256 /
+ * CONST_512 weighted 8x16 multiplies and packs with GSR scale 5.
+ * Hand-scheduled instruction order -- do not reorder. */
+static void MC_avg_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ int stride_8 = stride + 8;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(constants3[0], CONST_3);
+ vis_faligndata(TMP0, TMP2, REF_2);
+
+ vis_ld64(constants256_512[0], CONST_256);
+
+ height >>= 1;
+ do { /* 20 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_pmerge(ZERO, REF_2, TMP8);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ ref += stride;
+
+ vis_ld64(dest[0], DST_0);
+
+ vis_ld64_2(dest, stride, DST_2);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, stride, TMP4);
+ vis_mul8x16al(DST_0, CONST_512, TMP16);
+ vis_pmerge(ZERO, REF_0, TMP12);
+
+ vis_ld64_2(ref, stride_8, TMP6);
+ ref += stride;
+ vis_mul8x16al(DST_1, CONST_512, TMP18);
+ vis_pmerge(ZERO, REF_0_1, TMP14);
+
+ vis_padd16(TMP12, CONST_3, TMP12);
+ vis_mul8x16al(DST_2, CONST_512, TMP24);
+
+ vis_padd16(TMP14, CONST_3, TMP14);
+ vis_mul8x16al(DST_3, CONST_512, TMP26);
+
+ vis_faligndata(TMP4, TMP6, REF_2);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_mul8x16au(REF_2, CONST_256, TMP20);
+
+ vis_padd16(TMP8, TMP16, TMP0);
+ vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+
+ vis_padd16(TMP10, TMP18, TMP2);
+ vis_pack16(TMP0, DST_0);
+
+ vis_pack16(TMP2, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ vis_padd16(TMP12, TMP20, TMP12);
+
+ vis_padd16(TMP14, TMP22, TMP14);
+
+ vis_padd16(TMP12, TMP24, TMP0);
+
+ vis_padd16(TMP14, TMP26, TMP2);
+ vis_pack16(TMP0, DST_2);
+
+ vis_pack16(TMP2, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/* Put, no-rounding, diagonal (xy) half-pel interpolation, 16 wide.
+ * Averages four neighbouring pixels (2x2) per output byte via widening
+ * 8x16 multiplies and CONST_1/CONST_2 bias terms, packed with GSR
+ * scale 5.  off/off_plus_1 select the two horizontal byte alignments;
+ * off == 0x7 needs vis_src1 because faligndata cannot express a shift
+ * of a full 8 bytes.  Hand-scheduled order -- do not reorder. */
+static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+ int stride_16 = stride + 16;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(constants1[0], CONST_1);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ vis_ld64(constants256_512[0], CONST_256);
+ vis_faligndata(TMP2, TMP4, REF_S4);
+
+ /* Prime the horizontally-shifted copies REF_S2/REF_S6 of row 0. */
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ vis_faligndata(TMP2, TMP4, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ vis_src1(TMP4, REF_S6);
+ }
+
+ height >>= 1;
+ do {
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP12);
+ vis_pmerge(ZERO, REF_S0_1, TMP14);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ vis_mul8x16au(REF_S2, CONST_256, TMP16);
+ vis_pmerge(ZERO, REF_S2_1, TMP18);
+
+ vis_ld64_2(ref, stride_16, TMP4);
+ ref += stride;
+ vis_mul8x16au(REF_S4, CONST_256, TMP20);
+ vis_pmerge(ZERO, REF_S4_1, TMP22);
+
+ vis_ld64_2(ref, stride, TMP6);
+ vis_mul8x16au(REF_S6, CONST_256, TMP24);
+ vis_pmerge(ZERO, REF_S6_1, TMP26);
+
+ vis_ld64_2(ref, stride_8, TMP8);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, stride_16, TMP10);
+ ref += stride;
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_faligndata(TMP6, TMP8, REF_S0);
+
+ vis_faligndata(TMP8, TMP10, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ vis_faligndata(TMP6, TMP8, REF_S2);
+ vis_faligndata(TMP8, TMP10, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ vis_src1(TMP8, REF_S2);
+ vis_src1(TMP10, REF_S6);
+ }
+
+ vis_mul8x16au(REF_0, CONST_256, TMP0);
+ vis_pmerge(ZERO, REF_0_1, TMP2);
+
+ vis_mul8x16au(REF_2, CONST_256, TMP4);
+ vis_pmerge(ZERO, REF_2_1, TMP6);
+
+ vis_padd16(TMP0, CONST_2, TMP8);
+ vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+ vis_padd16(TMP2, CONST_1, TMP10);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+ vis_padd16(TMP8, TMP4, TMP8);
+ vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+ vis_padd16(TMP10, TMP6, TMP10);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+
+ vis_padd16(TMP12, TMP8, TMP12);
+
+ vis_padd16(TMP14, TMP10, TMP14);
+
+ vis_padd16(TMP12, TMP16, TMP12);
+
+ vis_padd16(TMP14, TMP18, TMP14);
+ vis_pack16(TMP12, DST_0);
+
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_padd16(TMP0, CONST_1, TMP12);
+
+ vis_mul8x16au(REF_S0, CONST_256, TMP0);
+ vis_padd16(TMP2, CONST_1, TMP14);
+
+ vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+ vis_padd16(TMP12, TMP4, TMP12);
+
+ vis_mul8x16au(REF_S2, CONST_256, TMP4);
+ vis_padd16(TMP14, TMP6, TMP14);
+
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+ vis_padd16(TMP20, TMP12, TMP20);
+
+ vis_padd16(TMP22, TMP14, TMP22);
+
+ vis_padd16(TMP20, TMP24, TMP20);
+
+ vis_padd16(TMP22, TMP26, TMP22);
+ vis_pack16(TMP20, DST_2);
+
+ vis_pack16(TMP22, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ vis_padd16(TMP0, TMP4, TMP24);
+
+ vis_mul8x16au(REF_S4, CONST_256, TMP0);
+ vis_padd16(TMP2, TMP6, TMP26);
+
+ vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+ vis_padd16(TMP24, TMP8, TMP24);
+
+ vis_padd16(TMP26, TMP10, TMP26);
+ vis_pack16(TMP24, DST_0);
+
+ vis_pack16(TMP26, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_pmerge(ZERO, REF_S6, TMP4);
+
+ vis_pmerge(ZERO, REF_S6_1, TMP6);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+
+ vis_padd16(TMP0, TMP12, TMP0);
+
+ vis_padd16(TMP2, TMP14, TMP2);
+ vis_pack16(TMP0, DST_2);
+
+ vis_pack16(TMP2, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/* Put, no-rounding, diagonal (xy) half-pel interpolation, 8 wide.
+ * 8-wide sibling of MC_put_no_round_xy_16_vis: 2x2 neighbourhood sum with
+ * CONST_1 bias, packed with GSR scale 5.  off/off_plus_1 give the two
+ * horizontal alignments; off == 0x7 falls back to vis_src1.
+ * Hand-scheduled order -- do not reorder. */
+static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+
+ vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(constants1[0], CONST_1);
+
+ vis_ld64(constants256_512[0], CONST_256);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ }
+
+ height >>= 1;
+ do { /* 26 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP8);
+ vis_pmerge(ZERO, REF_S2, TMP12);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ ref += stride;
+ vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+ vis_pmerge(ZERO, REF_S2_1, TMP14);
+
+ vis_ld64_2(ref, stride, TMP4);
+
+ vis_ld64_2(ref, stride_8, TMP6);
+ ref += stride;
+ vis_faligndata(TMP0, TMP2, REF_S4);
+
+ vis_pmerge(ZERO, REF_S4, TMP18);
+
+ vis_pmerge(ZERO, REF_S4_1, TMP20);
+
+ vis_faligndata(TMP4, TMP6, REF_S0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S6);
+ vis_faligndata(TMP4, TMP6, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S6);
+ vis_src1(TMP6, REF_S2);
+ }
+
+ vis_padd16(TMP18, CONST_1, TMP18);
+ vis_mul8x16au(REF_S6, CONST_256, TMP22);
+
+ vis_padd16(TMP20, CONST_1, TMP20);
+ vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+ vis_mul8x16au(REF_S0, CONST_256, TMP26);
+ vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+ vis_mul8x16au(REF_S2, CONST_256, TMP30);
+ vis_padd16(TMP18, TMP22, TMP18);
+
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+ vis_padd16(TMP20, TMP24, TMP20);
+
+ vis_padd16(TMP8, TMP18, TMP8);
+
+ vis_padd16(TMP10, TMP20, TMP10);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+ vis_padd16(TMP18, TMP26, TMP18);
+
+ vis_padd16(TMP20, TMP28, TMP20);
+
+ vis_padd16(TMP18, TMP30, TMP18);
+
+ vis_padd16(TMP20, TMP32, TMP20);
+ vis_pack16(TMP18, DST_2);
+
+ vis_pack16(TMP20, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/* Avg, no-rounding, diagonal (xy) half-pel interpolation, 16 wide.
+ * Sums the 2x2 source neighbourhood with a CONST_6 bias, blends in the
+ * existing dest bytes via CONST_1024 weighted multiplies, and packs with
+ * GSR scale 4.  off/off_plus_1 handle the horizontal alignment with the
+ * usual off == 0x7 vis_src1 fallback.  This is the longest kernel in the
+ * file (budget noted on the loop); the order is hand scheduling, with
+ * REF_0/REF_2/REF_4/REF_6 reused late in the loop as extra accumulators
+ * -- do not reorder. */
+static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+ int stride_16 = stride + 16;
+
+ vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[ 0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64(ref[ 8], TMP2);
+
+ vis_ld64(ref[16], TMP4);
+
+ vis_ld64(constants6[0], CONST_6);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ vis_ld64(constants256_1024[0], CONST_256);
+ vis_faligndata(TMP2, TMP4, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ vis_faligndata(TMP2, TMP4, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ vis_src1(TMP4, REF_S6);
+ }
+
+ height >>= 1;
+ do { /* 55 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP12);
+ vis_pmerge(ZERO, REF_S0_1, TMP14);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ vis_mul8x16au(REF_S2, CONST_256, TMP16);
+ vis_pmerge(ZERO, REF_S2_1, TMP18);
+
+ vis_ld64_2(ref, stride_16, TMP4);
+ ref += stride;
+ vis_mul8x16au(REF_S4, CONST_256, TMP20);
+ vis_pmerge(ZERO, REF_S4_1, TMP22);
+
+ vis_ld64_2(ref, stride, TMP6);
+ vis_mul8x16au(REF_S6, CONST_256, TMP24);
+ vis_pmerge(ZERO, REF_S6_1, TMP26);
+
+ vis_ld64_2(ref, stride_8, TMP8);
+ vis_faligndata(TMP0, TMP2, REF_0);
+
+ vis_ld64_2(ref, stride_16, TMP10);
+ ref += stride;
+ vis_faligndata(TMP2, TMP4, REF_4);
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP6, TMP8, REF_S0);
+
+ vis_ld64_2(dest, 8, DST_2);
+ vis_faligndata(TMP8, TMP10, REF_S4);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_2);
+ vis_faligndata(TMP2, TMP4, REF_6);
+ vis_faligndata(TMP6, TMP8, REF_S2);
+ vis_faligndata(TMP8, TMP10, REF_S6);
+ } else {
+ vis_src1(TMP2, REF_2);
+ vis_src1(TMP4, REF_6);
+ vis_src1(TMP8, REF_S2);
+ vis_src1(TMP10, REF_S6);
+ }
+
+ vis_mul8x16al(DST_0, CONST_1024, TMP30);
+ vis_pmerge(ZERO, REF_0, TMP0);
+
+ vis_mul8x16al(DST_1, CONST_1024, TMP32);
+ vis_pmerge(ZERO, REF_0_1, TMP2);
+
+ vis_mul8x16au(REF_2, CONST_256, TMP4);
+ vis_pmerge(ZERO, REF_2_1, TMP6);
+
+ vis_mul8x16al(DST_2, CONST_1024, REF_0);
+ vis_padd16(TMP0, CONST_6, TMP0);
+
+ vis_mul8x16al(DST_3, CONST_1024, REF_2);
+ vis_padd16(TMP2, CONST_6, TMP2);
+
+ vis_padd16(TMP0, TMP4, TMP0);
+ vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+ vis_padd16(TMP2, TMP6, TMP2);
+ vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+ vis_padd16(TMP12, TMP0, TMP12);
+ vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+ vis_padd16(TMP14, TMP2, TMP14);
+ vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+
+ vis_padd16(TMP12, TMP16, TMP12);
+ vis_mul8x16au(REF_S0, CONST_256, REF_4);
+
+ vis_padd16(TMP14, TMP18, TMP14);
+ vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+
+ vis_padd16(TMP12, TMP30, TMP12);
+
+ vis_padd16(TMP14, TMP32, TMP14);
+ vis_pack16(TMP12, DST_0);
+
+ vis_pack16(TMP14, DST_1);
+ vis_st64(DST_0, dest[0]);
+ vis_padd16(TMP4, CONST_6, TMP4);
+
+ vis_ld64_2(dest, stride, DST_0);
+ vis_padd16(TMP6, CONST_6, TMP6);
+ vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+ vis_padd16(TMP4, TMP8, TMP4);
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
+
+ vis_padd16(TMP6, TMP10, TMP6);
+
+ vis_padd16(TMP20, TMP4, TMP20);
+
+ vis_padd16(TMP22, TMP6, TMP22);
+
+ vis_padd16(TMP20, TMP24, TMP20);
+
+ vis_padd16(TMP22, TMP26, TMP22);
+
+ vis_padd16(TMP20, REF_0, TMP20);
+ vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+ vis_padd16(TMP22, REF_2, TMP22);
+ vis_pack16(TMP20, DST_2);
+
+ vis_pack16(TMP22, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+
+ vis_ld64_2(dest, 8, DST_2);
+ vis_mul8x16al(DST_0, CONST_1024, TMP30);
+ vis_pmerge(ZERO, REF_S4_1, REF_2);
+
+ vis_mul8x16al(DST_1, CONST_1024, TMP32);
+ vis_padd16(REF_4, TMP0, TMP8);
+
+ vis_mul8x16au(REF_S6, CONST_256, REF_4);
+ vis_padd16(REF_6, TMP2, TMP10);
+
+ vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+
+ vis_padd16(TMP8, TMP30, TMP8);
+
+ vis_padd16(TMP10, TMP32, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_st64(DST_0, dest[0]);
+
+ vis_padd16(REF_0, TMP4, REF_0);
+
+ vis_mul8x16al(DST_2, CONST_1024, TMP30);
+ vis_padd16(REF_2, TMP6, REF_2);
+
+ vis_mul8x16al(DST_3, CONST_1024, TMP32);
+ vis_padd16(REF_0, REF_4, REF_0);
+
+ vis_padd16(REF_2, REF_6, REF_2);
+
+ vis_padd16(REF_0, TMP30, REF_0);
+
+ /* stall */
+
+ vis_padd16(REF_2, TMP32, REF_2);
+ vis_pack16(REF_0, DST_2);
+
+ vis_pack16(REF_2, DST_3);
+ vis_st64_2(DST_2, dest, 8);
+ dest += stride;
+ } while (--height);
+}
+
+/* Avg, no-rounding, diagonal (xy) half-pel interpolation, 8 wide.
+ * 8-wide sibling of MC_avg_no_round_xy_16_vis: 2x2 neighbourhood sum
+ * with CONST_6 bias, blended with dest via CONST_1024 multiplies and
+ * packed with GSR scale 4.  Hand-scheduled order -- do not reorder. */
+static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+ const int stride, int height)
+{
+ uint8_t *ref = (uint8_t *) _ref;
+ unsigned long off = (unsigned long) ref & 0x7;
+ unsigned long off_plus_1 = off + 1;
+ int stride_8 = stride + 8;
+
+ vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+ ref = vis_alignaddr(ref);
+
+ vis_ld64(ref[0], TMP0);
+ vis_fzero(ZERO);
+
+ vis_ld64_2(ref, 8, TMP2);
+
+ vis_ld64(constants6[0], CONST_6);
+
+ vis_ld64(constants256_1024[0], CONST_256);
+ vis_faligndata(TMP0, TMP2, REF_S0);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S2);
+ }
+
+ height >>= 1;
+ do { /* 31 cycles */
+ vis_ld64_2(ref, stride, TMP0);
+ vis_mul8x16au(REF_S0, CONST_256, TMP8);
+ vis_pmerge(ZERO, REF_S0_1, TMP10);
+
+ vis_ld64_2(ref, stride_8, TMP2);
+ ref += stride;
+ vis_mul8x16au(REF_S2, CONST_256, TMP12);
+ vis_pmerge(ZERO, REF_S2_1, TMP14);
+
+ vis_alignaddr_g0((void *)off);
+
+ vis_ld64_2(ref, stride, TMP4);
+ vis_faligndata(TMP0, TMP2, REF_S4);
+
+ vis_ld64_2(ref, stride_8, TMP6);
+ ref += stride;
+
+ vis_ld64(dest[0], DST_0);
+ vis_faligndata(TMP4, TMP6, REF_S0);
+
+ vis_ld64_2(dest, stride, DST_2);
+
+ if (off != 0x7) {
+ vis_alignaddr_g0((void *)off_plus_1);
+ vis_faligndata(TMP0, TMP2, REF_S6);
+ vis_faligndata(TMP4, TMP6, REF_S2);
+ } else {
+ vis_src1(TMP2, REF_S6);
+ vis_src1(TMP6, REF_S2);
+ }
+
+ vis_mul8x16al(DST_0, CONST_1024, TMP30);
+ vis_pmerge(ZERO, REF_S4, TMP22);
+
+ vis_mul8x16al(DST_1, CONST_1024, TMP32);
+ vis_pmerge(ZERO, REF_S4_1, TMP24);
+
+ vis_mul8x16au(REF_S6, CONST_256, TMP26);
+ vis_pmerge(ZERO, REF_S6_1, TMP28);
+
+ vis_mul8x16au(REF_S0, CONST_256, REF_S4);
+ vis_padd16(TMP22, CONST_6, TMP22);
+
+ vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+ vis_padd16(TMP24, CONST_6, TMP24);
+
+ vis_mul8x16al(DST_2, CONST_1024, REF_0);
+ vis_padd16(TMP22, TMP26, TMP22);
+
+ vis_mul8x16al(DST_3, CONST_1024, REF_2);
+ vis_padd16(TMP24, TMP28, TMP24);
+
+ vis_mul8x16au(REF_S2, CONST_256, TMP26);
+ vis_padd16(TMP8, TMP22, TMP8);
+
+ vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+ vis_padd16(TMP10, TMP24, TMP10);
+
+ vis_padd16(TMP8, TMP12, TMP8);
+
+ vis_padd16(TMP10, TMP14, TMP10);
+
+ vis_padd16(TMP8, TMP30, TMP8);
+
+ vis_padd16(TMP10, TMP32, TMP10);
+ vis_pack16(TMP8, DST_0);
+
+ vis_pack16(TMP10, DST_1);
+ vis_st64(DST_0, dest[0]);
+ dest += stride;
+
+ vis_padd16(REF_S4, TMP22, TMP12);
+
+ vis_padd16(REF_S6, TMP24, TMP14);
+
+ vis_padd16(TMP12, TMP26, TMP12);
+
+ vis_padd16(TMP14, TMP28, TMP14);
+
+ vis_padd16(TMP12, REF_0, TMP12);
+
+ vis_padd16(TMP14, REF_2, TMP14);
+ vis_pack16(TMP12, DST_2);
+
+ vis_pack16(TMP14, DST_3);
+ vis_st64(DST_2, dest[0]);
+ dest += stride;
+ } while (--height);
+}
+
+/* End of no rounding code */
+
+void get_pixels_vis(uint8_t *restrict dest, const uint8_t *_ref, int stride)
+{
+ int i;
+ uint8_t *ref = (uint8_t*)_ref;
+ ref = vis_alignaddr(ref);
+
+ for (i = 0; i < 8; i++)
+ {
+ vis_ld64(ref[0], TMP0);
+ vis_st64(TMP0, dest[0]);
+ dest += 8;
+ ref += stride;
+ }
+}
+
+/* Shared state for the SIGILL-based VIS capability probe below: the
+ * probe sets canjump before executing a candidate opcode, and the
+ * handler longjmps back through jmpbuf when the opcode traps.
+ * volatile sig_atomic_t keeps the flag safe across the signal boundary. */
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+/* SIGILL handler for the VIS probe.  When a probe is in flight
+ * (canjump set by vis_level), clear the flag and jump back to the
+ * corresponding sigsetjmp; otherwise restore the default disposition
+ * and re-raise so an unexpected SIGILL terminates the process. */
+static void sigill_handler (int sig)
+{
+ if (canjump == 0) {
+ signal (sig, SIG_DFL);
+ raise (sig);
+ }
+
+ canjump = 0;
+ siglongjmp (jmpbuf, 1);
+}
+
+#define ACCEL_SPARC_VIS 1
+#define ACCEL_SPARC_VIS2 2
+
+/* Runtime CPU capability probe: execute one hard-coded VIS1 opcode and
+ * one VIS2 opcode, catching SIGILL (via sigill_handler + jmpbuf) when
+ * the CPU lacks the instruction.  Returns a bitmask of
+ * ACCEL_SPARC_VIS and ACCEL_SPARC_VIS2 (empty mask if neither works).
+ * Fix: "()" declared an unprototyped function in C; "(void)" is the
+ * correct zero-argument prototype. */
+static int vis_level (void)
+{
+ int accel = 0;
+
+ signal (SIGILL, sigill_handler);
+ if (sigsetjmp (jmpbuf, 1)) {
+ /* VIS1 opcode trapped: restore handler, report what we have. */
+ signal (SIGILL, SIG_DFL);
+ return accel;
+ }
+
+ canjump = 1;
+
+ /* pdist %f0, %f0, %f0 */
+ __asm__ __volatile__(".word\t0x81b007c0");
+
+ canjump = 0;
+ accel |= ACCEL_SPARC_VIS;
+
+ if (sigsetjmp (jmpbuf, 1)) {
+ /* VIS2 opcode trapped: VIS1 already confirmed above. */
+ signal (SIGILL, SIG_DFL);
+ return accel;
+ }
+
+ canjump = 1;
+
+ /* edge8n %g0, %g0, %g0 */
+ __asm__ __volatile__(".word\t0x81b00020");
+
+ canjump = 0;
+ accel |= ACCEL_SPARC_VIS2;
+
+ signal (SIGILL, SIG_DFL);
+
+ return accel;
+}
+
+/* libavcodec initialization code */
+/* libavcodec initialization: hook the VIS-accelerated pixel and
+ * motion-compensation routines into the DSPContext when the running
+ * CPU supports VIS (probed at runtime via vis_level). */
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
+{
+ int accel = vis_level ();
+
+ if (!(accel & ACCEL_SPARC_VIS))
+ return;
+
+ c->get_pixels = get_pixels_vis;
+
+ /* rounding put/avg, 16- and 8-pixel-wide variants */
+ c->put_pixels_tab[0][0] = MC_put_o_16_vis;
+ c->put_pixels_tab[0][1] = MC_put_x_16_vis;
+ c->put_pixels_tab[0][2] = MC_put_y_16_vis;
+ c->put_pixels_tab[0][3] = MC_put_xy_16_vis;
+
+ c->put_pixels_tab[1][0] = MC_put_o_8_vis;
+ c->put_pixels_tab[1][1] = MC_put_x_8_vis;
+ c->put_pixels_tab[1][2] = MC_put_y_8_vis;
+ c->put_pixels_tab[1][3] = MC_put_xy_8_vis;
+
+ c->avg_pixels_tab[0][0] = MC_avg_o_16_vis;
+ c->avg_pixels_tab[0][1] = MC_avg_x_16_vis;
+ c->avg_pixels_tab[0][2] = MC_avg_y_16_vis;
+ c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis;
+
+ c->avg_pixels_tab[1][0] = MC_avg_o_8_vis;
+ c->avg_pixels_tab[1][1] = MC_avg_x_8_vis;
+ c->avg_pixels_tab[1][2] = MC_avg_y_8_vis;
+ c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis;
+
+ /* no-rounding put/avg variants */
+ c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis;
+ c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis;
+ c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis;
+ c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis;
+
+ c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis;
+ c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis;
+ c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis;
+ c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis;
+
+ c->avg_no_rnd_pixels_tab[0][0] = MC_avg_no_round_o_16_vis;
+ c->avg_no_rnd_pixels_tab[0][1] = MC_avg_no_round_x_16_vis;
+ c->avg_no_rnd_pixels_tab[0][2] = MC_avg_no_round_y_16_vis;
+ c->avg_no_rnd_pixels_tab[0][3] = MC_avg_no_round_xy_16_vis;
+
+ c->avg_no_rnd_pixels_tab[1][0] = MC_avg_no_round_o_8_vis;
+ c->avg_no_rnd_pixels_tab[1][1] = MC_avg_no_round_x_8_vis;
+ c->avg_no_rnd_pixels_tab[1][2] = MC_avg_no_round_y_8_vis;
+ c->avg_no_rnd_pixels_tab[1][3] = MC_avg_no_round_xy_8_vis;
+}
+
+#endif /* !(ARCH_SPARC) */
diff --git a/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c b/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c
new file mode 100644
index 000000000..a09ee4e28
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c
@@ -0,0 +1,2 @@
+
+/* Dummy symbol so this otherwise-empty translation unit is not empty.
+ * NOTE(review): the name says "mlib" but this lives in the sparc dummy
+ * file -- presumably copy-pasted from the mlib dummy; harmless unless
+ * both dummies are ever linked together, so confirm. */
+char libavcodec_mlib_dummy;
diff --git a/src/libffmpeg/libavcodec/sparc/vis.h b/src/libffmpeg/libavcodec/sparc/vis.h
new file mode 100644
index 000000000..07dda2949
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/vis.h
@@ -0,0 +1,328 @@
+/*
+ * vis.h
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* You may be asking why I hard-code the instruction opcodes and don't
+ * use the normal VIS assembler mnemonics for the VIS instructions.
+ *
+ * The reason is that Sun, in their infinite wisdom, decided that a binary
+ * using a VIS instruction will cause it to be marked (in the ELF headers)
+ * as doing so, and this prevents the OS from loading such binaries if the
+ * current cpu doesn't have VIS. There is no way to easily override this
+ * behavior of the assembler that I am aware of.
+ *
+ * This totally defeats what libmpeg2 is trying to do which is allow a
+ * single binary to be created, and then detect the availability of VIS
+ * at runtime.
+ *
+ * I'm not saying that tainting the binary by default is bad, rather I'm
+ * saying that not providing a way to override this easily unnecessarily
+ * ties people's hands.
+ *
+ * Thus, we do the opcode encoding by hand and output 32-bit words in
+ * the assembler to keep the binary from becoming tainted.
+ */
+
+#define vis_opc_base ((0x1 << 31) | (0x36 << 19))
+#define vis_opf(X) ((X) << 5)
+#define vis_sreg(X) (X)
+#define vis_dreg(X) (((X)&0x1f)|((X)>>5))
+#define vis_rs1_s(X) (vis_sreg(X) << 14)
+#define vis_rs1_d(X) (vis_dreg(X) << 14)
+#define vis_rs2_s(X) (vis_sreg(X) << 0)
+#define vis_rs2_d(X) (vis_dreg(X) << 0)
+#define vis_rd_s(X) (vis_sreg(X) << 25)
+#define vis_rd_d(X) (vis_dreg(X) << 25)
+
+#define vis_ss2s(opf,rs1,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs1_s(rs1) | \
+ vis_rs2_s(rs2) | \
+ vis_rd_s(rd)))
+
+#define vis_dd2d(opf,rs1,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs1_d(rs1) | \
+ vis_rs2_d(rs2) | \
+ vis_rd_d(rd)))
+
+#define vis_ss2d(opf,rs1,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs1_s(rs1) | \
+ vis_rs2_s(rs2) | \
+ vis_rd_d(rd)))
+
+#define vis_sd2d(opf,rs1,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs1_s(rs1) | \
+ vis_rs2_d(rs2) | \
+ vis_rd_d(rd)))
+
+#define vis_d2s(opf,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs2_d(rs2) | \
+ vis_rd_s(rd)))
+
+#define vis_s2d(opf,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs2_s(rs2) | \
+ vis_rd_d(rd)))
+
+#define vis_d12d(opf,rs1,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs1_d(rs1) | \
+ vis_rd_d(rd)))
+
+#define vis_d22d(opf,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs2_d(rs2) | \
+ vis_rd_d(rd)))
+
+#define vis_s12s(opf,rs1,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs1_s(rs1) | \
+ vis_rd_s(rd)))
+
+#define vis_s22s(opf,rs2,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rs2_s(rs2) | \
+ vis_rd_s(rd)))
+
+#define vis_s(opf,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rd_s(rd)))
+
+#define vis_d(opf,rd) \
+ __asm__ __volatile__ (".word %0" \
+ : : "i" (vis_opc_base | vis_opf(opf) | \
+ vis_rd_d(rd)))
+
+#define vis_r2m(op,rd,mem) \
+ __asm__ __volatile__ (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) )
+
+#define vis_r2m_2(op,rd,mem1,mem2) \
+ __asm__ __volatile__ (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) )
+
+#define vis_m2r(op,mem,rd) \
+ __asm__ __volatile__ (#op "\t[%0], %%f" #rd : : "r" (&(mem)) )
+
+#define vis_m2r_2(op,mem1,mem2,rd) \
+ __asm__ __volatile__ (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) )
+
+static inline void vis_set_gsr(unsigned int _val)
+{
+ register unsigned int val asm("g1");
+
+ val = _val;
+ __asm__ __volatile__(".word 0xa7804000"
+ : : "r" (val));
+}
+
+#define VIS_GSR_ALIGNADDR_MASK 0x0000007
+#define VIS_GSR_ALIGNADDR_SHIFT 0
+#define VIS_GSR_SCALEFACT_MASK 0x0000078
+#define VIS_GSR_SCALEFACT_SHIFT 3
+
+#define vis_ld32(mem,rs1) vis_m2r(ld, mem, rs1)
+#define vis_ld32_2(mem1,mem2,rs1) vis_m2r_2(ld, mem1, mem2, rs1)
+#define vis_st32(rs1,mem) vis_r2m(st, rs1, mem)
+#define vis_st32_2(rs1,mem1,mem2) vis_r2m_2(st, rs1, mem1, mem2)
+#define vis_ld64(mem,rs1) vis_m2r(ldd, mem, rs1)
+#define vis_ld64_2(mem1,mem2,rs1) vis_m2r_2(ldd, mem1, mem2, rs1)
+#define vis_st64(rs1,mem) vis_r2m(std, rs1, mem)
+#define vis_st64_2(rs1,mem1,mem2) vis_r2m_2(std, rs1, mem1, mem2)
+
+#define vis_ldblk(mem, rd) \
+do { register void *__mem asm("g1"); \
+ __mem = &(mem); \
+ __asm__ __volatile__(".word 0xc1985e00 | %1" \
+ : \
+ : "r" (__mem), \
+ "i" (vis_rd_d(rd)) \
+ : "memory"); \
+} while (0)
+
+#define vis_stblk(rd, mem) \
+do { register void *__mem asm("g1"); \
+ __mem = &(mem); \
+ __asm__ __volatile__(".word 0xc1b85e00 | %1" \
+ : \
+ : "r" (__mem), \
+ "i" (vis_rd_d(rd)) \
+ : "memory"); \
+} while (0)
+
+#define vis_membar_storestore() \
+ __asm__ __volatile__(".word 0x8143e008" : : : "memory")
+
+#define vis_membar_sync() \
+ __asm__ __volatile__(".word 0x8143e040" : : : "memory")
+
+/* 16 and 32 bit partitioned addition and subtraction. The normal
+ * versions perform 4 16-bit or 2 32-bit additions or subtractions.
+ * The 's' versions perform 2 16-bit or 1 32-bit additions or
+ * subtractions.
+ */
+
+#define vis_padd16(rs1,rs2,rd) vis_dd2d(0x50, rs1, rs2, rd)
+#define vis_padd16s(rs1,rs2,rd) vis_ss2s(0x51, rs1, rs2, rd)
+#define vis_padd32(rs1,rs2,rd) vis_dd2d(0x52, rs1, rs2, rd)
+#define vis_padd32s(rs1,rs2,rd) vis_ss2s(0x53, rs1, rs2, rd)
+#define vis_psub16(rs1,rs2,rd) vis_dd2d(0x54, rs1, rs2, rd)
+#define vis_psub16s(rs1,rs2,rd) vis_ss2s(0x55, rs1, rs2, rd)
+#define vis_psub32(rs1,rs2,rd) vis_dd2d(0x56, rs1, rs2, rd)
+#define vis_psub32s(rs1,rs2,rd) vis_ss2s(0x57, rs1, rs2, rd)
+
+/* Pixel formatting instructions. */
+
+#define vis_pack16(rs2,rd) vis_d2s( 0x3b, rs2, rd)
+#define vis_pack32(rs1,rs2,rd) vis_dd2d(0x3a, rs1, rs2, rd)
+#define vis_packfix(rs2,rd) vis_d2s( 0x3d, rs2, rd)
+#define vis_expand(rs2,rd) vis_s2d( 0x4d, rs2, rd)
+#define vis_pmerge(rs1,rs2,rd) vis_ss2d(0x4b, rs1, rs2, rd)
+
+/* Partitioned multiply instructions. */
+
+#define vis_mul8x16(rs1,rs2,rd) vis_sd2d(0x31, rs1, rs2, rd)
+#define vis_mul8x16au(rs1,rs2,rd) vis_ss2d(0x33, rs1, rs2, rd)
+#define vis_mul8x16al(rs1,rs2,rd) vis_ss2d(0x35, rs1, rs2, rd)
+#define vis_mul8sux16(rs1,rs2,rd) vis_dd2d(0x36, rs1, rs2, rd)
+#define vis_mul8ulx16(rs1,rs2,rd) vis_dd2d(0x37, rs1, rs2, rd)
+#define vis_muld8sux16(rs1,rs2,rd) vis_ss2d(0x38, rs1, rs2, rd)
+#define vis_muld8ulx16(rs1,rs2,rd) vis_ss2d(0x39, rs1, rs2, rd)
+
+/* Alignment instructions. */
+
+static inline void *vis_alignaddr(void *_ptr)
+{
+ register void *ptr asm("g1");
+
+ ptr = _ptr;
+
+ __asm__ __volatile__(".word %2"
+ : "=&r" (ptr)
+ : "0" (ptr),
+ "i" (vis_opc_base | vis_opf(0x18) |
+ vis_rs1_s(1) |
+ vis_rs2_s(0) |
+ vis_rd_s(1)));
+
+ return ptr;
+}
+
+static inline void vis_alignaddr_g0(void *_ptr)
+{
+ register void *ptr asm("g1");
+
+ ptr = _ptr;
+
+ __asm__ __volatile__(".word %2"
+ : "=&r" (ptr)
+ : "0" (ptr),
+ "i" (vis_opc_base | vis_opf(0x18) |
+ vis_rs1_s(1) |
+ vis_rs2_s(0) |
+ vis_rd_s(0)));
+}
+
+static inline void *vis_alignaddrl(void *_ptr)
+{
+ register void *ptr asm("g1");
+
+ ptr = _ptr;
+
+ __asm__ __volatile__(".word %2"
+ : "=&r" (ptr)
+ : "0" (ptr),
+ "i" (vis_opc_base | vis_opf(0x19) |
+ vis_rs1_s(1) |
+ vis_rs2_s(0) |
+ vis_rd_s(1)));
+
+ return ptr;
+}
+
+static inline void vis_alignaddrl_g0(void *_ptr)
+{
+ register void *ptr asm("g1");
+
+ ptr = _ptr;
+
+ __asm__ __volatile__(".word %2"
+ : "=&r" (ptr)
+ : "0" (ptr),
+ "i" (vis_opc_base | vis_opf(0x19) |
+ vis_rs1_s(1) |
+ vis_rs2_s(0) |
+ vis_rd_s(0)));
+}
+
+#define vis_faligndata(rs1,rs2,rd) vis_dd2d(0x48, rs1, rs2, rd)
+
+/* Logical operate instructions. */
+
+#define vis_fzero(rd) vis_d( 0x60, rd)
+#define vis_fzeros(rd) vis_s( 0x61, rd)
+#define vis_fone(rd) vis_d( 0x7e, rd)
+#define vis_fones(rd) vis_s( 0x7f, rd)
+#define vis_src1(rs1,rd) vis_d12d(0x74, rs1, rd)
+#define vis_src1s(rs1,rd) vis_s12s(0x75, rs1, rd)
+#define vis_src2(rs2,rd) vis_d22d(0x78, rs2, rd)
+#define vis_src2s(rs2,rd) vis_s22s(0x79, rs2, rd)
+#define vis_not1(rs1,rd) vis_d12d(0x6a, rs1, rd)
+#define vis_not1s(rs1,rd) vis_s12s(0x6b, rs1, rd)
+#define vis_not2(rs2,rd) vis_d22d(0x66, rs2, rd)
+#define vis_not2s(rs2,rd) vis_s22s(0x67, rs2, rd)
+#define vis_or(rs1,rs2,rd) vis_dd2d(0x7c, rs1, rs2, rd)
+#define vis_ors(rs1,rs2,rd) vis_ss2s(0x7d, rs1, rs2, rd)
+#define vis_nor(rs1,rs2,rd) vis_dd2d(0x62, rs1, rs2, rd)
+#define vis_nors(rs1,rs2,rd) vis_ss2s(0x63, rs1, rs2, rd)
+#define vis_and(rs1,rs2,rd) vis_dd2d(0x70, rs1, rs2, rd)
+#define vis_ands(rs1,rs2,rd) vis_ss2s(0x71, rs1, rs2, rd)
+#define vis_nand(rs1,rs2,rd) vis_dd2d(0x6e, rs1, rs2, rd)
+#define vis_nands(rs1,rs2,rd) vis_ss2s(0x6f, rs1, rs2, rd)
+#define vis_xor(rs1,rs2,rd) vis_dd2d(0x6c, rs1, rs2, rd)
+#define vis_xors(rs1,rs2,rd) vis_ss2s(0x6d, rs1, rs2, rd)
+#define vis_xnor(rs1,rs2,rd) vis_dd2d(0x72, rs1, rs2, rd)
+#define vis_xnors(rs1,rs2,rd) vis_ss2s(0x73, rs1, rs2, rd)
+#define vis_ornot1(rs1,rs2,rd) vis_dd2d(0x7a, rs1, rs2, rd)
+#define vis_ornot1s(rs1,rs2,rd) vis_ss2s(0x7b, rs1, rs2, rd)
+#define vis_ornot2(rs1,rs2,rd) vis_dd2d(0x76, rs1, rs2, rd)
+#define vis_ornot2s(rs1,rs2,rd) vis_ss2s(0x77, rs1, rs2, rd)
+#define vis_andnot1(rs1,rs2,rd) vis_dd2d(0x68, rs1, rs2, rd)
+#define vis_andnot1s(rs1,rs2,rd) vis_ss2s(0x69, rs1, rs2, rd)
+#define vis_andnot2(rs1,rs2,rd) vis_dd2d(0x64, rs1, rs2, rd)
+#define vis_andnot2s(rs1,rs2,rd) vis_ss2s(0x65, rs1, rs2, rd)
+
+/* Pixel component distance. */
+
+#define vis_pdist(rs1,rs2,rd) vis_dd2d(0x3e, rs1, rs2, rd)
diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c
index 6a15270b7..781194f03 100644
--- a/src/libffmpeg/libavcodec/svq1.c
+++ b/src/libffmpeg/libavcodec/svq1.c
@@ -783,6 +783,8 @@ static int svq1_decode_init(AVCodecContext *avctx)
MpegEncContext *s = avctx->priv_data;
int i;
+ MPV_decode_defaults(s);
+
s->avctx = avctx;
s->width = (avctx->width+3)&~3;
s->height = (avctx->height+3)&~3;
diff --git a/src/libffmpeg/libavcodec/truemotion1.c b/src/libffmpeg/libavcodec/truemotion1.c
index 35bf3a788..2f6310192 100644
--- a/src/libffmpeg/libavcodec/truemotion1.c
+++ b/src/libffmpeg/libavcodec/truemotion1.c
@@ -36,9 +36,6 @@
#include "avcodec.h"
#include "dsputil.h"
-#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log
-#define fprintf(...) {}
-
#include "truemotion1data.h"
typedef struct TrueMotion1Context {
@@ -232,7 +229,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
header.header_size = ((s->buf[0] >> 5) | (s->buf[0] << 3)) & 0x7f;
if (s->buf[0] < 0x10)
{
- printf("invalid header size\n");
+ av_log(s->avctx, AV_LOG_ERROR, "invalid header size\n");
return -1;
}
@@ -282,7 +279,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
}
if (header.compression > 17) {
- printf("invalid compression type (%d)\n", header.compression);
+ av_log(s->avctx, AV_LOG_ERROR, "invalid compression type (%d)\n", header.compression);
return -1;
}
@@ -296,7 +293,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
if (header.vectable < 4)
sel_vector_table = tables[header.vectable - 1];
else {
- printf("invalid vector table id (%d)\n", header.vectable);
+ av_log(s->avctx, AV_LOG_ERROR, "invalid vector table id (%d)\n", header.vectable);
return -1;
}
}
@@ -305,7 +302,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
{
if (compression_types[header.compression].algorithm == ALGO_RGB24H)
{
- printf("24bit compression not yet supported\n");
+ av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n");
}
else
gen_vector_table(s, sel_vector_table);
@@ -354,7 +351,7 @@ static int truemotion1_decode_init(AVCodecContext *avctx)
#define GET_NEXT_INDEX() \
{\
if (index_stream_index >= s->index_stream_size) { \
- printf (" help! truemotion1 decoder went out of bounds\n"); \
+ av_log(s->avctx, AV_LOG_INFO, " help! truemotion1 decoder went out of bounds\n"); \
return; \
} \
index = s->index_stream[index_stream_index++] * 4; \
@@ -542,7 +539,7 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
s->frame.reference = 1;
if (avctx->get_buffer(avctx, &s->frame) < 0) {
- fprintf(stderr, "truemotion1: get_buffer() failed\n");
+ av_log(s->avctx, AV_LOG_ERROR, "truemotion1: get_buffer() failed\n");
return -1;
}
@@ -561,7 +558,7 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
memcpy(s->frame.data[0], s->prev_frame.data[0],
s->frame.linesize[0] * s->avctx->height);
} else if (compression_types[s->compression].algorithm == ALGO_RGB24H) {
- printf (" 24-bit Duck TrueMotion decoding not yet implemented\n");
+ av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n");
} else {
truemotion1_decode_16bit(s);
}
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index 145f9df65..ffa0cb855 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -60,47 +60,40 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size)
if(min_size < *size)
return ptr;
- *size= min_size + 10*1024;
+ *size= 17*min_size/16 + 32;
return av_realloc(ptr, *size);
}
-/* allocation of static arrays - do not use for normal allocation */
static unsigned int last_static = 0;
-static char*** array_static = NULL;
-static const unsigned int grow_static = 64; // ^2
-void *__av_mallocz_static(void** location, unsigned int size)
+static unsigned int allocated_static = 0;
+static void** array_static = NULL;
+
+/**
+ * allocation of static arrays - do not use for normal allocation.
+ */
+void *av_mallocz_static(unsigned int size)
{
- unsigned int l = (last_static + grow_static) & ~(grow_static - 1);
void *ptr = av_mallocz(size);
- if (!ptr)
- return NULL;
-
- if (location)
- {
- if (l > last_static)
- array_static = av_realloc(array_static, l);
- array_static[last_static++] = (char**) location;
- *location = ptr;
+
+ if(ptr){
+ array_static =av_fast_realloc(array_static, &allocated_static, sizeof(void*)*(last_static+1));
+ array_static[last_static++] = ptr;
}
+
return ptr;
}
-/* free all static arrays and reset pointers to 0 */
+
+/**
+ * free all static arrays and reset pointers to 0.
+ */
void av_free_static(void)
{
- if (array_static)
- {
- unsigned i;
- for (i = 0; i < last_static; i++)
- {
- av_free(*array_static[i]);
- *array_static[i] = NULL;
- }
- av_free(array_static);
- array_static = 0;
+ while(last_static){
+ av_freep(&array_static[--last_static]);
}
- last_static = 0;
+ av_freep(&array_static);
}
/**
diff --git a/src/libffmpeg/libavcodec/vmdav.c b/src/libffmpeg/libavcodec/vmdav.c
index 47c77513d..c09af1369 100644
--- a/src/libffmpeg/libavcodec/vmdav.c
+++ b/src/libffmpeg/libavcodec/vmdav.c
@@ -47,9 +47,6 @@
#include "avcodec.h"
#include "dsputil.h"
-#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log
-#define fprintf(...) {}
-
#define VMD_HEADER_SIZE 0x330
#define PALETTE_COUNT 256
@@ -245,7 +242,7 @@ static void vmd_decode(VmdVideoContext *s)
}
} while (ofs < frame_width);
if (ofs > frame_width) {
- printf (" VMD video: offset > width (%d > %d)\n",
+ av_log(s->avctx, AV_LOG_ERROR, "VMD video: offset > width (%d > %d)\n",
ofs, frame_width);
break;
}
@@ -283,7 +280,7 @@ static void vmd_decode(VmdVideoContext *s)
}
} while (ofs < frame_width);
if (ofs > frame_width) {
- printf (" VMD video: offset > width (%d > %d)\n",
+ av_log(s->avctx, AV_LOG_ERROR, "VMD video: offset > width (%d > %d)\n",
ofs, frame_width);
}
dp += s->frame.linesize[0];
@@ -311,7 +308,7 @@ static int vmdvideo_decode_init(AVCodecContext *avctx)
/* make sure the VMD header made it */
if (s->avctx->extradata_size != VMD_HEADER_SIZE) {
- printf(" VMD video: expected extradata size of %d\n",
+ av_log(s->avctx, AV_LOG_ERROR, "VMD video: expected extradata size of %d\n",
VMD_HEADER_SIZE);
return -1;
}
@@ -350,7 +347,7 @@ static int vmdvideo_decode_frame(AVCodecContext *avctx,
s->frame.reference = 1;
if (avctx->get_buffer(avctx, &s->frame)) {
- printf (" VMD Video: get_buffer() failed\n");
+ av_log(s->avctx, AV_LOG_ERROR, "VMD Video: get_buffer() failed\n");
return -1;
}
@@ -389,6 +386,7 @@ static int vmdvideo_decode_end(AVCodecContext *avctx)
*/
typedef struct VmdAudioContext {
+ AVCodecContext *avctx;
int channels;
int bits;
int block_align;
@@ -403,12 +401,13 @@ static int vmdaudio_decode_init(AVCodecContext *avctx)
VmdAudioContext *s = (VmdAudioContext *)avctx->priv_data;
int i;
+ s->avctx = avctx;
s->channels = avctx->channels;
s->bits = avctx->bits_per_sample;
s->block_align = avctx->block_align;
-printf (" %d channels, %d bits/sample, block align = %d, sample rate = %d\n",
- s->channels, s->bits, s->block_align, avctx->sample_rate);
+ av_log(s->avctx, AV_LOG_DEBUG, "%d channels, %d bits/sample, block align = %d, sample rate = %d\n",
+ s->channels, s->bits, s->block_align, avctx->sample_rate);
/* set up the steps8 and steps16 tables */
for (i = 0; i < 8; i++) {
@@ -465,8 +464,8 @@ static int vmdaudio_loadsound(VmdAudioContext *s, unsigned char *data,
int bytes_decoded = 0;
int i;
-if (silence)
- printf (" silent block!\n");
+ if (silence)
+ av_log(s->avctx, AV_LOG_INFO, "silent block!\n");
if (s->channels == 2) {
/* stereo handling */
@@ -520,7 +519,6 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx,
unsigned char *p = buf + 16;
unsigned char *p_end = buf + buf_size;
-printf (" processing audio frame with %d bytes\n", buf_size);
if (buf_size < 16)
return buf_size;
@@ -529,7 +527,6 @@ printf (" processing audio frame with %d bytes\n", buf_size);
/* the chunk contains audio */
*data_size = vmdaudio_loadsound(s, output_samples, p, 0);
} else if (buf[6] == 2) {
-printf (" hey! audio case #2\n");
/* the chunk contains audio and silence mixed together */
sound_flags = LE_32(p);
p += 4;
@@ -549,13 +546,10 @@ printf (" hey! audio case #2\n");
sound_flags >>= 1;
}
} else if (buf[6] == 3) {
-printf (" hey! audio case #3\n");
/* silent chunk */
*data_size = vmdaudio_loadsound(s, output_samples, p, 1);
}
-printf (" final sample count = %d, byte count = %d\n", (*data_size) / 2,
- *data_size);
return buf_size;
}
diff --git a/src/libffmpeg/libavcodec/vp3.c b/src/libffmpeg/libavcodec/vp3.c
index eadfd39b9..0667d99eb 100644
--- a/src/libffmpeg/libavcodec/vp3.c
+++ b/src/libffmpeg/libavcodec/vp3.c
@@ -268,9 +268,11 @@ typedef struct Vp3DecodeContext {
VLC ac_vlc_3[16];
VLC ac_vlc_4[16];
- int16_t intra_y_dequant[64];
- int16_t intra_c_dequant[64];
- int16_t inter_dequant[64];
+ /* these arrays need to be on 16-byte boundaries since SSE2 operations
+ * index into them */
+ int16_t __align16 intra_y_dequant[64];
+ int16_t __align16 intra_c_dequant[64];
+ int16_t __align16 inter_dequant[64];
/* This table contains superblock_count * 16 entries. Each set of 16
* numbers corresponds to the fragment indices 0..15 of the superblock.
diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c
index 25498c4d2..cf2db1494 100644
--- a/src/libffmpeg/libavcodec/wmadec.c
+++ b/src/libffmpeg/libavcodec/wmadec.c
@@ -20,6 +20,15 @@
/**
* @file wmadec.c
* WMA compatible decoder.
+ * This decoder handles Microsoft Windows Media Audio data, versions 1 & 2.
+ * WMA v1 is identified by audio format 0x160 in Microsoft media files
+ * (ASF/AVI/WAV). WMA v2 is identified by audio format 0x161.
+ *
+ * To use this decoder, a calling application must supply the extra data
+ * bytes provided with the WMA data. These are the extra, codec-specific
+ * bytes at the end of a WAVEFORMATEX data structure. Transmit these bytes
+ * to the decoder using the extradata[_size] fields in AVCodecContext. There
+ * should be 4 extra bytes for v1 data and 6 extra bytes for v2 data.
*/
#include "avcodec.h"
diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c
index 130a7f89d..376f0706e 100644
--- a/src/libffmpeg/libavcodec/wmv2.c
+++ b/src/libffmpeg/libavcodec/wmv2.c
@@ -181,7 +181,7 @@ int ff_wmv2_encode_picture_header(MpegEncContext * s, int picture_number)
put_bits(&s->pb, 1, s->dc_table_index);
put_bits(&s->pb, 1, s->mv_table_index);
- s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
+ s->inter_intra_pred= 0;//(s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
}
s->esc3_level_length= 0;
s->esc3_run_length= 0;
@@ -216,7 +216,7 @@ void ff_wmv2_encode_mb(MpegEncContext * s,
wmv2_inter_table[w->cbp_table_index][cbp + 64][0]);
/* motion vector */
- h263_pred_motion(s, 0, &pred_x, &pred_y);
+ h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
msmpeg4_encode_motion(s, motion_x - pred_x,
motion_y - pred_y);
} else {
@@ -443,7 +443,7 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext * s)
s->dc_table_index = get_bits1(&s->gb);
s->mv_table_index = get_bits1(&s->gb);
- s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
+ s->inter_intra_pred= 0;//(s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
s->no_rounding ^= 1;
if(s->avctx->debug&FF_DEBUG_PICT_INFO){
@@ -504,7 +504,7 @@ static int16_t *wmv2_pred_motion(Wmv2Context *w, int *px, int *py){
int xy, wrap, diff, type;
int16_t *A, *B, *C, *mot_val;
- wrap = s->block_wrap[0];
+ wrap = s->b8_stride;
xy = s->block_index[0];
mot_val = s->current_picture.motion_val[0][xy];