ffmpeg sync

CVS patchset: 6437 CVS date: 2004/04/25 18:57:04
author: Miguel Freitas <miguelfreitas@users.sourceforge.net> 2004-04-25 18:57:04 +0000
committer: Miguel Freitas <miguelfreitas@users.sourceforge.net> 2004-04-25 18:57:04 +0000
commit: a2a44876712f079610f0396fb9a682ea47e05b6e (patch)
tree: 1f3e328dfe6a5f9fa7c79e7a23bf6310be2827fd
parent: acb7dc0f256afc24e875a168da989ef25d86b7b7 (diff)
download: xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.gz
xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.bz2
51 files changed, 10694 insertions, 2072 deletions
diff --git a/CREDITS b/CREDITS
index 0893a09d6..3bcd3b3e0 100644
--- a/CREDITS
+++ b/CREDITS
@@ -12,7 +12,7 @@ updates (the word 'maintainer' is intentionally avoided here).
 project				version			mediator
 -----------------------------------------------------------------------
 
-ffmpeg				build 4707		Mike Melanson
+ffmpeg				build 4710		Mike Melanson
 goom				1.9dev5
 gsm610				1.0.10			Mike Melanson
 liba52				0.7.4
diff --git a/configure.ac b/configure.ac
index 6ae201efb..10f01a955 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1962,6 +1962,7 @@ src/libffmpeg/libavcodec/i386/Makefile
 src/libffmpeg/libavcodec/mlib/Makefile
 src/libffmpeg/libavcodec/alpha/Makefile
 src/libffmpeg/libavcodec/ppc/Makefile
+src/libffmpeg/libavcodec/sparc/Makefile
 src/libffmpeg/libavcodec/libpostproc/Makefile
 src/libflac/Makefile
 src/liblpcm/Makefile
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index 8e5d53df3..7bbcbd281 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -1,6 +1,6 @@
 include $(top_srcdir)/misc/Makefile.common
 
-SUBDIRS = armv4l i386 mlib alpha ppc libpostproc
+SUBDIRS = armv4l i386 mlib alpha ppc sparc libpostproc
 
 ## some files here are #included by others... go figure.
 EXTRA_DIST = fdctref.c motion_est_template.c svq3.c wmv2.c \
@@ -80,6 +80,7 @@ libavcodec_la_LDFLAGS = \
 	$(top_builddir)/src/libffmpeg/libavcodec/i386/libavcodec_mmx.la \
 	$(top_builddir)/src/libffmpeg/libavcodec/mlib/libavcodec_mlib.la \
 	$(top_builddir)/src/libffmpeg/libavcodec/ppc/libavcodec_ppc.la \
+	$(top_builddir)/src/libffmpeg/libavcodec/sparc/libavcodec_sparc.la \
 	-avoid-version -module
 
 
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 510bd41d2..731bcd375 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -24,7 +24,7 @@ extern "C" {
 
 #define FFMPEG_VERSION_INT     0x000408
 #define FFMPEG_VERSION         "0.4.8"
-#define LIBAVCODEC_BUILD       4707
+#define LIBAVCODEC_BUILD       4710
 
 #define LIBAVCODEC_VERSION_INT FFMPEG_VERSION_INT
 #define LIBAVCODEC_VERSION     FFMPEG_VERSION
@@ -449,7 +449,7 @@ typedef struct AVPanScan{
 \
     /**\
      * Motion vector table\
-     * - encoding: unused\
+     * - encoding: set by user\
      * - decoding: set by lavc\
      */\
     int16_t (*motion_val[2])[2];\
@@ -457,7 +457,7 @@ typedef struct AVPanScan{
     /**\
      * Macroblock type table\
      * mb_type_base + mb_width + 2\
-     * - encoding: unused\
+     * - encoding: set by user\
      * - decoding: set by lavc\
      */\
     uint32_t *mb_type;\
@@ -545,13 +545,20 @@ typedef struct AVPanScan{
      * - decoding: set by lavc\
      */\
     short *dct_coeff;\
+\
+    /**\
+     * Motion reference frame index\
+     * - encoding: set by user\
+     * - decoding: set by lavc\
+     */\
+    int8_t *ref_index[2];
 
 #define FF_QSCALE_TYPE_MPEG1	0
 #define FF_QSCALE_TYPE_MPEG2	1
 
 #define FF_BUFFER_TYPE_INTERNAL 1
 #define FF_BUFFER_TYPE_USER     2 ///< Direct rendering buffers (image is (de)allocated by user)
-#define FF_BUFFER_TYPE_SHARED   4 ///< buffer from somewher else, dont dealloc image (data/base)
+#define FF_BUFFER_TYPE_SHARED   4 ///< buffer from somewhere else, don't dealloc image (data/base), all other tables are not shared
 #define FF_BUFFER_TYPE_COPY     8 ///< just a (modified) copy of some other buffer, dont dealloc anything
 
 
@@ -847,6 +854,7 @@ typedef struct AVCodecContext {
 #define FF_BUG_QPEL_CHROMA2     256
 #define FF_BUG_DIRECT_BLOCKSIZE 512
 #define FF_BUG_EDGE             1024
+#define FF_BUG_HPEL_CHROMA      2048
 //#define FF_BUG_FAKE_SCALABILITY 16 //autodetection should work 100%
         
     /**
@@ -1567,6 +1575,22 @@ typedef struct AVCodecContext {
      * - decoding: set by execute()
      */
     void *thread_opaque;
+
+    /**
+     * Motion estimation threshold, under which no motion estimation is
+     * performed and the user-specified motion vectors are used instead
+     * 
+     * - encoding: set by user
+     * - decoding: unused
+     */
+     int me_threshold;
+
+    /**
+     * Macroblock threshold, under which the user-specified macroblock types will be used
+     * - encoding: set by user
+     * - decoding: unused
+     */
+     int mb_threshold;
 } AVCodecContext;
 
 
@@ -1676,6 +1700,7 @@ extern AVCodec h263p_encoder;
 extern AVCodec flv_encoder;
 extern AVCodec rv10_encoder;
 extern AVCodec rv20_encoder;
+extern AVCodec dvvideo_encoder;
 extern AVCodec mjpeg_encoder;
 extern AVCodec ljpeg_encoder;
 extern AVCodec mpeg4_encoder;
@@ -1826,7 +1851,10 @@ ImgReSampleContext *img_resample_init(int output_width, int output_height,
 ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
                                       int iwidth, int iheight,
                                       int topBand, int bottomBand,
-                                      int leftBand, int rightBand);
+                                      int leftBand, int rightBand,
+                                      int padtop, int padbottom,
+                                      int padleft, int padright);
+
 
 void img_resample(ImgReSampleContext *s, 
                   AVPicture *output, const AVPicture *input);
@@ -1901,6 +1929,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode);
 
 void avcodec_get_context_defaults(AVCodecContext *s);
 AVCodecContext *avcodec_alloc_context(void);
+void avcodec_get_frame_defaults(AVFrame *pic);
 AVFrame *avcodec_alloc_frame(void);
 
 int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic);
@@ -2102,8 +2131,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
 /* for static data only */
 /* call av_free_static to release all staticaly allocated tables */
 void av_free_static(void);
-void *__av_mallocz_static(void** location, unsigned int size);
-#define av_mallocz_static(p, s) __av_mallocz_static((void **)(p), s)
+void *av_mallocz_static(unsigned int size);
 
 /* add by bero : in adx.c */
 int is_adx(const unsigned char *buf,size_t bufsize);
@@ -2115,6 +2143,7 @@ void img_copy(AVPicture *dst, const AVPicture *src,
 
 #include <stdarg.h>
 
+#define AV_LOG_QUIET -1
 #define AV_LOG_ERROR 0
 #define AV_LOG_INFO 1
 #define AV_LOG_DEBUG 2
diff --git a/src/libffmpeg/libavcodec/cabac.c b/src/libffmpeg/libavcodec/cabac.c
index 27e63045b..0e3e14f56 100644
--- a/src/libffmpeg/libavcodec/cabac.c
+++ b/src/libffmpeg/libavcodec/cabac.c
@@ -113,7 +113,7 @@ void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4],
         c->mps_state[2*i+0]= 2*mps_state[i];
         c->mps_state[2*i+1]= 2*mps_state[i]+1;
 
-        if(lps_state[i]){
+        if( i ){
             c->lps_state[2*i+0]= 2*lps_state[i];
             c->lps_state[2*i+1]= 2*lps_state[i]+1;
         }else{
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 59b128cef..de9382a13 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -6,6 +6,11 @@
 #ifndef COMMON_H
 #define COMMON_H
 
+// xine: disable DEBUG for ffmpeg (too noisy)
+#ifdef DEBUG
+#undef DEBUG
+#endif
+
 #if defined(WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
 #    define CONFIG_WIN32
 #endif
@@ -131,7 +136,7 @@ static inline float floorf(float f) {
 
 /* windows */
 
-#    ifndef __MINGW32__
+#    if !defined(__MINGW32__) && !defined(__CYGWIN__)
 #        define int64_t_C(c)     (c ## i64)
 #        define uint64_t_C(c)    (c ## i64)
 
@@ -204,40 +209,30 @@ static inline float floorf(float f) {
 
 /* debug stuff */
 
-#    ifndef DEBUG
-#      ifndef NDEBUG
+#    if !defined(DEBUG) && !defined(NDEBUG)
 #        define NDEBUG
-#      endif
 #    endif
 #    include <assert.h>
 
 /* dprintf macros */
-#    if defined(CONFIG_WIN32) && !defined(__MINGW32__)
+#    if defined(CONFIG_WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
 
 inline void dprintf(const char* fmt,...) {}
 
 #    else
 
-#if __GNUC__
-#ifdef DEBUG
-#define dprintf(fmt,args...) printf(fmt, ## args)
-#else
-#define dprintf(fmt,args...)
-#endif
-#else
-#ifdef DEBUG
-#define dprintf(...) printf(__VA_ARGS__)
-#else
-#define dprintf(...)
-#endif
-#endif
+#        ifdef DEBUG
+#            define dprintf(fmt,...) av_log(NULL, AV_LOG_DEBUG, fmt, __VA_ARGS__)
+#        else
+#            define dprintf(fmt,...)
+#        endif
 
 #    endif /* !CONFIG_WIN32 */
 
 #    define av_abort()      do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)
 
 //rounded divison & shift
-#define RSHIFT(a,b) ((a) > 0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b))
+#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
 /* assume b>0 */
 #define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
 #define ABS(a) ((a) >= 0 ? (a) : (-(a)))
@@ -291,6 +286,7 @@ struct PutBitContext;
 
 typedef void (*WriteDataFunc)(void *, uint8_t *, int);
 
+/* buf and buf_end must be present and used by every alternative writer. */
 typedef struct PutBitContext {
 #ifdef ALT_BITSTREAM_WRITER
     uint8_t *buf, *buf_end;
@@ -327,11 +323,6 @@ static inline int put_bits_count(PutBitContext *s)
 #endif
 }
 
-static inline int put_bits_left(PutBitContext* s)
-{
-    return (s->buf_end - s->buf) * 8 - put_bits_count(s);
-}
-
 /* pad the end of the output stream with zeros */
 static inline void flush_put_bits(PutBitContext *s)
 {
@@ -354,7 +345,7 @@ void align_put_bits(PutBitContext *s);
 void put_string(PutBitContext * pbc, char *s, int put_zero);
 
 /* bit input */
-
+/* buffer, buffer_end and size_in_bits must be present and used by every reader */
 typedef struct GetBitContext {
     const uint8_t *buffer, *buffer_end;
 #ifdef ALT_BITSTREAM_READER
@@ -386,7 +377,7 @@ typedef struct RL_VLC_ELEM {
     uint8_t run;
 } RL_VLC_ELEM;
 
-#ifdef ARCH_SPARC64
+#ifdef ARCH_SPARC
 #define UNALIGNED_STORES_ARE_BAD
 #endif
 
@@ -437,7 +428,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 	bit_buf<<=bit_left;
         bit_buf |= value >> (n - bit_left);
 #ifdef UNALIGNED_STORES_ARE_BAD
-        if (3 & (int) s->buf_ptr) {
+        if (3 & (intptr_t) s->buf_ptr) {
             s->buf_ptr[0] = bit_buf >> 24;
             s->buf_ptr[1] = bit_buf >> 16;
             s->buf_ptr[2] = bit_buf >>  8;
@@ -924,11 +915,6 @@ static inline void init_get_bits(GetBitContext *s,
 #endif
 }
 
-static inline int get_bits_left(GetBitContext *s)
-{
-    return s->size_in_bits - get_bits_count(s);
-}
-
 int check_marker(GetBitContext *s, const char *msg);
 void align_get_bits(GetBitContext *s);
 int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
@@ -1080,7 +1066,7 @@ static inline int get_xbits_trace(GetBitContext *s, int n, char *file, char *fun
 #define get_vlc(s, vlc)            get_vlc_trace(s, (vlc)->table, (vlc)->bits, 3, __FILE__, __PRETTY_FUNCTION__, __LINE__)
 #define get_vlc2(s, tab, bits, max) get_vlc_trace(s, tab, bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__)
 
-#define tprintf printf
+#define tprintf(...) av_log(NULL, AV_LOG_DEBUG, __VA_ARGS__)
 
 #else //TRACE
 #define tprintf(...) {}
@@ -1182,6 +1168,12 @@ static inline int clip(int a, int amin, int amax)
         return a;
 }
 
+static inline int clip_uint8(int a)
+{
+    if (a&(~255)) return (-a)>>31;
+    else          return a;
+}
+
 /* math */
 extern const uint8_t ff_sqrt_tab[128];
 
@@ -1290,6 +1282,9 @@ tend= rdtsc();\
 #define malloc please_use_av_malloc
 #define free please_use_av_free
 #define realloc please_use_av_realloc
+#define time time_is_forbidden_due_to_security_issues
+#define rand rand_is_forbidden_due_to_state_trashing
+#define srand srand_is_forbidden_due_to_state_trashing
 #if !(defined(LIBAVFORMAT_BUILD) || defined(_FRAMEHOOK_H))
 #define printf please_use_av_log
 #define fprintf please_use_av_log
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index 7f26bd98a..fce0b8163 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -3286,6 +3286,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
 #ifdef HAVE_MLIB
     dsputil_init_mlib(c, avctx);
 #endif
+#ifdef ARCH_SPARC
+   dsputil_init_vis(c,avctx);
+#endif
 #ifdef ARCH_ALPHA
     dsputil_init_alpha(c, avctx);
 #endif
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index 35e965db0..730e1489d 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -76,6 +76,12 @@ void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
 void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
     int coeff_count, uint8_t *dest, int stride);
 
+void vp3_dsp_init_sse2(void);
+void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, uint8_t *dest, int stride);
+void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, uint8_t *dest, int stride);
+
 
 /* minimum alignment rules ;)
 if u notice errors in the align stuff, need more alignment for some asm code for some cpu
@@ -378,6 +384,8 @@ static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
    one or more MultiMedia extension */
 int mm_support(void);
 
+#define __align16 __attribute__ ((aligned (16)))
+
 #if defined(HAVE_MMX)
 
 #undef emms_c
@@ -413,7 +421,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
 #elif defined(ARCH_ARMV4L)
 
 /* This is to use 4 bytes read to the IDCT pointers for some 'zero'
-   line ptimizations */
+   line optimizations */
 #define __align8 __attribute__ ((aligned (4)))
 
 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
@@ -425,6 +433,12 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
 
 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
 
+#elif defined(ARCH_SPARC)
+
+/* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
+#define __align8 __attribute__ ((aligned (8)))
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
+
 #elif defined(ARCH_ALPHA)
 
 #define __align8 __attribute__ ((aligned (8)))
diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c
index 08be11d45..5f1eaaa3b 100644
--- a/src/libffmpeg/libavcodec/dv.c
+++ b/src/libffmpeg/libavcodec/dv.c
@@ -1,6 +1,7 @@
 /*
  * DV decoder
  * Copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2004 Roman Shaposhnik.
  *
  * DV encoder 
  * Copyright (c) 2003 Roman Shaposhnik.
@@ -33,20 +34,18 @@
 #include "simple_idct.h"
 #include "dvdata.h"
 
-typedef struct DVVideoDecodeContext {
+typedef struct DVVideoContext {
     const DVprofile* sys;
     AVFrame picture;
+    uint8_t *buf;
     
     uint8_t dv_zigzag[2][64];
-    uint8_t dv_idct_shift[2][22][64];
+    uint8_t dv_idct_shift[2][2][22][64];
   
     void (*get_pixels)(DCTELEM *block, const uint8_t *pixels, int line_size);
     void (*fdct[2])(DCTELEM *block);
     void (*idct_put[2])(uint8_t *dest, int line_size, DCTELEM *block);
-    
-    GetBitContext gb;
-    DCTELEM block[5*6][64] __align8;
-} DVVideoDecodeContext;
+} DVVideoContext;
 
 #define TEX_VLC_BITS 9
 
@@ -58,15 +57,18 @@ typedef struct DVVideoDecodeContext {
 #define DV_VLC_MAP_LEV_SIZE 512
 #endif
 
+/* MultiThreading */
+static uint8_t** dv_anchor;
+
 /* XXX: also include quantization */
-static RL_VLC_ELEM *dv_rl_vlc[1];
+static RL_VLC_ELEM *dv_rl_vlc;
 /* VLC encoding lookup table */
 static struct dv_vlc_pair {
    uint32_t vlc;
    uint8_t  size;
 } (*dv_vlc_map)[DV_VLC_MAP_LEV_SIZE] = NULL;
 
-static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm)
+static void dv_build_unquantize_tables(DVVideoContext *s, uint8_t* perm)
 {
     int i, q, j;
 
@@ -76,29 +78,34 @@ static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm)
         for(i = 1; i < 64; i++) {
             /* 88 table */
             j = perm[i];
-            s->dv_idct_shift[0][q][j] =
+            s->dv_idct_shift[0][0][q][j] =
                 dv_quant_shifts[q][dv_88_areas[i]] + 1;
+	    s->dv_idct_shift[1][0][q][j] = s->dv_idct_shift[0][0][q][j] + 1;
         }
         
         /* 248DCT */
         for(i = 1; i < 64; i++) {
             /* 248 table */
-            s->dv_idct_shift[1][q][i] =  
+            s->dv_idct_shift[0][1][q][i] =  
                 dv_quant_shifts[q][dv_248_areas[i]] + 1;
+	    s->dv_idct_shift[1][1][q][i] = s->dv_idct_shift[0][1][q][i] + 1;
         }
     }
 }
 
 static int dvvideo_init(AVCodecContext *avctx)
 {
-    DVVideoDecodeContext *s = avctx->priv_data;
+    DVVideoContext *s = avctx->priv_data;
     DSPContext dsp;
     static int done=0;
     int i, j;
 
     if (!done) {
-        int i;
         VLC dv_vlc;
+        uint16_t new_dv_vlc_bits[NB_DV_VLC*2];
+        uint8_t new_dv_vlc_len[NB_DV_VLC*2];
+        uint8_t new_dv_vlc_run[NB_DV_VLC*2];
+        int16_t new_dv_vlc_level[NB_DV_VLC*2];
 
         done = 1;
 
@@ -106,13 +113,42 @@ static int dvvideo_init(AVCodecContext *avctx)
 	if (!dv_vlc_map)
 	    return -ENOMEM;
 
+	/* dv_anchor lets each thread know its Id */
+	dv_anchor = av_malloc(12*27*sizeof(void*));
+	if (!dv_anchor) {
+	    av_free(dv_vlc_map);
+	    return -ENOMEM;
+	}
+	for (i=0; i<12*27; i++)
+	    dv_anchor[i] = (void*)(size_t)i;
+
+	/* it's faster to include sign bit in a generic VLC parsing scheme */
+	for (i=0, j=0; i<NB_DV_VLC; i++, j++) {
+	    new_dv_vlc_bits[j] = dv_vlc_bits[i];
+	    new_dv_vlc_len[j] = dv_vlc_len[i];
+	    new_dv_vlc_run[j] = dv_vlc_run[i];
+	    new_dv_vlc_level[j] = dv_vlc_level[i];
+	    
+	    if (dv_vlc_level[i]) {
+	        new_dv_vlc_bits[j] <<= 1;
+		new_dv_vlc_len[j]++;
+
+		j++;
+		new_dv_vlc_bits[j] = (dv_vlc_bits[i] << 1) | 1;
+		new_dv_vlc_len[j] = dv_vlc_len[i] + 1;
+		new_dv_vlc_run[j] = dv_vlc_run[i];
+		new_dv_vlc_level[j] = -dv_vlc_level[i];
+	    }
+	}
+             
         /* NOTE: as a trick, we use the fact the no codes are unused
            to accelerate the parsing of partial codes */
-        init_vlc(&dv_vlc, TEX_VLC_BITS, NB_DV_VLC, 
-                 dv_vlc_len, 1, 1, dv_vlc_bits, 2, 2);
+        init_vlc(&dv_vlc, TEX_VLC_BITS, j, 
+                 new_dv_vlc_len, 1, 1, new_dv_vlc_bits, 2, 2);
 
-        dv_rl_vlc[0] = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM));
-	if (!dv_rl_vlc[0]) {
+        dv_rl_vlc = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM));
+	if (!dv_rl_vlc) {
+	    av_free(dv_anchor);
 	    av_free(dv_vlc_map);
 	    return -ENOMEM;
 	}
@@ -124,18 +160,15 @@ static int dvvideo_init(AVCodecContext *avctx)
             if(len<0){ //more bits needed
                 run= 0;
                 level= code;
-            } else if (code == (NB_DV_VLC - 1)) {
-                /* EOB */
-                run = 0;
-                level = 256;
             } else {
-                run=   dv_vlc_run[code] + 1;
-                level= dv_vlc_level[code];
+                run=   new_dv_vlc_run[code] + 1;
+                level= new_dv_vlc_level[code];
             }
-            dv_rl_vlc[0][i].len = len;
-            dv_rl_vlc[0][i].level = level;
-            dv_rl_vlc[0][i].run = run;
+            dv_rl_vlc[i].len = len;
+            dv_rl_vlc[i].level = level;
+            dv_rl_vlc[i].run = run;
         }
+	free_vlc(&dv_vlc);
 
 	for (i = 0; i < NB_DV_VLC - 1; i++) {
            if (dv_vlc_run[i] >= DV_VLC_MAP_RUN_SIZE || dv_vlc_level[i] >= DV_VLC_MAP_LEV_SIZE)
@@ -202,13 +235,19 @@ static int dvvideo_init(AVCodecContext *avctx)
     return 0;
 }
 
+static int dvvideo_end(AVCodecContext *avctx)
+{
+    avcodec_default_free_buffers(avctx);    
+    return 0;
+}
+
 // #define VLC_DEBUG
+// #define printf(...) av_log(NULL, AV_LOG_ERROR, __VA_ARGS__)
 
 typedef struct BlockInfo {
     const uint8_t *shift_table;
     const uint8_t *scan_table;
     uint8_t pos; /* position in block */
-    uint8_t eob_reached; /* true if EOB has been reached */
     uint8_t dct_mode;
     uint8_t partial_bit_count;
     uint16_t partial_bit_buffer;
@@ -228,141 +267,88 @@ static const int mb_area_start[5] = { 1, 6, 21, 43, 64 };
 #warning only works with ALT_BITSTREAM_READER
 #endif
 
+static inline int get_bits_left(GetBitContext *s)
+{
+    return s->size_in_bits - get_bits_count(s);
+}
+
+static inline int get_bits_size(GetBitContext *s)
+{
+    return s->size_in_bits;
+}
+
+static inline int put_bits_left(PutBitContext* s)
+{
+    return (s->buf_end - s->buf) * 8 - put_bits_count(s);
+}
+
 /* decode ac coefs */
-static void dv_decode_ac(DVVideoDecodeContext *s, 
-                         BlockInfo *mb, DCTELEM *block, int last_index)
+static void dv_decode_ac(GetBitContext *gb, BlockInfo *mb, DCTELEM *block)
 {
-    int last_re_index;
-    int shift_offset = mb->shift_offset;
+    int last_index = get_bits_size(gb);
     const uint8_t *scan_table = mb->scan_table;
     const uint8_t *shift_table = mb->shift_table;
     int pos = mb->pos;
-    int level, pos1, sign, run;
-    int partial_bit_count;
-#ifndef ALT_BITSTREAM_READER //FIXME
-    int re_index=0; 
-    int re1_index=0;
-#endif
-    OPEN_READER(re, &s->gb);
+    int partial_bit_count = mb->partial_bit_count;
+    int level, pos1, run, vlc_len, index;
+    
+    OPEN_READER(re, gb);
+    UPDATE_CACHE(re, gb);
     
-#ifdef VLC_DEBUG
-    printf("start\n");
-#endif
-
     /* if we must parse a partial vlc, we do it here */
-    partial_bit_count = mb->partial_bit_count;
     if (partial_bit_count > 0) {
-        uint8_t buf[4];
-        uint32_t v;
-        int l, l1;
-        GetBitContext gb1;
-
-        /* build the dummy bit buffer */
-        l = 16 - partial_bit_count;
-        UPDATE_CACHE(re, &s->gb);
-#ifdef VLC_DEBUG
-        printf("show=%04x\n", SHOW_UBITS(re, &s->gb, 16));
-#endif
-        v = (mb->partial_bit_buffer << l) | SHOW_UBITS(re, &s->gb, l);
-        buf[0] = v >> 8;
-        buf[1] = v;
-#ifdef VLC_DEBUG
-        printf("v=%04x cnt=%d %04x\n", 
-               v, partial_bit_count, (mb->partial_bit_buffer << l));
-#endif
-        /* try to read the codeword */
-        init_get_bits(&gb1, buf, 4*8);
-        {
-            OPEN_READER(re1, &gb1);
-            UPDATE_CACHE(re1, &gb1);
-            GET_RL_VLC(level, run, re1, &gb1, dv_rl_vlc[0], 
-                       TEX_VLC_BITS, 2);
-            l = re1_index;
-            CLOSE_READER(re1, &gb1);
-        }
-#ifdef VLC_DEBUG
-        printf("****run=%d level=%d size=%d\n", run, level, l);
-#endif
-        /* compute codeword length */
-        l1 = (level != 256 && level != 0);
-        /* if too long, we cannot parse */
-        l -= partial_bit_count;
-        if ((re_index + l + l1) > last_index)
-            return;
-        /* skip read bits */
-        last_re_index = 0; /* avoid warning */
-        re_index += l;
-        /* by definition, if we can read the vlc, all partial bits
-           will be read (otherwise we could have read the vlc before) */
-        mb->partial_bit_count = 0;
-        UPDATE_CACHE(re, &s->gb);
-        goto handle_vlc;
+        re_cache = ((unsigned)re_cache >> partial_bit_count) |
+	           (mb->partial_bit_buffer << (sizeof(re_cache)*8 - partial_bit_count));
+	re_index -= partial_bit_count;
+	mb->partial_bit_count = 0;
     }
 
     /* get the AC coefficients until last_index is reached */
     for(;;) {
-        UPDATE_CACHE(re, &s->gb);
 #ifdef VLC_DEBUG
-        printf("%2d: bits=%04x index=%d\n", 
-               pos, SHOW_UBITS(re, &s->gb, 16), re_index);
+        printf("%2d: bits=%04x index=%d\n", pos, SHOW_UBITS(re, gb, 16), re_index);
 #endif
-        last_re_index = re_index;
-        GET_RL_VLC(level, run, re, &s->gb, dv_rl_vlc[0], 
-                   TEX_VLC_BITS, 2);
-    handle_vlc:
+        /* our own optimized GET_RL_VLC */
+        index = NEG_USR32(re_cache, TEX_VLC_BITS);
+	vlc_len = dv_rl_vlc[index].len;
+        if (vlc_len < 0) {
+            index = NEG_USR32((unsigned)re_cache << TEX_VLC_BITS, -vlc_len) + dv_rl_vlc[index].level;
+            vlc_len = TEX_VLC_BITS - vlc_len;
+        }
+        level = dv_rl_vlc[index].level;
+	run = dv_rl_vlc[index].run;
+	
+	/* gotta check if we're still within gb boundaries */
+	if (re_index + vlc_len > last_index) {
+	    /* should be < 16 bits otherwise a codeword could have been parsed */
+	    mb->partial_bit_count = last_index - re_index;
+	    mb->partial_bit_buffer = NEG_USR32(re_cache, mb->partial_bit_count);
+	    re_index = last_index;
+	    break;
+	}
+	re_index += vlc_len;
+
 #ifdef VLC_DEBUG
-        printf("run=%d level=%d\n", run, level);
+	printf("run=%d level=%d\n", run, level);
 #endif
-        if (level == 256) {
-            if (re_index > last_index) {
-            cannot_read:
-                /* put position before read code */
-                re_index = last_re_index;
-                mb->eob_reached = 0;
-                break;
-            }
-            /* EOB */
-            mb->eob_reached = 1;
-            break;
-        } else if (level != 0) {
-            if ((re_index + 1) > last_index)
-                goto cannot_read;
-            sign = SHOW_SBITS(re, &s->gb, 1);
-            level = (level ^ sign) - sign;
-            LAST_SKIP_BITS(re, &s->gb, 1);
-            pos += run;
-            /* error */
-            if (pos >= 64) {
-                goto read_error;
-            }
+	pos += run; 	
+	if (pos >= 64)
+	    break;
+        
+	if (level) {
             pos1 = scan_table[pos];
-            level = level << (shift_table[pos1] + shift_offset);
-            block[pos1] = level;
-            //            printf("run=%d level=%d shift=%d\n", run, level, shift_table[pos1]);
-        } else {
-            if (re_index > last_index)
-                goto cannot_read;
-            /* level is zero: means run without coding. No
-               sign is coded */
-            pos += run;
-            /* error */
-            if (pos >= 64) {
-            read_error:
-#if defined(VLC_DEBUG) || 1
-                av_log(NULL, AV_LOG_ERROR, "error pos=%d\n", pos);
-#endif
-                /* for errors, we consider the eob is reached */
-                mb->eob_reached = 1;
-                break;
-            }
-        }
+            block[pos1] = level << shift_table[pos1];
+        } 
+
+        UPDATE_CACHE(re, gb);
     }
-    CLOSE_READER(re, &s->gb);
+    CLOSE_READER(re, gb);
     mb->pos = pos;
 }
 
-static inline void bit_copy(PutBitContext *pb, GetBitContext *gb, int bits_left)
+static inline void bit_copy(PutBitContext *pb, GetBitContext *gb)
 {
+    int bits_left = get_bits_left(gb);
     while (bits_left >= 16) {
         put_bits(pb, 16, get_bits(gb, 16));
         bits_left -= 16;
@@ -373,60 +359,56 @@ static inline void bit_copy(PutBitContext *pb, GetBitContext *gb, int bits_left)
 }
 
 /* mb_x and mb_y are in units of 8 pixels */
-static inline void dv_decode_video_segment(DVVideoDecodeContext *s, 
+static inline void dv_decode_video_segment(DVVideoContext *s, 
                                            uint8_t *buf_ptr1, 
                                            const uint16_t *mb_pos_ptr)
 {
     int quant, dc, dct_mode, class1, j;
     int mb_index, mb_x, mb_y, v, last_index;
     DCTELEM *block, *block1;
-    int c_offset, bits_left;
+    int c_offset;
     uint8_t *y_ptr;
-    BlockInfo mb_data[5 * 6], *mb, *mb1;
     void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
     uint8_t *buf_ptr;
     PutBitContext pb, vs_pb;
+    GetBitContext gb;
+    BlockInfo mb_data[5 * 6], *mb, *mb1;
+    DCTELEM sblock[5*6][64] __align8;
     uint8_t mb_bit_buffer[80 + 4]; /* allow some slack */
-    int mb_bit_count;
     uint8_t vs_bit_buffer[5 * 80 + 4]; /* allow some slack */
-    int vs_bit_count;
-    
-    memset(s->block, 0, sizeof(s->block));
+	    
+    memset(sblock, 0, sizeof(sblock));
 
     /* pass 1 : read DC and AC coefficients in blocks */
     buf_ptr = buf_ptr1;
-    block1 = &s->block[0][0];
+    block1 = &sblock[0][0];
     mb1 = mb_data;
     init_put_bits(&vs_pb, vs_bit_buffer, 5 * 80);
-    vs_bit_count = 0;
-    for(mb_index = 0; mb_index < 5; mb_index++) {
+    for(mb_index = 0; mb_index < 5; mb_index++, mb1 += 6, block1 += 6 * 64) {
         /* skip header */
         quant = buf_ptr[3] & 0x0f;
         buf_ptr += 4;
         init_put_bits(&pb, mb_bit_buffer, 80);
-        mb_bit_count = 0;
         mb = mb1;
         block = block1;
         for(j = 0;j < 6; j++) {
-            /* NOTE: size is not important here */
-            init_get_bits(&s->gb, buf_ptr, 14*8);
+            last_index = block_sizes[j];
+	    init_get_bits(&gb, buf_ptr, last_index);
             
             /* get the dc */
-            dc = get_bits(&s->gb, 9);
+            dc = get_bits(&gb, 9);
             dc = (dc << (32 - 9)) >> (32 - 9);
-            dct_mode = get_bits1(&s->gb);
+            dct_mode = get_bits1(&gb);
             mb->dct_mode = dct_mode;
             mb->scan_table = s->dv_zigzag[dct_mode];
-            class1 = get_bits(&s->gb, 2);
-            mb->shift_offset = (class1 == 3);
-            mb->shift_table = s->dv_idct_shift[dct_mode]
+            class1 = get_bits(&gb, 2);
+            mb->shift_table = s->dv_idct_shift[class1 == 3][dct_mode]
                 [quant + dv_quant_offset[class1]];
             dc = dc << 2;
             /* convert to unsigned because 128 is not added in the
                standard IDCT */
             dc += 1024;
             block[0] = dc;
-            last_index = block_sizes[j];
             buf_ptr += last_index >> 3;
             mb->pos = 0;
             mb->partial_bit_count = 0;
@@ -434,88 +416,64 @@ static inline void dv_decode_video_segment(DVVideoDecodeContext *s,
 #ifdef VLC_DEBUG
             printf("MB block: %d, %d ", mb_index, j);
 #endif
-            dv_decode_ac(s, mb, block, last_index);
+            dv_decode_ac(&gb, mb, block);
 
             /* write the remaining bits  in a new buffer only if the
                block is finished */
-            bits_left = last_index - get_bits_count(&s->gb);
-            if (mb->eob_reached) {
-                mb->partial_bit_count = 0;
-                mb_bit_count += bits_left;
-                bit_copy(&pb, &s->gb, bits_left);
-            } else {
-                /* should be < 16 bits otherwise a codeword could have
-                   been parsed */
-                mb->partial_bit_count = bits_left;
-                mb->partial_bit_buffer = get_bits(&s->gb, bits_left);
-            }
+            if (mb->pos >= 64)
+                bit_copy(&pb, &gb);
+            
             block += 64;
             mb++;
         }
         
-        flush_put_bits(&pb);
-
         /* pass 2 : we can do it just after */
 #ifdef VLC_DEBUG
-        printf("***pass 2 size=%d MB#=%d\n", mb_bit_count, mb_index);
+        printf("***pass 2 size=%d MB#=%d\n", put_bits_count(&pb), mb_index);
 #endif
         block = block1;
         mb = mb1;
-        init_get_bits(&s->gb, mb_bit_buffer, 80*8);
-        for(j = 0;j < 6; j++) {
-            if (!mb->eob_reached && get_bits_count(&s->gb) < mb_bit_count) {
-                dv_decode_ac(s, mb, block, mb_bit_count);
+        init_get_bits(&gb, mb_bit_buffer, put_bits_count(&pb));
+	flush_put_bits(&pb);
+        for(j = 0;j < 6; j++, block += 64, mb++) {
+            if (mb->pos < 64 && get_bits_left(&gb) > 0) {
+                dv_decode_ac(&gb, mb, block);
                 /* if still not finished, no need to parse other blocks */
-                if (!mb->eob_reached) {
-                    /* we could not parse the current AC coefficient,
-                       so we add the remaining bytes */
-                    bits_left = mb_bit_count - get_bits_count(&s->gb);
-                    if (bits_left > 0) {
-                        mb->partial_bit_count += bits_left;
-                        mb->partial_bit_buffer = 
-                            (mb->partial_bit_buffer << bits_left) | 
-                            get_bits(&s->gb, bits_left);
-                    }
-                    goto next_mb;
-                }
+                if (mb->pos < 64)
+                    break;
             }
-            block += 64;
-            mb++;
         }
         /* all blocks are finished, so the extra bytes can be used at
            the video segment level */
-        bits_left = mb_bit_count - get_bits_count(&s->gb);
-        vs_bit_count += bits_left;
-        bit_copy(&vs_pb, &s->gb, bits_left);
-    next_mb:
-        mb1 += 6;
-        block1 += 6 * 64;
+        if (j >= 6)
+	    bit_copy(&vs_pb, &gb);
     }
 
     /* we need a pass other the whole video segment */
-    flush_put_bits(&vs_pb);
-        
 #ifdef VLC_DEBUG
-    printf("***pass 3 size=%d\n", vs_bit_count);
+    printf("***pass 3 size=%d\n", put_bits_count(&vs_pb));
 #endif
-    block = &s->block[0][0];
+    block = &sblock[0][0];
     mb = mb_data;
-    init_get_bits(&s->gb, vs_bit_buffer, 5 * 80*8);
+    init_get_bits(&gb, vs_bit_buffer, put_bits_count(&vs_pb));
+    flush_put_bits(&vs_pb);
     for(mb_index = 0; mb_index < 5; mb_index++) {
         for(j = 0;j < 6; j++) {
-            if (!mb->eob_reached) {
+            if (mb->pos < 64) {
 #ifdef VLC_DEBUG
                 printf("start %d:%d\n", mb_index, j);
 #endif
-                dv_decode_ac(s, mb, block, vs_bit_count);
+                dv_decode_ac(&gb, mb, block);
             }
+	    if (mb->pos >= 64 && mb->pos < 127)
+		av_log(NULL, AV_LOG_ERROR, "AC EOB marker is absent pos=%d\n", mb->pos);
             block += 64;
             mb++;
         }
     }
     
     /* compute idct and place blocks */
-    block = &s->block[0][0];
+    block = &sblock[0][0];
     mb = mb_data;
     for(mb_index = 0; mb_index < 5; mb_index++) {
         v = *mb_pos_ptr++;
@@ -790,7 +748,7 @@ static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos)
  * horrible and the weighting is missing. But it's missing from the 
  * decoding step also -- so at least we're on the same page with decoder ;-)
  */
-static inline void dv_encode_video_segment(DVVideoDecodeContext *s, 
+static inline void dv_encode_video_segment(DVVideoContext *s, 
                                            uint8_t *dif, 
                                            const uint16_t *mb_pos_ptr)
 {
@@ -801,6 +759,7 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
     uint8_t*  ptr;
     int       do_edge_wrap;
     DCTELEM   block[64] __align8;
+    DCTELEM   sblock[5*6][64] __align8;
     EncBlockInfo  enc_blks[5*6];
     PutBitContext pbs[5*6];
     PutBitContext* pb; 
@@ -854,7 +813,7 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
 	    }
 	  
             enc_blk->dct_mode = dv_guess_dct_mode(block);
-	    enc_blk->mb = &s->block[mb_index*6+j][0];
+	    enc_blk->mb = &sblock[mb_index*6+j][0];
 	    enc_blk->area_q[0] = enc_blk->area_q[1] = enc_blk->area_q[2] = enc_blk->area_q[3] = 0;
 	    enc_blk->partial_bit_count = 0;
 	    enc_blk->partial_bit_buffer = 0;
@@ -906,15 +865,31 @@ static inline void dv_encode_video_segment(DVVideoDecodeContext *s,
        flush_put_bits(&pbs[j]);
 }
 
+static int dv_decode_mt(AVCodecContext *avctx, void* sl)
+{
+    DVVideoContext *s = avctx->priv_data;
+    int slice = (size_t)sl;
+    dv_decode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80],
+	                    &s->sys->video_place[slice*5]);
+    return 0;
+}
+
+static int dv_encode_mt(AVCodecContext *avctx, void* sl)
+{
+    DVVideoContext *s = avctx->priv_data;
+    int slice = (size_t)sl;
+    dv_encode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80],
+	                    &s->sys->video_place[slice*5]);
+    return 0;
+}
+
 /* NOTE: exactly one frame must be given (120000 bytes for NTSC,
    144000 bytes for PAL) */
 static int dvvideo_decode_frame(AVCodecContext *avctx, 
                                  void *data, int *data_size,
                                  uint8_t *buf, int buf_size)
 {
-    DVVideoDecodeContext *s = avctx->priv_data;
-    int ds, vs;
-    const uint16_t *mb_pos_ptr;
+    DVVideoContext *s = avctx->priv_data;
   
     *data_size=0;
     /* special case for last picture */
@@ -925,7 +900,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
     if (!s->sys || buf_size < s->sys->frame_size)
         return -1; /* NOTE: we only accept several full frames */
 
-	
     if(s->picture.data[0])
         avctx->release_buffer(avctx, &s->picture);
     
@@ -940,24 +914,10 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
     s->picture.interlaced_frame = 1;
     s->picture.top_field_first = 0;
 
-    /* for each DIF segment */
-    mb_pos_ptr = s->sys->video_place;
-    for (ds = 0; ds < s->sys->difseg_size; ds++) {
-        buf += 6 * 80; /* skip DIF segment header */
-        
-        for(vs = 0; vs < 27; vs++) {
-            if ((vs % 3) == 0)
-	        buf += 80; /* skip audio block */
-            
-#ifdef VLC_DEBUG
-            printf("********************* %d, %d **********************\n", ds, vs);
-#endif
-	    dv_decode_video_segment(s, buf, mb_pos_ptr);
-            buf += 5 * 80;
-            mb_pos_ptr += 5;
-        }
-    }
-
+    s->buf = buf;
+    avctx->execute(avctx, dv_decode_mt, (void**)&dv_anchor[0], NULL, 
+	           s->sys->difseg_size * 27);
+    
     emms_c();
 
     /* return image */
@@ -970,9 +930,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
 static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size, 
                                 void *data)
 {
-    DVVideoDecodeContext *s = c->priv_data;
-    const uint16_t *mb_pos_ptr;
-    int ds, vs;
+    DVVideoContext *s = c->priv_data;
 
     s->sys = dv_codec_profile(c);
     if (!s->sys)
@@ -981,41 +939,34 @@ static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size,
     c->pix_fmt = s->sys->pix_fmt;
     s->picture = *((AVFrame *)data);
 
-    /* for each DIF segment */
-    mb_pos_ptr = s->sys->video_place;
-    for (ds = 0; ds < s->sys->difseg_size; ds++) {
-        buf += 6 * 80; /* skip DIF segment header */
-        
-        for(vs = 0; vs < 27; vs++) {
-            if ((vs % 3) == 0)
-	        buf += 80; /* skip audio block */
-
-#ifdef VLC_DEBUG
-            printf("********************* %d, %d **********************\n", ds, vs);
-#endif
-	    dv_encode_video_segment(s, buf, mb_pos_ptr);
-            buf += 5 * 80;
-            mb_pos_ptr += 5;
-        }
-    }
+    s->buf = buf;
+    c->execute(c, dv_encode_mt, (void**)&dv_anchor[0], NULL, 
+	       s->sys->difseg_size * 27);
 
     emms_c();
     return s->sys->frame_size;
 }
 
-static int dvvideo_end(AVCodecContext *avctx)
-{
-    avcodec_default_free_buffers(avctx);    
-    return 0;
-}
+AVCodec dvvideo_encoder = {
+    "dvvideo",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_DVVIDEO,
+    sizeof(DVVideoContext),
+    dvvideo_init,
+    dvvideo_encode_frame,
+    dvvideo_end,
+    NULL,
+    CODEC_CAP_DR1,
+    NULL
+};
 
 AVCodec dvvideo_decoder = {
     "dvvideo",
     CODEC_TYPE_VIDEO,
     CODEC_ID_DVVIDEO,
-    sizeof(DVVideoDecodeContext),
+    sizeof(DVVideoContext),
     dvvideo_init,
-    dvvideo_encode_frame,
+    NULL,
     dvvideo_end,
     dvvideo_decode_frame,
     CODEC_CAP_DR1,
diff --git a/src/libffmpeg/libavcodec/dvdata.h b/src/libffmpeg/libavcodec/dvdata.h
index e6e0986ba..e60d99448 100644
--- a/src/libffmpeg/libavcodec/dvdata.h
+++ b/src/libffmpeg/libavcodec/dvdata.h
@@ -218,7 +218,7 @@ static const uint8_t dv_vlc_run[409] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
-  0,
+127,
 };
 
 static const uint8_t dv_vlc_level[409] = {
diff --git a/src/libffmpeg/libavcodec/error_resilience.c b/src/libffmpeg/libavcodec/error_resilience.c
index c6b10a79c..b7aeebddf 100644
--- a/src/libffmpeg/libavcodec/error_resilience.c
+++ b/src/libffmpeg/libavcodec/error_resilience.c
@@ -45,7 +45,7 @@ static void put_dc(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
 {
     int dc, dcu, dcv, y, i;
     for(i=0; i<4; i++){
-        dc= s->dc_val[0][mb_x*2+1 + (i&1) + (mb_y*2+1 + (i>>1))*(s->mb_width*2+2)];
+        dc= s->dc_val[0][mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride];
         if(dc<0) dc=0;
         else if(dc>2040) dc=2040;
         for(y=0; y<8; y++){
@@ -55,8 +55,8 @@ static void put_dc(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
             }
         }
     }
-    dcu = s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)];
-    dcv = s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)];
+    dcu = s->dc_val[1][mb_x + mb_y*s->mb_stride];
+    dcv = s->dc_val[2][mb_x + mb_y*s->mb_stride];
     if     (dcu<0   ) dcu=0;
     else if(dcu>2040) dcu=2040;
     if     (dcv<0   ) dcv=0;
@@ -209,8 +209,8 @@ static void h_block_filter(MpegEncContext *s, uint8_t *dst, int w, int h, int st
             int left_damage =  left_status&(DC_ERROR|AC_ERROR|MV_ERROR);
             int right_damage= right_status&(DC_ERROR|AC_ERROR|MV_ERROR);
             int offset= b_x*8 + b_y*stride*8;
-            int16_t *left_mv=  s->current_picture.motion_val[0][s->block_wrap[0]*((b_y<<(1-is_luma)) + 1) + ( b_x   <<(1-is_luma))];
-            int16_t *right_mv= s->current_picture.motion_val[0][s->block_wrap[0]*((b_y<<(1-is_luma)) + 1) + ((b_x+1)<<(1-is_luma))];
+            int16_t *left_mv=  s->current_picture.motion_val[0][s->b8_stride*(b_y<<(1-is_luma)) + ( b_x   <<(1-is_luma))];
+            int16_t *right_mv= s->current_picture.motion_val[0][s->b8_stride*(b_y<<(1-is_luma)) + ((b_x+1)<<(1-is_luma))];
             
             if(!(left_damage||right_damage)) continue; // both undamaged
             
@@ -269,8 +269,8 @@ static void v_block_filter(MpegEncContext *s, uint8_t *dst, int w, int h, int st
             int top_damage =      top_status&(DC_ERROR|AC_ERROR|MV_ERROR);
             int bottom_damage= bottom_status&(DC_ERROR|AC_ERROR|MV_ERROR);
             int offset= b_x*8 + b_y*stride*8;
-            int16_t *top_mv=    s->current_picture.motion_val[0][s->block_wrap[0]*(( b_y   <<(1-is_luma)) + 1) + (b_x<<(1-is_luma))];
-            int16_t *bottom_mv= s->current_picture.motion_val[0][s->block_wrap[0]*(((b_y+1)<<(1-is_luma)) + 1) + (b_x<<(1-is_luma))];
+            int16_t *top_mv=    s->current_picture.motion_val[0][s->b8_stride*( b_y   <<(1-is_luma)) + (b_x<<(1-is_luma))];
+            int16_t *bottom_mv= s->current_picture.motion_val[0][s->b8_stride*((b_y+1)<<(1-is_luma)) + (b_x<<(1-is_luma))];
             
             if(!(top_damage||bottom_damage)) continue; // both undamaged
             
@@ -378,8 +378,8 @@ int score_sum=0;
                     int j;
                     int best_score=256*256*256*64;
                     int best_pred=0;
-                    const int mot_stride= mb_width*2+2;
-                    const int mot_index= mb_x*2 + 1 + (mb_y*2+1)*mot_stride;
+                    const int mot_stride= s->b8_stride;
+                    const int mot_index= mb_x*2 + mb_y*2*mot_stride;
                     int prev_x= s->current_picture.motion_val[0][mot_index][0];
                     int prev_y= s->current_picture.motion_val[0][mot_index][1];
 
@@ -672,14 +672,15 @@ void ff_er_frame_end(MpegEncContext *s){
     av_log(s->avctx, AV_LOG_INFO, "concealing errors\n");
     
     if(s->current_picture.motion_val[0] == NULL){
-        int size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
+        int size = s->b8_stride * 2 * s->mb_height;
         Picture *pic= s->current_picture_ptr;
         
         av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
             
         for(i=0; i<2; i++){
-            pic->motion_val_base[i]= av_mallocz((size+1) * 2 * sizeof(uint16_t)); //FIXME size
-            pic->motion_val[i]= pic->motion_val_base[i]+1;
+            pic->ref_index[i]= av_mallocz(size * sizeof(uint8_t));
+            pic->motion_val_base[i]= av_mallocz((size+2) * 2 * sizeof(uint16_t));
+            pic->motion_val[i]= pic->motion_val_base[i]+2;
         }
         pic->motion_subsample_log2= 3;
         s->current_picture= *s->current_picture_ptr;
@@ -845,17 +846,17 @@ void ff_er_frame_end(MpegEncContext *s){
             s->mb_intra=0;
             s->mb_skiped=0;
             if(IS_8X8(mb_type)){
-                int mb_index= mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0];
+                int mb_index= mb_x*2 + mb_y*2*s->b8_stride;
                 int j;
                 s->mv_type = MV_TYPE_8X8;
                 for(j=0; j<4; j++){
-                    s->mv[0][j][0] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->block_wrap[0] ][0];
-                    s->mv[0][j][1] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->block_wrap[0] ][1];
+                    s->mv[0][j][0] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->b8_stride ][0];
+                    s->mv[0][j][1] = s->current_picture.motion_val[0][ mb_index + (j&1) + (j>>1)*s->b8_stride ][1];
                 }
             }else{
                 s->mv_type = MV_TYPE_16X16;
-                s->mv[0][0][0] = s->current_picture.motion_val[0][ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][0];
-                s->mv[0][0][1] = s->current_picture.motion_val[0][ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][1];
+                s->mv[0][0][0] = s->current_picture.motion_val[0][ mb_x*2 + mb_y*2*s->b8_stride ][0];
+                s->mv[0][0][1] = s->current_picture.motion_val[0][ mb_x*2 + mb_y*2*s->b8_stride ][1];
             }
         
 	    s->dsp.clear_blocks(s->block[0]);
@@ -870,7 +871,7 @@ void ff_er_frame_end(MpegEncContext *s){
     if(s->pict_type==B_TYPE){
         for(mb_y=0; mb_y<s->mb_height; mb_y++){
             for(mb_x=0; mb_x<s->mb_width; mb_x++){
-                int xy= mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0];
+                int xy= mb_x*2 + mb_y*2*s->b8_stride;
                 const int mb_xy= mb_x + mb_y * s->mb_stride;
                 const int mb_type= s->current_picture.mb_type[mb_xy];
                 error= s->error_status_table[mb_xy];
@@ -930,7 +931,7 @@ void ff_er_frame_end(MpegEncContext *s){
             dest_cb= s->current_picture.data[1] + mb_x*8  + mb_y*8 *s->uvlinesize;
             dest_cr= s->current_picture.data[2] + mb_x*8  + mb_y*8 *s->uvlinesize;
            
-            dc_ptr= &s->dc_val[0][mb_x*2+1 + (mb_y*2+1)*(s->mb_width*2+2)];
+            dc_ptr= &s->dc_val[0][mb_x*2 + mb_y*2*s->b8_stride];
             for(n=0; n<4; n++){
                 dc=0;
                 for(y=0; y<8; y++){
@@ -939,7 +940,7 @@ void ff_er_frame_end(MpegEncContext *s){
                        dc+= dest_y[x + (n&1)*8 + (y + (n>>1)*8)*s->linesize];
                     }
                 }
-                dc_ptr[(n&1) + (n>>1)*(s->mb_width*2+2)]= (dc+4)>>3;
+                dc_ptr[(n&1) + (n>>1)*s->b8_stride]= (dc+4)>>3;
             }
 
             dcu=dcv=0;
@@ -950,18 +951,18 @@ void ff_er_frame_end(MpegEncContext *s){
                     dcv+=dest_cr[x + y*(s->uvlinesize)];
                 }
             }
-            s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)]= (dcu+4)>>3;
-            s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)]= (dcv+4)>>3;   
+            s->dc_val[1][mb_x + mb_y*s->mb_stride]= (dcu+4)>>3;
+            s->dc_val[2][mb_x + mb_y*s->mb_stride]= (dcv+4)>>3;   
         }
     }
 #if 1
     /* guess DC for damaged blocks */
-    guess_dc(s, s->dc_val[0] + s->mb_width*2+3, s->mb_width*2, s->mb_height*2, s->mb_width*2+2, 1);
-    guess_dc(s, s->dc_val[1] + s->mb_width  +3, s->mb_width  , s->mb_height  , s->mb_width  +2, 0);
-    guess_dc(s, s->dc_val[2] + s->mb_width  +3, s->mb_width  , s->mb_height  , s->mb_width  +2, 0);
+    guess_dc(s, s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride, 1);
+    guess_dc(s, s->dc_val[1], s->mb_width  , s->mb_height  , s->mb_stride, 0);
+    guess_dc(s, s->dc_val[2], s->mb_width  , s->mb_height  , s->mb_stride, 0);
 #endif   
     /* filter luma DC */
-    filter181(s->dc_val[0] + s->mb_width*2+3, s->mb_width*2, s->mb_height*2, s->mb_width*2+2);
+    filter181(s->dc_val[0], s->mb_width*2, s->mb_height*2, s->b8_stride);
     
 #if 1
     /* render DC only intra */
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index 8a60ff08b..ec776eb98 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -75,7 +75,7 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr);
 static void mpeg4_encode_visual_object_header(MpegEncContext * s);
 static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_number);
 #endif //CONFIG_ENCODERS
-static void mpeg4_decode_sprite_trajectory(MpegEncContext * s);
+static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb);
 static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr, int *dir_ptr);
 
 #ifdef CONFIG_ENCODERS
@@ -577,12 +577,13 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){
     } else if(IS_INTERLACED(colocated_mb_type)){
         s->mv_type = MV_TYPE_FIELD;
         for(i=0; i<2; i++){
+            int field_select= s->next_picture.ref_index[0][s->block_index[2*i]];
             if(s->top_field_first){
-                time_pp= s->pp_field_time - s->p_field_select_table[i][mb_index] + i;
-                time_pb= s->pb_field_time - s->p_field_select_table[i][mb_index] + i;
+                time_pp= s->pp_field_time - field_select + i;
+                time_pb= s->pb_field_time - field_select + i;
             }else{
-                time_pp= s->pp_field_time + s->p_field_select_table[i][mb_index] - i;
-                time_pb= s->pb_field_time + s->p_field_select_table[i][mb_index] - i;
+                time_pp= s->pp_field_time + field_select - i;
+                time_pb= s->pb_field_time + field_select - i;
             }
             s->mv[0][i][0] = s->p_field_mv_table[i][0][mb_index][0]*time_pb/time_pp + mx;
             s->mv[0][i][1] = s->p_field_mv_table[i][0][mb_index][1]*time_pb/time_pp + my;
@@ -610,7 +611,7 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){
 void ff_h263_update_motion_val(MpegEncContext * s){
     const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
                //FIXME a lot of thet is only needed for !low_delay
-    const int wrap = s->block_wrap[0];
+    const int wrap = s->b8_stride;
     const int xy = s->block_index[0];
     
     s->current_picture.mbskip_table[mb_xy]= s->mb_skiped; 
@@ -631,10 +632,13 @@ void ff_h263_update_motion_val(MpegEncContext * s){
             for(i=0; i<2; i++){
                 s->p_field_mv_table[i][0][mb_xy][0]= s->mv[0][i][0];
                 s->p_field_mv_table[i][0][mb_xy][1]= s->mv[0][i][1];
-                s->p_field_select_table[i][mb_xy]= s->field_select[0][i];
             }
+            s->current_picture.ref_index[0][xy           ]=
+            s->current_picture.ref_index[0][xy        + 1]= s->field_select[0][0];
+            s->current_picture.ref_index[0][xy + wrap    ]=
+            s->current_picture.ref_index[0][xy + wrap + 1]= s->field_select[0][1];
         }
-        
+
         /* no update if 8X8 because it has been done during parsing */
         s->current_picture.motion_val[0][xy][0] = motion_x;
         s->current_picture.motion_val[0][xy][1] = motion_y;
@@ -985,7 +989,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 }
 
                 /* motion vectors: 16x16 mode */
-                h263_pred_motion(s, 0, &pred_x, &pred_y);
+                h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             
                 h263_encode_motion(s, motion_x - pred_x, s->f_code);
                 h263_encode_motion(s, motion_y - pred_y, s->f_code);
@@ -1009,7 +1013,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 }
 
                 /* motion vectors: 16x8 interlaced mode */
-                h263_pred_motion(s, 0, &pred_x, &pred_y);
+                h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                 pred_y /=2;
                 
                 put_bits(&s->pb, 1, s->field_select[0][0]);
@@ -1037,7 +1041,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
 
                 for(i=0; i<4; i++){
                     /* motion vectors: 8x8 mode*/
-                    h263_pred_motion(s, i, &pred_x, &pred_y);
+                    h263_pred_motion(s, i, 0, &pred_x, &pred_y);
 
                     h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][0] - pred_x, s->f_code);
                     h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][1] - pred_y, s->f_code);
@@ -1185,7 +1189,7 @@ void h263_encode_mb(MpegEncContext * s,
             }
 
             /* motion vectors: 16x16 mode */
-            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             
             if (!s->umvplus) {  
                 h263_encode_motion(s, motion_x - pred_x, 1);
@@ -1212,7 +1216,7 @@ void h263_encode_mb(MpegEncContext * s,
 
             for(i=0; i<4; i++){
                 /* motion vectors: 8x8 mode*/
-                h263_pred_motion(s, i, &pred_x, &pred_y);
+                h263_pred_motion(s, i, 0, &pred_x, &pred_y);
 
                 motion_x= s->current_picture.motion_val[0][ s->block_index[i] ][0];
                 motion_y= s->current_picture.motion_val[0][ s->block_index[i] ][1];
@@ -1435,16 +1439,16 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr)
 
     /* find prediction */
     if (n < 4) {
-        x = 2 * s->mb_x + 1 + (n & 1);
-        y = 2 * s->mb_y + 1 + ((n & 2) >> 1);
-        wrap = s->mb_width * 2 + 2;
+        x = 2 * s->mb_x + (n & 1);
+        y = 2 * s->mb_y + ((n & 2) >> 1);
+        wrap = s->b8_stride;
         dc_val = s->dc_val[0];
         ac_val = s->ac_val[0][0];
         scale = s->y_dc_scale;
     } else {
-        x = s->mb_x + 1;
-        y = s->mb_y + 1;
-        wrap = s->mb_width + 2;
+        x = s->mb_x;
+        y = s->mb_y;
+        wrap = s->mb_stride;
         dc_val = s->dc_val[n - 4 + 1];
         ac_val = s->ac_val[n - 4 + 1][0];
         scale = s->c_dc_scale;
@@ -1456,8 +1460,10 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr)
     c = dc_val[(x) + (y - 1) * wrap];
     
     /* No prediction outside GOB boundary */
-    if (s->first_slice_line && ((n < 2) || (n > 3)))
-        c = 1024;
+    if(s->first_slice_line && n!=3){
+        if(n!=2) c= 1024;
+        if(n!=1 && s->mb_x == s->resync_mb_x) a= 1024;
+    }
     pred_dc = 1024;
     /* just DC prediction */
     if (a != 1024 && c != 1024)
@@ -1480,16 +1486,16 @@ static void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n)
 
     /* find prediction */
     if (n < 4) {
-        x = 2 * s->mb_x + 1 + (n & 1);
-        y = 2 * s->mb_y + 1 + (n>> 1);
-        wrap = s->mb_width * 2 + 2;
+        x = 2 * s->mb_x + (n & 1);
+        y = 2 * s->mb_y + (n>> 1);
+        wrap = s->b8_stride;
         dc_val = s->dc_val[0];
         ac_val = s->ac_val[0][0];
         scale = s->y_dc_scale;
     } else {
-        x = s->mb_x + 1;
-        y = s->mb_y + 1;
-        wrap = s->mb_width + 2;
+        x = s->mb_x;
+        y = s->mb_y;
+        wrap = s->mb_stride;
         dc_val = s->dc_val[n - 4 + 1];
         ac_val = s->ac_val[n - 4 + 1][0];
         scale = s->c_dc_scale;
@@ -1560,78 +1566,15 @@ static void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n)
         ac_val1[8 + i] = block[s->dsp.idct_permutation[i   ]];
 }
 
-int16_t *h263_pred_motion(MpegEncContext * s, int block, 
+int16_t *h263_pred_motion(MpegEncContext * s, int block, int dir,
                         int *px, int *py)
 {
-    int xy, wrap;
-    int16_t *A, *B, *C, *mot_val;
-    static const int off[4]= {2, 1, 1, -1};
-
-    wrap = s->block_wrap[0];
-    xy = s->block_index[block];
-
-    mot_val = s->current_picture.motion_val[0][xy];
-
-    A = s->current_picture.motion_val[0][xy - 1];
-    /* special case for first (slice) line */
-    if (s->first_slice_line && block<3) {
-        // we cant just change some MVs to simulate that as we need them for the B frames (and ME)
-        // and if we ever support non rectangular objects than we need to do a few ifs here anyway :(
-        if(block==0){ //most common case
-            if(s->mb_x  == s->resync_mb_x){ //rare
-                *px= *py = 0;
-            }else if(s->mb_x + 1 == s->resync_mb_x && s->h263_pred){ //rare
-                C = s->current_picture.motion_val[0][xy + off[block] - wrap];
-                if(s->mb_x==0){
-                    *px = C[0];
-                    *py = C[1];
-                }else{
-                    *px = mid_pred(A[0], 0, C[0]);
-                    *py = mid_pred(A[1], 0, C[1]);
-                }
-            }else{
-                *px = A[0];
-                *py = A[1];
-            }
-        }else if(block==1){
-            if(s->mb_x + 1 == s->resync_mb_x && s->h263_pred){ //rare
-                C = s->current_picture.motion_val[0][xy + off[block] - wrap];
-                *px = mid_pred(A[0], 0, C[0]);
-                *py = mid_pred(A[1], 0, C[1]);
-            }else{
-                *px = A[0];
-                *py = A[1];
-            }
-        }else{ /* block==2*/
-            B = s->current_picture.motion_val[0][xy - wrap];
-            C = s->current_picture.motion_val[0][xy + off[block] - wrap];
-            if(s->mb_x == s->resync_mb_x) //rare
-                A[0]=A[1]=0;
-    
-            *px = mid_pred(A[0], B[0], C[0]);
-            *py = mid_pred(A[1], B[1], C[1]);
-        }
-    } else {
-        B = s->current_picture.motion_val[0][xy - wrap];
-        C = s->current_picture.motion_val[0][xy + off[block] - wrap];
-        *px = mid_pred(A[0], B[0], C[0]);
-        *py = mid_pred(A[1], B[1], C[1]);
-    }
-    return mot_val;
-}
-
-// identical to above but with s->current_picture->motion_val, the above one will be removed, and this renamed to it
-int16_t *h263_pred_motion2(MpegEncContext * s, int block, int dir,
-                        int *px, int *py)
-{
-    int xy, wrap;
+    int wrap;
     int16_t *A, *B, *C, (*mot_val)[2];
     static const int off[4]= {2, 1, 1, -1};
 
     wrap = s->b8_stride;
-    xy = 2*(s->mb_x + s->mb_y * wrap);
-
-    mot_val = s->current_picture.motion_val[dir] + xy;
+    mot_val = s->current_picture.motion_val[dir] + s->block_index[block];
 
     A = mot_val[ - 1];
     /* special case for first (slice) line */
@@ -1785,7 +1728,7 @@ static void init_mv_penalty_and_fcode(MpegEncContext *s)
             else{
                 int val, bit_size, range, code;
 
-                bit_size = s->f_code - 1;
+                bit_size = f_code - 1;
                 range = 1 << bit_size;
 
                 val=mv;
@@ -2386,6 +2329,7 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     time_div= s->time/s->time_increment_resolution;
     time_mod= s->time%s->time_increment_resolution;
     time_incr= time_div - s->last_time_base;
+    assert(time_incr >= 0);
     while(time_incr--)
         put_bits(&s->pb, 1, 1);
         
@@ -2994,13 +2938,12 @@ void ff_mpeg4_init_partitions(MpegEncContext *s)
     uint8_t *start= pbBufPtr(&s->pb);
     uint8_t *end= s->pb.buf_end;
     int size= end - start;
-    int pb_size = size/3;
-    int pb2_size= size/3;
-    int tex_size= size - pb_size - pb2_size;
+    int pb_size = (((int)start + size/3)&(~3)) - (int)start;
+    int tex_size= (size - 2*pb_size)&(~3);
     
     set_put_bits_buffer_size(&s->pb, pb_size);
     init_put_bits(&s->tex_pb, start + pb_size           , tex_size);
-    init_put_bits(&s->pb2   , start + pb_size + tex_size, pb2_size);
+    init_put_bits(&s->pb2   , start + pb_size + tex_size, pb_size);
 }
 
 void ff_mpeg4_merge_partitions(MpegEncContext *s)
@@ -3165,7 +3108,7 @@ static int mpeg4_decode_video_packet_header(MpegEncContext *s)
             skip_bits(&s->gb, 3); /* intra dc vlc threshold */
 //FIXME dont just ignore everything
             if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE){
-                mpeg4_decode_sprite_trajectory(s);
+                mpeg4_decode_sprite_trajectory(s, &s->gb);
                 av_log(s->avctx, AV_LOG_ERROR, "untested\n");
             }
 
@@ -3196,10 +3139,10 @@ void ff_mpeg4_clean_buffers(MpegEncContext *s)
 {
     int c_wrap, c_xy, l_wrap, l_xy;
 
-    l_wrap= s->block_wrap[0];
-    l_xy= s->mb_y*l_wrap*2 + s->mb_x*2;
-    c_wrap= s->block_wrap[4];
-    c_xy= s->mb_y*c_wrap + s->mb_x;
+    l_wrap= s->b8_stride;
+    l_xy= (2*s->mb_y-1)*l_wrap + s->mb_x*2 - 1;
+    c_wrap= s->mb_stride;
+    c_xy= (s->mb_y-1)*c_wrap + s->mb_x - 1;
 
 #if 0
     /* clean DC */
@@ -3372,7 +3315,7 @@ static int mpeg4_decode_partition_a(MpegEncContext *s){
             }else{ /* P/S_TYPE */
                 int mx, my, pred_x, pred_y, bits;
                 int16_t * const mot_val= s->current_picture.motion_val[0][s->block_index[0]];
-                const int stride= s->block_wrap[0]*2;
+                const int stride= s->b8_stride*2;
 
 try_again:
                 bits= show_bits(&s->gb, 17);
@@ -3430,7 +3373,7 @@ try_again:
                     if ((cbpc & 16) == 0) {
                         /* 16x16 motion prediction */
 
-                        h263_pred_motion(s, 0, &pred_x, &pred_y);
+                        h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                         if(!s->mcsel){
                             mx = h263_decode_motion(s, pred_x, s->f_code);
                             if (mx >= 0xffff)
@@ -3454,7 +3397,7 @@ try_again:
                         int i;
                         s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0;
                         for(i=0;i<4;i++) {
-                            int16_t *mot_val= h263_pred_motion(s, i, &pred_x, &pred_y);
+                            int16_t *mot_val= h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                             mx = h263_decode_motion(s, pred_x, s->f_code);
                             if (mx >= 0xffff)
                                 return -1;
@@ -3708,7 +3651,7 @@ static void preview_obmc(MpegEncContext *s){
     int cbpc, i, pred_x, pred_y, mx, my;
     int16_t *mot_val;
     const int xy= s->mb_x + 1 + s->mb_y * s->mb_stride;
-    const int stride= s->block_wrap[0]*2;
+    const int stride= s->b8_stride*2;
     
     for(i=0; i<4; i++)
         s->block_index[i]+= 2;
@@ -3748,7 +3691,7 @@ static void preview_obmc(MpegEncContext *s){
         if ((cbpc & 16) == 0) {
                 s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0; 
                 /* 16x16 motion prediction */
-                mot_val= h263_pred_motion(s, 0, &pred_x, &pred_y);
+                mot_val= h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                 if (s->umvplus)
                    mx = h263p_decode_umotion(s, pred_x);
                 else
@@ -3766,7 +3709,7 @@ static void preview_obmc(MpegEncContext *s){
         } else {
             s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; 
             for(i=0;i<4;i++) {
-                mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
+                mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                 if (s->umvplus)
                   mx = h263p_decode_umotion(s, pred_x);
                 else
@@ -3858,7 +3801,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0; 
             /* 16x16 motion prediction */
             s->mv_type = MV_TYPE_16X16;
-            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             if (s->umvplus)
                mx = h263p_decode_umotion(s, pred_x);
             else
@@ -3883,7 +3826,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; 
             s->mv_type = MV_TYPE_8X8;
             for(i=0;i<4;i++) {
-                mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
+                mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                 if (s->umvplus)
                   mx = h263p_decode_umotion(s, pred_x);
                 else
@@ -3977,7 +3920,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
 //FIXME UMV
 
             if(USES_LIST(mb_type, 0)){
-                int16_t *mot_val= h263_pred_motion2(s, 0, 0, &mx, &my);
+                int16_t *mot_val= h263_pred_motion(s, 0, 0, &mx, &my);
                 s->mv_dir = MV_DIR_FORWARD;
 
                 mx = h263_decode_motion(s, mx, 1);
@@ -3990,7 +3933,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             }
     
             if(USES_LIST(mb_type, 1)){
-                int16_t *mot_val= h263_pred_motion2(s, 0, 1, &mx, &my);
+                int16_t *mot_val= h263_pred_motion(s, 0, 1, &mx, &my);
                 s->mv_dir |= MV_DIR_BACKWARD;
                 
                 mx = h263_decode_motion(s, mx, 1);
@@ -4145,7 +4088,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s,
                 s->field_select[0][0]= get_bits1(&s->gb);
                 s->field_select[0][1]= get_bits1(&s->gb);
 
-                h263_pred_motion(s, 0, &pred_x, &pred_y);
+                h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                 
                 for(i=0; i<2; i++){
                     mx = h263_decode_motion(s, pred_x, s->f_code);
@@ -4163,7 +4106,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s,
                 s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0; 
                 /* 16x16 motion prediction */
                 s->mv_type = MV_TYPE_16X16;
-                h263_pred_motion(s, 0, &pred_x, &pred_y);
+                h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
                 mx = h263_decode_motion(s, pred_x, s->f_code);
             
                 if (mx >= 0xffff)
@@ -4180,7 +4123,7 @@ int ff_mpeg4_decode_mb(MpegEncContext *s,
             s->current_picture.mb_type[xy]= MB_TYPE_8x8 | MB_TYPE_L0; 
             s->mv_type = MV_TYPE_8X8;
             for(i=0;i<4;i++) {
-                mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
+                mot_val = h263_pred_motion(s, i, 0, &pred_x, &pred_y);
                 mx = h263_decode_motion(s, pred_x, s->f_code);
                 if (mx >= 0xffff)
                     return -1;
@@ -5117,11 +5060,15 @@ int h263_decode_picture_header(MpegEncContext *s)
         s->qscale = get_bits(&s->gb, 5);
     }
 
+    s->mb_width = (s->width  + 15) / 16;
+    s->mb_height = (s->height  + 15) / 16;
+    s->mb_num = s->mb_width * s->mb_height;
+
     /* PEI */
     while (get_bits1(&s->gb) != 0) {
         skip_bits(&s->gb, 8);
     }
-    
+
     if(s->h263_slice_structured){
         if (get_bits1(&s->gb) != 1) {
             av_log(s->avctx, AV_LOG_ERROR, "SEPB1 marker missing\n");
@@ -5181,7 +5128,7 @@ int h263_decode_picture_header(MpegEncContext *s)
     return 0;
 }
 
-static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
+static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb)
 {
     int i;
     int a= 2<<s->sprite_warping_accuracy;
@@ -5201,17 +5148,17 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
         int length;
         int x=0, y=0;
 
-        length= get_vlc(&s->gb, &sprite_trajectory);
+        length= get_vlc(gb, &sprite_trajectory);
         if(length){
-            x= get_xbits(&s->gb, length);
+            x= get_xbits(gb, length);
         }
-        if(!(s->divx_version==500 && s->divx_build==413)) skip_bits1(&s->gb); /* marker bit */
+        if(!(s->divx_version==500 && s->divx_build==413)) skip_bits1(gb); /* marker bit */
         
-        length= get_vlc(&s->gb, &sprite_trajectory);
+        length= get_vlc(gb, &sprite_trajectory);
         if(length){
-            y=get_xbits(&s->gb, length);
+            y=get_xbits(gb, length);
         }
-        skip_bits1(&s->gb); /* marker bit */
+        skip_bits1(gb); /* marker bit */
 //printf("%d %d %d %d\n", x, y, i, s->sprite_warping_accuracy);
         d[i][0]= x;
         d[i][1]= y;
@@ -5840,7 +5787,7 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
      }
  
      if(s->pict_type == S_TYPE && (s->vol_sprite_usage==STATIC_SPRITE || s->vol_sprite_usage==GMC_SPRITE)){
-         mpeg4_decode_sprite_trajectory(s);
+         mpeg4_decode_sprite_trajectory(s, gb);
          if(s->sprite_brightness_change) av_log(s->avctx, AV_LOG_ERROR, "sprite_brightness_change not supported\n");
          if(s->vol_sprite_usage==STATIC_SPRITE) av_log(s->avctx, AV_LOG_ERROR, "static sprite not supported\n");
      }
@@ -6111,8 +6058,10 @@ int flv_h263_decode_picture_header(MpegEncContext *s)
     s->height = height;
 
     s->pict_type = I_TYPE + get_bits(&s->gb, 2);
-    if (s->pict_type > P_TYPE)
+    s->dropable= s->pict_type > P_TYPE;
+    if (s->dropable)
         s->pict_type = P_TYPE;
+
     skip_bits1(&s->gb);	/* deblocking flag */
     s->chroma_qscale= s->qscale = get_bits(&s->gb, 5);
 
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 88db359fe..aaf38b172 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -42,8 +42,8 @@ int ff_h263_decode_init(AVCodecContext *avctx)
     s->workaround_bugs= avctx->workaround_bugs;
 
     // set defaults
+    MPV_decode_defaults(s);
     s->quant_precision=5;
-    s->progressive_sequence=1;
     s->decode_mb= ff_h263_decode_mb;
     s->low_delay= 1;
     avctx->pix_fmt= PIX_FMT_YUV420P;
@@ -551,6 +551,8 @@ retry:
             s->workaround_bugs|= FF_BUG_EDGE;
         }
         
+        if(s->divx_version)
+            s->workaround_bugs|= FF_BUG_HPEL_CHROMA;
 #if 0
         if(s->divx_version==500)
             s->padding_bug_score= 256*256*256*64;
@@ -714,7 +716,8 @@ assert(s->current_picture.pict_type == s->pict_type);
         ff_print_debug_info(s, pict);
     } else {
         *pict= *(AVFrame*)&s->last_picture;
-        ff_print_debug_info(s, pict);
+        if(pict)
+            ff_print_debug_info(s, pict);
     }
 
     /* Return the Picture timestamp as the frame number */
diff --git a/src/libffmpeg/libavcodec/h264.c b/src/libffmpeg/libavcodec/h264.c
index 3f60e35e8..fa254e93b 100644
--- a/src/libffmpeg/libavcodec/h264.c
+++ b/src/libffmpeg/libavcodec/h264.c
@@ -31,6 +31,8 @@
 #include "h264data.h"
 #include "golomb.h"
 
+#include "cabac.h"
+
 #undef NDEBUG
 #include <assert.h>
 
@@ -162,6 +164,8 @@ typedef struct H264Context{
     unsigned int top_samples_available;
     unsigned int topright_samples_available;
     unsigned int left_samples_available;
+    uint8_t (*top_border)[16+2*8];
+    uint8_t left_border[17+2*9];
 
     /**
      * non zero coeff count cache.
@@ -248,9 +252,9 @@ typedef struct H264Context{
     int chroma_offset[2][16][2];
    
     //deblock
-    int disable_deblocking_filter_idc;
-    int slice_alpha_c0_offset_div2;
-    int slice_beta_offset_div2;
+    int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0 
+    int slice_alpha_c0_offset;
+    int slice_beta_offset;
      
     int redundant_pic_count;
     
@@ -282,6 +286,22 @@ typedef struct H264Context{
     GetBitContext *inter_gb_ptr;
     
     DCTELEM mb[16*24] __align8;
+
+    /**
+     * CABAC decoding state
+     */
+    CABACContext cabac;
+    uint8_t      cabac_state[399];
+    int          cabac_init_idc;
+
+    /* 0x100 -> non-zero luma_dc, 0x80/0x40 -> non-zero chroma_dc (cb/cr), 0x?0 -> chroma_cbp (0,1,2), 0x0? -> luma_cbp */
+    uint16_t     *cbp_table;
+    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
+    uint8_t     *chroma_pred_mode_table;
+    int         last_qscale_diff;
+    int16_t     (*mvd_table[2])[2];
+    int16_t     mvd_cache[2][5*8][2];
+
 }H264Context;
 
 static VLC coeff_token_vlc[4];
@@ -295,6 +315,7 @@ static VLC run7_vlc;
 
 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
+static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr);
 
 static inline uint32_t pack16to32(int a, int b){
 #ifdef WORDS_BIGENDIAN
@@ -610,9 +631,52 @@ static inline void fill_caches(H264Context *h, int mb_type){
             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
+
+            if( h->pps.cabac ) {
+                /* XXX ugly: load the neighbouring mvd values into mvd_cache */
+                if(IS_INTER(topleft_type)){
+                    const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
+                }else{
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
+                }
+
+                if(IS_INTER(top_type)){
+                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
+                }else{
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]= 
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]= 
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= 
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
+                }
+                if(IS_INTER(left_type[0])){
+                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
+                }else{
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
+                }
+                if(IS_INTER(left_type[1])){
+                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
+                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
+                }else{
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
+                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
+                }
+                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
+                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
+                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
+                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
+                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
+            }
         }
 //FIXME
-
     }
 #endif
 }
@@ -920,6 +984,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){
                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
                 }
+                if( h->pps.cabac ) {
+                    /* FIXME needed ? */
+                    for(y=0; y<4; y++){
+                        *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
+                        *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
+                    }
+                }
                 for(y=0; y<2; y++){
                     *(uint16_t*)s->current_picture.motion_val[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
                 }
@@ -931,6 +1002,12 @@ static inline void write_back_motion(H264Context *h, int mb_type){
             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
         }
+        if( h->pps.cabac ) {
+            for(y=0; y<4; y++){
+                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
+                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
+            }
+        }
         for(y=0; y<2; y++){
             s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
@@ -988,6 +1065,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c
                 dst[di++]= 0;
                 dst[di++]= 0;
                 si+=3;
+                continue;
             }else //next start code
                 break;
         }
@@ -1001,6 +1079,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c
     return dst;
 }
 
+#if 0
 /**
  * @param src the data which should be escaped
  * @param dst the target buffer, dst+1 == src is allowed as a special case
@@ -1073,6 +1152,7 @@ static void encode_rbsp_trailing(PutBitContext *pb){
     length= (-put_bits_count(pb))&7;
     if(length) put_bits(pb, length, 0);
 }
+#endif
 
 /**
  * identifies the exact end of the bitstream
@@ -1132,6 +1212,7 @@ static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
     }
 }
 
+#if 0
 /**
  * dct tranforms the 16 dc values.
  * @param qp quantization parameter ??? FIXME
@@ -1169,6 +1250,8 @@ static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
         block[stride*10+offset]= (z0 - z3)>>1;
     }
 }
+#endif
+
 #undef xStride
 #undef stride
 
@@ -1194,6 +1277,7 @@ static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
     block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
 }
 
+#if 0
 static void chroma_dc_dct_c(DCTELEM *block){
     const int stride= 16*2;
     const int xStride= 16;
@@ -1214,6 +1298,7 @@ static void chroma_dc_dct_c(DCTELEM *block){
     block[stride*1 + xStride*0]= (a-c);
     block[stride*1 + xStride*1]= (e-b);
 }
+#endif
 
 /**
  * gets the chroma qp.
@@ -1232,18 +1317,6 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 
     block[0] += 32;
-#if 1
-    for(i=0; i<4; i++){
-        const int z0=  block[i + 4*0]     +  block[i + 4*2];
-        const int z1=  block[i + 4*0]     -  block[i + 4*2];
-        const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
-        const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
-
-        block[i + 4*0]= z0 + z3;
-        block[i + 4*1]= z1 + z2;
-        block[i + 4*2]= z1 - z2;
-        block[i + 4*3]= z0 - z3;
-    }
 
     for(i=0; i<4; i++){
         const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
@@ -1251,18 +1324,6 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
         const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
         const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
 
-        dst[0 + i*stride]= cm[ dst[0 + i*stride] + ((z0 + z3) >> 6) ];
-        dst[1 + i*stride]= cm[ dst[1 + i*stride] + ((z1 + z2) >> 6) ];
-        dst[2 + i*stride]= cm[ dst[2 + i*stride] + ((z1 - z2) >> 6) ];
-        dst[3 + i*stride]= cm[ dst[3 + i*stride] + ((z0 - z3) >> 6) ];
-    }
-#else
-    for(i=0; i<4; i++){
-        const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
-        const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
-        const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
-        const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
-
         block[0 + 4*i]= z0 + z3;
         block[1 + 4*i]= z1 + z2;
         block[2 + 4*i]= z1 - z2;
@@ -1280,9 +1341,9 @@ static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
         dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
         dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
     }
-#endif
 }
 
+#if 0
 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
     int i;
     //FIXME try int temp instead of block
@@ -1315,6 +1376,7 @@ static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int st
         block[3*4 + i]=   z3 - 2*z2;
     }
 }
+#endif
 
 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, iam not sure, its very close
 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
@@ -2106,10 +2168,15 @@ static void init_pred_ptrs(H264Context *h){
 
 static void free_tables(H264Context *h){
     av_freep(&h->intra4x4_pred_mode);
+    av_freep(&h->chroma_pred_mode_table);
+    av_freep(&h->cbp_table);
+    av_freep(&h->mvd_table[0]);
+    av_freep(&h->mvd_table[1]);
     av_freep(&h->non_zero_count);
     av_freep(&h->slice_table_base);
+    av_freep(&h->top_border);
     h->slice_table= NULL;
-    
+
     av_freep(&h->mb2b_xy);
     av_freep(&h->mb2b8_xy);
 }
@@ -2124,8 +2191,17 @@ static int alloc_tables(H264Context *h){
     int x,y;
 
     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
+
     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
+    CHECKED_ALLOCZ(h->top_border       , s->mb_width * (16+8+8) * sizeof(uint8_t))
+
+    if( h->pps.cabac ) {
+        CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
+        CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
+        CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
+        CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
+    }
 
     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
     h->slice_table= h->slice_table_base + s->mb_stride + 1;
@@ -2166,6 +2242,8 @@ static int decode_init(AVCodecContext *avctx){
     H264Context *h= avctx->priv_data;
     MpegEncContext * const s = &h->s;
 
+    MPV_decode_defaults(s);
+    
     s->avctx = avctx;
     common_init(h);
 
@@ -2173,7 +2251,6 @@ static int decode_init(AVCodecContext *avctx){
     s->workaround_bugs= avctx->workaround_bugs;
 
     // set defaults
-    s->progressive_sequence=1;
 //    s->decode_mb= ff_h263_decode_mb;
     s->low_delay= 1;
     avctx->pix_fmt= PIX_FMT_YUV420P;
@@ -2205,6 +2282,66 @@ static void frame_start(H264Context *h){
 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
 }
 
+static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+    MpegEncContext * const s = &h->s;
+    int i;
+    /* Copies edge samples of the block at src_y/src_cb/src_cr into h->left_border and h->top_border[s->mb_x]; hl_decode_mb calls this right before filter_mb. */
+    src_y  -=   linesize; /* step up one row, so row i below is row i-1 of the caller's block */
+    src_cb -= uvlinesize;
+    src_cr -= uvlinesize;
+    /* luma: left_border[0] takes byte 15 of the top row stored earlier for this mb_x, left_border[1..16] take column 15 of the 16 rows */
+    h->left_border[0]= h->top_border[s->mb_x][15];
+    for(i=1; i<17; i++){
+        h->left_border[i]= src_y[15+i*  linesize];
+    }
+    /* then overwrite top_border[s->mb_x][0..15] with the caller's luma row 15, moved as two 64-bit words */
+    *(uint64_t*)(h->top_border[s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
+    *(uint64_t*)(h->top_border[s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
+    /* chroma: same scheme on 8 rows of cb and cr, skipped when CODEC_FLAG_GRAY is set */
+    if(!(s->flags&CODEC_FLAG_GRAY)){
+        h->left_border[17  ]= h->top_border[s->mb_x][16+7]; /* cb entries start at left_border[17] */
+        h->left_border[17+9]= h->top_border[s->mb_x][24+7]; /* cr entries start at left_border[26] */
+        for(i=1; i<9; i++){
+            h->left_border[i+17  ]= src_cb[7+i*uvlinesize]; /* column 7 of cb */
+            h->left_border[i+17+9]= src_cr[7+i*uvlinesize]; /* column 7 of cr */
+        }
+        *(uint64_t*)(h->top_border[s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); /* caller's cb row 7 -> top_border bytes 16..23 */
+        *(uint64_t*)(h->top_border[s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); /* caller's cr row 7 -> top_border bytes 24..31 */
+    }
+}
+
+static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+    MpegEncContext * const s = &h->s;
+    int temp8, i;
+    uint64_t temp64;
+    /* Exchanges the samples directly above and to the left of the block at src_* with h->top_border[s->mb_x] / h->left_border; hl_decode_mb calls it with xchg=1 before and xchg=0 after handling an intra macroblock when deblocking_filter is set. */
+    src_y  -=   linesize + 1; /* move one row up and one column left */
+    src_cb -= uvlinesize + 1;
+    src_cr -= uvlinesize + 1;
+    /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives the old value of b only when xchg is non-zero */
+#define XCHG(a,b,t,xchg)\
+t= a;\
+if(xchg)\
+    a= b;\
+b= t;
+    /* luma left column: 17 samples, starting at the corner sample above-left of the block */
+    for(i=0; i<17; i++){
+        XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
+    }
+    /* luma row above the block: the first 8 bytes follow xchg, the second 8 bytes are always swapped */
+    XCHG(*(uint64_t*)(h->top_border[s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
+    XCHG(*(uint64_t*)(h->top_border[s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
+    /* chroma, skipped when CODEC_FLAG_GRAY is set: 9-sample left columns follow xchg, 8-byte top rows are always swapped */
+    if(!(s->flags&CODEC_FLAG_GRAY)){
+        for(i=0; i<9; i++){
+            XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
+            XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
+        }
+        XCHG(*(uint64_t*)(h->top_border[s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
+        XCHG(*(uint64_t*)(h->top_border[s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
+    }
+}
+
 static void hl_decode_mb(H264Context *h){
     MpegEncContext * const s = &h->s;
     const int mb_x= s->mb_x;
@@ -2240,6 +2377,9 @@ static void hl_decode_mb(H264Context *h){
     }
 
     if(IS_INTRA(mb_type)){
+        if(h->deblocking_filter)
+            xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+
         if(!(s->flags&CODEC_FLAG_GRAY)){
             h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
             h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
@@ -2257,6 +2397,9 @@ static void hl_decode_mb(H264Context *h){
                     if(!topright_avail){
                         tr= ptr[3 - linesize]*0x01010101;
                         topright= (uint8_t*) &tr;
+                    }else if(i==5 && h->deblocking_filter){
+                        tr= *(uint32_t*)h->top_border[mb_x+1];
+                        topright= (uint8_t*) &tr;
                     }
 
                     h->pred4x4[ dir ](ptr, topright, linesize);
@@ -2275,6 +2418,8 @@ static void hl_decode_mb(H264Context *h){
             else
                 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
         }
+        if(h->deblocking_filter)
+            xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
     }else if(s->codec_id == CODEC_ID_H264){
         hl_motion(h, dest_y, dest_cb, dest_cr,
                   s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab, 
@@ -2331,10 +2476,10 @@ static void hl_decode_mb(H264Context *h){
             }
         }
     }
-}
-
-static void decode_mb_cabac(H264Context *h){
-//    MpegEncContext * const s = &h->s;
+    if(h->deblocking_filter) {
+        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+        filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr);
+    }
 }
 
 /**
@@ -2979,9 +3124,16 @@ static int decode_slice_header(H264Context *h){
     
     if(s->current_picture.reference)
         decode_ref_pic_marking(h);
-    //FIXME CABAC stuff
 
-    s->qscale = h->pps.init_qp + get_se_golomb(&s->gb); //slice_qp_delta
+    if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
+        h->cabac_init_idc = get_ue_golomb(&s->gb);
+
+    h->last_qscale_diff = 0;
+    s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
+    if(s->qscale<0 || s->qscale>51){
+        av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
+        return -1;
+    }
     //FIXME qscale / qp ... stuff
     if(h->slice_type == SP_TYPE){
         get_bits1(&s->gb); /* sp_for_switch_flag */
@@ -2990,14 +3142,19 @@ static int decode_slice_header(H264Context *h){
         get_se_golomb(&s->gb); /* slice_qs_delta */
     }
 
+    h->deblocking_filter = 1;
+    h->slice_alpha_c0_offset = 0;
+    h->slice_beta_offset = 0;
     if( h->pps.deblocking_filter_parameters_present ) {
-        h->disable_deblocking_filter_idc= get_ue_golomb(&s->gb);
-        if( h->disable_deblocking_filter_idc  !=  1 ) {
-            h->slice_alpha_c0_offset_div2= get_se_golomb(&s->gb);
-            h->slice_beta_offset_div2= get_se_golomb(&s->gb);
+        h->deblocking_filter= get_ue_golomb(&s->gb);
+        if(h->deblocking_filter < 2) 
+            h->deblocking_filter^= 1; // 1<->0
+
+        if( h->deblocking_filter ) {
+            h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
+            h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
         }
-    }else
-        h->disable_deblocking_filter_idc= 0;
+    }
 
 #if 0 //FMO
     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
@@ -3012,7 +3169,7 @@ static int decode_slice_header(H264Context *h){
                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
                h->ref_count[0], h->ref_count[1],
                s->qscale,
-               h->disable_deblocking_filter_idc
+               h->deblocking_filter
                );
     }
 
@@ -3122,7 +3279,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
         if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
 #else        
         if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
-        ? == prefix > 2 or sth
+        /* ? == prefix > 2 or sth */
 #endif
         tprintf("level: %d suffix_length:%d\n", level[i], suffix_length);
     }
@@ -3186,7 +3343,7 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
  * decodes a macroblock
  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
  */
-static int decode_mb(H264Context *h){
+static int decode_mb_cavlc(H264Context *h){
     MpegEncContext * const s = &h->s;
     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
     int mb_type, partition_count, cbp;
@@ -3223,6 +3380,7 @@ static int decode_mb(H264Context *h){
             write_back_motion(h, mb_type);
 
             s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
+            s->current_picture.qscale_table[mb_xy]= s->qscale;
             h->slice_table[ mb_xy ]= h->slice_num;
 
             h->prev_mb_skiped= 1;
@@ -3303,7 +3461,9 @@ decode_intra_mb:
     
         skip_bits(&s->gb, 384); //FIXME check /fix the bitstream readers
         
+        //FIXME deblock filter, non_zero_count_cache init ...
         memset(h->non_zero_count[mb_xy], 16, 16);
+        s->current_picture.qscale_table[mb_xy]= s->qscale;
         
         return 0;
     }
@@ -3607,71 +3767,1470 @@ decode_intra_mb:
             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         }
     }else{
-        memset(&h->non_zero_count_cache[8], 0, 8*5);
+        uint8_t * const nnz= &h->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
     }
+    s->current_picture.qscale_table[mb_xy]= s->qscale;
     write_back_non_zero_count(h);
 
     return 0;
 }
 
-static int decode_slice(H264Context *h){
+static int decode_cabac_mb_type( H264Context *h ) { /* Decode mb_type with CABAC for I and P slices; returns the raw type index, or -1 for any other slice type. */
     MpegEncContext * const s = &h->s;
-    const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
 
-    s->mb_skip_run= -1;
-    
-#if 1
-    for(;;){
-        int ret= decode_mb(h);
-            
-        hl_decode_mb(h);
-        
-        if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ?
-            s->mb_y++;
-            ret= decode_mb(h);
-            
-            hl_decode_mb(h);
-            s->mb_y--;
+    if( h->slice_type == I_TYPE ) {
+        const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
+        int ctx = 0; /* number of left/top neighbours (present when mb_x/mb_y > 0) that are not I4x4 */
+        int mb_type;
+
+        if( s->mb_x > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-1] ) )
+            ctx++;
+        if( s->mb_y > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-s->mb_stride] ) )
+            ctx++;
+
+        if( get_cabac( &h->cabac, &h->cabac_state[3+ctx] ) == 0 )
+            return 0;   /* I4x4 */
+
+        if( get_cabac_terminate( &h->cabac ) )
+            return 25;  /* PCM */
+
+        mb_type = 1;    /* I16x16: index is 1 + 12*luma + 4*chroma + two further bins worth 2 and 1 */
+        if( get_cabac( &h->cabac, &h->cabac_state[3+3] ) )
+            mb_type += 12;  /* cbp_luma != 0 */
+
+        if( get_cabac( &h->cabac, &h->cabac_state[3+4] ) ) {
+            if( get_cabac( &h->cabac, &h->cabac_state[3+5] ) )
+                mb_type += 4 * 2;   /* cbp_chroma == 2 */
+            else
+                mb_type += 4 * 1;   /* cbp_chroma == 1 */
+        }
+        if( get_cabac( &h->cabac, &h->cabac_state[3+6] ) )
+            mb_type += 2;
+        if( get_cabac( &h->cabac, &h->cabac_state[3+7] ) )
+            mb_type += 1;
+        return mb_type;
+
+    } else if( h->slice_type == P_TYPE ) {
+        if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
+            /* P-type: two more bins select one of the four inter types 0..3 */
+            if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
+                if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
+                    return 0; /* P_L0_D16x16; */
+                else
+                    return 3; /* P_8x8; */
+            } else {
+                if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
+                    return 2; /* P_L0_D8x16; */
+                else
+                    return 1; /* P_L0_D16x8; */
+            }
+        } else {
+            int mb_type;
+            /* I-type: same layout as the I slice branch, offset by 5 and using states 17..20 */
+            if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
+                return 5+0; /* I_4x4 */
+            if( get_cabac_terminate( &h->cabac ) )
+                return 5+25; /*I_PCM */
+            mb_type = 5+1;    /* I16x16 */
+            if( get_cabac( &h->cabac, &h->cabac_state[17+1] ) )
+                mb_type += 12;  /* cbp_luma != 0 */
+
+            if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) ) {
+                if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) ) /* second chroma bin reuses state 17+2 */
+                    mb_type += 4 * 2;   /* cbp_chroma == 2 */
+                else
+                    mb_type += 4 * 1;   /* cbp_chroma == 1 */
+            }
+            if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) )
+                mb_type += 2;
+            if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) ) /* last two bins share state 17+3 */
+                mb_type += 1;
+
+            return mb_type;
         }
+    } else {
+        /* TODO: handle the other slice types */
+        return -1;
+    }
+}
 
-        if(ret<0){
-            av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
-            ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+static int decode_cabac_mb_skip( H264Context *h) { /* Decode the skip flag of the current macroblock; returns the decoded bin. */
+    MpegEncContext * const s = &h->s;
+    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+    const int mba_xy = mb_xy - 1; /* left neighbour */
+    const int mbb_xy = mb_xy - s->mb_stride; /* top neighbour */
+    int ctx = 0; /* number of neighbours that exist (mb_x/mb_y > 0) and are not skipped */
+
+    if( s->mb_x > 0 && !IS_SKIP( s->current_picture.mb_type[mba_xy] ) )
+        ctx++;
+    if( s->mb_y > 0 && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ) )
+        ctx++;
+
+    if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
+        return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
+    else /* any other slice type (B) uses the states starting at 24 */
+        return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
+}
 
-            return -1;
+static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) { /* Decode one intra 4x4 prediction mode; pred_mode is the caller's predicted mode. */
+    int mode = 0;
+
+    if( get_cabac( &h->cabac, &h->cabac_state[68] ) ) /* flag set: the predicted mode is used as is */
+        return pred_mode;
+
+    if( get_cabac( &h->cabac, &h->cabac_state[69] ) ) /* three bins on state 69 form a 3-bit value, least significant bit first */
+        mode += 1;
+    if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
+        mode += 2;
+    if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
+        mode += 4;
+    if( mode >= pred_mode ) /* the 3-bit value never names pred_mode itself, so step over it */
+        return mode + 1;
+    else
+        return mode;
+}
+
+static int decode_cabac_mb_chroma_pre_mode( H264Context *h) { /* Decode the chroma prediction mode of the current macroblock; returns 0..3. */
+    MpegEncContext * const s = &h->s;
+    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+    const int mba_xy = mb_xy - 1; /* left neighbour */
+    const int mbb_xy = mb_xy - s->mb_stride; /* top neighbour */
+
+    int ctx = 0; /* number of existing neighbours with a non-zero stored chroma mode */
+
+    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
+    if( s->mb_x > 0 && h->chroma_pred_mode_table[mba_xy] != 0 )
+        ctx++;
+
+    if( s->mb_y > 0 && h->chroma_pred_mode_table[mbb_xy] != 0 )
+        ctx++;
+
+    if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
+        return 0;
+
+    if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 ) /* remaining bins are unary, capped at 3, all on state 64+3 */
+        return 1;
+    if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
+        return 2;
+    else
+        return 3;
+}
+
+static const uint8_t block_idx_x[16] = { /* column (0..3) of each 4x4 block index within the macroblock */
+    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
+};
+static const uint8_t block_idx_y[16] = { /* row (0..3) of each 4x4 block index within the macroblock */
+    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
+};
+static const uint8_t block_idx_xy[4][4] = { /* inverse of the two tables above: [x][y] -> 4x4 block index; dividing by 4 gives the 8x8 block */
+    { 0, 2, 8,  10},
+    { 1, 3, 9,  11},
+    { 4, 6, 12, 14},
+    { 5, 7, 13, 15}
+};
+
+static int decode_cabac_mb_cbp_luma( H264Context *h) { /* Decode the four luma coded-block-pattern bits; returns them as a 4-bit mask (bit i = 8x8 block i). */
+    MpegEncContext * const s = &h->s;
+    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+
+    int cbp = 0;
+    int i8x8;
+
+    h->cbp_table[mb_xy] = 0;  /* FIXME ugly: the table entry of the current MB is used as scratch so later blocks can read earlier bits */
+
+    for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+        int mba_xy = -1; /* MB holding the block to the left, -1 if there is none */
+        int mbb_xy = -1; /* MB holding the block above, -1 if there is none */
+        int x, y;
+        int ctx = 0;
+
+        x = block_idx_x[4*i8x8];
+        y = block_idx_y[4*i8x8];
+
+        if( x > 0 )
+            mba_xy = mb_xy;
+        else if( s->mb_x > 0 )
+            mba_xy = mb_xy - 1;
+
+        if( y > 0 )
+            mbb_xy = mb_xy;
+        else if( s->mb_y > 0 )
+            mbb_xy = mb_xy - s->mb_stride;
+
+        /* No need to test for skip as we put 0 for skip block */
+        if( mba_xy >= 0 ) {
+            int i8x8a = block_idx_xy[(x-1)&0x03][y]/4; /* 8x8 index of the left block; (x-1)&3 wraps into the left MB */
+            if( ((h->cbp_table[mba_xy] >> i8x8a)&0x01) == 0 )
+                ctx++;
         }
-        
-        if(++s->mb_x >= s->mb_width){
-            s->mb_x=0;
-            ff_draw_horiz_band(s, 16*s->mb_y, 16);
-            if(++s->mb_y >= s->mb_height){
-                tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
 
-                if(get_bits_count(&s->gb) == s->gb.size_in_bits){
-                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+        if( mbb_xy >= 0 ) {
+            int i8x8b = block_idx_xy[x][(y-1)&0x03]/4; /* 8x8 index of the block above; (y-1)&3 wraps into the top MB */
+            if( ((h->cbp_table[mbb_xy] >> i8x8b)&0x01) == 0 )
+                ctx += 2;
+        }
 
-                    return 0;
+        if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
+            cbp |= 1 << i8x8;
+            h->cbp_table[mb_xy] = cbp;  /* FIXME ugly: publish the partial pattern for the remaining blocks of this MB */
+        }
+    }
+    return cbp;
+}
+static int decode_cabac_mb_cbp_chroma( H264Context *h) { /* Decode the chroma coded-block-pattern of the current macroblock; returns 0, 1 or 2. */
+    MpegEncContext * const s = &h->s;
+    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+    int ctx;
+    int cbp_a, cbp_b; /* chroma cbp (bits 4-5 of cbp_table) of the left / top MB, -1 if there is none */
+
+    /* No need to test for skip */
+    if( s->mb_x > 0 )
+        cbp_a = (h->cbp_table[mb_xy-1]>>4)&0x03;
+    else
+        cbp_a = -1;
+
+    if( s->mb_y > 0 )
+        cbp_b = (h->cbp_table[mb_xy-s->mb_stride]>>4)&0x03;
+    else
+        cbp_b = -1;
+
+    ctx = 0; /* first bin: chroma cbp != 0, context from neighbours with non-zero chroma cbp */
+    if( cbp_a > 0 ) ctx++;
+    if( cbp_b > 0 ) ctx += 2;
+    if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
+        return 0;
+
+    ctx = 4; /* second bin: picks 2 over 1, context from neighbours whose chroma cbp is 2 */
+    if( cbp_a == 2 ) ctx++;
+    if( cbp_b == 2 ) ctx += 2;
+    if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) )
+        return 2;
+    else
+        return 1;
+}
+static int decode_cabac_mb_dqp( H264Context *h) { /* Decode the signed quantiser delta of the current macroblock. */
+    MpegEncContext * const s = &h->s;
+    int mbn_xy; /* macroblock just before this one in raster order */
+    int   ctx = 0;
+    int   val = 0; /* length of the unary code read below */
+
+    if( s->mb_x > 0 )
+        mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
+    else
+        mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride; /* last MB of the row above; negative for the first MB of the picture */
+
+    if( mbn_xy >= 0 && h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
+        ctx++;
+
+    while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) { /* NOTE(review): no upper bound on val here - confirm a damaged stream cannot keep this loop running */
+        if( ctx < 2 )
+            ctx = 2;
+        else
+            ctx = 3;
+        val++;
+    }
+
+    if( val&0x01 ) /* odd codes are positive: 1 -> +1, 3 -> +2, ... */
+        return (val + 1)/2;
+    else /* even codes are zero or negative: with truncating division 0 -> 0, 2 -> -1, 4 -> -2, ... */
+        return -(val + 1)/2;
+}
+static int decode_cabac_mb_sub_type( H264Context *h ) { /* Decode one sub-macroblock type index (0..3) from up to three bins on states 21..23. */
+    if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
+        return 0;   /* 8x8 */
+    if( !get_cabac( &h->cabac, &h->cabac_state[22] ) ) /* note the inverted test: a 0 bin selects 8x4 */
+        return 1;   /* 8x4 */
+    if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
+        return 2;   /* 4x8 */
+    return 3;       /* 4x4 */
+}
+
+static int decode_cabac_mb_ref( H264Context *h, int list, int n ) { /* Decode a unary-coded reference index for block n of the given list. */
+    int refa = h->ref_cache[list][scan8[n] - 1]; /* cached reference of the block to the left */
+    int refb = h->ref_cache[list][scan8[n] - 8]; /* cached reference of the block above */
+    int ref  = 0;
+    int ctx  = 0;
+
+    if( refa > 0 )
+        ctx++;
+    if( refb > 0 )
+        ctx += 2;
+
+    while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) { /* NOTE(review): ref is not bounded here - confirm callers check it against ref_count */
+        ref++;
+        if( ctx < 4 ) /* second bin uses state 54+4, every later bin 54+5 */
+            ctx = 4;
+        else
+            ctx = 5;
+    }
+    return ref;
+}
+
+static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) { /* Decode one signed motion vector difference component for block n of the given list; l selects the component. */
+    int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
+               abs( h->mvd_cache[list][scan8[n] - 8][l] ); /* sum of the magnitudes cached for the left and top blocks */
+    int ctxbase = (l == 0) ? 40 : 47; /* each component has its own group of states */
+    int ctx;
+    int mvd = 0;
+
+    if( amvd < 3 )
+        ctx = 0;
+    else if( amvd > 32 )
+        ctx = 2;
+    else
+        ctx = 1;
+
+    while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) { /* unary prefix, at most 9 context-coded bins */
+        mvd++;
+        if( ctx < 3 )
+            ctx = 3;
+        else if( ctx < 6 )
+            ctx++;
+    }
+
+    if( mvd >= 9 ) { /* prefix saturated: an escape suffix follows in bypass bins, starting at 1 << 3 */
+        int k = 3;
+        while( k < 24 && get_cabac_bypass( &h->cabac ) ) { /* k is bounded so a damaged stream cannot shift past the width of int or overflow mvd */
+            mvd += 1 << k;
+            k++;
+        }
+        while( k-- ) { /* then k plain bits, most significant first */
+            if( get_cabac_bypass( &h->cabac ) )
+                mvd += 1 << k;
+        }
+    }
+    if( mvd != 0 && get_cabac_bypass( &h->cabac ) ) /* sign bin, only read for non-zero values */
+        return -mvd;
+    return mvd;
+}
+
+
+static int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) { /* Compute the coded-block-flag context offset (0..3, plus 4*cat) for block idx of category cat. */
+    MpegEncContext * const s = &h->s;
+    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
+    int mba_xy = -1; /* MB holding the left neighbour block, -1 if there is none */
+    int mbb_xy = -1; /* MB holding the top neighbour block, -1 if there is none */
+
+    int nza = -1; /* coded state of the left neighbour block, -1 when unknown / not applicable */
+    int nzb = -1; /* same for the top neighbour block */
+    int ctx = 0;
+
+    if( cat == 0 ) { /* luma DC: bit 0x100 of cbp_table marks a coded DC block in an I16x16 neighbour */
+        if( s->mb_x > 0 ) {
+            mba_xy = mb_xy - 1;
+            if( IS_INTRA16x16(s->current_picture.mb_type[mba_xy] ) )
+                    nza = h->cbp_table[mba_xy]&0x100;
+        }
+        if( s->mb_y > 0 ) {
+            mbb_xy = mb_xy - s->mb_stride;
+            if( IS_INTRA16x16(s->current_picture.mb_type[mbb_xy] ) )
+                    nzb = h->cbp_table[mbb_xy]&0x100;
+        }
+    } else if( cat == 1 || cat == 2 ) { /* luma 4x4 blocks: the neighbour may lie in this MB or in the adjacent one */
+        int i8x8a, i8x8b;
+        int x, y;
+
+        x = block_idx_x[idx];
+        y = block_idx_y[idx];
+
+        if( x > 0 )
+            mba_xy = mb_xy;
+        else if( s->mb_x > 0 )
+            mba_xy = mb_xy - 1;
+
+        if( y > 0 )
+            mbb_xy = mb_xy;
+        else if( s->mb_y > 0 )
+            mbb_xy = mb_xy - s->mb_stride;
+
+        /* No need to test for skip */
+        if( mba_xy >= 0 ) {
+            i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
+
+            if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
+                ((h->cbp_table[mba_xy]>>i8x8a)&0x01)) /* test only the cbp bit of the neighbour's own 8x8 block */
+                nza = h->non_zero_count_cache[scan8[idx] - 1];
+        }
+
+        if( mbb_xy >= 0 ) {
+            i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
+
+            if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
+                ((h->cbp_table[mbb_xy]>>i8x8b)&0x01)) /* test only the cbp bit of the neighbour's own 8x8 block */
+                nzb = h->non_zero_count_cache[scan8[idx] - 8];
+        }
+    } else if( cat == 3 ) { /* chroma DC: bits 6+idx of cbp_table hold the per-plane coded flag */
+        if( s->mb_x > 0 ) {
+            mba_xy = mb_xy - 1;
+
+            if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
+                (h->cbp_table[mba_xy]&0x30) )
+                nza = (h->cbp_table[mba_xy]>>(6+idx))&0x01;
+        }
+        if( s->mb_y > 0 ) {
+            mbb_xy = mb_xy - s->mb_stride;
+
+            if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
+                (h->cbp_table[mbb_xy]&0x30) )
+                nzb = (h->cbp_table[mbb_xy]>>(6+idx))&0x01;
+        }
+    } else if( cat == 4 ) { /* chroma AC: idx % 4 is the position inside the 2x2 chroma block grid */
+        int idxc = idx % 4 ;
+        if( idxc == 1 || idxc == 3 )
+            mba_xy = mb_xy;
+        else if( s->mb_x > 0 )
+            mba_xy = mb_xy -1;
+
+        if( idxc == 2 || idxc == 3 )
+            mbb_xy = mb_xy;
+        else if( s->mb_y > 0 )
+            mbb_xy = mb_xy - s->mb_stride;
+
+        if( mba_xy >= 0 &&
+            !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
+            (h->cbp_table[mba_xy]&0x30) == 0x20 ) /* only a chroma cbp of 2 carries AC coefficients */
+            nza = h->non_zero_count_cache[scan8[16+idx] - 1];
+
+        if( mbb_xy >= 0 &&
+            !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
+            (h->cbp_table[mbb_xy]&0x30) == 0x20 )
+            nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
+    }
+
+    if( ( mba_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) || /* a missing neighbour counts as coded for intra MBs */
+        ( mba_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) ) || /* a PCM neighbour always counts as coded */
+          nza > 0 )
+        ctx++;
+
+    if( ( mbb_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) ||
+        ( mbb_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) ) ||
+          nzb > 0 )
+        ctx += 2;
+
+    return ctx + 4 * cat;
+}
+
+static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) { /* Decode one residual block of category cat into block[] via scantable; always returns 0. */
+    const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
+    const uint16_t *qmul= dequant_coeff[qp]; /* dequantisation factors, applied to the AC categories only */
+    static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; /* per-category start inside each state group */
+    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
+    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
+
+    int coeff[16]; /* decoded levels, in scan order of the significant positions */
+
+    int last = 0; /* number of scan positions handled so far */
+    int coeff_count = 0;
+    int nz[16] = {0}; /* significance map: nz[i] != 0 when scan position i holds a coefficient */
+    int i;
+
+    int abslevel1 = 0; /* levels decoded so far with magnitude 1 */
+    int abslevelgt1 = 0; /* levels decoded so far with magnitude > 1 */
+
+    /* cat: 0-> DC 16x16  n = 0
+     *      1-> AC 16x16  n = luma4x4idx
+     *      2-> Luma4x4   n = luma4x4idx
+     *      3-> DC Chroma n = iCbCr
+     *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
+     */
+
+    /* read coded block flag */
+    if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
+        if( cat == 1 || cat == 2 )
+            h->non_zero_count_cache[scan8[n]] = 0;
+        else if( cat == 4 )
+            h->non_zero_count_cache[scan8[16+n]] = 0;
+
+        return 0;
+    }
+
+    while( last < max_coeff - 1 ) { /* significance map: one flag per position, plus a "last" flag after each coefficient */
+        int ctx = FFMIN( last, max_coeff - 2 );
+
+        if( get_cabac( &h->cabac, &h->cabac_state[105+significant_coeff_flag_offset[cat]+ctx] ) == 0 ) {
+            nz[last++] = 0;
+        }
+        else {
+            nz[last++] = 1;
+            coeff_count++;
+            if( get_cabac( &h->cabac, &h->cabac_state[166+last_significant_coeff_flag_offset[cat]+ctx] ) ) {
+                while( last < max_coeff ) {
+                    nz[last++] = 0;
+                }
+                break;
+            }
+        }
+    }
+    if( last == max_coeff -1 ) { /* no "last" flag was seen, so the final position is implicitly significant */
+        nz[last++] = 1;
+        coeff_count++;
+    }
+
+    if( cat == 0 && coeff_count > 0 ) /* publish what later context derivation reads: cbp_table flags for DC, nnz cache for the rest */
+        h->cbp_table[mb_xy] |= 0x100;
+    else if( cat == 1 || cat == 2 )
+        h->non_zero_count_cache[scan8[n]] = coeff_count;
+    else if( cat == 3 && coeff_count > 0 )
+        h->cbp_table[mb_xy] |= 0x40 << n;
+    else if( cat == 4 )
+        h->non_zero_count_cache[scan8[16+n]] = coeff_count;
+
+    for( i = coeff_count - 1; i >= 0; i-- ) { /* levels are read from the last coefficient back to the first */
+        int coeff_abs_m1;
+
+        int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 + 1 )) + coeff_abs_level_m1_offset[cat];
+
+        if( get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) == 0 ) {
+            coeff_abs_m1 = 0;
+        } else {
+            coeff_abs_m1 = 1;
+            ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat];
+            while( coeff_abs_m1 < 14 && get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) ) {
+                coeff_abs_m1++;
+            }
+        }
+
+        if( coeff_abs_m1 >= 14 ) { /* prefix saturated: an escape suffix follows in bypass bins */
+            int j = 0;
+            while( j < 24 && get_cabac_bypass( &h->cabac ) ) { /* j is bounded so a damaged stream cannot shift past the width of int or overflow the level */
+                coeff_abs_m1 += 1 << j;
+                j++;
+            }
+
+            while( j-- ) {
+                if( get_cabac_bypass( &h->cabac ) )
+                    coeff_abs_m1 += 1 << j ;
+            }
+        }
+        if( get_cabac_bypass( &h->cabac ) ) /* sign bin */
+            coeff[i] = -1 *( coeff_abs_m1 + 1 );
+        else
+            coeff[i] = coeff_abs_m1 + 1;
+
+        if( coeff_abs_m1 == 0 )
+            abslevel1++;
+        else
+            abslevelgt1++;
+    }
+
+    if( cat == 0 || cat == 3 ) { /* DC: levels are stored without dequantisation */
+        int j;
+        for( i = 0, j = 0; j < coeff_count; i++ ) {
+            if( nz[i] ) {
+                block[scantable[i]] = coeff[j];
+
+                j++;
+            }
+        }
+
+    } else { /* AC: levels are scaled by the per-position factor */
+        int j;
+        for( i = 0, j = 0; j < coeff_count; i++ ) {
+            if( nz[i] ) {
+                block[scantable[i]] = coeff[j] * qmul[scantable[i]];
+
+                j++;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * decodes a macroblock
+ * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
+ */
+static int decode_mb_cabac(H264Context *h) {
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
+    int mb_type, partition_count, cbp = 0;
+
+    s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?)
+
+    if( h->slice_type == B_TYPE ) {
+        av_log( h->s.avctx, AV_LOG_ERROR, "B-frame not supported with CABAC\n" );
+        return -1;
+    }
+    if( h->sps.mb_aff ) {
+        av_log( h->s.avctx, AV_LOG_ERROR, "Fields not supported with CABAC\n" );
+        return -1;
+    }
+
+    if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
+        /* read skip flags */
+        if( decode_cabac_mb_skip( h ) ) {
+            int mx, my;
+
+            /* skip mb */
+            mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
+
+            memset(h->non_zero_count[mb_xy], 0, 16);
+            memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+#if 0
+            if(h->sps.mb_aff && s->mb_skip_run==0 && (s->mb_y&1)==0){
+                h->mb_field_decoding_flag= get_bits1(&s->gb);
+            }
+            if(h->mb_field_decoding_flag)
+                mb_type|= MB_TYPE_INTERLACED;
+#endif
+
+            fill_caches(h, mb_type); //FIXME check what is needed and what not ...
+            pred_pskip_motion(h, &mx, &my);
+            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+            fill_rectangle(  h->mvd_cache[0][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
+            fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
+            write_back_motion(h, mb_type);
+
+            s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
+            s->current_picture.qscale_table[mb_xy]= s->qscale;
+            h->slice_table[ mb_xy ]= h->slice_num;
+            h->cbp_table[mb_xy] = 0;
+            h->chroma_pred_mode_table[mb_xy] = 0;
+            h->last_qscale_diff = 0;
+
+            h->prev_mb_skiped= 1;
+
+            return 0;
+
+        }
+    }
+    h->prev_mb_skiped = 0;
+
+    if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
+        av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
+        return -1;
+    }
+
+    if( h->slice_type == P_TYPE ) {
+        if( mb_type < 5) {
+            partition_count= p_mb_type_info[mb_type].partition_count;
+            mb_type=         p_mb_type_info[mb_type].type;
+        } else {
+            mb_type -= 5;
+            goto decode_intra_mb;
+        }
+    } else {
+       assert(h->slice_type == I_TYPE);
+decode_intra_mb:
+        partition_count = 0;
+        cbp= i_mb_type_info[mb_type].cbp;
+        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
+        mb_type= i_mb_type_info[mb_type].type;
+    }
+#if 0
+    if(h->mb_field_decoding_flag)
+        mb_type |= MB_TYPE_INTERLACED;
+#endif
+
+    s->current_picture.mb_type[mb_xy]= mb_type;
+    h->slice_table[ mb_xy ]= h->slice_num;
+
+    if(IS_INTRA_PCM(mb_type)) {
+        /* TODO */
+        h->cbp_table[mb_xy] = 0xf +4*2;
+        h->chroma_pred_mode_table[mb_xy] = 0;
+        s->current_picture.qscale_table[mb_xy]= s->qscale;
+        return -1;
+    }
+
+    fill_caches(h, mb_type);
+
+    if( IS_INTRA( mb_type ) ) {
+        if( IS_INTRA4x4( mb_type ) ) {
+            int i;
+            for( i = 0; i < 16; i++ ) {
+                int pred = pred_intra_mode( h, i );
+                h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
+
+                //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
+            }
+            write_back_intra_pred_mode(h);
+            if( check_intra4x4_pred_mode(h) < 0 ) return -1;
+        } else {
+            h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
+            if( h->intra16x16_pred_mode < 0 ) return -1;
+        }
+        h->chroma_pred_mode_table[mb_xy] =
+            h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
+
+        h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
+        if( h->chroma_pred_mode < 0 ) return -1;
+    } else if( partition_count == 4 ) {
+        int i, j, sub_partition_count[4], list, ref[2][4];
+
+        /* Only P-frame */
+        for( i = 0; i < 4; i++ ) {
+            h->sub_mb_type[i] = decode_cabac_mb_sub_type( h );
+            sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
+            h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+        }
+
+        for( list = 0; list < 2; list++ ) {
+            if( h->ref_count[list] > 0 ) {
+                for( i = 0; i < 4; i++ ) {
+                    if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+                        if( h->ref_count[list] > 1 )
+                            ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
+                        else
+                            ref[list][i] = 0;
+                    } else {
+                        ref[list][i] = -1;
+                    }
+                    h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
+                    h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
+                }
+            }
+        }
+
+        for(list=0; list<2; list++){
+
+            for(i=0; i<4; i++){
+                //h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
+                //h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
+
+                if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+                    const int sub_mb_type= h->sub_mb_type[i];
+                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
+                    for(j=0; j<sub_partition_count[i]; j++){
+                        int mpx, mpy;
+                        int mx, my;
+                        const int index= 4*i + block_width*j;
+                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
+                        int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
+                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
+
+                        mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
+                        my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
+                        tprintf("final mv:%d %d\n", mx, my);
+
+                        if(IS_SUB_8X8(sub_mb_type)){
+                            mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
+                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
+                            mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
+                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
+
+                            mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
+                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
+                            mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
+                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
+                        }else if(IS_SUB_8X4(sub_mb_type)){
+                            mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
+                            mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
+
+                            mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
+                            mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
+                        }else if(IS_SUB_4X8(sub_mb_type)){
+                            mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
+                            mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
+
+                            mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
+                            mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
+                        }else{
+                            assert(IS_SUB_4X4(sub_mb_type));
+                            mv_cache[ 0 ][0]= mx;
+                            mv_cache[ 0 ][1]= my;
+
+                            mvd_cache[ 0 ][0]= mx - mpx;
+                            mvd_cache[ 0 ][1]= my - mpy;
+                        }
+                    }
                 }else{
-                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
+                    uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
+                    p[0] = p[1] = p[8] = p[9] = 0;
+                    pd[0]= pd[1]= pd[8]= pd[9]= 0;
+                }
+            }
+        }
+    } else if( !IS_DIRECT(mb_type) ) {
+        int list, mx, my, i, mpx, mpy;
+        if(IS_16X16(mb_type)){
+            for(list=0; list<2; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    if(h->ref_count[list] > 0 ){
+                        const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
+                        fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
+                    }
+                }
+            }
+            for(list=0; list<2; list++){
+                if(IS_DIR(mb_type, 0, list)){
+                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
+
+                    mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
+                    my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
+                    tprintf("final mv:%d %d\n", mx, my);
+
+                    fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
+                    fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
+                }
+            }
+        }
+        else if(IS_16X8(mb_type)){
+            for(list=0; list<2; list++){
+                if(h->ref_count[list]>0){
+                    for(i=0; i<2; i++){
+                        if(IS_DIR(mb_type, i, list)){
+                            const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
+                            fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
+                        }
+                    }
+                }
+            }
+            for(list=0; list<2; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
+                        mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
+                        my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
+                        tprintf("final mv:%d %d\n", mx, my);
 
+                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
+                        fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
+                    }
+                }
+            }
+        }else{
+            assert(IS_8X16(mb_type));
+            for(list=0; list<2; list++){
+                if(h->ref_count[list]>0){
+                    for(i=0; i<2; i++){
+                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
+                            const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
+                            fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
+                        }
+                    }
+                }
+            }
+            for(list=0; list<2; list++){
+                for(i=0; i<2; i++){
+                    if(IS_DIR(mb_type, i, list)){
+                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
+                        mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
+                        my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
+
+                        tprintf("final mv:%d %d\n", mx, my);
+                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
+                        fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
+                    }
+                }
+            }
+        }
+    }
+
+   if( IS_INTER( mb_type ) ) {
+        h->chroma_pred_mode_table[mb_xy] = 0;
+        write_back_motion( h, mb_type );
+   }
+
+    if( !IS_INTRA16x16( mb_type ) ) {
+        cbp  = decode_cabac_mb_cbp_luma( h );
+        cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
+    }
+
+    h->cbp_table[mb_xy] = cbp;
+
+    if( cbp || IS_INTRA16x16( mb_type ) ) {
+        const uint8_t *scan, *dc_scan;
+        int dqp;
+
+        if(IS_INTERLACED(mb_type)){
+            scan= field_scan;
+            dc_scan= luma_dc_field_scan;
+        }else{
+            scan= zigzag_scan;
+            dc_scan= luma_dc_zigzag_scan;
+        }
+
+        h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
+        s->qscale += dqp;
+        if(((unsigned)s->qscale) > 51){
+            if(s->qscale<0) s->qscale+= 52;
+            else            s->qscale-= 52;
+        }
+        h->chroma_qp = get_chroma_qp(h, s->qscale);
+
+        if( IS_INTRA16x16( mb_type ) ) {
+            int i;
+            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
+            if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, s->qscale, 16) < 0)
+                return -1;
+            if( cbp&15 ) {
+                for( i = 0; i < 16; i++ ) {
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
+                    if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, s->qscale, 15) < 0 )
+                        return -1;
+                }
+            } else {
+                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
+            }
+        } else {
+            int i8x8, i4x4;
+            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
+                if( cbp & (1<<i8x8) ) {
+                    for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
+                        const int index = 4*i8x8 + i4x4;
+                        //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
+                        if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, s->qscale, 16) < 0 )
+                            return -1;
+                    }
+                } else {
+                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
+                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
+                }
+            }
+        }
+
+        if( cbp&0x30 ){
+            int c;
+            for( c = 0; c < 2; c++ ) {
+                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+                if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->chroma_qp, 4) < 0)
                     return -1;
+            }
+        }
+
+        if( cbp&0x20 ) {
+            int c, i;
+            for( c = 0; c < 2; c++ ) {
+                for( i = 0; i < 4; i++ ) {
+                    const int index = 16 + 4 * c + i;
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+                    if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->chroma_qp, 15) < 0)
+                        return -1;
                 }
             }
+        } else {
+            uint8_t * const nnz= &h->non_zero_count_cache[0];
+            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         }
-        
-        if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
-            if(get_bits_count(&s->gb) == s->gb.size_in_bits){
-                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+    } else {
+        memset( &h->non_zero_count_cache[8], 0, 8*5 );
+    }
+
+    s->current_picture.qscale_table[mb_xy]= s->qscale;
+    write_back_non_zero_count(h);
+
+    return 0;
+}
+
+
+static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { /* deblock one vertical edge over 16 rows (called on the luma plane by filter_mb); pix[0] is the first sample right of the edge */
+    int i, d;
+    const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); /* shared index into alpha_table and tc0_table */
+    const int alpha = alpha_table[index_a]; /* threshold on the step across the edge */
+    const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )]; /* threshold on the gradient on each side */
+
+    for( i = 0; i < 4; i++ ) { /* one bS entry per group of 4 rows */
+        if( bS[i] == 0 ) { /* strength 0: skip these 4 rows unmodified */
+            pix += 4 * stride;
+            continue;
+        }
+
+        if( bS[i] < 4 ) { /* bS 1..3: clipped filter */
+            const int tc0 = tc0_table[index_a][bS[i] - 1]; /* base clipping bound for this strength */
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ ) {
+                const int p0 = pix[-1]; /* p0..p2: samples left of the edge, nearest first */
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+                const int q0 = pix[0]; /* q0..q2: samples right of the edge, nearest first */
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) { /* filter only when all three differences are below their thresholds */
+                    int tc = tc0; /* bound for the p0/q0 correction; raised by 1 per side that also gets p1/q1 filtered */
+                    int i_delta;
+
+                    if( ABS( p2 - p0 ) < beta ) {
+                        pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 ); /* p1', correction limited to +-tc0 */
+                        tc++;
+                    }
+                    if( ABS( q2 - q0 ) < beta ) {
+                        pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); /* q1', correction limited to +-tc0 */
+                        tc++;
+                    }
+
+                    i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); /* uses the original p/q values read above, not the p1'/q1' just stored */
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += stride; /* next row */
+            }
+        }else{ /* bS >= 4: unclipped averaging filter */
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ ) {
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int p2 = pix[-3];
+
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+                const int q2 = pix[2];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+
+                    if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ /* tighter step test: allows the 3-sample rewrite on each side */
+                        if( ABS( p2 - p0 ) < beta)
+                        {
+                            const int p3 = pix[-4]; /* 4th sample left of the edge, read only on this path */
+                            /* p0', p1', p2' */
+                            pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        } else {
+                            /* p0' */
+                            pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( ABS( q2 - q0 ) < beta)
+                        {
+                            const int q3 = pix[3]; /* 4th sample right of the edge, read only on this path */
+                            /* q0', q1', q2' */
+                            pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        } else {
+                            /* q0' */
+                            pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }else{
+                        /* p0', q0' */
+                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix += stride; /* next row */
+            }
+        }
+    }
+}
+static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { /* deblock one vertical edge over 8 rows (called on the Cb and Cr planes by filter_mb); only p0 and q0 are ever rewritten */
+    int i, d;
+    const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); /* shared index into alpha_table and tc0_table */
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+
+    for( i = 0; i < 4; i++ ) { /* one bS entry per group of 2 rows */
+        if( bS[i] == 0 ) { /* strength 0: skip these 2 rows unmodified */
+            pix += 2 * stride;
+            continue;
+        }
+
+        if( bS[i] < 4 ) { /* bS 1..3: clipped filter */
+            const int tc = tc0_table[index_a][bS[i] - 1] + 1; /* clipping bound is always the table value plus one here */
+            /* 2px edge length (because we use the same bS as the one for luma) */
+            for( d = 0; d < 2; d++ ){
+                const int p0 = pix[-1]; /* p*: left of the edge, q*: right of it */
+                const int p1 = pix[-2];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+                    const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+                    pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix += stride; /* next row */
+            }
+        }else{ /* bS >= 4: unclipped averaging filter */
+            /* 2px edge length (because we use the same bS as the one for luma) */
+            for( d = 0; d < 2; d++ ){
+                const int p0 = pix[-1];
+                const int p1 = pix[-2];
+                const int q0 = pix[0];
+                const int q1 = pix[1];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+
+                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                    pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                }
+                pix += stride; /* next row */
+            }
+        }
+    }
+}
+
+static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { /* deblock one horizontal edge over 16 columns (called on the luma plane by filter_mb); pix[0] is the first sample below the edge */
+    int i, d;
+    const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); /* shared index into alpha_table and tc0_table */
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+    const int pix_next  = stride; /* distance between vertically adjacent samples */
+
+    for( i = 0; i < 4; i++ ) { /* one bS entry per group of 4 columns */
+        if( bS[i] == 0 ) { /* strength 0: skip these 4 columns unmodified */
+            pix += 4;
+            continue;
+        }
+
+        if( bS[i] < 4 ) { /* bS 1..3: clipped filter */
+            const int tc0 = tc0_table[index_a][bS[i] - 1]; /* base clipping bound for this strength */
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ ) {
+                const int p0 = pix[-1*pix_next]; /* p0..p2: samples above the edge, nearest first */
+                const int p1 = pix[-2*pix_next];
+                const int p2 = pix[-3*pix_next];
+                const int q0 = pix[0]; /* q0..q2: samples below the edge, nearest first */
+                const int q1 = pix[1*pix_next];
+                const int q2 = pix[2*pix_next];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+
+                    int tc = tc0; /* bound for the p0/q0 correction; raised by 1 per side that also gets p1/q1 filtered */
+                    int i_delta;
+
+                    if( ABS( p2 - p0 ) < beta ) {
+                        pix[-2*pix_next] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 ); /* p1' */
+                        tc++;
+                    }
+                    if( ABS( q2 - q0 ) < beta ) {
+                        pix[pix_next] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 ); /* q1' */
+                        tc++;
+                    }
+
+                    i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); /* uses the original p/q values read above */
+                    pix[-pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]         = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++; /* next column */
+            }
+        }else{ /* bS >= 4: unclipped averaging filter */
+            /* 4px edge length */
+            for( d = 0; d < 4; d++ ) {
+                const int p0 = pix[-1*pix_next];
+                const int p1 = pix[-2*pix_next];
+                const int p2 = pix[-3*pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*pix_next];
+                const int q2 = pix[2*pix_next];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+
+                    const int p3 = pix[-4*pix_next]; /* read whenever the test above passes, even if the p2' path is not taken */
+                    const int q3 = pix[ 3*pix_next]; /* read whenever the test above passes, even if the q2' path is not taken */
+
+                    if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ /* tighter step test: allows the 3-sample rewrite on each side */
+                        if( ABS( p2 - p0 ) < beta) {
+                            /* p0', p1', p2' */
+                            pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                            pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                            pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                        } else {
+                            /* p0' */
+                            pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        }
+                        if( ABS( q2 - q0 ) < beta) {
+                            /* q0', q1', q2' */
+                            pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                            pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                            pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                        } else {
+                            /* q0' */
+                            pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                        }
+                    }else{
+                        /* p0', q0' */
+                        pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                        pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }
+                pix++; /* next column */
+            }
+        }
+    }
+}
+
+static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) { /* deblock one horizontal edge over 8 columns (called on the Cb and Cr planes by filter_mb); only p0 and q0 are ever rewritten */
+    int i, d;
+    const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 ); /* shared index into alpha_table and tc0_table */
+    const int alpha = alpha_table[index_a];
+    const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
+    const int pix_next  = stride; /* distance between vertically adjacent samples */
+
+    for( i = 0; i < 4; i++ ) /* one bS entry per group of 2 columns */
+    {
+        if( bS[i] == 0 ) { /* strength 0: skip these 2 columns unmodified */
+            pix += 2;
+            continue;
+        }
+
+        if( bS[i] < 4 ) { /* bS 1..3: clipped filter */
+            int tc = tc0_table[index_a][bS[i] - 1] + 1; /* clipping bound is always the table value plus one here */
+            /* 2px edge length (see filter_mb_edgecv) */
+            for( d = 0; d < 2; d++ ) {
+                const int p0 = pix[-1*pix_next]; /* p*: above the edge, q*: below it */
+                const int p1 = pix[-2*pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*pix_next];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+
+                    int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+                    pix[-pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
+                    pix[0]         = clip_uint8( q0 - i_delta );    /* q0' */
+                }
+                pix++; /* next column */
+            }
+        }else{ /* bS >= 4: unclipped averaging filter */
+            /* 2px edge length (see filter_mb_edgecv) */
+            for( d = 0; d < 2; d++ ) {
+                const int p0 = pix[-1*pix_next];
+                const int p1 = pix[-2*pix_next];
+                const int q0 = pix[0];
+                const int q1 = pix[1*pix_next];
+
+                if( ABS( p0 - q0 ) < alpha &&
+                    ABS( p1 - p0 ) < beta &&
+                    ABS( q1 - q0 ) < beta ) {
+
+                    pix[-pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                    pix[0]         = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                }
+                pix++; /* next column */
+            }
+        }
+    }
+}
+
+static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) { /* deblock macroblock (mb_x, mb_y): derive bS per edge segment, then filter the luma and chroma planes in place */
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= mb_x + mb_y*s->mb_stride; /* index of this MB in the per-MB tables */
+    int linesize, uvlinesize;
+    int dir;
+
+    /* FIXME Implement deblocking filter for field MB */
+    if( h->sps.mb_aff ) { /* nothing is filtered when mb_aff is set */
+        return;
+    }
+    linesize = s->linesize;
+    uvlinesize = s->uvlinesize;
+
+    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
+    for( dir = 0; dir < 2; dir++ )
+    {
+        int start = 0;
+        int edge;
+
+        /* test picture boundary */
+        if( ( dir == 0 && mb_x == 0 ) || ( dir == 1 && mb_y == 0 ) ) {
+            start = 1; /* no neighbour MB on that side: skip edge 0 */
+        }
+        /* FIXME test slice boundary */
+        if( h->deblocking_filter == 2 ) { /* placeholder: this mode gets no special handling yet */
+        }
+
+        /* Calculate bS */
+        for( edge = start; edge < 4; edge++ ) { /* edge 0 is the MB boundary, edges 1..3 are internal */
+            /* mbn_xy: neighbour macroblock (how does that work for field MBs?) */
+            int mbn_xy = edge > 0 ? mb_xy : ( dir == 0 ? mb_xy -1 : mb_xy - s->mb_stride ); /* left or upper MB for edge 0, this MB otherwise */
+            int bS[4]; /* one strength per quarter of the edge */
+            int qp;
+
+            if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
+                IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
+                bS[0] = bS[1] = bS[2] = bS[3] = ( edge == 0 ? 4 : 3 ); /* intra on either side: 4 on the MB boundary, 3 inside */
+            } else {
+                int i;
+                for( i = 0; i < 4; i++ ) {
+                    int x = dir == 0 ? edge : i;
+                    int y = dir == 0 ? i    : edge;
+                    int b_idx= 8 + 4 + x + 8*y; /* cache position of block (x,y); cache rows are 8 entries apart */
+                    int bn_idx= b_idx - (dir ? 8:1); /* block across the edge: one row up or one column left */
+
+                    if( h->non_zero_count_cache[b_idx] != 0 ||
+                        h->non_zero_count_cache[bn_idx] != 0 ) {
+                        bS[i] = 2; /* either block has a non-zero count */
+                    }
+                    else if( h->slice_type == P_TYPE ) {
+                        if( h->ref_cache[0][b_idx] != h->ref_cache[0][bn_idx] ||
+                            ABS( h->mv_cache[0][b_idx][0] - h->mv_cache[0][bn_idx][0] ) >= 4 ||
+                            ABS( h->mv_cache[0][b_idx][1] - h->mv_cache[0][bn_idx][1] ) >= 4 )
+                            bS[i] = 1; /* different reference, or a vector component differing by 4 or more */
+                        else
+                            bS[i] = 0;
+                    }
+                    else {
+                        /* FIXME Add support for B frame */
+                        return; /* abandons all remaining edges of this MB */
+                    }
+                }
+
+                if(bS[0]+bS[1]+bS[2]+bS[3] == 0) /* nothing to filter on this edge */
+                    continue;
+            }
 
+            /* Filter edge */
+            qp = ( s->qscale + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; /* rounded mean of the current qscale and the neighbour's stored one */
+            if( dir == 0 ) {
+                filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
+                if( (edge&1) == 0 ) { /* chroma is filtered on even edges only, at 2*edge samples */
+                    int chroma_qp = ( h->chroma_qp +
+                                      get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                    filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
+                    filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
+                }
+            } else {
+                filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
+                if( (edge&1) == 0 ) { /* chroma is filtered on even edges only, at 2*edge rows */
+                    int chroma_qp = ( h->chroma_qp +
+                                      get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                    filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
+                    filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
+                }
+            }
+        }
+    }
+}
+
+static int decode_slice(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
+
+    s->mb_skip_run= -1;
+
+    if( h->pps.cabac ) {
+        int i;
+
+        /* realign */
+        align_get_bits( &s->gb );
+
+        /* init cabac */
+        ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
+        ff_init_cabac_decoder( &h->cabac,
+                               s->gb.buffer + get_bits_count(&s->gb)/8,
+                               ( s->gb.size_in_bits - get_bits_count(&s->gb) ) );
+        /* calculate pre-state */
+        for( i= 0; i < 399; i++ ) {
+            int pre;
+            if( h->slice_type == I_TYPE )
+                pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
+            else
+                pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
+
+            if( pre <= 63 )
+                h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
+            else
+                h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
+        }
+
+        for(;;){
+            int ret = decode_mb_cabac(h);
+            int eos = get_cabac_terminate( &h->cabac ); /* End of Slice flag */
+
+            hl_decode_mb(h);
+
+            /* XXX: useless, as decode_mb_cabac doesn't support that ... */
+            if( ret >= 0 && h->sps.mb_aff ) { //FIXME optimal? or let mb_decode decode 16x32 ?
+                s->mb_y++;
+
+                ret = decode_mb_cabac(h);
+                eos = get_cabac_terminate( &h->cabac );
+
+                hl_decode_mb(h);
+                s->mb_y--;
+            }
+
+            if( ret < 0 ) {
+                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+                return -1;
+            }
+
+            if( ++s->mb_x >= s->mb_width ) {
+                s->mb_x = 0;
+                ff_draw_horiz_band(s, 16*s->mb_y, 16);
+                if( ++s->mb_y >= s->mb_height ) {
+                    tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
+                }
+            }
+
+            if( eos || s->mb_y >= s->mb_height ) {
+                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
                 return 0;
-            }else{
+            }
+#if 0
+            /* TODO test over-reading in cabac code */
+            else if( read too much in h->cabac ) {
+                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+                return -1;
+            }
+#endif
+        }
+
+    } else {
+        for(;;){
+            int ret = decode_mb_cavlc(h);
+
+            hl_decode_mb(h);
+
+            if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ?
+                s->mb_y++;
+                ret = decode_mb_cavlc(h);
+
+                hl_decode_mb(h);
+                s->mb_y--;
+            }
+
+            if(ret<0){
+                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
 
                 return -1;
             }
+
+            if(++s->mb_x >= s->mb_width){
+                s->mb_x=0;
+                ff_draw_horiz_band(s, 16*s->mb_y, 16);
+                if(++s->mb_y >= s->mb_height){
+                    tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
+
+                    if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
+                        ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+
+                        return 0;
+                    }else{
+                        ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+
+                        return -1;
+                    }
+                }
+            }
+
+            if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
+                if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
+                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
+
+                    return 0;
+                }else{
+                    ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
+
+                    return -1;
+                }
+            }
         }
     }
-#endif
+
 #if 0
     for(;s->mb_y < s->mb_height; s->mb_y++){
         for(;s->mb_x < s->mb_width; s->mb_x++){
@@ -4022,7 +5581,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
         
         buf_index += consumed;
 
-        if(h->nal_ref_idc < s->hurry_up)
+        if( s->hurry_up == 1 && h->nal_ref_idc  == 0 )
             continue;
         
         switch(h->nal_unit_type){
@@ -4035,7 +5594,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
             s->data_partitioning = 0;
             
             if(decode_slice_header(h) < 0) return -1;
-            if(h->redundant_pic_count==0)
+            if(h->redundant_pic_count==0 && s->hurry_up < 5 )
                 decode_slice(h);
             break;
         case NAL_DPA:
@@ -4054,7 +5613,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
             init_get_bits(&h->inter_gb, ptr, bit_length);
             h->inter_gb_ptr= &h->inter_gb;
 
-            if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning)
+            if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning && s->hurry_up < 5 )
                 decode_slice(h);
             break;
         case NAL_SEI:
@@ -4099,6 +5658,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
         assert(h->mmco_index==0);
 
     ff_er_frame_end(s);
+
     MPV_frame_end(s);
 
     return buf_index;
diff --git a/src/libffmpeg/libavcodec/h264data.h b/src/libffmpeg/libavcodec/h264data.h
index 40a252253..5480becd4 100644
--- a/src/libffmpeg/libavcodec/h264data.h
+++ b/src/libffmpeg/libavcodec/h264data.h
@@ -528,3 +528,598 @@ static const int quant_coeff[52][16]={
     {   1260,   819,  1260,   819,   819,   524,   819,   524,  1260,   819,  1260,   819,   819,   524,   819,   524,},
     {   1170,   728,  1170,   728,   728,   456,   728,   456,  1170,   728,  1170,   728,   728,   456,   728,   456,},
 };
+
+
+/* Deblocking filter (p153) */
+static const int alpha_table[52] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+    80, 90,101,113,127,144,162,182,203,226,
+    255, 255
+};
+static const int beta_table[52] = {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18
+};
+static const int tc0_table[52][3] = {
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
+    { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
+    { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
+    { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
+    { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
+    { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
+    { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
+    { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 }
+};
+
+/* Cabac pre state table */
+
+static const int cabac_context_init_I[399][2] =
+{
+    /* 0 - 10 */
+    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
+    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
+    { -6,  53 }, { -1, 54 },  {  7,  51 },
+
+    /* 11 - 23 unused for I */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },
+
+    /* 24- 39 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+
+    /* 40 - 53 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 54 - 59 */
+    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
+    { 0, 0 },    { 0, 0 },
+
+    /* 60 - 69 */
+    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+    { 13, 41 },  { 3, 62 },
+
+    /* 70 -> 87 */
+    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
+    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
+    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
+    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
+    { -12, 115 },{ -16, 122 },
+
+    /* 88 -> 104 */
+    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
+    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
+    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
+    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
+    { -22, 125 },
+
+    /* 105 -> 135 */
+    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
+    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
+    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
+    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
+    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
+    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
+    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
+    { 14, 62 },  { -13, 108 },{ -15, 100 },
+
+    /* 136 -> 165 */
+    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
+    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
+    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
+    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
+    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
+    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
+    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
+    { 0, 62 },   { 12, 72 },
+
+    /* 166 -> 196 */
+    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
+    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
+    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
+    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
+    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
+    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
+    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
+    { 0, 89 },   { 26, -19 }, { 22, -17 },
+
+    /* 197 -> 226 */
+    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
+    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
+    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
+    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
+    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
+    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
+    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
+    { 12, 68 },  { 2, 97 },
+
+    /* 227 -> 251 */
+    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
+    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
+    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
+    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
+    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
+    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
+    { -4, 65 },
+
+    /* 252 -> 275 */
+    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
+    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
+    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
+    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
+    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
+    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
+
+    /* 276 a bit special (not used, bypass is used instead) */
+    { 0, 0 },
+
+    /* 277 -> 307 */
+    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
+    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
+    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
+    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
+    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
+    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
+    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
+    { 9, 64 },   { -12, 104 },{ -11, 97 },
+
+    /* 308 -> 337 */
+    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
+    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
+    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
+    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
+    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
+    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
+    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
+    { 5, 64 },   { 12, 70 },
+
+    /* 338 -> 368 */
+    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
+    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
+    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
+    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
+    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
+    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
+    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
+    { -12, 109 },{ 36, -35 }, { 36, -34 },
+
+    /* 369 -> 398 */
+    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
+    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
+    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
+    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
+    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
+    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
+    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
+    { 29, 39 },  { 19, 66 }
+};
+
+static const int cabac_context_init_PB[3][399][2] =
+{
+    /* i_cabac_init_idc == 0 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
+        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
+        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
+        {  17,  50 },
+
+        /* 24 - 39 */
+        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
+        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
+        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
+        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
+
+        /* 40 - 53 */
+        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
+        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
+        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
+        {  -3,  81 }, {   0,  88 },
+
+        /* 54 - 59 */
+        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
+        {  -7,  72 }, {   1,  58 },
+
+        /* 60 - 69 */
+        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
+        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
+        {  13,  41 }, {   3,  62 },
+
+        /* 70 - 104 */
+        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
+        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
+        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
+        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
+        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
+        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
+        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
+        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
+        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
+
+        /* 105 -> 165 */
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
+        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
+        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
+        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
+        {   9,  69 },
+
+        /* 166 - 226 */
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
+        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
+        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
+        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
+        {  -9, 108 },
+
+        /* 227 - 275 */
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
+        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
+        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
+        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
+        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
+        {  -8,  85 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
+        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
+        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
+        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
+        {  26,  43 },
+
+        /* 338 - 398 */
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
+        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
+        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
+        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
+        {  11,  86 },
+
+
+    },
+
+    /* i_cabac_init_idc == 1 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
+        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
+        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
+        {  10,  54 },
+
+        /* 24 - 39 */
+        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
+        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
+        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
+        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
+
+        /* 40 - 53 */
+        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
+        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
+        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
+        {  -7,  86 },{  -5,  95 },
+
+        /* 54 - 59 */
+        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
+        {  -5,  72 },{   0,  61 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
+        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
+        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
+        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
+        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
+        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
+        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
+        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
+        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
+
+        /* 105 -> 165 */
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
+        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
+        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
+        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
+        {   0,  89 },
+
+        /* 166 - 226 */
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
+        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
+        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
+        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
+        { -10, 116 },
+
+        /* 227 - 275 */
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
+        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
+        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
+        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
+        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
+        {  -4,  78 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
+        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
+        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
+        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
+        {  18,  50 },
+
+        /* 338 - 398 */
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
+        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
+        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
+        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
+        {  11,  83 },
+
+    },
+
+    /* i_cabac_init_idc == 2 */
+    {
+        /* 0 - 10 */
+        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
+        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
+        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
+
+        /* 11 - 23 */
+        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
+        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
+        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
+        {  14,  57 },
+
+        /* 24 - 39 */
+        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
+        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
+        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
+        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
+
+        /* 40 - 53 */
+        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
+        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
+        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
+        {  -3,  90 },{  -1,  101 },
+
+        /* 54 - 59 */
+        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
+        {  -7,  50 },{   1,  60 },
+
+        /* 60 - 69 */
+        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
+        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
+        { 13, 41 },  { 3, 62 },
+
+        /* 70 - 104 */
+        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
+        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
+        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
+        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
+        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
+        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
+        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
+        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
+        {   3,  68 }, {  -8,  71 }, { -13,  98 },
+
+        /* 105 -> 165 */
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
+        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
+        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
+        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
+        { -22, 127 },
+
+        /* 166 - 226 */
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
+        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
+        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
+        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
+        { -24, 127 },
+
+        /* 227 - 275 */
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
+        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
+        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
+        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
+        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
+        { -10,  87 },
+
+        /* 276 a bit special (not used, bypass is used instead) */
+        { 0, 0 },
+
+        /* 277 - 337 */
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
+        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
+        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
+        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
+        {  25,  42 },
+
+        /* 338 - 398 */
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
+        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
+        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
+        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
+        {  25,  61 },
+    }
+};
diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am
index d7b2bb4f8..eaa8d0f75 100644
--- a/src/libffmpeg/libavcodec/i386/Makefile.am
+++ b/src/libffmpeg/libavcodec/i386/Makefile.am
@@ -18,7 +18,8 @@ libavcodec_mmx_src = \
 	motion_est_mmx.c \
 	mpegvideo_mmx.c \
 	simple_idct_mmx.c \
-	vp3dsp_mmx.c
+	vp3dsp_mmx.c \
+	vp3dsp_sse2.c
 
 libavcodec_mmx_dummy = libavcodec_mmx_dummy.c
 
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 15dc8eec2..772c9c1f0 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -2147,9 +2147,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
 
         /* VP3 optimized DSP functions */
-        c->vp3_dsp_init = vp3_dsp_init_mmx;
-        c->vp3_idct_put = vp3_idct_put_mmx;
-        c->vp3_idct_add = vp3_idct_add_mmx;
+        if (mm_flags & MM_SSE2) {
+            c->vp3_dsp_init = vp3_dsp_init_sse2;
+            c->vp3_idct_put = vp3_idct_put_sse2;
+            c->vp3_idct_add = vp3_idct_add_sse2;
+        } else {
+            c->vp3_dsp_init = vp3_dsp_init_mmx;
+            c->vp3_idct_put = vp3_idct_put_mmx;
+            c->vp3_idct_add = vp3_idct_add_mmx;
+        }
         
 #ifdef CONFIG_ENCODERS
         c->get_pixels = get_pixels_mmx;
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index 877160773..7af576971 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -50,7 +50,14 @@ static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
 
 static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
 
-static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
+struct 
+{
+ const long fdct_r_row_sse2[4] ATTR_ALIGN(16);
+} fdct_r_row_sse2 ATTR_ALIGN(16)=
+{{
+ RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
+}};
+//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
 
 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
   16384,   16384,   -8867,  -21407, 
@@ -126,7 +133,12 @@ static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff
    6270,   26722,    6270,  -17855, 
 };
 
-static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table  
+struct 
+{
+ const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
+} tab_frw_01234567_sse2 ATTR_ALIGN(16) =
+{{
+//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table  
 #define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
                    C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
                   -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
@@ -252,7 +264,8 @@ TABLE_SSE2
 #define C6 12299
 #define C7 6270
 TABLE_SSE2
-};
+}};
+
 
 static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
 {
@@ -392,7 +405,7 @@ static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
 	"FDCT_ROW_SSE2_H2 80 192 \n\t"
 	"FDCT_ROW_SSE2 80 \n\t"
 	:
-	: "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+	: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
     );
 }
 
diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h
index 7e94cfd9b..ad684bc5a 100644
--- a/src/libffmpeg/libavcodec/i386/mmx.h
+++ b/src/libffmpeg/libavcodec/i386/mmx.h
@@ -240,4 +240,28 @@ typedef	union {
 
 #define	sfence() __asm__ __volatile__ ("sfence\n\t")
 
+/* SSE2 */
+#define	pshufhw_m2r(var,reg,imm)	mmx_m2ri(pshufhw, var, reg, imm)
+#define	pshufhw_r2r(regs,regd,imm)	mmx_r2ri(pshufhw, regs, regd, imm)
+#define	pshuflw_m2r(var,reg,imm)	mmx_m2ri(pshuflw, var, reg, imm)
+#define	pshuflw_r2r(regs,regd,imm)	mmx_r2ri(pshuflw, regs, regd, imm)
+
+#define	pshufd_r2r(regs,regd,imm)	mmx_r2ri(pshufd, regs, regd, imm)
+
+#define	movdqa_m2r(var,reg)		mmx_m2r (movdqa, var, reg)
+#define	movdqa_r2m(reg,var)		mmx_r2m (movdqa, reg, var)
+#define	movdqa_r2r(regs,regd)		mmx_r2r (movdqa, regs, regd)
+#define	movdqu_m2r(var,reg)		mmx_m2r (movdqu, var, reg)
+#define	movdqu_r2m(reg,var)		mmx_r2m (movdqu, reg, var)
+#define	movdqu_r2r(regs,regd)		mmx_r2r (movdqu, regs, regd)
+
+#define	pmullw_r2m(reg,var)		mmx_r2m (pmullw, reg, var)
+
+#define	pslldq_i2r(imm,reg)		mmx_i2r (pslldq, imm, reg)
+#define	psrldq_i2r(imm,reg)		mmx_i2r (psrldq, imm, reg)
+
+#define	punpcklqdq_r2r(regs,regd)	mmx_r2r (punpcklqdq, regs, regd)
+#define	punpckhqdq_r2r(regs,regd)	mmx_r2r (punpckhqdq, regs, regd)
+
+
 #endif /* AVCODEC_I386MMX_H */
diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
index 59020466f..76007a1d1 100644
--- a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
@@ -46,213 +46,216 @@ static uint16_t idct_cosine_table[7] = {
 #define r7 mm7
 
 /* from original comments: The Macro does IDct on 4 1-D Dcts */
-#define BeginIDCT() \
+#define BeginIDCT() { \
     movq_m2r(*I(3), r2); \
     movq_m2r(*C(3), r6); \
     movq_r2r(r2, r4); \
     movq_m2r(*J(5), r7); \
-    pmulhw_r2r(r6, r4); \
+    pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
     movq_m2r(*C(5), r1); \
-    pmulhw_r2r(r7, r6); \
+    pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
     movq_r2r(r1, r5); \
-    pmulhw_r2r(r2, r1); \
+    pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
     movq_m2r(*I(1), r3); \
-    pmulhw_r2r(r7, r5); \
-    movq_m2r(*C(1), r0); \
-    paddw_r2r(r2, r4); \
-    paddw_r2r(r7, r6); \
-    paddw_r2r(r1, r2); \
+    pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
+    movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
+    paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
+    paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
+    paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
     movq_m2r(*J(7), r1); \
-    paddw_r2r(r5, r7); \
-    movq_r2r(r0, r5); \
-    pmulhw_r2r(r3, r0); \
-    paddsw_r2r(r7, r4); \
-    pmulhw_r2r(r1, r5); \
+    paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
+    movq_r2r(r0, r5);         /* r5 = c1 */ \
+    pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
+    paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
+    pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
     movq_m2r(*C(7), r7); \
-    psubsw_r2r(r2, r6); \
-    paddw_r2r(r3, r0); \
-    pmulhw_r2r(r7, r3); \
+    psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
+    paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
+    pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
     movq_m2r(*I(2), r2); \
-    pmulhw_r2r(r1, r7); \
-    paddw_r2r(r1, r5); \
-    movq_r2r(r2, r1); \
-    pmulhw_m2r(*C(2), r2); \
-    psubsw_r2r(r5, r3); \
+    pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
+    paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
+    movq_r2r(r2, r1);         /* r1 = i2 */ \
+    pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
+    psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
     movq_m2r(*J(6), r5); \
-    paddsw_r2r(r7, r0); \
-    movq_r2r(r5, r7); \
-    psubsw_r2r(r4, r0); \
-    pmulhw_m2r(*C(2), r5); \
-    paddw_r2r(r1, r2); \
-    pmulhw_m2r(*C(6), r1); \
-    paddsw_r2r(r4, r4); \
-    paddsw_r2r(r0, r4); \
-    psubsw_r2r(r6, r3); \
-    paddw_r2r(r7, r5); \
-    paddsw_r2r(r6, r6); \
-    pmulhw_m2r(*C(6), r7); \
-    paddsw_r2r(r3, r6); \
-    movq_r2m(r4, *I(1)); \
-    psubsw_r2r(r5, r1); \
+    paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
+    movq_r2r(r5, r7);         /* r7 = i6 */ \
+    psubsw_r2r(r4, r0);       /* r0 = A - C */ \
+    pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
+    paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
+    pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
+    paddsw_r2r(r4, r4);       /* r4 = C + C */ \
+    paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
+    psubsw_r2r(r6, r3);       /* r3 = B - D */ \
+    paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
+    paddsw_r2r(r6, r6);       /* r6 = D + D */ \
+    pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
+    paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
+    movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
+    psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
     movq_m2r(*C(4), r4); \
-    movq_r2r(r3, r5); \
-    pmulhw_r2r(r4, r3); \
-    paddsw_r2r(r2, r7); \
-    movq_r2m(r6, *I(2)); \
-    movq_r2r(r0, r2); \
+    movq_r2r(r3, r5);         /* r5 = B - D */ \
+    pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
+    paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
+    movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
+    movq_r2r(r0, r2);         /* r2 = A - C */ \
     movq_m2r(*I(0), r6); \
-    pmulhw_r2r(r4, r0); \
-    paddw_r2r(r3, r5); \
+    pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
+    paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
     movq_m2r(*J(4), r3); \
-    psubsw_r2r(r1, r5); \
-    paddw_r2r(r0, r2); \
-    psubsw_r2r(r3, r6); \
+    psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
+    paddw_r2r(r0, r2);        /* r2 = A. = c4 * (A - C) */ \
+    psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
     movq_r2r(r6, r0); \
-    pmulhw_r2r(r4, r6); \
-    paddsw_r2r(r3, r3); \
-    paddsw_r2r(r1, r1); \
-    paddsw_r2r(r0, r3); \
-    paddsw_r2r(r5, r1); \
-    pmulhw_r2r(r3, r4); \
-    paddsw_r2r(r0, r6); \
-    psubsw_r2r(r2, r6); \
-    paddsw_r2r(r2, r2); \
-    movq_m2r(*I(1), r0); \
-    paddsw_r2r(r6, r2); \
-    paddw_r2r(r3, r4); \
-    psubsw_r2r(r1, r2);
+    pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
+    paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
+    paddsw_r2r(r1, r1);       /* r1 = H + H */ \
+    paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
+    paddsw_r2r(r5, r1);       /* r1 = H. = B. + H */ \
+    pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
+    paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
+    psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
+    paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
+    movq_m2r(*I(1), r0);      /* r0 = C. */ \
+    paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
+    paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
+    psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
+}
 
 /* RowIDCT gets ready to transpose */
-#define RowIDCT() \
+#define RowIDCT() { \
     \
-    BeginIDCT() \
+    BeginIDCT(); \
     \
-    movq_m2r(*I(2), r3); \
-    psubsw_r2r(r7, r4); \
-    paddsw_r2r(r1, r1); \
-    paddsw_r2r(r7, r7); \
-    paddsw_r2r(r2, r1); \
-    paddsw_r2r(r4, r7); \
-    psubsw_r2r(r3, r4); \
-    psubsw_r2r(r5, r6); \
+    movq_m2r(*I(2), r3);   /* r3 = D. */ \
+    psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
+    paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
+    paddsw_r2r(r7, r7);    /* r7 = G + G */ \
+    paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
+    paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
+    psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
+    paddsw_r2r(r3, r3); \
+    psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
     paddsw_r2r(r5, r5); \
-    paddsw_r2r(r4, r3); \
-    paddsw_r2r(r6, r5); \
-    psubsw_r2r(r0, r7); \
+    paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
+    paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
+    psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
     paddsw_r2r(r0, r0); \
-    movq_r2m(r1, *I(1)); \
-    paddsw_r2r(r7, r0);
+    movq_r2m(r1, *I(1));   /* save R1 */ \
+    paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
+}
 
 /* Column IDCT normalizes and stores final results */
-#define ColumnIDCT() \
+#define ColumnIDCT() { \
     \
-    BeginIDCT() \
+    BeginIDCT(); \
     \
-    paddsw_m2r(*Eight, r2); \
-    paddsw_r2r(r1, r1); \
-    paddsw_r2r(r2, r1); \
-    psraw_i2r(4, r2); \
-    psubsw_r2r(r7, r4); \
-    psraw_i2r(4, r1); \
-    movq_m2r(*I(2), r3); \
-    paddsw_r2r(r7, r7); \
-    movq_r2m(r2, *I(2)); \
-    paddsw_r2r(r4, r7); \
-    movq_r2m(r1, *I(1)); \
-    psubsw_r2r(r3, r4); \
-    paddsw_m2r(*Eight, r4); \
-    paddsw_r2r(r3, r3); \
-    paddsw_r2r(r4, r3); \
-    psraw_i2r(4, r4); \
-    psubsw_r2r(r5, r6); \
-    psraw_i2r(4, r3); \
-    paddsw_m2r(*Eight, r6); \
-    paddsw_r2r(r5, r5); \
-    paddsw_r2r(r6, r5); \
-    psraw_i2r(4, r6); \
-    movq_r2m(r4, *J(4)); \
-    psraw_i2r(4, r5); \
-    movq_r2m(r3, *I(3)); \
-    psubsw_r2r(r0, r7); \
-    paddsw_m2r(*Eight, r7); \
-    paddsw_r2r(r0, r0); \
-    paddsw_r2r(r7, r0); \
-    psraw_i2r(4, r7); \
-    movq_r2m(r6, *J(6)); \
-    psraw_i2r(4, r0); \
-    movq_r2m(r5, *J(5)); \
-    movq_r2m(r7, *J(7)); \
-    movq_r2m(r0, *I(0));
-
+    paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
+    paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
+    paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
+    psraw_i2r(4, r2);          /* r2 = NR2 */ \
+    psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
+    psraw_i2r(4, r1);          /* r1 = NR1 */ \
+    movq_m2r(*I(2), r3);       /* r3 = D. */ \
+    paddsw_r2r(r7, r7);        /* r7 = G + G */ \
+    movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
+    paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
+    movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
+    psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
+    paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
+    paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
+    paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
+    psraw_i2r(4, r4);          /* r4 = NR4 */ \
+    psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
+    psraw_i2r(4, r3);          /* r3 = NR3 */ \
+    paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
+    paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
+    paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
+    psraw_i2r(4, r6);          /* r6 = NR6 */ \
+    movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
+    psraw_i2r(4, r5);          /* r5 = NR5 */ \
+    movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
+    psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
+    paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
+    paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
+    paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
+    psraw_i2r(4, r7);          /* r7 = NR7 */ \
+    movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
+    psraw_i2r(4, r0);          /* r0 = NR0 */ \
+    movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
+    movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
+    movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
+}
 
 /* Following macro does two 4x4 transposes in place.
 
   At entry (we assume):
 
-        r0 = a3 a2 a1 a0
-        I(1) = b3 b2 b1 b0
-        r2 = c3 c2 c1 c0
-        r3 = d3 d2 d1 d0
-
-        r4 = e3 e2 e1 e0
-        r5 = f3 f2 f1 f0
-        r6 = g3 g2 g1 g0
-        r7 = h3 h2 h1 h0
+    r0 = a3 a2 a1 a0
+    I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
 
-   At exit, we have:
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
 
-        I(0) = d0 c0 b0 a0
-        I(1) = d1 c1 b1 a1
-        I(2) = d2 c2 b2 a2
-        I(3) = d3 c3 b3 a3
+  At exit, we have:
 
-        J(4) = h0 g0 f0 e0
-        J(5) = h1 g1 f1 e1
-        J(6) = h2 g2 f2 e2
-        J(7) = h3 g3 f3 e3
+    I(0) = d0 c0 b0 a0
+    I(1) = d1 c1 b1 a1
+    I(2) = d2 c2 b2 a2
+    I(3) = d3 c3 b3 a3
+    
+    J(4) = h0 g0 f0 e0
+    J(5) = h1 g1 f1 e1
+    J(6) = h2 g2 f2 e2
+    J(7) = h3 g3 f3 e3
 
    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
 
    Since r1 is free at entry, we calculate the Js first. */
 
-#define Transpose() \
-    movq_r2r(r4, r1); \
-    punpcklwd_r2r(r5, r4); \
-    movq_r2m(r0, *I(0)); \
-    punpckhwd_r2r(r5, r1); \
-    movq_r2r(r6, r0); \
-    punpcklwd_r2r(r7, r6); \
-    movq_r2r(r4, r5); \
-    punpckldq_r2r(r6, r4); \
-    punpckhdq_r2r(r6, r5); \
-    movq_r2r(r1, r6); \
+#define Transpose() { \
+    movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
+    punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
+    movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
+    punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
+    movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
+    punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
+    movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
+    punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
+    punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
+    movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
     movq_r2m(r4, *J(4)); \
-    punpckhwd_r2r(r7, r0); \
+    punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
     movq_r2m(r5, *J(5)); \
-    punpckhdq_r2r(r0, r6); \
-    movq_m2r(*I(0), r4); \
-    punpckldq_r2r(r0, r1); \
-    movq_m2r(*I(1), r5); \
-    movq_r2r(r4, r0); \
+    punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
+    movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
+    punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
+    movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
+    movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
     movq_r2m(r6, *J(7)); \
-    punpcklwd_r2r(r5, r0); \
+    punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
     movq_r2m(r1, *J(6)); \
-    punpckhwd_r2r(r5, r4); \
-    movq_r2r(r2, r5); \
-    punpcklwd_r2r(r3, r2); \
-    movq_r2r(r0, r1); \
-    punpckldq_r2r(r2, r0); \
-    punpckhdq_r2r(r2, r1); \
-    movq_r2r(r4, r2); \
+    punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
+    movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
+    punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
+    movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
+    punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
+    punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
+    movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
     movq_r2m(r0, *I(0)); \
-    punpckhwd_r2r(r3, r5); \
+    punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
     movq_r2m(r1, *I(1)); \
-    punpckhdq_r2r(r5, r4); \
-    punpckldq_r2r(r5, r2); \
+    punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
+    punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
     movq_r2m(r4, *I(3)); \
-    movq_r2m(r2, *I(2));
-
+    movq_r2m(r2, *I(2)); \
+}
 
 void vp3_dsp_init_mmx(void)
 {
@@ -263,7 +266,7 @@ void vp3_dsp_init_mmx(void)
         idct_constants[--j] = 0;
     } while (j);
 
-    idct_constants[0]  = idct_constants[5] = 
+    idct_constants[0]  = idct_constants[5] =
     idct_constants[10] = idct_constants[15] = 65535;
 
     j = 1;
@@ -272,7 +275,7 @@ void vp3_dsp_init_mmx(void)
         p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
     } while (++j <= 7);
 
-    idct_constants[44] = idct_constants[45] = 
+    idct_constants[44] = idct_constants[45] =
     idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
 }
 
@@ -292,254 +295,240 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
 #define C(x) (idct_constants + 16 + (x - 1) * 4)
 #define Eight (idct_constants + 44)
 
-    movq_m2r(*input_data, r0);
-    pmullw_m2r(*dequant_matrix, r0);
-    movq_m2r(*(input_data + 8), r1);
-    pmullw_m2r(*(dequant_matrix + 8), r1);
-    movq_m2r(*M(0), r2);
-    movq_r2r(r0, r3);
-    movq_m2r(*(input_data + 4), r4);
-    psrlq_i2r(16, r0);
-    pmullw_m2r(*(dequant_matrix + 4), r4);
-    pand_r2r(r2, r3);
-    movq_r2r(r0, r5);
-    movq_r2r(r1, r6);
-    pand_r2r(r2, r5);
-    psllq_i2r(32, r6);
-    movq_m2r(*M(3), r7);
-    pxor_r2r(r5, r0);
-    pand_r2r(r6, r7);
-    por_r2r(r3, r0);
-    pxor_r2r(r7, r6);
-    por_r2r(r7, r0);
-    movq_m2r(*M(3), r7);
-    movq_r2r(r4, r3);
-    movq_r2m(r0, *output_data);
-
-    pand_r2r(r2, r3);
-    movq_m2r(*(input_data + 16), r0);
-    psllq_i2r(16, r3);
-    pmullw_m2r(*(dequant_matrix + 16), r0);
-    pand_r2r(r1, r7);
-    por_r2r(r3, r5);
-    por_r2r(r6, r7);
-    movq_m2r(*(input_data + 12), r3);
-    por_r2r(r5, r7);
-    pmullw_m2r(*(dequant_matrix + 12), r3);
-    psrlq_i2r(16, r4);
-    movq_r2m(r7, *(output_data + 8));
-
-    movq_r2r(r4, r5);
-    movq_r2r(r0, r7);
-    psrlq_i2r(16, r4);
-    psrlq_i2r(48, r7);
-    movq_r2r(r2, r6);
-    pand_r2r(r2, r5);
-    pand_r2r(r4, r6);
-    movq_r2m(r7, *(output_data + 40));
-
-    pxor_r2r(r6, r4);
-    psrlq_i2r(32, r1);
-    por_r2r(r5, r4);
-    movq_m2r(*M(3), r7);
-    pand_r2r(r2, r1);
-    movq_m2r(*(input_data + 24), r5);
-    psllq_i2r(16, r0);
-    pmullw_m2r(*(dequant_matrix + 24), r5);
-    pand_r2r(r0, r7);
-    movq_r2m(r1, *(output_data + 32));
-
-    por_r2r(r4, r7);
-    movq_r2r(r3, r4);
-    pand_r2r(r2, r3);
-    movq_m2r(*M(2), r1);
-    psllq_i2r(32, r3);
-    por_r2r(r3, r7);
-    movq_r2r(r5, r3);
-    psllq_i2r(48, r3);
-    pand_r2r(r0, r1);
-    movq_r2m(r7, *(output_data + 16));
-
-    por_r2r(r3, r6);
-    movq_m2r(*M(1), r7);
-    por_r2r(r1, r6);
-    movq_m2r(*(input_data + 28), r1);
-    pand_r2r(r4, r7);
-    pmullw_m2r(*(dequant_matrix + 28), r1);
-    por_r2r(r6, r7);
-    pand_m2r(*M(1), r0);
-    psrlq_i2r(32, r4);
-    movq_r2m(r7, *(output_data + 24));
-
-    movq_r2r(r4, r6);
-    movq_m2r(*M(3), r7);
-    pand_r2r(r2, r4);
-    movq_m2r(*M(1), r3);
-    pand_r2r(r1, r7);
-    pand_r2r(r5, r3);
-    por_r2r(r4, r0);
-    psllq_i2r(16, r3);
-    por_r2r(r0, r7);
-    movq_m2r(*M(2), r4);
-    por_r2r(r3, r7);
-    movq_m2r(*(input_data + 40), r0);
-    movq_r2r(r4, r3);
-    pmullw_m2r(*(dequant_matrix + 40), r0);
-    pand_r2r(r5, r4);
-    movq_r2m(r7, *(output_data + 4));
-
-    por_r2r(r4, r6);
-    movq_r2r(r3, r4);
-    psrlq_i2r(16, r6);
-    movq_r2r(r0, r7);
-    pand_r2r(r1, r4);
-    psllq_i2r(48, r7);
-    por_r2r(r4, r6);
-    movq_m2r(*(input_data + 44), r4);
-    por_r2r(r6, r7);
-    pmullw_m2r(*(dequant_matrix + 44), r4);
-    psrlq_i2r(16, r3);
-    movq_r2m(r7, *(output_data + 12));
-
-    pand_r2r(r1, r3);
-    psrlq_i2r(48, r5);
-    pand_r2r(r2, r1);
-    movq_m2r(*(input_data + 52), r6);
-    por_r2r(r3, r5);
-    pmullw_m2r(*(input_data + 52), r6);
-    psrlq_i2r(16, r0);
-    movq_r2r(r4, r7);
-    movq_r2r(r2, r3);
-    psllq_i2r(48, r7);
-    pand_r2r(r0, r3);
-    pxor_r2r(r3, r0);
-    psllq_i2r(32, r3);
-    por_r2r(r5, r7);
-    movq_r2r(r6, r5);
-    pand_m2r(*M(1), r6);
-    por_r2r(r3, r7);
-    psllq_i2r(32, r6);
-    por_r2r(r1, r0);
-    movq_r2m(r7, *(output_data + 20));
-
-    por_r2r(r6, r0);
-    movq_m2r(*(input_data + 60), r7);
-    movq_r2r(r5, r6);
-    pmullw_m2r(*(input_data + 60), r7);
-    psrlq_i2r(32, r5);
-    pand_r2r(r2, r6);
-    movq_r2r(r5, r1);
-    movq_r2m(r0, *(output_data + 28));
-
-    pand_r2r(r2, r1);
-    movq_m2r(*(input_data + 56), r0);
-    movq_r2r(r7, r3);
-    pmullw_m2r(*(dequant_matrix + 56), r0);
-    psllq_i2r(16, r3);
-    pand_m2r(*M(3), r7);
-    pxor_r2r(r1, r5);
-    por_r2r(r5, r6);
-    movq_r2r(r3, r5);
-    pand_m2r(*M(3), r5);
-    por_r2r(r1, r7);
-    movq_m2r(*(input_data + 48), r1);
-    pxor_r2r(r5, r3);
-    pmullw_m2r(*(dequant_matrix + 48), r1);
-    por_r2r(r3, r7);
-    por_r2r(r5, r6);
-    movq_r2r(r0, r5);
-    movq_r2m(r7, *(output_data + 60));
-
-    psrlq_i2r(16, r5);
-    pand_m2r(*M(2), r5);
-    movq_r2r(r0, r7);
-    por_r2r(r5, r6);
-    pand_r2r(r2, r0);
-    pxor_r2r(r0, r7);
-    psllq_i2r(32, r0);
-    movq_r2m(r6, *(output_data + 52));
-
-    psrlq_i2r(16, r4);
-    movq_m2r(*(input_data + 36), r5);
-    psllq_i2r(16, r7);
-    pmullw_m2r(*(dequant_matrix + 36), r5);
-    movq_r2r(r7, r6);
-    movq_m2r(*M(2), r3);
-    psllq_i2r(16, r6);
-    pand_m2r(*M(3), r7);
-    pand_r2r(r1, r3);
-    por_r2r(r0, r7);
-    movq_r2r(r1, r0);
-    pand_m2r(*M(3), r1);
-    por_r2r(r3, r6);
-    movq_r2r(r4, r3);
-    psrlq_i2r(32, r1);
-    pand_r2r(r2, r3);
-    por_r2r(r1, r7);
-    por_r2r(r3, r7);
-    movq_r2r(r4, r3);
-    pand_m2r(*M(1), r3);
-    movq_r2r(r5, r1);
-    movq_r2m(r7, *(output_data + 44));
-
-    psrlq_i2r(48, r5);
-    movq_m2r(*(input_data + 32), r7);
-    por_r2r(r3, r6);
-    pmullw_m2r(*(dequant_matrix + 32), r7);
-    por_r2r(r5, r6);
-    pand_m2r(*M(2), r4);
-    psllq_i2r(32, r0);
-    movq_r2m(r6, *(output_data + 36));
-
-    movq_r2r(r0, r6);
-    pand_m2r(*M(3), r0);
-    psllq_i2r(16, r6);
-    movq_m2r(*(input_data + 20), r5);
-    movq_r2r(r1, r3);
-    pmullw_m2r(*(dequant_matrix + 40), r5);
-    psrlq_i2r(16, r1);
-    pand_m2r(*M(1), r1);
-    por_r2r(r4, r0);
-    pand_r2r(r7, r2);
-    por_r2r(r1, r0);
-    por_r2r(r2, r0);
-    psllq_i2r(16, r3);
-    movq_r2r(r3, r4);
-    movq_r2r(r5, r2);
-    movq_r2m(r0, *(output_data + 56));
-
-    psrlq_i2r(48, r2);
-    pand_m2r(*M(2), r4);
-    por_r2r(r2, r6);
-    movq_m2r(*M(1), r2);
-    por_r2r(r4, r6);
-    pand_r2r(r7, r2);
-    psllq_i2r(32, r3);
-    por_m2r(*(output_data + 40), r3);
-
-    por_r2r(r2, r6);
-    movq_m2r(*M(3), r2);
-    psllq_i2r(16, r5);
-    movq_r2m(r6, *(output_data + 48));
-
-    pand_r2r(r5, r2);
-    movq_m2r(*M(2), r6);
-    pxor_r2r(r2, r5);
-    pand_r2r(r7, r6);
-    psrlq_i2r(32, r2);
-    pand_m2r(*M(3), r7);
-    por_r2r(r2, r3);
-    por_m2r(*(output_data + 32), r7);
-
-    por_r2r(r3, r6);
-    por_r2r(r5, r7);
-    movq_r2m(r6, *(output_data + 40));
-    movq_r2m(r7, *(output_data + 32));
+    unsigned char *input_bytes = (unsigned char *)input_data;
+    unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
+    unsigned char *output_data_bytes = (unsigned char *)output_data;
+
+    movq_m2r(*(input_bytes), r0);
+    pmullw_m2r(*(dequant_matrix_bytes), r0);       /* r0 = 03 02 01 00 */
+    movq_m2r(*(input_bytes+16), r1);
+    pmullw_m2r(*(dequant_matrix_bytes+16), r1);    /* r1 = 13 12 11 10 */
+    movq_m2r(*M(0), r2);                           /* r2 = __ __ __ FF */
+    movq_r2r(r0, r3);                              /* r3 = 03 02 01 00 */
+    movq_m2r(*(input_bytes+8), r4);
+    psrlq_i2r(16, r0);                             /* r0 = __ 03 02 01 */
+    pmullw_m2r(*(dequant_matrix_bytes+8), r4);     /* r4 = 07 06 05 04 */
+    pand_r2r(r2, r3);                              /* r3 = __ __ __ 00 */
+    movq_r2r(r0, r5);                              /* r5 = __ 03 02 01 */
+    movq_r2r(r1, r6);                              /* r6 = 13 12 11 10 */
+    pand_r2r(r2, r5);                              /* r5 = __ __ __ 01 */
+    psllq_i2r(32, r6);                             /* r6 = 11 10 __ __ */
+    movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
+    pxor_r2r(r5, r0);                              /* r0 = __ 03 02 __ */
+    pand_r2r(r6, r7);                              /* r7 = 11 __ __ __ */
+    por_r2r(r3, r0);                               /* r0 = __ 03 02 00 */
+    pxor_r2r(r7, r6);                              /* r6 = __ 10 __ __ */
+    por_r2r(r7, r0);                               /* r0 = 11 03 02 00 = R0 */
+    movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
+    movq_r2r(r4, r3);                              /* r3 = 07 06 05 04 */
+    movq_r2m(r0, *(output_data_bytes));            /* write R0 = r0 */
+    pand_r2r(r2, r3);                              /* r3 = __ __ __ 04 */
+    movq_m2r(*(input_bytes+32), r0);
+    psllq_i2r(16, r3);                             /* r3 = __ __ 04 __ */
+    pmullw_m2r(*(dequant_matrix_bytes+32), r0);    /* r0 = 23 22 21 20 */
+    pand_r2r(r1, r7);                              /* r7 = 13 __ __ __ */
+    por_r2r(r3, r5);                               /* r5 = __ __ 04 01 */
+    por_r2r(r6, r7);                               /* r7 = 13 10 __ __ */
+    movq_m2r(*(input_bytes+24), r3);
+    por_r2r(r5, r7);                               /* r7 = 13 10 04 01 = R1 */
+    pmullw_m2r(*(dequant_matrix_bytes+24), r3);    /* r3 = 17 16 15 14 */
+    psrlq_i2r(16, r4);                             /* r4 = __ 07 06 05 */
+    movq_r2m(r7, *(output_data_bytes+16));         /* write R1 = r7 */
+    movq_r2r(r4, r5);                              /* r5 = __ 07 06 05 */
+    movq_r2r(r0, r7);                              /* r7 = 23 22 21 20 */
+    psrlq_i2r(16, r4);                             /* r4 = __ __ 07 06 */
+    psrlq_i2r(48, r7);                             /* r7 = __ __ __ 23 */
+    movq_r2r(r2, r6);                              /* r6 = __ __ __ FF */
+    pand_r2r(r2, r5);                              /* r5 = __ __ __ 05 */
+    pand_r2r(r4, r6);                              /* r6 = __ __ __ 06 */
+    movq_r2m(r7, *(output_data_bytes+80));      /* partial R9 = __ __ __ 23 */
+    pxor_r2r(r6, r4);                              /* r4 = __ __ 07 __ */
+    psrlq_i2r(32, r1);                             /* r1 = __ __ 13 12 */
+    por_r2r(r5, r4);                               /* r4 = __ __ 07 05 */
+    movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
+    pand_r2r(r2, r1);                              /* r1 = __ __ __ 12 */
+    movq_m2r(*(input_bytes+48), r5);
+    psllq_i2r(16, r0);                             /* r0 = 22 21 20 __ */
+    pmullw_m2r(*(dequant_matrix_bytes+48), r5);    /* r5 = 33 32 31 30 */
+    pand_r2r(r0, r7);                              /* r7 = 22 __ __ __ */
+    movq_r2m(r1, *(output_data_bytes+64));      /* partial R8 = __ __ __ 12 */
+    por_r2r(r4, r7);                               /* r7 = 22 __ 07 05 */
+    movq_r2r(r3, r4);                              /* r4 = 17 16 15 14 */
+    pand_r2r(r2, r3);                              /* r3 = __ __ __ 14 */
+    movq_m2r(*M(2), r1);                           /* r1 = __ FF __ __ */
+    psllq_i2r(32, r3);                             /* r3 = __ 14 __ __ */
+    por_r2r(r3, r7);                               /* r7 = 22 14 07 05 = R2 */
+    movq_r2r(r5, r3);                              /* r3 = 33 32 31 30 */
+    psllq_i2r(48, r3);                             /* r3 = 30 __ __ __ */
+    pand_r2r(r0, r1);                              /* r1 = __ 21 __ __ */
+    movq_r2m(r7, *(output_data_bytes+32));         /* write R2 = r7 */
+    por_r2r(r3, r6);                               /* r6 = 30 __ __ 06 */
+    movq_m2r(*M(1), r7);                           /* r7 = __ __ FF __ */
+    por_r2r(r1, r6);                               /* r6 = 30 21 __ 06 */
+    movq_m2r(*(input_bytes+56), r1);
+    pand_r2r(r4, r7);                              /* r7 = __ __ 15 __ */
+    pmullw_m2r(*(dequant_matrix_bytes+56), r1);    /* r1 = 37 36 35 34 */
+    por_r2r(r6, r7);                               /* r7 = 30 21 15 06 = R3 */
+    pand_m2r(*M(1), r0);                           /* r0 = __ __ 20 __ */
+    psrlq_i2r(32, r4);                             /* r4 = __ __ 17 16 */
+    movq_r2m(r7, *(output_data_bytes+48));         /* write R3 = r7 */
+    movq_r2r(r4, r6);                              /* r6 = __ __ 17 16 */
+    movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
+    pand_r2r(r2, r4);                              /* r4 = __ __ __ 16 */
+    movq_m2r(*M(1), r3);                           /* r3 = __ __ FF __ */
+    pand_r2r(r1, r7);                              /* r7 = 37 __ __ __ */
+    pand_r2r(r5, r3);                              /* r3 = __ __ 31 __ */
+    por_r2r(r4, r0);                               /* r0 = __ __ 20 16 */
+    psllq_i2r(16, r3);                             /* r3 = __ 31 __ __ */
+    por_r2r(r0, r7);                               /* r7 = 37 __ 20 16 */
+    movq_m2r(*M(2), r4);                           /* r4 = __ FF __ __ */
+    por_r2r(r3, r7);                               /* r7 = 37 31 20 16 = R4 */
+    movq_m2r(*(input_bytes+80), r0);
+    movq_r2r(r4, r3);                              /* r3 = __ FF __ __ */
+    pmullw_m2r(*(dequant_matrix_bytes+80), r0);    /* r0 = 53 52 51 50 */
+    pand_r2r(r5, r4);                              /* r4 = __ 32 __ __ */
+    movq_r2m(r7, *(output_data_bytes+8));          /* write R4 = r7 */
+    por_r2r(r4, r6);                               /* r6 = __ 32 17 16 */
+    movq_r2r(r3, r4);                              /* r4 = __ FF __ __ */
+    psrlq_i2r(16, r6);                             /* r6 = __ __ 32 17 */
+    movq_r2r(r0, r7);                              /* r7 = 53 52 51 50 */
+    pand_r2r(r1, r4);                              /* r4 = __ 36 __ __ */
+    psllq_i2r(48, r7);                             /* r7 = 50 __ __ __ */
+    por_r2r(r4, r6);                               /* r6 = __ 36 32 17 */
+    movq_m2r(*(input_bytes+88), r4);
+    por_r2r(r6, r7);                               /* r7 = 50 36 32 17 = R5 */
+    pmullw_m2r(*(dequant_matrix_bytes+88), r4);    /* r4 = 57 56 55 54 */
+    psrlq_i2r(16, r3);                             /* r3 = __ __ FF __ */
+    movq_r2m(r7, *(output_data_bytes+24));         /* write R5 = r7 */
+    pand_r2r(r1, r3);                              /* r3 = __ __ 35 __ */
+    psrlq_i2r(48, r5);                             /* r5 = __ __ __ 33 */
+    pand_r2r(r2, r1);                              /* r1 = __ __ __ 34 */
+    movq_m2r(*(input_bytes+104), r6);
+    por_r2r(r3, r5);                               /* r5 = __ __ 35 33 */
+    pmullw_m2r(*(dequant_matrix_bytes+104), r6);   /* r6 = 67 66 65 64 */
+    psrlq_i2r(16, r0);                             /* r0 = __ 53 52 51 */
+    movq_r2r(r4, r7);                              /* r7 = 57 56 55 54 */
+    movq_r2r(r2, r3);                              /* r3 = __ __ __ FF */
+    psllq_i2r(48, r7);                             /* r7 = 54 __ __ __ */
+    pand_r2r(r0, r3);                              /* r3 = __ __ __ 51 */
+    pxor_r2r(r3, r0);                              /* r0 = __ 53 52 __ */
+    psllq_i2r(32, r3);                             /* r3 = __ 51 __ __ */
+    por_r2r(r5, r7);                               /* r7 = 54 __ 35 33 */
+    movq_r2r(r6, r5);                              /* r5 = 67 66 65 64 */
+    pand_m2r(*M(1), r6);                           /* r6 = __ __ 65 __ */
+    por_r2r(r3, r7);                               /* r7 = 54 51 35 33 = R6 */
+    psllq_i2r(32, r6);                             /* r6 = 65 __ __ __ */
+    por_r2r(r1, r0);                               /* r0 = __ 53 52 34 */
+    movq_r2m(r7, *(output_data_bytes+40));         /* write R6 = r7 */
+    por_r2r(r6, r0);                               /* r0 = 65 53 52 34 = R7 */
+    movq_m2r(*(input_bytes+120), r7);
+    movq_r2r(r5, r6);                              /* r6 = 67 66 65 64 */
+    pmullw_m2r(*(dequant_matrix_bytes+120), r7);   /* r7 = 77 76 75 74 */
+    psrlq_i2r(32, r5);                             /* r5 = __ __ 67 66 */
+    pand_r2r(r2, r6);                              /* r6 = __ __ __ 64 */
+    movq_r2r(r5, r1);                              /* r1 = __ __ 67 66 */
+    movq_r2m(r0, *(output_data_bytes+56));         /* write R7 = r0 */
+    pand_r2r(r2, r1);                              /* r1 = __ __ __ 66 */
+    movq_m2r(*(input_bytes+112), r0);
+    movq_r2r(r7, r3);                              /* r3 = 77 76 75 74 */
+    pmullw_m2r(*(dequant_matrix_bytes+112), r0);   /* r0 = 73 72 71 70 */
+    psllq_i2r(16, r3);                             /* r3 = 76 75 74 __ */
+    pand_m2r(*M(3), r7);                           /* r7 = 77 __ __ __ */
+    pxor_r2r(r1, r5);                              /* r5 = __ __ 67 __ */
+    por_r2r(r5, r6);                               /* r6 = __ __ 67 64 */
+    movq_r2r(r3, r5);                              /* r5 = 76 75 74 __ */
+    pand_m2r(*M(3), r5);                           /* r5 = 76 __ __ __ */
+    por_r2r(r1, r7);                               /* r7 = 77 __ __ 66 */
+    movq_m2r(*(input_bytes+96), r1);
+    pxor_r2r(r5, r3);                              /* r3 = __ 75 74 __ */
+    pmullw_m2r(*(dequant_matrix_bytes+96), r1);    /* r1 = 63 62 61 60 */
+    por_r2r(r3, r7);                               /* r7 = 77 75 74 66 = R15 */
+    por_r2r(r5, r6);                               /* r6 = 76 __ 67 64 */
+    movq_r2r(r0, r5);                              /* r5 = 73 72 71 70 */
+    movq_r2m(r7, *(output_data_bytes+120));        /* store R15 = r7 */
+    psrlq_i2r(16, r5);                             /* r5 = __ 73 72 71 */
+    pand_m2r(*M(2), r5);                           /* r5 = __ 73 __ __ */
+    movq_r2r(r0, r7);                              /* r7 = 73 72 71 70 */
+    por_r2r(r5, r6);                               /* r6 = 76 73 67 64 = R14 */
+    pand_r2r(r2, r0);                              /* r0 = __ __ __ 70 */
+    pxor_r2r(r0, r7);                              /* r7 = 73 72 71 __ */
+    psllq_i2r(32, r0);                             /* r0 = __ 70 __ __ */
+    movq_r2m(r6, *(output_data_bytes+104));        /* write R14 = r6 */
+    psrlq_i2r(16, r4);                             /* r4 = __ 57 56 55 */
+    movq_m2r(*(input_bytes+72), r5);
+    psllq_i2r(16, r7);                             /* r7 = 72 71 __ __ */
+    pmullw_m2r(*(dequant_matrix_bytes+72), r5);    /* r5 = 47 46 45 44 */
+    movq_r2r(r7, r6);                              /* r6 = 72 71 __ __ */
+    movq_m2r(*M(2), r3);                           /* r3 = __ FF __ __ */
+    psllq_i2r(16, r6);                             /* r6 = 71 __ __ __ */
+    pand_m2r(*M(3), r7);                           /* r7 = 72 __ __ __ */
+    pand_r2r(r1, r3);                              /* r3 = __ 62 __ __ */
+    por_r2r(r0, r7);                               /* r7 = 72 70 __ __ */
+    movq_r2r(r1, r0);                              /* r0 = 63 62 61 60 */
+    pand_m2r(*M(3), r1);                           /* r1 = 63 __ __ __ */
+    por_r2r(r3, r6);                               /* r6 = 71 62 __ __ */
+    movq_r2r(r4, r3);                              /* r3 = __ 57 56 55 */
+    psrlq_i2r(32, r1);                             /* r1 = __ __ 63 __ */
+    pand_r2r(r2, r3);                              /* r3 = __ __ __ 55 */
+    por_r2r(r1, r7);                               /* r7 = 72 70 63 __ */
+    por_r2r(r3, r7);                               /* r7 = 72 70 63 55 = R13 */
+    movq_r2r(r4, r3);                              /* r3 = __ 57 56 55 */
+    pand_m2r(*M(1), r3);                           /* r3 = __ __ 56 __ */
+    movq_r2r(r5, r1);                              /* r1 = 47 46 45 44 */
+    movq_r2m(r7, *(output_data_bytes+88));         /* write R13 = r7 */
+    psrlq_i2r(48, r5);                             /* r5 = __ __ __ 47 */
+    movq_m2r(*(input_bytes+64), r7);
+    por_r2r(r3, r6);                               /* r6 = 71 62 56 __ */
+    pmullw_m2r(*(dequant_matrix_bytes+64), r7);    /* r7 = 43 42 41 40 */
+    por_r2r(r5, r6);                               /* r6 = 71 62 56 47 = R12 */
+    pand_m2r(*M(2), r4);                           /* r4 = __ 57 __ __ */
+    psllq_i2r(32, r0);                             /* r0 = 61 60 __ __ */
+    movq_r2m(r6, *(output_data_bytes+72));         /* write R12 = r6 */
+    movq_r2r(r0, r6);                              /* r6 = 61 60 __ __ */
+    pand_m2r(*M(3), r0);                           /* r0 = 61 __ __ __ */
+    psllq_i2r(16, r6);                             /* r6 = 60 __ __ __ */
+    movq_m2r(*(input_bytes+40), r5);
+    movq_r2r(r1, r3);                              /* r3 = 47 46 45 44 */
+    pmullw_m2r(*(dequant_matrix_bytes+40), r5);    /* r5 = 27 26 25 24 */
+    psrlq_i2r(16, r1);                             /* r1 = __ 47 46 45 */
+    pand_m2r(*M(1), r1);                           /* r1 = __ __ 46 __ */
+    por_r2r(r4, r0);                               /* r0 = 61 57 __ __ */
+    pand_r2r(r7, r2);                              /* r2 = __ __ __ 40 */
+    por_r2r(r1, r0);                               /* r0 = 61 57 46 __ */
+    por_r2r(r2, r0);                               /* r0 = 61 57 46 40 = R11 */
+    psllq_i2r(16, r3);                             /* r3 = 46 45 44 __ */
+    movq_r2r(r3, r4);                              /* r4 = 46 45 44 __ */
+    movq_r2r(r5, r2);                              /* r2 = 27 26 25 24 */
+    movq_r2m(r0, *(output_data_bytes+112));        /* write R11 = r0 */
+    psrlq_i2r(48, r2);                             /* r2 = __ __ __ 27 */
+    pand_m2r(*M(2), r4);                           /* r4 = __ 45 __ __ */
+    por_r2r(r2, r6);                               /* r6 = 60 __ __ 27 */
+    movq_m2r(*M(1), r2);                           /* r2 = __ __ FF __ */
+    por_r2r(r4, r6);                               /* r6 = 60 45 __ 27 */
+    pand_r2r(r7, r2);                              /* r2 = __ __ 41 __ */
+    psllq_i2r(32, r3);                             /* r3 = 44 __ __ __ */
+    por_m2r(*(output_data_bytes+80), r3);          /* r3 = 44 __ __ 23 */
+    por_r2r(r2, r6);                               /* r6 = 60 45 41 27 = R10 */
+    movq_m2r(*M(3), r2);                           /* r2 = FF __ __ __ */
+    psllq_i2r(16, r5);                             /* r5 = 26 25 24 __ */
+    movq_r2m(r6, *(output_data_bytes+96));         /* store R10 = r6 */
+    pand_r2r(r5, r2);                              /* r2 = 26 __ __ __ */
+    movq_m2r(*M(2), r6);                           /* r6 = __ FF __ __ */
+    pxor_r2r(r2, r5);                              /* r5 = __ 25 24 __ */
+    pand_r2r(r7, r6);                              /* r6 = __ 42 __ __ */
+    psrlq_i2r(32, r2);                             /* r2 = __ __ 26 __ */
+    pand_m2r(*M(3), r7);                           /* r7 = 43 __ __ __ */
+    por_r2r(r2, r3);                               /* r3 = 44 __ 26 23 */
+    por_m2r(*(output_data_bytes+64), r7);          /* r7 = 43 __ __ 12 */
+    por_r2r(r3, r6);                               /* r6 = 44 42 26 23 = R9 */
+    por_r2r(r5, r7);                               /* r7 = 43 25 24 12 = R8 */
+    movq_r2m(r6, *(output_data_bytes+80));         /* store R9 = r6 */
+    movq_r2m(r7, *(output_data_bytes+64));         /* store R8 = r7 */
 
 
 #undef M
 
-    /* at this point, function has completed dequantization + dezigzag + 
+    /* at this point, function has completed dequantization + dezigzag +
      * partial transposition; now do the idct itself */
 
 #define I(K) (output_data + K * 8)
diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
new file mode 100644
index 000000000..c8f9158af
--- /dev/null
+++ b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
@@ -0,0 +1,890 @@
+/*
+ * Copyright (C) 2004 the ffmpeg project
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**
+ * @file vp3dsp_sse2.c
+ * SSE2-optimized functions cribbed from the original VP3 source code.
+ */
+
+#include "../dsputil.h"
+#include "mmx.h"
+
+static unsigned short __align16 SSE2_dequant_const[] = /* seven 128-bit masks, one per row; 65535 marks a selected 16-bit word */
+{   /* initializers run lowest word first; the hex comments read highest word first */
+    0,65535,65535,0,0,0,0,0,    // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000
+    0,0,0,0,65535,65535,0,0,    // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000
+    65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF
+    0,0,0,65535,0,0,0,0,        // 0x0000 0000 0000 0000 FFFF 0000 0000 0000
+    0,0,0,65535,65535,0,0,0,    // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000
+    65535,0,0,0,0,65535,0,0,    // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF
+    0,0,65535,65535, 0,0,0,0    // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000
+};
+
+static unsigned int __align16 eight_data[] = /* four 32-bit words, each packing two 16-bit values of 8 */
+{ /* NOTE(review): presumably what the IDCT macros read through Eight before psraw 4 -- confirm */
+    0x00080008, 
+    0x00080008,
+    0x00080008, 
+    0x00080008 
+}; 
+
+static unsigned short __align16 SSE2_idct_data[7 * 8] = /* row k-1 holds round(65536 * cos(k*pi/16)) repeated 8 times */
+{   /* NOTE(review): presumably row k-1 is what C(k) addresses in the IDCT macros -- confirm */
+    64277,64277,64277,64277,64277,64277,64277,64277, /* 65536 * cos(1*pi/16) */
+    60547,60547,60547,60547,60547,60547,60547,60547, /* 65536 * cos(2*pi/16) */
+    54491,54491,54491,54491,54491,54491,54491,54491, /* 65536 * cos(3*pi/16) */
+    46341,46341,46341,46341,46341,46341,46341,46341, /* 65536 * cos(4*pi/16) */
+    36410,36410,36410,36410,36410,36410,36410,36410, /* 65536 * cos(5*pi/16) */
+    25080,25080,25080,25080,25080,25080,25080,25080, /* 65536 * cos(6*pi/16) */
+    12785,12785,12785,12785,12785,12785,12785,12785  /* 65536 * cos(7*pi/16) */
+};
+
+
+#define SSE2_Column_IDCT() {  /* IDCT pass: reads I(0)..I(7) and C(1)..C(7), */ \
+    /* adds *Eight, shifts right by 4, stores O(0)..O(7); I(1)/I(2) are scratch */ \
+    movdqu_m2r(*I(3), xmm2);     /* xmm2 = i3 */ \
+    movdqu_m2r(*C(3), xmm6);     /* xmm6 = c3 */ \
+    \
+    movdqu_r2r(xmm2, xmm4);      /* xmm4 = i3 */ \
+    movdqu_m2r(*I(5), xmm7);     /* xmm7 = i5 */ \
+    \
+    pmulhw_r2r(xmm6, xmm4);      /* xmm4 = c3 * i3 - i3 */ \
+    movdqu_m2r(*C(5), xmm1);     /* xmm1 = c5 */ \
+    \
+    pmulhw_r2r(xmm7, xmm6);      /* xmm6 = c3 * i5 - i5 */ \
+    movdqu_r2r(xmm1, xmm5);      /* xmm5 = c5 */ \
+    \
+    pmulhw_r2r(xmm2, xmm1);      /* xmm1 = c5 * i3 - i3 */ \
+    movdqu_m2r(*I(1), xmm3);     /* xmm3 = i1 */ \
+    \
+    pmulhw_r2r(xmm7, xmm5);      /* xmm5 = c5 * i5 - i5 */ \
+    movdqu_m2r(*C(1), xmm0);     /* xmm0 = c1 */ \
+    \
+    /* all registers are in use */ \
+    \
+    paddw_r2r(xmm2, xmm4);       /* xmm4 = c3 * i3 */ \
+    paddw_r2r(xmm7, xmm6);       /* xmm6 = c3 * i5 */ \
+    \
+    paddw_r2r(xmm1, xmm2);       /* xmm2 = c5 * i3 */ \
+    movdqu_m2r(*I(7), xmm1);     /* xmm1 = i7 */ \
+    \
+    paddw_r2r(xmm5, xmm7);       /* xmm7 = c5 * i5 */ \
+    movdqu_r2r(xmm0, xmm5);      /* xmm5 = c1 */ \
+    \
+    pmulhw_r2r(xmm3, xmm0);      /* xmm0 = c1 * i1 - i1 */ \
+    paddsw_r2r(xmm7, xmm4);      /* xmm4 = c3 * i3 + c5 * i5 = C */ \
+    \
+    pmulhw_r2r(xmm1, xmm5);      /* xmm5 = c1 * i7 - i7 */ \
+    movdqu_m2r(*C(7), xmm7);     /* xmm7 = c7 */ \
+    \
+    psubsw_r2r(xmm2, xmm6);      /* xmm6 = c3 * i5 - c5 * i3 = D */ \
+    paddw_r2r(xmm3, xmm0);       /* xmm0 = c1 * i1 */ \
+    \
+    pmulhw_r2r(xmm7, xmm3);      /* xmm3 = c7 * i1 */ \
+    movdqu_m2r(*I(2), xmm2);     /* xmm2 = i2 */ \
+    \
+    pmulhw_r2r(xmm1, xmm7);      /* xmm7 = c7 * i7 */ \
+    paddw_r2r(xmm1, xmm5);       /* xmm5 = c1 * i7 */ \
+    \
+    movdqu_r2r(xmm2, xmm1);      /* xmm1 = i2 */ \
+    pmulhw_m2r(*C(2), xmm2);     /* xmm2 = i2 * c2 -i2 */ \
+    \
+    psubsw_r2r(xmm5, xmm3);      /* xmm3 = c7 * i1 - c1 * i7 = B */ \
+    movdqu_m2r(*I(6), xmm5);     /* xmm5 = i6 */ \
+    \
+    paddsw_r2r(xmm7, xmm0);      /* xmm0 = c1 * i1 + c7 * i7 = A */ \
+    movdqu_r2r(xmm5, xmm7);      /* xmm7 = i6 */ \
+    \
+    psubsw_r2r(xmm4, xmm0);      /* xmm0 = A - C */ \
+    pmulhw_m2r(*C(2), xmm5);     /* xmm5 = c2 * i6 - i6 */ \
+    \
+    paddw_r2r(xmm1, xmm2);       /* xmm2 = i2 * c2 */ \
+    pmulhw_m2r(*C(6), xmm1);     /* xmm1 = c6 * i2 */ \
+    \
+    paddsw_r2r(xmm4, xmm4);      /* xmm4 = C + C */ \
+    paddsw_r2r(xmm0, xmm4);      /* xmm4 = A + C = C. */ \
+    \
+    psubsw_r2r(xmm6, xmm3);      /* xmm3 = B - D */ \
+    paddw_r2r(xmm7, xmm5);       /* xmm5 = c2 * i6 */ \
+    \
+    paddsw_r2r(xmm6, xmm6);      /* xmm6 = D + D */ \
+    pmulhw_m2r(*C(6), xmm7);     /* xmm7 = c6 * i6 */ \
+    \
+    paddsw_r2r(xmm3, xmm6);      /* xmm6 = B + D = D. */ \
+    movdqu_r2m(xmm4, *I(1));     /* Save C. at I(1) */ \
+    \
+    psubsw_r2r(xmm5, xmm1);      /* xmm1 = c6 * i2 - c2 * i6 = H */ \
+    movdqu_m2r(*C(4), xmm4);     /* xmm4 = c4 */ \
+    \
+    movdqu_r2r(xmm3, xmm5);      /* xmm5 = B - D */ \
+    pmulhw_r2r(xmm4, xmm3);      /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
+    \
+    paddsw_r2r(xmm2, xmm7);      /* xmm7 = c2 * i2 + c6 * i6 = G */ \
+    movdqu_r2m(xmm6, *I(2));     /* Save D. at I(2) */ \
+    \
+    movdqu_r2r(xmm0, xmm2);      /* xmm2 = A - C */ \
+    movdqu_m2r(*I(0), xmm6);     /* xmm6 = i0 */ \
+    \
+    pmulhw_r2r(xmm4, xmm0);      /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \
+    paddw_r2r(xmm3, xmm5);       /* xmm5 = c4 * ( B - D ) = B. */ \
+    \
+    movdqu_m2r(*I(4), xmm3);     /* xmm3 = i4 */ \
+    psubsw_r2r(xmm1, xmm5);      /* xmm5 = B. - H = B.. */ \
+    \
+    paddw_r2r(xmm0, xmm2);       /* xmm2 = c4 * ( A - C) = A. */ \
+    psubsw_r2r(xmm3, xmm6);      /* xmm6 = i0 - i4 */ \
+    \
+    movdqu_r2r(xmm6, xmm0);      /* xmm0 = i0 - i4 */ \
+    pmulhw_r2r(xmm4, xmm6);      /* xmm6 = (c4 - 1) * (i0 - i4) */ \
+    \
+    paddsw_r2r(xmm3, xmm3);      /* xmm3 = i4 + i4 */ \
+    paddsw_r2r(xmm1, xmm1);      /* xmm1 = H + H */ \
+    \
+    paddsw_r2r(xmm0, xmm3);      /* xmm3 = i0 + i4 */ \
+    paddsw_r2r(xmm5, xmm1);      /* xmm1 = B. + H = H. */ \
+    \
+    pmulhw_r2r(xmm3, xmm4);      /* xmm4 = ( c4 - 1 ) * ( i0 + i4 )  */ \
+    paddw_r2r(xmm0, xmm6);       /* xmm6 = c4 * ( i0 - i4 ) = F */ \
+    \
+    psubsw_r2r(xmm2, xmm6);      /* xmm6 = F - A. = F. */ \
+    paddsw_r2r(xmm2, xmm2);      /* xmm2 = A. + A. */ \
+    \
+    movdqu_m2r(*I(1), xmm0);     /* Load        C. from I(1) */ \
+    paddsw_r2r(xmm6, xmm2);      /* xmm2 = F + A. = A.. */ \
+    \
+    paddw_r2r(xmm3, xmm4);       /* xmm4 = c4 * ( i0 + i4 ) = E */ \
+    psubsw_r2r(xmm1, xmm2);      /* xmm2 = A.. - H. = R2 */ \
+    \
+    paddsw_m2r(*Eight, xmm2);    /* Adjust R2 and R1 before shifting */ \
+    paddsw_r2r(xmm1, xmm1);      /* xmm1 = H. + H. */ \
+    \
+    paddsw_r2r(xmm2, xmm1);      /* xmm1 = A.. + H. = R1 */ \
+    psraw_i2r(4, xmm2);          /* xmm2 = op2 */ \
+    \
+    psubsw_r2r(xmm7, xmm4);      /* xmm4 = E - G = E. */ \
+    psraw_i2r(4, xmm1);          /* xmm1 = op1 */ \
+    \
+    movdqu_m2r(*I(2), xmm3);     /* Load D. from I(2) */ \
+    paddsw_r2r(xmm7, xmm7);      /* xmm7 = G + G */ \
+    \
+    movdqu_r2m(xmm2, *O(2));     /* Write out op2 */ \
+    paddsw_r2r(xmm4, xmm7);      /* xmm7 = E + G = G. */ \
+    \
+    movdqu_r2m(xmm1, *O(1));     /* Write out op1 */ \
+    psubsw_r2r(xmm3, xmm4);      /* xmm4 = E. - D. = R4 */ \
+    \
+    paddsw_m2r(*Eight, xmm4);    /* Adjust R4 and R3 before shifting */ \
+    paddsw_r2r(xmm3, xmm3);      /* xmm3 = D. + D. */ \
+    \
+    paddsw_r2r(xmm4, xmm3);      /* xmm3 = E. + D. = R3 */ \
+    psraw_i2r(4, xmm4);          /* xmm4 = op4 */ \
+    \
+    psubsw_r2r(xmm5, xmm6);      /* xmm6 = F. - B..= R6 */ \
+    psraw_i2r(4, xmm3);          /* xmm3 = op3 */ \
+    \
+    paddsw_m2r(*Eight, xmm6);    /* Adjust R6 and R5 before shifting */ \
+    paddsw_r2r(xmm5, xmm5);      /* xmm5 = B.. + B.. */ \
+    \
+    paddsw_r2r(xmm6, xmm5);      /* xmm5 = F. + B.. = R5 */ \
+    psraw_i2r(4, xmm6);          /* xmm6 = op6 */ \
+    \
+    movdqu_r2m(xmm4, *O(4));     /* Write out op4 */ \
+    psraw_i2r(4, xmm5);          /* xmm5 = op5 */ \
+    \
+    movdqu_r2m(xmm3, *O(3));     /* Write out op3 */ \
+    psubsw_r2r(xmm0, xmm7);      /* xmm7 = G. - C. = R7 */ \
+    \
+    paddsw_m2r(*Eight, xmm7);    /* Adjust R7 and R0 before shifting */ \
+    paddsw_r2r(xmm0, xmm0);      /* xmm0 = C. + C. */ \
+    \
+    paddsw_r2r(xmm7, xmm0);      /* xmm0 = G. + C. */ \
+    psraw_i2r(4, xmm7);          /* xmm7 = op7 */ \
+    \
+    movdqu_r2m(xmm6, *O(6));     /* Write out op6 */ \
+    psraw_i2r(4, xmm0);          /* xmm0 = op0 */ \
+    \
+    movdqu_r2m(xmm5, *O(5));     /* Write out op5 */ \
+    movdqu_r2m(xmm7, *O(7));     /* Write out op7 */ \
+    \
+    movdqu_r2m(xmm0, *O(0));     /* Write out op0 */ \
+    \
+} /* End of SSE2_Column_IDCT macro */
+
+
+#define SSE2_Row_IDCT() {  /* IDCT pass: reads I(0)..I(7) and C(1)..C(7), */ \
+    /* stores the unrounded, unshifted results back in place to I(0)..I(7) */ \
+    movdqu_m2r(*I(3), xmm2);     /* xmm2 = i3 */ \
+    movdqu_m2r(*C(3), xmm6);     /* xmm6 = c3 */ \
+    \
+    movdqu_r2r(xmm2, xmm4);      /* xmm4 = i3 */ \
+    movdqu_m2r(*I(5), xmm7);     /* xmm7 = i5 */ \
+    \
+    pmulhw_r2r(xmm6, xmm4);      /* xmm4 = c3 * i3 - i3 */ \
+    movdqu_m2r(*C(5), xmm1);     /* xmm1 = c5 */ \
+    \
+    pmulhw_r2r(xmm7, xmm6);      /* xmm6 = c3 * i5 - i5 */ \
+    movdqu_r2r(xmm1, xmm5);      /* xmm5 = c5 */ \
+    \
+    pmulhw_r2r(xmm2, xmm1);      /* xmm1 = c5 * i3 - i3 */ \
+    movdqu_m2r(*I(1), xmm3);     /* xmm3 = i1 */ \
+    \
+    pmulhw_r2r(xmm7, xmm5);      /* xmm5 = c5 * i5 - i5 */ \
+    movdqu_m2r(*C(1), xmm0);     /* xmm0 = c1 */ \
+    \
+    /* all registers are in use */ \
+    \
+    paddw_r2r(xmm2, xmm4);      /* xmm4 = c3 * i3 */ \
+    paddw_r2r(xmm7, xmm6);      /* xmm6 = c3 * i5 */ \
+    \
+    paddw_r2r(xmm1, xmm2);      /* xmm2 = c5 * i3 */ \
+    movdqu_m2r(*I(7), xmm1);    /* xmm1 = i7 */ \
+    \
+    paddw_r2r(xmm5, xmm7);      /* xmm7 = c5 * i5 */ \
+    movdqu_r2r(xmm0, xmm5);     /* xmm5 = c1 */ \
+    \
+    pmulhw_r2r(xmm3, xmm0);     /* xmm0 = c1 * i1 - i1 */ \
+    paddsw_r2r(xmm7, xmm4);     /* xmm4 = c3 * i3 + c5 * i5 = C */ \
+    \
+    pmulhw_r2r(xmm1, xmm5);     /* xmm5 = c1 * i7 - i7 */ \
+    movdqu_m2r(*C(7), xmm7);    /* xmm7 = c7 */ \
+    \
+    psubsw_r2r(xmm2, xmm6);     /* xmm6 = c3 * i5 - c5 * i3 = D */ \
+    paddw_r2r(xmm3, xmm0);      /* xmm0 = c1 * i1 */ \
+    \
+    pmulhw_r2r(xmm7, xmm3);     /* xmm3 = c7 * i1 */ \
+    movdqu_m2r(*I(2), xmm2);    /* xmm2 = i2 */ \
+    \
+    pmulhw_r2r(xmm1, xmm7);     /* xmm7 = c7 * i7 */ \
+    paddw_r2r(xmm1, xmm5);      /* xmm5 = c1 * i7 */ \
+    \
+    movdqu_r2r(xmm2, xmm1);     /* xmm1 = i2 */ \
+    pmulhw_m2r(*C(2), xmm2);    /* xmm2 = i2 * c2 -i2 */ \
+    \
+    psubsw_r2r(xmm5, xmm3);     /* xmm3 = c7 * i1 - c1 * i7 = B */ \
+    movdqu_m2r(*I(6), xmm5);    /* xmm5 = i6 */ \
+    \
+    paddsw_r2r(xmm7, xmm0);     /* xmm0 = c1 * i1 + c7 * i7        = A */ \
+    movdqu_r2r(xmm5, xmm7);     /* xmm7 = i6 */ \
+    \
+    psubsw_r2r(xmm4, xmm0);     /* xmm0 = A - C */ \
+    pmulhw_m2r(*C(2), xmm5);    /* xmm5 = c2 * i6 - i6 */ \
+    \
+    paddw_r2r(xmm1, xmm2);      /* xmm2 = i2 * c2 */ \
+    pmulhw_m2r(*C(6), xmm1);    /* xmm1 = c6 * i2 */ \
+    \
+    paddsw_r2r(xmm4, xmm4);     /* xmm4 = C + C */ \
+    paddsw_r2r(xmm0, xmm4);     /* xmm4 = A + C = C. */ \
+    \
+    psubsw_r2r(xmm6, xmm3);     /* xmm3 = B - D */ \
+    paddw_r2r(xmm7, xmm5);      /* xmm5 = c2 * i6 */ \
+    \
+    paddsw_r2r(xmm6, xmm6);     /* xmm6 = D + D */ \
+    pmulhw_m2r(*C(6), xmm7);    /* xmm7 = c6 * i6 */ \
+    \
+    paddsw_r2r(xmm3, xmm6);     /* xmm6 = B + D = D. */ \
+    movdqu_r2m(xmm4, *I(1));    /* Save C. at I(1)        */ \
+    \
+    psubsw_r2r(xmm5, xmm1);     /* xmm1 = c6 * i2 - c2 * i6 = H */ \
+    movdqu_m2r(*C(4), xmm4);    /* xmm4 = c4 */ \
+    \
+    movdqu_r2r(xmm3, xmm5);     /* xmm5 = B - D */ \
+    pmulhw_r2r(xmm4, xmm3);     /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
+    \
+    paddsw_r2r(xmm2, xmm7);     /* xmm7 = c2 * i2 + c6 * i6 = G */ \
+    movdqu_r2m(xmm6, *I(2));    /* Save D. at I(2) */ \
+    \
+    movdqu_r2r(xmm0, xmm2);     /* xmm2 = A - C */ \
+    movdqu_m2r(*I(0), xmm6);    /* xmm6 = i0 */ \
+    \
+    pmulhw_r2r(xmm4, xmm0);     /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \
+    paddw_r2r(xmm3, xmm5);      /* xmm5 = c4 * ( B - D ) = B. */ \
+    \
+    movdqu_m2r(*I(4), xmm3);    /* xmm3 = i4 */ \
+    psubsw_r2r(xmm1, xmm5);     /* xmm5 = B. - H = B.. */ \
+    \
+    paddw_r2r(xmm0, xmm2);      /* xmm2 = c4 * ( A - C) = A. */ \
+    psubsw_r2r(xmm3, xmm6);     /* xmm6 = i0 - i4 */ \
+    \
+    movdqu_r2r(xmm6, xmm0);     /* xmm0 = i0 - i4 */ \
+    pmulhw_r2r(xmm4, xmm6);     /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) */ \
+    \
+    paddsw_r2r(xmm3, xmm3);     /* xmm3 = i4 + i4 */ \
+    paddsw_r2r(xmm1, xmm1);     /* xmm1 = H + H */ \
+    \
+    paddsw_r2r(xmm0, xmm3);     /* xmm3 = i0 + i4 */ \
+    paddsw_r2r(xmm5, xmm1);     /* xmm1 = B. + H = H. */ \
+    \
+    pmulhw_r2r(xmm3, xmm4);     /* xmm4 = ( c4 - 1 ) * ( i0 + i4 )  */ \
+    paddw_r2r(xmm0, xmm6);      /* xmm6 = c4 * ( i0 - i4 ) = F */ \
+    \
+    psubsw_r2r(xmm2, xmm6);     /* xmm6 = F - A. = F. */ \
+    paddsw_r2r(xmm2, xmm2);     /* xmm2 = A. + A. */ \
+    \
+    movdqu_m2r(*I(1), xmm0);    /* Load C. from I(1) */ \
+    paddsw_r2r(xmm6, xmm2);     /* xmm2 = F + A. = A.. */ \
+    \
+    paddw_r2r(xmm3, xmm4);      /* xmm4 = c4 * ( i0 + i4 ) = E */ \
+    psubsw_r2r(xmm1, xmm2);     /* xmm2 = A.. - H. = R2 */ \
+    \
+    paddsw_r2r(xmm1, xmm1);     /* xmm1 = H. + H. */ \
+    paddsw_r2r(xmm2, xmm1);     /* xmm1 = A.. + H. = R1 */ \
+    \
+    psubsw_r2r(xmm7, xmm4);     /* xmm4 = E - G = E. */ \
+    \
+    movdqu_m2r(*I(2), xmm3);    /* Load D. from I(2) */ \
+    paddsw_r2r(xmm7, xmm7);     /* xmm7 = G + G */ \
+    \
+    movdqu_r2m(xmm2, *I(2));    /* Write out op2 */ \
+    paddsw_r2r(xmm4, xmm7);     /* xmm7 = E + G = G. */ \
+    \
+    movdqu_r2m(xmm1, *I(1));    /* Write out op1 */ \
+    psubsw_r2r(xmm3, xmm4);     /* xmm4 = E. - D. = R4 */ \
+    \
+    paddsw_r2r(xmm3, xmm3);     /* xmm3 = D. + D. */ \
+    \
+    paddsw_r2r(xmm4, xmm3);     /* xmm3 = E. + D. = R3 */ \
+    \
+    psubsw_r2r(xmm5, xmm6);     /* xmm6 = F. - B..= R6 */ \
+    \
+    paddsw_r2r(xmm5, xmm5);     /* xmm5 = B.. + B.. */ \
+    \
+    paddsw_r2r(xmm6, xmm5);     /* xmm5 = F. + B.. = R5 */ \
+    \
+    movdqu_r2m(xmm4, *I(4));    /* Write out op4 */ \
+    \
+    movdqu_r2m(xmm3, *I(3));    /* Write out op3 */ \
+    psubsw_r2r(xmm0, xmm7);     /* xmm7 = G. - C. = R7 */ \
+    \
+    paddsw_r2r(xmm0, xmm0);     /* xmm0 = C. + C. */ \
+    \
+    paddsw_r2r(xmm7, xmm0);     /* xmm0 = G. + C. */ \
+    \
+    movdqu_r2m(xmm6, *I(6));    /* Write out op6 */ \
+    \
+    movdqu_r2m(xmm5, *I(5));    /* Write out op5 */ \
+    movdqu_r2m(xmm7, *I(7));    /* Write out op7 */ \
+    \
+    movdqu_r2m(xmm0, *I(0));    /* Write out op0 */ \
+    \
+} /* End of SSE2_Row_IDCT macro */
+
+
+#define SSE2_Transpose() {  /* in-place transpose of the 8x8 block of 16-bit */ \
+    /* words held in I(0)..I(7); I(6) temporarily holds an intermediate */ \
+    movdqu_m2r(*I(4), xmm4);    /* xmm4=e7e6e5e4e3e2e1e0 */ \
+    movdqu_m2r(*I(5), xmm0);    /* xmm0=f7f6f5f4f3f2f1f0 */ \
+    \
+    movdqu_r2r(xmm4, xmm5);     /* make a copy */ \
+    punpcklwd_r2r(xmm0, xmm4);  /* xmm4=f3e3f2e2f1e1f0e0 */ \
+    \
+    punpckhwd_r2r(xmm0, xmm5);  /* xmm5=f7e7f6e6f5e5f4e4 */ \
+    movdqu_m2r(*I(6), xmm6);    /* xmm6=g7g6g5g4g3g2g1g0 */ \
+    \
+    movdqu_m2r(*I(7), xmm0);    /* xmm0=h7h6h5h4h3h2h1h0 */ \
+    movdqu_r2r(xmm6, xmm7);     /* make a copy */ \
+    \
+    punpcklwd_r2r(xmm0, xmm6);  /* xmm6=h3g3h2g2h1g1h0g0 */ \
+    punpckhwd_r2r(xmm0, xmm7);  /* xmm7=h7g7h6g6h5g5h4g4 */ \
+    \
+    movdqu_r2r(xmm4, xmm3);     /* make a copy */ \
+    punpckldq_r2r(xmm6, xmm4);  /* xmm4=h1g1f1e1h0g0f0e0 */ \
+    \
+    punpckhdq_r2r(xmm6, xmm3);  /* xmm3=h3g3f3e3h2g2f2e2 */ \
+    movdqu_r2m(xmm3, *I(6));    /* save h3g3f3e3h2g2f2e2 */ \
+    /* Free xmm6 */ \
+    movdqu_r2r(xmm5, xmm6);     /* make a copy */ \
+    punpckldq_r2r(xmm7, xmm5);  /* xmm5=h5g5f5e5h4g4f4e4 */ \
+    \
+    punpckhdq_r2r(xmm7, xmm6);  /* xmm6=h7g7f7e7h6g6f6e6 */ \
+    movdqu_m2r(*I(0), xmm0);    /* xmm0=a7a6a5a4a3a2a1a0 */ \
+    /* Free xmm7 */ \
+    movdqu_m2r(*I(1), xmm1);    /* xmm1=b7b6b5b4b3b2b1b0 */ \
+    movdqu_r2r(xmm0, xmm7);     /* make a copy */ \
+    \
+    punpcklwd_r2r(xmm1, xmm0);  /* xmm0=b3a3b2a2b1a1b0a0 */ \
+    punpckhwd_r2r(xmm1, xmm7);  /* xmm7=b7a7b6a6b5a5b4a4 */ \
+    /* Free xmm1 */ \
+    movdqu_m2r(*I(2), xmm2);    /* xmm2=c7c6c5c4c3c2c1c0 */ \
+    movdqu_m2r(*I(3), xmm3);    /* xmm3=d7d6d5d4d3d2d1d0 */ \
+    \
+    movdqu_r2r(xmm2, xmm1);     /* make a copy */ \
+    punpcklwd_r2r(xmm3, xmm2);  /* xmm2=d3c3d2c2d1c1d0c0 */ \
+    \
+    punpckhwd_r2r(xmm3, xmm1);  /* xmm1=d7c7d6c6d5c5d4c4 */ \
+    movdqu_r2r(xmm0, xmm3);     /* make a copy        */ \
+    \
+    punpckldq_r2r(xmm2, xmm0);  /* xmm0=d1c1b1a1d0c0b0a0 */ \
+    punpckhdq_r2r(xmm2, xmm3);  /* xmm3=d3c3b3a3d2c2b2a2 */ \
+    /* Free xmm2 */ \
+    movdqu_r2r(xmm7, xmm2);     /* make a copy */ \
+    punpckldq_r2r(xmm1, xmm2);  /* xmm2=d5c5b5a5d4c4b4a4 */ \
+    \
+    punpckhdq_r2r(xmm1, xmm7);  /* xmm7=d7c7b7a7d6c6b6a6 */ \
+    movdqu_r2r(xmm0, xmm1);     /* make a copy */ \
+    \
+    punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \
+    punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \
+    \
+    movdqu_r2m(xmm0, *I(0));    /* save I(0) */ \
+    movdqu_r2m(xmm1, *I(1));    /* save I(1) */ \
+    \
+    movdqu_m2r(*I(6), xmm0);    /* load h3g3f3e3h2g2f2e2 */ \
+    movdqu_r2r(xmm3, xmm1);     /* make a copy */ \
+    \
+    punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \
+    punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \
+    \
+    movdqu_r2r(xmm2, xmm4);     /* make a copy */ \
+    punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \
+    \
+    punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \
+    movdqu_r2m(xmm1, *I(2));    /* save I(2) */ \
+    \
+    movdqu_r2m(xmm3, *I(3));    /* save I(3) */ \
+    movdqu_r2m(xmm4, *I(4));    /* save I(4) */ \
+    \
+    movdqu_r2m(xmm2, *I(5));    /* save I(5) */ \
+    movdqu_r2r(xmm7, xmm5);     /* make a copy */ \
+    \
+    punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \
+    punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7h7c7b7a7 */ \
+    \
+    movdqu_r2m(xmm5, *I(6));    /* save I(6) */ \
+    movdqu_r2m(xmm7, *I(7));    /* save I(7) */ \
+    \
+} /* End of Transpose Macro */
+
+
+#define SSE2_Dequantize() {        \
+    movdqu_m2r(*(eax), xmm0);     \
+    \
+    pmullw_m2r(*(ebx), xmm0);          /* xmm0 = 07 06 05 04 03 02 01 00 */ \
+    movdqu_m2r(*(eax + 16), xmm1);     \
+    \
+    pmullw_m2r(*(ebx + 16), xmm1);     /* xmm1 = 17 16 15 14 13 12 11 10 */ \
+    pshuflw_r2r(xmm0, xmm3, 0x078);    /* xmm3 = 07 06 05 04 01 03 02 00 */ \
+    \
+    movdqu_r2r(xmm1, xmm2);            /* xmm2 = 17 16 15 14 13 12 11 10 */ \
+    movdqu_m2r(*(ecx), xmm7);          /* xmm7 = -- -- -- -- -- FF FF -- */ \
+    \
+    movdqu_m2r(*(eax + 32), xmm4);     \
+    movdqu_m2r(*(eax + 64), xmm5);     \
+    \
+    pmullw_m2r(*(ebx + 32), xmm4);     /* xmm4 = 27 26 25 24 23 22 21 20 */ \
+    pmullw_m2r(*(ebx + 64), xmm5);     /* xmm5 = 47 46 45 44 43 42 41 40 */ \
+    \
+    movdqu_m2r(*(ecx + 16), xmm6);     /* xmm6 = -- -- FF FF -- -- -- -- */ \
+    pand_r2r(xmm2, xmm7);              /* xmm7 = -- -- -- -- -- 12 11 -- */ \
+    \
+    pand_r2r(xmm4, xmm6);              /* xmm6 = -- -- 25 24 -- -- -- -- */ \
+    pxor_r2r(xmm7, xmm2);              /* xmm2 = 17 16 15 14 13 -- -- 10 */ \
+    \
+    pxor_r2r(xmm6, xmm4);              /* xmm4 = 27 26 -- -- 23 22 21 20 */ \
+    pslldq_i2r(4, xmm7);               /* xmm7 = -- -- -- 12 11 -- -- -- */ \
+    \
+    pslldq_i2r(2, xmm6);               /* xmm6 = -- 25 24 -- -- -- -- -- */ \
+    por_r2r(xmm6, xmm7);               /* xmm7 = -- 25 24 12 11 -- -- -- */ \
+    \
+    movdqu_m2r(*(ecx + 32), xmm0);     /* xmm0 = -- -- -- -- -- FF FF FF */ \
+    movdqu_m2r(*(ecx + 48), xmm6);     /* xmm6 = -- -- -- -- FF -- -- -- */ \
+    \
+    pand_r2r(xmm3, xmm0);              /* xmm0 = -- -- -- -- -- 03 02 00 */ \
+    pand_r2r(xmm5, xmm6);              /* xmm6 = -- -- -- -- 43 -- -- -- */ \
+    \
+    pxor_r2r(xmm0, xmm3);              /* xmm3 = 07 06 05 04 01 -- -- -- */ \
+    pxor_r2r(xmm6, xmm5);              /* xmm5 = 47 46 45 44 -- 42 41 40 */ \
+    \
+    por_r2r(xmm7, xmm0);               /* xmm0 = -- 25 24 12 11 03 02 00 */ \
+    pslldq_i2r(8, xmm6);               /* xmm6 = 43 -- -- -- -- -- -- -- */ \
+    \
+    por_r2r(xmm6, xmm0);               /* xmm0 = 43 25 24 12 11 03 02 00 */ \
+    /* 02345 in use */ \
+    \
+    movdqu_m2r(*(ecx + 64 ), xmm1);    /* xmm1 = -- -- -- FF FF -- -- -- */ \
+    pshuflw_r2r(xmm5, xmm5, 0x0B4);    /* xmm5 = 47 46 45 44 42 -- 41 40 */ \
+    \
+    movdqu_r2r(xmm1, xmm7);            /* xmm7 = -- -- -- FF FF -- -- -- */ \
+    movdqu_r2r(xmm1, xmm6);            /* xmm6 = -- -- -- FF FF -- -- -- */ \
+    \
+    movdqu_r2m(xmm0, *(eax));          /* write  43 25 24 12 11 03 02 00 */ \
+    pshufhw_r2r(xmm4, xmm4, 0x0C2);    /* xmm4 = 27 -- -- 26 23 22 21 20 */ \
+    \
+    pand_r2r(xmm4, xmm7);              /* xmm7 = -- -- -- 26 23 -- -- -- */ \
+    pand_r2r(xmm5, xmm1);              /* xmm1 = -- -- -- 44 42 -- -- -- */ \
+    \
+    pxor_r2r(xmm7, xmm4);              /* xmm4 = 27 -- -- -- -- 22 21 20 */ \
+    pxor_r2r(xmm1, xmm5);              /* xmm5 = 47 46 45 -- -- -- 41 40 */ \
+    \
+    pshuflw_r2r(xmm2, xmm2, 0x0C6);    /* xmm2 = 17 16 15 14 13 10 -- -- */ \
+    movdqu_r2r(xmm6, xmm0);            /* xmm0 = -- -- -- FF FF -- -- -- */ \
+    \
+    pslldq_i2r(2, xmm7);               /* xmm7 = -- -- 26 23 -- -- -- -- */ \
+    pslldq_i2r(6, xmm1);               /* xmm1 = 44 42 -- -- -- -- -- -- */ \
+    \
+    psrldq_i2r(2, xmm0);               /* xmm0 = -- -- -- -- FF FF -- -- */ \
+    pand_r2r(xmm3, xmm6);              /* xmm6 = -- -- -- 04 01 -- -- -- */ \
+    \
+    pand_r2r(xmm2, xmm0);              /* xmm0 = -- -- -- -- 13 10 -- -- */ \
+    pxor_r2r(xmm6, xmm3);              /* xmm3 = 07 06 05 -- -- -- -- -- */ \
+    \
+    pxor_r2r(xmm0, xmm2);              /* xmm2 = 17 16 15 14 -- -- -- -- */ \
+    psrldq_i2r(6, xmm6);               /* xmm0 = -- -- -- -- -- -- 04 01 */ \
+    \
+    por_r2r(xmm7, xmm1);               /* xmm1 = 44 42 26 23 -- -- -- -- */ \
+    por_r2r(xmm6, xmm0);               /* xmm1 = -- -- -- -- 13 10 04 01 */ \
+    /* 12345 in use */    \
+    por_r2r(xmm0, xmm1);               /* xmm1 = 44 42 26 23 13 10 04 01 */ \
+    pshuflw_r2r(xmm4, xmm4, 0x093);    /* xmm4 = 27 -- -- -- 22 21 20 -- */ \
+    \
+    pshufhw_r2r(xmm4, xmm4, 0x093);    /* xmm4 = -- -- -- 27 22 21 20 -- */ \
+    movdqu_r2m(xmm1, *(eax + 16));     /* write  44 42 26 23 13 10 04 01 */ \
+    \
+    pshufhw_r2r(xmm3, xmm3, 0x0D2);    /* xmm3 = 07 05 -- 06 -- -- -- -- */ \
+    movdqu_m2r(*(ecx + 64), xmm0);     /* xmm0 = -- -- -- FF FF -- -- -- */ \
+    \
+    pand_r2r(xmm3, xmm0);              /* xmm0 = -- -- -- 06 -- -- -- -- */ \
+    psrldq_i2r(12, xmm3);              /* xmm3 = -- -- -- -- -- -- 07 05 */ \
+    \
+    psrldq_i2r(8, xmm0);               /* xmm0 = -- -- -- -- -- -- -- 06 */ \
+    \
+    movdqu_m2r(*(ecx + 64), xmm6);     /* xmm6 = -- -- -- FF FF -- -- -- */ \
+    movdqu_m2r(*(ecx + 96), xmm7);     /* xmm7 = -- -- -- -- FF FF -- -- */ \
+    \
+    pand_r2r(xmm4, xmm6);              /* xmm6 = -- -- -- 27 22 -- -- -- */ \
+    pxor_r2r(xmm6, xmm4);              /* xmm4 = -- -- -- -- -- 21 20 -- */ \
+    \
+    por_r2r(xmm6, xmm3);               /* xmm3 = -- -- -- 27 22 -- 07 05 */ \
+    pand_r2r(xmm4, xmm7);              /* xmm7 = -- -- -- -- -- 21 -- -- */ \
+    \
+    por_r2r(xmm7, xmm0);               /* xmm0 = -- -- -- -- -- 21 -- 06 */ \
+    pxor_r2r(xmm7, xmm4);              /* xmm4 = -- -- -- -- -- -- 20 -- */ \
+    \
+    movdqu_m2r(*(ecx + 16 ), xmm6);    /* xmm6 = -- -- FF FF -- -- -- -- */ \
+    movdqu_m2r(*(ecx + 64 ), xmm1);    /* xmm1 = -- -- -- FF FF -- -- -- */ \
+    \
+    pand_r2r(xmm2, xmm6);              /* xmm6 = -- -- 15 14 -- -- -- -- */ \
+    pand_r2r(xmm6, xmm1);              /* xmm1 = -- -- -- 14 -- -- -- -- */ \
+    \
+    pxor_r2r(xmm6, xmm2);              /* xmm2 = 17 16 -- -- -- -- -- -- */ \
+    pxor_r2r(xmm1, xmm6);              /* xmm6 = -- -- 15 -- -- -- -- -- */ \
+    \
+    psrldq_i2r(4, xmm1);               /* xmm1 = -- -- -- -- -- 14 -- -- */ \
+    \
+    psrldq_i2r(8, xmm6);               /* xmm6 = -- -- -- -- -- -- 15 -- */ \
+    por_r2r(xmm1, xmm3);               /* xmm3 = -- -- -- 27 22 14 07 05 */ \
+    \
+    por_r2r(xmm6, xmm0);               /* xmm0 = -- -- -- -- -- 21 15 06 */ \
+    pshufhw_r2r(xmm5, xmm5, 0x0E1);    /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \
+    \
+    movdqu_m2r(*(ecx + 64), xmm1);     /* xmm1 = -- -- -- FF FF -- -- -- */ \
+    pshuflw_r2r(xmm5, xmm5, 0x072);    /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \
+    \
+    movdqu_r2r(xmm1, xmm6);            /* xmm6 = -- -- -- FF FF -- -- -- */ \
+    pand_r2r(xmm5, xmm1);              /* xmm1 = -- -- -- 45 41 -- -- -- */ \
+    \
+    pxor_r2r(xmm1, xmm5);              /* xmm5 = 47 46 -- -- -- -- 40 -- */ \
+    pslldq_i2r(4, xmm1);               /* xmm1 = -- 45 41 -- -- -- -- -- */ \
+    \
+    pshufd_r2r(xmm5, xmm5, 0x09C);     /* xmm5 = -- -- -- -- 47 46 40 -- */ \
+    por_r2r(xmm1, xmm3);               /* xmm3 = -- 45 41 27 22 14 07 05 */ \
+    \
+    movdqu_m2r(*(eax + 96), xmm1);     /* xmm1 = 67 66 65 64 63 62 61 60 */ \
+    pmullw_m2r(*(ebx + 96), xmm1);     \
+    \
+    movdqu_m2r(*(ecx), xmm7);          /* xmm7 = -- -- -- -- -- FF FF -- */ \
+    \
+    psrldq_i2r(8, xmm6);               /* xmm6 = -- -- -- -- -- -- -- FF */ \
+    pand_r2r(xmm5, xmm7);              /* xmm7 = -- -- -- -- -- 46 40 -- */ \
+    \
+    pand_r2r(xmm1, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 60 */ \
+    pxor_r2r(xmm7, xmm5);              /* xmm5 = -- -- -- -- 47 -- -- -- */ \
+    \
+    pxor_r2r(xmm6, xmm1);              /* xmm1 = 67 66 65 64 63 62 61 -- */ \
+    pslldq_i2r(2, xmm5);               /* xmm5 = -- -- -- 47 -- -- -- -- */ \
+    \
+    pslldq_i2r(14, xmm6);              /* xmm6 = 60 -- -- -- -- -- -- -- */ \
+    por_r2r(xmm5, xmm4);               /* xmm4 = -- -- -- 47 -- -- 20 -- */ \
+    \
+    por_r2r(xmm6, xmm3);               /* xmm3 = 60 45 41 27 22 14 07 05 */ \
+    pslldq_i2r(6, xmm7);               /* xmm7 = -- -- 46 40 -- -- -- -- */ \
+    \
+    movdqu_r2m(xmm3, *(eax+32));       /* write  60 45 41 27 22 14 07 05 */ \
+    por_r2r(xmm7, xmm0);               /* xmm0 = -- -- 46 40 -- 21 15 06 */ \
+    /* 0, 1, 2, 4 in use */    \
+    movdqu_m2r(*(eax + 48), xmm3);     /* xmm3 = 37 36 35 34 33 32 31 30 */ \
+    movdqu_m2r(*(eax + 80), xmm5);     /* xmm5 = 57 56 55 54 53 52 51 50 */ \
+    \
+    pmullw_m2r(*(ebx + 48), xmm3);     \
+    pmullw_m2r(*(ebx + 80), xmm5);     \
+    \
+    movdqu_m2r(*(ecx + 64), xmm6);     /* xmm6 = -- -- -- FF FF -- -- -- */ \
+    movdqu_m2r(*(ecx + 64), xmm7);     /* xmm7 = -- -- -- FF FF -- -- -- */ \
+    \
+    psrldq_i2r(8, xmm6);               /* xmm6 = -- -- -- -- -- -- -- FF */ \
+    pslldq_i2r(8, xmm7);               /* xmm7 = FF -- -- -- -- -- -- -- */ \
+    \
+    pand_r2r(xmm3, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 30 */ \
+    pand_r2r(xmm5, xmm7);              /* xmm7 = 57 -- -- -- -- -- -- -- */ \
+    \
+    pxor_r2r(xmm6, xmm3);              /* xmm3 = 37 36 35 34 33 32 31 -- */ \
+    pxor_r2r(xmm7, xmm5);              /* xmm5 = __ 56 55 54 53 52 51 50 */ \
+    \
+    pslldq_i2r(6, xmm6);               /* xmm6 = -- -- -- -- 30 -- -- -- */ \
+    psrldq_i2r(2, xmm7);               /* xmm7 = -- 57 -- -- -- -- -- -- */ \
+    \
+    por_r2r(xmm7, xmm6);               /* xmm6 = -- 57 -- -- 30 -- -- -- */ \
+    movdqu_m2r(*(ecx), xmm7);          /* xmm7 = -- -- -- -- -- FF FF -- */ \
+    \
+    por_r2r(xmm6, xmm0);               /* xmm0 = -- 57 46 40 30 21 15 06 */ \
+    psrldq_i2r(2, xmm7);               /* xmm7 = -- -- -- -- -- -- FF FF */ \
+    \
+    movdqu_r2r(xmm2, xmm6);            /* xmm6 = 17 16 -- -- -- -- -- -- */ \
+    pand_r2r(xmm1, xmm7);              /* xmm7 = -- -- -- -- -- -- 61 -- */ \
+    \
+    pslldq_i2r(2, xmm6);               /* xmm6 = 16 -- -- -- -- -- -- -- */ \
+    psrldq_i2r(14, xmm2);              /* xmm2 = -- -- -- -- -- -- -- 17 */ \
+    \
+    pxor_r2r(xmm7, xmm1);              /* xmm1 = 67 66 65 64 63 62 -- -- */ \
+    pslldq_i2r(12, xmm7);              /* xmm7 = 61 -- -- -- -- -- -- -- */ \
+    \
+    psrldq_i2r(14, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 16 */ \
+    por_r2r(xmm6, xmm4);               /* xmm4 = -- -- -- 47 -- -- 20 16 */ \
+    \
+    por_r2r(xmm7, xmm0);               /* xmm0 = 61 57 46 40 30 21 15 06 */ \
+    movdqu_m2r(*(ecx), xmm6);          /* xmm6 = -- -- -- -- -- FF FF -- */ \
+    \
+    psrldq_i2r(2, xmm6);               /* xmm6 = -- -- -- -- -- -- FF FF */ \
+    movdqu_r2m(xmm0, *(eax+48));       /* write  61 57 46 40 30 21 15 06 */ \
+    /* 1, 2, 3, 4, 5 in use */\
+    movdqu_m2r(*(ecx), xmm0);          /* xmm0 = -- -- -- -- -- FF FF -- */ \
+    pand_r2r(xmm3, xmm6);              /* xmm6 = -- -- -- -- -- -- 31 -- */ \
+    \
+    movdqu_r2r(xmm3, xmm7);            /* xmm7 = 37 36 35 34 33 32 31 -- */ \
+    pxor_r2r(xmm6, xmm3);              /* xmm3 = 37 36 35 34 33 32 -- -- */ \
+    \
+    pslldq_i2r(2, xmm3);               /* xmm3 = 36 35 34 33 32 -- -- -- */ \
+    pand_r2r(xmm1, xmm0);              /* xmm0 = -- -- -- -- -- 62 -- -- */ \
+    \
+    psrldq_i2r(14, xmm7);              /* xmm7 = -- -- -- -- -- -- -- 37 */ \
+    pxor_r2r(xmm0, xmm1);              /* xmm1 = 67 66 65 64 63 -- -- -- */ \
+    \
+    por_r2r(xmm7, xmm6);               /* xmm6 = -- -- -- -- -- -- 31 37 */ \
+    movdqu_m2r(*(ecx + 64), xmm7);     /* xmm7 = -- -- -- FF FF -- -- -- */ \
+    \
+    pshuflw_r2r(xmm6, xmm6, 0x01E);    /* xmm6 = -- -- -- -- 37 31 -- -- */ \
+    pslldq_i2r(6, xmm7);               /* xmm7 = FF FF -- -- -- -- -- -- */ \
+    \
+    por_r2r(xmm6, xmm4);               /* xmm4 = -- -- -- 47 37 31 20 16 */ \
+    pand_r2r(xmm5, xmm7);              /* xmm7 = -- 56 -- -- -- -- -- -- */ \
+    \
+    pslldq_i2r(8, xmm0);               /* xmm0 = -- 62 -- -- -- -- -- -- */ \
+    pxor_r2r(xmm7, xmm5);              /* xmm5 = -- -- 55 54 53 52 51 50 */ \
+    \
+    psrldq_i2r(2, xmm7);               /* xmm7 = -- -- 56 -- -- -- -- -- */ \
+    \
+    pshufhw_r2r(xmm3, xmm3, 0x087);    /* xmm3 = 35 33 34 36 32 -- -- -- */ \
+    por_r2r(xmm7, xmm0);               /* xmm0 = -- 62 56 -- -- -- -- -- */ \
+    \
+    movdqu_m2r(*(eax + 112), xmm7);    /* xmm7 = 77 76 75 74 73 72 71 70 */ \
+    pmullw_m2r(*(ebx + 112), xmm7);     \
+    \
+    movdqu_m2r(*(ecx + 64), xmm6);     /* xmm6 = -- -- -- FF FF -- -- -- */ \
+    por_r2r(xmm0, xmm4);               /* xmm4 = -- 62 56 47 37 31 20 16 */ \
+    \
+    pshuflw_r2r(xmm7, xmm7, 0x0E1);    /* xmm7 = 77 76 75 74 73 72 70 71 */ \
+    psrldq_i2r(8, xmm6);               /* xmm6 = -- -- -- -- -- -- -- FF */ \
+    \
+    movdqu_m2r(*(ecx + 64), xmm0);     /* xmm0 = -- -- -- FF FF -- -- -- */ \
+    pand_r2r(xmm7, xmm6);              /* xmm6 = -- -- -- -- -- -- -- 71 */ \
+    \
+    pand_r2r(xmm3, xmm0);              /* xmm0 = -- -- -- 36 32 -- -- -- */ \
+    pxor_r2r(xmm6, xmm7);              /* xmm7 = 77 76 75 74 73 72 70 -- */ \
+    \
+    pxor_r2r(xmm0, xmm3);              /* xmm3 = 35 33 34 -- -- -- -- -- */ \
+    pslldq_i2r(14, xmm6);              /* xmm6 = 71 -- -- -- -- -- -- -- */ \
+    \
+    psrldq_i2r(4, xmm0);               /* xmm0 = -- -- -- -- -- 36 32 -- */ \
+    por_r2r(xmm6, xmm4);               /* xmm4 = 71 62 56 47 37 31 20 16 */ \
+    \
+    por_r2r(xmm0, xmm2);               /* xmm2 = -- -- -- -- -- 36 32 17 */ \
+    movdqu_r2m(xmm4, *(eax + 64));     /* write  71 62 56 47 37 31 20 16 */ \
+    /* 1, 2, 3, 5, 7 in use */ \
+    movdqu_m2r(*(ecx + 80), xmm6);     /* xmm6 = -- -- FF -- -- -- -- FF */ \
+    pshufhw_r2r(xmm7, xmm7, 0x0D2);    /* xmm7 = 77 75 74 76 73 72 70 __ */ \
+    \
+    movdqu_m2r(*(ecx), xmm4);          /* xmm4 = -- -- -- -- -- FF FF -- */ \
+    movdqu_m2r(*(ecx+48), xmm0);       /* xmm0 = -- -- -- -- FF -- -- -- */ \
+    \
+    pand_r2r(xmm5, xmm6);              /* xmm6 = -- -- 55 -- -- -- -- 50 */ \
+    pand_r2r(xmm7, xmm4);              /* xmm4 = -- -- -- -- -- 72 70 -- */ \
+    \
+    pand_r2r(xmm1, xmm0);              /* xmm0 = -- -- -- -- 63 -- -- -- */ \
+    pxor_r2r(xmm6, xmm5);              /* xmm5 = -- -- -- 54 53 52 51 -- */ \
+    \
+    pxor_r2r(xmm4, xmm7);              /* xmm7 = 77 75 74 76 73 -- -- -- */ \
+    pxor_r2r(xmm0, xmm1);              /* xmm1 = 67 66 65 64 -- -- -- -- */ \
+    \
+    pshuflw_r2r(xmm6, xmm6, 0x02B);    /* xmm6 = -- -- 55 -- 50 -- -- -- */ \
+    pslldq_i2r(10, xmm4);              /* xmm4 = 72 20 -- -- -- -- -- -- */ \
+    \
+    pshufhw_r2r(xmm6, xmm6, 0x0B1);    /* xmm6 = -- -- -- 55 50 -- -- -- */ \
+    pslldq_i2r(4, xmm0);               /* xmm0 = -- -- 63 -- -- -- -- -- */ \
+    \
+    por_r2r(xmm4, xmm6);               /* xmm6 = 72 70 -- 55 50 -- -- -- */ \
+    por_r2r(xmm0, xmm2);               /* xmm2 = -- -- 63 -- -- 36 32 17 */ \
+    \
+    por_r2r(xmm6, xmm2);               /* xmm2 = 72 70 64 55 50 36 32 17 */ \
+    pshufhw_r2r(xmm1, xmm1, 0x0C9);    /* xmm1 = 67 64 66 65 -- -- -- -- */ \
+    \
+    movdqu_r2r(xmm3, xmm6);            /* xmm6 = 35 33 34 -- -- -- -- -- */ \
+    movdqu_r2m(xmm2, *(eax+80));       /* write  72 70 64 55 50 36 32 17 */ \
+    \
+    psrldq_i2r(12, xmm6);              /* xmm6 = -- -- -- -- -- -- 35 33 */ \
+    pslldq_i2r(4, xmm3);               /* xmm3 = 34 -- -- -- -- -- -- -- */ \
+    \
+    pshuflw_r2r(xmm5, xmm5, 0x04E);    /* xmm5 = -- -- -- 54 51 -- 53 52 */ \
+    movdqu_r2r(xmm7, xmm4);            /* xmm4 = 77 75 74 76 73 -- -- -- */ \
+    \
+    movdqu_r2r(xmm5, xmm2);            /* xmm2 = -- -- -- 54 51 -- 53 52 */ \
+    psrldq_i2r(10, xmm7);              /* xmm7 = -- -- -- -- -- 77 75 74 */ \
+    \
+    pslldq_i2r(6, xmm4);               /* xmm4 = 76 73 -- -- -- -- -- -- */ \
+    pslldq_i2r(12, xmm2);              /* xmm2 = 53 52 -- -- -- -- -- -- */ \
+    \
+    movdqu_r2r(xmm1, xmm0);            /* xmm0 = 67 64 66 65 -- -- -- -- */ \
+    psrldq_i2r(12, xmm1);              /* xmm1 = -- -- -- -- -- -- 67 64 */ \
+    \
+    psrldq_i2r(6, xmm5);               /* xmm5 = -- -- -- -- -- -- 54 51 */ \
+    psrldq_i2r(14, xmm3);              /* xmm3 = -- -- -- -- -- -- -- 34 */ \
+    \
+    pslldq_i2r(10, xmm7);              /* xmm7 = 77 75 74 -- -- -- -- -- */ \
+    por_r2r(xmm6, xmm4);               /* xmm4 = 76 73 -- -- -- -- 35 33 */ \
+    \
+    psrldq_i2r(10, xmm2);              /* xmm2 = -- -- -- -- -- 53 52 -- */ \
+    pslldq_i2r(4, xmm0);               /* xmm0 = 66 65 -- -- -- -- -- -- */ \
+    \
+    pslldq_i2r(8, xmm1);               /* xmm1 = -- -- 67 64 -- -- -- -- */ \
+    por_r2r(xmm7, xmm3);               /* xmm3 = 77 75 74 -- -- -- -- 34 */ \
+    \
+    psrldq_i2r(6, xmm0);               /* xmm0 = -- -- -- 66 65 -- -- -- */ \
+    pslldq_i2r(4, xmm5);               /* xmm5 = -- -- -- -- 54 51 -- -- */ \
+    \
+    por_r2r(xmm1, xmm4);               /* xmm4 = 76 73 67 64 -- -- 35 33 */ \
+    por_r2r(xmm2, xmm3);               /* xmm3 = 77 75 74 -- -- 53 52 34 */ \
+    \
+    por_r2r(xmm5, xmm4);               /* xmm4 = 76 73 67 64 54 51 35 33 */ \
+    por_r2r(xmm0, xmm3);               /* xmm3 = 77 75 74 66 65 53 52 34 */ \
+    \
+    movdqu_r2m(xmm4, *(eax+96));       /* write  76 73 67 64 54 51 35 33 */ \
+    movdqu_r2m(xmm3, *(eax+112));      /* write  77 75 74 66 65 53 52 34 */ \
+    \
+} /* end of SSE2_Dequantize Macro */
+
+/* Initialisation entry point of the SSE2 VP3 DSP code: a no-op. */
+void vp3_dsp_init_sse2(void)
+{
+    return; /* no setup is performed */
+}
+
+/* Expand the SSE2 dequantize, row-IDCT, transpose and column-IDCT macros, in that order, over the given buffers. */
+static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
+     int16_t *output_data)
+{
+    unsigned char *input_bytes = (unsigned char *)input_data;
+    unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
+    unsigned char *dequant_const_bytes = (unsigned char *)SSE2_dequant_const;
+    unsigned char *output_data_bytes = (unsigned char *)output_data;
+    unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data;
+    unsigned char *Eight = (unsigned char *)eight_data; /* not named in this function's own text; presumably used by name inside the IDCT macros -- TODO confirm */
+    /* The macros address memory as eax/ebx/ecx/edx; bind those names to the byte pointers above (they are never #undef'ed afterwards). */
+#define eax input_bytes
+#define ebx dequant_matrix_bytes
+#define ecx dequant_const_bytes
+#define edx idct_data_bytes
+    /* Byte address of the i-th 16-byte unit: I() relative to eax, O() relative to ebx, C() 1-based relative to edx. */
+#define I(i) (eax + 16 * i)
+#define O(i) (ebx + 16 * i)
+#define C(i) (edx + 16 * (i-1))
+    /* ebx is the dequant matrix here; the dequantizer stores its results back through eax, i.e. into input_data. */
+    SSE2_Dequantize();
+    /* From here on ebx, and therefore O(), refers to output_data. */
+#undef ebx
+#define ebx output_data_bytes
+
+    SSE2_Row_IDCT();
+
+    SSE2_Transpose();
+        
+    SSE2_Column_IDCT();
+}
+
+/* Run the SSE2 VP3 IDCT and store the result, biased by 128 and clamped to 0..255, as an 8x8 block at dest. */
+void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, uint8_t *dest, int stride)
+{
+    int16_t transformed_data[64];
+    const int16_t *coeff = transformed_data;
+    int row, col;
+
+    /* coeff_count is accepted but not used by this implementation */
+    vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
+
+    /* one output row per iteration; stride is the distance between rows */
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            int level = coeff[col];
+            /* saturate to the signed 8-bit range before adding the bias */
+            if (level < -128)
+                level = -128;
+            else if (level > 127)
+                level = 127;
+            dest[col] = (uint8_t)(level + 128);
+        }
+        coeff += 8;
+        dest += stride;
+    }
+}
+
+/* Run the SSE2 VP3 IDCT and add the result to the 8x8 block at dest, clamping each sum to 0..255. */
+void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, uint8_t *dest, int stride)
+{
+    int16_t transformed_data[64];
+    int16_t *op;
+    int i, j;
+    int sample;   /* int, not int16_t: *dest + *op can exceed 32767 */
+
+    vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
+
+    /* place in final output (coeff_count is not used here) */
+    op = transformed_data;
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            sample = *dest + *op;
+            if (sample < 0)
+                *dest = 0;
+            else if (sample > 255)
+                *dest = 255;
+            else
+                *dest = (uint8_t)sample;
+            op++;
+            dest++;
+        }
+        dest += (stride - 8);
+    }
+}
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index a18645e33..14fdb1059 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -45,7 +45,10 @@
 #define LINE_BUF_HEIGHT (NB_TAPS * 4)
 
 struct ImgReSampleContext {
-    int iwidth, iheight, owidth, oheight, topBand, bottomBand, leftBand, rightBand;
+    int iwidth, iheight, owidth, oheight;
+    int topBand, bottomBand, leftBand, rightBand;
+    int padtop, padbottom, padleft, padright;
+    int pad_owidth, pad_oheight;
     int h_incr, v_incr;
     int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
     int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
@@ -532,6 +535,7 @@ static void component_resample(ImgReSampleContext *s,
                        &s->v_filters[phase_y][0]);
             
         src_y += s->v_incr;
+        
         output += owrap;
     }
 }
@@ -572,13 +576,16 @@ static void build_filter(int16_t *filter, float factor)
 ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                       int iwidth, int iheight)
 {
-	return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0);
+    /* no bands and no padding: all eight trailing arguments are zero */
+    return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
                                       int iwidth, int iheight,
                                       int topBand, int bottomBand,
-                                      int leftBand, int rightBand)
+        int leftBand, int rightBand,
+        int padtop, int padbottom,
+        int padleft, int padright)
 {
     ImgReSampleContext *s;
 
@@ -593,19 +600,30 @@ ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
     s->oheight = oheight;
     s->iwidth = iwidth;
     s->iheight = iheight;
+  
     s->topBand = topBand;
     s->bottomBand = bottomBand;
     s->leftBand = leftBand;
     s->rightBand = rightBand;
     
-    s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth;
-    s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight;
-    
-    build_filter(&s->h_filters[0][0], (float) owidth  / (float) (iwidth - leftBand - rightBand));
-    build_filter(&s->v_filters[0][0], (float) oheight / (float) (iheight - topBand - bottomBand));
+    s->padtop = padtop;
+    s->padbottom = padbottom;
+    s->padleft = padleft;
+    s->padright = padright;
+
+    s->pad_owidth = owidth - (padleft + padright);
+    s->pad_oheight = oheight - (padtop + padbottom);
+
+    s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth;
+    s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / s->pad_oheight; 
+
+    build_filter(&s->h_filters[0][0], (float) s->pad_owidth  / 
+            (float) (iwidth - leftBand - rightBand));
+    build_filter(&s->v_filters[0][0], (float) s->pad_oheight / 
+            (float) (iheight - topBand - bottomBand));
 
     return s;
- fail:
+fail:
     av_free(s);
     return NULL;
 }
@@ -614,13 +632,20 @@ void img_resample(ImgReSampleContext *s,
                   AVPicture *output, const AVPicture *input)
 {
     int i, shift;
+    uint8_t* optr; /* where the resampled data of the current plane starts */
 
-    for(i=0;i<3;i++) {
+    for (i=0;i<3;i++) {
         shift = (i == 0) ? 0 : 1;
-        component_resample(s, output->data[i], output->linesize[i], 
-                           s->owidth >> shift, s->oheight >> shift,
-                           input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift),
-                           input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift),
+        /* shift the pad counts, not the byte offset, as done for the input bands below */
+        optr = output->data[i] + (output->linesize[i] *
+                        (s->padtop >> shift)) + (s->padleft >> shift);
+
+        component_resample(s, optr, output->linesize[i], 
+                s->pad_owidth >> shift, s->pad_oheight >> shift,
+                input->data[i] + (input->linesize[i] * 
+                    (s->topBand >> shift)) + (s->leftBand >> shift),
+                input->linesize[i], ((s->iwidth - s->leftBand - 
+                        s->rightBand) >> shift),
                            (s->iheight - s->topBand - s->bottomBand) >> shift);
     }
 }
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index 30029d40c..255a82d2c 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -862,13 +862,11 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
     memset(&s2, 0, sizeof(MpegEncContext));
     s2.avctx= avctx;
 //    s2->out_format = FMT_MJPEG;
-    s2.width = 8;
-    s2.height = 8;
-    if (MPV_common_init(&s2) < 0)
-       return -1;
+    dsputil_init(&s2.dsp, avctx);
+    DCT_common_init(&s2);
+
     s->scantable= s2.intra_scantable;
     s->idct_put= s2.dsp.idct_put;
-    MPV_common_end(&s2);
 
     s->mpeg_enc_ctx_allocated = 0;
     s->buffer_size = 102400; /* smaller buffer should be enough,
@@ -1532,15 +1530,22 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
     
     if (id == ff_get_fourcc("JFIF"))
     {
-	int t_w, t_h;
+	int t_w, t_h, v1, v2;
 	skip_bits(&s->gb, 8); /* the trailing zero-byte */
-	av_log(s->avctx, AV_LOG_INFO, "mjpeg: JFIF header found (version: %x.%x)\n",
-	    get_bits(&s->gb, 8), get_bits(&s->gb, 8));
+	v1= get_bits(&s->gb, 8);
+        v2= get_bits(&s->gb, 8);
         skip_bits(&s->gb, 8);
 
         s->avctx->sample_aspect_ratio.num= get_bits(&s->gb, 16);
         s->avctx->sample_aspect_ratio.den= get_bits(&s->gb, 16);
 
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "mjpeg: JFIF header found (version: %x.%x) SAR=%d/%d\n",
+                v1, v2,
+                s->avctx->sample_aspect_ratio.num,
+                s->avctx->sample_aspect_ratio.den
+            );
+
 	t_w = get_bits(&s->gb, 8);
 	t_h = get_bits(&s->gb, 8);
 	if (t_w && t_h)
@@ -1555,7 +1560,8 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
     
     if (id == ff_get_fourcc("Adob") && (get_bits(&s->gb, 8) == 'e'))
     {
-	av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "mjpeg: Adobe header found\n");
 	skip_bits(&s->gb, 16); /* version */
 	skip_bits(&s->gb, 16); /* flags0 */
 	skip_bits(&s->gb, 16); /* flags1 */
@@ -1565,7 +1571,8 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
     }
 
     if (id == ff_get_fourcc("LJIF")){
-        av_log(s->avctx, AV_LOG_INFO, "Pegasus lossless jpeg header found\n");
+        if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_INFO, "Pegasus lossless jpeg header found\n");
 	skip_bits(&s->gb, 16); /* version ? */
 	skip_bits(&s->gb, 16); /* unknwon always 0? */
 	skip_bits(&s->gb, 16); /* unknwon always 0? */
@@ -1604,7 +1611,7 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
 	    skip_bits(&s->gb, 32); /* scan off */
 	    skip_bits(&s->gb, 32); /* data off */
 #endif
-	    if (s->first_picture)
+            if (s->avctx->debug & FF_DEBUG_PICT_INFO)
 		av_log(s->avctx, AV_LOG_INFO, "mjpeg: Apple MJPEG-A header found\n");
 	}
     }
@@ -1635,7 +1642,8 @@ static int mjpeg_decode_com(MJpegDecodeContext *s)
 	    else
 		cbuf[i] = 0;
 
-	    av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
+            if(s->avctx->debug & FF_DEBUG_PICT_INFO)
+                av_log(s->avctx, AV_LOG_INFO, "mjpeg comment: '%s'\n", cbuf);
 
 	    /* buggy avid, it puts EOI only at every 10th frame */
 	    if (!strcmp(cbuf, "AVID"))
@@ -1781,13 +1789,12 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
 		/* process markers */
 		if (start_code >= 0xd0 && start_code <= 0xd7) {
 		    dprintf("restart marker: %d\n", start_code&0x0f);
-		} else if (s->first_picture) {
 		    /* APP fields */
-		    if (start_code >= 0xe0 && start_code <= 0xef)
-			mjpeg_decode_app(s);
+		} else if (start_code >= APP0 && start_code <= APP15) {
+		    mjpeg_decode_app(s);
 		    /* Comment */
-		    else if (start_code == COM)
-			mjpeg_decode_com(s);
+		} else if (start_code == COM){
+		    mjpeg_decode_com(s);
 		}
 
                 switch(start_code) {
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 5132487cf..f194a4d60 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -33,8 +33,8 @@
 #include "dsputil.h"
 #include "mpegvideo.h"
 
-//#undef NDEBUG
-//#include <assert.h>
+#undef NDEBUG
+#include <assert.h>
 
 #define SQ(a) ((a)*(a))
 
@@ -46,9 +46,8 @@
 
 static inline int sad_hpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-                                  int pred_x, int pred_y, uint8_t *src_data[3],
-                                  uint8_t *ref_data[6], int stride, int uvstride,
-                                  int size, int h, uint8_t * const mv_penalty);
+                                  int src_index, int ref_index,
+                                  int size, int h);
 
 static inline int update_map_generation(MpegEncContext * s)
 {
@@ -73,209 +72,153 @@ static int minima_cmp(const void *a, const void *b){
     
     return da->height - db->height;
 }
-                                  
-/* SIMPLE */
-#define RENAME(a) simple_ ## a
 
-#define CMP(d, x, y, size)\
-d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride, h);
+#define FLAG_QPEL   1 //must be 1: cmp() adds (flags&FLAG_QPEL) to its shift counts
+#define FLAG_CHROMA 2 //cmp() adds the chroma-plane scores when set
+#define FLAG_DIRECT 4 //cmp() takes its direct-prediction branch when set
 
-#define CMP_HPEL(d, dx, dy, x, y, size)\
-{\
-    const int dxy= (dx) + 2*(dy);\
-    hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, h);\
-    d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
-}
-
-
-#define CMP_QPEL(d, dx, dy, x, y, size)\
-{\
-    const int dxy= (dx) + 4*(dy);\
-    qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\
-    d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
+/* Record in s->me the plane pointers of src and ref, each advanced to     */
+/* position (x,y), as src[0] and ref[0]; when ref_index is non-zero the   */
+/* equally advanced ref2 pointers are stored as ref[ref_index] as well.   */
+static inline void init_ref(MpegEncContext *s, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
+    MotionEstContext * const me= &s->me;
+    const int luma_off  =   y*me->  stride + x;
+    const int chroma_off= ((y*me->uvstride + x)>>1); /* used for planes 1 and 2 */
+    int plane;
+    for(plane=0; plane<3; plane++){
+        const int off= plane ? chroma_off : luma_off;
+        me->src[0][plane]= src[plane] + off;
+        me->ref[0][plane]= ref[plane] + off;
+    }
+    for(plane=0; ref_index && plane<3; plane++){
+        const int off= plane ? chroma_off : luma_off;
+        me->ref[ref_index][plane]= ref2[plane] + off;
+    }
 }
 
-#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-
-/* SIMPLE CHROMA */
-#define RENAME(a) simple_chroma_ ## a
-
-#define CMP(d, x, y, size)\
-d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride, h);\
-if(chroma_cmp){\
-    int dxy= ((x)&1) + 2*((y)&1);\
-    int c= ((x)>>1) + ((y)>>1)*uvstride;\
-\
-    chroma_hpel_put[0][dxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\
-    d += chroma_cmp(s, s->me.scratchpad, src_u, uvstride, h>>1);\
-    chroma_hpel_put[0][dxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\
-    d += chroma_cmp(s, s->me.scratchpad, src_v, uvstride, h>>1);\
+static int get_flags(MpegEncContext *s, int direct, int chroma){ /* combine the FLAG_* values into one int */
+    int flags= (s->flags&CODEC_FLAG_QPEL) ? FLAG_QPEL : 0; /* qpel follows the codec flag */
+    if(direct) flags+= FLAG_DIRECT;
+    return chroma ? flags + FLAG_CHROMA : flags;
 }
 
-#define CMP_HPEL(d, dx, dy, x, y, size)\
-{\
-    const int dxy= (dx) + 2*(dy);\
-    hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, h);\
-    d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
-    if(chroma_cmp_sub){\
-        int cxy= (dxy) | ((x)&1) | (2*((y)&1));\
-        int c= ((x)>>1) + ((y)>>1)*uvstride;\
-        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\
-        d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride, h>>1);\
-        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\
-        d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride, h>>1);\
-    }\
-}
+static always_inline int cmp(MpegEncContext *s, const int x, const int y, const int subx, const int suby,
+                      const int size, const int h, int ref_index, int src_index,
+                      me_cmp_func cmp_func, me_cmp_func chroma_cmp_func, const int flags){ /* returns the cmp_func score of src[src_index] against ref[ref_index] at full-pel (x,y) plus sub-pel (subx,suby) */
+    MotionEstContext * const c= &s->me;
+    const int stride= c->stride;
+    const int uvstride= c->uvstride;
+    const int qpel= flags&FLAG_QPEL; /* 0 or 1, because FLAG_QPEL is 1 */
+    const int chroma= flags&FLAG_CHROMA;
+    const int dxy= subx + (suby<<(1+qpel)); //FIXME log2_subpel?
+    const int hx= subx + (x<<(1+qpel)); /* x in half-pel units, quarter-pel when qpel is set */
+    const int hy= suby + (y<<(1+qpel)); /* same for y */
+    uint8_t * const * const ref= c->ref[ref_index];
+    uint8_t * const * const src= c->src[src_index];
+    int d;
+    //FIXME check chroma 4mv, (no crashes ...)
+    if(flags&FLAG_DIRECT){ /* a block is put from ref[0], a second one from ref[8] is averaged into it, and the result is scored */
+        if(x >= c->xmin && hx <= c->xmax<<(qpel+1) && y >= c->ymin && hy <= c->ymax<<(qpel+1)){
+            const int time_pp= s->pp_time;
+            const int time_pb= s->pb_time;
+            const int mask= 2*qpel+1; /* selects the sub-pel bits of a vector component */
+            if(s->mv_type==MV_TYPE_8X8){
+                int i;
+                for(i=0; i<4; i++){
+                    int fx = c->direct_basis_mv[i][0] + hx;
+                    int fy = c->direct_basis_mv[i][1] + hy;
+                    int bx = hx ? fx - c->co_located_mv[i][0] : c->co_located_mv[i][0]*(time_pb - time_pp)/time_pp + ((i &1)<<(qpel+4));
+                    int by = hy ? fy - c->co_located_mv[i][1] : c->co_located_mv[i][1]*(time_pb - time_pp)/time_pp + ((i>>1)<<(qpel+4));
+                    int fxy= (fx&mask) + ((fy&mask)<<(qpel+1));
+                    int bxy= (bx&mask) + ((by&mask)<<(qpel+1));
+                    /* quadrant i is built at offset 8*(i&1) + 8*stride*(i>>1) inside c->temp */
+                    uint8_t *dst= c->temp + 8*(i&1) + 8*stride*(i>>1);
+                    if(qpel){
+                        c->qpel_put[1][fxy](dst, ref[0] + (fx>>2) + (fy>>2)*stride, stride);
+                        c->qpel_avg[1][bxy](dst, ref[8] + (bx>>2) + (by>>2)*stride, stride);
+                    }else{
+                        c->hpel_put[1][fxy](dst, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 8);
+                        c->hpel_avg[1][bxy](dst, ref[8] + (bx>>1) + (by>>1)*stride, stride, 8);
+                    }
+                }
+            }else{
+                int fx = c->direct_basis_mv[0][0] + hx;
+                int fy = c->direct_basis_mv[0][1] + hy;
+                int bx = hx ? fx - c->co_located_mv[0][0] : (c->co_located_mv[0][0]*(time_pb - time_pp)/time_pp);
+                int by = hy ? fy - c->co_located_mv[0][1] : (c->co_located_mv[0][1]*(time_pb - time_pp)/time_pp);
+                int fxy= (fx&mask) + ((fy&mask)<<(qpel+1));
+                int bxy= (bx&mask) + ((by&mask)<<(qpel+1));
+                /* NOTE(review): ref[8] lies beyond the 3 plane pointers init_ref() stores per c->ref[] entry; presumably it reaches a later entry -- confirm against MotionEstContext */
+                if(qpel){
+                    c->qpel_put[1][fxy](c->temp               , ref[0] + (fx>>2) + (fy>>2)*stride               , stride);
+                    c->qpel_put[1][fxy](c->temp + 8           , ref[0] + (fx>>2) + (fy>>2)*stride + 8           , stride);
+                    c->qpel_put[1][fxy](c->temp     + 8*stride, ref[0] + (fx>>2) + (fy>>2)*stride     + 8*stride, stride);
+                    c->qpel_put[1][fxy](c->temp + 8 + 8*stride, ref[0] + (fx>>2) + (fy>>2)*stride + 8 + 8*stride, stride);
+                    c->qpel_avg[1][bxy](c->temp               , ref[8] + (bx>>2) + (by>>2)*stride               , stride);
+                    c->qpel_avg[1][bxy](c->temp + 8           , ref[8] + (bx>>2) + (by>>2)*stride + 8           , stride);
+                    c->qpel_avg[1][bxy](c->temp     + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride     + 8*stride, stride);
+                    c->qpel_avg[1][bxy](c->temp + 8 + 8*stride, ref[8] + (bx>>2) + (by>>2)*stride + 8 + 8*stride, stride);
+                }else{            
+                    assert((fx>>1) + 16*s->mb_x >= -16);
+                    assert((fy>>1) + 16*s->mb_y >= -16);
+                    assert((fx>>1) + 16*s->mb_x <= s->width);
+                    assert((fy>>1) + 16*s->mb_y <= s->height);
+                    assert((bx>>1) + 16*s->mb_x >= -16);
+                    assert((by>>1) + 16*s->mb_y >= -16);
+                    assert((bx>>1) + 16*s->mb_x <= s->width);
+                    assert((by>>1) + 16*s->mb_y <= s->height);
+                    /* half-pel case: one call with height 16 for each of the two references */
+                    c->hpel_put[0][fxy](c->temp, ref[0] + (fx>>1) + (fy>>1)*stride, stride, 16);
+                    c->hpel_avg[0][bxy](c->temp, ref[8] + (bx>>1) + (by>>1)*stride, stride, 16);
+                }
+            }
+            d = cmp_func(s, c->temp, src[0], stride, 16);
+        }else
+            d= 256*256*256*32; /* position fails the xmin/xmax/ymin/ymax test above: fixed large score */
+    }else{
+        int uvdxy;
+        if(dxy){ /* sub-pel position: interpolate into c->temp, then score c->temp */
+            if(qpel){
+                c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
+                if(chroma){
+                    int cx= hx/2;
+                    int cy= hy/2;
+                    cx= (cx>>1)|(cx&1);
+                    cy= (cy>>1)|(cy&1);
+                    uvdxy= (cx&1) + 2*(cy&1);
+                    //FIXME x/y wrong, but mpeg4 qpel is sick anyway, we should drop as much of it as possible in favor for h264
+                }
+            }else{
+                c->hpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride, h);
+                if(chroma)
+                    uvdxy= dxy | (x&1) | (2*(y&1));
+            }
+            d = cmp_func(s, c->temp, src[0], stride, h); 
+        }else{
+            d = cmp_func(s, src[0], ref[0] + x + y*stride, stride, h); /* full-pel: score the reference directly, no copy */
+            if(chroma)
+                uvdxy= (x&1) + 2*(y&1);
+        }
+        if(chroma){
+            uint8_t * const uvtemp= c->temp + 16*stride; /* chroma scratch starts 16*stride bytes into c->temp */
+            c->hpel_put[size+1][uvdxy](uvtemp  , ref[1] + (x>>1) + (y>>1)*uvstride, uvstride, h>>1);
+            c->hpel_put[size+1][uvdxy](uvtemp+8, ref[2] + (x>>1) + (y>>1)*uvstride, uvstride, h>>1);
+            d += chroma_cmp_func(s, uvtemp  , src[1], uvstride, h>>1); 
+            d += chroma_cmp_func(s, uvtemp+8, src[2], uvstride, h>>1); 
+        }
+    }
+#if 0 /* compiled out: score-map store and motion-vector penalty */
+    if(full_pel){
+        const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);
+        score_map[index]= d;
+    }
 
-#define CMP_QPEL(d, dx, dy, x, y, size)\
-{\
-    const int dxy= (dx) + 4*(dy);\
-    qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\
-    d = cmp_sub(s, s->me.scratchpad, src_y, stride, h);\
-    if(chroma_cmp_sub){\
-        int cxy, c;\
-        int cx= (4*(x) + (dx))/2;\
-        int cy= (4*(y) + (dy))/2;\
-        cx= (cx>>1)|(cx&1);\
-        cy= (cy>>1)|(cy&1);\
-        cxy= (cx&1) + 2*(cy&1);\
-        c= ((cx)>>1) + ((cy)>>1)*uvstride;\
-        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, h>>1);\
-        d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride, h>>1);\
-        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, h>>1);\
-        d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride, h>>1);\
-    }\
+    d += (c->mv_penalty[hx - c->pred_x] + c->mv_penalty[hy - c->pred_y])*c->penalty_factor;
+#endif
+    return d;
 }
 
 #include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-
-/* SIMPLE DIRECT HPEL */
-#define RENAME(a) simple_direct_hpel_ ## a
-//FIXME precalc divisions stuff
-
-#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\
-if((x) >= xmin && 2*(x) + (dx) <= 2*xmax && (y) >= ymin && 2*(y) + (dy) <= 2*ymax){\
-    const int hx= 2*(x) + (dx);\
-    const int hy= 2*(y) + (dy);\
-    if(s->mv_type==MV_TYPE_8X8){\
-        int i;\
-        for(i=0; i<4; i++){\
-            int fx = s->me.direct_basis_mv[i][0] + hx;\
-            int fy = s->me.direct_basis_mv[i][1] + hy;\
-            int bx = hx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\
-            int by = hy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\
-            int fxy= (fx&1) + 2*(fy&1);\
-            int bxy= (bx&1) + 2*(by&1);\
-\
-            uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\
-            hpel_put[1][fxy](dst, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 8);\
-            hpel_avg[1][bxy](dst, (ref_data[3]) + (bx>>1) + (by>>1)*(stride), stride, 8);\
-        }\
-    }else{\
-        int fx = s->me.direct_basis_mv[0][0] + hx;\
-        int fy = s->me.direct_basis_mv[0][1] + hy;\
-        int bx = hx ? fx - s->me.co_located_mv[0][0] : (s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp);\
-        int by = hy ? fy - s->me.co_located_mv[0][1] : (s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp);\
-        int fxy= (fx&1) + 2*(fy&1);\
-        int bxy= (bx&1) + 2*(by&1);\
-        \
-        assert((fx>>1) + 16*s->mb_x >= -16);\
-        assert((fy>>1) + 16*s->mb_y >= -16);\
-        assert((fx>>1) + 16*s->mb_x <= s->width);\
-        assert((fy>>1) + 16*s->mb_y <= s->height);\
-        assert((bx>>1) + 16*s->mb_x >= -16);\
-        assert((by>>1) + 16*s->mb_y >= -16);\
-        assert((bx>>1) + 16*s->mb_x <= s->width);\
-        assert((by>>1) + 16*s->mb_y <= s->height);\
-\
-        hpel_put[0][fxy](s->me.scratchpad, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 16);\
-        hpel_avg[0][bxy](s->me.scratchpad, (ref_data[3]) + (bx>>1) + (by>>1)*(stride), stride, 16);\
-    }\
-    d = cmp_func(s, s->me.scratchpad, src_y, stride, 16);\
-}else\
-    d= 256*256*256*32;
-
-
-#define CMP_HPEL(d, dx, dy, x, y, size)\
-    CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub)
-
-#define CMP(d, x, y, size)\
-    CMP_DIRECT(d, 0, 0, x, y, size, cmp)
-    
-#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-#undef CMP_DIRECT
-
-/* SIMPLE DIRECT QPEL */
-#define RENAME(a) simple_direct_qpel_ ## a
-
-#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\
-if((x) >= xmin && 4*(x) + (dx) <= 4*xmax && (y) >= ymin && 4*(y) + (dy) <= 4*ymax){\
-    const int qx= 4*(x) + (dx);\
-    const int qy= 4*(y) + (dy);\
-    if(s->mv_type==MV_TYPE_8X8){\
-        int i;\
-        for(i=0; i<4; i++){\
-            int fx = s->me.direct_basis_mv[i][0] + qx;\
-            int fy = s->me.direct_basis_mv[i][1] + qy;\
-            int bx = qx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\
-            int by = qy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\
-            int fxy= (fx&3) + 4*(fy&3);\
-            int bxy= (bx&3) + 4*(by&3);\
-\
-            uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\
-            qpel_put[1][fxy](dst, (ref_y ) + (fx>>2) + (fy>>2)*(stride), stride);\
-            qpel_avg[1][bxy](dst, (ref_data[3]) + (bx>>2) + (by>>2)*(stride), stride);\
-        }\
-    }else{\
-        int fx = s->me.direct_basis_mv[0][0] + qx;\
-        int fy = s->me.direct_basis_mv[0][1] + qy;\
-        int bx = qx ? fx - s->me.co_located_mv[0][0] : s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp;\
-        int by = qy ? fy - s->me.co_located_mv[0][1] : s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp;\
-        int fxy= (fx&3) + 4*(fy&3);\
-        int bxy= (bx&3) + 4*(by&3);\
-\
-        qpel_put[1][fxy](s->me.scratchpad               , (ref_y ) + (fx>>2) + (fy>>2)*(stride)               , stride);\
-        qpel_put[1][fxy](s->me.scratchpad + 8           , (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8           , stride);\
-        qpel_put[1][fxy](s->me.scratchpad     + 8*stride, (ref_y ) + (fx>>2) + (fy>>2)*(stride)     + 8*stride, stride);\
-        qpel_put[1][fxy](s->me.scratchpad + 8 + 8*stride, (ref_y ) + (fx>>2) + (fy>>2)*(stride) + 8 + 8*stride, stride);\
-        qpel_avg[1][bxy](s->me.scratchpad               , (ref_data[3]) + (bx>>2) + (by>>2)*(stride)               , stride);\
-        qpel_avg[1][bxy](s->me.scratchpad + 8           , (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8           , stride);\
-        qpel_avg[1][bxy](s->me.scratchpad     + 8*stride, (ref_data[3]) + (bx>>2) + (by>>2)*(stride)     + 8*stride, stride);\
-        qpel_avg[1][bxy](s->me.scratchpad + 8 + 8*stride, (ref_data[3]) + (bx>>2) + (by>>2)*(stride) + 8 + 8*stride, stride);\
-    }\
-    d = cmp_func(s, s->me.scratchpad, src_y, stride, 16);\
-}else\
-    d= 256*256*256*32;
-
-
-#define CMP_QPEL(d, dx, dy, x, y, size)\
-    CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub)
-
-#define CMP(d, x, y, size)\
-    CMP_DIRECT(d, 0, 0, x, y, size, cmp)
-
-#include "motion_est_template.c"
-#undef RENAME
-#undef CMP
-#undef CMP_HPEL
-#undef CMP_QPEL
-#undef INIT
-#undef CMP__DIRECT
 
 static inline int get_penalty_factor(MpegEncContext *s, int type){
     switch(type&0xFF){
@@ -297,54 +240,45 @@ static inline int get_penalty_factor(MpegEncContext *s, int type){
 }
 
 void ff_init_me(MpegEncContext *s){
+    MotionEstContext * const c= &s->me;
+    /* set up the compare functions, compare flags, sub-pel search routine, pixel tables, strides and temp buffer of s->me */
     ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, s->avctx->me_pre_cmp);
     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
     ff_set_cmp(&s->dsp, s->dsp.mb_cmp, s->avctx->mb_cmp);
+    
+    s->me.flags    = get_flags(s, 0, s->avctx->me_cmp    &FF_CMP_CHROMA);
+    s->me.sub_flags= get_flags(s, 0, s->avctx->me_sub_cmp&FF_CMP_CHROMA);
+    s->me.mb_flags = get_flags(s, 0, s->avctx->mb_cmp    &FF_CMP_CHROMA);
 
+/*FIXME s->no_rounding b_type*/
     if(s->flags&CODEC_FLAG_QPEL){
-        if(s->avctx->me_sub_cmp&FF_CMP_CHROMA)
-            s->me.sub_motion_search= simple_chroma_qpel_motion_search;
-        else
-            s->me.sub_motion_search= simple_qpel_motion_search;
+        s->me.sub_motion_search= qpel_motion_search;
+        c->qpel_avg= s->dsp.avg_qpel_pixels_tab;
+        if(s->no_rounding) c->qpel_put= s->dsp.put_no_rnd_qpel_pixels_tab;
+        else               c->qpel_put= s->dsp.put_qpel_pixels_tab;
     }else{
         if(s->avctx->me_sub_cmp&FF_CMP_CHROMA)
-            s->me.sub_motion_search= simple_chroma_hpel_motion_search;
+            s->me.sub_motion_search= hpel_motion_search;
         else if(   s->avctx->me_sub_cmp == FF_CMP_SAD 
                 && s->avctx->    me_cmp == FF_CMP_SAD 
                 && s->avctx->    mb_cmp == FF_CMP_SAD)
             s->me.sub_motion_search= sad_hpel_motion_search; // 2050 vs. 2450 cycles
         else
-            s->me.sub_motion_search= simple_hpel_motion_search;
+            s->me.sub_motion_search= hpel_motion_search;
     }
-
-    if(s->avctx->me_cmp&FF_CMP_CHROMA){
-        s->me.motion_search[0]= simple_chroma_epzs_motion_search;
-        s->me.motion_search[1]= simple_chroma_epzs_motion_search4;
-        s->me.motion_search[4]= simple_chroma_epzs_motion_search2;
+    c->hpel_avg= s->dsp.avg_pixels_tab; //set in qpel mode too: the chroma compare always goes through the hpel tables
+    if(s->no_rounding) c->hpel_put= s->dsp.put_no_rnd_pixels_tab;
+    else               c->hpel_put= s->dsp.put_pixels_tab;
+    if(s->linesize){
+        s->me.stride  = s->linesize; 
+        s->me.uvstride= s->uvlinesize;
     }else{
-        s->me.motion_search[0]= simple_epzs_motion_search;
-        s->me.motion_search[1]= simple_epzs_motion_search4;
-        s->me.motion_search[4]= simple_epzs_motion_search2;
-    }
-    
-    if(s->avctx->me_pre_cmp&FF_CMP_CHROMA){
-        s->me.pre_motion_search= simple_chroma_epzs_motion_search;
-    }else{
-        s->me.pre_motion_search= simple_epzs_motion_search;
-    }
-    
-    if(s->flags&CODEC_FLAG_QPEL){
-        if(s->avctx->mb_cmp&FF_CMP_CHROMA)
-            s->me.get_mb_score= simple_chroma_qpel_get_mb_score;
-        else
-            s->me.get_mb_score= simple_qpel_get_mb_score;
-    }else{
-        if(s->avctx->mb_cmp&FF_CMP_CHROMA)
-            s->me.get_mb_score= simple_chroma_hpel_get_mb_score;
-        else
-            s->me.get_mb_score= simple_hpel_get_mb_score;
+        s->me.stride  = 16*s->mb_width + 32; //s->linesize is still 0: derive the strides from mb_width instead
+        s->me.uvstride=  8*s->mb_width + 16;
     }
+
+    c->temp= c->scratchpad;
 }
       
 #if 0
@@ -611,18 +545,17 @@ static int phods_motion_search(MpegEncContext * s,
 
 static inline int sad_hpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-                                  int pred_x, int pred_y, uint8_t *src_data[3],
-                                  uint8_t *ref_data[6], int stride, int uvstride,                                  
-                                  int size, int h, uint8_t * const mv_penalty)
+                                  int src_index, int ref_index,
+                                  int size, int h)
 {
-    uint32_t *score_map= s->me.score_map;
     const int penalty_factor= s->me.sub_penalty_factor;
     int mx, my, dminh;
     uint8_t *pix, *ptr;
-    const int xmin= s->me.xmin;
-    const int ymin= s->me.ymin;
-    const int xmax= s->me.xmax;
-    const int ymax= s->me.ymax;
+    int stride= s->me.stride;
+    const int flags= s->me.sub_flags;
+    LOAD_COMMON
+    
+    assert(flags == 0);
 
     if(s->me.skip){
 //    printf("S");
@@ -632,11 +565,11 @@ static inline int sad_hpel_motion_search(MpegEncContext * s,
     }
 //    printf("N");
         
-    pix = src_data[0];
+    pix = s->me.src[src_index][0];
 
     mx = *mx_ptr;
     my = *my_ptr;
-    ptr = ref_data[0] + (my * stride) + mx;
+    ptr = s->me.ref[ref_index][0] + (my * stride) + mx;
     
     dminh = dmin;
 
@@ -733,7 +666,7 @@ static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my, int mv4)
         s->current_picture.motion_val[0][mot_xy+1][0]= mx;
         s->current_picture.motion_val[0][mot_xy+1][1]= my;
 
-        mot_xy += s->block_wrap[0];
+        mot_xy += s->b8_stride;
         s->current_picture.motion_val[0][mot_xy  ][0]= mx;
         s->current_picture.motion_val[0][mot_xy  ][1]= my;
         s->current_picture.motion_val[0][mot_xy+1][0]= mx;
@@ -763,41 +696,40 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
     }
 }
 
+/* Derive the plane-0 src/ref pointers of slots 1..3 from slot 0: +8 bytes, +8 lines, and both. */
+static inline void init_mv4_ref(MpegEncContext *s){
+    MotionEstContext * const me= &s->me;
+    const int eight_lines= 8*s->linesize; //byte offset of 8 picture lines
+    me->src[1][0] = me->src[0][0] + 8;
+    me->src[2][0] = me->src[0][0] + eight_lines;
+    me->src[3][0] = me->src[2][0] + 8;
+    me->ref[1][0] = me->ref[0][0] + 8;
+    me->ref[2][0] = me->ref[0][0] + eight_lines;
+    me->ref[3][0] = me->ref[2][0] + 8;
+}
+
 static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
 {
+    MotionEstContext * const c= &s->me;
     const int size= 1;
     const int h=8;
     int block;
     int P[10][2];
     int dmin_sum=0, mx4_sum=0, my4_sum=0;
-    uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
     int same=1;
     const int stride= s->linesize;
     const int uvstride= s->uvlinesize;
-    const int xmin= s->me.xmin;
-    const int ymin= s->me.ymin;
-    const int xmax= s->me.xmax;
-    const int ymax= s->me.ymax;
+    uint8_t *mv_penalty= s->me.current_mv_penalty;
 
+    init_mv4_ref(s);
+    
     for(block=0; block<4; block++){
         int mx4, my4;
         int pred_x4, pred_y4;
         int dmin4;
         static const int off[4]= {2, 1, 1, -1};
-        const int mot_stride = s->block_wrap[0];
+        const int mot_stride = s->b8_stride;
         const int mot_xy = s->block_index[block];
-        const int block_x= (block&1);
-        const int block_y= (block>>1);
-        uint8_t *src_data[3]= {
-            s->new_picture.data[0] + 8*(2*s->mb_x + block_x) + stride  *8*(2*s->mb_y + block_y), //FIXME chroma?
-            s->new_picture.data[1] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y),
-            s->new_picture.data[2] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y)
-        };
-        uint8_t *ref_data[3]= {
-            s->last_picture.data[0] + 8*(2*s->mb_x + block_x) + stride  *8*(2*s->mb_y + block_y), //FIXME chroma?
-            s->last_picture.data[1] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y),
-            s->last_picture.data[2] + 4*(2*s->mb_x + block_x) + uvstride*4*(2*s->mb_y + block_y)
-        };
 
         P_LEFT[0] = s->current_picture.motion_val[0][mot_xy - 1][0];
         P_LEFT[1] = s->current_picture.motion_val[0][mot_xy - 1][1];
@@ -806,8 +738,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
 
         /* special case for first line */
         if (s->first_slice_line && block<2) {
-            pred_x4= P_LEFT[0];
-            pred_y4= P_LEFT[1];
+            s->me.pred_x= pred_x4= P_LEFT[0];
+            s->me.pred_y= pred_y4= P_LEFT[1];
         } else {
             P_TOP[0]      = s->current_picture.motion_val[0][mot_xy - mot_stride             ][0];
             P_TOP[1]      = s->current_picture.motion_val[0][mot_xy - mot_stride             ][1];
@@ -821,32 +753,22 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
             P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
             P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
 
-//            if(s->out_format == FMT_H263){
-                pred_x4 = P_MEDIAN[0];
-                pred_y4 = P_MEDIAN[1];
-#if 0
-            }else { /* mpeg1 at least */
-                pred_x4= P_LEFT[0];
-                pred_y4= P_LEFT[1];
-            }
-#endif
+            s->me.pred_x= pred_x4 = P_MEDIAN[0];
+            s->me.pred_y= pred_y4 = P_MEDIAN[1];
         }
         P_MV1[0]= mx;
         P_MV1[1]= my;
 
-        dmin4 = s->me.motion_search[1](s, &mx4, &my4, P, pred_x4, pred_y4, 
-                                       src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty);
+        dmin4 = epzs_motion_search4(s, &mx4, &my4, P, block, block, s->p_mv_table, (1<<16)>>shift);
 
-        dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, 
-					  pred_x4, pred_y4, src_data, ref_data, stride, uvstride, size, h, mv_penalty);
+        dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h);
         
-        if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]
-           && s->avctx->mb_decision == FF_MB_DECISION_SIMPLE){
+        if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
             int dxy;
             const int offset= ((block&1) + (block>>1)*stride)*8;
             uint8_t *dest_y = s->me.scratchpad + offset;
             if(s->quarter_sample){
-                uint8_t *ref= ref_data[0] + (mx4>>2) + (my4>>2)*stride;
+                uint8_t *ref= c->ref[block][0] + (mx4>>2) + (my4>>2)*stride;
                 dxy = ((my4 & 3) << 2) | (mx4 & 3);
 
                 if(s->no_rounding)
@@ -854,7 +776,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
                 else
                     s->dsp.put_qpel_pixels_tab       [1][dxy](dest_y   , ref    , stride);
             }else{
-                uint8_t *ref= ref_data[0] + (mx4>>1) + (my4>>1)*stride;
+                uint8_t *ref= c->ref[block][0] + (mx4>>1) + (my4>>1)*stride;
                 dxy = ((my4 & 1) << 1) | (mx4 & 1);
 
                 if(s->no_rounding)
@@ -909,6 +831,9 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
         dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad  , s->uvlinesize, 8);
         dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad+8, s->uvlinesize, 8);
     }
+    
+    s->me.pred_x= mx;
+    s->me.pred_y= my;
 
     switch(s->avctx->mb_cmp&0xFF){
     /*case FF_CMP_SSE:
@@ -920,14 +845,28 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
     }
 }
 
-static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint8_t *frame_ref_data[3], 
-                             int16_t (*mv_tables[2][2])[2], uint8_t *field_select_tables[2], int f_code, int mx, int my)
+static inline void init_interlaced_ref(MpegEncContext *s, int ref_index){
+    MotionEstContext * const me= &s->me;
+    int plane; //planes 1 and 2 are offset by s->uvlinesize, plane 0 by s->linesize
+    /* src slot 1 and ref slot ref_index+1 start one line below src slot 0 / ref slot ref_index */
+    me->src[1][0]           = me->src[0][0]         + s->linesize;
+    me->ref[ref_index+1][0] = me->ref[ref_index][0] + s->linesize;
+    if(!(me->flags & FLAG_CHROMA)) return; //planes 1/2 are only set up when FLAG_CHROMA is set
+    for(plane=1; plane<3; plane++){
+        me->src[1][plane]           = me->src[0][plane]         + s->uvlinesize;
+        me->ref[ref_index+1][plane] = me->ref[ref_index][plane] + s->uvlinesize;
+    }
+}
+
+static int interlaced_search(MpegEncContext *s, int ref_index, 
+                             int16_t (*mv_tables[2][2])[2], uint8_t *field_select_tables[2], int mx, int my, int user_field_select)
 {
+    MotionEstContext * const c= &s->me;
     const int size=0;
     const int h=8;
     int block;
     int P[10][2];
-    uint8_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV;
+    uint8_t * const mv_penalty= c->current_mv_penalty;
     int same=1;
     const int stride= 2*s->linesize;
     const int uvstride= 2*s->uvlinesize;
@@ -935,45 +874,42 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
     const int mot_stride= s->mb_stride;
     const int xy= s->mb_x + s->mb_y*mot_stride;
     
-    s->me.ymin>>=1;
-    s->me.ymax>>=1;
+    c->ymin>>=1;
+    c->ymax>>=1;
+    c->stride<<=1;
+    c->uvstride<<=1;
+    init_interlaced_ref(s, ref_index);
     
     for(block=0; block<2; block++){
         int field_select;
         int best_dmin= INT_MAX;
         int best_field= -1;
 
-        uint8_t *src_data[3]= {
-            frame_src_data[0] + s->  linesize*block,
-            frame_src_data[1] + s->uvlinesize*block,
-            frame_src_data[2] + s->uvlinesize*block
-        };
-
         for(field_select=0; field_select<2; field_select++){
-            int dmin, mx_i, my_i, pred_x, pred_y;
-            uint8_t *ref_data[3]= {
-                frame_ref_data[0] + s->  linesize*field_select,
-                frame_ref_data[1] + s->uvlinesize*field_select,
-                frame_ref_data[2] + s->uvlinesize*field_select
-            };
+            int dmin, mx_i, my_i;
             int16_t (*mv_table)[2]= mv_tables[block][field_select];
             
+            if(user_field_select){
+                if(field_select_tables[block][xy] != field_select)
+                    continue;
+            }
+            
             P_LEFT[0] = mv_table[xy - 1][0];
             P_LEFT[1] = mv_table[xy - 1][1];
-            if(P_LEFT[0]       > (s->me.xmax<<1)) P_LEFT[0]       = (s->me.xmax<<1);
+            if(P_LEFT[0]       > (c->xmax<<1)) P_LEFT[0]       = (c->xmax<<1);
             
-            pred_x= P_LEFT[0];
-            pred_y= P_LEFT[1];
+            s->me.pred_x= P_LEFT[0];
+            s->me.pred_y= P_LEFT[1];
             
             if(!s->first_slice_line){
                 P_TOP[0]      = mv_table[xy - mot_stride][0];
                 P_TOP[1]      = mv_table[xy - mot_stride][1];
                 P_TOPRIGHT[0] = mv_table[xy - mot_stride + 1][0];
                 P_TOPRIGHT[1] = mv_table[xy - mot_stride + 1][1];
-                if(P_TOP[1]      > (s->me.ymax<<1)) P_TOP[1]     = (s->me.ymax<<1);
-                if(P_TOPRIGHT[0] < (s->me.xmin<<1)) P_TOPRIGHT[0]= (s->me.xmin<<1);
-                if(P_TOPRIGHT[0] > (s->me.xmax<<1)) P_TOPRIGHT[0]= (s->me.xmax<<1);
-                if(P_TOPRIGHT[1] > (s->me.ymax<<1)) P_TOPRIGHT[1]= (s->me.ymax<<1);
+                if(P_TOP[1]      > (c->ymax<<1)) P_TOP[1]     = (c->ymax<<1);
+                if(P_TOPRIGHT[0] < (c->xmin<<1)) P_TOPRIGHT[0]= (c->xmin<<1);
+                if(P_TOPRIGHT[0] > (c->xmax<<1)) P_TOPRIGHT[0]= (c->xmax<<1);
+                if(P_TOPRIGHT[1] > (c->ymax<<1)) P_TOPRIGHT[1]= (c->ymax<<1);
     
                 P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
                 P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
@@ -981,32 +917,29 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
             P_MV1[0]= mx; //FIXME not correct if block != field_select
             P_MV1[1]= my / 2;
             
-            dmin = s->me.motion_search[4](s, &mx_i, &my_i, P, pred_x, pred_y, 
-                                           src_data, ref_data, stride, uvstride, mv_table, (1<<16)>>1, mv_penalty);
+            dmin = epzs_motion_search2(s, &mx_i, &my_i, P, block, field_select+ref_index, mv_table, (1<<16)>>1);
 
-            dmin= s->me.sub_motion_search(s, &mx_i, &my_i, dmin, 
-                                           pred_x, pred_y, src_data, ref_data, stride, uvstride, size, h, mv_penalty);
+            dmin= c->sub_motion_search(s, &mx_i, &my_i, dmin, block, field_select+ref_index, size, h);
             
             mv_table[xy][0]= mx_i;
             mv_table[xy][1]= my_i;
             
-            if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]
-               && s->avctx->mb_decision == FF_MB_DECISION_SIMPLE){
+            if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){
                 int dxy;
 
                 //FIXME chroma ME
-                uint8_t *ref= ref_data[0] + (mx_i>>1) + (my_i>>1)*stride;
+                uint8_t *ref= c->ref[field_select+ref_index][0] + (mx_i>>1) + (my_i>>1)*stride;
                 dxy = ((my_i & 1) << 1) | (mx_i & 1);
 
                 if(s->no_rounding){
-                    s->dsp.put_no_rnd_pixels_tab[size][dxy](s->me.scratchpad, ref    , stride, h);
+                    s->dsp.put_no_rnd_pixels_tab[size][dxy](c->scratchpad, ref    , stride, h);
                 }else{
-                    s->dsp.put_pixels_tab       [size][dxy](s->me.scratchpad, ref    , stride, h);
+                    s->dsp.put_pixels_tab       [size][dxy](c->scratchpad, ref    , stride, h);
                 }
-                dmin= s->dsp.mb_cmp[size](s, src_data[0], s->me.scratchpad, stride, h);
-                dmin+= (mv_penalty[mx_i-pred_x] + mv_penalty[my_i-pred_y] + 1)*s->me.mb_penalty_factor;
+                dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h);
+                dmin+= (mv_penalty[mx_i-s->me.pred_x] + mv_penalty[my_i-s->me.pred_y] + 1)*c->mb_penalty_factor;
             }else
-                dmin+= s->me.mb_penalty_factor; //field_select bits
+                dmin+= c->mb_penalty_factor; //field_select bits
                 
             dmin += field_select != block; //slightly prefer same field
             
@@ -1028,8 +961,10 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
         dmin_sum += best_dmin;
     }
     
-    s->me.ymin<<=1;
-    s->me.ymax<<=1;
+    c->ymin<<=1;
+    c->ymax<<=1;
+    c->stride>>=1;
+    c->uvstride>>=1;
 
     if(same)
         return INT_MAX;
@@ -1040,44 +975,182 @@ static int interlaced_search(MpegEncContext *s, uint8_t *frame_src_data[3], uint
     case FF_CMP_RD:
         return dmin_sum;
     default:
-        return dmin_sum+ 11*s->me.mb_penalty_factor;
+        return dmin_sum+ 11*c->mb_penalty_factor;
     }
 }
 
+static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int p_type){
+    MotionEstContext * const c= &s->me;
+    Picture *p= s->current_picture_ptr;
+    int mb_xy= mb_x + mb_y*s->mb_stride;
+    int xy= 2*mb_x + 2*mb_y*s->b8_stride; //index of this MB's first entry in p->motion_val / p->ref_index
+    int mb_type= s->current_picture.mb_type[mb_xy];
+    int flags= c->flags;
+    int shift= (flags&FLAG_QPEL) + 1; //NOTE(review): relies on FLAG_QPEL having the value 1 - confirm
+    int mask= (1<<shift)-1; //selects the low bits of a vector component that are passed to cmp() separately
+    int x, y, i;
+    int d=0;
+    me_cmp_func cmpf= s->dsp.sse[0];
+    me_cmp_func chroma_cmpf= s->dsp.sse[1];
+    /* Copies the vectors already stored in the current picture for this MB into the encoder's mv tables, sets s->mb_type[mb_xy] accordingly and returns the summed cmp() score d (0 if nothing was compared). p_type!=0 selects the p_* tables, otherwise the b_* tables are filled. */
+    assert(p_type==0 || !USES_LIST(mb_type, 1));
+    assert(IS_INTRA(mb_type) || USES_LIST(mb_type,0) || USES_LIST(mb_type,1));
+    
+    if(IS_INTERLACED(mb_type)){
+        int xy2= xy  + s->b8_stride; //entry one b8 row below xy; supplies the second field's vector and field select
+        s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA; //default, overwritten below when list 0 or list 1 is used
+        c->stride<<=1; //strides are doubled for this branch and halved again at its end
+        c->uvstride<<=1;
+        
+        assert(s->flags & CODEC_FLAG_INTERLACED_ME);
+
+        if(USES_LIST(mb_type, 0)){
+            int field_select0= p->ref_index[0][xy ];
+            int field_select1= p->ref_index[0][xy2];
+            assert(field_select0==0 ||field_select0==1);
+            assert(field_select1==0 ||field_select1==1);
+            init_interlaced_ref(s, 0);
+
+            if(p_type){
+                s->p_field_select_table[0][mb_xy]= field_select0;
+                s->p_field_select_table[1][mb_xy]= field_select1;
+                *(uint32_t*)s->p_field_mv_table[0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[0][xy ]; //whole vector copied with one 32 bit access
+                *(uint32_t*)s->p_field_mv_table[1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[0][xy2];
+                s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER_I;
+            }else{
+                s->b_field_select_table[0][0][mb_xy]= field_select0;
+                s->b_field_select_table[0][1][mb_xy]= field_select1;
+                *(uint32_t*)s->b_field_mv_table[0][0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[0][xy ];
+                *(uint32_t*)s->b_field_mv_table[0][1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[0][xy2];
+                s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_FORWARD_I;
+            }
+
+            x= p->motion_val[0][xy ][0]; 
+            y= p->motion_val[0][xy ][1];
+            d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select0, 0, cmpf, chroma_cmpf, flags);
+            x= p->motion_val[0][xy2][0]; 
+            y= p->motion_val[0][xy2][1];
+            d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select1, 1, cmpf, chroma_cmpf, flags);
+        }
+        if(USES_LIST(mb_type, 1)){
+            int field_select0= p->ref_index[1][xy ];
+            int field_select1= p->ref_index[1][xy2];
+            assert(field_select0==0 ||field_select0==1);
+            assert(field_select1==0 ||field_select1==1);
+            init_interlaced_ref(s, 2);
+
+            s->b_field_select_table[1][0][mb_xy]= field_select0;
+            s->b_field_select_table[1][1][mb_xy]= field_select1;
+            *(uint32_t*)s->b_field_mv_table[1][0][field_select0][mb_xy]= *(uint32_t*)p->motion_val[1][xy ];
+            *(uint32_t*)s->b_field_mv_table[1][1][field_select1][mb_xy]= *(uint32_t*)p->motion_val[1][xy2];
+            if(USES_LIST(mb_type, 0)){
+                s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_BIDIR_I;
+            }else{
+                s->mb_type[mb_xy]= CANDIDATE_MB_TYPE_BACKWARD_I;
+            }
+
+            x= p->motion_val[1][xy ][0]; 
+            y= p->motion_val[1][xy ][1];
+            d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select0+2, 0, cmpf, chroma_cmpf, flags); //NOTE(review): plain assignment drops a list 0 score computed above
+            x= p->motion_val[1][xy2][0]; 
+            y= p->motion_val[1][xy2][1];
+            d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 8, field_select1+2, 1, cmpf, chroma_cmpf, flags);
+            //FIXME bidir scores
+        }
+        c->stride>>=1; //undo the doubling done at the top of this branch
+        c->uvstride>>=1;
+    }else if(IS_8X8(mb_type)){
+        assert(s->flags & CODEC_FLAG_4MV);
+        cmpf= s->dsp.sse[1]; //index 1, matching the size argument 1 passed to cmp() below
+        chroma_cmpf= s->dsp.sse[1];
+        init_mv4_ref(s);
+        for(i=0; i<4; i++){
+            xy= s->block_index[i];
+            x= p->motion_val[0][xy][0]; 
+            y= p->motion_val[0][xy][1];
+            d+= cmp(s, x>>shift, y>>shift, x&mask, y&mask, 1, 8, i, i, cmpf, chroma_cmpf, flags);
+        }
+        s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER4V;
+    }else{
+        if(USES_LIST(mb_type, 0)){
+            if(p_type){
+                *(uint32_t*)s->p_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy];
+                s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTER;
+            }else if(USES_LIST(mb_type, 1)){
+                *(uint32_t*)s->b_bidir_forw_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy];
+                *(uint32_t*)s->b_bidir_back_mv_table[mb_xy]= *(uint32_t*)p->motion_val[1][xy];
+                s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_BIDIR;
+            }else{
+                *(uint32_t*)s->b_forw_mv_table[mb_xy]= *(uint32_t*)p->motion_val[0][xy];
+                s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_FORWARD;
+            }
+            x= p->motion_val[0][xy][0]; 
+            y= p->motion_val[0][xy][1];
+            d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 16, 0, 0, cmpf, chroma_cmpf, flags); //only the list 0 vector is scored, also for the bidir case
+        }else if(USES_LIST(mb_type, 1)){
+            *(uint32_t*)s->b_back_mv_table[mb_xy]= *(uint32_t*)p->motion_val[1][xy];
+            s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_BACKWARD;
+           
+            x= p->motion_val[1][xy][0]; 
+            y= p->motion_val[1][xy][1];
+            d = cmp(s, x>>shift, y>>shift, x&mask, y&mask, 0, 16, 2, 0, cmpf, chroma_cmpf, flags); //8th argument 2 for list 1, presumably the same ref slot as in init_interlaced_ref(s, 2) above
+        }else
+            s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA;
+    }
+    return d;
+}
+
 void ff_estimate_p_frame_motion(MpegEncContext * s,
                                 int mb_x, int mb_y)
 {
+    MotionEstContext * const c= &s->me;
     uint8_t *pix, *ppix;
-    int sum, varc, vard, mx, my, dmin, xx, yy;
-    int pred_x=0, pred_y=0;
+    int sum, varc, vard, mx, my, dmin;
     int P[10][2];
     const int shift= 1+s->quarter_sample;
     int mb_type=0;
-    uint8_t *ref_picture= s->last_picture.data[0];
     Picture * const pic= &s->current_picture;
-    uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
-    const int stride= s->linesize;
-    const int uvstride= s->uvlinesize;
-    uint8_t *src_data[3]= {
-        s->new_picture.data[0] + 16*(mb_x + stride*mb_y),
-        s->new_picture.data[1] + 8*(mb_x + uvstride*mb_y),
-        s->new_picture.data[2] + 8*(mb_x + uvstride*mb_y)
-    };
-    uint8_t *ref_data[3]= {
-        s->last_picture.data[0] + 16*(mb_x + stride*mb_y),
-        s->last_picture.data[1] + 8*(mb_x + uvstride*mb_y),
-        s->last_picture.data[2] + 8*(mb_x + uvstride*mb_y)
-    };
+    
+    init_ref(s, s->new_picture.data, s->last_picture.data, NULL, 16*mb_x, 16*mb_y, 0);
 
     assert(s->quarter_sample==0 || s->quarter_sample==1);
+    assert(s->linesize == s->me.stride);
+    assert(s->uvlinesize == s->me.uvstride);
 
     s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
     s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
     s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp);
+    s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
     s->me.skip=0;
 
+    /* intra / predictive decision */
+    pix = c->src[0][0];
+    sum = s->dsp.pix_sum(pix, s->linesize);
+    varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+
+    pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
+    pic->mb_var [s->mb_stride * mb_y + mb_x] = varc;
+    s->mb_var_sum_temp += varc;
+
+    if(s->avctx->me_threshold){
+        vard= (check_input_motion(s, mb_x, mb_y, 1)+128)>>8;
+        
+        if(vard<s->avctx->me_threshold){
+            pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
+            s->mc_mb_var_sum_temp += vard;
+            if (vard <= 64 || vard < varc) { //FIXME
+                s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+            }else{
+                s->scene_change_score+= s->qscale;
+            }
+            return;
+        }
+        if(vard<s->avctx->mb_threshold)
+            mb_type= s->mb_type[mb_x + mb_y*s->mb_stride];
+    }
+
     switch(s->me_method) {
     case ME_ZERO:
     default:
@@ -1106,7 +1179,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     case ME_X1:
     case ME_EPZS:
        {
-            const int mot_stride = s->block_wrap[0];
+            const int mot_stride = s->b8_stride;
             const int mot_xy = s->block_index[0];
 
             P_LEFT[0]       = s->current_picture.motion_val[0][mot_xy - 1][0];
@@ -1127,51 +1200,58 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
                 P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
 
                 if(s->out_format == FMT_H263){
-                    pred_x = P_MEDIAN[0];
-                    pred_y = P_MEDIAN[1];
+                    c->pred_x = P_MEDIAN[0];
+                    c->pred_y = P_MEDIAN[1];
                 }else { /* mpeg1 at least */
-                    pred_x= P_LEFT[0];
-                    pred_y= P_LEFT[1];
+                    c->pred_x= P_LEFT[0];
+                    c->pred_y= P_LEFT[1];
                 }
             }else{
-                pred_x= P_LEFT[0];
-                pred_y= P_LEFT[1];
+                c->pred_x= P_LEFT[0];
+                c->pred_y= P_LEFT[1];
             }
 
         }
-        dmin = s->me.motion_search[0](s, &mx, &my, P, pred_x, pred_y, 
-                                      src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty);
- 
+        dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift);       
+
         break;
     }
 
-    /* intra / predictive decision */
-    xx = mb_x * 16;
-    yy = mb_y * 16;
-
-    pix = src_data[0];
     /* At this point (mx,my) are full-pell and the relative displacement */
-    ppix = ref_data[0] + (my * s->linesize) + mx;
-    
-    sum = s->dsp.pix_sum(pix, s->linesize);
-    
-    varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+    ppix = c->ref[0][0] + (my * s->linesize) + mx;
+        
     vard = (s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16)+128)>>8;
 
-//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
-    pic->mb_var   [s->mb_stride * mb_y + mb_x] = varc;
     pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
-    pic->mb_mean  [s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
 //    pic->mb_cmp_score[s->mb_stride * mb_y + mb_x] = dmin; 
-    s->mb_var_sum_temp    += varc;
     s->mc_mb_var_sum_temp += vard;
-//printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
     
 #if 0
     printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n",
 	   varc, s->avg_mb_var, sum, vard, mx - xx, my - yy);
 #endif
-    if(s->avctx->mb_decision > FF_MB_DECISION_SIMPLE){
+    if(mb_type){
+        if (vard <= 64 || vard < varc)
+            s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+        else
+            s->scene_change_score+= s->qscale;
+
+        if(mb_type == CANDIDATE_MB_TYPE_INTER){
+            s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
+            set_p_mv_tables(s, mx, my, 1);
+        }else{
+            mx <<=shift;
+            my <<=shift;
+        }
+        if(mb_type == CANDIDATE_MB_TYPE_INTER4V){
+            h263_mv4_search(s, mx, my, shift);
+
+            set_p_mv_tables(s, mx, my, 0);
+        }
+        if(mb_type == CANDIDATE_MB_TYPE_INTER_I){
+            interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 1);
+        }
+    }else if(s->avctx->mb_decision > FF_MB_DECISION_SIMPLE){
         if (vard <= 64 || vard < varc)
             s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
         else
@@ -1181,8 +1261,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             mb_type|= CANDIDATE_MB_TYPE_INTRA;
         if (varc*2 + 200 > vard){
             mb_type|= CANDIDATE_MB_TYPE_INTER;
-            s->me.sub_motion_search(s, &mx, &my, dmin,
-				   pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
+            s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
             if(s->flags&CODEC_FLAG_MV0)
                 if(mx || my)
                     mb_type |= CANDIDATE_MB_TYPE_SKIPED; //FIXME check difference
@@ -1200,17 +1279,16 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             set_p_mv_tables(s, mx, my, 1);
         if((s->flags&CODEC_FLAG_INTERLACED_ME)
            && !s->me.skip){ //FIXME varc/d checks
-            if(interlaced_search(s, src_data, ref_data, s->p_field_mv_table, s->p_field_select_table, s->f_code, mx, my) < INT_MAX)
+            if(interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 0) < INT_MAX)
                 mb_type |= CANDIDATE_MB_TYPE_INTER_I;
         }
     }else{
         int intra_score, i;
         mb_type= CANDIDATE_MB_TYPE_INTER;
 
-        dmin= s->me.sub_motion_search(s, &mx, &my, dmin,
-                                    pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
+        dmin= s->me.sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
         if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
-            dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, src_data, ref_data, stride, uvstride, mv_penalty);
+            dmin= get_mb_score(s, mx, my, 0, 0);
 
         if((s->flags&CODEC_FLAG_4MV)
            && !s->me.skip && varc>50 && vard>10){
@@ -1222,7 +1300,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
         }
         if((s->flags&CODEC_FLAG_INTERLACED_ME)
            && !s->me.skip){ //FIXME varc/d checks
-            int dmin_i= interlaced_search(s, src_data, ref_data, s->p_field_mv_table, s->p_field_select_table, s->f_code, mx, my);
+            int dmin_i= interlaced_search(s, 0, s->p_field_mv_table, s->p_field_select_table, mx, my, 0);
             if(dmin_i < dmin){
                 mb_type = CANDIDATE_MB_TYPE_INTER_I;
                 dmin= dmin_i;
@@ -1256,7 +1334,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
                 int mean;
                 
                 if(s->out_format == FMT_H263){
-                    mean= (s->dc_val[i][mb_x + (mb_y+1)*(s->mb_width+2)] + 4)>>3; //FIXME not exact but simple ;)
+                    mean= (s->dc_val[i][mb_x + mb_y*s->b8_stride] + 4)>>3; //FIXME not exact but simple ;)
                 }else{
                     mean= (s->last_dc[i] + 4)>>3;
                 }
@@ -1293,28 +1371,17 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
                                     int mb_x, int mb_y)
 {
+    MotionEstContext * const c= &s->me;
     int mx, my, dmin;
-    int pred_x=0, pred_y=0;
     int P[10][2];
     const int shift= 1+s->quarter_sample;
-    uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
     const int xy= mb_x + mb_y*s->mb_stride;
-    const int stride= s->linesize;
-    const int uvstride= s->uvlinesize;
-    uint8_t *src_data[3]= {
-        s->new_picture.data[0] + 16*(mb_x + stride*mb_y),
-        s->new_picture.data[1] + 8*(mb_x + uvstride*mb_y),
-        s->new_picture.data[2] + 8*(mb_x + uvstride*mb_y)
-    };
-    uint8_t *ref_data[3]= {
-        s->last_picture.data[0] + 16*(mb_x + stride*mb_y),
-        s->last_picture.data[1] + 8*(mb_x + uvstride*mb_y),
-        s->last_picture.data[2] + 8*(mb_x + uvstride*mb_y)
-    };
+    init_ref(s, s->new_picture.data, s->last_picture.data, NULL, 16*mb_x, 16*mb_y, 0);
     
     assert(s->quarter_sample==0 || s->quarter_sample==1);
 
     s->me.pre_penalty_factor    = get_penalty_factor(s, s->avctx->me_pre_cmp);
+    s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
     s->me.skip=0;
@@ -1326,8 +1393,8 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
 
     /* special case for first line */
     if (s->first_slice_line) {
-        pred_x= P_LEFT[0];
-        pred_y= P_LEFT[1];
+        c->pred_x= P_LEFT[0];
+        c->pred_y= P_LEFT[1];
         P_TOP[0]= P_TOPRIGHT[0]= P_MEDIAN[0]=
         P_TOP[1]= P_TOPRIGHT[1]= P_MEDIAN[1]= 0; //FIXME 
     } else {
@@ -1342,11 +1409,11 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
         P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
         P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
 
-        pred_x = P_MEDIAN[0];
-        pred_y = P_MEDIAN[1];
+        c->pred_x = P_MEDIAN[0];
+        c->pred_y = P_MEDIAN[1];
     }
-    dmin = s->me.pre_motion_search(s, &mx, &my, P, pred_x, pred_y, 
-                                   src_data, ref_data, stride, uvstride, s->p_mv_table, (1<<16)>>shift, mv_penalty);
+
+    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift);       
 
     s->p_mv_table[xy][0] = mx<<shift;
     s->p_mv_table[xy][1] = my<<shift;
@@ -1355,22 +1422,20 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
 }
 
 static int ff_estimate_motion_b(MpegEncContext * s,
-                       int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *src_data[3],
-                       uint8_t *ref_data[3], int stride, int uvstride, int f_code)
+                       int mb_x, int mb_y, int16_t (*mv_table)[2], int ref_index, int f_code)
 {
     int mx, my, dmin;
-    int pred_x=0, pred_y=0;
     int P[10][2];
     const int shift= 1+s->quarter_sample;
     const int mot_stride = s->mb_stride;
     const int mot_xy = mb_y*mot_stride + mb_x;
-    uint8_t * const ref_picture= ref_data[0] - 16*s->mb_x - 16*s->mb_y*s->linesize; //FIXME ugly
     uint8_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV;
     int mv_scale;
         
     s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
     s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
     s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp);
+    s->me.current_mv_penalty= mv_penalty;
 
     get_limits(s, 16*mb_x, 16*mb_y);
 
@@ -1420,8 +1485,8 @@ static int ff_estimate_motion_b(MpegEncContext * s,
                 P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
                 P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
             }
-            pred_x= P_LEFT[0];
-            pred_y= P_LEFT[1];
+            s->me.pred_x= P_LEFT[0];
+            s->me.pred_y= P_LEFT[1];
         }
         
         if(mv_table == s->b_forw_mv_table){
@@ -1430,17 +1495,15 @@ static int ff_estimate_motion_b(MpegEncContext * s,
             mv_scale= ((s->pb_time - s->pp_time)<<16) / (s->pp_time<<shift);
         }
         
-        dmin = s->me.motion_search[0](s, &mx, &my, P, pred_x, pred_y, 
-                                      src_data, ref_data, stride, uvstride, s->p_mv_table, mv_scale, mv_penalty);
+        dmin = epzs_motion_search(s, &mx, &my, P, 0, ref_index, s->p_mv_table, mv_scale);
  
         break;
     }
     
-    dmin= s->me.sub_motion_search(s, &mx, &my, dmin,
-				   pred_x, pred_y, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
+    dmin= s->me.sub_motion_search(s, &mx, &my, dmin, 0, ref_index, 0, 16);
                                    
     if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
-        dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, src_data, ref_data, stride, uvstride, mv_penalty);
+        dmin= get_mb_score(s, mx, my, 0, ref_index);
 
 //printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
 //    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
@@ -1450,8 +1513,7 @@ static int ff_estimate_motion_b(MpegEncContext * s,
     return dmin;
 }
 
-static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6],
-                   int stride, int uvstride,
+static inline int check_bidir_mv(MpegEncContext * s,
                    int motion_fx, int motion_fy,
                    int motion_bx, int motion_by,
                    int pred_fx, int pred_fy,
@@ -1459,15 +1521,20 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
                    int size, int h)
 {
     //FIXME optimize?
-    //FIXME move into template?
     //FIXME better f_code prediction (max mv & distance)
     //FIXME pointers
+    MotionEstContext * const c= &s->me;
     uint8_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    int stride= s->me.stride;
+    int uvstride= s->me.uvstride;
     uint8_t *dest_y = s->me.scratchpad;
     uint8_t *ptr;
     int dxy;
     int src_x, src_y;
     int fbmin;
+    uint8_t **src_data= c->src[0];
+    uint8_t **ref_data= c->ref[0];
+    uint8_t **ref2_data= c->ref[2];
 
     if(s->quarter_sample){
         dxy = ((motion_fy & 3) << 2) | (motion_fx & 3);
@@ -1481,7 +1548,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
         src_x = motion_bx >> 2;
         src_y = motion_by >> 2;
     
-        ptr = ref_data[3] + (src_y * stride) + src_x;
+        ptr = ref2_data[0] + (src_y * stride) + src_x;
         s->dsp.avg_qpel_pixels_tab[size][dxy](dest_y    , ptr    , stride);
     }else{
         dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
@@ -1495,7 +1562,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
         src_x = motion_bx >> 1;
         src_y = motion_by >> 1;
     
-        ptr = ref_data[3] + (src_y * stride) + src_x;
+        ptr = ref2_data[0] + (src_y * stride) + src_x;
         s->dsp.avg_pixels_tab[size][dxy](dest_y    , ptr    , stride, h);
     }
 
@@ -1511,9 +1578,7 @@ static inline int check_bidir_mv(MpegEncContext * s, uint8_t *src_data[3], uint8
 }
 
 /* refine the bidir vectors in hq mode and return the score in both lq & hq mode*/
-static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6],
-                                  int stride, int uvstride,
-                                  int mb_x, int mb_y)
+static inline int bidir_refine(MpegEncContext * s, int mb_x, int mb_y)
 {
     const int mot_stride = s->mb_stride;
     const int xy = mb_y *mot_stride + mb_x;
@@ -1529,8 +1594,7 @@ static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t
 
     //FIXME do refinement and add flag
     
-    fbmin= check_bidir_mv(s, src_data, ref_data, stride, uvstride,
-                          motion_fx, motion_fy,
+    fbmin= check_bidir_mv(s, motion_fx, motion_fy,
                           motion_bx, motion_by,
                           pred_fx, pred_fy,
                           pred_bx, pred_by,
@@ -1539,9 +1603,7 @@ static inline int bidir_refine(MpegEncContext * s, uint8_t *src_data[3], uint8_t
    return fbmin;
 }
 
-static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_t *ref_data[6],
-                                int stride, int uvstride,
-                                int mb_x, int mb_y)
+static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
 {
     int P[10][2];
     const int mot_stride = s->mb_stride;
@@ -1552,8 +1614,8 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_
     const int time_pb= s->pb_time;
     int mx, my, xmin, xmax, ymin, ymax;
     int16_t (*mv_table)[2]= s->b_direct_mv_table;
-    uint8_t * const mv_penalty= s->me.mv_penalty[1] + MAX_MV;
     
+    s->me.current_mv_penalty= s->me.mv_penalty[1] + MAX_MV;
     ymin= xmin=(-32)>>shift;
     ymax= xmax=   31>>shift;
 
@@ -1604,6 +1666,10 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_
     s->me.ymin= ymin;
     s->me.xmax= xmax;
     s->me.ymax= ymax;
+    s->me.flags     |= FLAG_DIRECT;
+    s->me.sub_flags |= FLAG_DIRECT;
+    s->me.pred_x=0;
+    s->me.pred_y=0;
 
     P_LEFT[0]        = clip(mv_table[mot_xy - 1][0], xmin<<shift, xmax<<shift);
     P_LEFT[1]        = clip(mv_table[mot_xy - 1][1], ymin<<shift, ymax<<shift);
@@ -1619,29 +1685,22 @@ static inline int direct_search(MpegEncContext * s, uint8_t *src_data[3], uint8_
         P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
     }
  
-    //FIXME direct_search  ptr in context!!! (needed for chroma anyway or this will get messy)   
-    if(s->flags&CODEC_FLAG_QPEL){
-        dmin = simple_direct_qpel_epzs_motion_search(s, &mx, &my, P, 0, 0, 
-                                                     src_data, ref_data, stride, uvstride, mv_table, 1<<14, mv_penalty);
-        dmin = simple_direct_qpel_qpel_motion_search(s, &mx, &my, dmin,
-                                                0, 0, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
-        
-        if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
-            dmin= simple_direct_qpel_qpel_get_mb_score(s, mx, my, 0, 0, src_data, ref_data, stride, uvstride, mv_penalty);
-    }else{
-        dmin = simple_direct_hpel_epzs_motion_search(s, &mx, &my, P, 0, 0, 
-                                                     src_data, ref_data, stride, uvstride, mv_table, 1<<15, mv_penalty);
-        dmin = simple_direct_hpel_hpel_motion_search(s, &mx, &my, dmin,
-                                                0, 0, src_data, ref_data, stride, uvstride, 0, 16, mv_penalty);
-
-        if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
-            dmin= simple_direct_hpel_hpel_get_mb_score(s, mx, my, 0, 0, src_data, ref_data, stride, uvstride, mv_penalty);
-    }
+    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, mv_table, 1<<(16-shift));
+    if(s->me.sub_flags&FLAG_QPEL) 
+        dmin = qpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
+    else
+        dmin = hpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
+    
+    if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip)
+        dmin= get_mb_score(s, mx, my, 0, 0);
     
     get_limits(s, 16*mb_x, 16*mb_y); //restore s->me.?min/max, maybe not needed
 
     s->b_direct_mv_table[mot_xy][0]= mx;
     s->b_direct_mv_table[mot_xy][1]= my;
+    s->me.flags     &= ~FLAG_DIRECT;
+    s->me.sub_flags &= ~FLAG_DIRECT;
+
     return dmin;
 }
 
@@ -1651,52 +1710,89 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
     const int penalty_factor= s->me.mb_penalty_factor;
     int fmin, bmin, dmin, fbmin, bimin, fimin;
     int type=0;
-    const int stride= s->linesize;
-    const int uvstride= s->uvlinesize;
-    uint8_t *src_data[3]= {
-        s->new_picture.data[0] + 16*(s->mb_x + stride*s->mb_y),
-        s->new_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y),
-        s->new_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y)
-    };
-    uint8_t *ref_data[6]= {
-        s->last_picture.data[0] + 16*(s->mb_x + stride*s->mb_y),
-        s->last_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y),
-        s->last_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y),
-        s->next_picture.data[0] + 16*(s->mb_x + stride*s->mb_y),
-        s->next_picture.data[1] + 8*(s->mb_x + uvstride*s->mb_y),
-        s->next_picture.data[2] + 8*(s->mb_x + uvstride*s->mb_y)
-    };
+    const int xy = mb_y*s->mb_stride + mb_x;
+    init_ref(s, s->new_picture.data, s->last_picture.data, s->next_picture.data, 16*mb_x, 16*mb_y, 2);
+
     
     s->me.skip=0;
+    if(s->avctx->me_threshold){
+        int vard= (check_input_motion(s, mb_x, mb_y, 0)+128)>>8;
+        
+        if(vard<s->avctx->me_threshold){
+//            pix = c->src[0][0];
+//            sum = s->dsp.pix_sum(pix, s->linesize);
+//            varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
+        
+//            pic->mb_var   [s->mb_stride * mb_y + mb_x] = varc;
+             s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] = vard;
+/*            pic->mb_mean  [s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
+            s->mb_var_sum_temp    += varc;*/
+            s->mc_mb_var_sum_temp += vard;
+/*            if (vard <= 64 || vard < varc) {
+                s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+            }else{
+                s->scene_change_score+= s->qscale;
+            }*/
+            return;
+        }
+        if(vard<s->avctx->mb_threshold){
+            type= s->mb_type[mb_y*s->mb_stride + mb_x];
+            if(type == CANDIDATE_MB_TYPE_DIRECT){
+                direct_search(s, mb_x, mb_y);
+            }
+            if(type == CANDIDATE_MB_TYPE_FORWARD || type == CANDIDATE_MB_TYPE_BIDIR){
+                s->me.skip=0;
+                ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, 0, s->f_code);
+            }
+            if(type == CANDIDATE_MB_TYPE_BACKWARD || type == CANDIDATE_MB_TYPE_BIDIR){
+                s->me.skip=0;
+                ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, 2, s->b_code);
+            }
+            if(type == CANDIDATE_MB_TYPE_FORWARD_I || type == CANDIDATE_MB_TYPE_BIDIR_I){
+                s->me.skip=0;
+                s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
+                interlaced_search(s, 0,
+                                        s->b_field_mv_table[0], s->b_field_select_table[0],
+                                        s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 1);
+            }
+            if(type == CANDIDATE_MB_TYPE_BACKWARD_I || type == CANDIDATE_MB_TYPE_BIDIR_I){
+                s->me.skip=0;
+                s->me.current_mv_penalty= s->me.mv_penalty[s->b_code] + MAX_MV;
+                interlaced_search(s, 2,
+                                        s->b_field_mv_table[1], s->b_field_select_table[1],
+                                        s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 1);
+            }
+            return;
+        }
+    }
+
     if (s->codec_id == CODEC_ID_MPEG4)
-        dmin= direct_search(s, src_data, ref_data, stride, uvstride, mb_x, mb_y);
+        dmin= direct_search(s, mb_x, mb_y);
     else
         dmin= INT_MAX;
 //FIXME penalty stuff for non mpeg4
     s->me.skip=0;
-    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, src_data, 
-                               ref_data, stride, uvstride, s->f_code) + 3*penalty_factor;
+    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, 0, s->f_code) + 3*penalty_factor;
     
     s->me.skip=0;
-    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, src_data, 
-                               ref_data+3, stride, uvstride, s->b_code) + 2*penalty_factor;
+    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, 2, s->b_code) + 2*penalty_factor;
 //printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
 
     s->me.skip=0;
-    fbmin= bidir_refine(s, src_data, ref_data, stride, uvstride, mb_x, mb_y) + penalty_factor;
+    fbmin= bidir_refine(s, mb_x, mb_y) + penalty_factor;
 //printf("%d %d %d %d\n", dmin, fmin, bmin, fbmin);
     
     if(s->flags & CODEC_FLAG_INTERLACED_ME){
-        const int xy = mb_y*s->mb_stride + mb_x;
-
 //FIXME mb type penalty
         s->me.skip=0;
-        fimin= interlaced_search(s, src_data, ref_data  , 
-                                 s->b_field_mv_table[0], s->b_field_select_table[0], s->f_code,
-                                 s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
-        bimin= interlaced_search(s, src_data, ref_data+3, 
-                                 s->b_field_mv_table[1], s->b_field_select_table[1], s->b_code,
-                                 s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1]);
+        s->me.current_mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
+        fimin= interlaced_search(s, 0,
+                                 s->b_field_mv_table[0], s->b_field_select_table[0],
+                                 s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1], 0);
+        s->me.current_mv_penalty= s->me.mv_penalty[s->b_code] + MAX_MV;
+        bimin= interlaced_search(s, 2,
+                                 s->b_field_mv_table[1], s->b_field_select_table[1],
+                                 s->b_back_mv_table[xy][0], s->b_back_mv_table[xy][1], 0);
     }else
         fimin= bimin= INT_MAX;
 
@@ -1813,11 +1909,11 @@ void ff_fix_long_p_mvs(MpegEncContext * s)
     
 //printf("%d no:%d %d//\n", clip, noclip, f_code);
     if(s->flags&CODEC_FLAG_4MV){
-        const int wrap= 2+ s->mb_width*2;
+        const int wrap= s->b8_stride;
 
         /* clip / convert to intra 8x8 type MVs */
         for(y=0; y<s->mb_height; y++){
-            int xy= (y*2 + 1)*wrap + 1;
+            int xy= y*2*wrap;
             int i= y*s->mb_stride;
             int x;
 
diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c
index 49c2e57b5..8ab6c7be4 100644
--- a/src/libffmpeg/libavcodec/motion_est_template.c
+++ b/src/libffmpeg/libavcodec/motion_est_template.c
@@ -22,58 +22,32 @@
  * @file motion_est_template.c
  * Motion estimation template.
  */
-//FIXME ref2_y next_pic?
+
 //lets hope gcc will remove the unused vars ...(gcc 3.2.2 seems to do it ...)
-//Note, the last line is there to kill these ugly unused var warnings
 #define LOAD_COMMON\
     uint32_t * const score_map= s->me.score_map;\
-    const int time_pp= s->pp_time;\
-    const int time_pb= s->pb_time;\
     const int xmin= s->me.xmin;\
     const int ymin= s->me.ymin;\
     const int xmax= s->me.xmax;\
     const int ymax= s->me.ymax;\
-    uint8_t * const src_y= src_data[0];\
-    uint8_t * const src_u= src_data[1];\
-    uint8_t * const src_v= src_data[2];\
-    uint8_t * const ref_y= ref_data[0];\
-    uint8_t * const ref_u= ref_data[1];\
-    uint8_t * const ref_v= ref_data[2];\
-    op_pixels_func (*hpel_put)[4];\
-    op_pixels_func (*hpel_avg)[4]= &s->dsp.avg_pixels_tab[size];\
-    op_pixels_func (*chroma_hpel_put)[4];\
-    qpel_mc_func (*qpel_put)[16];\
-    qpel_mc_func (*qpel_avg)[16]= &s->dsp.avg_qpel_pixels_tab[size];\
-    const __attribute__((unused)) int unu= time_pp + time_pb + (size_t)src_u + (size_t)src_v + (size_t)ref_u + (size_t)ref_v\
-                                           + (size_t)hpel_avg + (size_t)qpel_avg + (size_t)score_map\
-                                           + xmin + xmax + ymin + ymax;\
-    if(s->no_rounding /*FIXME b_type*/){\
-        hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];\
-        chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];\
-        qpel_put= &s->dsp.put_no_rnd_qpel_pixels_tab[size];\
-    }else{\
-        hpel_put=& s->dsp.put_pixels_tab[size];\
-        chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];\
-        qpel_put= &s->dsp.put_qpel_pixels_tab[size];\
-    }
+    uint8_t *mv_penalty= s->me.current_mv_penalty;\
+    const int pred_x= s->me.pred_x;\
+    const int pred_y= s->me.pred_y;\
 
-
-#ifdef CMP_HPEL
-    
 #define CHECK_HALF_MV(dx, dy, x, y)\
 {\
     const int hx= 2*(x)+(dx);\
     const int hy= 2*(y)+(dy);\
-    CMP_HPEL(d, dx, dy, x, y, size);\
+    d= cmp(s, x, y, dx, dy, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);\
     d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\
     COPY3_IF_LT(dmin, d, bx, hx, by, hy)\
 }
 
 #if 0
-static int RENAME(hpel_motion_search)(MpegEncContext * s,
+static int hpel_motion_search)(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-                                  int pred_x, int pred_y, uint8_t *ref_data[3], 
-                                  int size, uint8_t * const mv_penalty)
+                                  uint8_t *ref_data[3], 
+                                  int size)
 {
     const int xx = 16 * s->mb_x + 8*(n&1);
     const int yy = 16 * s->mb_y + 8*(n>>1);
@@ -94,8 +68,8 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
         hpel_put=& s->dsp.put_pixels_tab[size];
         chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];
     }
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
     cmp_sub= s->dsp.me_sub_cmp[size];
     chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
 
@@ -138,11 +112,10 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
 }
 
 #else
-static int RENAME(hpel_motion_search)(MpegEncContext * s,
+static int hpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-                                  int pred_x, int pred_y, uint8_t *src_data[3], 
-                                  uint8_t *ref_data[3], int stride, int uvstride,
-                                  int size, int h, uint8_t * const mv_penalty)
+                                  int src_index, int ref_index,
+                                  int size, int h)
 {
     const int mx = *mx_ptr;
     const int my = *my_ptr;   
@@ -151,6 +124,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
     int bx=2*mx, by=2*my;
 
     LOAD_COMMON
+    int flags= s->me.sub_flags;
     
  //FIXME factorize
 
@@ -164,7 +138,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
     }
         
     if(s->avctx->me_cmp != s->avctx->me_sub_cmp){
-        CMP_HPEL(dmin, 0, 0, mx, my, size);
+        dmin= cmp(s, mx, my, 0, 0, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
         if(mx || my || size>0)
             dmin += (mv_penalty[2*mx - pred_x] + mv_penalty[2*my - pred_y])*penalty_factor;
     }
@@ -246,14 +220,16 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s,
 }
 #endif
 
-static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3], 
-                                  uint8_t *ref_data[3], int stride, int uvstride,
-                                  uint8_t * const mv_penalty)
+static int inline get_mb_score(MpegEncContext * s, int mx, int my, int src_index,
+                               int ref_index)
 {
 //    const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp;
     const int size= 0;
     const int h= 16;
     const int penalty_factor= s->me.mb_penalty_factor;
+    const int flags= s->me.mb_flags;
+    const int qpel= flags & FLAG_QPEL;
+    const int mask= 1+2*qpel;
     me_cmp_func cmp_sub, chroma_cmp_sub;
     int d;
 
@@ -267,7 +243,7 @@ static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
     assert(!s->me.skip);
     assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp);
 
-    CMP_HPEL(d, mx&1, my&1, mx>>1, my>>1, size);
+    d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
     //FIXME check cbp before adding penalty for (0,0) vector
     if(mx || my || size>0)
         d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
@@ -275,26 +251,19 @@ static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
     return d;
 }
 
-#endif /* CMP_HPEL */
-
-
-
-#ifdef CMP_QPEL
-
 #define CHECK_QUARTER_MV(dx, dy, x, y)\
 {\
     const int hx= 4*(x)+(dx);\
     const int hy= 4*(y)+(dy);\
-    CMP_QPEL(d, dx, dy, x, y, size);\
+    d= cmp(s, x, y, dx, dy, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
     d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\
     COPY3_IF_LT(dmin, d, bx, hx, by, hy)\
 }
 
-static int RENAME(qpel_motion_search)(MpegEncContext * s,
+static int qpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-                                  int pred_x, int pred_y, uint8_t *src_data[3], 
-                                  uint8_t *ref_data[3], int stride, int uvstride,                                  
-                                  int size, int h, uint8_t * const mv_penalty)
+                                  int src_index, int ref_index,                                  
+                                  int size, int h)
 {
     const int mx = *mx_ptr;
     const int my = *my_ptr;   
@@ -302,13 +271,14 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
     const int map_generation= s->me.map_generation;
     const int subpel_quality= s->avctx->me_subpel_quality;
     uint32_t *map= s->me.map;
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     me_cmp_func cmp_sub, chroma_cmp_sub;
 
     LOAD_COMMON
+    int flags= s->me.sub_flags;
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1]; //factorize FIXME
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME
  //FIXME factorize
 
     cmp_sub= s->dsp.me_sub_cmp[size];
@@ -321,7 +291,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
     }
         
     if(s->avctx->me_cmp != s->avctx->me_sub_cmp){
-        CMP_QPEL(dmin, 0, 0, mx, my, size);
+        dmin= cmp(s, mx, my, 0, 0, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
         if(mx || my || size>0)
             dmin += (mv_penalty[4*mx - pred_x] + mv_penalty[4*my - pred_y])*penalty_factor;
     }
@@ -386,7 +356,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
             if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == (my<<ME_MAP_MV_BITS) + mx + map_generation && 0){ //FIXME
                 tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
             }else{
-                CMP(tl, mx-1, my-1, size); //FIXME wrong if chroma me is different
+                tl= cmp(s, mx-1, my-1, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);//FIXME wrong if chroma me is different
             }
             
             cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c; 
@@ -509,36 +479,6 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
     return dmin;
 }
 
-static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3], 
-                                  uint8_t *ref_data[3], int stride, int uvstride,
-                                  uint8_t * const mv_penalty)
-{
-    const int size= 0;
-    const int h= 16;
-    const int penalty_factor= s->me.mb_penalty_factor;
-    me_cmp_func cmp_sub, chroma_cmp_sub;
-    int d;
-
-    LOAD_COMMON
-    
- //FIXME factorize
-
-    cmp_sub= s->dsp.mb_cmp[size];
-    chroma_cmp_sub= s->dsp.mb_cmp[size+1];
-    
-    assert(!s->me.skip);
-    assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp);
-
-    CMP_QPEL(d, mx&3, my&3, mx>>2, my>>2, size);
-    //FIXME check cbp before adding penalty for (0,0) vector
-    if(mx || my || size>0)
-        d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
-        
-    return d;
-}
-
-
-#endif /* CMP_QPEL */
 
 #define CHECK_MV(x,y)\
 {\
@@ -546,7 +486,7 @@ static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
     const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
 /*printf("check_mv %d %d\n", x, y);*/\
     if(map[index]!=key){\
-        CMP(d, x, y, size);\
+        d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
         d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
@@ -570,7 +510,7 @@ static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pre
     const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
 /*printf("check_mv_dir %d %d %d\n", x, y, new_dir);*/\
     if(map[index]!=key){\
-        CMP(d, x, y, size);\
+        d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
         d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
@@ -590,27 +530,29 @@ if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x,
 if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
 if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
 
+#define LOAD_COMMON2 /* declares map, qpel and shift; `s` and `flags` must already be in scope where this expands */\
+    uint32_t *map= s->me.map; /* the map array that the CHECK_MV-style macros compare against `key` and update */\
+    const int qpel= flags&FLAG_QPEL; /* non-zero when the FLAG_QPEL bit is set in flags */\
+    const int shift= 1+qpel; /* left-shift applied to x/y before the mv_penalty lookup; assumes FLAG_QPEL==1 so this is 1 or 2 -- TODO confirm */\
 
-static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, int dmin,
-                                       uint8_t *src_data[3],
-                                       uint8_t *ref_data[3], int stride, int uvstride,
-                                       int const pred_x, int const pred_y, int const penalty_factor,
-                                       int const shift,
-                                       uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
-                                       )
+static always_inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
+                                       int src_index, int ref_index, int const penalty_factor,
+                                       int size, int h, int flags)
 {
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     int next_dir=-1;
     LOAD_COMMON
+    LOAD_COMMON2
+    int map_generation= s->me.map_generation;
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
 
     { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
         const int key= (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
         const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
         if(map[index]!=key){ //this will be executed only very rarey
-            CMP(score_map[index], best[0], best[1], size);
+            score_map[index]= cmp(s, best[0], best[1], 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
             map[index]= key;
         }
     }
@@ -634,20 +576,18 @@ static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, in
     }
 }
 
-static inline int RENAME(funny_diamond_search)(MpegEncContext * s, int *best, int dmin,
-                                       uint8_t *src_data[3],
-                                       uint8_t *ref_data[3], int stride, int uvstride,
-                                       int const pred_x, int const pred_y, int const penalty_factor,
-                                       int const shift,
-                                       uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
-                                       )
+static int funny_diamond_search(MpegEncContext * s, int *best, int dmin,
+                                       int src_index, int ref_index, int const penalty_factor,
+                                       int size, int h, int flags)
 {
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     int dia_size;
     LOAD_COMMON
+    LOAD_COMMON2
+    int map_generation= s->me.map_generation;
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
 
     for(dia_size=1; dia_size<=4; dia_size++){
         int dir;
@@ -702,7 +642,7 @@ if(256*256*256*64 % (stats[0]+1)==0){
     const int index= (((ay)<<ME_MAP_SHIFT) + (ax))&(ME_MAP_SIZE-1);\
 /*printf("sab check %d %d\n", ax, ay);*/\
     if(map[index]!=key){\
-        CMP(d, ax, ay, size);\
+        d= cmp(s, ax, ay, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
         map[index]= key;\
         score_map[index]= d;\
         d += (mv_penalty[((ax)<<shift)-pred_x] + mv_penalty[((ay)<<shift)-pred_y])*penalty_factor;\
@@ -726,22 +666,20 @@ if(256*256*256*64 % (stats[0]+1)==0){
 }
 
 #define MAX_SAB_SIZE 16
-static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int dmin,
-                                       uint8_t *src_data[3],
-                                       uint8_t *ref_data[3], int stride, int uvstride,
-                                       int const pred_x, int const pred_y, int const penalty_factor,
-                                       int const shift,
-                                       uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
-                                       )
+static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
+                                       int src_index, int ref_index, int const penalty_factor,
+                                       int size, int h, int flags)
 {
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     Minima minima[MAX_SAB_SIZE];
     const int minima_count= ABS(s->me.dia_size);
     int i, j;
     LOAD_COMMON
+    LOAD_COMMON2
+    int map_generation= s->me.map_generation;
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
     
     for(j=i=0; i<ME_MAP_SIZE; i++){
         uint32_t key= map[i];
@@ -807,20 +745,18 @@ static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int
     return dmin;    
 }
 
-static inline int RENAME(var_diamond_search)(MpegEncContext * s, int *best, int dmin,
-                                       uint8_t *src_data[3],
-                                       uint8_t *ref_data[3], int stride, int uvstride,
-                                       int const pred_x, int const pred_y, int const penalty_factor,
-                                       int const shift,
-                                       uint32_t *map, int map_generation, int size, int h, uint8_t * const mv_penalty
-                                       )
+static int var_diamond_search(MpegEncContext * s, int *best, int dmin,
+                                       int src_index, int ref_index, int const penalty_factor,
+                                       int size, int h, int flags)
 {
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     int dia_size;
     LOAD_COMMON
+    LOAD_COMMON2
+    int map_generation= s->me.map_generation;
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
 
     for(dia_size=1; dia_size<=s->me.dia_size; dia_size++){
         int dir, start, end;
@@ -885,31 +821,42 @@ if(256*256*256*64 % (stats[0]+1)==0){
     return dmin;    
 }
 
-static int RENAME(epzs_motion_search)(MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3], 
-                             uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2], 
-                             int ref_mv_scale, uint8_t * const mv_penalty)
+static always_inline int diamond_search(MpegEncContext * s, int *best, int dmin, // dispatches to one of four diamond search variants chosen by s->me.dia_size
+                                       int src_index, int ref_index, int const penalty_factor, // every argument is forwarded unchanged to the chosen variant
+                                       int size, int h, int flags){ // returns whatever the chosen variant returns
+    if(s->me.dia_size==-1) // exactly -1: "funny" diamond search
+        return funny_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+    else if(s->me.dia_size<-1) // below -1: SAB search, which uses ABS(s->me.dia_size) as its minima count
+        return   sab_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+    else if(s->me.dia_size<2) // remaining values below 2 (0 or 1): small diamond search
+        return small_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+    else // 2 and above: variable-size search, which iterates dia_size from 1 up to s->me.dia_size
+        return   var_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
+}
+
+static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx_ptr, int *my_ptr,
+                             int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, int flags)
 {
     int best[2]={0, 0};
-    int d, dmin; 
-    const int shift= 1+s->quarter_sample;
-    uint32_t *map= s->me.map;
+    int d, dmin;
     int map_generation;
     const int penalty_factor= s->me.penalty_factor;
     const int size=0;
     const int h=16;
     const int ref_mv_stride= s->mb_stride; //pass as arg  FIXME
     const int ref_mv_xy= s->mb_x + s->mb_y*ref_mv_stride; //add to last_mv beforepassing FIXME
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
+    
     LOAD_COMMON
+    LOAD_COMMON2
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
     
     map_generation= update_map_generation(s);
 
-    CMP(dmin, 0, 0, size);
+    dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
     map[0]= map_generation;
     score_map[0]= dmin;
 
@@ -974,22 +921,7 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s,
     }
 
 //check(best[0],best[1],0, b0)
-    if(s->me.dia_size==-1)
-        dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else if(s->me.dia_size<-1)
-        dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else if(s->me.dia_size<2)
-        dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else
-        dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
+    dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
 
 //check(best[0],best[1],0, b1)
     *mx_ptr= best[0];
@@ -999,29 +931,42 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s,
     return dmin;
 }
 
-#ifndef CMP_DIRECT /* no 4mv search needed in direct mode */
-static int RENAME(epzs_motion_search4)(MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y,
-                             uint8_t *src_data[3], 
-                             uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2], 
-                             int ref_mv_scale, uint8_t * const mv_penalty)
+//wrapper that hands s->me.flags to epzs_motion_search_internal as an explicit argument; original note: "dedicated to the braindamaged gcc" (presumably lets the always_inline callee be specialised for the literal 0 -- confirm)
+static inline int epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr,
+                             int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], 
+                             int ref_mv_scale)
+{
+//FIXME convert other functions in the same way if faster
+    switch(s->me.flags){
+    case 0: // flags == 0: pass the literal 0 rather than the runtime value
+        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, 0);
+//    case FLAG_QPEL:
+//        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, FLAG_QPEL);
+    default: // any other value (the FLAG_QPEL case above is disabled): forward s->me.flags as-is
+        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, s->me.flags);
+    }
+}
+
+static int epzs_motion_search4(MpegEncContext * s,
+                             int *mx_ptr, int *my_ptr, int P[10][2],
+                             int src_index, int ref_index, int16_t (*last_mv)[2], 
+                             int ref_mv_scale)
 {
     int best[2]={0, 0};
     int d, dmin; 
-    const int shift= 1+s->quarter_sample;
-    uint32_t *map= s->me.map;
     int map_generation;
     const int penalty_factor= s->me.penalty_factor;
     const int size=1;
     const int h=8;
     const int ref_mv_stride= s->mb_stride;
     const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride;
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     LOAD_COMMON
+    int flags= s->me.flags;
+    LOAD_COMMON2
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
 
     map_generation= update_map_generation(s);
 
@@ -1053,23 +998,7 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s,
                             (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
     }
 
-    if(s->me.dia_size==-1)
-        dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else if(s->me.dia_size<-1)
-        dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else if(s->me.dia_size<2)
-        dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else
-        dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-
+    dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
 
     *mx_ptr= best[0];
     *my_ptr= best[1];    
@@ -1079,28 +1008,26 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s,
 }
 
 //try to merge with above FIXME (needs PSNR test)
-static int RENAME(epzs_motion_search2)(MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y,
-                             uint8_t *src_data[3], 
-                             uint8_t *ref_data[3], int stride, int uvstride, int16_t (*last_mv)[2], 
-                             int ref_mv_scale, uint8_t * const mv_penalty)
+static int epzs_motion_search2(MpegEncContext * s,
+                             int *mx_ptr, int *my_ptr, int P[10][2],
+                             int src_index, int ref_index, int16_t (*last_mv)[2], 
+                             int ref_mv_scale)
 {
     int best[2]={0, 0};
     int d, dmin; 
-    const int shift= 1+s->quarter_sample;
-    uint32_t *map= s->me.map;
     int map_generation;
     const int penalty_factor= s->me.penalty_factor;
     const int size=0; //FIXME pass as arg
     const int h=8;
     const int ref_mv_stride= s->mb_stride;
     const int ref_mv_xy= s->mb_x + s->mb_y *ref_mv_stride;
-    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmpf, chroma_cmpf;
     LOAD_COMMON
+    int flags= s->me.flags;
+    LOAD_COMMON2
     
-    cmp= s->dsp.me_cmp[size];
-    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmpf= s->dsp.me_cmp[size];
+    chroma_cmpf= s->dsp.me_cmp[size+1];
 
     map_generation= update_map_generation(s);
 
@@ -1132,23 +1059,7 @@ static int RENAME(epzs_motion_search2)(MpegEncContext * s,
                             (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
     }
 
-    if(s->me.dia_size==-1)
-        dmin= RENAME(funny_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else if(s->me.dia_size<-1)
-        dmin= RENAME(sab_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else if(s->me.dia_size<2)
-        dmin= RENAME(small_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-    else
-        dmin= RENAME(var_diamond_search)(s, best, dmin, src_data, ref_data, stride, uvstride,
-                                   pred_x, pred_y, penalty_factor, 
-				   shift, map, map_generation, size, h, mv_penalty);
-
+    dmin= diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
 
     *mx_ptr= best[0];
     *my_ptr= best[1];    
@@ -1156,4 +1067,3 @@ static int RENAME(epzs_motion_search2)(MpegEncContext * s,
 //    printf("%d %d %d \n", best[0], best[1], dmin);
     return dmin;
 }
-#endif /* !CMP_DIRECT */
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index e39356c9d..493d1a445 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -249,7 +249,7 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
 {
         unsigned int vbv_buffer_size;
         unsigned int fps, v;
-        int n, i;
+        int i;
         uint64_t time_code;
         float best_aspect_error= 1E10;
         float aspect_ratio= av_q2d(s->avctx->sample_aspect_ratio);
@@ -365,8 +365,14 @@ static inline void encode_mb_skip_run(MpegEncContext *s, int run){
 
 static void common_init(MpegEncContext *s)
 {
+int i; // loop index for the permutation reset below
+
     s->y_dc_scale_table=
     s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
+
+    if(!s->encoding)    // only when s->encoding is zero: the loop below is the body of this if
+    for(i=0;i<64;i++) // reset all 64 entries of the IDCT permutation table
+       s->dsp.idct_permutation[i]=i; // identity mapping; presumably the baseline that mpeg_decode_postinit later saves as old_permutation -- confirm
 }
 
 void ff_mpeg1_clean_buffers(MpegEncContext *s){
@@ -500,8 +506,9 @@ void mpeg1_encode_mb(MpegEncContext *s,
             cbp |= 1 << (5 - i);
     }
     
-    if (cbp == 0 && !first_mb && (mb_x != s->mb_width - 1 || (mb_y != s->mb_height - 1 && s->codec_id == CODEC_ID_MPEG1VIDEO)) && 
-        ((s->pict_type == P_TYPE && s->mv_type == MV_TYPE_16X16 && (motion_x | motion_y) == 0) ||
+    if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 &&
+        (mb_x != s->mb_width - 1 || (mb_y != s->mb_height - 1 && s->codec_id == CODEC_ID_MPEG1VIDEO)) && 
+        ((s->pict_type == P_TYPE && (motion_x | motion_y) == 0) ||
         (s->pict_type == B_TYPE && s->mv_dir == s->last_mv_dir && (((s->mv_dir & MV_DIR_FORWARD) ? ((s->mv[0][0][0] - s->last_mv[0][0][0])|(s->mv[0][0][1] - s->last_mv[0][0][1])) : 0) |
         ((s->mv_dir & MV_DIR_BACKWARD) ? ((s->mv[1][0][0] - s->last_mv[1][0][0])|(s->mv[1][0][1] - s->last_mv[1][0][1])) : 0)) == 0))) {
         s->mb_skip_run++;
@@ -798,7 +805,7 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
                 else{
                     int val, bit_size, range, code;
 
-                    bit_size = s->f_code - 1;
+                    bit_size = f_code - 1;
                     range = 1 << bit_size;
 
                     val=mv;
@@ -955,7 +962,7 @@ static VLC mb_ptype_vlc;
 static VLC mb_btype_vlc;
 static VLC mb_pat_vlc;
 
-static void init_vlcs()
+static void init_vlcs(void)
 {
     static int done = 0;
 
@@ -1754,11 +1761,17 @@ typedef struct Mpeg1Context {
     int repeat_field; /* true if we must repeat the field */
     AVPanScan pan_scan; /** some temporary storage for the panscan */
     int slice_count;
+    int swap_uv;//indicate VCR2
+    int save_aspect_info;
+
 } Mpeg1Context;
 
 static int mpeg_decode_init(AVCodecContext *avctx)
 {
     Mpeg1Context *s = avctx->priv_data;
+    MpegEncContext *s2 = &s->mpeg_enc_ctx;
+    
+    MPV_decode_defaults(s2);
     
     s->mpeg_enc_ctx.avctx= avctx;
     s->mpeg_enc_ctx.flags= avctx->flags;
@@ -1773,6 +1786,122 @@ static int mpeg_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm, // reorders a 64-entry matrix in place from the old_perm layout to the new_perm layout
+                                     const uint8_t *new_perm){ // old_perm/new_perm: 64-entry index tables; nothing is returned
+uint16_t temp_matrix[64]; // snapshot of the matrix as it was on entry
+int i;
+
+    memcpy(temp_matrix,matrix,64*sizeof(uint16_t)); // copy first, because the loop overwrites matrix while old positions are still needed
+    
+    for(i=0;i<64;i++){
+        matrix[new_perm[i]] = temp_matrix[old_perm[i]]; // the value stored at old_perm[i] moves to position new_perm[i]
+    }      
+}
+
+//Call this function when we know all parameters; it (re)initializes the context when it is not yet allocated or avctx width/height differ from s
+//it may be called in different places for mpeg1 and mpeg2. Returns 0 on success or when nothing had to be done, -2 on zero width/height or MPV_common_init failure
+static int mpeg_decode_postinit(AVCodecContext *avctx){
+Mpeg1Context *s1 = avctx->priv_data;
+MpegEncContext *s = &s1->mpeg_enc_ctx;
+uint8_t old_permutation[64]; // copy of s->dsp.idct_permutation taken before MPV_common_init
+
+
+    if (
+    	(s1->mpeg_enc_ctx_allocated == 0)|| 
+        avctx->width  != s->width ||
+        avctx->height != s->height||
+//      s1->save_aspect_info != avctx->aspect_ratio_info||
+        0) // trailing 0 is a no-op operand closing the || chain
+    {
+    
+        if (s1->mpeg_enc_ctx_allocated) {
+            MPV_common_end(s); // NOTE(review): mpeg_enc_ctx_allocated stays 1 if we return -2 below, so the next call runs MPV_common_end again -- confirm that is safe
+        }
+
+	if( (s->width == 0 )||(s->height == 0))
+	    return -2; // no usable picture size yet
+
+        avctx->width = s->width; // publish the values held in s to the codec context
+        avctx->height = s->height;
+        avctx->bit_rate = s->bit_rate;
+        s1->save_aspect_info = s->aspect_ratio_info;
+
+     //low_delay may be forced, in this case we will have B frames
+     //that behave like P frames
+        avctx->has_b_frames = !(s->low_delay);
+
+        if(avctx->sub_id==1){//s->codec_id==avctx->codec_id==CODEC_ID
+            //mpeg1 fps
+            avctx->frame_rate     = frame_rate_tab[s->frame_rate_index].num; // taken straight from the frame rate table
+            avctx->frame_rate_base= frame_rate_tab[s->frame_rate_index].den;
+            //mpeg1 aspect
+            avctx->sample_aspect_ratio= av_d2q( // reciprocal of the mpeg1_aspect table entry, converted to a rational by av_d2q
+                    1.0/mpeg1_aspect[s->aspect_ratio_info], 255);
+
+        }else{//mpeg2
+        //mpeg2 fps
+            av_reduce( // table rate scaled by (frame_rate_ext_n+1)/(frame_rate_ext_d+1), then reduced
+                &s->avctx->frame_rate, 
+                &s->avctx->frame_rate_base, 
+                frame_rate_tab[s->frame_rate_index].num * (s->frame_rate_ext_n+1),
+                frame_rate_tab[s->frame_rate_index].den * (s->frame_rate_ext_d+1),
+                1<<30);
+        //mpeg2 aspect
+            if(s->aspect_ratio_info > 1){
+                if( (s1->pan_scan.width == 0 )||(s1->pan_scan.height == 0) ){ // no pan-scan size stored: divide by the coded width/height
+                    s->avctx->sample_aspect_ratio= 
+                        av_div_q(
+                         mpeg2_aspect[s->aspect_ratio_info], 
+                         (AVRational){s->width, s->height}
+                         );
+                }else{ // pan-scan size is non-zero: divide by it instead
+                    s->avctx->sample_aspect_ratio= 
+                        av_div_q(
+                         mpeg2_aspect[s->aspect_ratio_info], 
+                         (AVRational){s1->pan_scan.width, s1->pan_scan.height}
+                        );
+        	}
+            }else{ // aspect_ratio_info <= 1: use the table entry unchanged
+                s->avctx->sample_aspect_ratio= 
+                    mpeg2_aspect[s->aspect_ratio_info];
+            }
+        }//mpeg2
+
+        if(avctx->xvmc_acceleration){ // pix_fmt becomes whatever avctx->get_format returns for the candidate list passed in
+            avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
+        }else{
+            if(s->chroma_format <  2){
+                avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_420);
+            }else
+            if(s->chroma_format == 2){
+                avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_422);
+            }else
+            if(s->chroma_format >  2){
+                avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_444);
+            }
+        }
+        //until then pix_fmt may be changed right after codec init
+        if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT )
+            if( avctx->idct_algo == FF_IDCT_AUTO )
+                avctx->idct_algo = FF_IDCT_SIMPLE; // replaces only the AUTO setting; other idct_algo values are left alone
+
+        //quantization matrices may need reordering
+        //if dct permutation is changed
+        memcpy(old_permutation,s->dsp.idct_permutation,64*sizeof(uint8_t)); // save the permutation in effect before MPV_common_init
+
+        if (MPV_common_init(s) < 0)
+            return -2;
+
+        quant_matrix_rebuild(s->intra_matrix,       old_permutation,s->dsp.idct_permutation); // remap each matrix from the saved permutation to the current one
+        quant_matrix_rebuild(s->inter_matrix,       old_permutation,s->dsp.idct_permutation);
+        quant_matrix_rebuild(s->chroma_intra_matrix,old_permutation,s->dsp.idct_permutation);
+        quant_matrix_rebuild(s->chroma_inter_matrix,old_permutation,s->dsp.idct_permutation);
+
+        s1->mpeg_enc_ctx_allocated = 1; // set only after everything above succeeded
+    }
+    return 0;
+}
+
 /* return the 8 bit start code value and update the search
    state. Return -1 if no start code found */
 static int find_start_code(const uint8_t **pbuf_ptr, const uint8_t *buf_end)
@@ -1807,6 +1936,9 @@ static int mpeg1_decode_picture(AVCodecContext *avctx,
     MpegEncContext *s = &s1->mpeg_enc_ctx;
     int ref, f_code, vbv_delay;
 
+    if(mpeg_decode_postinit(s->avctx) < 0) 
+       return -2;
+
     init_get_bits(&s->gb, buf, buf_size*8);
 
     ref = get_bits(&s->gb, 10); /* temporal ref */
@@ -1845,7 +1977,6 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
 {
     int horiz_size_ext, vert_size_ext;
     int bit_rate_ext;
-    int frame_rate_ext_n, frame_rate_ext_d;
     int level, profile;
 
     skip_bits(&s->gb, 1); /* profil and level esc*/
@@ -1865,32 +1996,17 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
     s->low_delay = get_bits1(&s->gb);
     if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1;
 
-    frame_rate_ext_n = get_bits(&s->gb, 2);
-    frame_rate_ext_d = get_bits(&s->gb, 5);
-    av_reduce(
-        &s->avctx->frame_rate, 
-        &s->avctx->frame_rate_base, 
-        frame_rate_tab[s->frame_rate_index].num * (frame_rate_ext_n+1),
-        frame_rate_tab[s->frame_rate_index].den * (frame_rate_ext_d+1),
-        1<<30);
+    s->frame_rate_ext_n = get_bits(&s->gb, 2);
+    s->frame_rate_ext_d = get_bits(&s->gb, 5);
 
     dprintf("sequence extension\n");
     s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG2VIDEO;
     s->avctx->sub_id = 2; /* indicates mpeg2 found */
 
-    if(s->aspect_ratio_info <= 1)
-        s->avctx->sample_aspect_ratio= mpeg2_aspect[s->aspect_ratio_info];
-    else{
-        s->avctx->sample_aspect_ratio= 
-            av_div_q(
-                mpeg2_aspect[s->aspect_ratio_info], 
-                (AVRational){s->width, s->height}
-            );
-    }
-    
     if(s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n", 
                profile, level, s->avctx->rc_buffer_size, s->bit_rate);
+
 }
 
 static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1)
@@ -1912,14 +2028,7 @@ static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1)
     
     s1->pan_scan.width= 16*w;
     s1->pan_scan.height=16*h;
-
-    if(s->aspect_ratio_info > 1)
-        s->avctx->sample_aspect_ratio= 
-            av_div_q(
-                mpeg2_aspect[s->aspect_ratio_info], 
-                (AVRational){w, h}
-            );
-    
+        
     if(s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG, "sde w:%d, h:%d\n", w, h);
 }
@@ -1927,9 +2036,23 @@ static void mpeg_decode_sequence_display_extension(Mpeg1Context *s1)
 static void mpeg_decode_picture_display_extension(Mpeg1Context *s1)
 {
     MpegEncContext *s= &s1->mpeg_enc_ctx;
-    int i;
-
-    for(i=0; i<1; i++){ //FIXME count
+    int i,nofco;
+
+    nofco = 1;
+    if(s->progressive_sequence){
+        if(s->repeat_first_field){
+	    nofco++;
+	    if(s->top_field_first)
+	        nofco++;	
+	}
+    }else{
+        if(s->picture_structure == PICT_FRAME){
+            nofco++;
+	    if(s->repeat_first_field)
+	        nofco++;
+	}
+    }
+    for(i=0; i<nofco; i++){
         s1->pan_scan.position[i][0]= get_sbits(&s->gb, 16);
         skip_bits(&s->gb, 1); //marker
         s1->pan_scan.position[i][1]= get_sbits(&s->gb, 16);
@@ -2134,8 +2257,8 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
     s->resync_mb_x=
     s->resync_mb_y= -1;
 
-    if (mb_y >= s->mb_height){
-        av_log(s->avctx, AV_LOG_ERROR, "slice below image (%d >= %d)\n", s->mb_y, s->mb_height);
+    if (mb_y<<field_pic >= s->mb_height){
+        av_log(s->avctx, AV_LOG_ERROR, "slice below image (%d >= %d)\n", mb_y, s->mb_height);
         return -1;
     }
     
@@ -2208,8 +2331,8 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
             return -1;
 
         if(s->current_picture.motion_val[0] && !s->encoding){ //note motion_val is normally NULL unless we want to extract the MVs
-            const int wrap = field_pic ? 2*s->block_wrap[0] : s->block_wrap[0];
-            int xy = s->mb_x*2 + 1 + (s->mb_y*2 +1)*wrap;
+            const int wrap = field_pic ? 2*s->b8_stride : s->b8_stride;
+            int xy = s->mb_x*2 + s->mb_y*2*wrap;
             int motion_x, motion_y, dir, i;
             if(field_pic && !s->first_field)
                 xy += wrap/2;
@@ -2218,18 +2341,20 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
                 for(dir=0; dir<2; dir++){
                     if (s->mb_intra || (dir==1 && s->pict_type != B_TYPE)) {
                         motion_x = motion_y = 0;
-                    }else if (s->mv_type == MV_TYPE_16X16){
+                    }else if (s->mv_type == MV_TYPE_16X16 || (s->mv_type == MV_TYPE_FIELD && field_pic)){
                         motion_x = s->mv[dir][0][0];
                         motion_y = s->mv[dir][0][1];
                     } else /*if ((s->mv_type == MV_TYPE_FIELD) || (s->mv_type == MV_TYPE_16X8))*/ {
                         motion_x = s->mv[dir][i][0];
                         motion_y = s->mv[dir][i][1];
                     }
-                    
+
                     s->current_picture.motion_val[dir][xy    ][0] = motion_x;
                     s->current_picture.motion_val[dir][xy    ][1] = motion_y;
                     s->current_picture.motion_val[dir][xy + 1][0] = motion_x;
                     s->current_picture.motion_val[dir][xy + 1][1] = motion_y;
+                    s->current_picture.ref_index [dir][xy    ]=
+                    s->current_picture.ref_index [dir][xy + 1]= s->field_select[dir][i];
                 }
                 xy += wrap;
             }
@@ -2379,59 +2504,27 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
 {
     Mpeg1Context *s1 = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
-    int width, height, i, v, j;
-    float aspect;
+    int width,height;
+    int i, v, j;
 
     init_get_bits(&s->gb, buf, buf_size*8);
 
     width = get_bits(&s->gb, 12);
     height = get_bits(&s->gb, 12);
+    if (width <= 0 || height <= 0 ||
+        (width % 2) != 0 || (height % 2) != 0)
+        return -1;
     s->aspect_ratio_info= get_bits(&s->gb, 4);
     if (s->aspect_ratio_info == 0)
         return -1;
-    aspect= 1.0/mpeg1_aspect[s->aspect_ratio_info];
-    avctx->sample_aspect_ratio= av_d2q(aspect, 255);
-
     s->frame_rate_index = get_bits(&s->gb, 4);
     if (s->frame_rate_index == 0 || s->frame_rate_index > 13)
         return -1;
     s->bit_rate = get_bits(&s->gb, 18) * 400;
     if (get_bits1(&s->gb) == 0) /* marker */
         return -1;
-    if (width <= 0 || height <= 0 ||
-        (width % 2) != 0 || (height % 2) != 0)
-        return -1;
-    if (width != s->width ||
-        height != s->height) {
-        /* start new mpeg1 context decoding */
-        s->out_format = FMT_MPEG1;
-        if (s1->mpeg_enc_ctx_allocated) {
-            MPV_common_end(s);
-        }
-        s->width = width;
-        s->height = height;
-        avctx->has_b_frames= 1;
-        avctx->width = width;
-        avctx->height = height;
-        avctx->frame_rate     = frame_rate_tab[s->frame_rate_index].num;
-        avctx->frame_rate_base= frame_rate_tab[s->frame_rate_index].den;
-        avctx->bit_rate = s->bit_rate;
-        
-        if(avctx->xvmc_acceleration){
-            avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
-        }else{
-            avctx->pix_fmt = avctx->get_format(avctx,pixfmt_yuv_420);
-        }
-	
-        if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT )
-            if( avctx->idct_algo == FF_IDCT_AUTO )
-                avctx->idct_algo = FF_IDCT_SIMPLE;
-
-        if (MPV_common_init(s) < 0)
-            return -1;
-        s1->mpeg_enc_ctx_allocated = 1;
-        s->swap_uv = 0;//just in case vcr2 and mpeg2 stream have been concatinated
-    }
+    s->width = width;
+    s->height = height;
 
     s->avctx->rc_buffer_size= get_bits(&s->gb, 10) * 1024*16;
     skip_bits(&s->gb, 1);
@@ -2444,19 +2537,21 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
                 av_log(s->avctx, AV_LOG_ERROR, "intra matrix damaged\n");
                 return -1;
             }
-            j = s->intra_scantable.permutated[i];
+            j = s->dsp.idct_permutation[ ff_zigzag_direct[i] ];
             s->intra_matrix[j] = v;
             s->chroma_intra_matrix[j] = v;
         }
 #ifdef DEBUG
+/*
         dprintf("intra matrix present\n");
         for(i=0;i<64;i++)
-            dprintf(" %d", s->intra_matrix[s->intra_scantable.permutated[i]]);
+            dprintf(" %d", s->intra_matrix[s->dsp.idct_permutation[i]);
         printf("\n");
+*/
 #endif
     } else {
         for(i=0;i<64;i++) {
-            int j= s->dsp.idct_permutation[i];
+            j = s->dsp.idct_permutation[i];
             v = ff_mpeg1_default_intra_matrix[i];
             s->intra_matrix[j] = v;
             s->chroma_intra_matrix[j] = v;
@@ -2469,15 +2564,17 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
                 av_log(s->avctx, AV_LOG_ERROR, "inter matrix damaged\n");
                 return -1;
             }
-            j = s->intra_scantable.permutated[i];
+            j = s->dsp.idct_permutation[ ff_zigzag_direct[i] ];
             s->inter_matrix[j] = v;
             s->chroma_inter_matrix[j] = v;
         }
 #ifdef DEBUG
+/*
         dprintf("non intra matrix present\n");
         for(i=0;i<64;i++)
-            dprintf(" %d", s->inter_matrix[s->intra_scantable.permutated[i]]);
+            dprintf(" %d", s->inter_matrix[s->dsp.idct_permutation[i]);
         printf("\n");
+*/
 #endif
     } else {
         for(i=0;i<64;i++) {
@@ -2501,6 +2598,8 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     s->chroma_format = 1;
     s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG1VIDEO;
     avctx->sub_id = 1; /* indicates mpeg1 */
+    s->out_format = FMT_MPEG1;
+    s->swap_uv = 0;//AFAIK VCR2 don't have SEQ_HEADER
     if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1;
     
     if(s->avctx->debug & FF_DEBUG_PICT_INFO)
@@ -2593,6 +2692,36 @@ static void mpeg_decode_user_data(AVCodecContext *avctx,
     }
 }
 
+static void mpeg_decode_gop(AVCodecContext *avctx, 
+                            const uint8_t *buf, int buf_size){
+    Mpeg1Context *s1 = avctx->priv_data;
+    MpegEncContext *s = &s1->mpeg_enc_ctx;
+
+    int drop_frame_flag;
+    int time_code_hours, time_code_minutes;
+    int time_code_seconds, time_code_pictures;
+    int broken_link;
+
+    init_get_bits(&s->gb, buf, buf_size*8);
+
+    drop_frame_flag = get_bits1(&s->gb);
+    
+    time_code_hours=get_bits(&s->gb,5);
+    time_code_minutes = get_bits(&s->gb,6);
+    skip_bits1(&s->gb);//marker bit
+    time_code_seconds = get_bits(&s->gb,6);
+    time_code_pictures = get_bits(&s->gb,6);
+
+    /*broken_link indicate that after editing the
+      reference frames of the first B-Frames after GOP I-Frame
+      are missing (open gop)*/
+    broken_link = get_bits1(&s->gb);
+
+    if(s->avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(s->avctx, AV_LOG_DEBUG, "GOP (%2d:%02d:%02d.[%02d]) broken_link=%d\n",
+	    time_code_hours, time_code_minutes, time_code_seconds,
+	    time_code_pictures, broken_link);
+}
 /**
  * finds the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
@@ -2706,7 +2835,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
         input_size = buf_end - buf_ptr;
 
         if(avctx->debug & FF_DEBUG_STARTCODE){
-            av_log(avctx, AV_LOG_DEBUG, "%3X at %d left %d\n", start_code, buf_ptr-buf, input_size);
+            av_log(avctx, AV_LOG_DEBUG, "%3X at %zd left %d\n", start_code, buf_ptr-buf, input_size);
         }
 
                 /* prepare data for next start code */
@@ -2731,6 +2860,8 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
                     break;
                 case GOP_START_CODE:
                     s2->first_field=0;
+                    mpeg_decode_gop(avctx, 
+                                          buf_ptr, input_size);
                     break;
                 default:
                     if (start_code >= SLICE_MIN_START_CODE &&
diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c
index d01405f54..a9eed4e36 100644
--- a/src/libffmpeg/libavcodec/mpegaudiodec.c
+++ b/src/libffmpeg/libavcodec/mpegaudiodec.c
@@ -23,7 +23,6 @@
  */ 
 
 //#define DEBUG
-#include <math.h>
 #include "avcodec.h"
 #include "mpegaudio.h"
 #include "dsputil.h"
@@ -401,11 +400,11 @@ static int decode_init(AVCodecContext * avctx)
         }
 
 	/* compute n ^ (4/3) and store it in mantissa/exp format */
-	if (!av_mallocz_static(&table_4_3_exp,
-			       TABLE_4_3_SIZE * sizeof(table_4_3_exp[0])))
+	table_4_3_exp= av_mallocz_static(TABLE_4_3_SIZE * sizeof(table_4_3_exp[0]));
+        if(!table_4_3_exp)
 	    return -1;
-	if (!av_mallocz_static(&table_4_3_value,
-			       TABLE_4_3_SIZE * sizeof(table_4_3_value[0])))
+	table_4_3_value= av_mallocz_static(TABLE_4_3_SIZE * sizeof(table_4_3_value[0]));
+        if(!table_4_3_value)
             return -1;
         
         int_pow_init();
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 32a92917c..bef088a41 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -283,7 +283,9 @@ static void copy_picture(Picture *dst, Picture *src){
     dst->type= FF_BUFFER_TYPE_COPY;
 }
 
-static void copy_picture_attributes(AVFrame *dst, AVFrame *src){
+static void copy_picture_attributes(MpegEncContext *s, AVFrame *dst, AVFrame *src){
+    int i;
+
     dst->pict_type              = src->pict_type;
     dst->quality                = src->quality;
     dst->coded_picture_number   = src->coded_picture_number;
@@ -292,6 +294,32 @@ static void copy_picture_attributes(AVFrame *dst, AVFrame *src){
     dst->pts                    = src->pts;
     dst->interlaced_frame       = src->interlaced_frame;
     dst->top_field_first        = src->top_field_first;
+
+    if(s->avctx->me_threshold){
+        if(!src->motion_val[0])
+            av_log(s->avctx, AV_LOG_ERROR, "AVFrame.motion_val not set!\n");
+        if(!src->mb_type)
+            av_log(s->avctx, AV_LOG_ERROR, "AVFrame.mb_type not set!\n");
+        if(!src->ref_index[0])
+            av_log(s->avctx, AV_LOG_ERROR, "AVFrame.ref_index not set!\n");
+        if(src->motion_subsample_log2 != dst->motion_subsample_log2)
+            av_log(s->avctx, AV_LOG_ERROR, "AVFrame.motion_subsample_log2 doesnt match! (%d!=%d)\n",
+            src->motion_subsample_log2, dst->motion_subsample_log2);
+
+        memcpy(dst->mb_type, src->mb_type, s->mb_stride * s->mb_height * sizeof(dst->mb_type[0]));
+        
+        for(i=0; i<2; i++){
+            int stride= ((16*s->mb_width )>>src->motion_subsample_log2) + 1;
+            int height= ((16*s->mb_height)>>src->motion_subsample_log2);
+
+            if(src->motion_val[i] && src->motion_val[i] != dst->motion_val[i]){
+                memcpy(dst->motion_val[i], src->motion_val[i], 2*stride*height*sizeof(int16_t));
+            }
+            if(src->ref_index[i] && src->ref_index[i] != dst->ref_index[i]){
+                memcpy(dst->ref_index[i], src->ref_index[i], s->b8_stride*2*s->mb_height*sizeof(int8_t));
+            }
+        }
+    }
 }
 
 /**
@@ -350,13 +378,14 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){
             for(i=0; i<2; i++){
                 CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b4_array_size+2)  * sizeof(int16_t))
                 pic->motion_val[i]= pic->motion_val_base[i]+2;
-                CHECKED_ALLOCZ(pic->ref_index[i] , b8_array_size * sizeof(uint8_t))
+                CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t))
             }
             pic->motion_subsample_log2= 2;
         }else if(s->out_format == FMT_H263 || s->encoding || (s->avctx->debug&FF_DEBUG_MV) || (s->avctx->debug_mv)){
             for(i=0; i<2; i++){
-                CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t)*2) //FIXME
+                CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t))
                 pic->motion_val[i]= pic->motion_val_base[i]+2;
+                CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t))
             }
             pic->motion_subsample_log2= 3;
         }
@@ -510,7 +539,68 @@ static void update_duplicate_context_after_me(MpegEncContext *dst, MpegEncContex
 #undef COPY
 }
 
-/* init common structure for both encoder and decoder */
+/**
+ * sets the given MpegEncContext to common defaults (same for encoding and decoding).
+ * the changed fields will not depend upon the prior state of the MpegEncContext.
+ */
+static void MPV_common_defaults(MpegEncContext *s){
+    s->y_dc_scale_table=
+    s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
+    s->chroma_qscale_table= ff_default_chroma_qscale_table;
+    s->progressive_frame= 1;
+    s->progressive_sequence= 1;
+    s->picture_structure= PICT_FRAME;
+
+    s->coded_picture_number = 0;
+    s->picture_number = 0;
+    s->input_picture_number = 0;
+
+    s->picture_in_gop_number = 0;
+
+    s->f_code = 1;
+    s->b_code = 1;
+}
+
+/**
+ * sets the given MpegEncContext to defaults for decoding.
+ * the changed fields will not depend upon the prior state of the MpegEncContext.
+ */
+void MPV_decode_defaults(MpegEncContext *s){
+    MPV_common_defaults(s);
+}
+
+/**
+ * sets the given MpegEncContext to defaults for encoding.
+ * the changed fields will not depend upon the prior state of the MpegEncContext.
+ */
+
+#ifdef CONFIG_ENCODERS
+void MPV_encode_defaults(MpegEncContext *s){
+    static int done=0;
+    
+    MPV_common_defaults(s);
+    
+    if(!done){
+        int i;
+        done=1;
+
+        default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) );
+        memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1));
+        memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1));
+
+        for(i=-16; i<16; i++){
+            default_fcode_tab[i + MAX_MV]= 1;
+        }
+    }
+    s->me.mv_penalty= default_mv_penalty;
+    s->fcode_tab= default_fcode_tab;
+}
+#endif //CONFIG_ENCODERS
+
+/** 
+ * init common structure for both encoder and decoder.
+ * this assumes that some variables like width/height are already set
+ */
 int MPV_common_init(MpegEncContext *s)
 {
     int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y;
@@ -538,31 +628,14 @@ int MPV_common_init(MpegEncContext *s)
     s->block_wrap[0]=
     s->block_wrap[1]=
     s->block_wrap[2]=
-    s->block_wrap[3]= s->mb_width*2 + 2;
+    s->block_wrap[3]= s->b8_stride;
     s->block_wrap[4]=
-    s->block_wrap[5]= s->mb_width + 2;
-
-    s->y_dc_scale_table=
-    s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
-    s->chroma_qscale_table= ff_default_chroma_qscale_table;
-    if( s->codec_id != CODEC_ID_MPEG1VIDEO && 
-        s->codec_id != CODEC_ID_MPEG2VIDEO) 
-    {
-        /* default structure is frame */
-        s->progressive_frame= 1;
-        s->picture_structure= PICT_FRAME;
-
-        s->y_dc_scale_table=
-        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
-        if (!s->encoding)
-            s->progressive_sequence= 1;
-    }
-    s->coded_picture_number = 0;
-
-    y_size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
-    c_size = (s->mb_width + 2) * (s->mb_height + 2);
+    s->block_wrap[5]= s->mb_stride;
+ 
+    y_size = s->b8_stride * (2 * s->mb_height + 1);
+    c_size = s->mb_stride * (s->mb_height + 1);
     yc_size = y_size + 2 * c_size;
-
+    
     /* convert fourcc to upper case */
     s->avctx->codec_tag=   toupper( s->avctx->codec_tag     &0xFF)          
                         + (toupper((s->avctx->codec_tag>>8 )&0xFF)<<8 )
@@ -642,12 +715,14 @@ int MPV_common_init(MpegEncContext *s)
     }
     if (s->out_format == FMT_H263) {
         /* ac values */
-        CHECKED_ALLOCZ(s->ac_val[0], yc_size * sizeof(int16_t) * 16);
-        s->ac_val[1] = s->ac_val[0] + y_size;
+        CHECKED_ALLOCZ(s->ac_val_base, yc_size * sizeof(int16_t) * 16);
+        s->ac_val[0] = s->ac_val_base + s->b8_stride + 1;
+        s->ac_val[1] = s->ac_val_base + y_size + s->mb_stride + 1;
         s->ac_val[2] = s->ac_val[1] + c_size;
         
         /* cbp values */
-        CHECKED_ALLOCZ(s->coded_block, y_size);
+        CHECKED_ALLOCZ(s->coded_block_base, y_size);
+        s->coded_block= s->coded_block_base + s->b8_stride + 1;
         
         /* divx501 bitstream reorder buffer */
         CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE);
@@ -660,20 +735,18 @@ int MPV_common_init(MpegEncContext *s)
     if (s->h263_pred || s->h263_plus || !s->encoding) {
         /* dc values */
         //MN: we need these for error resilience of intra-frames
-        CHECKED_ALLOCZ(s->dc_val[0], yc_size * sizeof(int16_t));
-        s->dc_val[1] = s->dc_val[0] + y_size;
+        CHECKED_ALLOCZ(s->dc_val_base, yc_size * sizeof(int16_t));
+        s->dc_val[0] = s->dc_val_base + s->b8_stride + 1;
+        s->dc_val[1] = s->dc_val_base + y_size + s->mb_stride + 1;
         s->dc_val[2] = s->dc_val[1] + c_size;
         for(i=0;i<yc_size;i++)
-            s->dc_val[0][i] = 1024;
+            s->dc_val_base[i] = 1024;
     }
 
     /* which mb is a intra block */
     CHECKED_ALLOCZ(s->mbintra_table, mb_array_size);
     memset(s->mbintra_table, 1, mb_array_size);
     
-    /* default structure is frame */
-    s->picture_structure = PICT_FRAME;
-    
     /* init macroblock skip table */
     CHECKED_ALLOCZ(s->mbskip_table, mb_array_size+2);
     //Note the +1 is for a quicker mpeg4 slice_end detection
@@ -748,9 +821,9 @@ void MPV_common_end(MpegEncContext *s)
         av_freep(&s->p_field_select_table[i]);
     }
     
-    av_freep(&s->dc_val[0]);
-    av_freep(&s->ac_val[0]);
-    av_freep(&s->coded_block);
+    av_freep(&s->dc_val_base);
+    av_freep(&s->ac_val_base);
+    av_freep(&s->coded_block_base);
     av_freep(&s->mbintra_table);
     av_freep(&s->cbp_table);
     av_freep(&s->pred_dir_table);
@@ -782,9 +855,9 @@ void MPV_common_end(MpegEncContext *s)
     s->last_picture_ptr=
     s->next_picture_ptr=
     s->current_picture_ptr= NULL;
+
     for(i=0; i<3; i++)
-        if (s->visualization_buffer[i])
-            av_free(s->visualization_buffer[i]);
+        av_freep(&s->visualization_buffer[i]);
 }
 
 #ifdef CONFIG_ENCODERS
@@ -795,6 +868,8 @@ int MPV_encode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     int i, dummy;
     int chroma_h_shift, chroma_v_shift;
+    
+    MPV_encode_defaults(s);
 
     avctx->pix_fmt = PIX_FMT_YUV420P; // FIXME
 
@@ -850,8 +925,15 @@ int MPV_encode_init(AVCodecContext *avctx)
 
     if(avctx->rc_min_rate && avctx->rc_max_rate != avctx->rc_min_rate){
         av_log(avctx, AV_LOG_INFO, "Warning min_rate > 0 but min_rate != max_rate isnt recommanded!\n");
-    }    
+    }
+    
+    if(   s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate 
+       && (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO)
+       && 90000LL * (avctx->rc_buffer_size-1) > s->avctx->rc_max_rate*0xFFFFLL){
         
+        av_log(avctx, AV_LOG_INFO, "Warning vbv_delay will be set to 0xFFFF (=VBR) as the specified vbv buffer is too large for the given bitrate!\n");
+    }
+       
     if((s->flags & CODEC_FLAG_4MV) && s->codec_id != CODEC_ID_MPEG4 
        && s->codec_id != CODEC_ID_H263 && s->codec_id != CODEC_ID_H263P && s->codec_id != CODEC_ID_FLV1){
         av_log(avctx, AV_LOG_ERROR, "4MV not supported by codec\n");
@@ -882,7 +964,13 @@ int MPV_encode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_ERROR, "b frames not supported by codec\n");
         return -1;
     }
-    
+
+    if((s->flags & (CODEC_FLAG_INTERLACED_DCT|CODEC_FLAG_INTERLACED_ME|CODEC_FLAG_ALT_SCAN)) 
+       && s->codec_id != CODEC_ID_MPEG4 && s->codec_id != CODEC_ID_MPEG2VIDEO){
+        av_log(avctx, AV_LOG_ERROR, "interlacing not supported by codec\n");
+        return -1;
+    }
+        
     if(s->mpeg_quant && s->codec_id != CODEC_ID_MPEG4){ //FIXME mpeg2 uses that too
         av_log(avctx, AV_LOG_ERROR, "mpeg2 style quantization not supporetd by codec\n");
         return -1;
@@ -1081,28 +1169,6 @@ int MPV_encode_init(AVCodecContext *avctx)
         return -1;
     }
 
-    { /* set up some save defaults, some codecs might override them later */
-        static int done=0;
-        if(!done){
-            int i;
-            done=1;
-
-            default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) );
-            memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1));
-            memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1));
-
-            for(i=-16; i<16; i++){
-                default_fcode_tab[i + MAX_MV]= 1;
-            }
-        }
-    }
-    s->me.mv_penalty= default_mv_penalty;
-    s->fcode_tab= default_fcode_tab;
- 
-    /* dont use mv_penalty table for crap MV as it would be confused */
-    //FIXME remove after fixing / removing old ME
-    if (s->me_method < ME_EPZS) s->me.mv_penalty = default_mv_penalty;
-
     s->encoding = 1;
 
     /* init */
@@ -1122,22 +1188,22 @@ int MPV_encode_init(AVCodecContext *avctx)
     ff_init_me(s);
 #endif /* #if 0 */
 
+#ifdef CONFIG_ENCODERS
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_ENCODERS
 #ifdef CONFIG_RISKY
     if (s->out_format == FMT_H263)
         h263_encode_init(s);
     if(s->msmpeg4_version)
         ff_msmpeg4_encode_init(s);
 #endif
-#endif
 #endif /* #if 0 */
 /* xine: we do want this for MPEG-1 encoding */
     if (s->out_format == FMT_MPEG1)
         ff_mpeg1_encode_init(s);
+#endif
 
-    /* init default q matrix */
+    /* init q matrix */
     for(i=0;i<64;i++) {
         int j= s->dsp.idct_permutation[i];
 #ifdef CONFIG_RISKY
@@ -1170,14 +1236,7 @@ int MPV_encode_init(AVCodecContext *avctx)
 
     if(ff_rate_control_init(s) < 0)
         return -1;
-
-    s->picture_number = 0;
-    s->input_picture_number = 0;
-    s->picture_in_gop_number = 0;
-    /* motion detector init */
-    s->f_code = 1;
-    s->b_code = 1;
-
+    
     return 0;
 }
 
@@ -1321,7 +1380,7 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
     assert(s->last_picture_ptr==NULL || s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3);
 
     /* mark&release old frames */
-    if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr->data[0]) {
+    if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr != s->next_picture_ptr && s->last_picture_ptr->data[0]) {
         avctx->release_buffer(avctx, (AVFrame*)s->last_picture_ptr);
 
         /* release forgotten pictures */
@@ -1351,7 +1410,7 @@ alloc:
             pic= (AVFrame*)&s->picture[i];
         }
 
-        pic->reference= s->pict_type != B_TYPE ? 3 : 0;
+        pic->reference= s->pict_type != B_TYPE && !s->dropable ? 3 : 0;
 
         pic->coded_picture_number= s->coded_picture_number++;
         
@@ -1373,8 +1432,14 @@ alloc:
   if(s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3){
     if (s->pict_type != B_TYPE) {
         s->last_picture_ptr= s->next_picture_ptr;
-        s->next_picture_ptr= s->current_picture_ptr;
+        if(!s->dropable)
+            s->next_picture_ptr= s->current_picture_ptr;
     }
+/*    av_log(s->avctx, AV_LOG_DEBUG, "L%p N%p C%p L%p N%p C%p type:%d drop:%d\n", s->last_picture_ptr, s->next_picture_ptr,s->current_picture_ptr,
+        s->last_picture_ptr    ? s->last_picture_ptr->data[0] : NULL, 
+        s->next_picture_ptr    ? s->next_picture_ptr->data[0] : NULL, 
+        s->current_picture_ptr ? s->current_picture_ptr->data[0] : NULL,
+        s->pict_type, s->dropable);*/
     
     if(s->last_picture_ptr) copy_picture(&s->last_picture, s->last_picture_ptr);
     if(s->next_picture_ptr) copy_picture(&s->next_picture, s->next_picture_ptr);
@@ -1486,7 +1551,7 @@ void MPV_frame_end(MpegEncContext *s)
  * @param color color of the arrow
  */
 static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){
-    int t, x, y, f;
+    int t, x, y, fr, f;
     
     sx= clip(sx, 0, w-1);
     sy= clip(sy, 0, h-1);
@@ -1504,8 +1569,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h
         ex-= sx;
         f= ((ey-sy)<<16)/ex;
         for(x= 0; x <= ex; x++){
-            y= ((x*f) + (1<<15))>>16;
-            buf[y*stride + x]+= color;
+            y = (x*f)>>16;
+            fr= (x*f)&0xFFFF;
+            buf[ y   *stride + x]+= (color*(0x10000-fr))>>16;
+            buf[(y+1)*stride + x]+= (color*         fr )>>16;
         }
     }else{
         if(sy > ey){
@@ -1517,8 +1584,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h
         if(ey) f= ((ex-sx)<<16)/ey;
         else   f= 0;
         for(y= 0; y <= ey; y++){
-            x= ((y*f) + (1<<15))>>16;
-            buf[y*stride + x]+= color;
+            x = (y*f)>>16;
+            fr= (y*f)&0xFFFF;
+            buf[y*stride + x  ]+= (color*(0x10000-fr))>>16;;
+            buf[y*stride + x+1]+= (color*         fr )>>16;;
         }
     }
 }
@@ -1680,12 +1749,13 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
                     if(!USES_LIST(pict->mb_type[mb_index], direction))
                         continue;
 
+                    //FIXME for h264
                     if(IS_8X8(pict->mb_type[mb_index])){
                       int i;
                       for(i=0; i<4; i++){
                         int sx= mb_x*16 + 4 + 8*(i&1);
                         int sy= mb_y*16 + 4 + 8*(i>>1);
-                        int xy= 1 + mb_x*2 + (i&1) + (mb_y*2 + 1 + (i>>1))*(s->mb_width*2 + 2);
+                        int xy= mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride;
                         int mx= (pict->motion_val[direction][xy][0]>>shift) + sx;
                         int my= (pict->motion_val[direction][xy][1]>>shift) + sy;
                         draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
@@ -1695,15 +1765,19 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
                       for(i=0; i<2; i++){
                         int sx=mb_x*16 + 8;
                         int sy=mb_y*16 + 4 + 8*i;
-                        int xy=1 + mb_x*2 + (mb_y*2 + 1 + i)*(s->mb_width*2 + 2);
-                        int mx=(pict->motion_val[direction][xy][0]>>shift) + sx;
-                        int my=(pict->motion_val[direction][xy][1]>>shift) + sy;
-                        draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
+                        int xy= mb_x*2 + (mb_y*2 + i)*s->b8_stride;
+                        int mx=(pict->motion_val[direction][xy][0]>>shift);
+                        int my=(pict->motion_val[direction][xy][1]>>shift);
+                        
+                        if(IS_INTERLACED(pict->mb_type[mb_index]))
+                            my*=2;
+                        
+                        draw_arrow(ptr, sx, sy, mx+sx, my+sy, s->width, s->height, s->linesize, 100);
                       }
                     }else{
                       int sx= mb_x*16 + 8;
                       int sy= mb_y*16 + 8;
-                      int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2);
+                      int xy= mb_x*2 + mb_y*2*s->b8_stride;
                       int mx= (pict->motion_val[direction][xy][0]>>shift) + sx;
                       int my= (pict->motion_val[direction][xy][1]>>shift) + sy;
                       draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
@@ -1880,7 +1954,7 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
             }
         }
     }
-    copy_picture_attributes(pic, pic_arg);
+    copy_picture_attributes(s, pic, pic_arg);
     
     pic->display_picture_number= s->input_picture_number++;
     if(pic->pts != AV_NOPTS_VALUE){ 
@@ -2009,11 +2083,12 @@ static void select_input_picture(MpegEncContext *s){
                 s->reordered_input_picture[0]->data[i]= NULL;
             s->reordered_input_picture[0]->type= 0;
             
-            copy_picture_attributes((AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]);
             pic->reference              = s->reordered_input_picture[0]->reference;
             
             alloc_picture(s, pic, 0);
 
+            copy_picture_attributes(s, (AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]);
+
             s->current_picture_ptr= pic;
         }else{
             // input is not a shared pix -> reuse buffer for current_pix
@@ -2125,7 +2200,8 @@ int MPV_encode_picture(AVCodecContext *avctx,
         }
 
         /* update mpeg1/2 vbv_delay for CBR */    
-        if(s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate && s->out_format == FMT_MPEG1){
+        if(s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate && s->out_format == FMT_MPEG1
+           && 90000LL * (avctx->rc_buffer_size-1) <= s->avctx->rc_max_rate*0xFFFFLL){
             int vbv_delay;
 
             assert(s->repeat_first_field==0);
@@ -2432,9 +2508,17 @@ if(s->quarter_sample)
     src_y = s->mb_y*(16>>field_based) + (motion_y >> 1);
 
     if (s->out_format == FMT_H263) {
-        uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1);
-        uvsrc_x = src_x>>1;
-        uvsrc_y = src_y>>1;
+        if((s->workaround_bugs & FF_BUG_HPEL_CHROMA) && field_based){
+            mx = (motion_x>>1)|(motion_x&1);
+            my = motion_y >>1;
+            uvdxy = ((my & 1) << 1) | (mx & 1);
+            uvsrc_x = s->mb_x* 8               + (mx >> 1);
+            uvsrc_y = s->mb_y*(8>>field_based) + (my >> 1);
+        }else{
+            uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1);
+            uvsrc_x = src_x>>1;
+            uvsrc_y = src_y>>1;
+        }
     } else {
         mx = motion_x / 2;
         my = motion_y / 2;
@@ -2736,8 +2820,8 @@ static inline void MPV_motion(MpegEncContext *s,
     if(s->obmc && s->pict_type != B_TYPE){
         int16_t mv_cache[4][4][2];
         const int xy= s->mb_x + s->mb_y*s->mb_stride;
-        const int mot_stride= s->mb_width*2 + 2;
-        const int mot_xy= 1 + mb_x*2 + (mb_y*2 + 1)*mot_stride;
+        const int mot_stride= s->b8_stride;
+        const int mot_xy= mb_x*2 + mb_y*2*mot_stride;
 
         assert(!s->mb_skiped);
                 
@@ -2993,7 +3077,7 @@ static inline void add_dequant_dct(MpegEncContext *s,
  */
 void ff_clean_intra_table_entries(MpegEncContext *s)
 {
-    int wrap = s->block_wrap[0];
+    int wrap = s->b8_stride;
     int xy = s->block_index[0];
     
     s->dc_val[0][xy           ] = 
@@ -3010,15 +3094,15 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
         s->coded_block[xy + 1 + wrap] = 0;
     }
     /* chroma */
-    wrap = s->block_wrap[4];
-    xy = s->mb_x + 1 + (s->mb_y + 1) * wrap;
+    wrap = s->mb_stride;
+    xy = s->mb_x + s->mb_y * wrap;
     s->dc_val[1][xy] =
     s->dc_val[2][xy] = 1024;
     /* ac pred */
     memset(s->ac_val[1][xy], 0, 16 * sizeof(int16_t));
     memset(s->ac_val[2][xy], 0, 16 * sizeof(int16_t));
     
-    s->mbintra_table[s->mb_x + s->mb_y*s->mb_stride]= 0;
+    s->mbintra_table[xy]= 0;
 }
 
 /* generic function called after a macroblock has been parsed by the
@@ -3338,12 +3422,12 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     const int linesize= s->current_picture.linesize[0]; //not s->linesize as this woulnd be wrong for field pics
     const int uvlinesize= s->current_picture.linesize[1];
         
-    s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
-    s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1)     + s->mb_x*2;
-    s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2;
-    s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2)     + s->mb_x*2;
-    s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
-    s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
+    s->block_index[0]= s->b8_stride*(s->mb_y*2    ) - 2 + s->mb_x*2;
+    s->block_index[1]= s->b8_stride*(s->mb_y*2    ) - 1 + s->mb_x*2;
+    s->block_index[2]= s->b8_stride*(s->mb_y*2 + 1) - 2 + s->mb_x*2;
+    s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
+    s->block_index[4]= s->mb_stride*(s->mb_y + 1)                + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
+    s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
     
     if(s->pict_type==B_TYPE && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME){
         s->dest[0] = s->current_picture.data[0] + s->mb_x * 16 - 16;
@@ -3392,7 +3476,6 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     int dct_offset   = s->linesize*8; //default for progressive frames
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     int wrap_y, wrap_c;
-    int emu=0;
     
     for(i=0; i<6; i++) skip_dct[i]=0;
     
@@ -4065,8 +4148,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
             ff_update_block_index(s);
 
             /* write gob / video packet header  */
-/* xine: do not need this for decode or MPEG-1 encoding modes */
-#if 0
 #ifdef CONFIG_RISKY
             if(s->rtp_mode){
                 int current_packet_size, is_gob_start;
@@ -4121,19 +4202,25 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         s->avctx->rtp_callback(s->ptr_lastgob, current_packet_size, 0);
                     
                     switch(s->codec_id){
+/* xine: do not need this for decode or MPEG-1 encoding modes */
+#if 0
                     case CODEC_ID_MPEG4:
                         ff_mpeg4_encode_video_packet_header(s);
                         ff_mpeg4_clean_buffers(s);
                     break;
+#endif /* #if 0 */
                     case CODEC_ID_MPEG1VIDEO:
                     case CODEC_ID_MPEG2VIDEO:
                         ff_mpeg1_encode_slice_header(s);
                         ff_mpeg1_clean_buffers(s);
                     break;
+/* xine: do not need this for decode or MPEG-1 encoding modes */
+#if 0
                     case CODEC_ID_H263:
                     case CODEC_ID_H263P:
                         h263_encode_gob_header(s, mb_y);                       
                     break;
+#endif /* #if 0 */
                     }
 
                     if(s->flags&CODEC_FLAG_PASS1){
@@ -4149,8 +4236,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 }
             }
 #endif
-#endif /* #if 0 */
-
 
             if(  (s->resync_mb_x   == s->mb_x)
                && s->resync_mb_y+1 == s->mb_y){
@@ -4615,7 +4700,6 @@ static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src)
 
 static void encode_picture(MpegEncContext *s, int picture_number)
 {
-    int mb_x, mb_y;
     int i, j;
     int bits;
 
@@ -4651,19 +4735,18 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     for(i=1; i<s->avctx->thread_count; i++){
         ff_update_duplicate_context(s->thread_context[i], s);
     }
-    
+
+    ff_init_me(s);
+
     /* Estimate motion for every MB */
     if(s->pict_type != I_TYPE){
-/* xine: do not need this for decode or MPEG-1 encoding modes */
-#if 0
-        if(s->pict_type != B_TYPE){
+        if(s->pict_type != B_TYPE && s->avctx->me_threshold==0){
             if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
                 s->avctx->execute(s->avctx, pre_estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
             }
         }
 
         s->avctx->execute(s->avctx, estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
-#endif /* #if 0 */
     }else /* if(s->pict_type == I_TYPE) */{
         /* I-Frame */
         for(i=0; i<s->mb_stride*s->mb_height; i++)
@@ -5194,7 +5277,6 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise?
     int prev_run=0;
     int prev_level=0;
     int qmul, qadd, start_i, last_non_zero, i, dc;
-    const int esc_length= s->ac_esc_length;
     uint8_t * length;
     uint8_t * last_length;
     int lambda;
@@ -5302,7 +5384,6 @@ STOP_TIMER("init rem[]")
 #endif
     for(;;){
         int best_score=s->dsp.try_8x8basis(rem, weight, basis[0], 0);
-        int nochange_score= best_score;
         int best_coeff=0;
         int best_change=0;
         int run2, best_unquant_change, analyze_gradient;
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 171d66d83..cd42177f5 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -138,7 +138,6 @@ typedef struct Picture{
      */
     uint8_t *interpolated[3];
     int16_t (*motion_val_base[2])[2];
-    int8_t *ref_index[2];
     uint32_t *mb_type_base;
 #define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if theres just one type
 #define IS_INTRA4x4(a)   ((a)&MB_TYPE_INTRA4x4)
@@ -201,6 +200,10 @@ typedef struct MotionEstContext{
     int co_located_mv[4][2];           ///< mv from last p frame for direct mode ME 
     int direct_basis_mv[4][2];
     uint8_t *scratchpad;               ///< data area for the me algo, so that the ME doesnt need to malloc/free 
+    uint8_t *best_mb;
+    uint8_t *temp_mb[2];
+    uint8_t *temp;
+    int best_bits;
     uint32_t *map;                     ///< map to avoid duplicate evaluations 
     uint32_t *score_map;               ///< map to store the scores 
     int map_generation;  
@@ -208,31 +211,32 @@ typedef struct MotionEstContext{
     int penalty_factor;
     int sub_penalty_factor;
     int mb_penalty_factor;
+    int flags;
+    int sub_flags;
+    int mb_flags;
     int pre_pass;                      ///< = 1 for the pre pass 
     int dia_size;
     int xmin;
     int xmax;
     int ymin;
     int ymax;
+    int pred_x;
+    int pred_y;
+    uint8_t *src[4][4];
+    uint8_t *ref[4][4];
+    int stride;
+    int uvstride;
+/*    cmp, chroma_cmp;*/
+    op_pixels_func (*hpel_put)[4];
+    op_pixels_func (*hpel_avg)[4];
+    qpel_mc_func (*qpel_put)[16];
+    qpel_mc_func (*qpel_avg)[16];
     uint8_t (*mv_penalty)[MAX_MV*2+1];  ///< amount of bits needed to encode a MV 
+    uint8_t *current_mv_penalty;
     int (*sub_motion_search)(struct MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-                                  int pred_x, int pred_y, uint8_t *src_data[3],
-                                  uint8_t *ref_data[6], int stride, int uvstride,
-                                  int size, int h, uint8_t * const mv_penalty);
-    int (*motion_search[7])(struct MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3],
-                             uint8_t *ref_data[6], int stride, int uvstride, int16_t (*last_mv)[2], 
-                             int ref_mv_scale, uint8_t * const mv_penalty);
-    int (*pre_motion_search)(struct MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y, uint8_t *src_data[3], 
-                             uint8_t *ref_data[6], int stride, int uvstride, int16_t (*last_mv)[2], 
-                             int ref_mv_scale, uint8_t * const mv_penalty);
-    int (*get_mb_score)(struct MpegEncContext * s, int mx, int my, int pred_x, int pred_y, uint8_t *src_data[3],
-                                  uint8_t *ref_data[6], int stride, int uvstride,    
-                                  uint8_t * const mv_penalty);
+                                  int src_index, int ref_index,
+                                  int size, int h);
 }MotionEstContext;
 
 /**
@@ -321,13 +325,16 @@ typedef struct MpegEncContext {
     Picture *current_picture_ptr;  ///< pointer to the current picture
     uint8_t *visualization_buffer[3]; //< temporary buffer vor MV visualization
     int last_dc[3];                ///< last DC values for MPEG1 
+    int16_t *dc_val_base;
     int16_t *dc_val[3];            ///< used for mpeg4 DC prediction, all 3 arrays must be continuous 
     int16_t dc_cache[4*5];
     int y_dc_scale, c_dc_scale;
     uint8_t *y_dc_scale_table;     ///< qscale -> y_dc_scale table 
     uint8_t *c_dc_scale_table;     ///< qscale -> c_dc_scale table 
     const uint8_t *chroma_qscale_table;  ///< qscale -> chroma_qscale (h263)
+    uint8_t *coded_block_base;
     uint8_t *coded_block;          ///< used for coded block pattern prediction (msmpeg4v3, wmv1)
+    int16_t (*ac_val_base)[16];
     int16_t (*ac_val[3])[16];      ///< used for for mpeg4 AC prediction, all 3 arrays must be continuous 
     int ac_pred;
     uint8_t *prev_pict_types;     ///< previous picture types in bitstream order, used for mb skip 
@@ -352,8 +359,9 @@ typedef struct MpegEncContext {
     int adaptive_quant;         ///< use adaptive quantization 
     int dquant;                 ///< qscale difference to prev qscale  
     int pict_type;              ///< I_TYPE, P_TYPE, B_TYPE, ... 
-    int last_pict_type;
+    int last_pict_type; //FIXME removes
     int last_non_b_pict_type;   ///< used for mpeg4 gmc b-frames & ratecontrol 
+    int dropable;
     int frame_rate_index;
     int frame_rate_ext_n;       ///< MPEG-2 specific framerate modificators (numerator)
     int frame_rate_ext_d;       ///< MPEG-2 specific framerate modificators (denominator)
@@ -706,6 +714,7 @@ typedef struct MpegEncContext {
 
 
 int DCT_common_init(MpegEncContext *s);
+void MPV_decode_defaults(MpegEncContext *s);
 int MPV_common_init(MpegEncContext *s);
 void MPV_common_end(MpegEncContext *s);
 void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
@@ -856,7 +865,7 @@ void mpeg4_encode_mb(MpegEncContext *s,
 void h263_encode_picture_header(MpegEncContext *s, int picture_number);
 void ff_flv_encode_picture_header(MpegEncContext *s, int picture_number);
 void h263_encode_gob_header(MpegEncContext * s, int mb_line);
-int16_t *h263_pred_motion(MpegEncContext * s, int block, 
+int16_t *h263_pred_motion(MpegEncContext * s, int block, int dir,
                         int *px, int *py);
 void mpeg4_pred_ac(MpegEncContext * s, DCTELEM *block, int n, 
                    int dir);
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index b7b88c38f..c6cfebe16 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -425,7 +425,9 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
 #ifdef DEBUG
     intra_count = 0;
+/*
     printf("*****frame %d:\n", frame_count++);
+*/
 #endif
 }
 
@@ -449,7 +451,7 @@ static inline int coded_block_pred(MpegEncContext * s, int n, uint8_t **coded_bl
     int xy, wrap, pred, a, b, c;
 
     xy = s->block_index[n];
-    wrap = s->block_wrap[0];
+    wrap = s->b8_stride;
 
     /* B C
      * A X 
@@ -567,7 +569,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
 
             s->misc_bits += get_bits_diff(s);
 
-            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             msmpeg4v2_encode_motion(s, motion_x - pred_x);
             msmpeg4v2_encode_motion(s, motion_y - pred_y);
         }else{
@@ -578,7 +580,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
             s->misc_bits += get_bits_diff(s);
 
             /* motion vector */
-            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             msmpeg4_encode_motion(s, motion_x - pred_x, 
                                   motion_y - pred_y);
         }
@@ -1549,7 +1551,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         cbp|= cbpy<<2;
         if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C;
         
-        h263_pred_motion(s, 0, &mx, &my);
+        h263_pred_motion(s, 0, 0, &mx, &my);
         mx= msmpeg4v2_decode_motion(s, mx, 1);
         my= msmpeg4v2_decode_motion(s, my, 1);
         
@@ -1637,7 +1639,7 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             s->rl_chroma_table_index = s->rl_table_index;
         }
         set_stat(ST_MV);
-        h263_pred_motion(s, 0, &mx, &my);
+        h263_pred_motion(s, 0, 0, &mx, &my);
         if (msmpeg4_decode_motion(s, &mx, &my) < 0)
             return -1;
         s->mv_dir = MV_DIR_FORWARD;
diff --git a/src/libffmpeg/libavcodec/ppc/Makefile.am b/src/libffmpeg/libavcodec/ppc/Makefile.am
index fbd734c29..50b9d802e 100644
--- a/src/libffmpeg/libavcodec/ppc/Makefile.am
+++ b/src/libffmpeg/libavcodec/ppc/Makefile.am
@@ -11,6 +11,7 @@ noinst_LTLIBRARIES = libavcodec_ppc.la
 
 libavcodec_ppc_src =  dsputil_altivec.c \
 		      dsputil_ppc.c \
+		      fdct_altivec.c \
 		      fft_altivec.c \
 		      idct_altivec.c \
 		      gmc_altivec.c \
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
index 633cae68b..1bc6fb009 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2002 Brian Foley
  * Copyright (c) 2002 Dieter Shirley
- * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -1302,6 +1302,357 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
+/* Sum of |8x8 Hadamard transform of (src - dst)|; rows are stride bytes apart, s and h are not used. */
+int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
+  int sum;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
+  register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
+  register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  { /* opened before the #ifdef so the closing brace after the butterflies is matched in both branches */
+#ifdef CONFIG_DARWIN
+    register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+    register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+    register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+    register const_vector unsigned char perm1 = (const_vector unsigned char)
+      (0x02, 0x03, 0x00, 0x01,
+       0x06, 0x07, 0x04, 0x05,
+       0x0A, 0x0B, 0x08, 0x09,
+       0x0E, 0x0F, 0x0C, 0x0D);
+    register const_vector unsigned char perm2 = (const_vector unsigned char)
+      (0x04, 0x05, 0x06, 0x07,
+       0x00, 0x01, 0x02, 0x03,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x08, 0x09, 0x0A, 0x0B);
+    register const_vector unsigned char perm3 = (const_vector unsigned char)
+      (0x08, 0x09, 0x0A, 0x0B,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x00, 0x01, 0x02, 0x03,
+       0x04, 0x05, 0x06, 0x07);
+#else
+    register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+    register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+    register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+    register const_vector unsigned char perm1 = (const_vector unsigned char)
+      {0x02, 0x03, 0x00, 0x01,
+       0x06, 0x07, 0x04, 0x05,
+       0x0A, 0x0B, 0x08, 0x09,
+       0x0E, 0x0F, 0x0C, 0x0D};
+    register const_vector unsigned char perm2 = (const_vector unsigned char)
+      {0x04, 0x05, 0x06, 0x07,
+       0x00, 0x01, 0x02, 0x03,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x08, 0x09, 0x0A, 0x0B};
+    register const_vector unsigned char perm3 = (const_vector unsigned char)
+      {0x08, 0x09, 0x0A, 0x0B,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x00, 0x01, 0x02, 0x03,
+       0x04, 0x05, 0x06, 0x07};
+#endif
+#define ONEITERBUTTERFLY(i, res)					\
+    {									\
+      register vector unsigned char src1, src2, srcO;		       	\
+      register vector unsigned char dst1, dst2, dstO;		       	\
+      src1 = vec_ld(stride * i, src);					\
+      if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8)	\
+	src2 = vec_ld((stride * i) + 16, src);				\
+      srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));		\
+      dst1 = vec_ld(stride * i, dst);					\
+      if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8)	\
+	dst2 = vec_ld((stride * i) + 16, dst);				\
+      dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));		\
+      /* promote the unsigned chars to signed shorts */			\
+      /* only the first 8 pixels are kept (vec_mergeh), hence the second load only when the offset is > 8 */ \
+      register vector signed short srcV =			       	\
+	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
+      register vector signed short dstV =			       	\
+	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+      /* subtraction, then three butterfly stages along the row: x*(+-1) + permuted x */ \
+      register vector signed short but0 = vec_sub(srcV, dstV);	       	\
+      register vector signed short op1 = vec_perm(but0, but0, perm1);  	\
+      register vector signed short but1 = vec_mladd(but0, vprod1, op1);	\
+      register vector signed short op2 = vec_perm(but1, but1, perm2);  	\
+      register vector signed short but2 = vec_mladd(but1, vprod2, op2);	\
+      register vector signed short op3 = vec_perm(but2, but2, perm3);  	\
+      res = vec_mladd(but2, vprod3, op3);				\
+    }
+    ONEITERBUTTERFLY(0, temp0);
+    ONEITERBUTTERFLY(1, temp1);
+    ONEITERBUTTERFLY(2, temp2);
+    ONEITERBUTTERFLY(3, temp3);
+    ONEITERBUTTERFLY(4, temp4);
+    ONEITERBUTTERFLY(5, temp5);
+    ONEITERBUTTERFLY(6, temp6);
+    ONEITERBUTTERFLY(7, temp7);
+  }
+#undef ONEITERBUTTERFLY
+  {
+    register vector signed int vsum;
+    register vector signed short line0 = vec_add(temp0, temp1);
+    register vector signed short line1 = vec_sub(temp0, temp1);
+    register vector signed short line2 = vec_add(temp2, temp3);
+    register vector signed short line3 = vec_sub(temp2, temp3);
+    register vector signed short line4 = vec_add(temp4, temp5);
+    register vector signed short line5 = vec_sub(temp4, temp5);
+    register vector signed short line6 = vec_add(temp6, temp7);
+    register vector signed short line7 = vec_sub(temp6, temp7);
+    /* second butterfly stage across the row vectors */
+    register vector signed short line0B = vec_add(line0, line2);
+    register vector signed short line2B = vec_sub(line0, line2);
+    register vector signed short line1B = vec_add(line1, line3);
+    register vector signed short line3B = vec_sub(line1, line3);
+    register vector signed short line4B = vec_add(line4, line6);
+    register vector signed short line6B = vec_sub(line4, line6);
+    register vector signed short line5B = vec_add(line5, line7);
+    register vector signed short line7B = vec_sub(line5, line7);
+    /* third butterfly stage across the row vectors */
+    register vector signed short line0C = vec_add(line0B, line4B);
+    register vector signed short line4C = vec_sub(line0B, line4B);
+    register vector signed short line1C = vec_add(line1B, line5B);
+    register vector signed short line5C = vec_sub(line1B, line5B);
+    register vector signed short line2C = vec_add(line2B, line6B);
+    register vector signed short line6C = vec_sub(line2B, line6B);
+    register vector signed short line3C = vec_add(line3B, line7B);
+    register vector signed short line7C = vec_sub(line3B, line7B);
+    /* accumulate the absolute values of all coefficients, then reduce to one scalar */
+    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+    vsum = vec_sum4s(vec_abs(line1C), vsum);
+    vsum = vec_sum4s(vec_abs(line2C), vsum);
+    vsum = vec_sum4s(vec_abs(line3C), vsum);
+    vsum = vec_sum4s(vec_abs(line4C), vsum);
+    vsum = vec_sum4s(vec_abs(line5C), vsum);
+    vsum = vec_sum4s(vec_abs(line6C), vsum);
+    vsum = vec_sum4s(vec_abs(line7C), vsum);
+    vsum = vec_sums(vsum, (vector signed int)vzero);
+    vsum = vec_splat(vsum, 3);
+    vec_ste(vsum, 0, &sum);
+  }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
+  return sum;
+}
+
+/*
+  16x8 works on 16 elements per row; this avoids replicating
+  loads and gives the compiler more room for scheduling.
+  It is only used from inside hadamard8_diff16_altivec.
+  
+  Unfortunately, gcc-3.3 produces a LOT of spill code for
+  the plain version: unlike xlc, it cannot keep everything
+  in registers by itself. The following code therefore
+  includes a hand-made register allocation. It is not
+  clean, but on a 7450 the resulting code is much faster
+  (the best case falls from 700+ cycles to 550).
+  
+  xlc doesn't add spill code, but it doesn't know how to
+  schedule for the 7450, and its code isn't much faster than
+  gcc-3.3's on the 7450 (but uses 25% fewer instructions...)
+  
+  On the 970, the hand-made RA is still a win (around 690
+  vs. around 780), but xlc goes to around 660 on the
+  regular C code...
+  Returns the summed |Hadamard coefficients| of both 8x8 halves; s and h are not used.
+*/
+
+static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
+  int sum;
+  register vector signed short
+    temp0 asm ("v0"),
+    temp1 asm ("v1"),
+    temp2 asm ("v2"),
+    temp3 asm ("v3"),
+    temp4 asm ("v4"),
+    temp5 asm ("v5"),
+    temp6 asm ("v6"),
+    temp7 asm ("v7");
+  register vector signed short
+    temp0S asm ("v8"),
+    temp1S asm ("v9"),
+    temp2S asm ("v10"),
+    temp3S asm ("v11"),
+    temp4S asm ("v12"),
+    temp5S asm ("v13"),
+    temp6S asm ("v14"),
+    temp7S asm ("v15");
+  register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
+  {
+#ifdef CONFIG_DARWIN
+    register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+    register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+    register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+    register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
+      (0x02, 0x03, 0x00, 0x01,
+       0x06, 0x07, 0x04, 0x05,
+       0x0A, 0x0B, 0x08, 0x09,
+       0x0E, 0x0F, 0x0C, 0x0D);
+    register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
+      (0x04, 0x05, 0x06, 0x07,
+       0x00, 0x01, 0x02, 0x03,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x08, 0x09, 0x0A, 0x0B);
+    register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
+      (0x08, 0x09, 0x0A, 0x0B,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x00, 0x01, 0x02, 0x03,
+       0x04, 0x05, 0x06, 0x07);
+#else /* same constants with brace initialisers; this branch does not pin them to v16-v21 */
+    register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+    register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+    register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+    register const_vector unsigned char perm1 = (const_vector unsigned char)
+      {0x02, 0x03, 0x00, 0x01,
+       0x06, 0x07, 0x04, 0x05,
+       0x0A, 0x0B, 0x08, 0x09,
+       0x0E, 0x0F, 0x0C, 0x0D};
+    register const_vector unsigned char perm2 = (const_vector unsigned char)
+      {0x04, 0x05, 0x06, 0x07,
+       0x00, 0x01, 0x02, 0x03,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x08, 0x09, 0x0A, 0x0B};
+    register const_vector unsigned char perm3 = (const_vector unsigned char)
+      {0x08, 0x09, 0x0A, 0x0B,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x00, 0x01, 0x02, 0x03,
+       0x04, 0x05, 0x06, 0x07};
+#endif
+#define ONEITERBUTTERFLY(i, res1, res2)					\
+    {									\
+      register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
+      register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
+      src1 = vec_ld(stride * i, src);					\
+      src2 = vec_ld((stride * i) + 16, src);				\
+      register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+      dst1 = vec_ld(stride * i, dst);					\
+      dst2 = vec_ld((stride * i) + 16, dst);				\
+      register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+      /* promote the unsigned chars to signed shorts: mergeh yields pixels 0-7, mergel pixels 8-15 */ \
+      register vector signed short srcV asm ("v24") =                   \
+	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
+      register vector signed short dstV asm ("v25") =                   \
+	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+      register vector signed short srcW asm ("v26") =                   \
+	(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
+      register vector signed short dstW asm ("v27") =                   \
+	(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
+      /* subtractions inside the first butterfly (the names v22-v30 are reused by several variables of this macro) */ \
+      register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
+      register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
+      register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
+      register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
+      register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
+      register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
+      register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
+      register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
+      register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
+      register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
+      register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
+      res1 = vec_mladd(but2, vprod3, op3);				\
+      register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
+      res2 = vec_mladd(but2S, vprod3, op3S);				\
+    }
+    ONEITERBUTTERFLY(0, temp0, temp0S);
+    ONEITERBUTTERFLY(1, temp1, temp1S);
+    ONEITERBUTTERFLY(2, temp2, temp2S);
+    ONEITERBUTTERFLY(3, temp3, temp3S);
+    ONEITERBUTTERFLY(4, temp4, temp4S);
+    ONEITERBUTTERFLY(5, temp5, temp5S);
+    ONEITERBUTTERFLY(6, temp6, temp6S);
+    ONEITERBUTTERFLY(7, temp7, temp7S);
+  }
+#undef ONEITERBUTTERFLY
+  {
+    register vector signed int vsum;
+    register vector signed short line0 = vec_add(temp0, temp1);
+    register vector signed short line1 = vec_sub(temp0, temp1);
+    register vector signed short line2 = vec_add(temp2, temp3);
+    register vector signed short line3 = vec_sub(temp2, temp3);
+    register vector signed short line4 = vec_add(temp4, temp5);
+    register vector signed short line5 = vec_sub(temp4, temp5);
+    register vector signed short line6 = vec_add(temp6, temp7);
+    register vector signed short line7 = vec_sub(temp6, temp7);
+    /* pixels 0-7: second butterfly stage across the row vectors */
+    register vector signed short line0B = vec_add(line0, line2);
+    register vector signed short line2B = vec_sub(line0, line2);
+    register vector signed short line1B = vec_add(line1, line3);
+    register vector signed short line3B = vec_sub(line1, line3);
+    register vector signed short line4B = vec_add(line4, line6);
+    register vector signed short line6B = vec_sub(line4, line6);
+    register vector signed short line5B = vec_add(line5, line7);
+    register vector signed short line7B = vec_sub(line5, line7);
+    /* pixels 0-7: third butterfly stage across the row vectors */
+    register vector signed short line0C = vec_add(line0B, line4B);
+    register vector signed short line4C = vec_sub(line0B, line4B);
+    register vector signed short line1C = vec_add(line1B, line5B);
+    register vector signed short line5C = vec_sub(line1B, line5B);
+    register vector signed short line2C = vec_add(line2B, line6B);
+    register vector signed short line6C = vec_sub(line2B, line6B);
+    register vector signed short line3C = vec_add(line3B, line7B);
+    register vector signed short line7C = vec_sub(line3B, line7B);
+    /* pixels 0-7: start the accumulator with the absolute values of this half */
+    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+    vsum = vec_sum4s(vec_abs(line1C), vsum);
+    vsum = vec_sum4s(vec_abs(line2C), vsum);
+    vsum = vec_sum4s(vec_abs(line3C), vsum);
+    vsum = vec_sum4s(vec_abs(line4C), vsum);
+    vsum = vec_sum4s(vec_abs(line5C), vsum);
+    vsum = vec_sum4s(vec_abs(line6C), vsum);
+    vsum = vec_sum4s(vec_abs(line7C), vsum);
+    /* pixels 8-15 (the *S vectors): same three stages, first one here */
+    register vector signed short line0S = vec_add(temp0S, temp1S);
+    register vector signed short line1S = vec_sub(temp0S, temp1S);
+    register vector signed short line2S = vec_add(temp2S, temp3S);
+    register vector signed short line3S = vec_sub(temp2S, temp3S);
+    register vector signed short line4S = vec_add(temp4S, temp5S);
+    register vector signed short line5S = vec_sub(temp4S, temp5S);
+    register vector signed short line6S = vec_add(temp6S, temp7S);
+    register vector signed short line7S = vec_sub(temp6S, temp7S);
+    /* pixels 8-15: second butterfly stage */
+    register vector signed short line0BS = vec_add(line0S, line2S);
+    register vector signed short line2BS = vec_sub(line0S, line2S);
+    register vector signed short line1BS = vec_add(line1S, line3S);
+    register vector signed short line3BS = vec_sub(line1S, line3S);
+    register vector signed short line4BS = vec_add(line4S, line6S);
+    register vector signed short line6BS = vec_sub(line4S, line6S);
+    register vector signed short line5BS = vec_add(line5S, line7S);
+    register vector signed short line7BS = vec_sub(line5S, line7S);
+    /* pixels 8-15: third butterfly stage */
+    register vector signed short line0CS = vec_add(line0BS, line4BS);
+    register vector signed short line4CS = vec_sub(line0BS, line4BS);
+    register vector signed short line1CS = vec_add(line1BS, line5BS);
+    register vector signed short line5CS = vec_sub(line1BS, line5BS);
+    register vector signed short line2CS = vec_add(line2BS, line6BS);
+    register vector signed short line6CS = vec_sub(line2BS, line6BS);
+    register vector signed short line3CS = vec_add(line3BS, line7BS);
+    register vector signed short line7CS = vec_sub(line3BS, line7BS);
+    /* pixels 8-15: add this half's absolute values to the same accumulator */
+    vsum = vec_sum4s(vec_abs(line0CS), vsum);
+    vsum = vec_sum4s(vec_abs(line1CS), vsum);
+    vsum = vec_sum4s(vec_abs(line2CS), vsum);
+    vsum = vec_sum4s(vec_abs(line3CS), vsum);
+    vsum = vec_sum4s(vec_abs(line4CS), vsum);
+    vsum = vec_sum4s(vec_abs(line5CS), vsum);
+    vsum = vec_sum4s(vec_abs(line6CS), vsum);
+    vsum = vec_sum4s(vec_abs(line7CS), vsum);
+    vsum = vec_sums(vsum, (vector signed int)vzero); /* fold the four lanes into element 3 */
+    vsum = vec_splat(vsum, 3); /* copy that total into every lane */
+    vec_ste(vsum, 0, &sum); /* store one element of vsum into sum */
+  }
+  return sum;
+}
+
+/* Hadamard difference of a 16-wide block: the top 16x8 half, plus the lower half when h is 16. */
+int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
+  int total;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
+  total = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+  if (h == 16) {
+    /* the second half starts 8 rows further down in both buffers */
+    total += hadamard8_diff16x8_altivec(s, dst + 8*stride, src + 8*stride, stride, 8);
+  }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
+  return total;
+}
+
 int has_altivec(void)
 {
 #ifdef CONFIG_DARWIN
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
index 93448a1ad..e2729ab22 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2002 Brian Foley
  * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -45,6 +46,8 @@ extern void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l
 extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
 extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
 extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
+extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
+extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
 
 extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder);
 
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
index b8372e51e..b70de7328 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2002 Brian Foley
  * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -59,6 +60,8 @@ static unsigned char* perfname[] = {
   "put_no_rnd_pixels8_xy2_altivec",
   "put_pixels16_xy2_altivec",
   "put_no_rnd_pixels16_xy2_altivec",
+  "hadamard8_diff8x8_altivec",
+  "hadamard8_diff16_altivec",
   "clear_blocks_dcbz32_ppc",
   "clear_blocks_dcbz128_ppc"
 };
@@ -262,7 +265,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
         c->add_bytes= add_bytes_altivec;
 #endif /* 0 */
         c->put_pixels_tab[0][0] = put_pixels16_altivec;
-        /* the tow functions do the same thing, so use the same code */
+        /* the two functions do the same thing, so use the same code */
         c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
         c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
 // next one disabled as it's untested.
@@ -276,6 +279,9 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
         
 	c->gmc1 = gmc1_altivec;
 
+	c->hadamard8_diff[0] = hadamard8_diff16_altivec;
+	c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
+
 #ifdef CONFIG_ENCODERS
 	if (avctx->dct_algo == FF_DCT_AUTO ||
 	    avctx->dct_algo == FF_DCT_ALTIVEC)
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h
index d672edfcb..8b34c6b45 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -50,6 +50,8 @@ enum powerpc_perf_index {
   altivec_put_no_rnd_pixels8_xy2_num,
   altivec_put_pixels16_xy2_num,
   altivec_put_no_rnd_pixels16_xy2_num,
+  altivec_hadamard8_diff8x8_num,
+  altivec_hadamard8_diff16_num,
   powerpc_clear_blocks_dcbz32,
   powerpc_clear_blocks_dcbz128,
   powerpc_perf_total
@@ -63,6 +65,8 @@ enum powerpc_data_index {
 };
 extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
 
+#ifndef POWERPC_MODE_64BITS
+#define POWERP_PMC_DATATYPE unsigned long
 #define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
 #define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
 #if (POWERPC_NUM_PMC_ENABLED > 2)
@@ -79,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
 #define POWERPC_GET_PMC5(a) do {} while (0)
 #define POWERPC_GET_PMC6(a) do {} while (0)
 #endif
-#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index;
+#else /* POWERPC_MODE_64BITS */
+#define POWERP_PMC_DATATYPE unsigned long long /* type of the pmc_start[]/pmc_stop[] samples and of their difference */
+#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a)) /* this branch reads SPRs 771..776; the other branch starts at 937 */
+#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0) /* counter not enabled: expands to a no-op */
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0) /* counter not enabled: expands to a no-op */
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#endif /* POWERPC_MODE_64BITS */
+#define POWERPC_PERF_DECLARE(a, cond) /* declares the locals used by the START/STOP_COUNT macros; a and cond do not appear in the expansion */	\
+  POWERP_PMC_DATATYPE						\
+    pmc_start[POWERPC_NUM_PMC_ENABLED],				\
+    pmc_stop[POWERPC_NUM_PMC_ENABLED],				\
+    pmc_loop_index; /* the ';' is part of the expansion */
 #define POWERPC_PERF_START_COUNT(a, cond) do { \
   POWERPC_GET_PMC6(pmc_start[5]); \
   POWERPC_GET_PMC5(pmc_start[4]); \
@@ -101,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
         pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
         pmc_loop_index++)         \
     {                             \
-      if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
-      {                           \
-        unsigned long diff =      \
+      if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index])  \
+	{							  \
+        POWERP_PMC_DATATYPE diff =				  \
           pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index];   \
         if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
           perfdata[pmc_loop_index][a][powerpc_data_min] = diff;   \
diff --git a/src/libffmpeg/libavcodec/ppc/fdct_altivec.c b/src/libffmpeg/libavcodec/ppc/fdct_altivec.c
new file mode 100644
index 000000000..99df5ced3
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/fdct_altivec.c
@@ -0,0 +1,498 @@
+/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the
+ * AltiVec optimized library for the FFMPEG Multimedia System
+ * Copyright (C) 2003  James Klicman <james@klicman.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+#include "../common.h"
+#include "../dsputil.h"
+#include "dsputil_altivec.h"
+#include "gcc_fixes.h"
+
+
+#define vs16(v) ((vector signed short)(v))
+#define vs32(v) ((vector signed int)(v))
+#define vu8(v)  ((vector unsigned char)(v))
+#define vu16(v) ((vector unsigned short)(v))
+#define vu32(v) ((vector unsigned int)(v))
+
+
+#define C1     0.98078525066375732421875000 /* cos(1*PI/16) */
+#define C2     0.92387950420379638671875000 /* cos(2*PI/16) */
+#define C3     0.83146959543228149414062500 /* cos(3*PI/16) */
+#define C4     0.70710676908493041992187500 /* cos(4*PI/16) */
+#define C5     0.55557024478912353515625000 /* cos(5*PI/16) */
+#define C6     0.38268342614173889160156250 /* cos(6*PI/16) */
+#define C7     0.19509032368659973144531250 /* cos(7*PI/16) */
+#define SQRT_2 1.41421353816986083984375000 /* sqrt(2)      */
+
+
+#define W0 -(2 * C2)
+#define W1 (2 * C6)
+#define W2 (SQRT_2 * C6)
+#define W3 (SQRT_2 * C3)
+#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
+#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
+#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
+#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
+#define W8 (SQRT_2 * ( C7 - C3))
+#define W9 (SQRT_2 * (-C1 - C3))
+#define WA (SQRT_2 * (-C3 - C5))
+#define WB (SQRT_2 * ( C5 - C3))
+
+
+static vector float fdctconsts[3] = { /* the twelve W* factors, four per vector; fdct_altivec loads them into cnsts0..cnsts2 */
+    (vector float)AVV( W0, W1, W2, W3 ), /* cnsts0: splatted by LD_W0..LD_W3 */
+    (vector float)AVV( W4, W5, W6, W7 ), /* cnsts1: splatted by LD_W4..LD_W7 */
+    (vector float)AVV( W8, W9, WA, WB )  /* cnsts2: splatted by LD_W8..LD_WB */
+};
+
+#define LD_W0 vec_splat(cnsts0, 0)
+#define LD_W1 vec_splat(cnsts0, 1)
+#define LD_W2 vec_splat(cnsts0, 2)
+#define LD_W3 vec_splat(cnsts0, 3)
+#define LD_W4 vec_splat(cnsts1, 0)
+#define LD_W5 vec_splat(cnsts1, 1)
+#define LD_W6 vec_splat(cnsts1, 2)
+#define LD_W7 vec_splat(cnsts1, 3)
+#define LD_W8 vec_splat(cnsts2, 0)
+#define LD_W9 vec_splat(cnsts2, 1)
+#define LD_WA vec_splat(cnsts2, 2)
+#define LD_WB vec_splat(cnsts2, 3)
+
+
+#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ /* in-place pass on b0..b7; needs x0..x8, cnst, mzero, cnsts0..cnsts2 in scope; expanded only by the "#else" row path of fdct_altivec */ \
+    x0 = vec_add(b0, b7);               /* x0 = b0 + b7; */         \
+    x7 = vec_sub(b0, b7);               /* x7 = b0 - b7; */         \
+    x1 = vec_add(b1, b6);               /* x1 = b1 + b6; */         \
+    x6 = vec_sub(b1, b6);               /* x6 = b1 - b6; */         \
+    x2 = vec_add(b2, b5);               /* x2 = b2 + b5; */         \
+    x5 = vec_sub(b2, b5);               /* x5 = b2 - b5; */         \
+    x3 = vec_add(b3, b4);               /* x3 = b3 + b4; */         \
+    x4 = vec_sub(b3, b4);               /* x4 = b3 - b4; */         \
+    /* sums x0..x3 -> b0, b4, b2, b6 (b7, b1, b5 are scratch here) */ \
+    b7 = vec_add(x0, x3);               /* b7 = x0 + x3; */         \
+    b1 = vec_add(x1, x2);               /* b1 = x1 + x2; */         \
+    b0 = vec_add(b7, b1);               /* b0 = b7 + b1; */         \
+    b4 = vec_sub(b7, b1);               /* b4 = b7 - b1; */         \
+                                                                    \
+    b2 = vec_sub(x0, x3);               /* b2 = x0 - x3; */         \
+    b6 = vec_sub(x1, x2);               /* b6 = x1 - x2; */         \
+    b5 = vec_add(b6, b2);               /* b5 = b6 + b2; */         \
+    cnst = LD_W2;                                                   \
+    b5 = vec_madd(cnst, b5, mzero);     /* b5 = b5 * W2; */         \
+    cnst = LD_W1;                                                   \
+    b2 = vec_madd(cnst, b2, b5);        /* b2 = b5 + b2 * W1; */    \
+    cnst = LD_W0;                                                   \
+    b6 = vec_madd(cnst, b6, b5);        /* b6 = b5 + b6 * W0; */    \
+    /* differences x4..x7 -> b7, b5, b3, b1 (x0..x3, x8 reused as scratch) */ \
+    x0 = vec_add(x4, x7);               /* x0 = x4 + x7; */         \
+    x1 = vec_add(x5, x6);               /* x1 = x5 + x6; */         \
+    x2 = vec_add(x4, x6);               /* x2 = x4 + x6; */         \
+    x3 = vec_add(x5, x7);               /* x3 = x5 + x7; */         \
+    x8 = vec_add(x2, x3);               /* x8 = x2 + x3; */         \
+    cnst = LD_W3;                                                   \
+    x8 = vec_madd(cnst, x8, mzero);     /* x8 = x8 * W3; */         \
+                                                                    \
+    cnst = LD_W8;                                                   \
+    x0 = vec_madd(cnst, x0, mzero);     /* x0 *= W8; */             \
+    cnst = LD_W9;                                                   \
+    x1 = vec_madd(cnst, x1, mzero);     /* x1 *= W9; */             \
+    cnst = LD_WA;                                                   \
+    x2 = vec_madd(cnst, x2, x8);        /* x2 = x2 * WA + x8; */    \
+    cnst = LD_WB;                                                   \
+    x3 = vec_madd(cnst, x3, x8);        /* x3 = x3 * WB + x8; */    \
+                                                                    \
+    cnst = LD_W4;                                                   \
+    b7 = vec_madd(cnst, x4, x0);        /* b7 = x4 * W4 + x0; */    \
+    cnst = LD_W5;                                                   \
+    b5 = vec_madd(cnst, x5, x1);        /* b5 = x5 * W5 + x1; */    \
+    cnst = LD_W6;                                                   \
+    b3 = vec_madd(cnst, x6, x1);        /* b3 = x6 * W6 + x1; */    \
+    cnst = LD_W7;                                                   \
+    b1 = vec_madd(cnst, x7, x0);        /* b1 = x7 * W7 + x0; */    \
+                                                                    \
+    b7 = vec_add(b7, x2);               /* b7 = b7 + x2; */         \
+    b5 = vec_add(b5, x3);               /* b5 = b5 + x3; */         \
+    b3 = vec_add(b3, x2);               /* b3 = b3 + x2; */         \
+    b1 = vec_add(b1, x3);               /* b1 = b1 + x3; */         \
+    /* }}} */
+
+#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ /* same statements as FDCTROW; fdct_altivec expands it twice, once for the b?0 set and once for the b?1 set */ \
+    x0 = vec_add(b0, b7);               /* x0 = b0 + b7; */         \
+    x7 = vec_sub(b0, b7);               /* x7 = b0 - b7; */         \
+    x1 = vec_add(b1, b6);               /* x1 = b1 + b6; */         \
+    x6 = vec_sub(b1, b6);               /* x6 = b1 - b6; */         \
+    x2 = vec_add(b2, b5);               /* x2 = b2 + b5; */         \
+    x5 = vec_sub(b2, b5);               /* x5 = b2 - b5; */         \
+    x3 = vec_add(b3, b4);               /* x3 = b3 + b4; */         \
+    x4 = vec_sub(b3, b4);               /* x4 = b3 - b4; */         \
+    /* sums x0..x3 -> b0, b4, b2, b6 (b7, b1, b5 are scratch here) */ \
+    b7 = vec_add(x0, x3);               /* b7 = x0 + x3; */         \
+    b1 = vec_add(x1, x2);               /* b1 = x1 + x2; */         \
+    b0 = vec_add(b7, b1);               /* b0 = b7 + b1; */         \
+    b4 = vec_sub(b7, b1);               /* b4 = b7 - b1; */         \
+                                                                    \
+    b2 = vec_sub(x0, x3);               /* b2 = x0 - x3; */         \
+    b6 = vec_sub(x1, x2);               /* b6 = x1 - x2; */         \
+    b5 = vec_add(b6, b2);               /* b5 = b6 + b2; */         \
+    cnst = LD_W2;                                                   \
+    b5 = vec_madd(cnst, b5, mzero);     /* b5 = b5 * W2; */         \
+    cnst = LD_W1;                                                   \
+    b2 = vec_madd(cnst, b2, b5);        /* b2 = b5 + b2 * W1; */    \
+    cnst = LD_W0;                                                   \
+    b6 = vec_madd(cnst, b6, b5);        /* b6 = b5 + b6 * W0; */    \
+    /* differences x4..x7 -> b7, b5, b3, b1 (x0..x3, x8 reused as scratch) */ \
+    x0 = vec_add(x4, x7);               /* x0 = x4 + x7; */         \
+    x1 = vec_add(x5, x6);               /* x1 = x5 + x6; */         \
+    x2 = vec_add(x4, x6);               /* x2 = x4 + x6; */         \
+    x3 = vec_add(x5, x7);               /* x3 = x5 + x7; */         \
+    x8 = vec_add(x2, x3);               /* x8 = x2 + x3; */         \
+    cnst = LD_W3;                                                   \
+    x8 = vec_madd(cnst, x8, mzero);     /* x8 = x8 * W3; */         \
+                                                                    \
+    cnst = LD_W8;                                                   \
+    x0 = vec_madd(cnst, x0, mzero);     /* x0 *= W8; */             \
+    cnst = LD_W9;                                                   \
+    x1 = vec_madd(cnst, x1, mzero);     /* x1 *= W9; */             \
+    cnst = LD_WA;                                                   \
+    x2 = vec_madd(cnst, x2, x8);        /* x2 = x2 * WA + x8; */    \
+    cnst = LD_WB;                                                   \
+    x3 = vec_madd(cnst, x3, x8);        /* x3 = x3 * WB + x8; */    \
+                                                                    \
+    cnst = LD_W4;                                                   \
+    b7 = vec_madd(cnst, x4, x0);        /* b7 = x4 * W4 + x0; */    \
+    cnst = LD_W5;                                                   \
+    b5 = vec_madd(cnst, x5, x1);        /* b5 = x5 * W5 + x1; */    \
+    cnst = LD_W6;                                                   \
+    b3 = vec_madd(cnst, x6, x1);        /* b3 = x6 * W6 + x1; */    \
+    cnst = LD_W7;                                                   \
+    b1 = vec_madd(cnst, x7, x0);        /* b1 = x7 * W7 + x0; */    \
+                                                                    \
+    b7 = vec_add(b7, x2);               /* b7 += x2; */             \
+    b5 = vec_add(b5, x3);               /* b5 += x3; */             \
+    b3 = vec_add(b3, x2);               /* b3 += x2; */             \
+    b1 = vec_add(b1, x3);               /* b1 += x3; */             \
+    /* }}} */
+
+
+
+/* two dimensional forward discrete cosine transform of one 8x8 int16_t block, computed in place */
+
+void fdct_altivec(int16_t *block)
+{
+POWERPC_PERF_DECLARE(altivec_fdct, 1);
+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
+POWERPC_PERF_START_COUNT(altivec_fdct, 1);
+    void ff_jpeg_fdct_islow(int16_t *block);
+    ff_jpeg_fdct_islow(block);
+POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+    vector signed short *bp;
+    vector float *cp;
+    vector float b00, b10, b20, b30, b40, b50, b60, b70;
+    vector float b01, b11, b21, b31, b41, b51, b61, b71;
+    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
+    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+    POWERPC_PERF_START_COUNT(altivec_fdct, 1);
+
+
+    /* setup constants {{{ */
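+    /* mzero = -0.0: all-ones words shifted left by 31 (vec_sl uses only the low 5 bits of the count) keep just the sign bit */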
+    vu32(mzero) = vec_splat_u32(-1);
+    vu32(mzero) = vec_sl(vu32(mzero), vu32(mzero));
+    cp = fdctconsts;
+    cnsts0 = vec_ld(0, cp); cp++;
+    cnsts1 = vec_ld(0, cp); cp++;
+    cnsts2 = vec_ld(0, cp);
+    /* }}} */
+
+
+    /* 8x8 matrix transpose (vector short[8]) {{{ */
+#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))
+
+    bp = (vector signed short*)block;
+    vs16(b00) = vec_ld(0,    bp);
+    vs16(b40) = vec_ld(16*4, bp);
+    vs16(b01) = MERGE_S16(h, b00, b40);
+    vs16(b11) = MERGE_S16(l, b00, b40);
+    bp++;
+    vs16(b10) = vec_ld(0,    bp);
+    vs16(b50) = vec_ld(16*4, bp);
+    vs16(b21) = MERGE_S16(h, b10, b50);
+    vs16(b31) = MERGE_S16(l, b10, b50);
+    bp++;
+    vs16(b20) = vec_ld(0,    bp);
+    vs16(b60) = vec_ld(16*4, bp);
+    vs16(b41) = MERGE_S16(h, b20, b60);
+    vs16(b51) = MERGE_S16(l, b20, b60);
+    bp++;
+    vs16(b30) = vec_ld(0,    bp);
+    vs16(b70) = vec_ld(16*4, bp);
+    vs16(b61) = MERGE_S16(h, b30, b70);
+    vs16(b71) = MERGE_S16(l, b30, b70);
+
+    vs16(x0) = MERGE_S16(h, b01, b41);
+    vs16(x1) = MERGE_S16(l, b01, b41);
+    vs16(x2) = MERGE_S16(h, b11, b51);
+    vs16(x3) = MERGE_S16(l, b11, b51);
+    vs16(x4) = MERGE_S16(h, b21, b61);
+    vs16(x5) = MERGE_S16(l, b21, b61);
+    vs16(x6) = MERGE_S16(h, b31, b71);
+    vs16(x7) = MERGE_S16(l, b31, b71);
+
+    vs16(b00) = MERGE_S16(h, x0, x4);
+    vs16(b10) = MERGE_S16(l, x0, x4);
+    vs16(b20) = MERGE_S16(h, x1, x5);
+    vs16(b30) = MERGE_S16(l, x1, x5);
+    vs16(b40) = MERGE_S16(h, x2, x6);
+    vs16(b50) = MERGE_S16(l, x2, x6);
+    vs16(b60) = MERGE_S16(h, x3, x7);
+    vs16(b70) = MERGE_S16(l, x3, x7);
+
+#undef MERGE_S16
+    /* }}} */
+
+
+/* Some of the initial calculations can be done as vector short before
+ * conversion to vector float.  The "#if 1" branch below takes advantage
+ * of this; the "#else" branch converts everything first and uses FDCTROW.
+ */
+#if 1
+    /* fdct rows {{{ */
+    vs16(x0) = vec_add(vs16(b00), vs16(b70));
+    vs16(x7) = vec_sub(vs16(b00), vs16(b70));
+    vs16(x1) = vec_add(vs16(b10), vs16(b60));
+    vs16(x6) = vec_sub(vs16(b10), vs16(b60));
+    vs16(x2) = vec_add(vs16(b20), vs16(b50));
+    vs16(x5) = vec_sub(vs16(b20), vs16(b50));
+    vs16(x3) = vec_add(vs16(b30), vs16(b40));
+    vs16(x4) = vec_sub(vs16(b30), vs16(b40));
+
+    vs16(b70) = vec_add(vs16(x0), vs16(x3));
+    vs16(b10) = vec_add(vs16(x1), vs16(x2));
+
+    vs16(b00) = vec_add(vs16(b70), vs16(b10));
+    vs16(b40) = vec_sub(vs16(b70), vs16(b10));
+
+#define CTF0(n) \
+    vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
+    vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
+    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
+    b##n##0 = vec_ctf(vs32(b##n##0), 0);
+
+    CTF0(0);
+    CTF0(4);
+
+    vs16(b20) = vec_sub(vs16(x0), vs16(x3));
+    vs16(b60) = vec_sub(vs16(x1), vs16(x2));
+
+    CTF0(2);
+    CTF0(6);
+
+#undef CTF0
+
+    x0 = vec_add(b60, b20);
+    x1 = vec_add(b61, b21);
+
+    cnst = LD_W2;
+    x0 = vec_madd(cnst, x0, mzero);
+    x1 = vec_madd(cnst, x1, mzero);
+    cnst = LD_W1;
+    b20 = vec_madd(cnst, b20, x0);
+    b21 = vec_madd(cnst, b21, x1);
+    cnst = LD_W0;
+    b60 = vec_madd(cnst, b60, x0);
+    b61 = vec_madd(cnst, b61, x1);
+
+#define CTFX(x,b) \
+    vs32(b##0) = vec_unpackh(vs16(x)); \
+    vs32(b##1) = vec_unpackl(vs16(x)); \
+    b##0 = vec_ctf(vs32(b##0), 0); \
+    b##1 = vec_ctf(vs32(b##1), 0); \
+
+    CTFX(x4, b7);
+    CTFX(x5, b5);
+    CTFX(x6, b3);
+    CTFX(x7, b1);
+
+#undef CTFX
+
+
+    x0 = vec_add(b70, b10);
+    x1 = vec_add(b50, b30);
+    x2 = vec_add(b70, b30);
+    x3 = vec_add(b50, b10);
+    x8 = vec_add(x2, x3);
+    cnst = LD_W3;
+    x8 = vec_madd(cnst, x8, mzero);
+
+    cnst = LD_W8;
+    x0 = vec_madd(cnst, x0, mzero);
+    cnst = LD_W9;
+    x1 = vec_madd(cnst, x1, mzero);
+    cnst = LD_WA;
+    x2 = vec_madd(cnst, x2, x8);
+    cnst = LD_WB;
+    x3 = vec_madd(cnst, x3, x8);
+
+    cnst = LD_W4;
+    b70 = vec_madd(cnst, b70, x0);
+    cnst = LD_W5;
+    b50 = vec_madd(cnst, b50, x1);
+    cnst = LD_W6;
+    b30 = vec_madd(cnst, b30, x1);
+    cnst = LD_W7;
+    b10 = vec_madd(cnst, b10, x0);
+
+    b70 = vec_add(b70, x2);
+    b50 = vec_add(b50, x3);
+    b30 = vec_add(b30, x2);
+    b10 = vec_add(b10, x3);
+
+
+    x0 = vec_add(b71, b11);
+    x1 = vec_add(b51, b31);
+    x2 = vec_add(b71, b31);
+    x3 = vec_add(b51, b11);
+    x8 = vec_add(x2, x3);
+    cnst = LD_W3;
+    x8 = vec_madd(cnst, x8, mzero);
+
+    cnst = LD_W8;
+    x0 = vec_madd(cnst, x0, mzero);
+    cnst = LD_W9;
+    x1 = vec_madd(cnst, x1, mzero);
+    cnst = LD_WA;
+    x2 = vec_madd(cnst, x2, x8);
+    cnst = LD_WB;
+    x3 = vec_madd(cnst, x3, x8);
+
+    cnst = LD_W4;
+    b71 = vec_madd(cnst, b71, x0);
+    cnst = LD_W5;
+    b51 = vec_madd(cnst, b51, x1);
+    cnst = LD_W6;
+    b31 = vec_madd(cnst, b31, x1);
+    cnst = LD_W7;
+    b11 = vec_madd(cnst, b11, x0);
+
+    b71 = vec_add(b71, x2);
+    b51 = vec_add(b51, x3);
+    b31 = vec_add(b31, x2);
+    b11 = vec_add(b11, x3);
+    /* }}} */
+#else
+    /* convert to float {{{ */
+#define CTF(n) \
+    vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
+    vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
+    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
+    b##n##0 = vec_ctf(vs32(b##n##0), 0); \
+
+    CTF(0);
+    CTF(1);
+    CTF(2);
+    CTF(3);
+    CTF(4);
+    CTF(5);
+    CTF(6);
+    CTF(7);
+
+#undef CTF
+    /* }}} */
+
+    FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70);
+    FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71);
+#endif
+
+
+    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
+    x0 = vec_mergel(b00, b20);
+    x1 = vec_mergeh(b00, b20);
+    x2 = vec_mergel(b10, b30);
+    x3 = vec_mergeh(b10, b30);
+
+    b00 = vec_mergeh(x1, x3);
+    b10 = vec_mergel(x1, x3);
+    b20 = vec_mergeh(x0, x2);
+    b30 = vec_mergel(x0, x2);
+
+    x4 = vec_mergel(b41, b61);
+    x5 = vec_mergeh(b41, b61);
+    x6 = vec_mergel(b51, b71);
+    x7 = vec_mergeh(b51, b71);
+
+    b41 = vec_mergeh(x5, x7);
+    b51 = vec_mergel(x5, x7);
+    b61 = vec_mergeh(x4, x6);
+    b71 = vec_mergel(x4, x6);
+
+    x0 = vec_mergel(b01, b21);
+    x1 = vec_mergeh(b01, b21);
+    x2 = vec_mergel(b11, b31);
+    x3 = vec_mergeh(b11, b31);
+
+    x4 = vec_mergel(b40, b60);
+    x5 = vec_mergeh(b40, b60);
+    x6 = vec_mergel(b50, b70);
+    x7 = vec_mergeh(b50, b70);
+
+    b40 = vec_mergeh(x1, x3);
+    b50 = vec_mergel(x1, x3);
+    b60 = vec_mergeh(x0, x2);
+    b70 = vec_mergel(x0, x2);
+
+    b01 = vec_mergeh(x5, x7);
+    b11 = vec_mergel(x5, x7);
+    b21 = vec_mergeh(x4, x6);
+    b31 = vec_mergel(x4, x6);
+    /* }}} */
+
+
+    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
+    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);
+
+
+    /* round, convert back to short {{{ */
+#define CTS(n) \
+    b##n##0 = vec_round(b##n##0); \
+    b##n##1 = vec_round(b##n##1); \
+    vs32(b##n##0) = vec_cts(b##n##0, 0); \
+    vs32(b##n##1) = vec_cts(b##n##1, 0); \
+    vs16(b##n##0) = vec_pack(vs32(b##n##0), vs32(b##n##1)); \
+    vec_st(vs16(b##n##0), 0, bp);
+
+    bp = (vector signed short*)block;
+    CTS(0); bp++;
+    CTS(1); bp++;
+    CTS(2); bp++;
+    CTS(3); bp++;
+    CTS(4); bp++;
+    CTS(5); bp++;
+    CTS(6); bp++;
+    CTS(7);
+
+#undef CTS
+    /* }}} */
+
+POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+}
+
+/* vim:set foldmethod=marker foldlevel=0: */
diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c
index 51b387792..91e744af9 100644
--- a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c
@@ -1,6 +1,9 @@
 /*
  * Copyright (c) 2002 Dieter Shirley
  *
+ * dct_unquantize_h263_altivec:
+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
+ *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
index ce4bf8a47..c8269eb9a 100644
--- a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
@@ -1,84 +1,86 @@
-/*
- * Copyright (c) 2002 Dieter Shirley
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
- 
-#include "../dsputil.h"
-#include "../mpegvideo.h"
-#include <time.h>
-
-#ifdef HAVE_ALTIVEC
-#include "dsputil_altivec.h"
-#endif
-
-extern int dct_quantize_altivec(MpegEncContext *s,  
-        DCTELEM *block, int n,
-        int qscale, int *overflow);
+/*
+ * Copyright (c) 2002 Dieter Shirley
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+ 
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+#include <time.h>
+
+#ifdef HAVE_ALTIVEC
+#include "dsputil_altivec.h"
+#endif
+
+extern int dct_quantize_altivec(MpegEncContext *s,  
+        DCTELEM *block, int n,
+        int qscale, int *overflow);
 extern void dct_unquantize_h263_altivec(MpegEncContext *s,
                                         DCTELEM *block, int n, int qscale);
-
-extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
-
-
-void MPV_common_init_ppc(MpegEncContext *s)
-{
+
+extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+
+
+void MPV_common_init_ppc(MpegEncContext *s)
+{
 #ifdef HAVE_ALTIVEC
-    if (has_altivec())
-    {
-        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
-                (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
-        {
-            s->dsp.idct_put = idct_put_altivec;
-            s->dsp.idct_add = idct_add_altivec;
+    if (has_altivec())
+    {
+        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
+                (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
+        {
+            s->dsp.idct_put = idct_put_altivec;
+            s->dsp.idct_add = idct_add_altivec;
 #ifndef ALTIVEC_USE_REFERENCE_C_CODE
-            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
             s->dsp.idct_permutation_type = FF_NO_IDCT_PERM;
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
-        }
-
-        // Test to make sure that the dct required alignments are met.
-        if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
-                (((long)(s->q_inter_matrix) & 0x0f) != 0))
-        {
-            av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
-                    "to use Altivec DCT. Reverting to non-altivec version.\n");
-            return;
-        }
-
-        if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
-        {
-            av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
-                    "to use Altivec DCT. Reverting to non-altivec version.\n");
-            return;
-        }
-
-
-        if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
-                (s->avctx->dct_algo == FF_DCT_ALTIVEC))
-        {
-            s->dct_quantize = dct_quantize_altivec;
+        }
+
+        // Test to make sure that the dct required alignments are met.
+        if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
+                (((long)(s->q_inter_matrix) & 0x0f) != 0))
+        {
+            av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
+                    "to use Altivec DCT. Reverting to non-altivec version.\n");
+            return;
+        }
+
+        if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
+        {
+            av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
+                    "to use Altivec DCT. Reverting to non-altivec version.\n");
+            return;
+        }
+
+
+        if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
+                (s->avctx->dct_algo == FF_DCT_ALTIVEC))
+        {
+#if 0 /* seems to cause trouble under some circumstances */
+            s->dct_quantize = dct_quantize_altivec;
+#endif
             s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
             s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
-        }
-    } else
-#endif
-    {
-        /* Non-AltiVec PPC optimisations here */
-    }
-}
-
+        }
+    } else
+#endif
+    {
+        /* Non-AltiVec PPC optimisations here */
+    }
+}
+
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 11c9734fc..b67ec3974 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -388,6 +388,10 @@ static int rv20_decode_picture_header(MpegEncContext *s)
 //            return -1;
         }
         seq= get_bits(&s->gb, 15);
+        if (s->avctx->sub_id == 0x20201002 && get_bits(&s->gb, 1)){
+            av_log(s->avctx, AV_LOG_ERROR, "unknown bit4 set\n");
+//            return -1;
+        }
         mb_pos= get_bits(&s->gb, av_log2(s->mb_num-1)+1);
         s->mb_x= mb_pos % s->mb_width;
         s->mb_y= mb_pos / s->mb_width;
@@ -395,7 +399,7 @@ static int rv20_decode_picture_header(MpegEncContext *s)
         seq= get_bits(&s->gb, 8)*128;
         mb_pos= ff_h263_decode_mba(s);
     }
-//printf("%d\n", seq);
+//av_log(s->avctx, AV_LOG_DEBUG, "%d\n", seq);
     seq |= s->time &~0x7FFF;
     if(seq - s->time >  0x4000) seq -= 0x8000;
     if(seq - s->time < -0x4000) seq += 0x8000;
@@ -414,7 +418,10 @@ static int rv20_decode_picture_header(MpegEncContext *s)
         }
     }
 //    printf("%d %d %d %d %d\n", seq, (int)s->time, (int)s->last_non_b_time, s->pp_time, s->pb_time);
-
+/*for(i=0; i<32; i++){
+    av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb));
+}
+av_log(s->avctx, AV_LOG_DEBUG, "\n");*/
     s->no_rounding= get_bits1(&s->gb);
     
     s->f_code = 1;
@@ -441,6 +448,8 @@ static int rv10_decode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     static int done=0;
 
+    MPV_decode_defaults(s);
+    
     s->avctx= avctx;
     s->out_format = FMT_H263;
     s->codec_id= avctx->codec_id;
@@ -476,6 +485,7 @@ static int rv10_decode_init(AVCodecContext *avctx)
         s->low_delay=1;
         break;
     case 0x20200002:
+    case 0x20201002:
     case 0x30202002:
     case 0x30203002:
         s->low_delay=0;
@@ -490,8 +500,6 @@ static int rv10_decode_init(AVCodecContext *avctx)
 
     h263_decode_init_vlc(s);
 
-    s->progressive_sequence=1;
-
     /* init rv vlc */
     if (!done) {
         init_vlc(&rv_dc_lum, DC_VLC_BITS, 256, 
@@ -556,10 +564,6 @@ static int rv10_decode_packet(AVCodecContext *avctx,
             return -1;
     }
 
-    if(s->pict_type == B_TYPE){ //FIXME remove after cleaning mottion_val indexing
-        memset(s->current_picture.motion_val[0], 0, sizeof(int16_t)*2*(s->mb_width*2+2)*(s->mb_height*2+2));
-    }
-
 #ifdef DEBUG
     printf("qscale=%d\n", s->qscale);
 #endif
@@ -592,9 +596,9 @@ static int rv10_decode_packet(AVCodecContext *avctx,
     s->block_wrap[0]=
     s->block_wrap[1]=
     s->block_wrap[2]=
-    s->block_wrap[3]= s->mb_width*2 + 2;
+    s->block_wrap[3]= s->b8_stride;
     s->block_wrap[4]=
-    s->block_wrap[5]= s->mb_width + 2;
+    s->block_wrap[5]= s->mb_stride;
     ff_init_block_index(s);
     /* decode each macroblock */
 
@@ -669,10 +673,6 @@ static int rv10_decode_frame(AVCodecContext *avctx,
             return -1;
     }
     
-    if(s->pict_type == B_TYPE){ //FIXME remove after cleaning mottion_val indexing
-        memset(s->current_picture.motion_val[0], 0, sizeof(int16_t)*2*(s->mb_width*2+2)*(s->mb_height*2+2));
-    }
-
     if(s->mb_y>=s->mb_height){
         MPV_frame_end(s);
     
diff --git a/src/libffmpeg/libavcodec/smc.c b/src/libffmpeg/libavcodec/smc.c
index 87db50005..e937b03c8 100644
--- a/src/libffmpeg/libavcodec/smc.c
+++ b/src/libffmpeg/libavcodec/smc.c
@@ -36,9 +36,6 @@
 #include "avcodec.h"
 #include "dsputil.h"
 
-#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log
-#define fprintf(...) {} 
-
 #define CPAIR 2
 #define CQUAD 4
 #define COCTET 8
@@ -75,7 +72,7 @@ typedef struct SmcContext {
     total_blocks--; \
     if (total_blocks < 0) \
     { \
-        printf("warning: block counter just went negative (this should not happen)\n"); \
+        av_log(s->avctx, AV_LOG_INFO, "warning: block counter just went negative (this should not happen)\n"); \
         return; \
     } \
 }
@@ -124,7 +121,7 @@ static void smc_decode_stream(SmcContext *s)
     chunk_size = BE_32(&s->buf[stream_ptr]) & 0x00FFFFFF;
     stream_ptr += 4;
     if (chunk_size != s->size)
-        printf("warning: MOV chunk size != encoded chunk size (%d != %d); using MOV chunk size\n",
+        av_log(s->avctx, AV_LOG_INFO, "warning: MOV chunk size != encoded chunk size (%d != %d); using MOV chunk size\n",
             chunk_size, s->size);
 
     chunk_size = s->size;
@@ -135,13 +132,13 @@ static void smc_decode_stream(SmcContext *s)
         /* sanity checks */
         /* make sure stream ptr hasn't gone out of bounds */
         if (stream_ptr > chunk_size) {
-            printf("SMC decoder just went out of bounds (stream ptr = %d, chunk size = %d)\n",
+            av_log(s->avctx, AV_LOG_INFO, "SMC decoder just went out of bounds (stream ptr = %d, chunk size = %d)\n",
                 stream_ptr, chunk_size);
             return;
         }
         /* make sure the row pointer hasn't gone wild */
         if (row_ptr >= image_size) {
-            printf("SMC decoder just went out of bounds (row ptr = %d, height = %d)\n",
+            av_log(s->avctx, AV_LOG_INFO, "SMC decoder just went out of bounds (row ptr = %d, height = %d)\n",
                 row_ptr, image_size);
             return;
         }
@@ -164,7 +161,7 @@ static void smc_decode_stream(SmcContext *s)
 
             /* sanity check */
             if ((row_ptr == 0) && (pixel_ptr == 0)) {
-                printf("encountered repeat block opcode (%02X) but no blocks rendered yet\n",
+                av_log(s->avctx, AV_LOG_INFO, "encountered repeat block opcode (%02X) but no blocks rendered yet\n",
                     opcode & 0xF0);
                 break;
             }
@@ -198,7 +195,7 @@ static void smc_decode_stream(SmcContext *s)
 
             /* sanity check */
             if ((row_ptr == 0) && (pixel_ptr < 2 * 4)) {
-                printf("encountered repeat block opcode (%02X) but not enough blocks rendered yet\n",
+        	av_log(s->avctx, AV_LOG_INFO, "encountered repeat block opcode (%02X) but not enough blocks rendered yet\n",
                     opcode & 0xF0);
                 break;
             }
@@ -425,7 +422,7 @@ static void smc_decode_stream(SmcContext *s)
             break;
 
         case 0xF0:
-            printf("0xF0 opcode seen in SMC chunk (xine developers would like to know)\n");
+            av_log(s->avctx, AV_LOG_INFO, "0xF0 opcode seen in SMC chunk (contact the developers)\n");
             break;
         }
     }
@@ -462,7 +459,7 @@ static int smc_decode_frame(AVCodecContext *avctx,
     s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE | 
                             FF_BUFFER_HINTS_REUSABLE | FF_BUFFER_HINTS_READABLE;
     if (avctx->reget_buffer(avctx, &s->frame)) {
-        printf ("reget_buffer() failed\n");
+        av_log(s->avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
         return -1;
     }
 
diff --git a/src/libffmpeg/libavcodec/sparc/Makefile.am b/src/libffmpeg/libavcodec/sparc/Makefile.am
new file mode 100644
index 000000000..cdf16e3ad
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/Makefile.am
@@ -0,0 +1,15 @@
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS = $(LIBFFMPEG_CFLAGS)
+ASFLAGS =
+
+noinst_LTLIBRARIES = libavcodec_sparc.la
+
+libavcodec_sparc_src = dsputil_vis.c
+libavcodec_sparc_dummy = libavcodec_sparc_dummy.c
+
+EXTRA_DIST = $(libavcodec_sparc_src) $(libavcodec_sparc_dummy) vis.h
+
+sparc_modules = $(libavcodec_sparc_src)
+
+libavcodec_sparc_la_SOURCES = $(sparc_modules) $(libavcodec_sparc_dummy)
diff --git a/src/libffmpeg/libavcodec/sparc/dsputil_vis.c b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
new file mode 100644
index 000000000..434cf74ac
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
@@ -0,0 +1,4107 @@
+/*
+ * dsputil_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of ffmpeg, a free MPEG-4 video stream decoder.
+ * See http://ffmpeg.sourceforge.net/ for updates.
+ *
+ * ffmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * ffmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the Lesser GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
+   The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison.
+ */
+
+#include "config.h"
+
+#ifdef ARCH_SPARC
+
+#include <inttypes.h>
+#include <signal.h>
+#include <setjmp.h>
+
+#include "../dsputil.h"
+
+#include "vis.h"
+
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16.  So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ *	fxor		f0, f2, f10
+ *	fand		f10, f4, f10
+ *	fmul8x16	f8, f10, f10
+ *	fand		f10, f6, f10
+ *	for		f0, f2, f12
+ *	fpsub16		f12, f10, f10
+ */
+
+#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))	/* GCC-style alignment attribute */
+
+#define DUP4(x) {x, x, x, x}	/* initializer repeating x four times */
+#define DUP8(x) {x, x, x, x, x, x, x, x}	/* initializer repeating x eight times */
+static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);	/* each table is 8 bytes, 8-byte aligned, read whole via vis_ld64 */
+static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
+static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
+static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
+static const uint8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);	/* byte masks are unsigned: 0xfe and 128 do not fit in int8_t */
+static const uint8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
+static const uint8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
+static const int16_t constants256_512[] ATTR_ALIGN(8) =	/* alternating pair: 256 in even slots, 512 in odd slots */
+	{256, 512, 256, 512};
+static const int16_t constants256_1024[] ATTR_ALIGN(8) =	/* alternating pair: 256 in even slots, 1024 in odd slots */
+	{256, 1024, 256, 1024};
+
+#define REF_0		0	/* operand numbers handed to the vis_* macros; presumably VIS FP register indices - confirm in vis.h */
+#define REF_0_1		1	/* the *_1 names are the odd number following each even one */
+#define REF_2		2
+#define REF_2_1		3
+#define REF_4		4
+#define REF_4_1		5
+#define REF_6		6
+#define REF_6_1		7
+#define REF_S0		8
+#define REF_S0_1	9
+#define REF_S2		10
+#define REF_S2_1	11
+#define REF_S4		12
+#define REF_S4_1	13
+#define REF_S6		14
+#define REF_S6_1	15
+#define DST_0		16
+#define DST_1		17
+#define DST_2		18
+#define DST_3		19
+#define CONST_1		20	/* CONST_1/2/3/6 and MASK_fe all map to 20: a routine loads whichever one it needs */
+#define CONST_2		20
+#define CONST_3		20
+#define CONST_6		20
+#define MASK_fe		20
+#define CONST_128	22	/* CONST_128/256/512/1024 all map to 22 */
+#define CONST_256	22
+#define CONST_512	22
+#define CONST_1024	22
+#define TMP0		24
+#define TMP1		25
+#define TMP2		26
+#define TMP3		27
+#define TMP4		28
+#define TMP5		29
+#define ZERO		30	/* ZERO and MASK_7f both map to 30, so a routine can use only one of them */
+#define MASK_7f		30
+
+#define TMP6		32	/* from 32 upwards only even numbers are defined */
+#define TMP8		34
+#define TMP10		36
+#define TMP12		38
+#define TMP14		40
+#define TMP16		42
+#define TMP18		44
+#define TMP20		46
+#define TMP22		48
+#define TMP24		50
+#define TMP26		52
+#define TMP28		54
+#define TMP30		56
+#define TMP32		58
+
+static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* Copy a 16-byte-wide block from _ref to dest, one row per pass, for 'height' rows. */
+	uint8_t *ref = (uint8_t *) _ref;
+
+	ref = vis_alignaddr(ref);	/* loads below go through the returned pointer; presumably 8-byte aligned - confirm in vis.h */
+	do {	/* 5 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+
+		vis_ld64_2(ref, 16, TMP4);	/* three 8-byte loads feed the two faligndata results */
+		ref += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+
+		vis_faligndata(TMP2, TMP4, REF_2);
+		vis_st64_2(REF_2, dest, 8);
+		dest += stride;
+	} while (--height);	/* do/while: height must be >= 1 on entry */
+}
+
+static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/* Copy an 8-byte-wide block from _ref to dest, one row per pass, for 'height' rows. */
+	uint8_t *ref = (uint8_t *) _ref;
+
+	ref = vis_alignaddr(ref);	/* loads below go through the returned pointer; presumably 8-byte aligned - confirm in vis.h */
+	do {	/* 4 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);	/* two 8-byte loads feed the single faligndata result */
+		ref += stride;
+
+		/* stall */
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+		dest += stride;
+	} while (--height);	/* do/while: height must be >= 1 on entry */
+}
+
+
+static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* Average a 16-byte-wide ref block into dest with the (x|y)-((x^y)>>1) sequence described at the top of the file. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;	/* used as the offset of the second 8 bytes of the next dest row */
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(dest[8], DST_2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_ld64(constants128[0], CONST_128);
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* two rows per pass, last two rows after the loop: height must be even and >= 4 */
+
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP6);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_ld64_2(ref, 16, TMP4);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_xor(DST_2, REF_2, TMP8);
+
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_or(DST_0, REF_0, TMP10);
+		vis_ld64_2(dest, stride, DST_0);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+
+		vis_or(DST_2, REF_2, TMP12);
+		vis_ld64_2(dest, stride_8, DST_2);
+
+		vis_ld64(ref[0], TMP14);
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+
+		dest += stride;
+		vis_ld64_2(ref, 8, TMP16);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 16, TMP18);
+		vis_faligndata(TMP2, TMP4, REF_2);
+		ref += stride;
+
+		vis_xor(DST_0, REF_0, TMP20);
+
+		vis_and(TMP20, MASK_fe, TMP20);
+
+		vis_xor(DST_2, REF_2, TMP22);
+		vis_mul8x16(CONST_128, TMP20, TMP20);
+
+		vis_and(TMP22, MASK_fe, TMP22);
+
+		vis_or(DST_0, REF_0, TMP24);
+		vis_mul8x16(CONST_128, TMP22, TMP22);
+
+		vis_or(DST_2, REF_2, TMP26);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_ld64_2(dest, stride_8, DST_2);
+		vis_faligndata(TMP16, TMP18, REF_2);
+
+		vis_and(TMP20, MASK_7f, TMP20);
+
+		vis_and(TMP22, MASK_7f, TMP22);
+
+		vis_psub16(TMP24, TMP20, TMP20);
+		vis_st64(TMP20, dest[0]);
+
+		vis_psub16(TMP26, TMP22, TMP22);
+		vis_st64_2(TMP22, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* tail: the final two rows */
+	vis_xor(DST_0, REF_0, TMP6);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_xor(DST_2, REF_2, TMP8);
+
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_or(DST_0, REF_0, TMP10);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+
+	vis_or(DST_2, REF_2, TMP12);
+	vis_ld64_2(dest, stride_8, DST_2);
+
+	vis_ld64(ref[0], TMP14);	/* NOTE(review): TMP14 is not read again in this tail */
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+
+	dest += stride;
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_xor(DST_0, REF_0, TMP20);
+
+	vis_and(TMP20, MASK_fe, TMP20);
+
+	vis_xor(DST_2, REF_2, TMP22);
+	vis_mul8x16(CONST_128, TMP20, TMP20);
+
+	vis_and(TMP22, MASK_fe, TMP22);
+
+	vis_or(DST_0, REF_0, TMP24);
+	vis_mul8x16(CONST_128, TMP22, TMP22);
+
+	vis_or(DST_2, REF_2, TMP26);
+
+	vis_and(TMP20, MASK_7f, TMP20);
+
+	vis_and(TMP22, MASK_7f, TMP22);
+
+	vis_psub16(TMP24, TMP20, TMP20);
+	vis_st64(TMP20, dest[0]);
+
+	vis_psub16(TMP26, TMP22, TMP22);
+	vis_st64_2(TMP22, dest, 8);
+}
+
+static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/* Average an 8-byte-wide ref block into dest with the (x|y)-((x^y)>>1) sequence described at the top of the file. */
+	uint8_t *ref = (uint8_t *) _ref;
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* two rows per pass, last two rows after the loop: height must be even and >= 4 */
+
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP4);
+
+		vis_ld64(ref[8], TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_ld64(ref[0], TMP12);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64(ref[8], TMP2);
+		vis_xor(DST_0, REF_0, TMP0);
+		ref += stride;
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+
+		vis_faligndata(TMP12, TMP2, REF_0);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_psub16(TMP6, TMP0, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* tail: the final two rows */
+	vis_xor(DST_0, REF_0, TMP4);
+
+	vis_ld64(ref[8], TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(DST_0, REF_0, TMP6);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(DST_0, REF_0, TMP0);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, TMP4);
+	vis_st64(TMP4, dest[0]);
+	dest += stride;
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_or(DST_0, REF_0, TMP6);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_psub16(TMP6, TMP0, TMP4);
+	vis_st64(TMP4, dest[0]);
+}
+
+static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* Store the rounded average of each ref byte and its right-hand neighbour, 16 bytes per row. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref within its 8-byte word */
+	unsigned long off_plus_1 = off + 1;
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0],    TMP0);
+
+	vis_ld64_2(ref, 8,  TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {	/* REF_2/REF_6: same data one byte further on; with off == 7 that is the next loaded word (vis_src1 presumably copies) */
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* two rows per pass, last two rows after the loop: height must be even and >= 4 */
+
+	do {	/* 34 cycles */
+		vis_ld64(ref[0],    TMP0);
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_ld64_2(ref, 8,  TMP2);
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_ld64_2(ref, 16, TMP4);
+		vis_and(TMP6, MASK_fe, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],    TMP14);
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_ld64_2(ref, 8,  TMP16);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_ld64_2(ref, 16, TMP18);
+		ref += stride;
+		vis_or(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_or(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_faligndata(TMP16, TMP18, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP14, TMP16, REF_2);
+			vis_faligndata(TMP16, TMP18, REF_6);
+		} else {
+			vis_src1(TMP16, REF_2);
+			vis_src1(TMP18, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0],    TMP0);	/* tail: the final two rows */
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_ld64_2(ref, 8,  TMP2);
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+}
+
+static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/* Store the rounded average of each ref byte and its right-hand neighbour, 8 bytes per row. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref within its 8-byte word */
+	unsigned long off_plus_1 = off + 1;
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	if (off != 0x7) {	/* REF_2: same data one byte further on; with off == 7 that is the next loaded word (vis_src1 presumably copies) */
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* two rows per pass, last two rows after the loop: height must be even and >= 4 */
+
+	do {	/* 20 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+		ref += stride;
+
+		vis_ld64(ref[0], TMP8);
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+		} else {
+			vis_src1(TMP2, REF_2);
+		}
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_or(REF_0, REF_2, TMP14);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+		vis_faligndata(TMP8, TMP10, REF_0);
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP8, TMP10, REF_2);
+		} else {
+			vis_src1(TMP10, REF_2);
+		}
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* tail: the final two rows */
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_or(REF_0, REF_2, TMP14);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;	/* NOTE(review): dead store, dest is not used after this line */
+}
+
+static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* Blend dest with the horizontal pair ref[x], ref[x+1]; one 16-byte row per pass, 'height' rows. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref within its 8-byte word */
+	unsigned long off_plus_1 = off + 1;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);	/* NOTE(review): with the 256/512 weights and +3 this looks like (a + b + 2*dst + 3) >> 2 - confirm against vis.h */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);	/* CONST_256 and CONST_512 name the same register; the table holds both values */
+
+	ref = vis_alignaddr(ref);
+	do {	/* 26 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64(ref[16], TMP4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64(dest[8], DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_mul8x16au(REF_0,   CONST_256, TMP0);
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO, REF_2_1, TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_mul8x16al(DST_0,   CONST_512, TMP4);
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_mul8x16al(DST_1,   CONST_512, TMP6);
+
+		vis_mul8x16au(REF_6,   CONST_256, TMP12);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4,   CONST_256, TMP16);
+
+		vis_padd16(TMP0, CONST_3, TMP8);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+		vis_padd16(TMP2, CONST_3, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_padd16(TMP16, TMP12, TMP0);
+
+		vis_st64(DST_0, dest[0]);
+		vis_mul8x16al(DST_2,   CONST_512, TMP4);
+		vis_padd16(TMP18, TMP14, TMP2);
+
+		vis_mul8x16al(DST_3,   CONST_512, TMP6);
+		vis_padd16(TMP0, CONST_3, TMP0);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[8]);
+
+		ref += stride;
+		dest += stride;
+	} while (--height);	/* do/while: height must be >= 1 on entry */
+}
+
+static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/* Blend dest with the horizontal pair ref[x], ref[x+1]; 8 bytes per row, four rows per pass. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref within its 8-byte word */
+	unsigned long off_plus_1 = off + 1;
+	int stride_times_2 = stride << 1;	/* used below to read dest two rows ahead */
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);	/* CONST_256 and CONST_512 name the same register; the table holds both values */
+
+	ref = vis_alignaddr(ref);
+	height >>= 2;	/* four rows per pass in a do/while: height must be a multiple of 4 and >= 4 */
+	do {	/* 47 cycles */
+		vis_ld64(ref[0],   TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64(ref[0],   TMP4);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 8, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],   TMP8);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP4, TMP6, REF_4);
+
+		vis_ld64(ref[0],   TMP12);
+
+		vis_ld64_2(ref, 8, TMP14);
+		ref += stride;
+		vis_faligndata(TMP8, TMP10, REF_S0);
+
+		vis_faligndata(TMP12, TMP14, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+
+			vis_ld64(dest[0], DST_0);
+			vis_faligndata(TMP0, TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_faligndata(TMP4, TMP6, REF_6);
+
+			vis_faligndata(TMP8, TMP10, REF_S2);
+
+			vis_faligndata(TMP12, TMP14, REF_S6);
+		} else {
+			vis_ld64(dest[0], DST_0);
+			vis_src1(TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_src1(TMP6, REF_6);
+
+			vis_src1(TMP10, REF_S2);
+
+			vis_src1(TMP14, REF_S6);
+		}
+
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP16, TMP0);
+		vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP10, CONST_3, TMP10);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_padd16(TMP8, TMP16, TMP8);
+
+		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
+		vis_padd16(TMP10, TMP18, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_pmerge(ZERO,     REF_S0,     TMP0);
+
+		vis_pmerge(ZERO,     REF_S2,     TMP24);
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP24, TMP0);
+		vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP10, CONST_3, TMP10);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+		vis_padd16(TMP0, TMP16, TMP0);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_padd16(TMP8, TMP20, TMP8);
+
+		vis_padd16(TMP10, TMP22, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* Store the rounded average of each ref row and the row below it, 16 bytes per row. */
+	uint8_t *ref = (uint8_t *) _ref;
+
+	ref = vis_alignaddr(ref);
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP6);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64_2(ref, 8, TMP8);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_ld64_2(ref, 16, TMP10);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP6, TMP8, REF_2);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP8, TMP10, REF_6);
+
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* two rows per pass, last two rows after the loop: height must be even and >= 4 */
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_xor(REF_4, REF_6, TMP16);
+
+		vis_ld64_2(ref, 16, TMP4);
+		ref += stride;
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_ld64(ref[0], TMP6);
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_ld64_2(ref, 8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_and(TMP16, MASK_fe, TMP16);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_mul8x16(CONST_128, TMP16, TMP16);
+		vis_xor(REF_0, REF_2, TMP0);
+
+		vis_xor(REF_4, REF_6, TMP2);
+
+		vis_or(REF_0, REF_2, TMP20);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_and(TMP16, MASK_7f, TMP16);
+
+		vis_psub16(TMP14, TMP12, TMP12);
+		vis_st64(TMP12, dest[0]);
+
+		vis_psub16(TMP18, TMP16, TMP16);
+		vis_st64_2(TMP16, dest, 8);
+		dest += stride;
+
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP2, MASK_fe, TMP2);
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16(CONST_128, TMP2, TMP2);
+
+		vis_faligndata(TMP8, TMP10, REF_6);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_and(TMP2, MASK_7f, TMP2);
+
+		vis_psub16(TMP20, TMP0, TMP0);
+		vis_st64(TMP0, dest[0]);
+
+		vis_psub16(TMP18, TMP2, TMP2);
+		vis_st64_2(TMP2, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* tail: the final two rows */
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_xor(REF_4, REF_6, TMP16);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_and(TMP16, MASK_fe, TMP16);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_mul8x16(CONST_128, TMP16, TMP16);
+	vis_xor(REF_0, REF_2, TMP0);
+
+	vis_xor(REF_4, REF_6, TMP2);
+
+	vis_or(REF_0, REF_2, TMP20);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_and(TMP16, MASK_7f, TMP16);
+
+	vis_psub16(TMP14, TMP12, TMP12);
+	vis_st64(TMP12, dest[0]);
+
+	vis_psub16(TMP18, TMP16, TMP16);
+	vis_st64_2(TMP16, dest, 8);
+	dest += stride;
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP2, MASK_fe, TMP2);
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_mul8x16(CONST_128, TMP2, TMP2);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_and(TMP2, MASK_7f, TMP2);
+
+	vis_psub16(TMP20, TMP0, TMP0);
+	vis_st64(TMP0, dest[0]);
+
+	vis_psub16(TMP18, TMP2, TMP2);
+	vis_st64_2(TMP2, dest, 8);
+}
+
+static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	/* Rounding code, 8 bytes per row: each dest row is made from one ref row and the ref row below it. */
+	ref = vis_alignaddr(ref);
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP4);
+
+	vis_ld64_2(ref, 8, TMP6);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP4, TMP6, REF_2);
+	/* Two rows per pass; the last pair is peeled off below. NOTE(review): height < 4 gives a trip count <= 0. */
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+		vis_xor(REF_0, REF_2, TMP12);
+		/* TMP4/TMP6 finish row 1 of the pair; TMP12/TMP14 carry row 2. */
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_2);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* Tail: the final two dest rows, with only one further ref row loaded. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+}
+
+static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+	/* Rounding code: sums two vertically adjacent ref rows, the existing dest row scaled by CONST_512, and CONST_3, then packs. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+	/* NOTE(review): the scale factor presumably controls how vis_pack16 narrows the 16-bit sums - confirm in vis.h. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_6);
+	height >>= 1;
+	/* Two dest rows are written per pass, so an odd height leaves its last row unwritten. */
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP12);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_pmerge(ZERO,       REF_6,     TMP16);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		/* The current dest contents take part in the sum. */
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_pmerge(ZERO,     REF_4,     TMP4);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+
+		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+		vis_faligndata(TMP8, TMP10, REF_6);
+		vis_mul8x16al(DST_0,   CONST_512, TMP20);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_1,   CONST_512, TMP22);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP4, CONST_3, TMP4);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_padd16(TMP6, CONST_3, TMP6);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+		vis_mul8x16al(REF_S0,   CONST_512, TMP20);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_mul8x16al(REF_S2,   CONST_512, TMP24);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_2,   CONST_256, TMP28);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+
+		vis_padd16(TMP16, TMP4, TMP16);
+		vis_mul8x16au(REF_6,   CONST_256, REF_S4);
+
+		vis_padd16(TMP18, TMP6, TMP18);
+		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+		/* Pack and store row 1 of the pair while the sums for row 2 are started. */
+		vis_pack16(TMP12, DST_0);
+		vis_padd16(TMP28, TMP0, TMP12);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP30, TMP2, TMP14);
+
+		vis_pack16(TMP16, DST_2);
+		vis_padd16(REF_S4, TMP4, TMP16);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(REF_S6, TMP6, TMP18);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_pack16(TMP16, DST_2);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	/* Rounding code, 8 bytes per row: sums two vertically adjacent ref rows, dest scaled by CONST_512, and CONST_3, then packs. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+	/* NOTE(review): the scale factor presumably controls how vis_pack16 narrows the 16-bit sums - confirm in vis.h. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	/* Two dest rows are written per pass, so an odd height leaves its last row unwritten. */
+	height >>= 1;
+	do {	/* 20 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP8);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		/* The current contents of both dest rows take part in the sums. */
+		vis_ld64(dest[0], DST_0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+		vis_pmerge(ZERO,       REF_0,     TMP12);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+		vis_pmerge(ZERO,       REF_0_1,   TMP14);
+
+		vis_padd16(TMP12, CONST_3, TMP12);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP14, CONST_3, TMP14);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_faligndata(TMP4, TMP6, REF_2);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_mul8x16au(REF_2,   CONST_256, TMP20);
+
+		vis_padd16(TMP8, TMP16, TMP0);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+
+		vis_padd16(TMP10, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+
+		vis_padd16(TMP12, TMP24, TMP0);
+
+		vis_padd16(TMP14, TMP26, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+			      const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+	/* Rounding code: each output sums ref samples at x and x+1 on two consecutive rows plus CONST_2, then packs. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants2[0], CONST_2);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+	/* x+1 samples: realign one byte further; for off == 7 the next doubleword is used directly (vis_src1, presumably a copy). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+	/* Two dest rows are written per pass, so an odd height leaves its last row unwritten. */
+	height >>= 1;
+	do {
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+		/* Back to the base offset for the vis_faligndata calls that follow. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_faligndata(TMP8, TMP10, REF_S4);
+		/* The same four row segments again, taken one byte further along. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16au(REF_0, CONST_256, TMP0);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		vis_padd16(TMP0, CONST_2, TMP8);
+		vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+		vis_padd16(TMP2, CONST_2, TMP10);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+		vis_padd16(TMP8, TMP4, TMP8);
+		vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+		vis_padd16(TMP10, TMP6, TMP10);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP8, TMP12);
+
+		vis_padd16(TMP14, TMP10, TMP14);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP0, CONST_2, TMP12);
+
+		vis_mul8x16au(REF_S0, CONST_256, TMP0);
+		vis_padd16(TMP2, CONST_2, TMP14);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_padd16(TMP12, TMP4, TMP12);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP4);
+		vis_padd16(TMP14, TMP6, TMP14);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+		vis_padd16(TMP20, TMP12, TMP20);
+
+		vis_padd16(TMP22, TMP14, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(TMP0, TMP4, TMP24);
+
+		vis_mul8x16au(REF_S4, CONST_256, TMP0);
+		vis_padd16(TMP2, TMP6, TMP26);
+
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+		vis_padd16(TMP24, TMP8, TMP24);
+
+		vis_padd16(TMP26, TMP10, TMP26);
+		vis_pack16(TMP24, DST_0);
+
+		vis_pack16(TMP26, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_pmerge(ZERO, REF_S6, TMP4);
+
+		vis_pmerge(ZERO,      REF_S6_1,  TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_padd16(TMP0, TMP12, TMP0);
+
+		vis_padd16(TMP2, TMP14, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	/* Rounding code, 8 bytes per row: each output sums ref samples at x and x+1 on two consecutive rows plus CONST_2, then packs. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants2[0], CONST_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+	/* x+1 samples: realign one byte further; for off == 7 the next doubleword is used directly (vis_src1, presumably a copy). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+	/* Two dest rows are written per pass, so an odd height leaves its last row unwritten. */
+	height >>= 1;
+	do {	/* 26 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0,   CONST_256, TMP8);
+		vis_pmerge(ZERO,        REF_S2,    TMP12);
+		/* Back to the base offset for the vis_faligndata calls that follow. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+		vis_pmerge(ZERO,        REF_S2_1,  TMP14);
+
+		vis_ld64_2(ref, stride, TMP4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_S4);
+
+		vis_pmerge(ZERO, REF_S4, TMP18);
+
+		vis_pmerge(ZERO, REF_S4_1, TMP20);
+
+		vis_faligndata(TMP4, TMP6, REF_S0);
+		/* The same two row segments again, taken one byte further along. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+
+		vis_padd16(TMP18, CONST_2, TMP18);
+		vis_mul8x16au(REF_S6,   CONST_256, TMP22);
+
+		vis_padd16(TMP20, CONST_2, TMP20);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+		vis_mul8x16au(REF_S0,   CONST_256, TMP26);
+		vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+		vis_mul8x16au(REF_S2,   CONST_256, TMP30);
+		vis_padd16(TMP18, TMP22, TMP18);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP8,  TMP18, TMP8);
+
+		vis_padd16(TMP10, TMP20, TMP10);
+
+		vis_padd16(TMP8,  TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP8,  DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP18, TMP26, TMP18);
+
+		vis_padd16(TMP20, TMP28, TMP20);
+
+		vis_padd16(TMP18, TMP30, TMP18);
+
+		vis_padd16(TMP20, TMP32, TMP20);
+		vis_pack16(TMP18, DST_2);
+
+		vis_pack16(TMP20, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+			      const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+	/* Rounding code: sums ref samples at x and x+1 on two consecutive rows, dest scaled by CONST_1024, and CONST_6, then packs. */
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants6[0], CONST_6);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+	/* x+1 samples: realign one byte further; for off == 7 the next doubleword is used directly (vis_src1, presumably a copy). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+	/* Two dest rows are written per pass, so an odd height leaves its last row unwritten. */
+	height >>= 1;
+	do {	/* 55 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+		/* Back to the base offset for the vis_faligndata calls that follow. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP8, TMP10, REF_S4);
+		/* The same four row segments again, taken one byte further along. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_0, TMP0);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+		/* REF_0/REF_2 (and later REF_4/REF_6) are reused as scratch once their pixels have been widened. */
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP0, CONST_6, TMP0);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP2, CONST_6, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+		vis_mul8x16au(REF_S0, CONST_256, REF_4);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+
+		vis_padd16(TMP12, TMP30, TMP12);
+
+		vis_padd16(TMP14, TMP32, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP4, CONST_6, TMP4);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_padd16(TMP6, CONST_6, TMP6);
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+		vis_padd16(TMP4, TMP8, TMP4);
+		vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
+
+		vis_padd16(TMP6, TMP10, TMP6);
+
+		vis_padd16(TMP20, TMP4, TMP20);
+
+		vis_padd16(TMP22, TMP6, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_padd16(TMP20, REF_0, TMP20);
+		vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+		vis_padd16(TMP22, REF_2, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO,      REF_S4_1,  REF_2);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_padd16(REF_4, TMP0, TMP8);
+
+		vis_mul8x16au(REF_S6, CONST_256, REF_4);
+		vis_padd16(REF_6, TMP2, TMP10);
+
+		vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+
+		vis_padd16(REF_0, TMP4, REF_0);
+
+		vis_mul8x16al(DST_2,   CONST_1024, TMP30);
+		vis_padd16(REF_2, TMP6, REF_2);
+
+		vis_mul8x16al(DST_3,   CONST_1024, TMP32);
+		vis_padd16(REF_0, REF_4, REF_0);
+
+		vis_padd16(REF_2, REF_6, REF_2);
+
+		vis_padd16(REF_0, TMP30, REF_0);
+
+		/* stall */
+
+		vis_padd16(REF_2, TMP32, REF_2);
+		vis_pack16(REF_0, DST_2);
+
+		vis_pack16(REF_2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	/* Rounding code, 8 bytes per row: sums ref samples at x and x+1 on two rows, dest scaled by CONST_1024, and CONST_6, then packs. */
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64(constants6[0], CONST_6);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+	/* x+1 samples: realign one byte further; for off == 7 the next doubleword is used directly (vis_src1, presumably a copy). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+	/* Two dest rows are written per pass, so an odd height leaves its last row unwritten. */
+	height >>= 1;
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP8);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP14);
+		/* Back to the base offset for the vis_faligndata calls that follow. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_faligndata(TMP0, TMP2, REF_S4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP4, TMP6, REF_S0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		/* The same two row segments again, taken one byte further along. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_S4, TMP22);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP24);
+
+		vis_mul8x16au(REF_S6, CONST_256, TMP26);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP28);
+		/* REF_S4/REF_S6 and REF_0/REF_2 are reused as scratch from here on. */
+		vis_mul8x16au(REF_S0, CONST_256, REF_S4);
+		vis_padd16(TMP22, CONST_6, TMP22);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+		vis_padd16(TMP24, CONST_6, TMP24);
+
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP24, TMP28, TMP24);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP26);
+		vis_padd16(TMP8, TMP22, TMP8);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+		vis_padd16(TMP10, TMP24, TMP10);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_padd16(REF_S4, TMP22, TMP12);
+
+		vis_padd16(REF_S6, TMP24, TMP14);
+
+		vis_padd16(TMP12, TMP26, TMP12);
+
+		vis_padd16(TMP14, TMP28, TMP14);
+
+		vis_padd16(TMP12, REF_0, TMP12);
+
+		vis_padd16(TMP14, REF_2, TMP14);
+		vis_pack16(TMP12, DST_2);
+
+		vis_pack16(TMP14, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+/* End of rounding code */
+
+/* Start of no rounding code */
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y)>>1 == (x&y)+((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16.  So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ *	fxor		f0, f2, f10
+ *	fand		f10, f4, f10
+ *	fmul8x16	f8, f10, f10
+ *	fand		f10, f6, f10
+ *	fand		f0, f2, f12
+ *	fpadd16		f12, f10, f10
+ */
+
+static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+				      const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	/* Copies height rows of 16 bytes from ref to dest, realigning each row with vis_faligndata; no averaging is done. */
+	ref = vis_alignaddr(ref);
+	do {	/* 5 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+
+		vis_ld64_2(ref, 16, TMP4);
+		ref += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+
+		vis_faligndata(TMP2, TMP4, REF_2);
+		vis_st64_2(REF_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	/* Copies height rows of 8 bytes from ref to dest, realigning each row with vis_faligndata; no averaging is done. */
+	ref = vis_alignaddr(ref);
+	do {	/* 4 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);
+		ref += stride;
+
+		/* stall */
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+
+static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	/* No-rounding code: dest = (dest & ref) + ((dest ^ ref) >> 1) via the mask/mul8x16 sequence, 16 bytes per row. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(dest[8], DST_2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_ld64(constants128[0], CONST_128);
+	/* Two rows per pass; the last pair is peeled off below. NOTE(review): height < 4 gives a trip count <= 0. */
+	ref += stride;
+	height = (height >> 1) - 1;
+
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP6);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_ld64_2(ref, 16, TMP4);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_xor(DST_2, REF_2, TMP8);
+
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_and(DST_0, REF_0, TMP10);
+		vis_ld64_2(dest, stride, DST_0);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+
+		vis_and(DST_2, REF_2, TMP12);
+		vis_ld64_2(dest, stride_8, DST_2);
+
+		vis_ld64(ref[0], TMP14);
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_padd16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_padd16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+
+		dest += stride;
+		vis_ld64_2(ref, 8, TMP16);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 16, TMP18);
+		vis_faligndata(TMP2, TMP4, REF_2);
+		ref += stride;
+		/* Second row of the pair: same steps on the next dest row and the freshly aligned ref row. */
+		vis_xor(DST_0, REF_0, TMP20);
+
+		vis_and(TMP20, MASK_fe, TMP20);
+
+		vis_xor(DST_2, REF_2, TMP22);
+		vis_mul8x16(CONST_128, TMP20, TMP20);
+
+		vis_and(TMP22, MASK_fe, TMP22);
+
+		vis_and(DST_0, REF_0, TMP24);
+		vis_mul8x16(CONST_128, TMP22, TMP22);
+
+		vis_and(DST_2, REF_2, TMP26);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_ld64_2(dest, stride_8, DST_2);
+		vis_faligndata(TMP16, TMP18, REF_2);
+
+		vis_and(TMP20, MASK_7f, TMP20);
+
+		vis_and(TMP22, MASK_7f, TMP22);
+
+		vis_padd16(TMP24, TMP20, TMP20);
+		vis_st64(TMP20, dest[0]);
+
+		vis_padd16(TMP26, TMP22, TMP22);
+		vis_st64_2(TMP22, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* Tail: the final two dest rows; ref is not advanced any further. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(DST_0, REF_0, TMP6);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_xor(DST_2, REF_2, TMP8);
+
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_and(DST_0, REF_0, TMP10);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+
+	vis_and(DST_2, REF_2, TMP12);
+	vis_ld64_2(dest, stride_8, DST_2);
+	/* NOTE(review): TMP14 loaded below is not read again in this tail. */
+	vis_ld64(ref[0], TMP14);
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_padd16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_padd16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+
+	dest += stride;
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_xor(DST_0, REF_0, TMP20);
+
+	vis_and(TMP20, MASK_fe, TMP20);
+
+	vis_xor(DST_2, REF_2, TMP22);
+	vis_mul8x16(CONST_128, TMP20, TMP20);
+
+	vis_and(TMP22, MASK_fe, TMP22);
+
+	vis_and(DST_0, REF_0, TMP24);
+	vis_mul8x16(CONST_128, TMP22, TMP22);
+
+	vis_and(DST_2, REF_2, TMP26);
+
+	vis_and(TMP20, MASK_7f, TMP20);
+
+	vis_and(TMP22, MASK_7f, TMP22);
+
+	vis_padd16(TMP24, TMP20, TMP20);
+	vis_st64(TMP20, dest[0]);
+
+	vis_padd16(TMP26, TMP22, TMP22);
+	vis_st64_2(TMP22, dest, 8);
+}
+
+static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	/* No-rounding code: dest = (dest & ref) + ((dest ^ ref) >> 1) via the mask/mul8x16 sequence, 8 bytes per row. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	/* Two rows per pass; the last pair is peeled off below. NOTE(review): height < 4 gives a trip count <= 0. */
+	ref += stride;
+	height = (height >> 1) - 1;
+
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP4);
+
+		vis_ld64(ref[8], TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_and(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_ld64(ref[0], TMP12);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64(ref[8], TMP2);
+		vis_xor(DST_0, REF_0, TMP0);
+		ref += stride;
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_padd16(TMP6, TMP4, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_and(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+
+		vis_faligndata(TMP12, TMP2, REF_0);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_padd16(TMP6, TMP0, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* Tail: the final two dest rows; only one further ref row is loaded. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(DST_0, REF_0, TMP4);
+
+	vis_ld64(ref[8], TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_and(DST_0, REF_0, TMP6);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(DST_0, REF_0, TMP0);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_padd16(TMP6, TMP4, TMP4);
+	vis_st64(TMP4, dest[0]);
+	dest += stride;
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_and(DST_0, REF_0, TMP6);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_padd16(TMP6, TMP0, TMP4);
+	vis_st64(TMP4, dest[0]);
+}
+
+/*
+ * Store a 16-byte-wide block to dest, one row every `stride` bytes.
+ *
+ * For each row the reference data is extracted twice: once with the
+ * alignment offset `off` (REF_0/REF_4) and once with `off + 1`
+ * (REF_2/REF_6).  The two are combined as
+ *     (a & b) + (mul8x16(CONST_128, (a ^ b) & MASK_fe) & MASK_7f)
+ * NOTE(review): this looks like the truncating byte average
+ * (a & b) + ((a ^ b) >> 1); the vis_* macros and the constant tables are
+ * defined outside this chunk -- confirm.
+ *
+ * The loop handles two rows per iteration and runs (height >> 1) - 1
+ * times; the last two rows are produced by the code after the loop.
+ * Because the loop is a do/while on --height, a height below 4 would not
+ * terminate the loop as intended.
+ */
+static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0],    TMP0);
+
+	vis_ld64_2(ref, 8,  TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	/*
+	 * When off == 7, off + 1 would be 8, so the shifted data is taken
+	 * directly from the following doubleword via vis_src1 instead of
+	 * going through another align/faligndata pair.
+	 */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;
+
+	/* Two rows per iteration; loads for the next rows are interleaved
+	 * with the arithmetic on the rows already held in REF_*. */
+	do {	/* 34 cycles */
+		vis_ld64(ref[0],    TMP0);
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_ld64_2(ref, 8,  TMP2);
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_ld64_2(ref, 16, TMP4);
+		vis_and(TMP6, MASK_fe, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],    TMP14);
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_ld64_2(ref, 8,  TMP16);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_and(REF_0, REF_2, TMP10);
+
+		vis_ld64_2(ref, 16, TMP18);
+		ref += stride;
+		vis_and(REF_4, REF_6, TMP12);
+
+		/* Restore the base alignment offset before re-extracting REF_0/REF_4. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_padd16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_padd16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_and(REF_0, REF_2, TMP10);
+
+		vis_and(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_faligndata(TMP16, TMP18, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP14, TMP16, REF_2);
+			vis_faligndata(TMP16, TMP18, REF_6);
+		} else {
+			vis_src1(TMP16, REF_2);
+			vis_src1(TMP18, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_padd16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_padd16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	/* Tail: the last two rows.  Only one more reference row is loaded. */
+	vis_ld64(ref[0],    TMP0);
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_ld64_2(ref, 8,  TMP2);
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_and(REF_0, REF_2, TMP10);
+
+	vis_and(REF_4, REF_6, TMP12);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_padd16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_padd16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_and(REF_0, REF_2, TMP10);
+
+	vis_and(REF_4, REF_6, TMP12);
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_padd16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_padd16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+}
+
+/*
+ * 8-byte-wide counterpart of the 16-wide "put, no round, x" routine.
+ *
+ * Each stored row combines the reference row extracted at alignment
+ * offset `off` (REF_0) with the same row extracted at `off + 1` (REF_2):
+ *     (a & b) + (mul8x16(CONST_128, (a ^ b) & MASK_fe) & MASK_7f)
+ * NOTE(review): presumably the truncating byte average of the two
+ * horizontally adjacent samples -- confirm against the vis_* macro
+ * definitions, which are outside this chunk.
+ *
+ * Two rows are written per loop iteration; the loop runs
+ * (height >> 1) - 1 times and the final two rows are written after it,
+ * so the do/while only behaves as intended for height >= 4.
+ */
+static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	/* off == 7: the "off + 1" data is simply the second doubleword. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;
+
+	do {	/* 20 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+		ref += stride;
+
+		vis_ld64(ref[0], TMP8);
+		vis_and(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		/* Switch back to the base offset before re-extracting REF_0. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+		} else {
+			vis_src1(TMP2, REF_2);
+		}
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_padd16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_and(REF_0, REF_2, TMP14);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+		vis_faligndata(TMP8, TMP10, REF_0);
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP8, TMP10, REF_2);
+		} else {
+			vis_src1(TMP10, REF_2);
+		}
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_padd16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	/* Tail: last two rows; only one further reference row is loaded. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_and(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_padd16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_and(REF_0, REF_2, TMP14);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_padd16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+	/* dest is not read again after this increment. */
+	dest += stride;
+}
+
+/*
+ * "avg" variant of the 16-wide horizontal routine: the existing contents
+ * of dest are read (DST_0/DST_2), combined with the reference row
+ * extracted at alignment offsets `off` and `off + 1`, and written back.
+ *
+ * Per 16-bit lane the sum formed is
+ *     ref(off) + ref(off + 1) + mul8x16al(dest, CONST_512) + CONST_3
+ * and vis_pack16 narrows it using the GSR scale factor set to 5 below.
+ * NOTE(review): presumably this evaluates to
+ * (a + b + 2 * dest + 3) >> 2 -- confirm against the mul8x16*/pack16
+ * semantics and the constants256_512/constants3 tables, which are not
+ * visible in this chunk.
+ *
+ * One row is processed per iteration, `height` iterations in total
+ * (do/while, so height must be at least 1).
+ */
+static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);
+
+	ref = vis_alignaddr(ref);
+	do {	/* 26 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);
+
+		/* Re-establish the base offset; the previous iteration may have
+		 * left the alignment at off + 1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64(ref[16], TMP4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64(dest[8], DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		/* off == 7: take the following doubleword directly instead of
+		 * aligning with off + 1. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		/* First 8 output bytes: REF_0 + REF_2 + weighted DST_0/DST_1. */
+		vis_mul8x16au(REF_0,   CONST_256, TMP0);
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO, REF_2_1, TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_mul8x16al(DST_0,   CONST_512, TMP4);
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_mul8x16al(DST_1,   CONST_512, TMP6);
+
+		vis_mul8x16au(REF_6,   CONST_256, TMP12);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4,   CONST_256, TMP16);
+
+		vis_padd16(TMP0, CONST_3, TMP8);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+		vis_padd16(TMP2, CONST_3, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_padd16(TMP16, TMP12, TMP0);
+
+		/* Second 8 output bytes: REF_4 + REF_6 + weighted DST_2/DST_3. */
+		vis_st64(DST_0, dest[0]);
+		vis_mul8x16al(DST_2,   CONST_512, TMP4);
+		vis_padd16(TMP18, TMP14, TMP2);
+
+		vis_mul8x16al(DST_3,   CONST_512, TMP6);
+		vis_padd16(TMP0, CONST_3, TMP0);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[8]);
+
+		ref += stride;
+		dest += stride;
+	} while (--height);
+}
+
+/*
+ * 8-byte-wide "avg, no round, x" routine.
+ *
+ * Four rows are handled per loop iteration (height is shifted right by
+ * two before the do/while, so height is expected to be a non-zero
+ * multiple of 4).  For each row the reference data extracted at
+ * alignment offsets `off` and `off + 1` is summed with the existing
+ * destination row weighted through CONST_512 and with CONST_3, then
+ * narrowed by vis_pack16 using GSR scale factor 5.
+ * NOTE(review): presumably (a + b + 2 * dest + 3) >> 2 per byte --
+ * confirm against the vis_* macro definitions outside this chunk.
+ *
+ * Register use per iteration: REF_0/REF_2, REF_4/REF_6, REF_S0/REF_S2
+ * and REF_S4/REF_S6 hold the (off, off + 1) pairs for rows 0..3.
+ */
+static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_times_2 = stride << 1;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);
+
+	ref = vis_alignaddr(ref);
+	height >>= 2;
+	do {	/* 47 cycles */
+		vis_ld64(ref[0],   TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+
+		/* Reset to the base offset; the previous iteration may have
+		 * switched the alignment to off + 1. */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64(ref[0],   TMP4);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 8, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],   TMP8);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP4, TMP6, REF_4);
+
+		vis_ld64(ref[0],   TMP12);
+
+		vis_ld64_2(ref, 8, TMP14);
+		ref += stride;
+		vis_faligndata(TMP8, TMP10, REF_S0);
+
+		vis_faligndata(TMP12, TMP14, REF_S4);
+
+		/* Build the off + 1 versions of all four rows; the first two
+		 * destination rows are loaded in both branches. */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+
+			vis_ld64(dest[0], DST_0);
+			vis_faligndata(TMP0, TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_faligndata(TMP4, TMP6, REF_6);
+
+			vis_faligndata(TMP8, TMP10, REF_S2);
+
+			vis_faligndata(TMP12, TMP14, REF_S6);
+		} else {
+			vis_ld64(dest[0], DST_0);
+			vis_src1(TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_src1(TMP6, REF_6);
+
+			vis_src1(TMP10, REF_S2);
+
+			vis_src1(TMP14, REF_S6);
+		}
+
+		/* Rows 0 and 1. */
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP16, TMP0);
+		vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP10, CONST_3, TMP10);
+
+		/* dest now points at row 1, so these two loads fetch destination
+		 * rows 2 and 3.  Row 3 goes to TMP4 because DST_2 is still
+		 * needed for the row 1 result packed just below. */
+		vis_ld64_2(dest, stride, DST_0);
+		vis_padd16(TMP8, TMP16, TMP8);
+
+		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
+		vis_padd16(TMP10, TMP18, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+
+		/* Rows 2 and 3. */
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_pmerge(ZERO,     REF_S0,     TMP0);
+
+		vis_pmerge(ZERO,     REF_S2,     TMP24);
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP24, TMP0);
+		vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP10, CONST_3, TMP10);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+		vis_padd16(TMP0, TMP16, TMP0);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_padd16(TMP8, TMP20, TMP8);
+
+		vis_padd16(TMP10, TMP22, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+/*
+ * 16-byte-wide "put, no round, y" routine: every stored row combines
+ * two vertically consecutive reference rows.
+ *
+ * REF_0/REF_4 and REF_2/REF_6 alternately hold the older and the newer
+ * of the two rows being combined.  The combine step is
+ *     (a & b) + (mul8x16(CONST_128, (a ^ b) & MASK_fe) & MASK_7f)
+ * NOTE(review): presumably the truncating byte average
+ * (a & b) + ((a ^ b) >> 1) -- confirm against the vis_* macro
+ * definitions, which are outside this chunk.
+ *
+ * Only a single alignment (set by vis_alignaddr at the top) is used.
+ * The loop writes two rows per iteration and runs (height >> 1) - 1
+ * times; the final two rows are written after the loop, so the
+ * do/while only behaves as intended for height >= 4.
+ */
+static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+
+	ref = vis_alignaddr(ref);
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP6);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64_2(ref, 8, TMP8);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_ld64_2(ref, 16, TMP10);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP6, TMP8, REF_2);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP8, TMP10, REF_6);
+
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;
+	do {	/* 24 cycles */
+		/* First output row of the pair: rows held in REF_0/4 and REF_2/6. */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_xor(REF_4, REF_6, TMP16);
+
+		vis_ld64_2(ref, 16, TMP4);
+		ref += stride;
+		vis_and(REF_0, REF_2, TMP14);
+
+		vis_ld64(ref[0], TMP6);
+		vis_and(REF_4, REF_6, TMP18);
+
+		vis_ld64_2(ref, 8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_and(TMP16, MASK_fe, TMP16);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_mul8x16(CONST_128, TMP16, TMP16);
+		/* Second output row: REF_0/4 now hold the newly loaded row,
+		 * REF_2/6 still hold the previous one. */
+		vis_xor(REF_0, REF_2, TMP0);
+
+		vis_xor(REF_4, REF_6, TMP2);
+
+		vis_and(REF_0, REF_2, TMP20);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_and(TMP16, MASK_7f, TMP16);
+
+		vis_padd16(TMP14, TMP12, TMP12);
+		vis_st64(TMP12, dest[0]);
+
+		vis_padd16(TMP18, TMP16, TMP16);
+		vis_st64_2(TMP16, dest, 8);
+		dest += stride;
+
+		vis_and(REF_4, REF_6, TMP18);
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP2, MASK_fe, TMP2);
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16(CONST_128, TMP2, TMP2);
+
+		vis_faligndata(TMP8, TMP10, REF_6);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_and(TMP2, MASK_7f, TMP2);
+
+		vis_padd16(TMP20, TMP0, TMP0);
+		vis_st64(TMP0, dest[0]);
+
+		vis_padd16(TMP18, TMP2, TMP2);
+		vis_st64_2(TMP2, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	/* Tail: last two output rows; only one more reference row is loaded. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_xor(REF_4, REF_6, TMP16);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_and(REF_0, REF_2, TMP14);
+
+	vis_and(REF_4, REF_6, TMP18);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_and(TMP16, MASK_fe, TMP16);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_mul8x16(CONST_128, TMP16, TMP16);
+	vis_xor(REF_0, REF_2, TMP0);
+
+	vis_xor(REF_4, REF_6, TMP2);
+
+	vis_and(REF_0, REF_2, TMP20);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_and(TMP16, MASK_7f, TMP16);
+
+	vis_padd16(TMP14, TMP12, TMP12);
+	vis_st64(TMP12, dest[0]);
+
+	vis_padd16(TMP18, TMP16, TMP16);
+	vis_st64_2(TMP16, dest, 8);
+	dest += stride;
+
+	vis_and(REF_4, REF_6, TMP18);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP2, MASK_fe, TMP2);
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_mul8x16(CONST_128, TMP2, TMP2);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_and(TMP2, MASK_7f, TMP2);
+
+	vis_padd16(TMP20, TMP0, TMP0);
+	vis_st64(TMP0, dest[0]);
+
+	vis_padd16(TMP18, TMP2, TMP2);
+	vis_st64_2(TMP2, dest, 8);
+}
+
+/*
+ * 8-byte-wide "put, no round, y" routine: every stored row combines two
+ * vertically consecutive reference rows held in REF_0 and REF_2.
+ *
+ * The combine step is
+ *     (a & b) + (mul8x16(CONST_128, (a ^ b) & MASK_fe) & MASK_7f)
+ * NOTE(review): presumably the truncating byte average -- confirm
+ * against the vis_* macro definitions, which are outside this chunk.
+ *
+ * Two rows are written per loop iteration; the loop runs
+ * (height >> 1) - 1 times and the last two rows are written after it,
+ * so the do/while only behaves as intended for height >= 4.
+ */
+static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+
+	ref = vis_alignaddr(ref);
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP4);
+
+	vis_ld64_2(ref, 8, TMP6);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP4, TMP6, REF_2);
+
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_and(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		/* REF_0 is replaced by the next row once its xor/and with REF_2
+		 * have been captured in TMP4/TMP6; TMP0/TMP2 are then reused
+		 * for the row after that. */
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+		vis_and(REF_0, REF_2, TMP14);
+
+		vis_padd16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_2);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_padd16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	/* Tail: last two output rows; only one more reference row is loaded. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_and(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+	vis_and(REF_0, REF_2, TMP14);
+
+	vis_padd16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_padd16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+}
+
+/*
+ * 16-byte-wide "avg, no round, y" routine.
+ *
+ * Two output rows are produced per loop iteration (height is halved
+ * before the do/while, so height is expected to be even and non-zero).
+ * Each output row sums two vertically consecutive reference rows, the
+ * existing destination row weighted through CONST_512, and CONST_3,
+ * and is then narrowed by vis_pack16 with GSR scale factor 5.
+ * NOTE(review): presumably (a + b + 2 * dest + 3) >> 2 per byte --
+ * confirm against the vis_* macro definitions outside this chunk.
+ *
+ * REF_2/REF_6 carry the most recent reference row across iterations.
+ * REF_S0/REF_S2 are used as extra destination-row registers (marked
+ * DST_4/DST_6 in the existing comments) and REF_S4/REF_S6 as scratch.
+ */
+static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_6);
+	height >>= 1;
+
+	do {	/* 31 cycles */
+		/* Widen the row carried in REF_2/REF_6 while loading the next row. */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP12);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_pmerge(ZERO,       REF_6,     TMP16);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_pmerge(ZERO,     REF_4,     TMP4);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+
+		/* Second destination row goes into REF_S0/REF_S2 because
+		 * DST_0..DST_3 are still in use for the first row. */
+		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+		vis_faligndata(TMP8, TMP10, REF_6);
+		vis_mul8x16al(DST_0,   CONST_512, TMP20);
+
+		/* The +3 is folded into the middle row (TMP0..TMP6), which is
+		 * shared by both output rows of this iteration. */
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_1,   CONST_512, TMP22);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP4, CONST_3, TMP4);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_padd16(TMP6, CONST_3, TMP6);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+		vis_mul8x16al(REF_S0,   CONST_512, TMP20);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_mul8x16al(REF_S2,   CONST_512, TMP24);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_2,   CONST_256, TMP28);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+
+		vis_padd16(TMP16, TMP4, TMP16);
+		vis_mul8x16au(REF_6,   CONST_256, REF_S4);
+
+		vis_padd16(TMP18, TMP6, TMP18);
+		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+
+		/* First output row. */
+		vis_pack16(TMP12, DST_0);
+		vis_padd16(TMP28, TMP0, TMP12);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP30, TMP2, TMP14);
+
+		vis_pack16(TMP16, DST_2);
+		vis_padd16(REF_S4, TMP4, TMP16);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(REF_S6, TMP6, TMP18);
+
+		/* Second output row. */
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_pack16(TMP16, DST_2);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+/*
+ * 8-byte-wide "avg, no round, y" routine.
+ *
+ * Two output rows are produced per loop iteration (height is halved
+ * before the do/while, so height is expected to be even and non-zero).
+ * Each output row sums two vertically consecutive reference rows, the
+ * existing destination row weighted through CONST_512, and CONST_3,
+ * and is narrowed by vis_pack16 with GSR scale factor 5.
+ * NOTE(review): presumably (a + b + 2 * dest + 3) >> 2 per byte --
+ * confirm against the vis_* macro definitions outside this chunk.
+ *
+ * REF_2 carries the most recent reference row across iterations.
+ */
+static void MC_avg_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+
+	height >>= 1;
+	do {	/* 20 cycles */
+		/* Widen the carried row (REF_2) while the next row is loaded. */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP8);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+		vis_pmerge(ZERO,       REF_0,     TMP12);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+		vis_pmerge(ZERO,       REF_0_1,   TMP14);
+
+		/* The +3 is added to the middle row (TMP12/TMP14), which takes
+		 * part in both output rows of this iteration. */
+		vis_padd16(TMP12, CONST_3, TMP12);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP14, CONST_3, TMP14);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_faligndata(TMP4, TMP6, REF_2);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_mul8x16au(REF_2,   CONST_256, TMP20);
+
+		vis_padd16(TMP8, TMP16, TMP0);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+
+		/* First output row. */
+		vis_padd16(TMP10, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+
+		/* Second output row. */
+		vis_padd16(TMP12, TMP24, TMP0);
+
+		vis_padd16(TMP14, TMP26, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+/*
+ * 16-byte-wide "put, no round, xy" routine.
+ *
+ * Each output row is built from four reference extractions: the
+ * current and the next reference row, each taken at alignment offsets
+ * `off` and `off + 1`.  The four widened values are summed together
+ * with the rounding constant loaded from constants1 (CONST_1) and
+ * narrowed by vis_pack16 using GSR scale factor 5.
+ * NOTE(review): presumably (a + b + c + d + 1) >> 2 per byte -- confirm
+ * against the vis_* macro definitions outside this chunk.
+ *
+ * Two output rows are produced per loop iteration (height is halved
+ * before the do/while, so height is expected to be even and non-zero).
+ * REF_S0/REF_S2/REF_S4/REF_S6 carry the most recent reference row
+ * (both offsets, both 8-byte halves) across iterations.
+ *
+ * The rounding term is added through CONST_1 everywhere: constants1 is
+ * the only rounding table loaded in this function, so no other CONST_n
+ * name may be used here.
+ */
+static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+				       const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants1[0], CONST_1);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+
+	/* off == 7: the "off + 1" data is the following doubleword itself. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+
+	height >>= 1;
+	do {
+		/* Widen the carried row (REF_S*) while loading the next two rows. */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_faligndata(TMP8, TMP10, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16au(REF_0, CONST_256, TMP0);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		/* Rounding term: CONST_1 is the register loaded from constants1
+		 * above (the original used the never-loaded name CONST_2 here). */
+		vis_padd16(TMP0, CONST_1, TMP8);
+		vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+		vis_padd16(TMP2, CONST_1, TMP10);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+		vis_padd16(TMP8, TMP4, TMP8);
+		vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+		vis_padd16(TMP10, TMP6, TMP10);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+
+		/* First output row, bytes 0..7. */
+		vis_padd16(TMP12, TMP8, TMP12);
+
+		vis_padd16(TMP14, TMP10, TMP14);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP0, CONST_1, TMP12);
+
+		vis_mul8x16au(REF_S0, CONST_256, TMP0);
+		vis_padd16(TMP2, CONST_1, TMP14);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_padd16(TMP12, TMP4, TMP12);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP4);
+		vis_padd16(TMP14, TMP6, TMP14);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+		/* First output row, bytes 8..15. */
+		vis_padd16(TMP20, TMP12, TMP20);
+
+		vis_padd16(TMP22, TMP14, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		/* Second output row, bytes 0..7: new REF_S row plus the middle
+		 * row sums (TMP8/TMP10), which already include the rounding. */
+		vis_padd16(TMP0, TMP4, TMP24);
+
+		vis_mul8x16au(REF_S4, CONST_256, TMP0);
+		vis_padd16(TMP2, TMP6, TMP26);
+
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+		vis_padd16(TMP24, TMP8, TMP24);
+
+		vis_padd16(TMP26, TMP10, TMP26);
+		vis_pack16(TMP24, DST_0);
+
+		vis_pack16(TMP26, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_pmerge(ZERO, REF_S6, TMP4);
+
+		vis_pmerge(ZERO,      REF_S6_1,  TMP6);
+
+		/* Second output row, bytes 8..15. */
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_padd16(TMP0, TMP12, TMP0);
+
+		vis_padd16(TMP2, TMP14, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+/*
+ * 8-byte-wide "put, no round, xy" routine.
+ *
+ * Each output row sums four widened reference extractions (current and
+ * next row, each at alignment offsets `off` and `off + 1`) plus the
+ * rounding constant loaded from constants1, and is narrowed by
+ * vis_pack16 using GSR scale factor 5.
+ * NOTE(review): presumably (a + b + c + d + 1) >> 2 per byte -- confirm
+ * against the vis_* macro definitions outside this chunk.
+ *
+ * Two output rows are produced per loop iteration (height is halved
+ * before the do/while, so height is expected to be even and non-zero).
+ * REF_S0/REF_S2 carry the most recent reference row across iterations;
+ * REF_S4/REF_S6 hold the middle row of each pair.
+ */
+static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+				      const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants1[0], CONST_1);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	/* off == 7: the "off + 1" data is the following doubleword itself. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+
+	height >>= 1;
+	do {	/* 26 cycles */
+		/* Widen the carried row (REF_S0/REF_S2) while loading the next two. */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0,   CONST_256, TMP8);
+		vis_pmerge(ZERO,        REF_S2,    TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+		vis_pmerge(ZERO,        REF_S2_1,  TMP14);
+
+		vis_ld64_2(ref, stride, TMP4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_S4);
+
+		vis_pmerge(ZERO, REF_S4, TMP18);
+
+		vis_pmerge(ZERO, REF_S4_1, TMP20);
+
+		vis_faligndata(TMP4, TMP6, REF_S0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+
+		/* The rounding constant is folded into the middle row sums
+		 * (TMP18/TMP20), which are shared by both output rows. */
+		vis_padd16(TMP18, CONST_1, TMP18);
+		vis_mul8x16au(REF_S6,   CONST_256, TMP22);
+
+		vis_padd16(TMP20, CONST_1, TMP20);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+		vis_mul8x16au(REF_S0,   CONST_256, TMP26);
+		vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+		vis_mul8x16au(REF_S2,   CONST_256, TMP30);
+		vis_padd16(TMP18, TMP22, TMP18);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		/* First output row. */
+		vis_padd16(TMP8,  TMP18, TMP8);
+
+		vis_padd16(TMP10, TMP20, TMP10);
+
+		vis_padd16(TMP8,  TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP8,  DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		/* Second output row. */
+		vis_padd16(TMP18, TMP26, TMP18);
+
+		vis_padd16(TMP20, TMP28, TMP20);
+
+		vis_padd16(TMP18, TMP30, TMP18);
+
+		vis_padd16(TMP20, TMP32, TMP20);
+		vis_pack16(TMP18, DST_2);
+
+		vis_pack16(TMP20, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+/*
+ * 16-byte-wide "avg, no round, xy" routine.
+ *
+ * Each output row sums four widened reference extractions (current and
+ * next row, each at alignment offsets `off` and `off + 1`), the
+ * existing destination row weighted through CONST_1024, and the
+ * constant loaded from constants6; the sum is narrowed by vis_pack16
+ * using GSR scale factor 4.
+ * NOTE(review): presumably (a + b + c + d + 4 * dest + 6) >> 3 per
+ * byte -- confirm against the vis_* macro definitions and the
+ * constants256_1024/constants6 tables, which are outside this chunk.
+ *
+ * Two output rows are produced per loop iteration (height is halved
+ * before the do/while, so height is expected to be even and non-zero).
+ * REF_S0/REF_S2/REF_S4/REF_S6 carry the most recent reference row
+ * across iterations.  Once their original contents have been consumed,
+ * REF_0/REF_2/REF_4/REF_6 are reused as 16-bit scratch accumulators,
+ * so the statement order below matters.
+ */
+static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+				       const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants6[0], CONST_6);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+
+	/* off == 7: the "off + 1" data is the following doubleword itself. */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+
+	height >>= 1;
+	do {	/* 55 cycles */
+		/* Widen the carried row (REF_S*) while loading the next two rows. */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP8, TMP10, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_0, TMP0);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		/* From here REF_0 and REF_2 are overwritten with the weighted
+		 * destination values for bytes 8..15 of the first row. */
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP0, CONST_6, TMP0);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP2, CONST_6, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+
+		/* REF_4 and REF_6 are reused for the widened new REF_S0 row. */
+		vis_padd16(TMP12, TMP16, TMP12);
+		vis_mul8x16au(REF_S0, CONST_256, REF_4);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+
+		/* First output row, bytes 0..7. */
+		vis_padd16(TMP12, TMP30, TMP12);
+
+		vis_padd16(TMP14, TMP32, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP4, CONST_6, TMP4);
+
+		/* Fetch the second destination row (bytes 0..7) early. */
+		vis_ld64_2(dest, stride, DST_0);
+		vis_padd16(TMP6, CONST_6, TMP6);
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+		vis_padd16(TMP4, TMP8, TMP4);
+		vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
+
+		vis_padd16(TMP6, TMP10, TMP6);
+
+		/* First output row, bytes 8..15. */
+		vis_padd16(TMP20, TMP4, TMP20);
+
+		vis_padd16(TMP22, TMP6, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_padd16(TMP20, REF_0, TMP20);
+		vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+		vis_padd16(TMP22, REF_2, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+
+		/* Second output row, bytes 0..7. */
+		vis_ld64_2(dest, 8, DST_2);
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO,      REF_S4_1,  REF_2);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_padd16(REF_4, TMP0, TMP8);
+
+		vis_mul8x16au(REF_S6, CONST_256, REF_4);
+		vis_padd16(REF_6, TMP2, TMP10);
+
+		vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+
+		/* Second output row, bytes 8..15. */
+		vis_padd16(REF_0, TMP4, REF_0);
+
+		vis_mul8x16al(DST_2,   CONST_1024, TMP30);
+		vis_padd16(REF_2, TMP6, REF_2);
+
+		vis_mul8x16al(DST_3,   CONST_1024, TMP32);
+		vis_padd16(REF_0, REF_4, REF_0);
+
+		vis_padd16(REF_2, REF_6, REF_2);
+
+		vis_padd16(REF_0, TMP30, REF_0);
+
+		/* stall */
+
+		vis_padd16(REF_2, TMP32, REF_2);
+		vis_pack16(REF_0, DST_2);
+
+		vis_pack16(REF_2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+				      const int stride, int height)
+{
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	int stride_8 = stride + 8;
+	/* Mixes each 8-byte dest row with reference bytes taken at x and x+1 from two consecutive rows of _ref. */
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+	/* off is the byte offset of _ref inside its 8-byte word; ref carries on with the value alignaddr hands back. */
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64(constants6[0], CONST_6);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+	/* REF_S2: the same row one byte further on; with off == 7 that is exactly the second doubleword (TMP2). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+	/* Each pass below writes two dest rows, so height is expected to be even and at least 2. */
+	height >>= 1;
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP8);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP14);
+		/* select byte offset off again; when off != 7 the last alignaddr call used off_plus_1 */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_faligndata(TMP0, TMP2, REF_S4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP4, TMP6, REF_S0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		/* one-byte-shifted versions of the two rows just extracted (REF_S6 and the new REF_S2) */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+		/* NOTE(review): each output looks like (4 ref bytes + 4*dest + 6) scaled down by vis_pack16 -- confirm against the VIS manual and the constants tables */
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_S4, TMP22);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP24);
+
+		vis_mul8x16au(REF_S6, CONST_256, TMP26);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP28);
+
+		vis_mul8x16au(REF_S0, CONST_256, REF_S4);
+		vis_padd16(TMP22, CONST_6, TMP22);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+		vis_padd16(TMP24, CONST_6, TMP24);
+
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP24, TMP28, TMP24);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP26);
+		vis_padd16(TMP8, TMP22, TMP8);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+		vis_padd16(TMP10, TMP24, TMP10);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		/* second output row: reuses the sums kept in TMP22/TMP24 and adds the newest reference row */
+		vis_padd16(REF_S4, TMP22, TMP12);
+
+		vis_padd16(REF_S6, TMP24, TMP14);
+
+		vis_padd16(TMP12, TMP26, TMP12);
+
+		vis_padd16(TMP14, TMP28, TMP14);
+
+		vis_padd16(TMP12, REF_0, TMP12);
+
+		vis_padd16(TMP14, REF_2, TMP14);
+		vis_pack16(TMP12, DST_2);
+
+		vis_pack16(TMP14, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+/* End of no rounding code */
+
+void get_pixels_vis(uint8_t *restrict dest, const uint8_t *_ref, int stride)
+{
+  int i;
+  uint8_t *ref = (uint8_t*)_ref;
+  ref = vis_alignaddr(ref);
+  /* Copies 8 bytes from each of 8 source rows (stride apart) into 64 consecutive bytes at dest. */
+  for (i = 0; i < 8; i++)
+    {
+      vis_ld64(ref[0], TMP0);  /* NOTE(review): loads from the pointer alignaddr returned, with no faligndata afterwards -- confirm _ref is always 8-byte aligned */
+      vis_st64(TMP0, dest[0]);
+      dest += 8;  /* NOTE(review): output stays 8-bit; check this matches the element type that users of DSPContext.get_pixels expect */
+      ref += stride;
+    }
+}
+
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+ 
+static void sigill_handler (int sig)
+{   /* SIGILL handler installed by vis_level() while it probes opcodes. */
+    if (!canjump) {     /* no probe is armed: jmpbuf is unset or stale */
+        signal (sig, SIG_DFL);
+        raise (sig);
+        return;         /* let the default action run instead of jumping through jmpbuf */
+    }
+    canjump = 0;
+    siglongjmp (jmpbuf, 1);  /* resume at the armed sigsetjmp() in vis_level() */
+}
+
+#define ACCEL_SPARC_VIS 1
+#define ACCEL_SPARC_VIS2 2
+
+static int vis_level ()
+{
+    int accel = 0;
+    /* Returns a mask of ACCEL_SPARC_* bits for the probe opcodes that ran without raising SIGILL. */
+    signal (SIGILL, sigill_handler);
+    if (sigsetjmp (jmpbuf, 1)) {
+        signal (SIGILL, SIG_DFL);
+        return accel;
+    }
+ 
+    canjump = 1;
+ 
+    /* pdist %f0, %f0, %f0 */
+    __asm__ __volatile__(".word\t0x81b007c0");
+    /* reached only if the opcode above did not trap */
+    canjump = 0;
+    accel |= ACCEL_SPARC_VIS;
+    /* new jump target, so a trap in the second probe still returns the bit set above */
+    if (sigsetjmp (jmpbuf, 1)) {
+        signal (SIGILL, SIG_DFL);
+        return accel;
+    }
+                                                                                
+    canjump = 1;
+                                                                                
+    /* edge8n %g0, %g0, %g0 */
+    __asm__ __volatile__(".word\t0x81b00020");
+    /* reached only if the second opcode did not trap */
+    canjump = 0;
+    accel |= ACCEL_SPARC_VIS2;
+    /* NOTE(review): every exit leaves SIGILL at SIG_DFL; a handler installed before this call is not put back */
+    signal (SIGILL, SIG_DFL);
+
+    return accel;
+}
+
+/* libavcodec initialization code */
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
+{
+  /* Installs this file's VIS routines into c when vis_level() reports ACCEL_SPARC_VIS; avctx is not used here. */
+  int accel = vis_level ();
+
+  if (accel & ACCEL_SPARC_VIS) {
+      c->get_pixels = get_pixels_vis;  /* NOTE(review): get_pixels_vis takes uint8_t *dest -- confirm it matches the member's declared type */
+      c->put_pixels_tab[0][0] = MC_put_o_16_vis;
+      c->put_pixels_tab[0][1] = MC_put_x_16_vis;
+      c->put_pixels_tab[0][2] = MC_put_y_16_vis;
+      c->put_pixels_tab[0][3] = MC_put_xy_16_vis;
+      /* row [1] of every table takes the _8_ routines; columns 0..3 are the o, x, y, xy routines */
+      c->put_pixels_tab[1][0] = MC_put_o_8_vis;
+      c->put_pixels_tab[1][1] = MC_put_x_8_vis;
+      c->put_pixels_tab[1][2] = MC_put_y_8_vis;
+      c->put_pixels_tab[1][3] = MC_put_xy_8_vis;
+      /* avg routines, _16_ */
+      c->avg_pixels_tab[0][0] = MC_avg_o_16_vis;
+      c->avg_pixels_tab[0][1] = MC_avg_x_16_vis;
+      c->avg_pixels_tab[0][2] = MC_avg_y_16_vis;
+      c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis;
+      /* avg routines, _8_ */
+      c->avg_pixels_tab[1][0] = MC_avg_o_8_vis;
+      c->avg_pixels_tab[1][1] = MC_avg_x_8_vis;
+      c->avg_pixels_tab[1][2] = MC_avg_y_8_vis;
+      c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis;
+      /* put_no_round routines, _16_ */
+      c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis;
+      c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis;
+      c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis;
+      c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis;
+      /* put_no_round routines, _8_ */
+      c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis;
+      c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis;
+      c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis;
+      c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis;
+      /* avg_no_round routines, _16_ */
+      c->avg_no_rnd_pixels_tab[0][0] = MC_avg_no_round_o_16_vis;
+      c->avg_no_rnd_pixels_tab[0][1] = MC_avg_no_round_x_16_vis;
+      c->avg_no_rnd_pixels_tab[0][2] = MC_avg_no_round_y_16_vis;
+      c->avg_no_rnd_pixels_tab[0][3] = MC_avg_no_round_xy_16_vis;
+      /* avg_no_round routines, _8_ */
+      c->avg_no_rnd_pixels_tab[1][0] = MC_avg_no_round_o_8_vis;
+      c->avg_no_rnd_pixels_tab[1][1] = MC_avg_no_round_x_8_vis;
+      c->avg_no_rnd_pixels_tab[1][2] = MC_avg_no_round_y_8_vis;
+      c->avg_no_rnd_pixels_tab[1][3] = MC_avg_no_round_xy_8_vis;
+  }
+}
+
+#endif  /* !(ARCH_SPARC) */
diff --git a/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c b/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c
new file mode 100644
index 000000000..a09ee4e28
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/libavcodec_sparc_dummy.c
@@ -0,0 +1,2 @@
+
+char libavcodec_sparc_dummy; /* placeholder symbol for the sparc directory; NOTE(review): renamed from the copy/pasted mlib name to avoid a duplicate definition -- confirm nothing references the old name */
diff --git a/src/libffmpeg/libavcodec/sparc/vis.h b/src/libffmpeg/libavcodec/sparc/vis.h
new file mode 100644
index 000000000..07dda2949
--- /dev/null
+++ b/src/libffmpeg/libavcodec/sparc/vis.h
@@ -0,0 +1,328 @@
+/*
+ * vis.h
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* You may be asking why I hard-code the instruction opcodes and don't
+ * use the normal VIS assembler mnemonics for the VIS instructions.
+ *
+ * The reason is that Sun, in their infinite wisdom, decided that a binary
+ * using a VIS instruction will cause it to be marked (in the ELF headers)
+ * as doing so, and this prevents the OS from loading such binaries if the
+ * current cpu doesn't have VIS.  There is no way to easily override this
+ * behavior of the assembler that I am aware of.
+ *
+ * This totally defeats what libmpeg2 is trying to do which is allow a
+ * single binary to be created, and then detect the availability of VIS
+ * at runtime.
+ *
+ * I'm not saying that tainting the binary by default is bad, rather I'm
+ * saying that not providing a way to override this easily unnecessarily
+ * ties people's hands.
+ *
+ * Thus, we do the opcode encoding by hand and output 32-bit words in
+ * the assembler to keep the binary from becoming tainted.
+ */
+
+#define vis_opc_base	((0x1 << 31) | (0x36 << 19))	/* bits shared by every encoded word: bit 31 set, 0x36 placed at bit 19 */
+#define vis_opf(X)	((X) << 5)	/* opf selector, placed at bit 5 */
+#define vis_sreg(X)	(X)	/* single register: the number is used as is */
+#define vis_dreg(X)	(((X)&0x1f)|((X)>>5))	/* double register: bit 5 of the number is folded into bit 0 */
+#define vis_rs1_s(X)	(vis_sreg(X) << 14)	/* rs1 field starts at bit 14 */
+#define vis_rs1_d(X)	(vis_dreg(X) << 14)
+#define vis_rs2_s(X)	(vis_sreg(X) << 0)	/* rs2 field starts at bit 0 */
+#define vis_rs2_d(X)	(vis_dreg(X) << 0)
+#define vis_rd_s(X)	(vis_sreg(X) << 25)	/* rd field starts at bit 25 */
+#define vis_rd_d(X)	(vis_dreg(X) << 25)
+
+#define vis_ss2s(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_dd2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_d(rs1) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_ss2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_sd2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d2s(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s2d(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d12d(opf,rs1,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_d(rs1) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d22d(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_s12s(opf,rs1,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s22s(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s(opf,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rd_s(rd)))
+
+#define vis_d(opf,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rd_d(rd)))
+
+#define vis_r2m(op,rd,mem) \
+	__asm__ __volatile__ (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) )
+
+#define vis_r2m_2(op,rd,mem1,mem2) \
+	__asm__ __volatile__ (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) )
+
+#define vis_m2r(op,mem,rd) \
+	__asm__ __volatile__ (#op "\t[%0], %%f" #rd : : "r" (&(mem)) )
+
+#define vis_m2r_2(op,mem1,mem2,rd) \
+	__asm__ __volatile__ (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) )
+
+static inline void vis_set_gsr(unsigned int _val)
+{
+	register unsigned int val asm("g1");
+	/* The hand-encoded word below names %g1 as its source, so the value is pinned to that register. */
+	val = _val;
+	__asm__ __volatile__(".word 0xa7804000"	/* decodes as wr %g1, %g0, %asr19; NOTE(review): %asr19 should be the GSR -- confirm */
+			     : : "r" (val));
+}
+
+#define VIS_GSR_ALIGNADDR_MASK	0x0000007
+#define VIS_GSR_ALIGNADDR_SHIFT	0
+#define VIS_GSR_SCALEFACT_MASK	0x0000078
+#define VIS_GSR_SCALEFACT_SHIFT	3
+
+#define vis_ld32(mem,rs1)		vis_m2r(ld, mem, rs1)
+#define vis_ld32_2(mem1,mem2,rs1)	vis_m2r_2(ld, mem1, mem2, rs1)
+#define vis_st32(rs1,mem)		vis_r2m(st, rs1, mem)
+#define vis_st32_2(rs1,mem1,mem2)	vis_r2m_2(st, rs1, mem1, mem2)
+#define vis_ld64(mem,rs1)		vis_m2r(ldd, mem, rs1)
+#define vis_ld64_2(mem1,mem2,rs1)	vis_m2r_2(ldd, mem1, mem2, rs1)
+#define vis_st64(rs1,mem)		vis_r2m(std, rs1, mem)
+#define vis_st64_2(rs1,mem1,mem2)	vis_r2m_2(std, rs1, mem1, mem2)
+
+#define vis_ldblk(mem, rd) \
+do {	register void *__mem asm("g1"); \
+	__mem = &(mem); \
+	__asm__ __volatile__(".word 0xc1985e00 | %1" \
+			     : \
+			     : "r" (__mem), \
+			       "i" (vis_rd_d(rd)) \
+			     : "memory"); \
+} while (0)
+
+#define vis_stblk(rd, mem) \
+do {	register void *__mem asm("g1"); \
+	__mem = &(mem); \
+	__asm__ __volatile__(".word 0xc1b85e00 | %1" \
+			     : \
+			     : "r" (__mem), \
+			       "i" (vis_rd_d(rd)) \
+			     : "memory"); \
+} while (0)
+
+#define vis_membar_storestore()	\
+	__asm__ __volatile__(".word 0x8143e008" : : : "memory")
+
+#define vis_membar_sync()	\
+	__asm__ __volatile__(".word 0x8143e040" : : : "memory")
+
+/* 16 and 32 bit partitioned addition and subtraction.  The normal
+ * versions perform 4 16-bit or 2 32-bit additions or subtractions.
+ * The 's' versions perform 2 16-bit or 1 32-bit additions or
+ * subtractions.
+ */
+
+#define vis_padd16(rs1,rs2,rd)		vis_dd2d(0x50, rs1, rs2, rd)
+#define vis_padd16s(rs1,rs2,rd)		vis_ss2s(0x51, rs1, rs2, rd)
+#define vis_padd32(rs1,rs2,rd)		vis_dd2d(0x52, rs1, rs2, rd)
+#define vis_padd32s(rs1,rs2,rd)		vis_ss2s(0x53, rs1, rs2, rd)
+#define vis_psub16(rs1,rs2,rd)		vis_dd2d(0x54, rs1, rs2, rd)
+#define vis_psub16s(rs1,rs2,rd)		vis_ss2s(0x55, rs1, rs2, rd)
+#define vis_psub32(rs1,rs2,rd)		vis_dd2d(0x56, rs1, rs2, rd)
+#define vis_psub32s(rs1,rs2,rd)		vis_ss2s(0x57, rs1, rs2, rd)
+
+/* Pixel formatting instructions.  */
+
+#define vis_pack16(rs2,rd)		vis_d2s( 0x3b,      rs2, rd)
+#define vis_pack32(rs1,rs2,rd)		vis_dd2d(0x3a, rs1, rs2, rd)
+#define vis_packfix(rs2,rd)		vis_d2s( 0x3d,      rs2, rd)
+#define vis_expand(rs2,rd)		vis_s2d( 0x4d,      rs2, rd)
+#define vis_pmerge(rs1,rs2,rd)		vis_ss2d(0x4b, rs1, rs2, rd)
+
+/* Partitioned multiply instructions.  */
+
+#define vis_mul8x16(rs1,rs2,rd)		vis_sd2d(0x31, rs1, rs2, rd)
+#define vis_mul8x16au(rs1,rs2,rd)	vis_ss2d(0x33, rs1, rs2, rd)
+#define vis_mul8x16al(rs1,rs2,rd)	vis_ss2d(0x35, rs1, rs2, rd)
+#define vis_mul8sux16(rs1,rs2,rd)	vis_dd2d(0x36, rs1, rs2, rd)
+#define vis_mul8ulx16(rs1,rs2,rd)	vis_dd2d(0x37, rs1, rs2, rd)
+#define vis_muld8sux16(rs1,rs2,rd)	vis_ss2d(0x38, rs1, rs2, rd)
+#define vis_muld8ulx16(rs1,rs2,rd)	vis_ss2d(0x39, rs1, rs2, rd)
+
+/* Alignment instructions.  */
+
+static inline void *vis_alignaddr(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Emits opf 0x18 with rs1 = 1, rs2 = 0, rd = 1; ptr is pinned to %g1 to match the encoded rs1/rd. */
+	ptr = _ptr;
+	/* NOTE(review): per the VIS alignaddr definition the result should be _ptr rounded down to 8 bytes, with the low bits kept in the GSR -- confirm */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x18) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(1)));
+	/* the caller receives whatever the instruction left in %g1 */
+	return ptr;
+}
+
+static inline void vis_alignaddr_g0(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Same encoding as vis_alignaddr() except rd = 0, and nothing is returned to the caller. */
+	ptr = _ptr;
+	/* NOTE(review): with rd = 0 only the instruction's side effect (presumably the GSR alignment offset) should remain -- confirm */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x18) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(0)));
+}
+
+static inline void *vis_alignaddrl(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Identical to vis_alignaddr() except for the opf value, 0x19 instead of 0x18. */
+	ptr = _ptr;
+	/* NOTE(review): opf 0x19 is presumably the little-endian alignaddr variant -- confirm in the VIS manual */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x19) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(1)));
+	/* the caller receives whatever the instruction left in %g1 */
+	return ptr;
+}
+
+static inline void vis_alignaddrl_g0(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Same encoding as vis_alignaddrl() except rd = 0, and nothing is returned to the caller. */
+	ptr = _ptr;
+	/* NOTE(review): with rd = 0 only the instruction's side effect should remain -- confirm */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x19) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(0)));
+}
+
+#define vis_faligndata(rs1,rs2,rd)	vis_dd2d(0x48, rs1, rs2, rd)
+
+/* Logical operate instructions.  */
+
+#define vis_fzero(rd)			vis_d(   0x60,           rd)
+#define vis_fzeros(rd)			vis_s(   0x61,           rd)
+#define vis_fone(rd)			vis_d(   0x7e,           rd)
+#define vis_fones(rd)			vis_s(   0x7f,           rd)
+#define vis_src1(rs1,rd)		vis_d12d(0x74, rs1,      rd)
+#define vis_src1s(rs1,rd)		vis_s12s(0x75, rs1,      rd)
+#define vis_src2(rs2,rd)		vis_d22d(0x78,      rs2, rd)
+#define vis_src2s(rs2,rd)		vis_s22s(0x79,      rs2, rd)
+#define vis_not1(rs1,rd)		vis_d12d(0x6a, rs1,      rd)
+#define vis_not1s(rs1,rd)		vis_s12s(0x6b, rs1,      rd)
+#define vis_not2(rs2,rd)		vis_d22d(0x66,      rs2, rd)
+#define vis_not2s(rs2,rd)		vis_s22s(0x67,      rs2, rd)
+#define vis_or(rs1,rs2,rd)		vis_dd2d(0x7c, rs1, rs2, rd)
+#define vis_ors(rs1,rs2,rd)		vis_ss2s(0x7d, rs1, rs2, rd)
+#define vis_nor(rs1,rs2,rd)		vis_dd2d(0x62, rs1, rs2, rd)
+#define vis_nors(rs1,rs2,rd)		vis_ss2s(0x63, rs1, rs2, rd)
+#define vis_and(rs1,rs2,rd)		vis_dd2d(0x70, rs1, rs2, rd)
+#define vis_ands(rs1,rs2,rd)		vis_ss2s(0x71, rs1, rs2, rd)
+#define vis_nand(rs1,rs2,rd)		vis_dd2d(0x6e, rs1, rs2, rd)
+#define vis_nands(rs1,rs2,rd)		vis_ss2s(0x6f, rs1, rs2, rd)
+#define vis_xor(rs1,rs2,rd)		vis_dd2d(0x6c, rs1, rs2, rd)
+#define vis_xors(rs1,rs2,rd)		vis_ss2s(0x6d, rs1, rs2, rd)
+#define vis_xnor(rs1,rs2,rd)		vis_dd2d(0x72, rs1, rs2, rd)
+#define vis_xnors(rs1,rs2,rd)		vis_ss2s(0x73, rs1, rs2, rd)
+#define vis_ornot1(rs1,rs2,rd)		vis_dd2d(0x7a, rs1, rs2, rd)
+#define vis_ornot1s(rs1,rs2,rd)		vis_ss2s(0x7b, rs1, rs2, rd)
+#define vis_ornot2(rs1,rs2,rd)		vis_dd2d(0x76, rs1, rs2, rd)
+#define vis_ornot2s(rs1,rs2,rd)		vis_ss2s(0x77, rs1, rs2, rd)
+#define vis_andnot1(rs1,rs2,rd)		vis_dd2d(0x68, rs1, rs2, rd)
+#define vis_andnot1s(rs1,rs2,rd)	vis_ss2s(0x69, rs1, rs2, rd)
+#define vis_andnot2(rs1,rs2,rd)		vis_dd2d(0x64, rs1, rs2, rd)
+#define vis_andnot2s(rs1,rs2,rd)	vis_ss2s(0x65, rs1, rs2, rd)
+
+/* Pixel component distance.  */
+
+#define vis_pdist(rs1,rs2,rd)		vis_dd2d(0x3e, rs1, rs2, rd)
diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c
index 6a15270b7..781194f03 100644
--- a/src/libffmpeg/libavcodec/svq1.c
+++ b/src/libffmpeg/libavcodec/svq1.c
@@ -783,6 +783,8 @@ static int svq1_decode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     int i;
 
+    MPV_decode_defaults(s);
+
     s->avctx = avctx;
     s->width = (avctx->width+3)&~3;
     s->height = (avctx->height+3)&~3;
diff --git a/src/libffmpeg/libavcodec/truemotion1.c b/src/libffmpeg/libavcodec/truemotion1.c
index 35bf3a788..2f6310192 100644
--- a/src/libffmpeg/libavcodec/truemotion1.c
+++ b/src/libffmpeg/libavcodec/truemotion1.c
@@ -36,9 +36,6 @@
 #include "avcodec.h"
 #include "dsputil.h"
 
-#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log
-#define fprintf(...) {} 
-
 #include "truemotion1data.h"
 
 typedef struct TrueMotion1Context {
@@ -232,7 +229,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     header.header_size = ((s->buf[0] >> 5) | (s->buf[0] << 3)) & 0x7f;
     if (s->buf[0] < 0x10)
     {
-        printf("invalid header size\n");
+        av_log(s->avctx, AV_LOG_ERROR, "invalid header size\n");
         return -1;
     }
 
@@ -282,7 +279,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     }
 
     if (header.compression > 17) {
-        printf("invalid compression type (%d)\n", header.compression);
+        av_log(s->avctx, AV_LOG_ERROR, "invalid compression type (%d)\n", header.compression);
         return -1;
     }
     
@@ -296,7 +293,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
         if (header.vectable < 4)
             sel_vector_table = tables[header.vectable - 1];
         else {
-            printf("invalid vector table id (%d)\n", header.vectable);
+            av_log(s->avctx, AV_LOG_ERROR, "invalid vector table id (%d)\n", header.vectable);
             return -1;
         }
     }
@@ -305,7 +302,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     {
         if (compression_types[header.compression].algorithm == ALGO_RGB24H)
         {
-            printf("24bit compression not yet supported\n");
+            av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n");
         }
         else
             gen_vector_table(s, sel_vector_table);
@@ -354,7 +351,7 @@ static int truemotion1_decode_init(AVCodecContext *avctx)
 #define GET_NEXT_INDEX() \
 {\
     if (index_stream_index >= s->index_stream_size) { \
-        printf (" help! truemotion1 decoder went out of bounds\n"); \
+        av_log(s->avctx, AV_LOG_INFO, " help! truemotion1 decoder went out of bounds\n"); \
         return; \
     } \
     index = s->index_stream[index_stream_index++] * 4; \
@@ -542,7 +539,7 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
 
     s->frame.reference = 1;
     if (avctx->get_buffer(avctx, &s->frame) < 0) {
-        fprintf(stderr, "truemotion1: get_buffer() failed\n");
+        av_log(s->avctx, AV_LOG_ERROR, "truemotion1: get_buffer() failed\n");
         return -1;
     }
 
@@ -561,7 +558,7 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
         memcpy(s->frame.data[0], s->prev_frame.data[0],
             s->frame.linesize[0] * s->avctx->height);
     } else if (compression_types[s->compression].algorithm == ALGO_RGB24H) {
-        printf ("  24-bit Duck TrueMotion decoding not yet implemented\n");
+        av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n");
     } else {
         truemotion1_decode_16bit(s);
     }
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index 145f9df65..ffa0cb855 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -60,47 +60,40 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size)
     if(min_size < *size) 
         return ptr;
     
-    *size= min_size + 10*1024;
+    *size= 17*min_size/16 + 32;
 
     return av_realloc(ptr, *size);
 }
 
 
-/* allocation of static arrays - do not use for normal allocation */
 static unsigned int last_static = 0;
-static char*** array_static = NULL;
-static const unsigned int grow_static = 64; // ^2
-void *__av_mallocz_static(void** location, unsigned int size)
+static unsigned int allocated_static = 0;
+static void** array_static = NULL;
+
+/**
+ * allocation of static arrays - do not use for normal allocation.
+ */
+void *av_mallocz_static(unsigned int size)
 {
-    unsigned int l = (last_static + grow_static) & ~(grow_static - 1);
     void *ptr = av_mallocz(size);
-    if (!ptr)
-	return NULL;
-
-    if (location)
-    {
-	if (l > last_static)
-	    array_static = av_realloc(array_static, l);
-	array_static[last_static++] = (char**) location;
-	*location = ptr;
+
+    if(ptr){ 
+        array_static =av_fast_realloc(array_static, &allocated_static, sizeof(void*)*(last_static+1));
+        array_static[last_static++] = ptr;
     }
+
     return ptr;
 }
-/* free all static arrays and reset pointers to 0 */
+
+/**
+ * free all static arrays and reset the internal pointer table; pointers held by callers are not reset.
+ */
 void av_free_static(void)
 {
-    if (array_static)
-    {
-	unsigned i;
-	for (i = 0; i < last_static; i++)
-	{
-	    av_free(*array_static[i]);
-            *array_static[i] = NULL;
-	}
-	av_free(array_static);
-	array_static = 0;
+    while(last_static){
+        av_freep(&array_static[--last_static]);
     }
-    last_static = 0;
+    av_freep(&array_static);
 }
 
 /**
diff --git a/src/libffmpeg/libavcodec/vmdav.c b/src/libffmpeg/libavcodec/vmdav.c
index 47c77513d..c09af1369 100644
--- a/src/libffmpeg/libavcodec/vmdav.c
+++ b/src/libffmpeg/libavcodec/vmdav.c
@@ -47,9 +47,6 @@
 #include "avcodec.h"
 #include "dsputil.h"
 
-#define printf(...) {} //(f)printf() usage is forbidden in libavcodec, use av_log
-#define fprintf(...) {} 
-
 #define VMD_HEADER_SIZE 0x330
 #define PALETTE_COUNT 256
 
@@ -245,7 +242,7 @@ static void vmd_decode(VmdVideoContext *s)
                     }
                 } while (ofs < frame_width);
                 if (ofs > frame_width) {
-                    printf (" VMD video: offset > width (%d > %d)\n",
+                    av_log(s->avctx, AV_LOG_ERROR, "VMD video: offset > width (%d > %d)\n",
                         ofs, frame_width);
                     break;
                 }
@@ -283,7 +280,7 @@ static void vmd_decode(VmdVideoContext *s)
                     }
                 } while (ofs < frame_width);
                 if (ofs > frame_width) {
-                    printf (" VMD video: offset > width (%d > %d)\n",
+                    av_log(s->avctx, AV_LOG_ERROR, "VMD video: offset > width (%d > %d)\n",
                         ofs, frame_width);
                 }
                 dp += s->frame.linesize[0];
@@ -311,7 +308,7 @@ static int vmdvideo_decode_init(AVCodecContext *avctx)
 
     /* make sure the VMD header made it */
     if (s->avctx->extradata_size != VMD_HEADER_SIZE) {
-        printf("  VMD video: expected extradata size of %d\n", 
+        av_log(s->avctx, AV_LOG_ERROR, "VMD video: expected extradata size of %d\n", 
             VMD_HEADER_SIZE);
         return -1;
     }
@@ -350,7 +347,7 @@ static int vmdvideo_decode_frame(AVCodecContext *avctx,
 
     s->frame.reference = 1;
     if (avctx->get_buffer(avctx, &s->frame)) {
-        printf ("  VMD Video: get_buffer() failed\n");
+        av_log(s->avctx, AV_LOG_ERROR, "VMD Video: get_buffer() failed\n");
         return -1;
     }
 
@@ -389,6 +386,7 @@ static int vmdvideo_decode_end(AVCodecContext *avctx)
  */
 
 typedef struct VmdAudioContext {
+    AVCodecContext *avctx;
     int channels;
     int bits;
     int block_align;
@@ -403,12 +401,13 @@ static int vmdaudio_decode_init(AVCodecContext *avctx)
     VmdAudioContext *s = (VmdAudioContext *)avctx->priv_data;
     int i;
 
+    s->avctx = avctx;
     s->channels = avctx->channels;
     s->bits = avctx->bits_per_sample;
     s->block_align = avctx->block_align;
 
-printf ("  %d channels, %d bits/sample, block align = %d, sample rate = %d\n",
-  s->channels, s->bits, s->block_align, avctx->sample_rate);
+    av_log(s->avctx, AV_LOG_DEBUG, "%d channels, %d bits/sample, block align = %d, sample rate = %d\n",
+	    s->channels, s->bits, s->block_align, avctx->sample_rate);
 
     /* set up the steps8 and steps16 tables */
     for (i = 0; i < 8; i++) {
@@ -465,8 +464,8 @@ static int vmdaudio_loadsound(VmdAudioContext *s, unsigned char *data,
     int bytes_decoded = 0;
     int i;
 
-if (silence)
-  printf (" silent block!\n");
+    if (silence)
+	av_log(s->avctx, AV_LOG_INFO, "silent block!\n");
     if (s->channels == 2) {
 
         /* stereo handling */
@@ -520,7 +519,6 @@ static int vmdaudio_decode_frame(AVCodecContext *avctx,
     unsigned char *p = buf + 16;
     unsigned char *p_end = buf + buf_size;
 
-printf ("    processing audio frame with %d bytes\n", buf_size);
     if (buf_size < 16)
         return buf_size;
 
@@ -529,7 +527,6 @@ printf ("    processing audio frame with %d bytes\n", buf_size);
         /* the chunk contains audio */
         *data_size = vmdaudio_loadsound(s, output_samples, p, 0);
     } else if (buf[6] == 2) {
-printf ("  hey! audio case #2\n");
         /* the chunk contains audio and silence mixed together */
         sound_flags = LE_32(p);
         p += 4;
@@ -549,13 +546,10 @@ printf ("  hey! audio case #2\n");
             sound_flags >>= 1;
         }
     } else if (buf[6] == 3) {
-printf ("  hey! audio case #3\n");
         /* silent chunk */
         *data_size = vmdaudio_loadsound(s, output_samples, p, 1);
     }
 
-printf ("      final sample count = %d, byte count = %d\n", (*data_size) / 2,
-  *data_size);
     return buf_size;
 }
 
diff --git a/src/libffmpeg/libavcodec/vp3.c b/src/libffmpeg/libavcodec/vp3.c
index eadfd39b9..0667d99eb 100644
--- a/src/libffmpeg/libavcodec/vp3.c
+++ b/src/libffmpeg/libavcodec/vp3.c
@@ -268,9 +268,11 @@ typedef struct Vp3DecodeContext {
     VLC ac_vlc_3[16];
     VLC ac_vlc_4[16];
 
-    int16_t intra_y_dequant[64];
-    int16_t intra_c_dequant[64];
-    int16_t inter_dequant[64];
+    /* these arrays need to be on 16-byte boundaries since SSE2 operations
+     * index into them */
+    int16_t __align16 intra_y_dequant[64];
+    int16_t __align16 intra_c_dequant[64];
+    int16_t __align16 inter_dequant[64];
 
     /* This table contains superblock_count * 16 entries. Each set of 16
      * numbers corresponds to the fragment indices 0..15 of the superblock.
diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c
index 25498c4d2..cf2db1494 100644
--- a/src/libffmpeg/libavcodec/wmadec.c
+++ b/src/libffmpeg/libavcodec/wmadec.c
@@ -20,6 +20,15 @@
 /**
  * @file wmadec.c
  * WMA compatible decoder.
+ * This decoder handles Microsoft Windows Media Audio data, versions 1 & 2.
+ * WMA v1 is identified by audio format 0x160 in Microsoft media files 
+ * (ASF/AVI/WAV). WMA v2 is identified by audio format 0x161.
+ *
+ * To use this decoder, a calling application must supply the extra data
+ * bytes provided with the WMA data. These are the extra, codec-specific
+ * bytes at the end of a WAVEFORMATEX data structure. Transmit these bytes 
+ * to the decoder using the extradata[_size] fields in AVCodecContext. There 
+ * should be 4 extra bytes for v1 data and 6 extra bytes for v2 data.
  */
 
 #include "avcodec.h"
diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c
index 130a7f89d..376f0706e 100644
--- a/src/libffmpeg/libavcodec/wmv2.c
+++ b/src/libffmpeg/libavcodec/wmv2.c
@@ -181,7 +181,7 @@ int ff_wmv2_encode_picture_header(MpegEncContext * s, int picture_number)
         put_bits(&s->pb, 1, s->dc_table_index);
         put_bits(&s->pb, 1, s->mv_table_index);
     
-        s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
+        s->inter_intra_pred= 0;//(s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
     }
     s->esc3_level_length= 0;
     s->esc3_run_length= 0;
@@ -216,7 +216,7 @@ void ff_wmv2_encode_mb(MpegEncContext * s,
                  wmv2_inter_table[w->cbp_table_index][cbp + 64][0]);
 
         /* motion vector */
-        h263_pred_motion(s, 0, &pred_x, &pred_y);
+        h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
         msmpeg4_encode_motion(s, motion_x - pred_x, 
                               motion_y - pred_y);
     } else {
@@ -443,7 +443,7 @@ int ff_wmv2_decode_secondary_picture_header(MpegEncContext * s)
         s->dc_table_index = get_bits1(&s->gb);
         s->mv_table_index = get_bits1(&s->gb);
     
-        s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
+        s->inter_intra_pred= 0;//(s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
         s->no_rounding ^= 1;
         
         if(s->avctx->debug&FF_DEBUG_PICT_INFO){
@@ -504,7 +504,7 @@ static int16_t *wmv2_pred_motion(Wmv2Context *w, int *px, int *py){
     int xy, wrap, diff, type;
     int16_t *A, *B, *C, *mot_val;
 
-    wrap = s->block_wrap[0];
+    wrap = s->b8_stride;
     xy = s->block_index[0];
 
     mot_val = s->current_picture.motion_val[0][xy];
author	Miguel Freitas <miguelfreitas@users.sourceforge.net>	2004-04-25 18:57:04 +0000
committer	Miguel Freitas <miguelfreitas@users.sourceforge.net>	2004-04-25 18:57:04 +0000
commit	a2a44876712f079610f0396fb9a682ea47e05b6e (patch)
tree	1f3e328dfe6a5f9fa7c79e7a23bf6310be2827fd
parent	acb7dc0f256afc24e875a168da989ef25d86b7b7 (diff)
download	xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.gz xine-lib-a2a44876712f079610f0396fb9a682ea47e05b6e.tar.bz2