syncing ffmpeg (with some compilation fixes)

- fixes wma bugs
- mace, huffyuv and mp3 decoders imported (but not enabled)

Tested: wma (v1 and v2), mpeg4, msmpeg4 v1, v2 and v3, divx3, divx4, divx5, xvid and dv decoders. Everything looks fine.

CVS patchset: 3828
CVS date: 2003/01/08 13:18:42
author: Miguel Freitas <miguelfreitas@users.sourceforge.net> 2003-01-08 13:18:42 +0000
committer: Miguel Freitas <miguelfreitas@users.sourceforge.net> 2003-01-08 13:18:42 +0000
commit: 6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea (patch)
tree: e70be493d1222b10f96aa5efac01c0ec0d5bcc97
parent: 1fb58a63872660424777d41389e426dc90f1b660 (diff)
download: xine-lib-6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea.tar.gz
xine-lib-6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea.tar.bz2
51 files changed, 11333 insertions, 1746 deletions
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index a92987fe6..1298e8790 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -4,7 +4,7 @@
 
 SUBDIRS = armv4l i386 mlib alpha
 
-EXTRA_DIST = fdctref.c imgresample.c
+EXTRA_DIST = fdctref.c imgresample.c motion_est_template.c 
 
 AM_CFLAGS = $(LIBFFMPEG_CFLAGS)
 ASFLAGS =
@@ -22,15 +22,18 @@ libavcodec_la_SOURCES = \
 	fft.c \
 	h263.c \
 	h263dec.c \
+	huffyuv.c \
 	imgconvert.c \
 	jfdctfst.c \
 	jfdctint.c \
 	jrevdct.c \
 	mdct.c \
+	mace.c \
 	mem.c \
 	mjpeg.c \
 	motion_est.c \
 	mpeg12.c \
+	mpegaudiodec.c \
 	mpegvideo.c \
 	msmpeg4.c \
 	ratecontrol.c \
@@ -39,6 +42,7 @@ libavcodec_la_SOURCES = \
 	svq1.c \
 	utils.c \
 	wmadec.c 
+#	wmv2.c
 #imgresample.c
 
 libavcodec_la_LDFLAGS = \
@@ -57,6 +61,9 @@ noinst_HEADERS = \
 	mangle.h \
 	mpeg4data.h \
 	mpeg12data.h \
+	mpegaudio.h \
+	mpegaudiodectab.h \
+	mpegaudiotab.h \
 	mpegvideo.h \
 	msmpeg4data.h \
 	simple_idct.h \
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 470fbc3d9..fd7eafbd4 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -5,8 +5,8 @@
 
 #define LIBAVCODEC_VERSION_INT 0x000406
 #define LIBAVCODEC_VERSION     "0.4.6"
-#define LIBAVCODEC_BUILD       4643
-#define LIBAVCODEC_BUILD_STR   "4643"
+#define LIBAVCODEC_BUILD       4652
+#define LIBAVCODEC_BUILD_STR   "4652"
 
 enum CodecID {
     CODEC_ID_NONE, 
@@ -60,7 +60,6 @@ enum CodecType {
 };
 
 enum PixelFormat {
-    PIX_FMT_ANY = -1,
     PIX_FMT_YUV420P,
     PIX_FMT_YUV422,
     PIX_FMT_RGB24,
@@ -119,7 +118,7 @@ static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG,
                                        ME_X1, ME_EPZS, ME_FULL };
 
 
-#define FF_MAX_B_FRAMES 4
+#define FF_MAX_B_FRAMES 8
 
 /* encoding support
    these flags can be passed in AVCodecContext.flags before initing 
@@ -147,6 +146,8 @@ static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG,
 #define CODEC_FLAG_INTERLACED_DCT 0x00040000 /* use interlaced dct */
 #define CODEC_FLAG_LOW_DELAY      0x00080000 /* force low delay / will fail on b frames */
 #define CODEC_FLAG_ALT_SCAN       0x00100000 /* use alternate scan */
+#define CODEC_FLAG_TRELLIS_QUANT  0x00200000 /* use trellis quantization */
+#define CODEC_FLAG_GLOBAL_HEADER  0x00400000 /* place global headers in extradata instead of every keyframe */
 
 /* codec capabilities */
 
@@ -159,11 +160,11 @@ static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG,
 
 #define FRAME_RATE_BASE 10000
 
-#define FF_COMMON_PICTURE \
+#define FF_COMMON_FRAME \
     uint8_t *data[4];\
     int linesize[4];\
     /**\
-     * pointer to the first allocated byte of the picture. can be used in get_buffer/release_buffer
+     * pointer to the first allocated byte of the picture. can be used in get_buffer/release_buffer\
      * this isnt used by lavc unless the default get/release_buffer() is used\
      * encoding: \
      * decoding: \
@@ -260,15 +261,28 @@ static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG,
      * decoding: unused\
      */\
     uint64_t error[4];\
+\
+    /**\
+     * type of the buffer (to keep track of who has to dealloc data[*])\
+     * encoding: set by the one who allocs it\
+     * decoding: set by the one who allocs it\
+     * Note: user allocated (direct rendering) & internal buffers can not coexist currently\
+     */\
+    int type;\
+
+#define FF_BUFFER_TYPE_INTERNAL 1
+#define FF_BUFFER_TYPE_USER     2 // Direct rendering buffers
+#define FF_BUFFER_TYPE_SHARED   4 // input frame for encoding (will not be deallocated)
+
 
 #define FF_I_TYPE 1 // Intra
 #define FF_P_TYPE 2 // Predicted
 #define FF_B_TYPE 3 // Bi-dir predicted
 #define FF_S_TYPE 4 // S(GMC)-VOP MPEG4
 
-typedef struct AVVideoFrame {
-    FF_COMMON_PICTURE
-} AVVideoFrame;
+typedef struct AVFrame {
+    FF_COMMON_FRAME
+} AVFrame;
 
 typedef struct AVCodecContext {
     /**
@@ -383,13 +397,6 @@ typedef struct AVCodecContext {
                            previous encoded frame */
     
     /**
-     * 1 -> keyframe, 0-> not (this if for audio only, for video, AVVideoFrame.key_frame should be used)
-     * encoding: set by lavc (for the outputed bitstream, not the input frame)
-     * decoding: set by lavc (for the decoded  bitstream, not the displayed frame)
-     */
-    int key_frame;
-
-    /**
      * number of frames the decoded output will be delayed relative to 
      * the encoded input
      * encoding: set by lavc.
@@ -561,7 +568,7 @@ typedef struct AVCodecContext {
      * encoding: unused
      * decoding: set by lavc, user can override
      */
-    int (*get_buffer)(struct AVCodecContext *c, AVVideoFrame *pic);
+    int (*get_buffer)(struct AVCodecContext *c, AVFrame *pic);
     
     /**
      * called to release buffers which where allocated with get_buffer.
@@ -570,7 +577,7 @@ typedef struct AVCodecContext {
      * encoding: unused
      * decoding: set by lavc, user can override
      */
-    void (*release_buffer)(struct AVCodecContext *c, AVVideoFrame *pic);
+    void (*release_buffer)(struct AVCodecContext *c, AVFrame *pic);
 
     /**
      * is 1 if the decoded stream contains b frames, 0 otherwise
@@ -807,7 +814,7 @@ typedef struct AVCodecContext {
      * encoding: set by lavc
      * decoding: set by lavc
      */
-    AVVideoFrame *coded_picture;
+    AVFrame *coded_frame;
 
     /**
      * debug 
@@ -821,7 +828,10 @@ typedef struct AVCodecContext {
 #define FF_DEBUG_MB_TYPE   8
 #define FF_DEBUG_QP        16
 #define FF_DEBUG_MV        32
-#define FF_DEBUG_VIS_MV    64
+#define FF_DEBUG_VIS_MV    0x00000040
+#define FF_DEBUG_SKIP      0x00000080
+#define FF_DEBUG_STARTCODE 0x00000100
+#define FF_DEBUG_PTS       0x00000200
     
     /**
      * error
@@ -829,6 +839,91 @@ typedef struct AVCodecContext {
      * decoding: unused
      */
     uint64_t error[4];
+    
+    /**
+     * minimum MB quantizer
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int mb_qmin;
+
+    /**
+     * maximum MB quantizer
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int mb_qmax;
+    
+    /**
+     * motion estimation compare function
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int me_cmp;
+    /**
+     * subpixel motion estimation compare function
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int me_sub_cmp;
+    /**
+     * macroblock compare function (not supported yet)
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int mb_cmp;
+#define FF_CMP_SAD  0
+#define FF_CMP_SSE  1
+#define FF_CMP_SATD 2
+#define FF_CMP_DCT  3
+#define FF_CMP_PSNR 4
+#define FF_CMP_BIT  5
+#define FF_CMP_RD   6
+#define FF_CMP_ZERO 7
+#define FF_CMP_CHROMA 256
+    
+    /**
+     * ME diamond size & shape
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int dia_size;
+
+    /**
+     * amount of previous MV predictors (2a+1 x 2a+1 square)
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int last_predictor_count;
+
+    /**
+     * pre pass for motion estimation
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int pre_me;
+
+    /**
+     * motion estimation pre pass compare function
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int me_pre_cmp;
+
+    /**
+     * ME pre pass diamond size & shape
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int pre_dia_size;
+
+    /**
+     * subpel ME quality
+     * encoding: set by user.
+     * decoding: unused
+     */
+    int me_subpel_quality;
+
 } AVCodecContext;
 
 typedef struct AVCodec {
@@ -988,16 +1083,16 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode);
 
 void avcodec_get_context_defaults(AVCodecContext *s);
 AVCodecContext *avcodec_alloc_context(void);
-AVVideoFrame *avcodec_alloc_picture(void);
+AVFrame *avcodec_alloc_frame(void);
 
-int avcodec_default_get_buffer(AVCodecContext *s, AVVideoFrame *pic);
-void avcodec_default_release_buffer(AVCodecContext *s, AVVideoFrame *pic);
+int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic);
+void avcodec_default_release_buffer(AVCodecContext *s, AVFrame *pic);
 
 int avcodec_open(AVCodecContext *avctx, AVCodec *codec);
 int avcodec_decode_audio(AVCodecContext *avctx, INT16 *samples, 
                          int *frame_size_ptr,
                          UINT8 *buf, int buf_size);
-int avcodec_decode_video(AVCodecContext *avctx, AVVideoFrame *picture, 
+int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture, 
                          int *got_picture_ptr,
                          UINT8 *buf, int buf_size);
 int avcodec_parse_frame(AVCodecContext *avctx, UINT8 **pdata, 
@@ -1006,7 +1101,7 @@ int avcodec_parse_frame(AVCodecContext *avctx, UINT8 **pdata,
 int avcodec_encode_audio(AVCodecContext *avctx, UINT8 *buf, int buf_size, 
                          const short *samples);
 int avcodec_encode_video(AVCodecContext *avctx, UINT8 *buf, int buf_size, 
-                         const AVVideoFrame *pict);
+                         const AVFrame *pict);
 
 int avcodec_close(AVCodecContext *avctx);
 
@@ -1014,6 +1109,41 @@ void avcodec_register_all(void);
 
 void avcodec_flush_buffers(AVCodecContext *avctx);
 
+typedef struct {
+    /** options' name with default value*/
+    const char* name;
+    /** English text help */
+    const char* help;
+    /** type of variable */
+    int type;
+#define FF_CONF_TYPE_BOOL 1     // boolean - true,1,on  (or simply presence)
+#define FF_CONF_TYPE_DOUBLE 2   // double
+#define FF_CONF_TYPE_INT 3      // integer
+#define FF_CONF_TYPE_STRING 4   // string (finished with \0)
+#define FF_CONF_TYPE_MASK 0x1f	// mask for types - upper bits are various flags
+#define FF_CONF_TYPE_EXPERT 0x20 // flag for expert option
+#define FF_CONF_TYPE_FLAG (FF_CONF_TYPE_BOOL | 0x40)
+#define FF_CONF_TYPE_RCOVERIDE (FF_CONF_TYPE_STRING | 0x80)
+    /** where the parsed value should be stored */
+    void* val;
+    /** min value  (min == max   ->  no limits) */
+    double min;
+    /** maximum value for double/int */
+    double max;
+    /** default bool [0,1]/double/int value */
+    double defval;
+    /**
+     * default string value (with optional semicolon-delimited extra option list,
+     * e.g. option1;option2;option3);
+     * defval might select an option other than the first as the default
+     */
+    const char* defstr;
+    /** char* list of supported codecs (e.g. ",msmpeg4,h263,"); NULL - everything */
+    const char* supported;
+} avc_config_t;
+
+void avcodec_getopt(AVCodecContext* avctx, char* str, avc_config_t** config);
+
 /**
  * Interface for 0.5.0 version
  *
@@ -1098,7 +1228,7 @@ void __av_freep(void **ptr);
 #define av_freep(p) __av_freep((void **)(p))
 /* for static data only */
 /* call av_free_static to release all staticaly allocated tables */
-void av_free_static();
+void av_free_static(void);
 void *__av_mallocz_static(void** location, unsigned int size);
 #define av_mallocz_static(p, s) __av_mallocz_static((void **)(p), s)
 
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 9dfd7dcda..405ddaa09 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -33,6 +33,10 @@
 #        define ENODATA  61
 #    endif
 
+#ifndef M_PI
+#define M_PI    3.14159265358979323846
+#endif
+
 #endif /* HAVE_AV_CONFIG_H */
 
 /* Suppress restrict if it was not defined in config.h.  */
@@ -79,9 +83,6 @@ typedef INT64 int64_t;
 #        define UINT64_C(c)    (c ## ULL)
 #    endif /* __MINGW32__ */
 
-#    define M_PI    3.14159265358979323846
-#    define M_SQRT2 1.41421356237309504880  /* sqrt(2) */
-
 #    ifdef _DEBUG
 #        define DEBUG
 #    endif
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index 1e177116a..0d7556f65 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -20,6 +20,7 @@
  */
 #include "avcodec.h"
 #include "dsputil.h"
+#include "mpegvideo.h"
 
 int ff_bit_exact=0;
 
@@ -144,7 +145,28 @@ static int pix_norm1_c(UINT8 * pix, int line_size)
 }
 
 
-static int pix_norm_c(UINT8 * pix1, UINT8 * pix2, int line_size)
+static int sse8_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
+{
+    int s, i;
+    UINT32 *sq = squareTbl + 256;
+
+    s = 0;
+    for (i = 0; i < 8; i++) {
+        s += sq[pix1[0] - pix2[0]];
+        s += sq[pix1[1] - pix2[1]];
+        s += sq[pix1[2] - pix2[2]];
+        s += sq[pix1[3] - pix2[3]];
+        s += sq[pix1[4] - pix2[4]];
+        s += sq[pix1[5] - pix2[5]];
+        s += sq[pix1[6] - pix2[6]];
+        s += sq[pix1[7] - pix2[7]];
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    return s;
+}
+
+static int sse16_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
 {
     int s, i, j;
     UINT32 *sq = squareTbl + 256;
@@ -759,6 +781,7 @@ static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStr
     }
 }
 
+
 #define QPEL_MC(r, OPNAME, RND, OP) \
 static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
     UINT8 *cm = cropTbl + MAX_NEG_CROP;\
@@ -808,6 +831,7 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStrid
 static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
     UINT8 *cm = cropTbl + MAX_NEG_CROP;\
     int i;\
+    \
     for(i=0; i<h; i++)\
     {\
         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
@@ -831,9 +855,10 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStri
     }\
 }\
 \
-static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
+static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride){\
     UINT8 *cm = cropTbl + MAX_NEG_CROP;\
     int i;\
+    const int w=16;\
     for(i=0; i<w; i++)\
     {\
         const int src0= src[0*srcStride];\
@@ -1024,21 +1049,21 @@ static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 full[24*17];\
     UINT8 half[256];\
     copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
 }\
 \
 static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 full[24*17];\
     copy_block17(full, src, 24, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
 }\
 \
 static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 full[24*17];\
     UINT8 half[256];\
     copy_block17(full, src, 24, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1048,8 +1073,8 @@ static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfHV[256];\
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1059,8 +1084,8 @@ static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfHV[256];\
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1070,8 +1095,8 @@ static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfHV[256];\
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1081,22 +1106,22 @@ static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfHV[256];\
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfH[272];\
     UINT8 halfHV[256];\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfH[272];\
     UINT8 halfHV[256];\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1106,8 +1131,8 @@ static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfHV[256];\
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
@@ -1117,14 +1142,14 @@ static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfHV[256];\
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfH[272];\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 }
 
 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
@@ -1141,7 +1166,103 @@ QPEL_MC(0, avg_       , _       , op_avg)
 #undef op_put
 #undef op_put_no_rnd
 
-static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    int i;
+
+    for(i=0; i<h; i++){
+        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
+        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
+        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
+        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
+        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
+        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
+        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
+        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
+        dst+=dstStride;
+        src+=srcStride;        
+    }
+}
+
+static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    int i;
+
+    for(i=0; i<w; i++){
+        const int src_1= src[ -srcStride];
+        const int src0 = src[0          ];
+        const int src1 = src[  srcStride];
+        const int src2 = src[2*srcStride];
+        const int src3 = src[3*srcStride];
+        const int src4 = src[4*srcStride];
+        const int src5 = src[5*srcStride];
+        const int src6 = src[6*srcStride];
+        const int src7 = src[7*srcStride];
+        const int src8 = src[8*srcStride];
+        const int src9 = src[9*srcStride];
+        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
+        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
+        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
+        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
+        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
+        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
+        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
+        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
+        src++;
+        dst++;
+    }
+}
+
+static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
+    put_pixels8_c(dst, src, stride, 8);
+}
+
+static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
+    uint8_t half[64];
+    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
+    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
+}
+
+static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
+    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
+}
+
+static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
+    uint8_t half[64];
+    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
+    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
+}
+
+static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
+    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
+}
+
+static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
+    uint8_t halfH[88];
+    uint8_t halfV[64];
+    uint8_t halfHV[64];
+    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
+    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
+    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
+    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
+}
+static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
+    uint8_t halfH[88];
+    uint8_t halfV[64];
+    uint8_t halfHV[64];
+    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
+    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
+    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
+    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
+}
+static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
+    uint8_t halfH[88];
+    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
+    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
+}
+
+
+static inline int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
 
@@ -1257,7 +1378,7 @@ static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
     return s;
 }
 
-static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static inline int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
 
@@ -1341,10 +1462,18 @@ static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
     return s;
 }
 
-void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
+static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
+    return pix_abs16x16_c(a,b,stride);
+}
+
+static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
+    return pix_abs8x8_c(a,b,stride);
+}
+
+void ff_block_permute(DCTELEM *block, UINT8 *permutation, const UINT8 *scantable, int last)
 {
     int i;
-    INT16 temp[64];
+    DCTELEM temp[64];
     
     if(last<=0) return;
     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
@@ -1399,6 +1528,156 @@ static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
         dst[i+0] = src1[i+0]-src2[i+0];
 }
 
+#define BUTTERFLY2(o1,o2,i1,i2) \
+o1= (i1)+(i2);\
+o2= (i1)-(i2);
+
+#define BUTTERFLY1(x,y) \
+{\
+    int a,b;\
+    a= x;\
+    b= y;\
+    x= a+b;\
+    y= a-b;\
+}
+
+#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
+
+static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
+    int i;
+    int temp[64];
+    int sum=0;
+
+    for(i=0; i<8; i++){
+        //FIXME try pointer walks
+        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
+        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
+        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
+        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
+        
+        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
+        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
+        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
+        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
+        
+        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
+        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
+        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
+        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
+    }
+
+    for(i=0; i<8; i++){
+        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
+        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
+        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
+        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
+        
+        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
+        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
+        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
+        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
+
+        sum += 
+             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
+            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
+            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
+            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
+    }
+#if 0
+static int maxi=0;
+if(sum>maxi){
+    maxi=sum;
+    printf("MAX:%d\n", maxi);
+}
+#endif
+    return sum;
+}
+
+static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
+    int i;
+    int temp[64];
+    int sum=0;
+//FIXME OOOPS ignore 0 term instead of mean mess
+    for(i=0; i<8; i++){
+        //FIXME try pointer walks
+        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
+        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
+        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
+        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
+        
+        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
+        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
+        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
+        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
+        
+        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
+        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
+        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
+        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
+    }
+
+    for(i=0; i<8; i++){
+        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
+        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
+        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
+        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
+        
+        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
+        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
+        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
+        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
+    
+        sum += 
+             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
+            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
+            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
+            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
+    }
+    
+    return sum;
+}
+
+static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
+    MpegEncContext * const s= (MpegEncContext *)c;
+    DCTELEM temp[64];
+    int sum=0, i;
+
+    s->dsp.diff_pixels(temp, src1, src2, stride);
+    s->fdct(temp);
+
+    for(i=0; i<64; i++)
+        sum+= ABS(temp[i]);
+        
+    return sum;
+}
+
+void simple_idct(INT16 *block); //FIXME
+
+static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
+    MpegEncContext * const s= (MpegEncContext *)c;
+    DCTELEM temp[64], bak[64];
+    int sum=0, i;
+
+    s->mb_intra=0;
+    
+    s->dsp.diff_pixels(temp, src1, src2, stride);
+    
+    memcpy(bak, temp, 64*sizeof(DCTELEM));
+    
+    s->dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
+    s->dct_unquantize(s, temp, 0, s->qscale);
+    simple_idct(temp); //FIXME 
+    
+    for(i=0; i<64; i++)
+        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
+        
+    return sum;
+}
+
+WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
+WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
+WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
+
 void dsputil_init(DSPContext* c, unsigned mask)
 {
     static int init_done = 0;
@@ -1429,7 +1708,8 @@ void dsputil_init(DSPContext* c, unsigned mask)
     c->clear_blocks = clear_blocks_c;
     c->pix_sum = pix_sum_c;
     c->pix_norm1 = pix_norm1_c;
-    c->pix_norm = pix_norm_c;
+    c->sse[0]= sse16_c;
+    c->sse[1]= sse8_c;
 
     /* TODO [0] 16  [1] 8 */
     c->pix_abs16x16     = pix_abs16x16_c;
@@ -1489,6 +1769,28 @@ void dsputil_init(DSPContext* c, unsigned mask)
     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
 #undef dspfunc
 
+    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
+    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
+    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
+    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
+    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
+    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
+    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
+    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
+    
+    c->hadamard8_diff[0]= hadamard8_diff16_c;
+    c->hadamard8_diff[1]= hadamard8_diff_c;
+    c->hadamard8_abs = hadamard8_abs_c;
+    
+    c->dct_sad[0]= dct_sad16x16_c;
+    c->dct_sad[1]= dct_sad8x8_c;
+    
+    c->sad[0]= sad16x16_c;
+    c->sad[1]= sad8x8_c;
+    
+    c->quant_psnr[0]= quant_psnr16x16_c;
+    c->quant_psnr[1]= quant_psnr8x8_c;
+    
     c->add_bytes= add_bytes_c;
     c->diff_bytes= diff_bytes_c;
 
@@ -1516,7 +1818,6 @@ void dsputil_init(DSPContext* c, unsigned mask)
 #ifdef HAVE_MMI
     dsputil_init_mmi(c, mask);
 #endif
-
 }
 
 /* remove any non bit exact operation (testing purpose) */
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index d52b0419c..f34a8f078 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -84,13 +84,10 @@ static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 
 /* motion estimation */
 
-typedef int (*op_pixels_abs_func)(UINT8 *blk1/*align width (8 or 16)*/, UINT8 *blk2/*align 1*/, int line_size);
-/*
-int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx);
-*/
+typedef int (*op_pixels_abs_func)(UINT8 *blk1/*align width (8 or 16)*/, UINT8 *blk2/*align 1*/, int line_size)/* __attribute__ ((const))*/;
+/* like op_pixels_abs_func, but with a leading context pointer (an MpegEncContext passed as void*) */
+typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, UINT8 *blk1/*align width (8 or 16)*/, UINT8 *blk2/*align 1*/, int line_size)/* __attribute__ ((const))*/;
+
 typedef struct DSPContext {
     /* pixel ops : interface with DCT */
     void (*get_pixels)(DCTELEM *block/*align 16*/, const UINT8 *pixels/*align 8*/, int line_size);
@@ -103,7 +100,17 @@ typedef struct DSPContext {
     void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
     int (*pix_sum)(UINT8 * pix, int line_size);
     int (*pix_norm1)(UINT8 * pix, int line_size);
-    int (*pix_norm)(UINT8 * pix1, UINT8 * pix2, int line_size);
+    me_cmp_func sad[2]; /* identical to pix_absAxA except additional void * */
+    me_cmp_func sse[2];
+    me_cmp_func hadamard8_diff[2];
+    me_cmp_func dct_sad[2];
+    me_cmp_func quant_psnr[2];
+    int (*hadamard8_abs )(uint8_t *src, int stride, int mean);
+
+    me_cmp_func me_pre_cmp[11];
+    me_cmp_func me_cmp[11];
+    me_cmp_func me_sub_cmp[11];
+    me_cmp_func mb_cmp[11];
 
     /* maybe create an array for 16/8 functions */
     op_pixels_func put_pixels_tab[2][4];
@@ -114,6 +121,7 @@ typedef struct DSPContext {
     qpel_mc_func avg_qpel_pixels_tab[2][16];
     qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
     qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
+    qpel_mc_func put_mspel_pixels_tab[8];
 
     op_pixels_abs_func pix_abs16x16;
     op_pixels_abs_func pix_abs16x16_x2;
@@ -125,9 +133,8 @@ typedef struct DSPContext {
     op_pixels_abs_func pix_abs8x8_xy2;
     
     /* huffyuv specific */
-    //FIXME note: alignment isnt guranteed currently but could be if needed
     void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
-    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/,int w);
+    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
 } DSPContext;
 
 void dsputil_init(DSPContext* p, unsigned mask);
@@ -165,6 +172,7 @@ static inline void emms(void)
 }
 #endif
 
+
 #define emms_c() \
 {\
     if (mm_flags & MM_MMX)\
@@ -266,6 +274,8 @@ int fft_init(FFTContext *s, int nbits, int inverse);
 void fft_permute(FFTContext *s, FFTComplex *z);
 void fft_calc_c(FFTContext *s, FFTComplex *z);
 void fft_calc_sse(FFTContext *s, FFTComplex *z);
+void fft_calc_altivec(FFTContext *s, FFTComplex *z);
+
 static inline void fft_calc(FFTContext *s, FFTComplex *z)
 {
     s->fft_calc(s, z);
@@ -290,6 +300,14 @@ void ff_mdct_calc(MDCTContext *s, FFTSample *out,
                const FFTSample *input, FFTSample *tmp);
 void ff_mdct_end(MDCTContext *s);
 
+#define WARPER88_1616(name8, name16) /* defines name16() as the sum of name8() over the four 8x8 quadrants of a 16x16 block */\
+static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride){\
+    return name8(s, dst           , src           , stride)\
+          +name8(s, dst+8         , src+8         , stride)\
+          +name8(s, dst  +8*stride, src  +8*stride, stride)\
+          +name8(s, dst+8+8*stride, src+8+8*stride, stride);\
+}
+
 #ifndef HAVE_LRINTF
 /* XXX: add ISOC specific test to avoid specific BSD testing. */
 /* better than nothing implementation. */
diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c
index 05128aee4..f436caf12 100644
--- a/src/libffmpeg/libavcodec/dv.c
+++ b/src/libffmpeg/libavcodec/dv.c
@@ -33,7 +33,7 @@ typedef struct DVVideoDecodeContext {
     int sampling_411; /* 0 = 420, 1 = 411 */
     int width, height;
     UINT8 *current_picture[3]; /* picture structure */
-    AVVideoFrame picture;
+    AVFrame picture;
     int linesize[3];
     DCTELEM block[5*6][64] __align8;
     UINT8 dv_zigzag[2][64];
@@ -497,7 +497,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
 {
     DVVideoDecodeContext *s = avctx->priv_data;
     int sct, dsf, apt, ds, nb_dif_segs, vs, width, height, i, packet_size;
-    unsigned size;
     UINT8 *buf_ptr;
     const UINT16 *mb_pos_ptr;
     
@@ -595,8 +594,8 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
     emms_c();
 
     /* return image */
-    *data_size = sizeof(AVVideoFrame);
-    *(AVVideoFrame*)data= s->picture;
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data= s->picture;
     
     avctx->release_buffer(avctx, &s->picture);
     
diff --git a/src/libffmpeg/libavcodec/error_resilience.c b/src/libffmpeg/libavcodec/error_resilience.c
index 5a8df74d2..3cb8d40bc 100644
--- a/src/libffmpeg/libavcodec/error_resilience.c
+++ b/src/libffmpeg/libavcodec/error_resilience.c
@@ -18,8 +18,6 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <alloca.h>
-
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
@@ -297,16 +295,14 @@ static void v_block_filter(MpegEncContext *s, UINT8 *dst, int w, int h, int stri
 }
 
 static void guess_mv(MpegEncContext *s){
-    UINT8 *fixed;
+    UINT8 fixed[s->mb_num];
 #define MV_FROZEN    3
 #define MV_CHANGED   2
 #define MV_UNCHANGED 1
     const int mb_width = s->mb_width;
     const int mb_height= s->mb_height;
     int i, depth, num_avail;
-
-    fixed = alloca(s->mb_num);
-
+   
     num_avail=0;
     for(i=0; i<s->mb_num; i++){
         int f=0;
diff --git a/src/libffmpeg/libavcodec/fft.c b/src/libffmpeg/libavcodec/fft.c
index 0f5181ac3..f060992f4 100644
--- a/src/libffmpeg/libavcodec/fft.c
+++ b/src/libffmpeg/libavcodec/fft.c
@@ -51,33 +51,48 @@ int fft_init(FFTContext *s, int nbits, int inverse)
     s->exptab1 = NULL;
 
     /* compute constant table for HAVE_SSE version */
-#if defined(HAVE_MMX) && 0
-    if (mm_flags & MM_SSE) {
-        int np, nblocks, np2, l;
-        FFTComplex *q;
-
-        np = 1 << nbits;
-        nblocks = np >> 3;
-        np2 = np >> 1;
-        s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex));
-        if (!s->exptab1)
-            goto fail;
-        q = s->exptab1;
-        do {
-            for(l = 0; l < np2; l += 2 * nblocks) {
-                *q++ = s->exptab[l];
-                *q++ = s->exptab[l + nblocks];
-
-                q->re = -s->exptab[l].im;
-                q->im = s->exptab[l].re;
-                q++;
-                q->re = -s->exptab[l + nblocks].im;
-                q->im = s->exptab[l + nblocks].re;
-                q++;
-            }
-            nblocks = nblocks >> 1;
-        } while (nblocks != 0);
-        av_freep(&s->exptab);
+#if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(HAVE_ALTIVEC)
+    {
+        int has_vectors;
+
+#if defined(HAVE_MMX)
+        has_vectors = mm_support() & MM_SSE;
+#else
+        /* XXX: should also use mm_support() ? */
+        has_vectors = has_altivec() & MM_ALTIVEC;
+#endif
+        if (has_vectors) {
+            int np, nblocks, np2, l;
+            FFTComplex *q;
+            
+            np = 1 << nbits;
+            nblocks = np >> 3;
+            np2 = np >> 1;
+            s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex));
+            if (!s->exptab1)
+                goto fail;
+            q = s->exptab1;
+            do {
+                for(l = 0; l < np2; l += 2 * nblocks) {
+                    *q++ = s->exptab[l];
+                    *q++ = s->exptab[l + nblocks];
+
+                    q->re = -s->exptab[l].im;
+                    q->im = s->exptab[l].re;
+                    q++;
+                    q->re = -s->exptab[l + nblocks].im;
+                    q->im = s->exptab[l + nblocks].re;
+                    q++;
+                }
+                nblocks = nblocks >> 1;
+            } while (nblocks != 0);
+            av_freep(&s->exptab);
+#if defined(HAVE_MMX)
+            s->fft_calc = fft_calc_sse;
+#else
+            s->fft_calc = fft_calc_altivec;
+#endif
+        }
     }
 #endif
 
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index 0d0799bb5..bc21e0cd8 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -82,12 +82,13 @@ static UINT16 (*mv_penalty)[MAX_MV*2+1]= NULL;
 static UINT8 fcode_tab[MAX_MV*2+1];
 static UINT8 umv_fcode_tab[MAX_MV*2+1];
 
-static UINT32 uni_mpeg4_intra_rl_bits[64*64*2*2];
-static UINT8  uni_mpeg4_intra_rl_len [64*64*2*2];
-static UINT32 uni_mpeg4_inter_rl_bits[64*64*2*2];
-static UINT8  uni_mpeg4_inter_rl_len [64*64*2*2];
-#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128 + (run)*256 + (level))
+static uint32_t uni_mpeg4_intra_rl_bits[64*64*2*2];
+static uint8_t  uni_mpeg4_intra_rl_len [64*64*2*2];
+static uint32_t uni_mpeg4_inter_rl_bits[64*64*2*2];
+static uint8_t  uni_mpeg4_inter_rl_len [64*64*2*2];
+//#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128 + (run)*256 + (level))
 //#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128*64 + (run) + (level)*64)
+#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128*64 + (run)*128 + (level))
 
 /* mpeg4
 inter
@@ -204,10 +205,6 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
 		
         put_bits(&s->pb,1,0); /* Reference Picture Resampling: off */
         put_bits(&s->pb,1,0); /* Reduced-Resolution Update: off */
-        if (s->pict_type == I_TYPE)
-            s->no_rounding = 0;
-        else
-            s->no_rounding ^= 1;
         put_bits(&s->pb,1,s->no_rounding); /* Rounding Type */
         put_bits(&s->pb,2,0); /* Reserved */
         put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */
@@ -392,6 +389,57 @@ void ff_clean_mpeg4_qscales(MpegEncContext *s){
     }
 }
 
+void ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){ /* sets s->mv_type and s->mv[0..1] from stored vectors scaled by time_pb/time_pp, offset by (mx,my) */
+    const int mb_index= s->mb_x + s->mb_y*s->mb_width;
+    int xy= s->block_index[0];
+    uint16_t time_pp= s->pp_time;
+    uint16_t time_pb= s->pb_time;
+    int i;
+    /* mv[1] is mv[0] minus the stored vector when the delta component is non-zero, else the stored vector scaled by (time_pb-time_pp)/time_pp */
+    //FIXME avoid divides
+    switch(s->co_located_type_table[mb_index]){ /* NOTE(review): no default case; any other value leaves s->mv and s->mv_type untouched */
+    case 0: /* one vector for the whole macroblock, read from motion_val at block_index[0] */
+        s->mv_type= MV_TYPE_16X16;
+        s->mv[0][0][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
+        s->mv[0][0][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
+        s->mv[1][0][0] = mx ? s->mv[0][0][0] - s->motion_val[xy][0]
+                            : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp;
+        s->mv[1][0][1] = my ? s->mv[0][0][1] - s->motion_val[xy][1] 
+                            : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp;
+        break;
+    case CO_LOCATED_TYPE_4MV: /* four vectors, one per s->block_index[i] */
+        s->mv_type = MV_TYPE_8X8;
+        for(i=0; i<4; i++){
+            xy= s->block_index[i];
+            s->mv[0][i][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
+            s->mv[0][i][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
+            s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->motion_val[xy][0]
+                                : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp;
+            s->mv[1][i][1] = my ? s->mv[0][i][1] - s->motion_val[xy][1] 
+                                : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp;
+        }
+        break;
+    case CO_LOCATED_TYPE_FIELDMV: /* two vectors from field_mv_table, each with its own time_pp/time_pb adjusted by field_select_table and i */
+        s->mv_type = MV_TYPE_FIELD;
+        for(i=0; i<2; i++){
+            if(s->top_field_first){
+                time_pp= s->pp_field_time - s->field_select_table[mb_index][i] + i;
+                time_pb= s->pb_field_time - s->field_select_table[mb_index][i] + i;
+            }else{
+                time_pp= s->pp_field_time + s->field_select_table[mb_index][i] - i;
+                time_pb= s->pb_field_time + s->field_select_table[mb_index][i] - i;
+            }
+            s->mv[0][i][0] = s->field_mv_table[mb_index][i][0]*time_pb/time_pp + mx;
+            s->mv[0][i][1] = s->field_mv_table[mb_index][i][1]*time_pb/time_pp + my;
+            s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->field_mv_table[mb_index][i][0]
+                                : s->field_mv_table[mb_index][i][0]*(time_pb - time_pp)/time_pp;
+            s->mv[1][i][1] = my ? s->mv[0][i][1] - s->field_mv_table[mb_index][i][1] 
+                                : s->field_mv_table[mb_index][i][1]*(time_pb - time_pp)/time_pp;
+        }
+        break;
+    }
+}
+
 #ifdef CONFIG_ENCODERS
 void mpeg4_encode_mb(MpegEncContext * s,
 		    DCTELEM block[6][64],
@@ -442,7 +490,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
 
                 return;
             }
-
+            
             if ((cbp | motion_x | motion_y | mb_type) ==0) {
                 /* direct MB with MV={0,0} */
                 assert(s->dquant==0);
@@ -1386,7 +1434,7 @@ void h263_encode_init(MpegEncContext *s)
 
         init_mv_penalty_and_fcode(s);
     }
-    s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p
+    s->me.mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p
     
     // use fcodes >1 only for mpeg4 & h263 & h263p FIXME
     switch(s->codec_id){
@@ -1394,6 +1442,11 @@ void h263_encode_init(MpegEncContext *s)
         s->fcode_tab= fcode_tab;
         s->min_qcoeff= -2048;
         s->max_qcoeff=  2047;
+        s->intra_ac_vlc_length     = uni_mpeg4_intra_rl_len;
+        s->intra_ac_vlc_last_length= uni_mpeg4_intra_rl_len + 128*64;
+        s->inter_ac_vlc_length     = uni_mpeg4_inter_rl_len;
+        s->inter_ac_vlc_last_length= uni_mpeg4_inter_rl_len + 128*64;
+        s->ac_esc_length= 7+2+1+6+1+12+1;
         break;
     case CODEC_ID_H263P:
         s->fcode_tab= umv_fcode_tab;
@@ -1517,17 +1570,77 @@ void ff_set_mpeg4_time(MpegEncContext * s, int picture_number){
     }
 }
 
-static void mpeg4_encode_vol_header(MpegEncContext * s)
+static void mpeg4_encode_gop_header(MpegEncContext * s){ /* writes GOP_STARTCODE followed by an h/m/s time code derived from s->time */
+    int hours, minutes, seconds;
+    /* the start code is written as two 16-bit halves, the upper one zero */
+    put_bits(&s->pb, 16, 0);
+    put_bits(&s->pb, 16, GOP_STARTCODE);
+    /* s->time / s->time_increment_resolution, split into h/m/s; hours wrap at 24 */
+    seconds= s->time/s->time_increment_resolution;
+    minutes= seconds/60; seconds %= 60;
+    hours= minutes/60; minutes %= 60;
+    hours%=24;
+    /* 5-bit hours, 6-bit minutes, a single 1 bit, 6-bit seconds */
+    put_bits(&s->pb, 5, hours);
+    put_bits(&s->pb, 6, minutes);
+    put_bits(&s->pb, 1, 1);
+    put_bits(&s->pb, 6, seconds);
+    
+    put_bits(&s->pb, 1, 0); //closed gov == NO
+    put_bits(&s->pb, 1, 0); //broken link == NO
+
+    ff_mpeg4_stuffing(&s->pb);
+}
+/* Writes VOS_STARTCODE with a profile/level byte, then VISUAL_OBJ_STARTCODE with a version id; both values depend on s->max_b_frames / s->quarter_sample. */
+static void mpeg4_encode_visual_object_header(MpegEncContext * s){
+    int profile_and_level_indication;
+    int vo_ver_id;
+    /* B-frames or quarter-sample select the "adv simple" values, otherwise the "simple" ones */
+    if(s->max_b_frames || s->quarter_sample){
+        profile_and_level_indication= 0xF1; // adv simple level 1
+        vo_ver_id= 5;
+    }else{
+        profile_and_level_indication= 0x01; // simple level 1
+        vo_ver_id= 1;
+    }
+    //FIXME levels
+
+    put_bits(&s->pb, 16, 0);
+    put_bits(&s->pb, 16, VOS_STARTCODE);
+    
+    put_bits(&s->pb, 8, profile_and_level_indication);
+    
+    put_bits(&s->pb, 16, 0);
+    put_bits(&s->pb, 16, VISUAL_OBJ_STARTCODE);
+    
+    put_bits(&s->pb, 1, 1);
+        put_bits(&s->pb, 4, vo_ver_id);
+        put_bits(&s->pb, 3, 1); //priority
+ 
+    put_bits(&s->pb, 4, 1); //visual obj type== video obj
+    
+    put_bits(&s->pb, 1, 0); //video signal type == no clue //FIXME
+
+    ff_mpeg4_stuffing(&s->pb);
+}
+
+static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_number)
 {
-    int vo_ver_id=1; //must be 2 if we want GMC or q-pel
+    int vo_ver_id;
     char buf[255];
 
-    s->vo_type= s->has_b_frames ? CORE_VO_TYPE : SIMPLE_VO_TYPE;
+    if(s->max_b_frames || s->quarter_sample){
+        vo_ver_id= 5;
+        s->vo_type= ADV_SIMPLE_VO_TYPE;
+    }else{
+        vo_ver_id= 1;
+        s->vo_type= SIMPLE_VO_TYPE;
+    }
 
     put_bits(&s->pb, 16, 0);
-    put_bits(&s->pb, 16, 0x100);        /* video obj */
+    put_bits(&s->pb, 16, 0x100 + vo_number);        /* video obj */
     put_bits(&s->pb, 16, 0);
-    put_bits(&s->pb, 16, 0x120);        /* video obj layer */
+    put_bits(&s->pb, 16, 0x120 + vol_number);       /* video obj layer */
 
     put_bits(&s->pb, 1, 0);		/* random access vol */
     put_bits(&s->pb, 8, s->vo_type);	/* video obj type indication */
@@ -1570,7 +1683,7 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
     put_bits(&s->pb, 1, 1);		/* obmc disable */
     if (vo_ver_id == 1) {
         put_bits(&s->pb, 1, s->vol_sprite_usage=0);		/* sprite enable */
-    }else{ /* vo_ver_id == 2 */
+    }else{
         put_bits(&s->pb, 2, s->vol_sprite_usage=0);		/* sprite enable */
     }
     
@@ -1580,7 +1693,7 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
     if(s->mpeg_quant) put_bits(&s->pb, 2, 0); /* no custom matrixes */
 
     if (vo_ver_id != 1)
-        put_bits(&s->pb, 1, s->quarter_sample=0);
+        put_bits(&s->pb, 1, s->quarter_sample);
     put_bits(&s->pb, 1, 1);		/* complexity estimation disable */
     s->resync_marker= s->rtp_mode;
     put_bits(&s->pb, 1, s->resync_marker ? 0 : 1);/* resync marker disable */
@@ -1614,9 +1727,11 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     int time_div, time_mod;
     
     if(s->pict_type==I_TYPE){
-        s->no_rounding=0;
-        if(picture_number==0 || !s->strict_std_compliance)
-            mpeg4_encode_vol_header(s);
+        if(!(s->flags&CODEC_FLAG_GLOBAL_HEADER)){
+            mpeg4_encode_visual_object_header(s);
+            mpeg4_encode_vol_header(s, 0, 0);
+        }
+        mpeg4_encode_gop_header(s);
     }
     
     s->partitioned_frame= s->data_partitioning && s->pict_type!=B_TYPE;
@@ -1624,7 +1739,7 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 //printf("num:%d rate:%d base:%d\n", s->picture_number, s->frame_rate, FRAME_RATE_BASE);
     
     put_bits(&s->pb, 16, 0);	        /* vop header */
-    put_bits(&s->pb, 16, 0x1B6);	/* vop header */
+    put_bits(&s->pb, 16, VOP_STARTCODE);	/* vop header */
     put_bits(&s->pb, 2, s->pict_type - 1);	/* pict type: I = 0 , P = 1 */
 
     time_div= s->time/s->time_increment_resolution;
@@ -1641,7 +1756,6 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     put_bits(&s->pb, 1, 1);	/* vop coded */
     if (    s->pict_type == P_TYPE 
         || (s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE)) {
-        s->no_rounding ^= 1;
 	put_bits(&s->pb, 1, s->no_rounding);	/* rounding type */
     }
     put_bits(&s->pb, 3, 0);	/* intra dc VLC threshold */
@@ -1992,6 +2106,61 @@ static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n
     }
 #endif
 }
+/* Returns the summed table lengths for the AC coefficients of block n, using uni_mpeg4_{intra,inter}_rl_len or a fixed escape length; intra DC is not counted (see FIXME). */
+static inline int mpeg4_get_block_length(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
+                               UINT8 *scan_table)
+{
+    int i, last_non_zero;
+    const RLTable *rl;
+    UINT8 *len_tab;
+    const int last_index = s->block_last_index[n];
+    int len=0;
+
+    if (s->mb_intra) { //Note gcc (3.2.1 at least) will optimize this away
+	/* mpeg4 based DC predictor */
+	//mpeg4_encode_dc(dc_pb, intra_dc, n); //FIXME
+        if(last_index<1) return len;
+	i = 1;
+        rl = &rl_intra;
+        len_tab = uni_mpeg4_intra_rl_len;
+    } else {
+        if(last_index<0) return 0;
+	i = 0;
+        rl = &rl_inter;
+        len_tab = uni_mpeg4_inter_rl_len;
+    }
+    /* every non-zero coefficient before last_index is looked up with last=0 */
+    /* AC coefs */
+    last_non_zero = i - 1;
+    for (; i < last_index; i++) {
+	int level = block[ scan_table[i] ];
+	if (level) {
+	    int run = i - last_non_zero - 1;
+            level+=64; /* bias so that levels -64..63 become table offsets 0..127 */
+            if((level&(~127)) == 0){
+                const int index= UNI_MPEG4_ENC_INDEX(0, run, level);
+                len += len_tab[index];
+            }else{ //ESC3
+                len += 7+2+1+6+1+12+1;
+            }
+	    last_non_zero = i;
+	}
+    }
+    /*if(i<=last_index)*/{ /* coefficient at last_index, looked up with last=1 */
+	int level = block[ scan_table[i] ];
+        int run = i - last_non_zero - 1;
+        level+=64;
+        if((level&(~127)) == 0){
+            const int index= UNI_MPEG4_ENC_INDEX(1, run, level);
+            len += len_tab[index];
+        }else{ //ESC3
+            len += 7+2+1+6+1+12+1;
+        }
+    }
+    
+    return len;
+}
+
 #endif
 
 
@@ -3046,8 +3215,6 @@ int ff_h263_decode_mb(MpegEncContext *s,
         int modb1; // first bit of modb
         int modb2; // second bit of modb
         int mb_type;
-        uint16_t time_pp;
-        uint16_t time_pb;
         int xy;
 
         s->mb_intra = 0; //B-frames never contain intra blocks
@@ -3169,9 +3336,6 @@ int ff_h263_decode_mb(MpegEncContext *s,
         }
           
         if(mb_type==4 || mb_type==MB_TYPE_B_DIRECT){
-            int mb_index= s->mb_x + s->mb_y*s->mb_width;
-            int i;
-            
             if(mb_type==4)
                 mx=my=0;
             else{
@@ -3180,55 +3344,7 @@ int ff_h263_decode_mb(MpegEncContext *s,
             }
  
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
-            xy= s->block_index[0];
-            time_pp= s->pp_time;
-            time_pb= s->pb_time;
-            
-            //FIXME avoid divides
-            switch(s->co_located_type_table[mb_index]){
-            case 0:
-                s->mv_type= MV_TYPE_16X16;
-                s->mv[0][0][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
-                s->mv[0][0][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
-                s->mv[1][0][0] = mx ? s->mv[0][0][0] - s->motion_val[xy][0]
-                                    : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp;
-                s->mv[1][0][1] = my ? s->mv[0][0][1] - s->motion_val[xy][1] 
-                                    : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp;
-                PRINT_MB_TYPE(mb_type==4 ? "D" : "S");
-                break;
-            case CO_LOCATED_TYPE_4MV:
-                s->mv_type = MV_TYPE_8X8;
-                for(i=0; i<4; i++){
-                    xy= s->block_index[i];
-                    s->mv[0][i][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
-                    s->mv[0][i][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
-                    s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->motion_val[xy][0]
-                                        : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp;
-                    s->mv[1][i][1] = my ? s->mv[0][i][1] - s->motion_val[xy][1] 
-                                        : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp;
-                }
-                PRINT_MB_TYPE("4");
-                break;
-            case CO_LOCATED_TYPE_FIELDMV:
-                s->mv_type = MV_TYPE_FIELD;
-                for(i=0; i<2; i++){
-                    if(s->top_field_first){
-                        time_pp= s->pp_field_time - s->field_select_table[mb_index][i] + i;
-                        time_pb= s->pb_field_time - s->field_select_table[mb_index][i] + i;
-                    }else{
-                        time_pp= s->pp_field_time + s->field_select_table[mb_index][i] - i;
-                        time_pb= s->pb_field_time + s->field_select_table[mb_index][i] - i;
-                    }
-                    s->mv[0][i][0] = s->field_mv_table[mb_index][i][0]*time_pb/time_pp + mx;
-                    s->mv[0][i][1] = s->field_mv_table[mb_index][i][1]*time_pb/time_pp + my;
-                    s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->field_mv_table[mb_index][i][0]
-                                        : s->field_mv_table[mb_index][i][0]*(time_pb - time_pp)/time_pp;
-                    s->mv[1][i][1] = my ? s->mv[0][i][1] - s->field_mv_table[mb_index][i][1] 
-                                        : s->field_mv_table[mb_index][i][1]*(time_pb - time_pp)/time_pp;
-                }
-                PRINT_MB_TYPE("=");
-                break;
-            }
+            ff_mpeg4_set_direct_mv(s, mx, my);
         }
         
         if(mb_type<0 || mb_type>4){
@@ -3624,7 +3740,7 @@ static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
 #if 1 
                     {
                         const int abs_level= ABS(level);
-                        if(abs_level<=MAX_LEVEL && run<=MAX_RUN){
+                        if(abs_level<=MAX_LEVEL && run<=MAX_RUN && !(s->workaround_bugs&FF_BUG_AC_VLC)){
                             const int run1= run - rl->max_run[last][abs_level] - 1;
                             if(abs_level <= rl->max_level[last][run]){
                                 fprintf(stderr, "illegal 3. esc, vlc encoding possible\n");
@@ -3887,7 +4003,7 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
     int a= 2<<s->sprite_warping_accuracy;
     int rho= 3-s->sprite_warping_accuracy;
     int r=16/a;
-    int vop_ref[4][2];
+    const int vop_ref[4][2]= {{0,0}, {s->width,0}, {0, s->height}, {s->width, s->height}}; // only true for rectangle shapes
     int d[4][2]={{0,0}, {0,0}, {0,0}, {0,0}};
     int sprite_ref[4][2];
     int virtual_ref[2][2];
@@ -3897,15 +4013,6 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
     int h= s->height;
     int min_ab;
 
-    vop_ref[0][0] = 0;
-    vop_ref[0][1] = 0;
-    vop_ref[1][0] = s->width;
-    vop_ref[1][1] = 0;
-    vop_ref[2][0] = 0;
-    vop_ref[2][1] = s->height;
-    vop_ref[3][0] = s->width;
-    vop_ref[3][1] = s->height;
-
     for(i=0; i<s->num_sprite_warping_points; i++){
         int length;
         int x=0, y=0;
@@ -4105,6 +4212,22 @@ printf("offset: %d:%d , delta: %d %d %d %d, shift %d\n",
 #endif
 }
 
+static int mpeg4_decode_gop_header(MpegEncContext * s, GetBitContext *gb){ /* reads the h/m/s time code into s->time_base (in seconds); always returns 0 */
+    int hours, minutes, seconds;
+    /* 5-bit hours, 6-bit minutes, one skipped bit, 6-bit seconds */
+    hours= get_bits(gb, 5);
+    minutes= get_bits(gb, 6);
+    skip_bits1(gb);
+    seconds= get_bits(gb, 6);
+
+    s->time_base= seconds + 60*(minutes + 60*hours);
+    /* the two bits after the time code are read past without being stored */
+    skip_bits1(gb);
+    skip_bits1(gb);
+    
+    return 0;
+}
+
 static int decode_vol_header(MpegEncContext *s, GetBitContext *gb){
     int width, height, vo_ver_id;
 
@@ -4462,6 +4585,8 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
     }
     
     s->current_picture.pts= s->time*1000LL*1000LL / s->time_increment_resolution;
+    if(s->avctx->debug&FF_DEBUG_PTS)
+        printf("MPEG4 PTS: %f\n", s->current_picture.pts/(1000.0*1000.0));
     
     if(check_marker(gb, "before vop_coded")==0 && s->picture_number==0){
         printf("hmm, seems the headers arnt complete, trying to guess time_increment_bits\n");
@@ -4557,11 +4682,11 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
              s->b_code=1;
 
          if(s->avctx->debug&FF_DEBUG_PICT_INFO){
-             printf("qp:%d fc:%d bc:%d type:%s size:%d pro:%d alt:%d top:%d qpel:%d part:%d resync:%d w:%d a:%d\n", 
+             printf("qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d\n", 
                  s->qscale, s->f_code, s->b_code, 
                  s->pict_type == I_TYPE ? "I" : (s->pict_type == P_TYPE ? "P" : (s->pict_type == B_TYPE ? "B" : "S")), 
                  gb->size,s->progressive_sequence, s->alternate_scan, s->top_field_first, 
-                 s->quarter_sample, s->data_partitioning, s->resync_marker, s->num_sprite_warping_points,
+                 s->quarter_sample ? "q" : "h", s->data_partitioning, s->resync_marker, s->num_sprite_warping_points,
                  s->sprite_warping_accuracy); 
          }
 
@@ -4626,17 +4751,51 @@ int ff_mpeg4_decode_picture_header(MpegEncContext * s, GetBitContext *gb)
         if((startcode&0xFFFFFF00) != 0x100)
             continue; //no startcode
         
+        if(s->avctx->debug&FF_DEBUG_STARTCODE){ /* debug aid: print each start code, a name for its range/value, and the bit position */
+            printf("startcode: %3X ", startcode);
+            if     (startcode<=0x11F) printf("Video Object Start");
+            else if(startcode<=0x12F) printf("Video Object Layer Start");
+            else if(startcode<=0x13F) printf("Reserved");
+            else if(startcode<=0x15F) printf("FGS bp start");
+            else if(startcode<=0x1AF) printf("Reserved");
+            else if(startcode==0x1B0) printf("Visual Object Seq Start");
+            else if(startcode==0x1B1) printf("Visual Object Seq End");
+            else if(startcode==0x1B2) printf("User Data");
+            else if(startcode==0x1B3) printf("Group of VOP start");
+            else if(startcode==0x1B4) printf("Video Session Error");
+            else if(startcode==0x1B5) printf("Visual Object Start");
+            else if(startcode==0x1B6) printf("Video Object Plane start");
+            else if(startcode==0x1B7) printf("slice start");
+            else if(startcode==0x1B8) printf("extension start");
+            else if(startcode==0x1B9) printf("fgs start");
+            else if(startcode==0x1BA) printf("FBA Object start");
+            else if(startcode==0x1BB) printf("FBA Object Plane start");
+            else if(startcode==0x1BC) printf("Mesh Object start");
+            else if(startcode==0x1BD) printf("Mesh Object Plane start");
+            else if(startcode==0x1BE) printf("Still Texture Object start");
+            else if(startcode==0x1BF) printf("Texture Spatial Layer start");
+            else if(startcode==0x1C0) printf("Texture SNR Layer start");
+            else if(startcode==0x1C1) printf("Texture Tile start");
+            else if(startcode==0x1C2) printf("Texture Shape Layer start");
+            else if(startcode==0x1C3) printf("stuffing start");
+            else if(startcode<=0x1C5) printf("reserved");
+            else if(startcode<=0x1FF) printf("System start");
+            printf(" at %d\n", get_bits_count(gb));
+        }
+
         switch(startcode){
         case 0x120:
             decode_vol_header(s, gb);
             break;
-        case 0x1b2:
+        case USER_DATA_STARTCODE:
             decode_user_data(s, gb);
             break;
-        case 0x1b6:
+        case GOP_STARTCODE:
+            mpeg4_decode_gop_header(s, gb);
+            break;
+        case VOP_STARTCODE:
             return decode_vop_header(s, gb);
         default:
-//            printf("startcode %X found\n", startcode);
             break;
         }
 
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 1f8fb44a8..93a14a06e 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -40,7 +40,7 @@ static inline long long rdtsc()
 }
 #endif
 
-static int h263_decode_init(AVCodecContext *avctx)
+int ff_h263_decode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
 
@@ -55,6 +55,8 @@ static int h263_decode_init(AVCodecContext *avctx)
     s->quant_precision=5;
     s->progressive_sequence=1;
     s->decode_mb= ff_h263_decode_mb;
+    s->low_delay= 1;
+    avctx->pix_fmt= PIX_FMT_YUV420P;
 
     /* select sub codec */
     switch(avctx->codec->id) {
@@ -64,7 +66,7 @@ static int h263_decode_init(AVCodecContext *avctx)
     case CODEC_ID_MPEG4:
         s->time_increment_bits = 4; /* default value for broken headers */
         s->h263_pred = 1;
-        s->has_b_frames = 1; //default, might be overriden in the vol header during header parsing
+        s->low_delay = 0; //default, might be overriden in the vol header during header parsing
         break;
     case CODEC_ID_MSMPEG4V1:
         s->h263_msmpeg4 = 1;
@@ -112,7 +114,7 @@ static int h263_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int h263_decode_end(AVCodecContext *avctx)
+int ff_h263_decode_end(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
 
@@ -249,7 +251,7 @@ static int decode_slice(MpegEncContext *s){
        &&   (s->workaround_bugs&FF_BUG_AUTODETECT) 
        &&    s->gb.size*8 - get_bits_count(&s->gb) >=0
        &&    s->gb.size*8 - get_bits_count(&s->gb) < 48
-       &&   !s->resync_marker
+//       &&   !s->resync_marker
        &&   !s->data_partitioning){
         
         const int bits_count= get_bits_count(&s->gb);
@@ -342,13 +344,13 @@ static int mpeg4_find_frame_end(MpegEncContext *s, UINT8 *buf, int buf_size){
     return -1;
 }
 
-static int h263_decode_frame(AVCodecContext *avctx, 
+int ff_h263_decode_frame(AVCodecContext *avctx, 
                              void *data, int *data_size,
                              UINT8 *buf, int buf_size)
 {
     MpegEncContext *s = avctx->priv_data;
     int ret,i;
-    AVVideoFrame *pict = data; 
+    AVFrame *pict = data; 
     float new_aspect;
     
 #ifdef PRINT_FRAME_TIME
@@ -415,9 +417,11 @@ retry:
         if (MPV_common_init(s) < 0) //we need the idct permutaton for reading a custom matrix
             return -1;
     }
-        
+      
     /* let's go :-) */
-    if (s->h263_msmpeg4) {
+    if (s->msmpeg4_version==5) {
+        ret= ff_wmv2_decode_picture_header(s);
+    } else if (s->msmpeg4_version) {
         ret = msmpeg4_decode_picture_header(s);
     } else if (s->h263_pred) {
         if(s->avctx->extradata_size && s->picture_number==0){
@@ -430,25 +434,23 @@ retry:
 
         if(s->flags& CODEC_FLAG_LOW_DELAY)
             s->low_delay=1;
-
-        s->has_b_frames= !s->low_delay;
     } else if (s->h263_intel) {
         ret = intel_h263_decode_picture_header(s);
     } else {
         ret = h263_decode_picture_header(s);
     }
-    avctx->has_b_frames= s->has_b_frames;
+    avctx->has_b_frames= !s->low_delay;
 
     if(s->workaround_bugs&FF_BUG_AUTODETECT){
         if(s->avctx->fourcc == ff_get_fourcc("XVIX")) 
             s->workaround_bugs|= FF_BUG_XVID_ILACE;
-
+#if 0
         if(s->avctx->fourcc == ff_get_fourcc("MP4S")) 
             s->workaround_bugs|= FF_BUG_AC_VLC;
         
         if(s->avctx->fourcc == ff_get_fourcc("M4S2")) 
             s->workaround_bugs|= FF_BUG_AC_VLC;
-                
+#endif
         if(s->avctx->fourcc == ff_get_fourcc("UMP4")){
             s->workaround_bugs|= FF_BUG_UMP4;
             s->workaround_bugs|= FF_BUG_AC_VLC;
@@ -464,6 +466,9 @@ retry:
         if(s->avctx->fourcc == ff_get_fourcc("XVID") && s->xvid_build==0)
             s->padding_bug_score= 256*256*256*64;
         
+        if(s->xvid_build && s->xvid_build<=3)
+            s->padding_bug_score= 256*256*256*64;
+        
         if(s->xvid_build && s->xvid_build<=1)
             s->workaround_bugs|= FF_BUG_QPEL_CHROMA;
 
@@ -531,7 +536,7 @@ retry:
     s->current_picture.key_frame= s->pict_type == I_TYPE;
 
     /* skip b frames if we dont have reference frames */
-    if(s->num_available_buffers<2 && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
+    if(s->last_picture.data[0]==NULL && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
     /* skip b frames if we are in a hurry */
     if(avctx->hurry_up && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
     /* skip everything if we are in a hurry>=5 */
@@ -635,7 +640,6 @@ retry:
         }
         if(num_end_markers || error){
             fprintf(stderr, "concealing errors\n");
-//printf("type:%d\n", s->pict_type);
             ff_error_resilience(s);
         }
     }
@@ -676,20 +680,33 @@ retry:
 
 }
 #endif
-    if(s->pict_type==B_TYPE || (!s->has_b_frames)){
-        *pict= *(AVVideoFrame*)&s->current_picture;
+
+    if(s->pict_type==B_TYPE || s->low_delay){
+        *pict= *(AVFrame*)&s->current_picture;
     } else {
-        *pict= *(AVVideoFrame*)&s->last_picture;
+        *pict= *(AVFrame*)&s->last_picture;
+    }
+
+    if(avctx->debug&FF_DEBUG_QP){
+        int8_t *qtab= pict->qscale_table;
+        int x,y;
+        
+        for(y=0; y<s->mb_height; y++){
+            for(x=0; x<s->mb_width; x++){
+                printf("%2d ", qtab[x + y*s->mb_width]);
+            }
+            printf("\n");
+        }
+        printf("\n");
     }
 
     /* Return the Picture timestamp as the frame number */
     /* we substract 1 because it is added on utils.c    */
     avctx->frame_number = s->picture_number - 1;
 
-    /* dont output the last pic after seeking 
-       note we allready added +1 for the current pix in MPV_frame_end(s) */
-    if(s->num_available_buffers>=2 || (!s->has_b_frames))
-        *data_size = sizeof(AVVideoFrame);
+    /* dont output the last pic after seeking */
+    if(s->last_picture.data[0] || s->low_delay)
+        *data_size = sizeof(AVFrame);
 #ifdef PRINT_FRAME_TIME
 printf("%Ld\n", rdtsc()-time);
 #endif
@@ -701,10 +718,10 @@ AVCodec mpeg4_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_MPEG4,
     sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
 };
 
@@ -713,10 +730,10 @@ AVCodec h263_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_H263,
     sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
 };
 
@@ -725,10 +742,10 @@ AVCodec msmpeg4v1_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_MSMPEG4V1,
     sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
 };
 
@@ -737,10 +754,10 @@ AVCodec msmpeg4v2_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_MSMPEG4V2,
     sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
 };
 
@@ -749,10 +766,10 @@ AVCodec msmpeg4v3_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_MSMPEG4V3,
     sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
 };
 
@@ -761,22 +778,10 @@ AVCodec wmv1_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_WMV1,
     sizeof(MpegEncContext),
-    h263_decode_init,
-    NULL,
-    h263_decode_end,
-    h263_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
-};
-
-AVCodec wmv2_decoder = {
-    "wmv2",
-    CODEC_TYPE_VIDEO,
-    CODEC_ID_WMV2,
-    sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
 };
 
@@ -785,10 +790,10 @@ AVCodec h263i_decoder = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_H263I,
     sizeof(MpegEncContext),
-    h263_decode_init,
+    ff_h263_decode_init,
     NULL,
-    h263_decode_end,
-    h263_decode_frame,
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
 };
 
diff --git a/src/libffmpeg/libavcodec/huffyuv.c b/src/libffmpeg/libavcodec/huffyuv.c
new file mode 100644
index 000000000..0eb701037
--- /dev/null
+++ b/src/libffmpeg/libavcodec/huffyuv.c
@@ -0,0 +1,1101 @@
+/*
+ * huffyuv codec for libavcodec
+ *
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * see http://www.pcisys.net/~melanson/codecs/huffyuv.txt for a description of
+ * the algorithm used 
+ */
+
+#include "common.h"
+#include "avcodec.h"
+#include "dsputil.h"
+
+#ifndef INT64_MAX
+#define INT64_MAX 9223372036854775807LL
+#endif
+
+#define VLC_BITS 11
+
+typedef enum Predictor{ /* prediction mode; decode_init takes it from extradata[0]&63 or from bits_per_sample&7 */
+    LEFT= 0,   /* rows are rebuilt with add_left_prediction(): running sum of the residuals */
+    PLANE,     /* decoded like LEFT, after which decode_frame also calls dsp.add_bytes() with the row fake_stride above */
+    MEDIAN,    /* mid_pred() of left, top and (left + top - topleft), see add_median_prediction() */
+} Predictor;
+ 
+typedef struct HYuvContext{ /* private codec state, reached through avctx->priv_data */
+    AVCodecContext *avctx;                  // back pointer, set by decode_init/encode_init
+    Predictor predictor;
+    GetBitContext gb;                       // bit reader; decode_frame points it at bitstream_buffer
+    PutBitContext pb;                       // bit writer used by the encode_*_bitstream helpers
+    int interlaced;                         // both init functions set this to (height > 288)
+    int decorrelate;                        // when set, decode_bgr_bitstream adds the table 1 value to the other two
+    int bitstream_bpp;                      // decode_init maps 12, 16, 24 and 32 to a pix_fmt
+    int version;                            // 0, 1 or 2; only version 2 reads huffman tables from extradata
+    int yuy2;                               //use yuy2 instead of 422P
+    int bgr32;                              //use bgr32 instead of bgr24
+    int width, height;
+    int flags;                              // copy of avctx->flags
+    int picture_number;
+    int last_slice_end;                     // first row not yet handed to draw_horiz_band, see draw_slice()
+    uint8_t __align8 temp[3][2500];         // scratch rows filled and consumed by the *_bitstream helpers
+    uint64_t stats[3][256];                 // per-table symbol counts, incremented under CODEC_FLAG_PASS1
+    uint8_t len[3][256];                    // code length of every symbol, per table
+    uint32_t bits[3][256];                  // code bits of every symbol, per table
+    VLC vlc[3];                             // decoding tables filled by init_vlc()
+    AVFrame picture;                        // avctx->coded_frame is pointed at this
+    uint8_t __align8 bitstream_buffer[1024*1024*3]; //FIXME dynamic alloc or some other solution
+    DSPContext dsp; 
+}HYuvContext;
+
+/**
+ * Byte-swap w 32-bit words from src into dst.
+ *
+ * Every word goes through bswap_32() on its own, in ascending index order,
+ * so dst[n] depends only on src[n].
+ *
+ * @param dst destination array, at least w words
+ * @param src source array, at least w words
+ * @param w   number of 32-bit words to convert; nothing happens when w <= 0
+ */
+static inline void bswap_buf(uint32_t *dst, uint32_t *src, int w){
+    int n;
+
+    for(n=0; n<w; n++){
+        dst[n]= bswap_32(src[n]);
+    }
+}
+
+/**
+ * Undo left prediction on one row: dst[i] receives acc plus the sum of
+ * src[0..i], truncated to 8 bits by the store.
+ *
+ * @param dst output samples, w bytes
+ * @param src residuals, w bytes
+ * @param w   number of samples; nothing is written when w <= 0
+ * @param acc value to the left of the first sample
+ * @return the running sum after the last sample (not masked to 8 bits)
+ */
+static inline int add_left_prediction(uint8_t *dst, uint8_t *src, int w, int acc){
+    int pos;
+    for(pos=0; pos<w; pos++){
+        acc+= src[pos];
+        dst[pos]= acc;
+    }
+    return acc;
+}
+
+/* Undo median prediction on one row: dst[i] = mid_pred(left, top, (left + top - topleft) & 0xFF)
+ * + diff[i], kept to 8 bits, where src1 supplies the "top" samples.  *left and *left_top
+ * seed the predictor and receive its final 8-bit state so the row can be continued. */
+static inline void add_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
+    uint8_t cur= *left;
+    uint8_t topleft= *left_top;
+    int x;
+    for(x=0; x<w; x++){
+        const uint8_t top= src1[x];
+        cur= mid_pred(cur, top, (cur + top - topleft)&0xFF) + diff[x];
+        topleft= top;
+        dst[x]= cur;
+    }
+    *left= cur;
+    *left_top= topleft;
+}
+//FIXME optimize
+/* Median-predict one row for encoding: dst[i] = src2[i] - mid_pred(left, top, (left + top - topleft) & 0xFF),
+ * stored as 8 bits.  src1 holds the "top" row and src2 the row being coded; *left and
+ * *left_top seed the predictor and end up holding the last src2 / src1 byte processed. */
+static inline void sub_median_prediction(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
+    uint8_t prev= *left;
+    uint8_t topleft= *left_top;
+    int x;
+    for(x=0; x<w; x++){
+        const uint8_t top= src1[x];
+        const int guess= mid_pred(prev, top, (prev + top - topleft)&0xFF);
+        topleft= top;
+        prev= src2[x];
+        dst[x]= prev - guess;
+    }
+    *left= prev;
+    *left_top= topleft;
+}
+
+
+/* Undo left prediction on a row of w 4-byte pixels: bytes 0, 1 and 2 of each
+ * pixel (blue, green, red) become running sums seeded from *blue, *green and
+ * *red; byte 3 is neither read nor written.  The final sums are stored back
+ * through the three pointers as int, not masked to 8 bits. */
+static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w, int *red, int *green, int *blue){
+    int acc_r= *red, acc_g= *green, acc_b= *blue;
+    int px;
+    for(px=0; px<w; px++){
+        const uint8_t *in= src + 4*px;
+        uint8_t *out= dst + 4*px;
+        acc_b+= in[0];
+        acc_g+= in[1];
+        acc_r+= in[2];
+        out[0]= acc_b;
+        out[1]= acc_g;
+        out[2]= acc_r;
+    }
+    *red= acc_r;
+    *green= acc_g;
+    *blue= acc_b;
+}
+
+/* Left-predict one row for encoding: dst[i] = src[i] - previous sample, with
+ * `left` standing in for the sample before src[0].  Rows of 32 or more samples
+ * do the first 16 here and pass the remainder to dsp.diff_bytes().
+ * Returns the last sample of the row, or `left` unchanged when w <= 0. */
+static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst, uint8_t *src, int w, int left){
+    const int use_dsp= w >= 32;
+    const int head= use_dsp ? 16 : w;
+    int k;
+    for(k=0; k<head; k++){
+        const int cur= src[k];
+        dst[k]= cur - left;
+        left= cur;
+    }
+    if(!use_dsp)
+        return left;
+
+    s->dsp.diff_bytes(dst+16, src+16, src+15, w-16);
+    return src[w-1];
+}
+
+/* Read 256 code lengths coded as runs (3 bit repeat, 5 bit length; a repeat of 0 means an
+ * 8 bit repeat follows).  Runs are clipped at entry 255 so a bad table cannot overflow dst. */
+static void read_len_table(uint8_t *dst, GetBitContext *gb){
+    int i, val, repeat;
+    for(i=0; i<256;){
+        repeat= get_bits(gb, 3);
+        val   = get_bits(gb, 5);
+        if(repeat==0)
+            repeat= get_bits(gb, 8);
+        while (repeat-- && i<256)
+            dst[i++] = val;
+    }
+}
+
+/* Assign code bits to 256 symbols from their lengths, longest first (length 0 gets no code); returns 0, or -1 if the lengths are inconsistent. */
+static int generate_bits_table(uint32_t *dst, uint8_t *len_table){
+    int len, index;
+    uint32_t bits=0;                                   /* next free code, left-aligned in 32 bits */
+    for(len=32; len>0; len--){
+        const uint32_t bit= (uint32_t)1<<(32-len);     /* unsigned: 1<<31 overflows int when len==1 */
+        for(index=0; index<256; index++){
+            if(len_table[index]==len){
+                if(bits & (bit-1)){                    /* next code is not aligned to this length */
+                    fprintf(stderr, "Error generating huffman table\n");
+                    return -1;
+                }
+                dst[index]= bits>>(32-len);
+                bits+= bit;
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Derive huffman code lengths for size symbols from their counts in stats.
+ * The two smallest live counts are merged repeatedly; dst[i] is the number of
+ * merges above symbol i.  When any length would reach 32 the counts are
+ * flattened (stats[i] + offset - 1, with offset doubling) and the tree is rebuilt,
+ * because store_table()/read_len_table() carry a length in only 5 bits.
+ */
+static void generate_len_table(uint8_t *dst, uint64_t *stats, int size){
+    uint64_t counts[2*size];               /* leaves first, merged nodes after them */
+    int up[2*size];                        /* parent index of each node, -1 at the root */
+    int offset, i, next;
+
+    for(offset=1; ; offset<<=1){
+        for(i=0; i<size; i++){
+            counts[i]= stats[i] + offset - 1;
+        }
+        for(next=size; next<size*2; next++){
+            uint64_t min1, min2;
+            int min1_i, min2_i;
+
+            min1=min2= INT64_MAX;
+            min1_i= min2_i=-1;
+            /* pick the two smallest live nodes; INT64_MAX marks nodes already merged */
+            for(i=0; i<next; i++){
+                if(min2 > counts[i]){
+                    if(min1 > counts[i]){
+                        min2= min1;
+                        min2_i= min1_i;
+                        min1= counts[i];
+                        min1_i= i;
+                    }else{
+                        min2= counts[i];
+                        min2_i= i;
+                    }
+                }
+            }
+            if(min2==INT64_MAX) break;     /* fewer than two live nodes: the tree is complete */
+            counts[next]= min1 + min2;
+            counts[min1_i]=
+            counts[min2_i]= INT64_MAX;
+            up[min1_i]=
+            up[min2_i]= next;
+            up[next]= -1;
+        }
+        for(i=0; i<size; i++){
+            int len;
+            int index=i;
+            for(len=0; up[index] != -1; len++)
+                index= up[index];
+            if(len >= 32) break;           /* would not fit the 5 bit length field: retry flatter */
+            dst[i]= len;
+        }
+        if(i==size) break;
+    }
+}
+
+/* Read the three huffman tables (one per entry of s->len / s->bits / s->vlc) from the
+ * `length` bytes at src: code lengths come from read_len_table(), code bits from
+ * generate_bits_table(), and the decoding table is then built with init_vlc().
+ * Returns 0 on success, -1 if generate_bits_table() rejects a set of lengths. */
+static int read_huffman_tables(HYuvContext *s, uint8_t *src, int length){
+    GetBitContext gb;
+    int i;
+    init_get_bits(&gb, src, length);
+    for(i=0; i<3; i++){
+        read_len_table(s->len[i], &gb);
+        if(generate_bits_table(s->bits[i], s->len[i])<0){
+            return -1;
+        }
+#if 0
+for(j=0; j<256; j++){
+printf("%6X, %2d,  %3d\n", s->bits[i][j], s->len[i][j], j);
+}
+#endif
+        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1, s->bits[i], 4, 4);
+    }
+    return 0;
+}
+
+/* Called by decode_init when the stream version is not 2.  The code that would load the
+ * built-in "classic" tables is disabled (#if 0), so this only prints an error and returns -1. */
+static int read_old_huffman_tables(HYuvContext *s){
+#if 0    
+    GetBitContext gb;
+    int i;
+
+    init_get_bits(&gb, classic_shift_luma, sizeof(classic_shift_luma));
+    read_len_table(s->len[0], &gb);
+    init_get_bits(&gb, classic_shift_chroma, sizeof(classic_shift_chroma));
+    read_len_table(s->len[1], &gb);
+    for(i=0; i<256; i++) s->bits[0][i] = classic_add_luma  [i];
+    for(i=0; i<256; i++) s->bits[1][i] = classic_add_chroma[i];
+
+    if(s->bitstream_bpp >= 24){
+        memcpy(s->bits[1], s->bits[0], 256*sizeof(uint32_t));
+        memcpy(s->len[1] , s->len [0], 256*sizeof(uint8_t));
+    }
+    memcpy(s->bits[2], s->bits[1], 256*sizeof(uint32_t));
+    memcpy(s->len[2] , s->len [1], 256*sizeof(uint8_t));
+    for(i=0; i<3; i++)
+        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1, s->bits[i], 4, 4);
+    
+    return 0;
+#else
+    fprintf(stderr, "v1 huffyuv is not supported \n");
+    return -1;
+#endif
+}
+
+/* Decoder init: derive version, predictor, decorrelation and bits per pixel from
+ * extradata / bits_per_sample, load the huffman tables and choose the output pix_fmt.
+ * Version 2 extradata is a 4 byte header (method, bpp, 2 unused bytes) followed by the
+ * tables.  Returns 0 on success, -1 if extradata is too short or the tables fail. */
+static int decode_init(AVCodecContext *avctx)
+{
+    HYuvContext *s = avctx->priv_data;
+    int width, height;
+
+    s->avctx= avctx;
+    s->flags= avctx->flags;
+
+    dsputil_init(&s->dsp, avctx->dsp_mask);
+    width= s->width= avctx->width;
+    height= s->height= avctx->height;
+    avctx->coded_frame= &s->picture;
+    s->bgr32=1; /* 24/32 bpp streams are therefore always output as PIX_FMT_BGRA32 below */
+    assert(width && height);
+
+    if(avctx->extradata_size){
+        if((avctx->bits_per_sample&7) && avctx->bits_per_sample != 12)
+            s->version=1; // do such files exist at all?
+        else
+            s->version=2;
+    }else
+        s->version=0;
+
+    if(s->version==2){
+        int method;
+        /* bytes 0 and 1 are read below and the tables start at byte 4 */
+        if(avctx->extradata_size < 4)
+            return -1;
+        method= ((uint8_t*)avctx->extradata)[0];
+        s->decorrelate= method&64 ? 1 : 0;
+        s->predictor= method&63;
+        s->bitstream_bpp= ((uint8_t*)avctx->extradata)[1];
+        if(s->bitstream_bpp==0)
+            s->bitstream_bpp= avctx->bits_per_sample&~7;
+        /* only extradata_size-4 bytes follow the header; passing the full size over-reads */
+        if(read_huffman_tables(s, ((uint8_t*)avctx->extradata)+4, avctx->extradata_size-4) < 0)
+            return -1;
+    }else{
+        switch(avctx->bits_per_sample&7){
+        case 1:
+            s->predictor= LEFT;
+            s->decorrelate= 0;
+            break;
+        case 2:
+            s->predictor= LEFT;
+            s->decorrelate= 1;
+            break;
+        case 3:
+            s->predictor= PLANE;
+            s->decorrelate= avctx->bits_per_sample >= 24;
+            break;
+        case 4:
+            s->predictor= MEDIAN;
+            s->decorrelate= 0;
+            break;
+        default:
+            s->predictor= LEFT; //OLD
+            s->decorrelate= 0;
+            break;
+        }
+        s->bitstream_bpp= avctx->bits_per_sample & ~7;
+        if(read_old_huffman_tables(s) < 0)
+            return -1;
+    }
+
+    s->interlaced= height > 288;
+
+    switch(s->bitstream_bpp){
+    case 12:
+        avctx->pix_fmt = PIX_FMT_YUV420P;
+        break;
+    case 16:
+        if(s->yuy2){
+            avctx->pix_fmt = PIX_FMT_YUV422;
+        }else{
+            avctx->pix_fmt = PIX_FMT_YUV422P;
+        }
+        break;
+    case 24:
+    case 32:
+        if(s->bgr32){
+            avctx->pix_fmt = PIX_FMT_BGRA32;
+        }else{
+            avctx->pix_fmt = PIX_FMT_BGR24;
+        }
+        break;
+    default:
+        assert(0);
+    }
+
+    return 0;
+}
+
+/* Append the 256 code lengths in len to avctx->extradata as runs, in the layout that
+ * read_len_table() parses: one byte val|(repeat<<5) for runs of up to 7, otherwise the
+ * length byte followed by a repeat byte.  Runs are capped at 255 because a run of 256
+ * would wrap to 0 in the repeat byte.  avctx->extradata_size is advanced past the table. */
+static void store_table(HYuvContext *s, uint8_t *len){
+    uint8_t *out= (uint8_t*)s->avctx->extradata;
+    int index= s->avctx->extradata_size;
+    int i;
+
+    for(i=0; i<256;){
+        const int val= len[i];
+        int repeat= 0;
+        for(; i<256 && len[i]==val && repeat<255; i++)
+            repeat++;
+
+        if(repeat>7){
+            out[index++]= val;
+            out[index++]= repeat;
+        }else{
+            out[index++]= val | (repeat<<5);
+        }
+    }
+    s->avctx->extradata_size= index;
+}
+
+/* Encoder init: allocate extradata and stats_out, write the 4 byte extradata header
+ * (predictor, bits per pixel, two zero bytes), build the three huffman tables from
+ * avctx->stats_in (or from a built-in distribution) and append them with store_table().
+ * Returns 0 on success, -1 on allocation failure, unsupported pix_fmt or bad stats. */
+static int encode_init(AVCodecContext *avctx)
+{
+    HYuvContext *s = avctx->priv_data;
+    int i, j, width, height;
+
+    s->avctx= avctx;
+    s->flags= avctx->flags;
+    dsputil_init(&s->dsp, avctx->dsp_mask);
+
+    width= s->width= avctx->width;
+    height= s->height= avctx->height;
+    assert(width && height);
+
+    avctx->extradata= av_mallocz(1024*10);
+    avctx->stats_out= av_mallocz(1024*10);
+    if(!avctx->extradata || !avctx->stats_out)
+        return -1; /* extradata is written to below, so it must not be NULL */
+    s->version=2;
+
+    avctx->coded_frame= &s->picture;
+    s->picture.pict_type= FF_I_TYPE;
+    s->picture.key_frame= 1;
+
+    switch(avctx->pix_fmt){
+    case PIX_FMT_YUV420P:
+        if(avctx->strict_std_compliance>=0){
+            fprintf(stderr, "YV12-huffyuv is experimental, there WILL be no compatbility! (use (v)strict=-1)\n");
+            return -1;
+        }
+        s->bitstream_bpp= 12;
+        break;
+    case PIX_FMT_YUV422P:
+        s->bitstream_bpp= 16;
+        break;
+    default:
+        fprintf(stderr, "format not supported\n");
+        return -1;
+    }
+    avctx->bits_per_sample= s->bitstream_bpp;
+    s->decorrelate= s->bitstream_bpp >= 24;
+    s->predictor= avctx->prediction_method;
+
+    ((uint8_t*)avctx->extradata)[0]= s->predictor;
+    ((uint8_t*)avctx->extradata)[1]= s->bitstream_bpp;
+    ((uint8_t*)avctx->extradata)[2]=
+    ((uint8_t*)avctx->extradata)[3]= 0;
+    s->avctx->extradata_size= 4;
+
+    if(avctx->stats_in){
+        char *p= avctx->stats_in;
+        /* start every count at 1, then add each parsed set of 3*256 numbers */
+        for(i=0; i<3; i++)
+            for(j=0; j<256; j++)
+                s->stats[i][j]= 1;
+
+        for(;;){
+            for(i=0; i<3; i++){
+                char *next;
+                for(j=0; j<256; j++){
+                    s->stats[i][j]+= strtol(p, &next, 0);
+                    if(next==p) return -1;
+                    p=next;
+                }
+            }
+            if(p[0]==0 || p[1]==0 || p[2]==0) break;
+        }
+    }else{
+        /* no statistics given: weight each symbol by 1/(distance from 0 modulo 256, plus 1) */
+        for(i=0; i<3; i++)
+            for(j=0; j<256; j++){
+                int d= FFMIN(j, 256-j);
+                s->stats[i][j]= 100000000/(d+1);
+            }
+    }
+
+    for(i=0; i<3; i++){
+        generate_len_table(s->len[i], s->stats[i], 256);
+        if(generate_bits_table(s->bits[i], s->len[i])<0){
+            return -1;
+        }
+
+        store_table(s, s->len[i]);
+    }
+
+    /* reset the counters; encode_*_bitstream increments them under CODEC_FLAG_PASS1 */
+    for(i=0; i<3; i++)
+        for(j=0; j<256; j++)
+            s->stats[i][j]= 0;
+
+    s->interlaced= height > 288;
+    s->picture_number=0;
+
+    return 0;
+}
+
+/* Decode count/2 groups of four codes into s->temp, read in the order table 0, 1, 0, 2:
+ * two bytes go to temp[0] and one each to temp[1] and temp[2] per group. */
+static void decode_422_bitstream(HYuvContext *s, int count){
+    uint8_t *y= s->temp[0], *u= s->temp[1], *v= s->temp[2];
+    int pairs;
+    for(pairs= count/2; pairs>0; pairs--){
+        *y++= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+        *u++= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
+        *y++= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+        *v++= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+    }
+}
+
+/* Decode count (rounded down to even) codes with table 0 only, storing them
+ * consecutively in s->temp[0]. */
+static void decode_gray_bitstream(HYuvContext *s, int count){
+    uint8_t *y= s->temp[0];
+    int pairs;
+    for(pairs= count/2; pairs>0; pairs--){
+        *y++= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+        *y++= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+    }
+}
+
+/* Code one row from s->temp as count/2 groups in table order 0, 1, 0, 2: count the symbols in s->stats when CODEC_FLAG_PASS1 is set, otherwise write their codes to s->pb. */
+static void encode_422_bitstream(HYuvContext *s, int count){
+    const int pairs= count/2;
+    int n;
+    if(s->flags&CODEC_FLAG_PASS1){
+        for(n=0; n<pairs; n++){
+            s->stats[0][ s->temp[0][2*n  ] ]++;
+            s->stats[1][ s->temp[1][  n  ] ]++;
+            s->stats[0][ s->temp[0][2*n+1] ]++;
+            s->stats[2][ s->temp[2][  n  ] ]++;
+        }
+        return;
+    }
+    for(n=0; n<pairs; n++){
+        put_bits(&s->pb, s->len[0][ s->temp[0][2*n  ] ], s->bits[0][ s->temp[0][2*n  ] ]);
+        put_bits(&s->pb, s->len[1][ s->temp[1][  n  ] ], s->bits[1][ s->temp[1][  n  ] ]);
+        put_bits(&s->pb, s->len[0][ s->temp[0][2*n+1] ], s->bits[0][ s->temp[0][2*n+1] ]);
+        put_bits(&s->pb, s->len[2][ s->temp[2][  n  ] ], s->bits[2][ s->temp[2][  n  ] ]);
+    }
+}
+
+/* Table-0-only counterpart of encode_422_bitstream: handles count (rounded down to even)
+ * symbols from s->temp[0], counting them under CODEC_FLAG_PASS1 or writing their codes. */
+static void encode_gray_bitstream(HYuvContext *s, int count){
+    const int total= 2*(count/2);
+    int n;
+
+    for(n=0; n<total; n++){
+        const int sym= s->temp[0][n];
+
+        if(s->flags&CODEC_FLAG_PASS1){
+            s->stats[0][sym]++;
+        }else{
+            put_bits(&s->pb, s->len[0][sym], s->bits[0][sym]);
+        }
+    }
+}
+
+/**
+ * Decode count pixels of an RGB stream into s->temp[0], 4 bytes per pixel.
+ * Bytes 0, 1 and 2 of each pixel are written from tables 0, 1 and 2; byte 3
+ * is not written.
+ *
+ * With s->decorrelate the table 1 value is read first and then added to the
+ * table 0 and table 2 values.  Unless s->bitstream_bpp is 24, one more code
+ * is read from table 2 after every pixel and thrown away.
+ *
+ * @param s     codec context (bit reader, VLC tables, temp buffer)
+ * @param count number of pixels to decode
+ */
+static void decode_bgr_bitstream(HYuvContext *s, int count){
+    const int skip_fourth= s->bitstream_bpp!=24;
+    int n;
+
+    for(n=0; n<count; n++){
+        uint8_t * const px= &s->temp[0][4*n];
+
+        if(s->decorrelate){
+            /* byte 1 comes first in the stream; the other two are coded relative to it */
+            px[1]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
+            px[0]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) + px[1];
+            px[2]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) + px[1];
+        }else{
+            px[0]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+            px[1]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3);
+            px[2]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3);
+        }
+
+        /* consume and drop the fourth code of each pixel */
+        if(skip_fourth)
+            get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); //?!
+    }
+}
+
+/* Pass the rows [s->last_slice_end, y) of s->picture to avctx->draw_horiz_band and
+ * record y as the new slice end.  Nothing happens, and last_slice_end is left
+ * untouched, when no callback is set.
+ *
+ * For 12 bpp streams the plane 1 and 2 pointers use half the row index; otherwise
+ * they use the same row index as plane 0. */
+static void draw_slice(HYuvContext *s, int y){
+    const int first= s->last_slice_end;
+    const int rows= y - first;
+    const int chroma_row= s->bitstream_bpp==12 ? first>>1 : first;
+    AVFrame * const pic= &s->picture;
+    UINT8 *planes[3];
+
+    if(!s->avctx->draw_horiz_band)
+        return;
+
+    planes[0]= pic->data[0] + pic->linesize[0]*first;
+    planes[1]= pic->data[1] + pic->linesize[1]*chroma_row;
+    planes[2]= pic->data[2] + pic->linesize[2]*chroma_row;
+    emms_c();
+
+    s->avctx->draw_horiz_band(s->avctx, planes, pic->linesize[0], first, s->width, rows);
+
+    s->last_slice_end= first + rows;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
+    HYuvContext *s = avctx->priv_data;
+    const int width= s->width;
+    const int width2= s->width>>1;
+    const int height= s->height;
+    int fake_ystride, fake_ustride, fake_vstride;
+    AVFrame * const p= &s->picture;
+
+    AVFrame *picture = data;
+
+    *data_size = 0;
+
+    /* no supplementary picture */
+    if (buf_size == 0)
+        return 0;
+
+    bswap_buf((uint32_t*)s->bitstream_buffer, (uint32_t*)buf, buf_size/4);
+    
+    init_get_bits(&s->gb, s->bitstream_buffer, buf_size);
+
+    p->reference= 0;
+    if(avctx->get_buffer(avctx, p) < 0){
+        fprintf(stderr, "get_buffer() failed\n");
+        return -1;
+    }
+
+    fake_ystride= s->interlaced ? p->linesize[0]*2  : p->linesize[0];
+    fake_ustride= s->interlaced ? p->linesize[1]*2  : p->linesize[1];
+    fake_vstride= s->interlaced ? p->linesize[2]*2  : p->linesize[2];
+    
+    s->last_slice_end= 0;
+        
+    if(s->bitstream_bpp<24){
+        int y, cy;
+        int lefty, leftu, leftv;
+        int lefttopy, lefttopu, lefttopv;
+        
+        if(s->yuy2){
+            p->data[0][3]= get_bits(&s->gb, 8);
+            p->data[0][2]= get_bits(&s->gb, 8);
+            p->data[0][1]= get_bits(&s->gb, 8);
+            p->data[0][0]= get_bits(&s->gb, 8);
+            
+            fprintf(stderr, "YUY2 output isnt implemenetd yet\n");
+            return -1;
+        }else{
+        
+            leftv= p->data[2][0]= get_bits(&s->gb, 8);
+            lefty= p->data[0][1]= get_bits(&s->gb, 8);
+            leftu= p->data[1][0]= get_bits(&s->gb, 8);
+                   p->data[0][0]= get_bits(&s->gb, 8);
+        
+            switch(s->predictor){
+            case LEFT:
+            case PLANE:
+                decode_422_bitstream(s, width-2);
+                lefty= add_left_prediction(p->data[0] + 2, s->temp[0], width-2, lefty);
+                if(!(s->flags&CODEC_FLAG_GRAY)){
+                    leftu= add_left_prediction(p->data[1] + 1, s->temp[1], width2-1, leftu);
+                    leftv= add_left_prediction(p->data[2] + 1, s->temp[2], width2-1, leftv);
+                }
+
+                for(cy=y=1; y<s->height; y++,cy++){
+                    uint8_t *ydst, *udst, *vdst;
+                    
+                    if(s->bitstream_bpp==12){
+                        decode_gray_bitstream(s, width);
+                    
+                        ydst= p->data[0] + p->linesize[0]*y;
+
+                        lefty= add_left_prediction(ydst, s->temp[0], width, lefty);
+                        if(s->predictor == PLANE){
+                            if(y>s->interlaced)
+                                s->dsp.add_bytes(ydst, ydst - fake_ystride, width);
+                        }
+                        y++;
+                        if(y>=s->height) break;
+                    }
+                    
+                    draw_slice(s, y);
+                    
+                    ydst= p->data[0] + p->linesize[0]*y;
+                    udst= p->data[1] + p->linesize[1]*cy;
+                    vdst= p->data[2] + p->linesize[2]*cy;
+                    
+                    decode_422_bitstream(s, width);
+                    lefty= add_left_prediction(ydst, s->temp[0], width, lefty);
+                    if(!(s->flags&CODEC_FLAG_GRAY)){
+                        leftu= add_left_prediction(udst, s->temp[1], width2, leftu);
+                        leftv= add_left_prediction(vdst, s->temp[2], width2, leftv);
+                    }
+                    if(s->predictor == PLANE){
+                        if(cy>s->interlaced){
+                            s->dsp.add_bytes(ydst, ydst - fake_ystride, width);
+                            if(!(s->flags&CODEC_FLAG_GRAY)){
+                                s->dsp.add_bytes(udst, udst - fake_ustride, width2);
+                                s->dsp.add_bytes(vdst, vdst - fake_vstride, width2);
+                            }
+                        }
+                    }
+                }
+                draw_slice(s, height);
+                
+                break;
+            case MEDIAN:
+                /* first line except first 2 pixels is left predicted */
+                decode_422_bitstream(s, width-2);
+                lefty= add_left_prediction(p->data[0] + 2, s->temp[0], width-2, lefty);
+                if(!(s->flags&CODEC_FLAG_GRAY)){
+                    leftu= add_left_prediction(p->data[1] + 1, s->temp[1], width2-1, leftu);
+                    leftv= add_left_prediction(p->data[2] + 1, s->temp[2], width2-1, leftv);
+                }
+                
+                cy=y=1;
+                
+                /* second line is left predicted for interlaced case */
+                if(s->interlaced){
+                    decode_422_bitstream(s, width);
+                    lefty= add_left_prediction(p->data[0] + p->linesize[0], s->temp[0], width, lefty);
+                    if(!(s->flags&CODEC_FLAG_GRAY)){
+                        leftu= add_left_prediction(p->data[1] + p->linesize[2], s->temp[1], width2, leftu);
+                        leftv= add_left_prediction(p->data[2] + p->linesize[1], s->temp[2], width2, leftv);
+                    }
+                    y++; cy++;
+                }
+
+                /* next 4 pixels are left predicted too */
+                decode_422_bitstream(s, 4);
+                lefty= add_left_prediction(p->data[0] + fake_ystride, s->temp[0], 4, lefty);
+                if(!(s->flags&CODEC_FLAG_GRAY)){
+                    leftu= add_left_prediction(p->data[1] + fake_ustride, s->temp[1], 2, leftu);
+                    leftv= add_left_prediction(p->data[2] + fake_vstride, s->temp[2], 2, leftv);
+                }
+
+                /* next line except the first 4 pixels is median predicted */
+                lefttopy= p->data[0][3];
+                decode_422_bitstream(s, width-4);
+                add_median_prediction(p->data[0] + fake_ystride+4, p->data[0]+4, s->temp[0], width-4, &lefty, &lefttopy);
+                if(!(s->flags&CODEC_FLAG_GRAY)){
+                    lefttopu= p->data[1][1];
+                    lefttopv= p->data[2][1];
+                    add_median_prediction(p->data[1] + fake_ustride+2, p->data[1]+2, s->temp[1], width2-2, &leftu, &lefttopu);
+                    add_median_prediction(p->data[2] + fake_vstride+2, p->data[2]+2, s->temp[2], width2-2, &leftv, &lefttopv);
+                }
+                y++; cy++;
+                
+                for(; y<height; y++,cy++){
+                    uint8_t *ydst, *udst, *vdst;
+
+                    if(s->bitstream_bpp==12){
+                        while(2*cy > y){
+                            decode_gray_bitstream(s, width);
+                            ydst= p->data[0] + p->linesize[0]*y;
+                            add_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
+                            y++;
+                        }
+                        if(y>=height) break;
+                    }
+                    draw_slice(s, y);
+
+                    decode_422_bitstream(s, width);
+
+                    ydst= p->data[0] + p->linesize[0]*y;
+                    udst= p->data[1] + p->linesize[1]*cy;
+                    vdst= p->data[2] + p->linesize[2]*cy;
+
+                    add_median_prediction(ydst, ydst - fake_ystride, s->temp[0], width, &lefty, &lefttopy);
+                    if(!(s->flags&CODEC_FLAG_GRAY)){
+                        add_median_prediction(udst, udst - fake_ustride, s->temp[1], width2, &leftu, &lefttopu);
+                        add_median_prediction(vdst, vdst - fake_vstride, s->temp[2], width2, &leftv, &lefttopv);
+                    }
+                }
+
+                draw_slice(s, height);
+                break;
+            }
+        }
+    }else{
+        int y;
+        int leftr, leftg, leftb;
+        const int last_line= (height-1)*p->linesize[0];
+        
+        if(s->bitstream_bpp==32){
+                   p->data[0][last_line+3]= get_bits(&s->gb, 8);
+            leftr= p->data[0][last_line+2]= get_bits(&s->gb, 8);
+            leftg= p->data[0][last_line+1]= get_bits(&s->gb, 8);
+            leftb= p->data[0][last_line+0]= get_bits(&s->gb, 8);
+        }else{
+            leftr= p->data[0][last_line+2]= get_bits(&s->gb, 8);
+            leftg= p->data[0][last_line+1]= get_bits(&s->gb, 8);
+            leftb= p->data[0][last_line+0]= get_bits(&s->gb, 8);
+            skip_bits(&s->gb, 8);
+        }
+        
+        if(s->bgr32){
+            switch(s->predictor){
+            case LEFT:
+            case PLANE:
+                decode_bgr_bitstream(s, width-1);
+                add_left_prediction_bgr32(p->data[0] + last_line+4, s->temp[0], width-1, &leftr, &leftg, &leftb);
+
+                for(y=s->height-2; y>=0; y--){ //yes its stored upside down
+                    decode_bgr_bitstream(s, width);
+                    
+                    add_left_prediction_bgr32(p->data[0] + p->linesize[0]*y, s->temp[0], width, &leftr, &leftg, &leftb);
+                    if(s->predictor == PLANE){
+                        if((y&s->interlaced)==0){
+                            s->dsp.add_bytes(p->data[0] + p->linesize[0]*y, 
+                                             p->data[0] + p->linesize[0]*y + fake_ystride, fake_ystride);
+                        }
+                    }
+                }
+                draw_slice(s, height); // just 1 large slice as this isnt possible in reverse order
+                break;
+            default:
+                fprintf(stderr, "prediction type not supported!\n");
+            }
+        }else{
+
+            fprintf(stderr, "BGR24 output isnt implemenetd yet\n");
+            return -1;
+        }
+    }
+    emms_c();
+    
+    *picture= *p;
+    
+    avctx->release_buffer(avctx, p);
+
+    *data_size = sizeof(AVFrame);
+    
+    return (get_bits_count(&s->gb)+7)>>3;
+}
+
+/**
+ * Decoder teardown.
+ *
+ * Frees the three VLC tables of the context. The picture planes and the
+ * opaque pointer are released only when the codec context still uses
+ * avcodec_default_get_buffer as its get_buffer callback.
+ *
+ * @return always 0
+ */
+static int decode_end(AVCodecContext *avctx)
+{
+    HYuvContext *ctx = avctx->priv_data;
+    int plane;
+
+    free_vlc(&ctx->vlc[0]);
+    free_vlc(&ctx->vlc[1]);
+    free_vlc(&ctx->vlc[2]);
+
+    /* nothing more to release unless the default allocator is installed */
+    if(avctx->get_buffer != avcodec_default_get_buffer)
+        return 0;
+
+    for(plane=0; plane<4; plane++){
+        av_freep(&ctx->picture.base[plane]);
+        ctx->picture.data[plane]= NULL;
+    }
+    av_freep(&ctx->picture.opaque);
+
+    return 0;
+}
+
+/**
+ * Encodes one picture into buf.
+ *
+ * Only PIX_FMT_YUV422P and PIX_FMT_YUV420P are handled; for any other pixel
+ * format a message is printed and no picture data is written.
+ * The first four samples (V0, Y1, U0, Y0) are stored raw, the rest of the
+ * first line is left predicted, and the remaining lines use the predictor
+ * selected in s->predictor (MEDIAN, or LEFT/PLANE in the other branch).
+ *
+ * When CODEC_FLAG_PASS1 is set and the picture number is a multiple of 32,
+ * the symbol statistics are printed into avctx->stats_out and reset, and the
+ * bitstream is neither flushed nor byte swapped; otherwise the bitstream is
+ * flushed and byte swapped in 32 bit units.
+ *
+ * @param avctx    codec context, priv_data is the HYuvContext
+ * @param buf      output buffer
+ * @param buf_size size of buf in bytes
+ * @param data     the AVFrame to encode
+ * @return number of bytes used, rounded up to a multiple of 4
+ */
+static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+    HYuvContext *s = avctx->priv_data;
+    AVFrame *pict = data;
+    const int width= s->width;
+    const int width2= s->width>>1;
+    const int height= s->height;
+    /* for interlaced material the "line above" is two lines up */
+    const int fake_ystride= s->interlaced ? pict->linesize[0]*2  : pict->linesize[0];
+    const int fake_ustride= s->interlaced ? pict->linesize[1]*2  : pict->linesize[1];
+    const int fake_vstride= s->interlaced ? pict->linesize[2]*2  : pict->linesize[2];
+    AVFrame * const p= &s->picture;
+    int i, size;
+
+    init_put_bits(&s->pb, buf, buf_size, NULL, NULL);
+
+    *p = *pict;
+
+    if(avctx->pix_fmt == PIX_FMT_YUV422P || avctx->pix_fmt == PIX_FMT_YUV420P){
+        int lefty, leftu, leftv, y, cy;
+
+        /* the first two luma and first chroma samples are stored raw */
+        put_bits(&s->pb, 8, leftv= p->data[2][0]);
+        put_bits(&s->pb, 8, lefty= p->data[0][1]);
+        put_bits(&s->pb, 8, leftu= p->data[1][0]);
+        put_bits(&s->pb, 8,        p->data[0][0]);
+
+        /* rest of the first line is left predicted */
+        lefty= sub_left_prediction(s, s->temp[0], p->data[0]+2, width-2 , lefty);
+        leftu= sub_left_prediction(s, s->temp[1], p->data[1]+1, width2-1, leftu);
+        leftv= sub_left_prediction(s, s->temp[2], p->data[2]+1, width2-1, leftv);
+
+        encode_422_bitstream(s, width-2);
+
+        if(s->predictor==MEDIAN){
+            int lefttopy, lefttopu, lefttopv;
+            cy=y=1;
+            /* second line is left predicted for the interlaced case */
+            if(s->interlaced){
+                lefty= sub_left_prediction(s, s->temp[0], p->data[0]+p->linesize[0], width , lefty);
+                leftu= sub_left_prediction(s, s->temp[1], p->data[1]+p->linesize[1], width2, leftu);
+                leftv= sub_left_prediction(s, s->temp[2], p->data[2]+p->linesize[2], width2, leftv);
+
+                encode_422_bitstream(s, width);
+                y++; cy++;
+            }
+
+            /* next 4 luma / 2 chroma samples are left predicted too;
+               each plane must be addressed with its own stride (the chroma
+               planes used the luma stride here before, which disagrees with
+               the decoder and with the median calls below) */
+            lefty= sub_left_prediction(s, s->temp[0], p->data[0]+fake_ystride, 4, lefty);
+            leftu= sub_left_prediction(s, s->temp[1], p->data[1]+fake_ustride, 2, leftu);
+            leftv= sub_left_prediction(s, s->temp[2], p->data[2]+fake_vstride, 2, leftv);
+
+            encode_422_bitstream(s, 4);
+
+            /* remainder of that line is median predicted */
+            lefttopy= p->data[0][3];
+            lefttopu= p->data[1][1];
+            lefttopv= p->data[2][1];
+            sub_median_prediction(s->temp[0], p->data[0]+4, p->data[0] + fake_ystride+4, width-4 , &lefty, &lefttopy);
+            sub_median_prediction(s->temp[1], p->data[1]+2, p->data[1] + fake_ustride+2, width2-2, &leftu, &lefttopu);
+            sub_median_prediction(s->temp[2], p->data[2]+2, p->data[2] + fake_vstride+2, width2-2, &leftv, &lefttopv);
+            encode_422_bitstream(s, width-4);
+            y++; cy++;
+
+            for(; y<height; y++,cy++){
+                uint8_t *ydst, *udst, *vdst;
+
+                /* 12 bpp: emit luma only lines until y catches up with 2*cy */
+                if(s->bitstream_bpp==12){
+                    while(2*cy > y){
+                        ydst= p->data[0] + p->linesize[0]*y;
+                        sub_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
+                        encode_gray_bitstream(s, width);
+                        y++;
+                    }
+                    if(y>=height) break;
+                }
+                ydst= p->data[0] + p->linesize[0]*y;
+                udst= p->data[1] + p->linesize[1]*cy;
+                vdst= p->data[2] + p->linesize[2]*cy;
+
+                sub_median_prediction(s->temp[0], ydst - fake_ystride, ydst, width , &lefty, &lefttopy);
+                sub_median_prediction(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
+                sub_median_prediction(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
+
+                encode_422_bitstream(s, width);
+            }
+        }else{
+            for(cy=y=1; y<height; y++,cy++){
+                uint8_t *ydst, *udst, *vdst;
+
+                /* encode a luma only line & y++ */
+                if(s->bitstream_bpp==12){
+                    ydst= p->data[0] + p->linesize[0]*y;
+
+                    if(s->predictor == PLANE && s->interlaced < y){
+                        s->dsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
+
+                        lefty= sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
+                    }else{
+                        lefty= sub_left_prediction(s, s->temp[0], ydst, width , lefty);
+                    }
+                    encode_gray_bitstream(s, width);
+                    y++;
+                    if(y>=height) break;
+                }
+
+                ydst= p->data[0] + p->linesize[0]*y;
+                udst= p->data[1] + p->linesize[1]*cy;
+                vdst= p->data[2] + p->linesize[2]*cy;
+
+                if(s->predictor == PLANE && s->interlaced < cy){
+                    /* vertical differences go to temp[1..3]; the left
+                       prediction then writes temp[0..2], so the order of the
+                       three calls below matters (temp[1], temp[2] are reused) */
+                    s->dsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
+                    s->dsp.diff_bytes(s->temp[2], udst, udst - fake_ustride, width2);
+                    s->dsp.diff_bytes(s->temp[3], vdst, vdst - fake_vstride, width2);
+
+                    lefty= sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
+                    leftu= sub_left_prediction(s, s->temp[1], s->temp[2], width2, leftu);
+                    leftv= sub_left_prediction(s, s->temp[2], s->temp[3], width2, leftv);
+                }else{
+                    lefty= sub_left_prediction(s, s->temp[0], ydst, width , lefty);
+                    leftu= sub_left_prediction(s, s->temp[1], udst, width2, leftu);
+                    leftv= sub_left_prediction(s, s->temp[2], vdst, width2, leftv);
+                }
+
+                encode_422_bitstream(s, width);
+            }
+        }
+    }else{
+        /* NOTE(review): nothing is encoded here and the function still
+           returns a size of 0 rather than an error -- confirm intended */
+        fprintf(stderr, "Format not supported!\n");
+    }
+    emms_c();
+
+    /* size in 32 bit words, rounded up */
+    size= (get_bit_count(&s->pb)+31)/32;
+
+    if((s->flags&CODEC_FLAG_PASS1) && (s->picture_number&31)==0){
+        int j;
+        char *out= avctx->stats_out;
+        for(i=0; i<3; i++){
+            for(j=0; j<256; j++){
+                /* "%Ld" is not a standard conversion for integers; print
+                   through long long so the format is portable */
+                sprintf(out, "%lld ", (long long)s->stats[i][j]);
+                out+= strlen(out);
+                s->stats[i][j]= 0;
+            }
+            sprintf(out, "\n");
+            out++;
+        }
+    }else{
+        flush_put_bits(&s->pb);
+        bswap_buf((uint32_t*)buf, (uint32_t*)buf, size);
+    }
+
+    s->picture_number++;
+
+    return size*4;
+}
+
+/**
+ * Encoder teardown: releases the stats_out and extradata buffers attached
+ * to the codec context.
+ *
+ * @return always 0
+ */
+static int encode_end(AVCodecContext *avctx)
+{
+    av_freep(&avctx->stats_out);
+    av_freep(&avctx->extradata);
+
+    return 0;
+}
+
+AVCodec huffyuv_decoder = { /* codec entry for the HuffYUV decoder; positional initializer, order must follow the AVCodec declaration */
+    "huffyuv",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_HUFFYUV,
+    sizeof(HYuvContext), /* presumably the size of the priv_data allocation -- TODO confirm against AVCodec */
+    decode_init,
+    NULL, /* slot that holds encode_frame in huffyuv_encoder; the decoder has none */
+    decode_end,
+    decode_frame,
+    CODEC_CAP_DR1 | CODEC_CAP_DRAW_HORIZ_BAND, /* capability flags; decode_frame does call draw_slice() while decoding */
+    NULL
+};
+
+AVCodec huffyuv_encoder = { /* codec entry for the HuffYUV encoder; positional initializer, order must follow the AVCodec declaration */
+    "huffyuv",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_HUFFYUV,
+    sizeof(HYuvContext), /* presumably the size of the priv_data allocation -- TODO confirm against AVCodec */
+    encode_init,
+    encode_frame,
+    encode_end, /* remaining members are left to default zero initialization */
+};
diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am
index 0ef6bb0eb..6386800f6 100644
--- a/src/libffmpeg/libavcodec/i386/Makefile.am
+++ b/src/libffmpeg/libavcodec/i386/Makefile.am
@@ -16,11 +16,11 @@ libavcodec_mmx_src = \
 	cputest.c \
 	dsputil_mmx.c \
 	fdct_mmx.c \
+	fft_sse.c \
 	idct_mmx.c \
 	motion_est_mmx.c \
 	mpegvideo_mmx.c \
 	simple_idct_mmx.c
-#	fft_sse.c - needs new header from gcc 3.1
 
 libavcodec_mmx_dummy = libavcodec_mmx_dummy.c
 
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 12a360154..5fce7f914 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -43,11 +43,21 @@ int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 
+int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx); /* prototypes only; the definitions are not in this file section -- presumably sum-of-absolute-differences helpers, TODO confirm */
+int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
+int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
+int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
+
 /* pixel operations */
 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
 
+static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; /* four packed 16 bit words, each 20 */
+static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL; /* four packed 16 bit words, each 3 */
+static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; /* four packed 16 bit words, each 16 */
+static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; /* four packed 16 bit words, each 15 */
+
 #define JUMPALIGN() __asm __volatile (".balign 8"::)
 #define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
 
@@ -213,7 +223,7 @@ static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
     );
 }
 
-static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
+static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
 {
     asm volatile(
         "pxor %%mm7, %%mm7	\n\t"
@@ -496,12 +506,853 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     for(; i<w; i++)
         dst[i+0] = src1[i+0]-src2[i+0];
 }
+#define LBUTTERFLY(a,b) /* in-place butterfly on packed words: a = a+b, b = b-a (old values) */\
+    "paddw " #b ", " #a "		\n\t" /* a = a + b */\
+    "paddw " #b ", " #b "		\n\t" /* b = 2b */\
+    "psubw " #a ", " #b "		\n\t" /* b = 2b - (a+b) = b - a */
+
+#define HADAMARD48 /* three LBUTTERFLY stages over mm0..mm7: an unnormalised 8 point Hadamard transform (up to sign/order) on each 16 bit lane */\
+        LBUTTERFLY(%%mm0, %%mm1) /* stage 1: registers at distance 1 */\
+        LBUTTERFLY(%%mm2, %%mm3)\
+        LBUTTERFLY(%%mm4, %%mm5)\
+        LBUTTERFLY(%%mm6, %%mm7)\
+        \
+        LBUTTERFLY(%%mm0, %%mm2) /* stage 2: registers at distance 2 */\
+        LBUTTERFLY(%%mm1, %%mm3)\
+        LBUTTERFLY(%%mm4, %%mm6)\
+        LBUTTERFLY(%%mm5, %%mm7)\
+        \
+        LBUTTERFLY(%%mm0, %%mm4) /* stage 3: registers at distance 4 */\
+        LBUTTERFLY(%%mm1, %%mm5)\
+        LBUTTERFLY(%%mm2, %%mm6)\
+        LBUTTERFLY(%%mm3, %%mm7)
+
+#define MMABS(a,z) /* a = |a| per signed 16 bit word; z is clobbered (ends up as the sign mask) */\
+    "pxor " #z ", " #z "		\n\t" /* z = 0 */\
+    "pcmpgtw " #a ", " #z "		\n\t" /* z = all ones where 0 > a */\
+    "pxor " #z ", " #a "		\n\t" /* a ^= mask */\
+    "psubw " #z ", " #a "		\n\t" /* a -= mask, completing the negate of negative words */
+
+#define MMABS_SUM(a,z, sum) /* a = |a| per 16 bit word (z clobbered), then sum += a with unsigned saturation */\
+    "pxor " #z ", " #z "		\n\t" /* z = 0 */\
+    "pcmpgtw " #a ", " #z "		\n\t" /* z = all ones where 0 > a */\
+    "pxor " #z ", " #a "		\n\t" /* a ^= mask */\
+    "psubw " #z ", " #a "		\n\t" /* a -= mask */\
+    "paddusw " #a ", " #sum "		\n\t" /* saturating accumulate */
+
+    
+#define SBUTTERFLY(a,b,t,n) /* interleave a with b: low halves end up in a, high halves in t; n is the punpck size suffix (wd, dq are used below) */\
+    "movq " #a ", " #t "		\n\t" /* abcd */\
+    "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
+    "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\
+    
+#define TRANSPOSE4(a,b,c,d,t) /* transpose a 4x4 block of words held in a,b,c,d; output rows are left in a, d, t, c (in that order), b is scratch */\
+    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
+    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
+    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
+    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
+
+#define LOAD4(o, a, b, c, d) /* load four quadwords from asm operand %1 at byte offsets o, o+16, o+32, o+48 into a, b, c, d */\
+        "movq "#o"(%1), " #a "		\n\t"\
+        "movq "#o"+16(%1), " #b "	\n\t"\
+        "movq "#o"+32(%1), " #c "	\n\t"\
+        "movq "#o"+48(%1), " #d "	\n\t"
+
+#define STORE4(o, a, b, c, d) /* store a, b, c, d to asm operand %1 at byte offsets o, o+16, o+32, o+48 */\
+        "movq "#a", "#o"(%1)		\n\t"\
+        "movq "#b", "#o"+16(%1)		\n\t"\
+        "movq "#c", "#o"+32(%1)		\n\t"\
+        "movq "#d", "#o"+48(%1)		\n\t" /* the trailing backslash splices the following (empty) line into the macro */\
+
+/**
+ * Sum of absolute values of the 8x8 Hadamard transform of a pixel
+ * difference block.
+ *
+ * diff_pixels_mmx() fills temp (128 bytes, rows 16 bytes apart) from src1 and
+ * src2 -- presumably with the 16 bit differences src1-src2. The asm then runs
+ * HADAMARD48 over 4-word column groups, transposes the 4x4 sub-blocks, runs
+ * HADAMARD48 again and accumulates the absolute values with saturating adds.
+ *
+ * @param s       unused
+ * @param src1    first 8x8 block
+ * @param src2    second 8x8 block
+ * @param stride  passed through to diff_pixels_mmx()
+ * @return the accumulated sum, limited to 16 bits (saturating adds, then
+ *         masked with 0xFFFF)
+ */
+static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
+    uint64_t temp[16] __align8;
+    int sum=0;
+
+    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
+
+    asm volatile(
+        /* first pass over the left 4 columns (byte offset 0 of each row) */
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
+        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
+
+        HADAMARD48
+
+        /* mm7 is needed as transpose scratch, park it in temp */
+        "movq %%mm7, 112(%1)		\n\t"
+
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
+        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
+
+        "movq 112(%1), %%mm7 		\n\t"
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
+
+        /* first pass over the right 4 columns (byte offset 8 of each row) */
+        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
+        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
+
+        HADAMARD48
+
+        "movq %%mm7, 120(%1)		\n\t"
+
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
+        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
+
+        "movq 120(%1), %%mm7 		\n\t"
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+        /* keep the last transposed sub-block in mm4..mm7 instead of storing
+           it to offset 72 and loading it back */
+        "movq %%mm7, %%mm5		\n\t"//FIXME remove
+        "movq %%mm6, %%mm7		\n\t"
+        "movq %%mm0, %%mm6		\n\t"
+//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
+
+        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
+//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
+
+        /* second pass, first half: accumulate |x| of all eight registers */
+        HADAMARD48
+        "movq %%mm7, 64(%1)		\n\t"
+        MMABS(%%mm0, %%mm7)
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)
+        MMABS_SUM(%%mm2, %%mm7, %%mm0)
+        MMABS_SUM(%%mm3, %%mm7, %%mm0)
+        MMABS_SUM(%%mm4, %%mm7, %%mm0)
+        MMABS_SUM(%%mm5, %%mm7, %%mm0)
+        MMABS_SUM(%%mm6, %%mm7, %%mm0)
+        "movq 64(%1), %%mm1		\n\t"
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)
+        /* partial sum is parked at 64(%1) */
+        "movq %%mm0, 64(%1)		\n\t"
+
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
+        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
+
+        /* second pass, second half, then add the parked partial sum */
+        HADAMARD48
+        "movq %%mm7, (%1)		\n\t"
+        MMABS(%%mm0, %%mm7)
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)
+        MMABS_SUM(%%mm2, %%mm7, %%mm0)
+        MMABS_SUM(%%mm3, %%mm7, %%mm0)
+        MMABS_SUM(%%mm4, %%mm7, %%mm0)
+        MMABS_SUM(%%mm5, %%mm7, %%mm0)
+        MMABS_SUM(%%mm6, %%mm7, %%mm0)
+        "movq (%1), %%mm1		\n\t"
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)
+        "movq 64(%1), %%mm1		\n\t"
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)
+
+        /* horizontal add of the four 16 bit lanes into the low word */
+        "movq %%mm0, %%mm1		\n\t"
+        "psrlq $32, %%mm0		\n\t"
+        "paddusw %%mm1, %%mm0		\n\t"
+        "movq %%mm0, %%mm1		\n\t"
+        "psrlq $16, %%mm0		\n\t"
+        "paddusw %%mm1, %%mm0		\n\t"
+        "movd %%mm0, %0			\n\t"
+
+        : "=r" (sum)
+        : "r"(temp)
+        /* the asm reads and writes temp through %1, tell the compiler so */
+        : "memory"
+    );
+    /* only the low word of the movd result is the sum */
+    return sum&0xFFFF;
+}
+
+WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) /* macro is defined outside this section; presumably emits hadamard8_diff16_mmx as a 16x16 wrapper around the 8x8 routine -- TODO confirm */
+
+#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) /* the no_rnd name is a plain alias of the rounding version */
+#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) /* same aliasing for the 16 pixel wide variant */
+
+#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP) /* one output of the vertical lowpass: (20*x1 - 6*x2 + 3*x3 - x4 + rnd)>>5, packed to bytes and handed to OP; the pw_20/pw_3 arguments are not referenced (constants come via MANGLE), mm4..mm6 are scratch and m3 is overwritten with in7 */\
+        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
+        "movq "MANGLE(ff_pw_20)", %%mm4		\n\t" /* 20 */\
+        "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
+        "movq "#in7", " #m3 "		\n\t" /* d */\
+        "movq "#in0", %%mm5		\n\t" /* D */\
+        "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
+        "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
+        "movq "#in1", %%mm5		\n\t" /* C */\
+        "movq "#in2", %%mm6		\n\t" /* B */\
+        "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
+        "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
+        "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
+        "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm5	\n\t" /* -6x2 + 3x3 */\
+        "paddw " #rnd ", %%mm4		\n\t" /* 20x1 - x4 + rounder */\
+        "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 (+ rounder) */\
+        "psraw $5, %%mm5		\n\t" /* divide by 32 */\
+        "packuswb %%mm5, %%mm5		\n\t" /* clamp to unsigned bytes */\
+        OP(%%mm5, out, %%mm7, d)
+
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
+void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+    uint64_t temp;\
+\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t"\
+        "1:				\n\t"\
+        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
+        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
+        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
+        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
+        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
+        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
+        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
+        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
+        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
+        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
+        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
+        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
+        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
+        "paddw %%mm3, %%mm5		\n\t" /* b */\
+        "paddw %%mm2, %%mm6		\n\t" /* c */\
+        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
+        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
+        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm6		\n\t" /* 3c - 6b */\
+        "paddw %%mm4, %%mm0		\n\t" /* a */\
+        "paddw %%mm1, %%mm5		\n\t" /* d */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm0		\n\t" /* 20a */\
+        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
+        "paddw %6, %%mm6		\n\t"\
+        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm0		\n\t"\
+        "movq %%mm0, %5			\n\t"\
+        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
+        \
+        "movq 5(%0), %%mm0		\n\t" /* FGHIJKLM */\
+        "movq %%mm0, %%mm5		\n\t" /* FGHIJKLM */\
+        "movq %%mm0, %%mm6		\n\t" /* FGHIJKLM */\
+        "psrlq $8, %%mm0		\n\t" /* GHIJKLM0 */\
+        "psrlq $16, %%mm5		\n\t" /* HIJKLM00 */\
+        "punpcklbw %%mm7, %%mm0		\n\t" /* 0G0H0I0J */\
+        "punpcklbw %%mm7, %%mm5		\n\t" /* 0H0I0J0K */\
+        "paddw %%mm0, %%mm2		\n\t" /* b */\
+        "paddw %%mm5, %%mm3		\n\t" /* c */\
+        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
+        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
+        "movq %%mm6, %%mm2		\n\t" /* FGHIJKLM */\
+        "psrlq $24, %%mm6		\n\t" /* IJKLM000 */\
+        "punpcklbw %%mm7, %%mm2		\n\t" /* 0F0G0H0I */\
+        "punpcklbw %%mm7, %%mm6		\n\t" /* 0I0J0K0L */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm3		\n\t" /* 3c - 6b */\
+        "paddw %%mm2, %%mm1		\n\t" /* a */\
+        "paddw %%mm6, %%mm4		\n\t" /* d */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm1		\n\t" /* 20a */\
+        "psubw %%mm4, %%mm3		\n\t" /* - 6b +3c - d */\
+        "paddw %6, %%mm1		\n\t"\
+        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b +3c - d */\
+        "psraw $5, %%mm3		\n\t"\
+        "movq %5, %%mm1			\n\t"\
+        "packuswb %%mm3, %%mm1		\n\t"\
+        OP_MMX2(%%mm1, (%1),%%mm4, q)\
+        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
+        \
+        "movq 9(%0), %%mm1		\n\t" /* JKLMNOPQ */\
+        "movq %%mm1, %%mm4		\n\t" /* JKLMNOPQ */\
+        "movq %%mm1, %%mm3		\n\t" /* JKLMNOPQ */\
+        "psrlq $8, %%mm1		\n\t" /* KLMNOPQ0 */\
+        "psrlq $16, %%mm4		\n\t" /* LMNOPQ00 */\
+        "punpcklbw %%mm7, %%mm1		\n\t" /* 0K0L0M0N */\
+        "punpcklbw %%mm7, %%mm4		\n\t" /* 0L0M0N0O */\
+        "paddw %%mm1, %%mm5		\n\t" /* b */\
+        "paddw %%mm4, %%mm0		\n\t" /* c */\
+        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
+        "psubw %%mm5, %%mm0		\n\t" /* c - 2b */\
+        "movq %%mm3, %%mm5		\n\t" /* JKLMNOPQ */\
+        "psrlq $24, %%mm3		\n\t" /* MNOPQ000 */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm0		\n\t" /* 3c - 6b */\
+        "punpcklbw %%mm7, %%mm3		\n\t" /* 0M0N0O0P */\
+        "paddw %%mm3, %%mm2		\n\t" /* d */\
+        "psubw %%mm2, %%mm0		\n\t" /* -6b + 3c - d */\
+        "movq %%mm5, %%mm2		\n\t" /* JKLMNOPQ */\
+        "punpcklbw %%mm7, %%mm2		\n\t" /* 0J0K0L0M */\
+        "punpckhbw %%mm7, %%mm5		\n\t" /* 0N0O0P0Q */\
+        "paddw %%mm2, %%mm6		\n\t" /* a */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm6		\n\t" /* 20a */\
+        "paddw %6, %%mm0		\n\t"\
+        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm0		\n\t"\
+        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
+        \
+        "paddw %%mm5, %%mm3		\n\t" /* a */\
+        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0O0P0Q0Q */\
+        "paddw %%mm4, %%mm6		\n\t" /* b */\
+        "pshufw $0xBE, %%mm5, %%mm4	\n\t" /* 0P0Q0Q0P */\
+        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0Q0Q0P0O */\
+        "paddw %%mm1, %%mm4		\n\t" /* c */\
+        "paddw %%mm2, %%mm5		\n\t" /* d */\
+        "paddw %%mm6, %%mm6		\n\t" /* 2b */\
+        "psubw %%mm6, %%mm4		\n\t" /* c - 2b */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm3		\n\t" /* 20a */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm4		\n\t" /* 3c - 6b */\
+        "psubw %%mm5, %%mm3		\n\t" /* -6b + 3c - d */\
+        "paddw %6, %%mm4		\n\t"\
+        "paddw %%mm3, %%mm4		\n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm4		\n\t"\
+        "packuswb %%mm4, %%mm0		\n\t"\
+        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
+        \
+        "addl %3, %0			\n\t"\
+        "addl %4, %1			\n\t"\
+        "decl %2			\n\t"\
+        " jnz 1b				\n\t"\
+        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
+        : "memory"\
+    );\
+}\
+\
+static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+    int i;\
+    int16_t temp[16];\
+    /* quick HACK, XXX FIXME MUST be optimized */\
+    for(i=0; i<h; i++)\
+    {\
+        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
+        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
+        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
+        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
+        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
+        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
+        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
+        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
+        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
+        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
+        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
+        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
+        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
+        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
+        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
+        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
+        asm volatile(\
+            "movq (%0), %%mm0		\n\t"\
+            "movq 8(%0), %%mm1		\n\t"\
+            "paddw %2, %%mm0		\n\t"\
+            "paddw %2, %%mm1		\n\t"\
+            "psraw $5, %%mm0		\n\t"\
+            "psraw $5, %%mm1		\n\t"\
+            "packuswb %%mm1, %%mm0	\n\t"\
+            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
+            "movq 16(%0), %%mm0		\n\t"\
+            "movq 24(%0), %%mm1		\n\t"\
+            "paddw %2, %%mm0		\n\t"\
+            "paddw %2, %%mm1		\n\t"\
+            "psraw $5, %%mm0		\n\t"\
+            "psraw $5, %%mm1		\n\t"\
+            "packuswb %%mm1, %%mm0	\n\t"\
+            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
+            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
+            : "memory"\
+        );\
+        dst+=dstStride;\
+        src+=srcStride;\
+    }\
+}\
+\
+void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+    uint64_t temp;\
+\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t"\
+        "1:				\n\t"\
+        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
+        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
+        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
+        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
+        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
+        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
+        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
+        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
+        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
+        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
+        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
+        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
+        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
+        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
+        "paddw %%mm3, %%mm5		\n\t" /* b */\
+        "paddw %%mm2, %%mm6		\n\t" /* c */\
+        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
+        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
+        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm6		\n\t" /* 3c - 6b */\
+        "paddw %%mm4, %%mm0		\n\t" /* a */\
+        "paddw %%mm1, %%mm5		\n\t" /* d */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm0		\n\t" /* 20a */\
+        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
+        "paddw %6, %%mm6		\n\t"\
+        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm0		\n\t"\
+        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
+        \
+        "movd 5(%0), %%mm5		\n\t" /* FGHI */\
+        "punpcklbw %%mm7, %%mm5		\n\t" /* 0F0G0H0I */\
+        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0G0H0I0I */\
+        "paddw %%mm5, %%mm1		\n\t" /* a */\
+        "paddw %%mm6, %%mm2		\n\t" /* b */\
+        "pshufw $0xBE, %%mm5, %%mm6	\n\t" /* 0H0I0I0H */\
+        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0I0I0H0G */\
+        "paddw %%mm6, %%mm3		\n\t" /* c */\
+        "paddw %%mm5, %%mm4		\n\t" /* d */\
+        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
+        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
+        "pmullw "MANGLE(ff_pw_20)", %%mm1		\n\t" /* 20a */\
+        "pmullw "MANGLE(ff_pw_3)", %%mm3		\n\t" /* 3c - 6b */\
+        "psubw %%mm4, %%mm3		\n\t" /* -6b + 3c - d */\
+        "paddw %6, %%mm1		\n\t"\
+        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b + 3c - d */\
+        "psraw $5, %%mm3		\n\t"\
+        "packuswb %%mm3, %%mm0		\n\t"\
+        OP_MMX2(%%mm0, (%1), %%mm4, q)\
+        \
+        "addl %3, %0			\n\t"\
+        "addl %4, %1			\n\t"\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"\
+        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
+        : "memory"\
+    );\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* horizontal lowpass over h rows of 8 output pixels; reads src[0..8] per row; taps are summed in C, rounding/shift/pack/store are done in MMX */\
+    int i;\
+    int16_t temp[8]; /* unrounded 16-bit filter sums for the 8 output pixels of the current row */\
+    /* quick HACK, XXX FIXME MUST be optimized */\
+    for(i=0; i<h; i++)\
+    {\
+        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* weights 20,-6,3,-1 on pixel pairs; indices left of src[0] are folded back inside the row */\
+        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
+        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
+        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]); /* first tap set that needs no folding */\
+        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
+        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]); /* from here indices right of src[8] are folded back (8,7,6) */\
+        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
+        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
+        asm volatile(\
+            "movq (%0), %%mm0		\n\t" /* temp[0..3] */\
+            "movq 8(%0), %%mm1		\n\t" /* temp[4..7] */\
+            "paddw %2, %%mm0		\n\t" /* add ROUNDER to each word */\
+            "paddw %2, %%mm1		\n\t"\
+            "psraw $5, %%mm0		\n\t" /* arithmetic shift right by 5 */\
+            "psraw $5, %%mm1		\n\t"\
+            "packuswb %%mm1, %%mm0	\n\t" /* pack the 8 words to bytes with unsigned saturation */\
+            OP_3DNOW(%%mm0, (%1), %%mm1, q) /* write the 8 bytes to dst through the OP_3DNOW macro argument, mm1 as scratch */\
+            :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0=temp, %1=dst, %2=ROUNDER */\
+            :"memory"\
+        );\
+        dst+=dstStride; /* advance both pointers by one row */\
+        src+=srcStride;\
+    }\
+}
+
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
+\
+static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical lowpass: unpacks 17 source rows of 16 bytes to words, then filters them column group by column group into 16 output rows */\
+    uint64_t temp[17*4]; /* 4 column groups x 17 rows; each entry holds 4 pixels widened to 16-bit words */\
+    uint64_t *temp_ptr= temp;\
+    int count= 17; /* number of source rows to unpack */\
+\
+    /*FIXME unroll */\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t" /* mm7 = 0, used for byte-to-word zero extension */\
+        "1:				\n\t"\
+        "movq (%0), %%mm0		\n\t" /* bytes 0..7 of the row, loaded twice for low/high unpack */\
+        "movq (%0), %%mm1		\n\t"\
+        "movq 8(%0), %%mm2		\n\t" /* bytes 8..15 of the row */\
+        "movq 8(%0), %%mm3		\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t" /* pixels 0..3 as words */\
+        "punpckhbw %%mm7, %%mm1		\n\t" /* pixels 4..7 as words */\
+        "punpcklbw %%mm7, %%mm2		\n\t" /* pixels 8..11 as words */\
+        "punpckhbw %%mm7, %%mm3		\n\t" /* pixels 12..15 as words */\
+        "movq %%mm0, (%1)		\n\t" /* column group 0, current row */\
+        "movq %%mm1, 17*8(%1)		\n\t" /* column group 1 starts 17 qwords later */\
+        "movq %%mm2, 2*17*8(%1)		\n\t" /* column group 2 */\
+        "movq %%mm3, 3*17*8(%1)		\n\t" /* column group 3 */\
+        "addl $8, %1			\n\t" /* next row slot in temp */\
+        "addl %3, %0			\n\t" /* next source row */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"\
+        : "+r" (src), "+r" (temp_ptr), "+r"(count) /* %0, %1, %2 */\
+        : "r" (srcStride) /* %3 */\
+        : "memory"\
+    );\
+    \
+    temp_ptr= temp;\
+    count=4; /* one pass per 4-pixel-wide column group */\
+    \
+/*FIXME reorder for speed */\
+    asm volatile(\
+        /*"pxor %%mm7, %%mm7		\n\t"*/\
+        "1:				\n\t"\
+        "movq (%0), %%mm0		\n\t" /* preload rows 0..3 of this column group */\
+        "movq 8(%0), %%mm1		\n\t"\
+        "movq 16(%0), %%mm2		\n\t"\
+        "movq 24(%0), %%mm3		\n\t"\
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP) /* QPEL_V_LOW is defined outside this chunk; presumably emits one output row at the given dst operand - confirm. The repeated (%0) offsets in the first rows look like top-edge folding */\
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t" /* dst += 2*dstStride after every pair of rows */\
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
+        \
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP) /* 128(%0) is row 16, the last unpacked row */\
+        \
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP) /* from here the last source offset walks back (128,120,112): presumably bottom-edge folding */\
+        "addl %4, %1			\n\t"  \
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
+        \
+        "addl $136, %0			\n\t" /* next column group in temp: 17 rows * 8 bytes */\
+        "addl %6, %1			\n\t" /* undo the seven 2*dstStride steps and move dst 4 bytes to the right */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"\
+        \
+        : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0, %1, %2 */\
+        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride) /* %3, %4, %5, %6 */\
+        :"memory"\
+    );\
+}\
+\
+void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical lowpass: unpacks 9 source rows of 8 bytes to words, then filters two 4-pixel column groups into 8 output rows. NOTE(review): not static, unlike the qpel16 variant - confirm external linkage is intended */\
+    uint64_t temp[9*4]; /* only 2 column groups x 9 rows (18 qwords) are written below */\
+    uint64_t *temp_ptr= temp;\
+    int count= 9; /* number of source rows to unpack */\
+\
+    /*FIXME unroll */\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t" /* mm7 = 0, used for byte-to-word zero extension */\
+        "1:				\n\t"\
+        "movq (%0), %%mm0		\n\t" /* the 8 bytes of the row, loaded twice for low/high unpack */\
+        "movq (%0), %%mm1		\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t" /* pixels 0..3 as words */\
+        "punpckhbw %%mm7, %%mm1		\n\t" /* pixels 4..7 as words */\
+        "movq %%mm0, (%1)		\n\t" /* column group 0, current row */\
+        "movq %%mm1, 9*8(%1)		\n\t" /* column group 1 starts 9 qwords later */\
+        "addl $8, %1			\n\t" /* next row slot in temp */\
+        "addl %3, %0			\n\t" /* next source row */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"\
+        : "+r" (src), "+r" (temp_ptr), "+r"(count) /* %0, %1, %2 */\
+        : "r" (srcStride) /* %3 */\
+        : "memory"\
+    );\
+    \
+    temp_ptr= temp;\
+    count=2; /* one pass per 4-pixel-wide column group */\
+    \
+/*FIXME reorder for speed */\
+    asm volatile(\
+        /*"pxor %%mm7, %%mm7		\n\t"*/\
+        "1:				\n\t"\
+        "movq (%0), %%mm0		\n\t" /* preload rows 0..3 of this column group */\
+        "movq 8(%0), %%mm1		\n\t"\
+        "movq 16(%0), %%mm2		\n\t"\
+        "movq 24(%0), %%mm3		\n\t"\
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP) /* QPEL_V_LOW is defined outside this chunk; presumably emits one output row at the given dst operand - confirm */\
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t" /* dst += 2*dstStride after every pair of rows */\
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
+        \
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) /* 64(%0) is row 8, the last unpacked row */\
+        \
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) /* from here the last source offset walks back (64,56,48): presumably bottom-edge folding */\
+        "addl %4, %1			\n\t"\
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
+                \
+        "addl $72, %0			\n\t" /* next column group in temp: 9 rows * 8 bytes */\
+        "addl %6, %1			\n\t" /* undo the three 2*dstStride steps and move dst 4 bytes to the right */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"\
+         \
+        : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0, %1, %2 */\
+        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride) /* %3, %4, %5, %6 */\
+        : "memory"\
+   );\
+}\
+\
+static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){ /* position (0,0): no filtering, forwards to the 8-wide pixel op for 8 rows */\
+    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines src with its horizontally filtered copy via pixels8_l2 */\
+    uint64_t temp[32]; /* scratch; the 8x8 filtered block needs only the first 64 bytes */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8); /* half = h-lowpass of src, row pitch 8, 8 rows */\
+    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+\
+static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* horizontal lowpass of 8 rows written straight to dst */\
+    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc10, but the unfiltered source is taken one pixel to the right (src+1) */\
+    uint64_t temp[32]; /* scratch; the 8x8 filtered block needs only the first 64 bytes */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8); /* half = h-lowpass of src, row pitch 8, 8 rows */\
+    OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+\
+static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines src with its vertically filtered copy via pixels8_l2 */\
+    uint64_t temp[32]; /* scratch; the 8x8 filtered block needs only the first 64 bytes */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* half = v-lowpass of src, row pitch 8 */\
+    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+\
+static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* vertical lowpass written straight to dst */\
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc01, but the unfiltered source is taken one row down (src+stride) */\
+    uint64_t temp[32]; /* scratch; the 8x8 filtered block needs only the first 64 bytes */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* half = v-lowpass of src, row pitch 8 */\
+    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* builds halfH, halfV and halfHV planes in one buffer and hands src plus that buffer to pixels8_l4 */\
+    uint64_t half[8*2 + 8*2 + 18*2]; /* layout in bytes: halfV at 0, halfHV at 64, halfH area from 128 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8; /* starts one 8-byte row later than in mc13/mc33; presumably so pixels8_l4, which only gets the base pointer, sees rows 0..7 - confirm against pixels8_l4 */\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc11, but halfV and the unfiltered source are taken from src+1 */\
+    uint64_t half[8*2 + 8*2 + 18*2]; /* layout in bytes: halfV at 0, halfHV at 64, halfH area from 128 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8; /* starts one 8-byte row later than in mc13/mc33; presumably so pixels8_l4 sees rows 0..7 - confirm against pixels8_l4 */\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc11, but the unfiltered source is src+stride and halfH starts at byte 128 */\
+    uint64_t half[8*2 + 8*2 + 9*2]; /* layout in bytes: halfV at 0, halfHV at 64, halfH (9 rows) at 128 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc13, but halfV comes from src+1 and the unfiltered source is src+stride+1 */\
+    uint64_t half[8*2 + 8*2 + 9*2]; /* layout in bytes: halfV at 0, halfHV at 64, halfH (9 rows) at 128 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src  , 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines halfH (rows 0..7) with halfHV via pixels8_l2 */\
+    uint64_t half[8*2 + 9*2]; /* layout in bytes: halfHV at 0, halfH (9 rows) at 64 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8); /* both sources have row pitch 8 here */\
+}\
+static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc21, but uses halfH rows 1..8 (halfH+8) */\
+    uint64_t half[8*2 + 9*2]; /* layout in bytes: halfHV at 0, halfH (9 rows) at 64 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8); /* +8 skips the first halfH row */\
+}\
+static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines halfV with halfHV via pixels8_l2; halfH only feeds the halfHV pass */\
+    uint64_t half[8*2 + 8*2 + 9*2]; /* layout in bytes: halfV at 0, halfHV at 64, halfH (9 rows) at 128 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8); /* both sources have row pitch 8 here */\
+}\
+static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc12, but halfV is computed from src+1 */\
+    uint64_t half[8*2 + 8*2 + 9*2]; /* layout in bytes: halfV at 0, halfHV at 64, halfH (9 rows) at 128 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass below consumes 9 input rows */\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8); /* both sources have row pitch 8 here */\
+}\
+static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* horizontal lowpass into a scratch block, then vertical lowpass of that block straight to dst */\
+    uint64_t half[9*2]; /* 9 rows of 8 bytes */\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9); /* 9 rows, because the vertical pass consumes 9 input rows */\
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){ /* position (0,0): no filtering, forwards to the 16-wide pixel op for 16 rows */\
+    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines src with its horizontally filtered copy via pixels16_l2 */\
+    uint64_t temp[32]; /* 256 bytes: exactly one 16x16 block */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16); /* half = h-lowpass of src, row pitch 16, 16 rows */\
+    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+\
+static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* horizontal lowpass of 16 rows written straight to dst */\
+    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc10, but the unfiltered source is taken one pixel to the right (src+1) */\
+    uint64_t temp[32]; /* 256 bytes: exactly one 16x16 block */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16); /* half = h-lowpass of src, row pitch 16, 16 rows */\
+    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+\
+static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines src with its vertically filtered copy via pixels16_l2 */\
+    uint64_t temp[32]; /* 256 bytes: exactly one 16x16 block */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride); /* half = v-lowpass of src, row pitch 16 */\
+    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+\
+static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* vertical lowpass written straight to dst */\
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc01, but the unfiltered source is taken one row down (src+stride) */\
+    uint64_t temp[32]; /* 256 bytes: exactly one 16x16 block */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride); /* half = v-lowpass of src, row pitch 16 */\
+    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16); /* l2 is defined outside this macro; presumably a two-source average - confirm */\
+}\
+static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* builds halfH, halfV and halfHV planes in one buffer and hands src plus that buffer to pixels16_l4 */\
+    uint64_t half[16*2 + 16*2 + 18*2]; /* layout in bytes: halfV at 0, halfHV at 256, halfH area from 512 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16; /* starts one 16-byte row later than in mc13/mc33; presumably so pixels16_l4, which only gets the base pointer, sees rows 0..15 - confirm against pixels16_l4 */\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc11, but halfV and the unfiltered source are taken from src+1 */\
+    uint64_t half[16*2 + 16*2 + 18*2]; /* layout in bytes: halfV at 0, halfHV at 256, halfH area from 512 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16; /* starts one 16-byte row later than in mc13/mc33; presumably so pixels16_l4 sees rows 0..15 - confirm against pixels16_l4 */\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc11, but the unfiltered source is src+stride and halfH starts at byte 512 */\
+    uint64_t half[16*2 + 16*2 + 17*2]; /* layout in bytes: halfV at 0, halfHV at 256, halfH (17 rows) at 512 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc13, but halfV comes from src+1 and the unfiltered source is src+stride+1 */\
+    uint64_t half[16*2 + 16*2 + 17*2]; /* layout in bytes: halfV at 0, halfHV at 256, halfH (17 rows) at 512 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src  , 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16); /* vertical pass over the horizontally filtered rows */\
+    OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines halfH (rows 0..15) with halfHV via pixels16_l2 */\
+    uint64_t half[16*2 + 17*2]; /* layout in bytes: halfHV at 0, halfH (17 rows) at 256 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16); /* both sources have row pitch 16 here */\
+}\
+static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc21, but uses halfH rows 1..16 (halfH+16) */\
+    uint64_t half[16*2 + 17*2]; /* layout in bytes: halfHV at 0, halfH (17 rows) at 256 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
+    uint8_t * const halfHV= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16); /* +16 skips the first halfH row */\
+}\
+static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* combines halfV with halfHV via pixels16_l2; halfH only feeds the halfHV pass */\
+    uint64_t half[16*2 + 16*2 + 17*2]; /* layout in bytes: halfV at 0, halfHV at 256, halfH (17 rows) at 512 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16); /* both sources have row pitch 16 here */\
+}\
+static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* like mc12, but halfV is computed from src+1 */\
+    uint64_t half[16*2 + 16*2 + 17*2]; /* layout in bytes: halfV at 0, halfHV at 256, halfH (17 rows) at 512 */\
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
+    uint8_t * const halfV= ((uint8_t*)half);\
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass below consumes 17 input rows */\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16); /* both sources have row pitch 16 here */\
+}\
+static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){ /* horizontal lowpass into a scratch block, then vertical lowpass of that block straight to dst */\
+    uint64_t half[17*2]; /* 17 rows of 16 bytes */\
+    uint8_t * const halfH= ((uint8_t*)half);\
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17); /* 17 rows, because the vertical pass consumes 17 input rows */\
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}
+
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "	\n\t" /* emits a single mov<size> a, b; the temp argument is unused */
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp "	\n\t" /* temp = current contents of b */\
+"pavgusb " #temp ", " #a "	\n\t" /* a = pavgusb(a, temp), the 3DNow! byte average */\
+"mov" #size " " #a ", " #b "	\n\t" /* write the averaged value back to b */
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp "	\n\t" /* temp = current contents of b */\
+"pavgb " #temp ", " #a "	\n\t" /* a = pavgb(a, temp), the MMX2 byte average */\
+"mov" #size " " #a ", " #b "	\n\t" /* write the averaged value back to b */
+
+QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP) /* QPEL_BASE header is outside this chunk; presumably the last two arguments are the MMX2 and 3DNow! store ops - confirm */
+QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
+QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) /* the no_rnd variants use ff_pw_15 instead of ff_pw_16 as rounder */
+QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow) /* QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX): RND is pasted after "put" to name the intermediate-pass functions */
+QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow) /* avg_ has RND "_", so its intermediate passes call the put_ lowpass functions */
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
+QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
+QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
 
 #if 0
 static void just_return() { return; }
 #endif
 
+#define SET_QPEL_FUNC(postfix1, postfix2) /* assigns the put_, put_no_rnd_ and avg_ variants of one function to the matching members of c */ \
+    c->put_ ## postfix1 = put_ ## postfix2; /* expects a pointer named c in scope at the use site */\
+    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
+    c->avg_ ## postfix1 = avg_ ## postfix2; /* expands to three full statements, so call sites use no trailing semicolon and must not use it as an unbraced if/else body */
+    
 void dsputil_init_mmx(DSPContext* c, unsigned mask)
 {
     mm_flags = mm_support();
@@ -576,10 +1427,16 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
         c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
         c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
         c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
-        
+                
         c->add_bytes= add_bytes_mmx;
         c->diff_bytes= diff_bytes_mmx;
-
+        
+        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
+        c->hadamard8_diff[1]= hadamard8_diff_mmx;
+        
+        c->sad[0]= sad16x16_mmx;
+        c->sad[1]= sad8x8_mmx;
+        
         if (mm_flags & MM_MMXEXT) {
             c->pix_abs16x16     = pix_abs16x16_mmx2;
             c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx2;
@@ -591,6 +1448,9 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
             c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx2;
             c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
 
+            c->sad[0]= sad16x16_mmx2;
+            c->sad[1]= sad8x8_mmx2;
+            
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
             c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
             c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
@@ -610,6 +1470,41 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+
+#if 1
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
+#endif
         } else if (mm_flags & MM_3DNOW) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
@@ -630,6 +1525,39 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+        
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
+            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
         }
     }
 
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
index 6873432ce..4a8841156 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
@@ -53,6 +53,38 @@ static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size
 	:"%eax", "memory");
 }
 
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) /* dst rows = PAVGB of 8-byte rows from src1 (pitch src1Stride) and src2 (packed, 8 bytes per row) */
+{
+    __asm __volatile(
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* src1 row 0 */
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t" /* src1 row 1 */
+	"addl	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t" /* PAVGB is a macro defined by the including file */
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* src1 row 2 */
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t" /* src1 row 3 */
+	"addl	%4, %1			\n\t"
+	PAVGB" 16(%2), %%mm0		\n\t"
+	PAVGB" 24(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+        "addl	$32, %2			\n\t" /* src2 advances by 4 packed rows of 8 bytes */
+	"subl	$4, %0			\n\t" /* 4 rows per iteration: the loop only ends when h reaches exactly 0, so h must be a positive multiple of 4 */
+	"jnz	1b			\n\t"
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* %0..%3 */
+	:"r"(src1Stride), "r"(dstStride) /* %4, %5 */
+	:"memory");
+}
+
 static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
@@ -92,6 +124,34 @@ static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
 	:"r" (line_size)
 	:"%eax", "memory");
 }
+
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) /* dst rows = PAVGB of 16-byte rows from src1 (pitch src1Stride) and src2 (packed, 16 bytes per row) */
+{
+    __asm __volatile(
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* src1 row 0, left and right halves */
+	"movq	8(%1), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t" /* PAVGB is a macro defined by the including file */
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* src1 row 1, left and right halves */
+	"movq	8(%1), %%mm1		\n\t"
+	"addl	%4, %1			\n\t"
+	PAVGB" 16(%2), %%mm0		\n\t"
+	PAVGB" 24(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"addl	%5, %3			\n\t"
+        "addl	$32, %2			\n\t" /* src2 advances by 2 packed rows of 16 bytes */
+	"subl	$2, %0			\n\t" /* 2 rows per iteration: the loop only ends when h reaches exactly 0, so h must be a positive even number */
+	"jnz	1b			\n\t"
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* %0..%3 */
+	:"r"(src1Stride), "r"(dstStride) /* %4, %5 */
+	:"memory");
+}
  
 /* GL: this function does incorrect rounding if overflow */
 static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
index 3605e03f9..0ae1cd99d 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
@@ -54,6 +54,46 @@ static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
 	:"eax", "memory");
 }
 
+static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) /* plain-MMX l2: feeds 8-byte rows of src1 (pitch src1Stride) and src2 (packed, 8 bytes per row) through PAVGBP into dst */
+{
+    MOVQ_BFE(mm6); /* macro defined elsewhere; presumably loads the mask constant that PAVGBP expects in mm6 - confirm */
+    __asm __volatile(
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t" /* src1 row 0 */
+	"movq	(%2), %%mm1		\n\t" /* src2 row 0 */
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm2		\n\t" /* src1 row 1 */
+	"movq	8(%2), %%mm3		\n\t" /* src2 row 1 */
+	"addl	%4, %1			\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5) /* macro defined elsewhere; the results used below are mm4 (from mm0,mm1) and mm5 (from mm2,mm3) */
+	"movq	%%mm4, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm5, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t" /* src1 row 2 */
+	"movq	16(%2), %%mm1		\n\t" /* src2 row 2 */
+	"addl	%4, %1			\n\t"
+	"movq	(%1), %%mm2		\n\t" /* src1 row 3 */
+	"movq	24(%2), %%mm3		\n\t" /* src2 row 3 */
+	"addl	%4, %1			\n\t"
+	"addl	$32, %2			\n\t" /* src2 advances by 4 packed rows of 8 bytes */
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"movq	%%mm5, (%3)		\n\t"
+	"addl	%5, %3			\n\t"
+	"subl	$4, %0			\n\t" /* 4 rows per iteration: the loop only ends when h reaches exactly 0, so h must be a positive multiple of 4 */
+	"jnz	1b			\n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used; under PIC the counter is kept in memory instead of ebx
+        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"(src1Stride), "D"(dstStride) /* %4, %5 */
+	:"memory");
+}
+
 static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
@@ -90,7 +130,7 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
 	"movq	9(%1, %3), %%mm3	\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, 8(%2)		\n\t"
-	"movq	%%mm5, 8(%2, %3)		\n\t"
+	"movq	%%mm5, 8(%2, %3)	\n\t"
 	"addl	%%eax, %1		\n\t"
 	"addl	%%eax, %2		\n\t"
 	"subl	$4, %0			\n\t"
@@ -100,6 +140,42 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
 	:"eax", "memory");
 }
 
+static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // 16-byte-wide counterpart of pixels8_l2: combines src1 (pitch src1Stride) with src2 (packed, 16 bytes per row) via PAVGBP into dst.
+    MOVQ_BFE(mm6); // macro defined outside this hunk; presumably loads the constant PAVGBP expects in mm6 -- TODO confirm
+    __asm __volatile( // operands: %0=h %1=src1 %2=src2 %3=dst %4=src1Stride %5=dstStride
+	".balign 8			\n\t"
+	"1:				\n\t" // loop head: each pass handles 2 rows
+	"movq	(%1), %%mm0		\n\t" // row 0, bytes 0..7 of src1
+	"movq	(%2), %%mm1		\n\t" // row 0, bytes 0..7 of src2
+	"movq	8(%1), %%mm2		\n\t" // row 0, bytes 8..15 of src1
+	"movq	8(%2), %%mm3		\n\t" // row 0, bytes 8..15 of src2
+	"addl	%4, %1			\n\t" // src1 += src1Stride
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5) // macro defined outside this hunk; its results are taken from mm4 and mm5 below
+	"movq	%%mm4, (%3)		\n\t" // store left half of row 0
+	"movq	%%mm5, 8(%3)		\n\t" // store right half of row 0
+	"addl	%5, %3			\n\t" // dst += dstStride
+	"movq	(%1), %%mm0		\n\t" // row 1, bytes 0..7 of src1
+	"movq	16(%2), %%mm1		\n\t" // row 1, bytes 0..7 of src2 (16 bytes after row 0)
+	"movq	8(%1), %%mm2		\n\t" // row 1, bytes 8..15 of src1
+	"movq	24(%2), %%mm3		\n\t" // row 1, bytes 8..15 of src2
+	"addl	%4, %1			\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%3)		\n\t" // store left half of row 1
+	"movq	%%mm5, 8(%3)		\n\t" // store right half of row 1
+	"addl	%5, %3			\n\t"
+	"addl	$32, %2			\n\t" // src2 advances by 2 packed rows
+	"subl	$2, %0			\n\t" // h -= 2; the loop only exits when h reaches exactly 0, so h must be positive and even
+	"jnz	1b			\n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) // PIC build: h is kept in memory instead of %ebx
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) // h in %ebx, src1 in %eax, src2 in %ecx, dst in %edx
+#endif
+	:"S"(src1Stride), "D"(dstStride) // strides in %esi / %edi
+	:"memory"); 
+}
+
 static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
@@ -195,6 +271,124 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
 	:"eax", "memory");
 }
 
+static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ // Per row of 8 bytes: dst = (src1 + src2[0] + src2[64] + src2[136] + rounding constant) >> 2; dst and src1 share one stride.
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm __volatile( // operands: %0=dst %1=src1 %2=src2 %3=h %4=stride
+	".balign 8      		\n\t"
+	"1:				\n\t" // loop head: one row per pass
+	"movq	(%1), %%mm0		\n\t"
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t" // NOTE(review): fixed offsets 64 and 136 mean src2 must hold three 8-byte-wide planes; layout is set by the caller -- confirm
+	"movq	136(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" // widen the low 4 bytes of each input to 16-bit words (mm7 supplies the high bytes)
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" // add the rounding constant from mm6
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t" // mm3 = sum of the four inputs + rounding
+	"psrlw	$2, %%mm3		\n\t" // divide by 4
+	"movq	(%1), %%mm0		\n\t" // reload the same 8 bytes to process the high 4 bytes
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t"
+	"movq	136(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t" // widen the high 4 bytes
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" // repack both halves into 8 bytes
+	"movq	%%mm3, (%0)		\n\t" // store the row
+        "addl	%4, %0			\n\t" // dst += stride
+        "addl	%4, %1			\n\t" // src1 += stride
+        "addl	$8, %2			\n\t" // src2 is packed: 8 bytes per row
+        "decl	%3			\n\t" // --h; the loop exits when h reaches 0, so h must be >= 1 on entry
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
+static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ // Per row of 16 bytes: dst = (src1 + src2[0] + src2[256] + src2[528] + rounding constant) >> 2; dst and src1 share one stride.
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm __volatile( // operands: %0=dst %1=src1 %2=src2 %3=h %4=stride
+	".balign 8      		\n\t"
+	"1:				\n\t" // loop head: one row per pass, done as two 8-byte halves
+	"movq	(%1), %%mm0		\n\t" // ---- left half (bytes 0..7), low 4 bytes ----
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t" // NOTE(review): fixed offsets 256 and 528 mean src2 must hold three 16-byte-wide planes; layout is set by the caller -- confirm
+	"movq	528(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" // widen bytes to 16-bit words (mm7 supplies the high bytes)
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" // add the rounding constant from mm6
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t" // mm3 = sum of the four inputs + rounding
+	"psrlw	$2, %%mm3		\n\t" // divide by 4
+	"movq	(%1), %%mm0		\n\t" // ---- left half, high 4 bytes (same data reloaded) ----
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t"
+	"movq	528(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" // repack both halves into 8 bytes
+	"movq	%%mm3, (%0)		\n\t" // store bytes 0..7 of the row
+	"movq	8(%1), %%mm0		\n\t" // ---- right half (bytes 8..15), low 4 bytes ----
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	8(%1), %%mm0		\n\t" // ---- right half, high 4 bytes ----
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t"
+	"movq	%%mm3, 8(%0)		\n\t" // store bytes 8..15 of the row
+        "addl	%4, %0			\n\t" // dst += stride
+        "addl	%4, %1			\n\t" // src1 += stride
+        "addl	$16, %2			\n\t" // src2 is packed: 16 bytes per row
+        "decl	%3			\n\t" // --h; the loop exits when h reaches 0, so h must be >= 1 on entry
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
 // avg_pixels
 // in case more speed is needed - unroling would certainly help
 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
@@ -259,6 +453,27 @@ static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
     } while (--h);
 }
 
+static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // Per row of 8 bytes: combines src1 and src2 with PAVGB, then combines that with the existing dst contents and writes it back.
+    MOVQ_BFE(mm6); // macro defined outside this hunk; presumably loads the constant PAVGB takes as its 4th argument -- TODO confirm
+    JUMPALIGN();
+    do { // do/while(--h): h must be >= 1 on entry
+	__asm __volatile(
+	    "movq  %1, %%mm0		\n\t" // 8 bytes of src1
+	    "movq  %2, %%mm1		\n\t" // 8 bytes of src2
+	    "movq  %0, %%mm3		\n\t" // 8 bytes currently in dst
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) // macro defined outside this hunk; mm2 feeds the next step
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) // mm0 is what gets stored below
+	    "movq  %%mm0, %0		\n\t"
+	    :"+m"(*dst) // operand names a single byte; the full 8-byte access is covered by the "memory" clobber
+	    :"m"(*src1), "m"(*src2)
+	    :"memory");
+	dst += dstStride;
+        src1 += src1Stride;
+        src2 += 8; // src2 is packed: 8 bytes per row
+    } while (--h);
+}
+
 static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
@@ -285,6 +500,33 @@ static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
     } while (--h);
 }
 
+static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // 16-byte-wide counterpart of avg pixels8_l2: PAVGB of src1 and src2, then PAVGB of that with the existing dst, written back to dst.
+    MOVQ_BFE(mm6); // macro defined outside this hunk; presumably loads the constant PAVGB takes as its 4th argument -- TODO confirm
+    JUMPALIGN();
+    do { // do/while(--h): h must be >= 1 on entry
+	__asm __volatile(
+	    "movq  %1, %%mm0		\n\t" // bytes 0..7 of src1
+	    "movq  %2, %%mm1		\n\t" // bytes 0..7 of src2
+	    "movq  %0, %%mm3		\n\t" // bytes 0..7 currently in dst
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+	    "movq  %%mm0, %0		\n\t"
+	    "movq  8%1, %%mm0		\n\t" // NOTE(review): the "8" is pasted textually in front of the substituted "m" operand, so this only addresses byte 8 when the compiler prints the operand as a bare (%reg) -- confirm
+	    "movq  8%2, %%mm1		\n\t" // bytes 8..15 of src2 (same textual-offset caveat)
+	    "movq  8%0, %%mm3		\n\t" // bytes 8..15 currently in dst
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+	    "movq  %%mm0, 8%0		\n\t"
+	    :"+m"(*dst) // operand names a single byte; the full 16-byte access is covered by the "memory" clobber
+	    :"m"(*src1), "m"(*src2)
+	    :"memory");
+	dst += dstStride;
+        src1 += src1Stride;
+        src2 += 16; // src2 is packed: 16 bytes per row
+    } while (--h);
+}
+
 static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
@@ -399,6 +641,133 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
 	:"eax", "memory");
 }
 
+static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ // Per row of 8 bytes: t = (src1 + src2[0] + src2[64] + src2[136] + rounding constant) >> 2, then dst = PAVGB(t, existing dst).
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    MOVQ_BFE(mm5); // macro defined outside this hunk; mm5 is passed to PAVGB as its 4th argument below
+    __asm __volatile( // operands: %0=dst %1=src1 %2=src2 %3=h %4=stride
+	".balign 8      		\n\t"
+	"1:				\n\t" // loop head: one row per pass
+	"movq	(%1), %%mm0		\n\t"
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t" // NOTE(review): fixed offsets 64 and 136 mean src2 must hold three 8-byte-wide planes; layout is set by the caller -- confirm
+	"movq	136(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" // widen the low 4 bytes of each input to 16-bit words (mm7 supplies the high bytes)
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" // add the rounding constant from mm6
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t" // mm3 = sum of the four inputs + rounding
+	"psrlw	$2, %%mm3		\n\t" // divide by 4
+	"movq	(%1), %%mm0		\n\t" // reload the same 8 bytes to process the high 4 bytes
+	"movq	(%2), %%mm1		\n\t"
+	"movq	64(%2), %%mm2		\n\t"
+	"movq	136(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t" // widen the high 4 bytes
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" // repack both halves into 8 bytes
+	"movq	(%0), %%mm4		\n\t" // fetch the 8 bytes currently in dst
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) // macro defined outside this hunk; mm0 is what gets stored
+	"movq	%%mm0, (%0)		\n\t"
+        "addl	%4, %0			\n\t" // dst += stride
+        "addl	%4, %1			\n\t" // src1 += stride
+        "addl	$8, %2			\n\t" // src2 is packed: 8 bytes per row
+        "decl	%3			\n\t" // --h; the loop exits when h reaches 0, so h must be >= 1 on entry
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
+static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
+{ // Per row of 16 bytes: t = (src1 + src2[0] + src2[256] + src2[528] + rounding constant) >> 2, then dst = PAVGB(t, existing dst).
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    MOVQ_BFE(mm5); // macro defined outside this hunk; mm5 is passed to PAVGB as its 4th argument below
+    __asm __volatile( // operands: %0=dst %1=src1 %2=src2 %3=h %4=stride
+	".balign 8      		\n\t"
+	"1:				\n\t" // loop head: one row per pass, done as two 8-byte halves
+	"movq	(%1), %%mm0		\n\t" // ---- left half (bytes 0..7), low 4 bytes ----
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t" // NOTE(review): fixed offsets 256 and 528 mean src2 must hold three 16-byte-wide planes; layout is set by the caller -- confirm
+	"movq	528(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t" // widen bytes to 16-bit words (mm7 supplies the high bytes)
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t" // add the rounding constant from mm6
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t" // mm3 = sum of the four inputs + rounding
+	"psrlw	$2, %%mm3		\n\t" // divide by 4
+	"movq	(%1), %%mm0		\n\t" // ---- left half, high 4 bytes (same data reloaded) ----
+	"movq	(%2), %%mm1		\n\t"
+	"movq	256(%2), %%mm2		\n\t"
+	"movq	528(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t" // repack both halves into 8 bytes
+	"movq	(%0), %%mm4		\n\t" // fetch bytes 0..7 currently in dst
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) // macro defined outside this hunk; mm0 is what gets stored
+	"movq	%%mm0, (%0)		\n\t"
+	"movq	8(%1), %%mm0		\n\t" // ---- right half (bytes 8..15), low 4 bytes ----
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm1		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm3		\n\t"
+	"paddusw %%mm1, %%mm3		\n\t"
+	"psrlw	$2, %%mm3		\n\t"
+	"movq	8(%1), %%mm0		\n\t" // ---- right half, high 4 bytes ----
+	"movq	8(%2), %%mm1		\n\t"
+	"movq	264(%2), %%mm2		\n\t"
+	"movq	536(%2), %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm0		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm0, %%mm1		\n\t"
+	"paddusw %%mm2, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm4		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"packuswb  %%mm4, %%mm3		\n\t"
+	"movq	8(%0), %%mm4		\n\t" // fetch bytes 8..15 currently in dst
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
+	"movq	%%mm0, 8(%0)		\n\t"
+        "addl	%4, %0			\n\t" // dst += stride
+        "addl	%4, %1			\n\t" // src1 += stride
+        "addl	$16, %2			\n\t" // src2 is packed: 16 bytes per row
+        "decl	%3			\n\t" // --h; the loop exits when h reaches 0, so h must be >= 1 on entry
+	"jnz	1b			\n\t"
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
+	:"r"(stride)
+	:"memory");
+}
+
+
 //FIXME optimize
 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
     DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
diff --git a/src/libffmpeg/libavcodec/i386/fft_sse.c b/src/libffmpeg/libavcodec/i386/fft_sse.c
index 8e8e36b0f..175cea506 100644
--- a/src/libffmpeg/libavcodec/i386/fft_sse.c
+++ b/src/libffmpeg/libavcodec/i386/fft_sse.c
@@ -19,11 +19,16 @@
 #include "../dsputil.h"
 #include <math.h>
 
+#ifdef HAVE_BUILTIN_VECTOR
+
 #include <xmmintrin.h>
 
 static const float p1p1p1m1[4] __attribute__((aligned(16))) = 
     { 1.0, 1.0, 1.0, -1.0 };
 
+static const float p1p1m1p1[4] __attribute__((aligned(16))) = 
+    { 1.0, 1.0, -1.0, 1.0 }; /* 16-byte-aligned sign vector; fft_calc_sse loads it into c2 when s->inverse is set */
+
 static const float p1p1m1m1[4] __attribute__((aligned(16))) = 
     { 1.0, 1.0, -1.0, -1.0 };
 
@@ -54,6 +59,11 @@ void fft_calc_sse(FFTContext *s, FFTComplex *z)
         r = (__m128 *)&z[0];
         c1 = *(__m128 *)p1p1m1m1;
         c2 = *(__m128 *)p1p1p1m1;
+        if (s->inverse)
+            c2 = *(__m128 *)p1p1m1p1;
+        else
+            c2 = *(__m128 *)p1p1p1m1;
+
         j = (np >> 2);
         do {
             a = r[0];
@@ -126,3 +136,5 @@ void fft_calc_sse(FFTContext *s, FFTComplex *z)
         nloops = nloops << 1;
     } while (nblocks != 0);
 }
+
+#endif
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index 3368e7333..fa85db67b 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -274,6 +274,15 @@ int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
+int sad8x8_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* variant taking a leading context pointer; s is not used in this body */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t" /* clear mm7 */\
+                 "pxor %%mm6, %%mm6		\n\t":); /* clear mm6 */\
+\
+    sad8_ ## suf(blk1, blk2, stride, 3); /* note blk1/blk2 are passed in swapped order; 4th argument is presumably log2 of the row count -- TODO confirm against sad8_ */\
+\
+    return sum_ ## suf(); /* sum_ is defined outside this hunk; its result is returned unchanged */\
+}\
 \
 int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 {\
@@ -324,6 +333,16 @@ int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
+int sad16x16_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\
+{ /* variant taking a leading context pointer; s is not used in this body */\
+    asm volatile("pxor %%mm7, %%mm7		\n\t" /* clear mm7 */\
+                 "pxor %%mm6, %%mm6		\n\t":); /* clear mm6 */\
+\
+    sad8_ ## suf(blk1  , blk2  , stride, 4); /* left 8 columns; blk1/blk2 are passed in swapped order; 4th argument is presumably log2 of the row count -- TODO confirm */\
+    sad8_ ## suf(blk1+8, blk2+8, stride, 4); /* right 8 columns */\
+\
+    return sum_ ## suf(); /* sum_ is defined outside this hunk; its result is returned unchanged */\
+}\
 int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index cb7af3e62..d936abfd5 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -529,7 +529,6 @@ static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
 void MPV_common_init_mmx(MpegEncContext *s)
 {
     if (mm_flags & MM_MMX) {
-        int i;
         const int dct_algo = s->avctx->dct_algo;
         const int idct_algo= s->avctx->idct_algo;
         
diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c
index d1e88a970..bdf6fe65d 100644
--- a/src/libffmpeg/libavcodec/imgconvert.c
+++ b/src/libffmpeg/libavcodec/imgconvert.c
@@ -720,8 +720,6 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
 {
     int i;
 
-    assert(pix_fmt != PIX_FMT_ANY && dst_pix_fmt != PIX_FMT_ANY);
-
     if (dst_pix_fmt == pix_fmt) {
         switch(pix_fmt) {
         case PIX_FMT_YUV420P:
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index b1cfab973..28147fc72 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -512,7 +512,7 @@ static void component_resample(ImgReSampleContext *s,
         else
 #endif
 #ifdef HAVE_ALTIVEC
-            if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS == 8)
+            if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
                 v_resample16_altivec(output, owidth,
                                 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                                 &s->v_filters[phase_y][0]);
diff --git a/src/libffmpeg/libavcodec/mace.c b/src/libffmpeg/libavcodec/mace.c
new file mode 100644
index 000000000..18aaacaf1
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mace.c
@@ -0,0 +1,441 @@
+/*
+ * MACE decoder
+ * Copyright (c) 2002 Laszlo Torok <torokl@alpha.dfmk.hu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "avcodec.h"
+
+/*
+ * Adapted to ffmpeg by Francois Revol <revol@free.fr>
+ * (removed 68k REG stuff, changed types, added some statics and consts,
+ * libavcodec api, context stuff, interlaced stereo out).
+ */
+
+static const UINT16 MACEtab1[] = { 0xfff3, 0x0008, 0x004c, 0x00de, 0x00de, 0x004c, 0x0008, 0xfff3 }; /* 8 entries: step added to ctx->index by chomp3 for a 3-bit code; paired with MACEtab2 */
+
+static const UINT16 MACEtab3[] = { 0xffee, 0x008c, 0x008c, 0xffee }; /* 4 entries: step added to ctx->index by chomp3 for a 2-bit code; paired with MACEtab4 */
+
+static const UINT16 MACEtab2[][8] = { /* 128 rows, selected in chomp3 by (ctx->index & 0x7f0) >> 4; column = 3-bit code; chomp3 casts each entry to short */
+    { 0x0025, 0x0074, 0x00CE, 0x014A, 0xFEB5, 0xFF31, 0xFF8B, 0xFFDA },
+    { 0x0027, 0x0079, 0x00D8, 0x015A, 0xFEA5, 0xFF27, 0xFF86, 0xFFD8 },
+    { 0x0029, 0x007F, 0x00E1, 0x0169, 0xFE96, 0xFF1E, 0xFF80, 0xFFD6 },
+    { 0x002A, 0x0084, 0x00EB, 0x0179, 0xFE86, 0xFF14, 0xFF7B, 0xFFD5 },
+    { 0x002C, 0x0089, 0x00F5, 0x0188, 0xFE77, 0xFF0A, 0xFF76, 0xFFD3 },
+    { 0x002E, 0x0090, 0x0100, 0x019A, 0xFE65, 0xFEFF, 0xFF6F, 0xFFD1 },
+    { 0x0030, 0x0096, 0x010B, 0x01AC, 0xFE53, 0xFEF4, 0xFF69, 0xFFCF },
+    { 0x0033, 0x009D, 0x0118, 0x01C1, 0xFE3E, 0xFEE7, 0xFF62, 0xFFCC },
+    { 0x0035, 0x00A5, 0x0125, 0x01D6, 0xFE29, 0xFEDA, 0xFF5A, 0xFFCA },
+    { 0x0037, 0x00AC, 0x0132, 0x01EA, 0xFE15, 0xFECD, 0xFF53, 0xFFC8 },
+    { 0x003A, 0x00B3, 0x013F, 0x01FF, 0xFE00, 0xFEC0, 0xFF4C, 0xFFC5 },
+    { 0x003C, 0x00BB, 0x014D, 0x0216, 0xFDE9, 0xFEB2, 0xFF44, 0xFFC3 },
+    { 0x003F, 0x00C3, 0x015C, 0x022D, 0xFDD2, 0xFEA3, 0xFF3C, 0xFFC0 },
+    { 0x0042, 0x00CD, 0x016C, 0x0247, 0xFDB8, 0xFE93, 0xFF32, 0xFFBD },
+    { 0x0045, 0x00D6, 0x017C, 0x0261, 0xFD9E, 0xFE83, 0xFF29, 0xFFBA },
+    { 0x0048, 0x00DF, 0x018C, 0x027B, 0xFD84, 0xFE73, 0xFF20, 0xFFB7 },
+    { 0x004B, 0x00E9, 0x019E, 0x0297, 0xFD68, 0xFE61, 0xFF16, 0xFFB4 },
+    { 0x004F, 0x00F4, 0x01B1, 0x02B6, 0xFD49, 0xFE4E, 0xFF0B, 0xFFB0 },
+    { 0x0052, 0x00FE, 0x01C5, 0x02D5, 0xFD2A, 0xFE3A, 0xFF01, 0xFFAD },
+    { 0x0056, 0x0109, 0x01D8, 0x02F4, 0xFD0B, 0xFE27, 0xFEF6, 0xFFA9 },
+    { 0x005A, 0x0116, 0x01EF, 0x0318, 0xFCE7, 0xFE10, 0xFEE9, 0xFFA5 },
+    { 0x005E, 0x0122, 0x0204, 0x033A, 0xFCC5, 0xFDFB, 0xFEDD, 0xFFA1 },
+    { 0x0062, 0x012F, 0x021A, 0x035E, 0xFCA1, 0xFDE5, 0xFED0, 0xFF9D },
+    { 0x0066, 0x013C, 0x0232, 0x0385, 0xFC7A, 0xFDCD, 0xFEC3, 0xFF99 },
+    { 0x006B, 0x014B, 0x024C, 0x03AE, 0xFC51, 0xFDB3, 0xFEB4, 0xFF94 },
+    { 0x0070, 0x0159, 0x0266, 0x03D7, 0xFC28, 0xFD99, 0xFEA6, 0xFF8F },
+    { 0x0075, 0x0169, 0x0281, 0x0403, 0xFBFC, 0xFD7E, 0xFE96, 0xFF8A },
+    { 0x007A, 0x0179, 0x029E, 0x0432, 0xFBCD, 0xFD61, 0xFE86, 0xFF85 },
+    { 0x007F, 0x018A, 0x02BD, 0x0463, 0xFB9C, 0xFD42, 0xFE75, 0xFF80 },
+    { 0x0085, 0x019B, 0x02DC, 0x0494, 0xFB6B, 0xFD23, 0xFE64, 0xFF7A },
+    { 0x008B, 0x01AE, 0x02FC, 0x04C8, 0xFB37, 0xFD03, 0xFE51, 0xFF74 },
+    { 0x0091, 0x01C1, 0x031F, 0x0500, 0xFAFF, 0xFCE0, 0xFE3E, 0xFF6E },
+    { 0x0098, 0x01D5, 0x0343, 0x0539, 0xFAC6, 0xFCBC, 0xFE2A, 0xFF67 },
+    { 0x009F, 0x01EA, 0x0368, 0x0575, 0xFA8A, 0xFC97, 0xFE15, 0xFF60 },
+    { 0x00A6, 0x0200, 0x038F, 0x05B3, 0xFA4C, 0xFC70, 0xFDFF, 0xFF59 },
+    { 0x00AD, 0x0217, 0x03B7, 0x05F3, 0xFA0C, 0xFC48, 0xFDE8, 0xFF52 },
+    { 0x00B5, 0x022E, 0x03E1, 0x0636, 0xF9C9, 0xFC1E, 0xFDD1, 0xFF4A },
+    { 0x00BD, 0x0248, 0x040E, 0x067F, 0xF980, 0xFBF1, 0xFDB7, 0xFF42 },
+    { 0x00C5, 0x0262, 0x043D, 0x06CA, 0xF935, 0xFBC2, 0xFD9D, 0xFF3A },
+    { 0x00CE, 0x027D, 0x046D, 0x0717, 0xF8E8, 0xFB92, 0xFD82, 0xFF31 },
+    { 0x00D7, 0x0299, 0x049F, 0x0767, 0xF898, 0xFB60, 0xFD66, 0xFF28 },
+    { 0x00E1, 0x02B7, 0x04D5, 0x07BC, 0xF843, 0xFB2A, 0xFD48, 0xFF1E },
+    { 0x00EB, 0x02D6, 0x050B, 0x0814, 0xF7EB, 0xFAF4, 0xFD29, 0xFF14 },
+    { 0x00F6, 0x02F7, 0x0545, 0x0871, 0xF78E, 0xFABA, 0xFD08, 0xFF09 },
+    { 0x0101, 0x0318, 0x0581, 0x08D1, 0xF72E, 0xFA7E, 0xFCE7, 0xFEFE },
+    { 0x010C, 0x033C, 0x05C0, 0x0935, 0xF6CA, 0xFA3F, 0xFCC3, 0xFEF3 },
+    { 0x0118, 0x0361, 0x0602, 0x099F, 0xF660, 0xF9FD, 0xFC9E, 0xFEE7 },
+    { 0x0125, 0x0387, 0x0646, 0x0A0C, 0xF5F3, 0xF9B9, 0xFC78, 0xFEDA },
+    { 0x0132, 0x03B0, 0x068E, 0x0A80, 0xF57F, 0xF971, 0xFC4F, 0xFECD },
+    { 0x013F, 0x03DA, 0x06D9, 0x0AF7, 0xF508, 0xF926, 0xFC25, 0xFEC0 },
+    { 0x014E, 0x0406, 0x0728, 0x0B75, 0xF48A, 0xF8D7, 0xFBF9, 0xFEB1 },
+    { 0x015D, 0x0434, 0x077A, 0x0BF9, 0xF406, 0xF885, 0xFBCB, 0xFEA2 },
+    { 0x016C, 0x0464, 0x07CF, 0x0C82, 0xF37D, 0xF830, 0xFB9B, 0xFE93 },
+    { 0x017C, 0x0496, 0x0828, 0x0D10, 0xF2EF, 0xF7D7, 0xFB69, 0xFE83 },
+    { 0x018E, 0x04CB, 0x0886, 0x0DA6, 0xF259, 0xF779, 0xFB34, 0xFE71 },
+    { 0x019F, 0x0501, 0x08E6, 0x0E41, 0xF1BE, 0xF719, 0xFAFE, 0xFE60 },
+    { 0x01B2, 0x053B, 0x094C, 0x0EE3, 0xF11C, 0xF6B3, 0xFAC4, 0xFE4D },
+    { 0x01C5, 0x0576, 0x09B6, 0x0F8E, 0xF071, 0xF649, 0xFA89, 0xFE3A },
+    { 0x01D9, 0x05B5, 0x0A26, 0x1040, 0xEFBF, 0xF5D9, 0xFA4A, 0xFE26 },
+    { 0x01EF, 0x05F6, 0x0A9A, 0x10FA, 0xEF05, 0xF565, 0xFA09, 0xFE10 },
+    { 0x0205, 0x063A, 0x0B13, 0x11BC, 0xEE43, 0xF4EC, 0xF9C5, 0xFDFA },
+    { 0x021C, 0x0681, 0x0B91, 0x1285, 0xED7A, 0xF46E, 0xF97E, 0xFDE3 },
+    { 0x0234, 0x06CC, 0x0C15, 0x1359, 0xECA6, 0xF3EA, 0xF933, 0xFDCB },
+    { 0x024D, 0x071A, 0x0CA0, 0x1437, 0xEBC8, 0xF35F, 0xF8E5, 0xFDB2 },
+    { 0x0267, 0x076A, 0x0D2F, 0x151D, 0xEAE2, 0xF2D0, 0xF895, 0xFD98 },
+    { 0x0283, 0x07C0, 0x0DC7, 0x160F, 0xE9F0, 0xF238, 0xF83F, 0xFD7C },
+    { 0x029F, 0x0818, 0x0E63, 0x170A, 0xE8F5, 0xF19C, 0xF7E7, 0xFD60 },
+    { 0x02BD, 0x0874, 0x0F08, 0x1811, 0xE7EE, 0xF0F7, 0xF78B, 0xFD42 },
+    { 0x02DD, 0x08D5, 0x0FB4, 0x1926, 0xE6D9, 0xF04B, 0xF72A, 0xFD22 },
+    { 0x02FE, 0x093A, 0x1067, 0x1A44, 0xE5BB, 0xEF98, 0xF6C5, 0xFD01 },
+    { 0x0320, 0x09A3, 0x1122, 0x1B70, 0xE48F, 0xEEDD, 0xF65C, 0xFCDF },
+    { 0x0344, 0x0A12, 0x11E7, 0x1CAB, 0xE354, 0xEE18, 0xF5ED, 0xFCBB },
+    { 0x0369, 0x0A84, 0x12B2, 0x1DF0, 0xE20F, 0xED4D, 0xF57B, 0xFC96 },
+    { 0x0390, 0x0AFD, 0x1389, 0x1F48, 0xE0B7, 0xEC76, 0xF502, 0xFC6F },
+    { 0x03B8, 0x0B7A, 0x1467, 0x20AC, 0xDF53, 0xEB98, 0xF485, 0xFC47 },
+    { 0x03E3, 0x0BFE, 0x1551, 0x2223, 0xDDDC, 0xEAAE, 0xF401, 0xFC1C },
+    { 0x040F, 0x0C87, 0x1645, 0x23A9, 0xDC56, 0xE9BA, 0xF378, 0xFBF0 },
+    { 0x043E, 0x0D16, 0x1744, 0x2541, 0xDABE, 0xE8BB, 0xF2E9, 0xFBC1 },
+    { 0x046E, 0x0DAB, 0x184C, 0x26E8, 0xD917, 0xE7B3, 0xF254, 0xFB91 },
+    { 0x04A1, 0x0E47, 0x1961, 0x28A4, 0xD75B, 0xE69E, 0xF1B8, 0xFB5E },
+    { 0x04D6, 0x0EEA, 0x1A84, 0x2A75, 0xD58A, 0xE57B, 0xF115, 0xFB29 },
+    { 0x050D, 0x0F95, 0x1BB3, 0x2C5B, 0xD3A4, 0xE44C, 0xF06A, 0xFAF2 },
+    { 0x0547, 0x1046, 0x1CEF, 0x2E55, 0xD1AA, 0xE310, 0xEFB9, 0xFAB8 },
+    { 0x0583, 0x1100, 0x1E3A, 0x3066, 0xCF99, 0xE1C5, 0xEEFF, 0xFA7C },
+    { 0x05C2, 0x11C3, 0x1F94, 0x3292, 0xCD6D, 0xE06B, 0xEE3C, 0xFA3D },
+    { 0x0604, 0x128E, 0x20FC, 0x34D2, 0xCB2D, 0xDF03, 0xED71, 0xF9FB },
+    { 0x0649, 0x1362, 0x2275, 0x372E, 0xC8D1, 0xDD8A, 0xEC9D, 0xF9B6 },
+    { 0x0690, 0x143F, 0x23FF, 0x39A4, 0xC65B, 0xDC00, 0xEBC0, 0xF96F },
+    { 0x06DC, 0x1527, 0x259A, 0x3C37, 0xC3C8, 0xDA65, 0xEAD8, 0xF923 },
+    { 0x072A, 0x1619, 0x2749, 0x3EE8, 0xC117, 0xD8B6, 0xE9E6, 0xF8D5 },
+    { 0x077C, 0x1715, 0x2909, 0x41B6, 0xBE49, 0xD6F6, 0xE8EA, 0xF883 },
+    { 0x07D1, 0x181D, 0x2ADF, 0x44A6, 0xBB59, 0xD520, 0xE7E2, 0xF82E },
+    { 0x082B, 0x1930, 0x2CC7, 0x47B4, 0xB84B, 0xD338, 0xE6CF, 0xF7D4 },
+    { 0x0888, 0x1A50, 0x2EC6, 0x4AE7, 0xB518, 0xD139, 0xE5AF, 0xF777 },
+    { 0x08EA, 0x1B7D, 0x30DE, 0x4E40, 0xB1BF, 0xCF21, 0xE482, 0xF715 },
+    { 0x094F, 0x1CB7, 0x330C, 0x51BE, 0xAE41, 0xCCF3, 0xE348, 0xF6B0 },
+    { 0x09BA, 0x1DFF, 0x3554, 0x5565, 0xAA9A, 0xCAAB, 0xE200, 0xF645 },
+    { 0x0A29, 0x1F55, 0x37B4, 0x5932, 0xA6CD, 0xC84B, 0xE0AA, 0xF5D6 },
+    { 0x0A9D, 0x20BC, 0x3A31, 0x5D2E, 0xA2D1, 0xC5CE, 0xDF43, 0xF562 },
+    { 0x0B16, 0x2231, 0x3CC9, 0x6156, 0x9EA9, 0xC336, 0xDDCE, 0xF4E9 },
+    { 0x0B95, 0x23B8, 0x3F80, 0x65AF, 0x9A50, 0xC07F, 0xDC47, 0xF46A },
+    { 0x0C19, 0x2551, 0x4256, 0x6A39, 0x95C6, 0xBDA9, 0xDAAE, 0xF3E6 },
+    { 0x0CA4, 0x26FB, 0x454C, 0x6EF7, 0x9108, 0xBAB3, 0xD904, 0xF35B },
+    { 0x0D34, 0x28B8, 0x4864, 0x73EB, 0x8C14, 0xB79B, 0xD747, 0xF2CB },
+    { 0x0DCB, 0x2A8A, 0x4B9F, 0x7918, 0x86E7, 0xB460, 0xD575, 0xF234 },
+    { 0x0E68, 0x2C6F, 0x4EFE, 0x7E7E, 0x8181, 0xB101, 0xD390, 0xF197 },
+    { 0x0F0D, 0x2E6B, 0x5285, 0x7FFF, 0x8000, 0xAD7A, 0xD194, 0xF0F2 },
+    { 0x0FB9, 0x307E, 0x5635, 0x7FFF, 0x8000, 0xA9CA, 0xCF81, 0xF046 },
+    { 0x106D, 0x32A7, 0x5A0D, 0x7FFF, 0x8000, 0xA5F2, 0xCD58, 0xEF92 },
+    { 0x1128, 0x34EA, 0x5E12, 0x7FFF, 0x8000, 0xA1ED, 0xCB15, 0xEED7 },
+    { 0x11ED, 0x3747, 0x6245, 0x7FFF, 0x8000, 0x9DBA, 0xC8B8, 0xEE12 },
+    { 0x12B9, 0x39BF, 0x66A8, 0x7FFF, 0x8000, 0x9957, 0xC640, 0xED46 },
+    { 0x138F, 0x3C52, 0x6B3C, 0x7FFF, 0x8000, 0x94C3, 0xC3AD, 0xEC70 },
+    { 0x146F, 0x3F04, 0x7006, 0x7FFF, 0x8000, 0x8FF9, 0xC0FB, 0xEB90 },
+    { 0x1558, 0x41D3, 0x7505, 0x7FFF, 0x8000, 0x8AFA, 0xBE2C, 0xEAA7 },
+    { 0x164C, 0x44C3, 0x7A3E, 0x7FFF, 0x8000, 0x85C1, 0xBB3C, 0xE9B3 },
+    { 0x174B, 0x47D5, 0x7FB3, 0x7FFF, 0x8000, 0x804C, 0xB82A, 0xE8B4 },
+    { 0x1855, 0x4B0A, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0xB4F5, 0xE7AA },
+    { 0x196B, 0x4E63, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0xB19C, 0xE694 },
+    { 0x1A8D, 0x51E3, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0xAE1C, 0xE572 },
+    { 0x1BBD, 0x558B, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0xAA74, 0xE442 },
+    { 0x1CFA, 0x595C, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0xA6A3, 0xE305 },
+    { 0x1E45, 0x5D59, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0xA2A6, 0xE1BA },
+    { 0x1F9F, 0x6184, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0x9E7B, 0xE060 },
+    { 0x2108, 0x65DE, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0x9A21, 0xDEF7 },
+    { 0x2281, 0x6A6A, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0x9595, 0xDD7E },
+    { 0x240C, 0x6F29, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0x90D6, 0xDBF3 },
+    { 0x25A7, 0x741F, 0x7FFF, 0x7FFF, 0x8000, 0x8000, 0x8BE0, 0xDA58 },
+};
+
+static const UINT16 MACEtab4[][8] = { /* 128 rows (two per source line) for the 2-bit code; columns 4..7 are zero so the table has the [][8] shape chomp3 takes */
+    { 0x0040, 0x00D8, 0xFF27, 0xFFBF, 0, 0, 0, 0 },  { 0x0043, 0x00E2, 0xFF1D, 0xFFBC, 0, 0, 0, 0 },
+    { 0x0046, 0x00EC, 0xFF13, 0xFFB9, 0, 0, 0, 0 },  { 0x004A, 0x00F6, 0xFF09, 0xFFB5, 0, 0, 0, 0 },
+    { 0x004D, 0x0101, 0xFEFE, 0xFFB2, 0, 0, 0, 0 },  { 0x0050, 0x010C, 0xFEF3, 0xFFAF, 0, 0, 0, 0 },
+    { 0x0054, 0x0118, 0xFEE7, 0xFFAB, 0, 0, 0, 0 },  { 0x0058, 0x0126, 0xFED9, 0xFFA7, 0, 0, 0, 0 },
+    { 0x005C, 0x0133, 0xFECC, 0xFFA3, 0, 0, 0, 0 },  { 0x0060, 0x0141, 0xFEBE, 0xFF9F, 0, 0, 0, 0 },
+    { 0x0064, 0x014E, 0xFEB1, 0xFF9B, 0, 0, 0, 0 },  { 0x0068, 0x015E, 0xFEA1, 0xFF97, 0, 0, 0, 0 },
+    { 0x006D, 0x016D, 0xFE92, 0xFF92, 0, 0, 0, 0 },  { 0x0072, 0x017E, 0xFE81, 0xFF8D, 0, 0, 0, 0 },
+    { 0x0077, 0x018F, 0xFE70, 0xFF88, 0, 0, 0, 0 },  { 0x007C, 0x01A0, 0xFE5F, 0xFF83, 0, 0, 0, 0 },
+    { 0x0082, 0x01B2, 0xFE4D, 0xFF7D, 0, 0, 0, 0 },  { 0x0088, 0x01C6, 0xFE39, 0xFF77, 0, 0, 0, 0 },
+    { 0x008E, 0x01DB, 0xFE24, 0xFF71, 0, 0, 0, 0 },  { 0x0094, 0x01EF, 0xFE10, 0xFF6B, 0, 0, 0, 0 },
+    { 0x009B, 0x0207, 0xFDF8, 0xFF64, 0, 0, 0, 0 },  { 0x00A2, 0x021D, 0xFDE2, 0xFF5D, 0, 0, 0, 0 },
+    { 0x00A9, 0x0234, 0xFDCB, 0xFF56, 0, 0, 0, 0 },  { 0x00B0, 0x024E, 0xFDB1, 0xFF4F, 0, 0, 0, 0 },
+    { 0x00B9, 0x0269, 0xFD96, 0xFF46, 0, 0, 0, 0 },  { 0x00C1, 0x0284, 0xFD7B, 0xFF3E, 0, 0, 0, 0 },
+    { 0x00C9, 0x02A1, 0xFD5E, 0xFF36, 0, 0, 0, 0 },  { 0x00D2, 0x02BF, 0xFD40, 0xFF2D, 0, 0, 0, 0 },
+    { 0x00DC, 0x02DF, 0xFD20, 0xFF23, 0, 0, 0, 0 },  { 0x00E6, 0x02FF, 0xFD00, 0xFF19, 0, 0, 0, 0 },
+    { 0x00F0, 0x0321, 0xFCDE, 0xFF0F, 0, 0, 0, 0 },  { 0x00FB, 0x0346, 0xFCB9, 0xFF04, 0, 0, 0, 0 },
+    { 0x0106, 0x036C, 0xFC93, 0xFEF9, 0, 0, 0, 0 },  { 0x0112, 0x0392, 0xFC6D, 0xFEED, 0, 0, 0, 0 },
+    { 0x011E, 0x03BB, 0xFC44, 0xFEE1, 0, 0, 0, 0 },  { 0x012B, 0x03E5, 0xFC1A, 0xFED4, 0, 0, 0, 0 },
+    { 0x0138, 0x0411, 0xFBEE, 0xFEC7, 0, 0, 0, 0 },  { 0x0146, 0x0441, 0xFBBE, 0xFEB9, 0, 0, 0, 0 },
+    { 0x0155, 0x0472, 0xFB8D, 0xFEAA, 0, 0, 0, 0 },  { 0x0164, 0x04A4, 0xFB5B, 0xFE9B, 0, 0, 0, 0 },
+    { 0x0174, 0x04D9, 0xFB26, 0xFE8B, 0, 0, 0, 0 },  { 0x0184, 0x0511, 0xFAEE, 0xFE7B, 0, 0, 0, 0 },
+    { 0x0196, 0x054A, 0xFAB5, 0xFE69, 0, 0, 0, 0 },  { 0x01A8, 0x0587, 0xFA78, 0xFE57, 0, 0, 0, 0 },
+    { 0x01BB, 0x05C6, 0xFA39, 0xFE44, 0, 0, 0, 0 },  { 0x01CE, 0x0608, 0xF9F7, 0xFE31, 0, 0, 0, 0 },
+    { 0x01E3, 0x064D, 0xF9B2, 0xFE1C, 0, 0, 0, 0 },  { 0x01F9, 0x0694, 0xF96B, 0xFE06, 0, 0, 0, 0 },
+    { 0x020F, 0x06E0, 0xF91F, 0xFDF0, 0, 0, 0, 0 },  { 0x0227, 0x072E, 0xF8D1, 0xFDD8, 0, 0, 0, 0 },
+    { 0x0240, 0x0781, 0xF87E, 0xFDBF, 0, 0, 0, 0 },  { 0x0259, 0x07D7, 0xF828, 0xFDA6, 0, 0, 0, 0 },
+    { 0x0274, 0x0831, 0xF7CE, 0xFD8B, 0, 0, 0, 0 },  { 0x0290, 0x088E, 0xF771, 0xFD6F, 0, 0, 0, 0 },
+    { 0x02AE, 0x08F0, 0xF70F, 0xFD51, 0, 0, 0, 0 },  { 0x02CC, 0x0955, 0xF6AA, 0xFD33, 0, 0, 0, 0 },
+    { 0x02EC, 0x09C0, 0xF63F, 0xFD13, 0, 0, 0, 0 },  { 0x030D, 0x0A2F, 0xF5D0, 0xFCF2, 0, 0, 0, 0 },
+    { 0x0330, 0x0AA4, 0xF55B, 0xFCCF, 0, 0, 0, 0 },  { 0x0355, 0x0B1E, 0xF4E1, 0xFCAA, 0, 0, 0, 0 },
+    { 0x037B, 0x0B9D, 0xF462, 0xFC84, 0, 0, 0, 0 },  { 0x03A2, 0x0C20, 0xF3DF, 0xFC5D, 0, 0, 0, 0 },
+    { 0x03CC, 0x0CAB, 0xF354, 0xFC33, 0, 0, 0, 0 },  { 0x03F8, 0x0D3D, 0xF2C2, 0xFC07, 0, 0, 0, 0 },
+    { 0x0425, 0x0DD3, 0xF22C, 0xFBDA, 0, 0, 0, 0 },  { 0x0454, 0x0E72, 0xF18D, 0xFBAB, 0, 0, 0, 0 },
+    { 0x0486, 0x0F16, 0xF0E9, 0xFB79, 0, 0, 0, 0 },  { 0x04B9, 0x0FC3, 0xF03C, 0xFB46, 0, 0, 0, 0 },
+    { 0x04F0, 0x1078, 0xEF87, 0xFB0F, 0, 0, 0, 0 },  { 0x0528, 0x1133, 0xEECC, 0xFAD7, 0, 0, 0, 0 },
+    { 0x0563, 0x11F7, 0xEE08, 0xFA9C, 0, 0, 0, 0 },  { 0x05A1, 0x12C6, 0xED39, 0xFA5E, 0, 0, 0, 0 },
+    { 0x05E1, 0x139B, 0xEC64, 0xFA1E, 0, 0, 0, 0 },  { 0x0624, 0x147C, 0xEB83, 0xF9DB, 0, 0, 0, 0 },
+    { 0x066A, 0x1565, 0xEA9A, 0xF995, 0, 0, 0, 0 },  { 0x06B3, 0x165A, 0xE9A5, 0xF94C, 0, 0, 0, 0 },
+    { 0x0700, 0x175A, 0xE8A5, 0xF8FF, 0, 0, 0, 0 },  { 0x0750, 0x1865, 0xE79A, 0xF8AF, 0, 0, 0, 0 },
+    { 0x07A3, 0x197A, 0xE685, 0xF85C, 0, 0, 0, 0 },  { 0x07FB, 0x1A9D, 0xE562, 0xF804, 0, 0, 0, 0 },
+    { 0x0856, 0x1BCE, 0xE431, 0xF7A9, 0, 0, 0, 0 },  { 0x08B5, 0x1D0C, 0xE2F3, 0xF74A, 0, 0, 0, 0 },
+    { 0x0919, 0x1E57, 0xE1A8, 0xF6E6, 0, 0, 0, 0 },  { 0x0980, 0x1FB2, 0xE04D, 0xF67F, 0, 0, 0, 0 },
+    { 0x09ED, 0x211D, 0xDEE2, 0xF612, 0, 0, 0, 0 },  { 0x0A5F, 0x2296, 0xDD69, 0xF5A0, 0, 0, 0, 0 },
+    { 0x0AD5, 0x2422, 0xDBDD, 0xF52A, 0, 0, 0, 0 },  { 0x0B51, 0x25BF, 0xDA40, 0xF4AE, 0, 0, 0, 0 },
+    { 0x0BD2, 0x276E, 0xD891, 0xF42D, 0, 0, 0, 0 },  { 0x0C5A, 0x2932, 0xD6CD, 0xF3A5, 0, 0, 0, 0 },
+    { 0x0CE7, 0x2B08, 0xD4F7, 0xF318, 0, 0, 0, 0 },  { 0x0D7A, 0x2CF4, 0xD30B, 0xF285, 0, 0, 0, 0 },
+    { 0x0E14, 0x2EF4, 0xD10B, 0xF1EB, 0, 0, 0, 0 },  { 0x0EB5, 0x310C, 0xCEF3, 0xF14A, 0, 0, 0, 0 },
+    { 0x0F5D, 0x333E, 0xCCC1, 0xF0A2, 0, 0, 0, 0 },  { 0x100C, 0x3587, 0xCA78, 0xEFF3, 0, 0, 0, 0 },
+    { 0x10C4, 0x37EB, 0xC814, 0xEF3B, 0, 0, 0, 0 },  { 0x1183, 0x3A69, 0xC596, 0xEE7C, 0, 0, 0, 0 },
+    { 0x124B, 0x3D05, 0xC2FA, 0xEDB4, 0, 0, 0, 0 },  { 0x131C, 0x3FBE, 0xC041, 0xECE3, 0, 0, 0, 0 },
+    { 0x13F7, 0x4296, 0xBD69, 0xEC08, 0, 0, 0, 0 },  { 0x14DB, 0x458F, 0xBA70, 0xEB24, 0, 0, 0, 0 },
+    { 0x15C9, 0x48AA, 0xB755, 0xEA36, 0, 0, 0, 0 },  { 0x16C2, 0x4BE9, 0xB416, 0xE93D, 0, 0, 0, 0 },
+    { 0x17C6, 0x4F4C, 0xB0B3, 0xE839, 0, 0, 0, 0 },  { 0x18D6, 0x52D5, 0xAD2A, 0xE729, 0, 0, 0, 0 },
+    { 0x19F2, 0x5688, 0xA977, 0xE60D, 0, 0, 0, 0 },  { 0x1B1A, 0x5A65, 0xA59A, 0xE4E5, 0, 0, 0, 0 },
+    { 0x1C50, 0x5E6D, 0xA192, 0xE3AF, 0, 0, 0, 0 },  { 0x1D93, 0x62A4, 0x9D5B, 0xE26C, 0, 0, 0, 0 },
+    { 0x1EE5, 0x670C, 0x98F3, 0xE11A, 0, 0, 0, 0 },  { 0x2046, 0x6BA5, 0x945A, 0xDFB9, 0, 0, 0, 0 },
+    { 0x21B7, 0x7072, 0x8F8D, 0xDE48, 0, 0, 0, 0 },  { 0x2338, 0x7578, 0x8A87, 0xDCC7, 0, 0, 0, 0 },
+    { 0x24CB, 0x7AB5, 0x854A, 0xDB34, 0, 0, 0, 0 },  { 0x266F, 0x7FFF, 0x8000, 0xD990, 0, 0, 0, 0 },
+    { 0x2826, 0x7FFF, 0x8000, 0xD7D9, 0, 0, 0, 0 },  { 0x29F1, 0x7FFF, 0x8000, 0xD60E, 0, 0, 0, 0 },
+    { 0x2BD0, 0x7FFF, 0x8000, 0xD42F, 0, 0, 0, 0 },  { 0x2DC5, 0x7FFF, 0x8000, 0xD23A, 0, 0, 0, 0 },
+    { 0x2FD0, 0x7FFF, 0x8000, 0xD02F, 0, 0, 0, 0 },  { 0x31F2, 0x7FFF, 0x8000, 0xCE0D, 0, 0, 0, 0 },
+    { 0x342C, 0x7FFF, 0x8000, 0xCBD3, 0, 0, 0, 0 },  { 0x3681, 0x7FFF, 0x8000, 0xC97E, 0, 0, 0, 0 },
+    { 0x38F0, 0x7FFF, 0x8000, 0xC70F, 0, 0, 0, 0 },  { 0x3B7A, 0x7FFF, 0x8000, 0xC485, 0, 0, 0, 0 },
+    { 0x3E22, 0x7FFF, 0x8000, 0xC1DD, 0, 0, 0, 0 },  { 0x40E7, 0x7FFF, 0x8000, 0xBF18, 0, 0, 0, 0 },
+};
+/* end of constants */
+
+typedef struct MACEContext { /* decoder state shared by the chomp3() and chomp6() paths */
+  short index, lev, factor, prev2, previous, level; /* index: table-row selector; lev: level used by chomp3(); factor/prev2/previous/level: predictor state used by chomp6() */
+  short *outPtr; /* write cursor into the output sample buffer, advanced by chomp3()/chomp6() */
+} MACEContext;
+
+/* /// "chomp3()" */ /* Decode one code `val` into one 16-bit output sample, updating ctx->lev and ctx->index. */
+static void chomp3(MACEContext *ctx,      /* decoder state; lev, index and outPtr are updated */
+            UINT8 val,                    /* code to decode: column of tab2 and index into tab1 */
+            const UINT16 tab1[],          /* per-code adjustment added to ctx->index */
+            const UINT16 tab2[][8])       /* sample deltas; row chosen from bits 4..10 of ctx->index */
+{
+  short current;
+
+  current=(short)tab2[(ctx->index & 0x7f0) >> 4][val]; /* table word cast to a signed delta */
+  if (current+ctx->lev > 32767) current=32767;         /* clamp delta+level at the top of the 16-bit range */
+  else if (current+ctx->lev < -32768) current=-32767;  /* NOTE(review): low clamp stores -32767, not -32768 - confirm this is intended */
+  else current+=ctx->lev;
+  ctx->lev=current-(current >> 3);                     /* next level = this sample minus one eighth of it */
+/* An 8-bit variant of the output would store current >> 8; the full 16-bit sample is written instead. */
+  *ctx->outPtr++=current;
+  if ( ( ctx->index += tab1[val]-(ctx->index>>5) ) < 0 ) ctx->index = 0; /* adapt the row selector, never letting it go negative */
+}
+/* \\\ */
+
+/* /// "Exp1to3()" */
+/*
+ * Expand one channel of MACE3 data: every 2-byte packet yields 6 samples
+ * (six chomp3() calls, one sample each).
+ *
+ * ctx          decoder state; index and lev are reset to 0 on entry
+ * inBuffer     start of the packed input for all channels
+ * outBuffer    destination for the decoded 16-bit samples
+ * cnt          number of 2-byte packets to decode for this channel
+ * numChannels  channel count; consecutive packets of one channel lie
+ *              numChannels*2 bytes apart in inBuffer
+ * whichChannel 1-based channel to decode
+ */
+static void Exp1to3(MACEContext *ctx,
+             UINT8 *inBuffer,
+             void *outBuffer,
+             UINT32 cnt,
+             UINT32 numChannels,
+             UINT32 whichChannel)
+{
+   UINT8 *src = inBuffer + (whichChannel-1)*2;
+   UINT32 remaining;
+   int b;
+
+   /* no state is carried over between calls */
+   ctx->lev=0;
+   ctx->index=0;
+   ctx->outPtr=outBuffer;
+
+   for (remaining=cnt; remaining>0; --remaining) {
+     /* both bytes of the packet are split the same way, low bits first:
+        bits 0-2, then bits 3-4, then bits 5-7 */
+     for (b=0; b<2; b++) {
+       const UINT8 code=src[b];
+
+       chomp3(ctx, code       & 7, MACEtab1, MACEtab2);
+       chomp3(ctx,(code >> 3) & 3, MACEtab3, MACEtab4);
+       chomp3(ctx, code >> 5     , MACEtab1, MACEtab2);
+     }
+
+     src+=numChannels*2;
+   }
+}
+/* \\\ */
+
+/* /// "chomp6()" */ /* Decode one code `val` into two 16-bit output samples, updating the predictor state in ctx. */
+static void chomp6(MACEContext *ctx,      /* decoder state; factor, level, prev2, previous, index and outPtr are updated */
+            UINT8 val,                    /* code to decode: column of tab2 and index into tab1 */
+            const UINT16 tab1[],          /* per-code adjustment added to ctx->index */
+            const UINT16 tab2[][8])       /* sample deltas; row chosen from bits 4..10 of ctx->index */
+{
+  short current;
+
+  current=(short)tab2[(ctx->index & 0x7f0) >> 4][val]; /* table word cast to a signed delta */
+
+  if ((ctx->previous^current)>=0) {                    /* sign bits agree: raise factor by 506, capped at 32767 */
+    if (ctx->factor+506>32767) ctx->factor=32767;
+    else ctx->factor+=506;
+  } else {                                             /* sign bits differ: lower factor by 314 */
+    if (ctx->factor-314<-32768) ctx->factor=-32767;    /* NOTE(review): low clamp stores -32767, not -32768 - confirm intended */
+    else ctx->factor-=314;
+  }
+
+  if (current+ctx->level>32767) current=32767;         /* clamp delta+level to the 16-bit range */
+  else if (current+ctx->level<-32768) current=-32767;  /* same asymmetric low clamp as above */
+  else current+=ctx->level;
+
+  ctx->level=((current*ctx->factor) >> 15);            /* next level = current scaled by factor/32768 */
+  current>>=1;                                         /* halve before mixing with the two previous values */
+
+/* Two samples are emitted per code, each combining previous, prev2 and current; */
+/* an 8-bit variant would shift both results right by 8, the full values are stored instead. */
+  *ctx->outPtr++=(ctx->previous+ctx->prev2-((ctx->prev2-current) >> 2));
+  *ctx->outPtr++=(ctx->previous+current+((ctx->prev2-current) >> 2));
+
+  ctx->prev2=ctx->previous;                            /* shift the two-value history */
+  ctx->previous=current;
+
+  if( ( ctx->index += tab1[val]-(ctx->index>>5) ) < 0 ) ctx->index = 0; /* adapt the row selector, never letting it go negative */
+}
+/* \\\ */
+
+/* /// "Exp1to6()" */
+/*
+ * Expand one channel of MACE6 data: every input byte yields 6 samples
+ * (three chomp6() calls, two samples each).
+ *
+ * ctx          decoder state; previous, prev2, index, level and factor
+ *              are reset to 0 on entry
+ * inBuffer     start of the packed input for all channels
+ * outBuffer    destination for the decoded 16-bit samples
+ * cnt          number of input bytes to decode for this channel
+ * numChannels  channel count; consecutive bytes of one channel lie
+ *              numChannels bytes apart in inBuffer
+ * whichChannel 1-based channel to decode
+ */
+static void Exp1to6(MACEContext *ctx,
+             UINT8 *inBuffer,
+             void *outBuffer,
+             UINT32 cnt,
+             UINT32 numChannels,
+             UINT32 whichChannel)
+{
+   UINT8 *src = inBuffer + (whichChannel-1);
+   UINT32 remaining;
+
+   /* no state is carried over between calls */
+   ctx->factor=0;
+   ctx->level=0;
+   ctx->index=0;
+   ctx->prev2=0;
+   ctx->previous=0;
+   ctx->outPtr=outBuffer;
+
+   for (remaining=cnt; remaining>0; --remaining) {
+     const UINT8 code=*src;
+
+     /* three codes per byte, taken from the top bits down:
+        bits 5-7, then bits 3-4, then bits 0-2 */
+     chomp6(ctx, code >> 5     , MACEtab1, MACEtab2);
+     chomp6(ctx,(code >> 3) & 3, MACEtab3, MACEtab4);
+     chomp6(ctx, code       & 7, MACEtab1, MACEtab2);
+
+     /* step over the bytes that belong to the other channels */
+     src+=numChannels;
+   }
+}
+/* \\\ */
+
+static int mace_decode_init(AVCodecContext * avctx)
+{
+    /* The frame decoder only handles one or two channels:
+       report -1 for anything wider, 0 otherwise. */
+    return (avctx->channels > 2) ? -1 : 0;
+}
+
+static int mace_decode_frame(AVCodecContext *avctx,
+                            void *data, int *data_size,   /* out: decoded 16-bit samples and their size in bytes */
+                            UINT8 *buf, int buf_size)     /* in: one packet of coded bytes */
+{   /* Returns buf_size when the packet was decoded, -1 (with *data_size = 0) for an unknown codec id. */
+    short *samples = (short *)data;
+    MACEContext *c = avctx->priv_data;
+    /* Stereo input alternates between the channels, so each channel owns only half of buf. */
+    const int stereo = (avctx->channels == 2);
+    switch (avctx->codec->id) {
+    case CODEC_ID_MACE3:
+        /* 2 bytes per channel packet. NOTE(review): the expanders write contiguously, so the samples+1 pass still overlaps channel 1. */
+        Exp1to3(c, buf, samples, buf_size / (stereo ? 4 : 2), avctx->channels, 1);
+        if (stereo)
+            Exp1to3(c, buf, samples+1, buf_size / 4, 2, 2);
+        *data_size = 2 * 3 * buf_size;
+        break;
+    case CODEC_ID_MACE6:
+        /* 1 byte per channel packet; the same per-channel split and the same output caveat apply. */
+        Exp1to6(c, buf, samples, buf_size / (stereo ? 2 : 1), avctx->channels, 1);
+        if (stereo)
+            Exp1to6(c, buf, samples+1, buf_size / 2, 2, 2);
+        *data_size = 2 * 6 * buf_size;
+        break;
+    default:
+        *data_size = 0;
+        return -1;
+    }
+    return buf_size;
+}
+
+AVCodec mace3_decoder = {   /* codec table entry for CODEC_ID_MACE3 */
+    "mace3",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_MACE3,
+    sizeof(MACEContext),    /* avctx->priv_data is used as a MACEContext by mace_decode_frame() */
+    mace_decode_init,
+    NULL,                   /* NOTE(review): presumably the encode slot - confirm against AVCodec in avcodec.h */
+    NULL,                   /* NOTE(review): presumably the close slot - confirm; MACEContext owns no allocations */
+    mace_decode_frame,
+};
+
+AVCodec mace6_decoder = {   /* codec table entry for CODEC_ID_MACE6 */
+    "mace6",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_MACE6,
+    sizeof(MACEContext),    /* avctx->priv_data is used as a MACEContext by mace_decode_frame() */
+    mace_decode_init,
+    NULL,                   /* NOTE(review): presumably the encode slot - confirm against AVCodec in avcodec.h */
+    NULL,                   /* NOTE(review): presumably the close slot - confirm; MACEContext owns no allocations */
+    mace_decode_frame,
+};
+
diff --git a/src/libffmpeg/libavcodec/mdct.c b/src/libffmpeg/libavcodec/mdct.c
index 0f70ed850..91a927cb0 100644
--- a/src/libffmpeg/libavcodec/mdct.c
+++ b/src/libffmpeg/libavcodec/mdct.c
@@ -31,10 +31,10 @@ int ff_mdct_init(MDCTContext *s, int nbits, int inverse)
     s->nbits = nbits;
     s->n = n;
     n4 = n >> 2;
-    s->tcos = malloc(n4 * sizeof(FFTSample));
+    s->tcos = av_malloc(n4 * sizeof(FFTSample));
     if (!s->tcos)
         goto fail;
-    s->tsin = malloc(n4 * sizeof(FFTSample));
+    s->tsin = av_malloc(n4 * sizeof(FFTSample));
     if (!s->tsin)
         goto fail;
 
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index eed155c19..9b4943582 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -720,6 +720,8 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
     s->buffer_size = 102400; /* smaller buffer should be enough,
 				but photojpg files could ahive bigger sizes */
     s->buffer = av_malloc(s->buffer_size);
+    if (!s->buffer)
+	return -1;
     s->start_code = -1;
     s->first_picture = 1;
     s->org_width = avctx->width;
@@ -1180,17 +1182,28 @@ static int mjpeg_decode_app(MJpegDecodeContext *s)
 	    get_bits(&s->gb, 8), get_bits(&s->gb, 8));
 	if (get_bits(&s->gb, 8) == 0)
 	{
-	    int x_density = get_bits(&s->gb, 16);
-	    int y_density = get_bits(&s->gb, 16);
+	    int x_density, y_density; 
+	    x_density = get_bits(&s->gb, 16);
+	    y_density = get_bits(&s->gb, 16);
 
+	    dprintf("x/y density: %d (%f), %d (%f)\n", x_density,
+		(float)x_density, y_density, (float)y_density);
+#if 0
             //MN: needs to be checked
-            s->avctx->aspect_ratio= s->width*y_density/((float)s->height*x_density);
+            if(x_density)
+//                s->avctx->aspect_ratio= s->width*y_density/((float)s->height*x_density);
+		s->avctx->aspect_ratio = (float)x_density/y_density;
+		/* it's better, but every JFIF I have seen stores 1:1 */
+            else
+                s->avctx->aspect_ratio= 0.0;
+#endif
 	}
 	else
 	{
 	    skip_bits(&s->gb, 16);
 	    skip_bits(&s->gb, 16);
 	}
+
 	t_w = get_bits(&s->gb, 8);
 	t_h = get_bits(&s->gb, 8);
 	if (t_w && t_h)
@@ -1431,7 +1444,6 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
 		case EOI:
 eoi_parser:
 		    {
-                        int l;
                         if (s->interlaced) {
                             s->bottom_field ^= 1;
                             /* if not bottom field, do not output image yet */
@@ -1440,15 +1452,8 @@ eoi_parser:
                         }
                         for(i=0;i<3;i++) {
                             picture->data[i] = s->current_picture[i];
-#if 1
-                            l = s->linesize[i];
-                            if (s->interlaced)
-                                l >>= 1;
-                            picture->linesize[i] = l;
-#else
 			    picture->linesize[i] = (s->interlaced) ?
 				s->linesize[i] >> 1 : s->linesize[i];
-#endif
                         }
                         *data_size = sizeof(AVPicture);
                         avctx->height = s->height;
diff --git a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
index 109564f02..b951cd455 100644
--- a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
+++ b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
@@ -14,7 +14,7 @@
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "../dsputil.h"
@@ -23,281 +23,247 @@
 #include <mlib_types.h>
 #include <mlib_status.h>
 #include <mlib_sys.h>
-#include <mlib_algebra.h>
 #include <mlib_video.h>
 
-static void get_pixels_mlib(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
-{
-  int i;
-
-  for (i=0;i<8;i++) {
-    mlib_VectorConvert_S16_U8_Mod((mlib_s16 *)block, (mlib_u8 *)pixels, 8);
 
-    pixels += line_size;
-    block += 8;
-  }
-}
+/* copy block, width 16 pixel, height 8/16 */
 
-static void diff_pixels_mlib(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2, int line_size)
+static void put_pixels16_mlib (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
-  int i;
-
-  for (i=0;i<8;i++) {
-    mlib_VectorSub_S16_U8_Mod((mlib_s16 *)block, (mlib_u8 *)s1, (mlib_u8 *)s2, 8);
-
-    s1 += line_size;
-    s2 += line_size;
-    block += 8;
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoCopyRef_U8_U8_16x16(dest, (uint8_t *)ref, stride);
+    else
+	mlib_VideoCopyRef_U8_U8_16x8 (dest, (uint8_t *)ref, stride);
 }
 
-static void put_pixels_clamped_mlib(const DCTELEM *block, UINT8 *restrict pixels, int line_size)
+static void put_pixels16_x2_mlib (uint8_t * dest, const uint8_t * ref,
+				  int stride, int height)
 {
-  int i;
-
-  for(i=0;i<8;i++) {
-    mlib_VectorConvert_U8_S16_Sat(pixels, (mlib_s16 *)block, 8);
-
-    pixels += line_size;
-    block += 8;
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpX_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpX_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void add_pixels_clamped_mlib(const DCTELEM *block, UINT8 *restrict pixels, int line_size)
+static void put_pixels16_y2_mlib (uint8_t * dest, const uint8_t * ref,
+				  int stride, int height)
 {
-  mlib_VideoAddBlock_U8_S16(pixels, (mlib_s16 *)block, line_size);
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpY_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void put_pixels16_mlib (uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void put_pixels16_xy2_mlib(uint8_t * dest, const uint8_t * ref,
+				  int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoCopyRef_U8_U8_16x16(dest, (uint8_t *)ref, stride);
-  }
-  else {
-    mlib_VideoCopyRef_U8_U8_16x8 (dest, (uint8_t *)ref, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16) 
+	mlib_VideoInterpXY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpXY_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void put_pixels16_x2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
-{
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpX_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpX_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
-  }
-}
 
-static void put_pixels16_y2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
-{
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpY_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
-  }
-}
+/* copy block, width 8 pixel, height 8/16 */
 
-static void put_pixels16_xy2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void put_pixels8_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpXY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpXY_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoCopyRef_U8_U8_8x16(dest, (uint8_t *)ref, stride);
+    else
+	mlib_VideoCopyRef_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
 }
 
-static void put_pixels8_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void put_pixels8_x2_mlib (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoCopyRef_U8_U8_8x16(dest, (uint8_t *)ref, stride);
-  }
-  else {
-    mlib_VideoCopyRef_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void put_pixels8_x2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void put_pixels8_y2_mlib (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpX_U8_U8_8x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void put_pixels8_y2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void put_pixels8_xy2_mlib(uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpY_U8_U8_8x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16) 
+	mlib_VideoInterpXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void put_pixels8_xy2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+
+/* average/merge dest+source block, width 16 pixel, height 8/16 */
+
+static void avg_pixels16_mlib (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpXY_U8_U8_8x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoCopyRefAve_U8_U8_16x16(dest, (uint8_t *)ref, stride);
+    else
+	mlib_VideoCopyRefAve_U8_U8_16x8 (dest, (uint8_t *)ref, stride);
 }
 
-static void avg_pixels16_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void avg_pixels16_x2_mlib (uint8_t * dest, const uint8_t * ref,
+				  int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16){
-    mlib_VideoCopyRefAve_U8_U8_16x16(dest, (uint8_t *)ref, stride);
-  }
-  else {
-    mlib_VideoCopyRefAve_U8_U8_16x8(dest, (uint8_t *)ref, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpAveX_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpAveX_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void avg_pixels16_x2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void avg_pixels16_y2_mlib (uint8_t * dest, const uint8_t * ref,
+				  int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpAveX_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpAveX_U8_U8_16x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpAveY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpAveY_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void avg_pixels16_y2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void avg_pixels16_xy2_mlib (uint8_t * dest, const uint8_t * ref,
+				   int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpAveY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpAveY_U8_U8_16x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpAveXY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpAveXY_U8_U8_16x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void avg_pixels16_xy2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+
+/* average/merge dest+source block, width 8 pixel, height 8/16 */
+
+static void avg_pixels8_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpAveXY_U8_U8_16x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpAveXY_U8_U8_16x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoCopyRefAve_U8_U8_8x16(dest, (uint8_t *)ref, stride);
+    else
+	mlib_VideoCopyRefAve_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
 }
 
-static void avg_pixels8_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void avg_pixels8_x2_mlib (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoCopyRefAve_U8_U8_8x16(dest, (uint8_t *)ref, stride);
-  }
-  else {
-    mlib_VideoCopyRefAve_U8_U8_8x8(dest, (uint8_t *)ref, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpAveX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpAveX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void avg_pixels8_x2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void avg_pixels8_y2_mlib (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpAveX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpAveX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpAveY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpAveY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void avg_pixels8_y2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+static void avg_pixels8_xy2_mlib (uint8_t * dest, const uint8_t * ref,
+				  int stride, int height)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpAveY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpAveY_U8_U8_8x8(dest, (uint8_t *)ref, stride, stride);
-  }
+    assert(height == 16 || height == 8);
+    if (height == 16)
+	mlib_VideoInterpAveXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+	mlib_VideoInterpAveXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
 }
 
-static void avg_pixels8_xy2_mlib(uint8_t *dest, const uint8_t *ref, int stride, int height)
+
+static void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
+
+
+static void add_pixels_clamped_mlib(const DCTELEM *block, UINT8 *pixels, int line_size)
 {
-  assert(height == 16 || height == 8);
-  if (height == 16) {
-    mlib_VideoInterpAveXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-  }
-  else {
-    mlib_VideoInterpAveXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-  }
+    mlib_VideoAddBlock_U8_S16(pixels, (mlib_s16 *)block, line_size);
 }
 
-void dsputil_init_mlib(DSPContext* c, unsigned mask)
+
+/* XXX: those functions should be suppressed ASAP when all IDCTs are
+   converted */
+static void ff_idct_put_mlib(UINT8 *dest, int line_size, DCTELEM *data)
 {
-  c->get_pixels = get_pixels_mlib;
-  c->diff_pixels = diff_pixels_mlib;
-
-  c->put_pixels_clamped = put_pixels_clamped_mlib;
-  c->add_pixels_clamped = add_pixels_clamped_mlib;
-
-  c->put_pixels_tab[0][0] = put_pixels16_mlib;
-  c->put_pixels_tab[0][1] = put_pixels16_x2_mlib; 
-  c->put_pixels_tab[0][2] = put_pixels16_y2_mlib;
-  c->put_pixels_tab[0][3] = put_pixels16_xy2_mlib;
-  c->put_pixels_tab[1][0] = put_pixels8_mlib;
-  c->put_pixels_tab[1][1] = put_pixels8_x2_mlib;
-  c->put_pixels_tab[1][2] = put_pixels8_y2_mlib;
-  c->put_pixels_tab[1][3] = put_pixels8_xy2_mlib;
-
-  c->avg_pixels_tab[0][0] = avg_pixels16_mlib;
-  c->avg_pixels_tab[0][1] = avg_pixels16_x2_mlib;
-  c->avg_pixels_tab[0][2] = avg_pixels16_y2_mlib;
-  c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mlib; 
-  c->avg_pixels_tab[1][0] = avg_pixels8_mlib;
-  c->avg_pixels_tab[1][1] = avg_pixels8_x2_mlib;
-  c->avg_pixels_tab[1][2] = avg_pixels8_y2_mlib;
-  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mlib;
+    mlib_VideoIDCT8x8_S16_S16 (data, data);
+    put_pixels_clamped(data, dest, line_size);
 }
 
-static void ff_fdct_mlib(DCTELEM *data)
+static void ff_idct_add_mlib(UINT8 *dest, int line_size, DCTELEM *data)
 {
-  mlib_VideoDCT8x8_S16_S16(data, data);
+    mlib_VideoIDCT8x8_S16_S16 (data, data);
+    mlib_VideoAddBlock_U8_S16(dest, (mlib_s16 *)data, line_size);
 }
 
-static void ff_idct_put_mlib(UINT8 *dest, int line_size, DCTELEM *data)
+static void ff_fdct_mlib(DCTELEM *data)
 {
-  mlib_VideoIDCT8x8_S16_S16(data, data);
-  put_pixels_clamped_mlib(data, dest, line_size);
+    mlib_VideoDCT8x8_S16_S16 (data, data);
 }
 
-static void ff_idct_add_mlib(UINT8 *dest, int line_size, DCTELEM *data)
+void dsputil_init_mlib(DSPContext* c, unsigned mask)
 {
-  mlib_VideoIDCT8x8_S16_S16(data, data);
-  add_pixels_clamped_mlib(data, dest, line_size);
+    c->put_pixels_tab[0][0] = put_pixels16_mlib;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_mlib;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_mlib;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_mlib;
+    c->put_pixels_tab[1][0] = put_pixels8_mlib;
+    c->put_pixels_tab[1][1] = put_pixels8_x2_mlib;
+    c->put_pixels_tab[1][2] = put_pixels8_y2_mlib;
+    c->put_pixels_tab[1][3] = put_pixels8_xy2_mlib;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_mlib;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mlib;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mlib;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mlib;
+    c->avg_pixels_tab[1][0] = avg_pixels8_mlib;
+    c->avg_pixels_tab[1][1] = avg_pixels8_x2_mlib;
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mlib;
+    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mlib;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mlib;
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mlib;
+
+    c->add_pixels_clamped = add_pixels_clamped_mlib;
+    put_pixels_clamped = c->put_pixels_clamped;
 }
 
 void MPV_common_init_mlib(MpegEncContext *s)
 {
-  if(s->avctx->dct_algo==FF_DCT_AUTO || s->avctx->dct_algo==FF_DCT_MLIB) {
-    s->fdct = ff_fdct_mlib;
-  }
-
-  if (s->avctx->idct_algo==FF_IDCT_AUTO || s->avctx->idct_algo==FF_IDCT_MLIB) {
-    s->idct_put = ff_idct_put_mlib;
-    s->idct_add = ff_idct_add_mlib; 
-    s->idct_permutation_type = FF_NO_IDCT_PERM;
-  }
-}
+    /* Install the mlib DCT/IDCT hooks when the codec context selects the AUTO or MLIB algorithm. */
+
+    if(s->avctx->dct_algo==FF_DCT_AUTO || s->avctx->dct_algo==FF_DCT_MLIB){
+	s->fdct = ff_fdct_mlib;
+    }
 
+    if(s->avctx->idct_algo==FF_IDCT_AUTO || s->avctx->idct_algo==FF_IDCT_MLIB){
+        s->idct_put= ff_idct_put_mlib;
+        s->idct_add= ff_idct_add_mlib;
+        s->idct_permutation_type= FF_NO_IDCT_PERM; /* the mlib IDCT takes coefficients without a permutation */
+    }
+}
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 547f4cdc5..8310db8d5 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -26,21 +26,325 @@
 #include "dsputil.h"
 #include "mpegvideo.h"
 
+//#undef NDEBUG
+//#include <assert.h>
+
 #define SQ(a) ((a)*(a))
-#define INTER_BIAS	257
 
-#define P_LAST P[0]
 #define P_LEFT P[1]
 #define P_TOP P[2]
 #define P_TOPRIGHT P[3]
 #define P_MEDIAN P[4]
-#define P_LAST_LEFT P[5]
-#define P_LAST_RIGHT P[6]
-#define P_LAST_TOP P[7]
-#define P_LAST_BOTTOM P[8]
 #define P_MV1 P[9]
 
+static inline int sad_hpel_motion_search(MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, Picture *picture,
+                                  int n, int size, uint16_t * const mv_penalty);
+
+static inline int update_map_generation(MpegEncContext * s)
+{   /* Advance the map generation stamp by one step; when it wraps to 0, clear the map and restart at one step. */
+    const int step= 1<<(ME_MAP_MV_BITS*2);
+    if(!(s->me.map_generation+= step)){
+        memset(s->me.map, 0, sizeof(uint32_t)*ME_MAP_SIZE);
+        s->me.map_generation= step;
+    }
+    return s->me.map_generation;
+}
+
+/* shape adaptive search stuff */
+typedef struct Minima{ /* one entry of the shape adaptive search */
+    int height;        /* sort key compared by minima_cmp(); presumably the candidate's score - confirm */
+    int x, y;          /* presumably the candidate's position - confirm against motion_est_template.c */
+    int checked;       /* presumably a visited flag - confirm against motion_est_template.c */
+}Minima;
+
+static int minima_cmp(const void *a, const void *b){
+    const Minima *da = a; /* read-only views; const void * converts without a cast */
+    const Minima *db = b;
+    /* Orders Minima by ascending height: negative, zero or positive as a sorts before, with or after b. */
+    return da->height - db->height;
+}
+                                  
+/* SIMPLE: compares luma only (ref_y against src_y); the template functions get the simple_ prefix */
+#define RENAME(a) simple_ ## a
+
+#define CMP(d, x, y, size)\
+d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride);
+
+#define CMP_HPEL(d, dx, dy, x, y, size)\
+{\
+    const int dxy= (dx) + 2*(dy);\
+    hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, (16>>size));\
+    d = cmp_sub(s, s->me.scratchpad, src_y, stride);\
+}
+
+#define CMP_QPEL(d, dx, dy, x, y, size)\
+{\
+    const int dxy= (dx) + 4*(dy);\
+    qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\
+    d = cmp_sub(s, s->me.scratchpad, src_y, stride);\
+}
+
+#include "motion_est_template.c"
+#undef RENAME
+#undef CMP
+#undef CMP_HPEL
+#undef CMP_QPEL
+#undef INIT
+
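+/* SIMPLE CHROMA: luma comparison plus, when a chroma compare function is set, the U and V planes; simple_chroma_ prefix */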
+#define RENAME(a) simple_chroma_ ## a
+
+#define CMP(d, x, y, size)\
+d = cmp(s, src_y, (ref_y) + (x) + (y)*(stride), stride);\
+if(chroma_cmp){\
+    int dxy= ((x)&1) + 2*((y)&1);\
+    int c= ((x)>>1) + ((y)>>1)*uvstride;\
+\
+    chroma_hpel_put[0][dxy](s->me.scratchpad, ref_u + c, uvstride, 8);\
+    d += chroma_cmp(s, s->me.scratchpad, src_u, uvstride);\
+    chroma_hpel_put[0][dxy](s->me.scratchpad, ref_v + c, uvstride, 8);\
+    d += chroma_cmp(s, s->me.scratchpad, src_v, uvstride);\
+}
+
+#define CMP_HPEL(d, dx, dy, x, y, size)\
+{\
+    const int dxy= (dx) + 2*(dy);\
+    hpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride, (16>>size));\
+    d = cmp_sub(s, s->me.scratchpad, src_y, stride);\
+    if(chroma_cmp_sub){\
+        int cxy= (dxy) | ((x)&1) | (2*((y)&1));\
+        int c= ((x)>>1) + ((y)>>1)*uvstride;\
+        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, 8);\
+        d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride);\
+        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, 8);\
+        d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride);\
+    }\
+}
+
+#define CMP_QPEL(d, dx, dy, x, y, size)\
+{\
+    const int dxy= (dx) + 4*(dy);\
+    qpel_put[0][dxy](s->me.scratchpad, (ref_y) + (x) + (y)*(stride), stride);\
+    d = cmp_sub(s, s->me.scratchpad, src_y, stride);\
+    if(chroma_cmp_sub){\
+        int cxy, c;\
+        int cx= (4*(x) + (dx))/2;\
+        int cy= (4*(y) + (dy))/2;\
+        cx= (cx>>1)|(cx&1);\
+        cy= (cy>>1)|(cy&1);\
+        cxy= (cx&1) + 2*(cy&1);\
+        c= ((cx)>>1) + ((cy)>>1)*uvstride;\
+        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_u + c, uvstride, 8);\
+        d += chroma_cmp_sub(s, s->me.scratchpad, src_u, uvstride);\
+        chroma_hpel_put[0][cxy](s->me.scratchpad, ref_v + c, uvstride, 8);\
+        d += chroma_cmp_sub(s, s->me.scratchpad, src_v, uvstride);\
+    }\
+}
+
+#include "motion_est_template.c"
+#undef RENAME
+#undef CMP
+#undef CMP_HPEL
+#undef CMP_QPEL
+#undef INIT
+
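+/* SIMPLE DIRECT HPEL: compares against a half-pel block put from ref_y and averaged with ref2_y, positioned via direct_basis_mv and co_located_mv; simple_direct_hpel_ prefix */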
+#define RENAME(a) simple_direct_hpel_ ## a
+//FIXME precalc divisions stuff
+
+#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\
+if((x) >= xmin && 2*(x) + (dx) <= 2*xmax && (y) >= ymin && 2*(y) + (dy) <= 2*ymax){\
+    const int hx= 2*(x) + (dx);\
+    const int hy= 2*(y) + (dy);\
+    if(s->mv_type==MV_TYPE_8X8){\
+        int i;\
+        for(i=0; i<4; i++){\
+            int fx = s->me.direct_basis_mv[i][0] + hx;\
+            int fy = s->me.direct_basis_mv[i][1] + hy;\
+            int bx = hx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\
+            int by = hy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\
+            int fxy= (fx&1) + 2*(fy&1);\
+            int bxy= (bx&1) + 2*(by&1);\
+\
+            uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\
+            hpel_put[1][fxy](dst, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 8);\
+            hpel_avg[1][bxy](dst, (ref2_y) + (bx>>1) + (by>>1)*(stride), stride, 8);\
+        }\
+    }else{\
+        int fx = s->me.direct_basis_mv[0][0] + hx;\
+        int fy = s->me.direct_basis_mv[0][1] + hy;\
+        int bx = hx ? fx - s->me.co_located_mv[0][0] : s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp;\
+        int by = hy ? fy - s->me.co_located_mv[0][1] : s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp;\
+        int fxy= (fx&1) + 2*(fy&1);\
+        int bxy= (bx&1) + 2*(by&1);\
+\
+        hpel_put[0][fxy](s->me.scratchpad, (ref_y ) + (fx>>1) + (fy>>1)*(stride), stride, 16);\
+        hpel_avg[0][bxy](s->me.scratchpad, (ref2_y) + (bx>>1) + (by>>1)*(stride), stride, 16);\
+    }\
+    d = cmp_func(s, s->me.scratchpad, src_y, stride);\
+}else\
+    d= 256*256*256*32;
+
+
+#define CMP_HPEL(d, dx, dy, x, y, size)\
+    CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub)
+
+#define CMP(d, x, y, size)\
+    CMP_DIRECT(d, 0, 0, x, y, size, cmp)
+    
+#include "motion_est_template.c"
+#undef RENAME
+#undef CMP
+#undef CMP_HPEL
+#undef CMP_QPEL
+#undef INIT
+#undef CMP_DIRECT
+
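+/* SIMPLE DIRECT QPEL: quarter-pel counterpart of the direct comparison (qpel_put from ref_y, qpel_avg with ref2_y); simple_direct_qpel_ prefix */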
+#define RENAME(a) simple_direct_qpel_ ## a
+
+#define CMP_DIRECT(d, dx, dy, x, y, size, cmp_func)\
+if((x) >= xmin && 4*(x) + (dx) <= 4*xmax && (y) >= ymin && 4*(y) + (dy) <= 4*ymax){\
+    const int qx= 4*(x) + (dx);\
+    const int qy= 4*(y) + (dy);\
+    if(s->mv_type==MV_TYPE_8X8){\
+        int i;\
+        for(i=0; i<4; i++){\
+            int fx = s->me.direct_basis_mv[i][0] + qx;\
+            int fy = s->me.direct_basis_mv[i][1] + qy;\
+            int bx = qx ? fx - s->me.co_located_mv[i][0] : s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + (i &1)*16;\
+            int by = qy ? fy - s->me.co_located_mv[i][1] : s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + (i>>1)*16;\
+            int fxy= (fx&3) + 4*(fy&3);\
+            int bxy= (bx&3) + 4*(by&3);\
+\
+            uint8_t *dst= s->me.scratchpad + 8*(i&1) + 8*stride*(i>>1);\
+            qpel_put[1][fxy](dst, (ref_y ) + (fx>>2) + (fy>>2)*(stride), stride);\
+            qpel_avg[1][bxy](dst, (ref2_y) + (bx>>2) + (by>>2)*(stride), stride);\
+        }\
+    }else{\
+        int fx = s->me.direct_basis_mv[0][0] + qx;\
+        int fy = s->me.direct_basis_mv[0][1] + qy;\
+        int bx = qx ? fx - s->me.co_located_mv[0][0] : s->me.co_located_mv[0][0]*(time_pb - time_pp)/time_pp;\
+        int by = qy ? fy - s->me.co_located_mv[0][1] : s->me.co_located_mv[0][1]*(time_pb - time_pp)/time_pp;\
+        int fxy= (fx&3) + 4*(fy&3);\
+        int bxy= (bx&3) + 4*(by&3);\
+\
+        qpel_put[0][fxy](s->me.scratchpad, (ref_y ) + (fx>>2) + (fy>>2)*(stride), stride);\
+        qpel_avg[0][bxy](s->me.scratchpad, (ref2_y) + (bx>>2) + (by>>2)*(stride), stride);\
+    }\
+    d = cmp_func(s, s->me.scratchpad, src_y, stride);\
+}else\
+    d= 256*256*256*32;
+
+
+#define CMP_QPEL(d, dx, dy, x, y, size)\
+    CMP_DIRECT(d, dx, dy, x, y, size, cmp_sub)
+
+#define CMP(d, x, y, size)\
+    CMP_DIRECT(d, 0, 0, x, y, size, cmp)
+
+#include "motion_est_template.c"
+#undef RENAME /* retire every per-instantiation macro so none of them leaks into the code below */
+#undef CMP
+#undef CMP_HPEL
+#undef CMP_QPEL
+#undef INIT
+#undef CMP_DIRECT
+
+
+static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride){ /* stub installed by set_cmp() for FF_CMP_ZERO */
+    return ((void)s, (void)a, (void)b, (void)stride, 0); /* inputs are ignored; the score is always 0 */
+}
+
+static void set_cmp(MpegEncContext *s, me_cmp_func *cmp, int type){
+    DSPContext* c= &s->dsp;
+    int i;
+    /* Clear 11 slots of cmp[], then install the functions for the metric in the low 8 bits of type. */
+    memset(cmp, 0, sizeof(*cmp)*11); /* sized by the element type: the entries are function pointers, not void* */
 
+    switch(type&0xFF){ /* bits above 0xFF (e.g. FF_CMP_CHROMA, tested in ff_init_me) are ignored here */
+    case FF_CMP_SAD:
+        cmp[0]= c->sad[0];
+        cmp[1]= c->sad[1];
+        break;
+    case FF_CMP_SATD:
+        cmp[0]= c->hadamard8_diff[0];
+        cmp[1]= c->hadamard8_diff[1];
+        break;
+    case FF_CMP_SSE:
+        cmp[0]= c->sse[0];
+        cmp[1]= c->sse[1];
+        break;
+    case FF_CMP_DCT:
+        cmp[0]= c->dct_sad[0];
+        cmp[1]= c->dct_sad[1];
+        break;
+    case FF_CMP_PSNR:
+        cmp[0]= c->quant_psnr[0];
+        cmp[1]= c->quant_psnr[1];
+        break;
+    case FF_CMP_ZERO:
+        for(i=0; i<7; i++){ /* NOTE(review): fills 7 of the 11 cleared slots - confirm this is intended */
+            cmp[i]= zero_cmp;
+        }
+        break;
+    default:
+        fprintf(stderr,"internal error in cmp function selection\n"); /* unknown metric: the table stays zeroed */
+    }
+}
+
+static inline int get_penalty_factor(MpegEncContext *s, int type){
+    /* Factor the callers store in s->me.penalty_factor / sub_penalty_factor; flag bits above 0xFF are ignored. */
+    switch(type&0xFF){ /* masked like set_cmp() so that FF_CMP_CHROMA does not push SSE/DCT/SATD into default */
+    default:
+    case FF_CMP_SAD:
+        return s->qscale;
+    case FF_CMP_SSE:
+        /* fallthrough: SSE shares the factor of DCT and SATD */
+    case FF_CMP_DCT:
+    case FF_CMP_SATD:
+        return s->qscale*8;
+    }
+}
+
+void ff_init_me(MpegEncContext *s){
+    /* Chroma flag of each compare selector; it decides between the simple_ and the
+       simple_chroma_ flavour of the search routines installed below. */
+    const int pre_chroma = (s->avctx->me_pre_cmp&FF_CMP_CHROMA) != 0;
+    const int full_chroma= (s->avctx->me_cmp    &FF_CMP_CHROMA) != 0;
+    const int sub_chroma = (s->avctx->me_sub_cmp&FF_CMP_CHROMA) != 0;
+
+    /* translate the four selectors into compare function tables */
+    set_cmp(s, s->dsp.me_pre_cmp, s->avctx->me_pre_cmp);
+    set_cmp(s, s->dsp.me_cmp, s->avctx->me_cmp);
+    set_cmp(s, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
+    set_cmp(s, s->dsp.mb_cmp, s->avctx->mb_cmp);
+
+    /* sub-pel refinement; the SAD-only routine needs qpel off and both selectors exactly FF_CMP_SAD */
+    if(!(s->flags&CODEC_FLAG_QPEL)){
+        if(sub_chroma)
+            s->me.sub_motion_search= simple_chroma_hpel_motion_search;
+        else if(s->avctx->me_sub_cmp == FF_CMP_SAD && s->avctx->me_cmp == FF_CMP_SAD)
+            s->me.sub_motion_search= sad_hpel_motion_search;
+        else
+            s->me.sub_motion_search= simple_hpel_motion_search;
+    }else if(sub_chroma){
+        s->me.sub_motion_search= simple_chroma_qpel_motion_search;
+    }else{
+        s->me.sub_motion_search= simple_qpel_motion_search;
+    }
+
+    /* motion_search[0] is invoked by ff_estimate_p_frame_motion, motion_search[1] by mv4_search */
+    s->me.motion_search[0]= full_chroma ? simple_chroma_epzs_motion_search  : simple_epzs_motion_search;
+    s->me.motion_search[1]= full_chroma ? simple_chroma_epzs_motion_search4 : simple_epzs_motion_search4;
+
+    /* pre_motion_search picks from the same pair of routines as motion_search[0] */
+    s->me.pre_motion_search= pre_chroma ? simple_chroma_epzs_motion_search : simple_epzs_motion_search;
+}
+      
 static int pix_dev(UINT8 * pix, int line_size, int mean)
 {
     int s, i, j;
@@ -294,495 +598,39 @@ static int phods_motion_search(MpegEncContext * s,
 
 #define Z_THRESHOLD 256
 
-#define CHECK_MV(x,y)\
-{\
-    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
-    if(map[index]!=key){\
-        d = s->dsp.pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
-        map[index]= key;\
-        score_map[index]= d;\
-    }\
-}
-
-#define CHECK_MV_DIR(x,y,new_dir)\
-{\
-    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
-    if(map[index]!=key){\
-        d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-        if(d<dmin){\
-            best[0]=x;\
-            best[1]=y;\
-            dmin=d;\
-            next_dir= new_dir;\
-        }\
-        map[index]= key;\
-        score_map[index]= d;\
-    }\
-}
-
-#define CHECK_MV4(x,y)\
-{\
-    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
-    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
-    if(map[index]!=key){\
-        d = s->dsp.pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
-        map[index]= key;\
-        score_map[index]= d;\
-    }\
-}
-
-#define check(x,y,S,v)\
-if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
-if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
-if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
-if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
-
-
-static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
-                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
-                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift,
-                                       uint32_t *map, uint16_t *score_map, int map_generation,
-                                       op_pixels_abs_func pix_abs)
-{
-    int next_dir=-1;
-
-    for(;;){
-        int d;
-        const int dir= next_dir;
-        const int x= best[0];
-        const int y= best[1];
-        next_dir=-1;
-
-//printf("%d", dir);
-        if(dir!=2 && x>xmin) CHECK_MV_DIR(x-1, y  , 0)
-        if(dir!=3 && y>ymin) CHECK_MV_DIR(x  , y-1, 1)
-        if(dir!=0 && x<xmax) CHECK_MV_DIR(x+1, y  , 2)
-        if(dir!=1 && y<ymax) CHECK_MV_DIR(x  , y+1, 3)
-
-        if(next_dir==-1){
-            return dmin;
-        }
-    }
-
-/*    for(;;){
-        int d;
-        const int x= best[0];
-        const int y= best[1];
-        const int last_min=dmin;
-        if(x>xmin) CHECK_MV(x-1, y  )
-        if(y>xmin) CHECK_MV(x  , y-1)
-        if(x<xmax) CHECK_MV(x+1, y  )
-        if(y<xmax) CHECK_MV(x  , y+1)
-        if(x>xmin && y>ymin) CHECK_MV(x-1, y-1)
-        if(x>xmin && y<ymax) CHECK_MV(x-1, y+1)
-        if(x<xmax && y>ymin) CHECK_MV(x+1, y-1)
-        if(x<xmax && y<ymax) CHECK_MV(x+1, y+1)
-        if(x-1>xmin) CHECK_MV(x-2, y  )
-        if(y-1>xmin) CHECK_MV(x  , y-2)
-        if(x+1<xmax) CHECK_MV(x+2, y  )
-        if(y+1<xmax) CHECK_MV(x  , y+2)
-        if(x-1>xmin && y-1>ymin) CHECK_MV(x-2, y-2)
-        if(x-1>xmin && y+1<ymax) CHECK_MV(x-2, y+2)
-        if(x+1<xmax && y-1>ymin) CHECK_MV(x+2, y-2)
-        if(x+1<xmax && y+1<ymax) CHECK_MV(x+2, y+2)
-        if(dmin==last_min) return dmin;
-    }
-    */
-}
-
-#if 1
-#define SNAKE_1 3
-#define SNAKE_2 2
-#else
-#define SNAKE_1 7
-#define SNAKE_2 3
-#endif
-static inline int snake_search(MpegEncContext * s, int *best, int dmin,
-                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
-                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift,
-                                       uint32_t *map, uint16_t *score_map,int map_generation,
-                                       op_pixels_abs_func pix_abs)
-{
-    int dir=0;
-    int c=1;
-    static int x_dir[8]= {1,1,0,-1,-1,-1, 0, 1};
-    static int y_dir[8]= {0,1,1, 1, 0,-1,-1,-1};
-    int fails=0;
-    int last_d[2];
-
-/*static int good=0;
-static int bad=0;
-static int point=0;
-
-point++;
-if(256*256*256*64%point==0)
-{
-    printf("%d %d %d\n", good, bad, point);
-}*/
-
-    last_d[0] = dmin;
-    last_d[1] = dmin;
-
-    for(;;){
-        int x= best[0];
-        int y= best[1];
-        int d;
-        x+=x_dir[dir];
-        y+=y_dir[dir];
-        if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
-            const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;
-            const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);
-            if(map[index]!=key){
-                d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
-                d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
-                map[index]=key;
-                score_map[index]=d;
-            }else
-                d= dmin+1;
-        }else{
-            d = dmin + 10000; //FIXME smarter boundary handling
-        }
-        if(d<dmin){
-            best[0]=x;
-            best[1]=y;
-            dmin=d;
-
-            if(last_d[1] - last_d[0] > last_d[0] - d) c= -c;
-            dir+=c;
-
-            fails=0;
-//good++;
-            last_d[1]=last_d[0];
-            last_d[0]=d;
-        }else{
-//bad++;
-            if(fails){
-                if(fails>=SNAKE_1+1) return dmin;
-            }else{
-                if(dir&1) dir-= c*3;
-                else      c= -c;
-//                c= -c;
-            }
-            dir+=c*SNAKE_2;
-            fails++;
-        }
-        dir&=7;
-    }
-}
-
-static inline int cross_search(MpegEncContext * s, int *best, int dmin,
-                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
-                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift,
-                                       uint32_t *map, uint16_t *score_map,int map_generation,
-                                       op_pixels_abs_func pix_abs)
-{
-    static int x_dir[4]= {-1, 0, 1, 0};
-    static int y_dir[4]= { 0,-1, 0, 1};
-    int improvement[2]={100000, 100000};
-    int dirs[2]={2, 3};
-    int dir;
-    int last_dir= -1;
-    
-    for(;;){
-        dir= dirs[ improvement[0] > improvement[1] ? 0 : 1 ];
-        if(improvement[dir&1]==-1) return dmin;
-        
-        {
-            const int x= best[0] + x_dir[dir];
-            const int y= best[1] + y_dir[dir];
-            const int key= (y<<ME_MAP_MV_BITS) + x + map_generation;
-            const int index= ((y<<ME_MAP_SHIFT) + x)&(ME_MAP_SIZE-1);
-            int d;
-            if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
-                if(map[index]!=key){
-                    d = pix_abs(new_pic, old_pic + x + y*pic_stride, pic_stride);
-                    d += (mv_penalty[(x<<shift)-pred_x] + mv_penalty[(y<<shift)-pred_y])*quant;
-                    map[index]=key;
-                    score_map[index]=d;
-                    if(d<dmin){
-                        improvement[dir&1]= dmin-d;
-                        improvement[(dir&1)^1]++;
-                        dmin=d;
-                        best[0]= x;
-                        best[1]= y;
-                        last_dir=dir;
-                        continue;
-                    }
-                }else{
-                    d= score_map[index];
-                }
-            }else{
-                d= dmin + 1000; //FIXME is this a good idea?
-            }
-            /* evaluated point was cached or checked and worse */
-
-            if(last_dir==dir){
-                improvement[dir&1]= -1;
-            }else{
-                improvement[dir&1]= d-dmin;
-                last_dir= dirs[dir&1]= dir^2;
-            }
-        }
-    }
-}
-
-static inline int update_map_generation(MpegEncContext * s)
-{
-    s->me_map_generation+= 1<<(ME_MAP_MV_BITS*2);
-    if(s->me_map_generation==0){
-        s->me_map_generation= 1<<(ME_MAP_MV_BITS*2);
-        memset(s->me_map, 0, sizeof(uint32_t)*ME_MAP_SIZE);
-    }
-    return s->me_map_generation;
-}
-
-static int epzs_motion_search(MpegEncContext * s,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y,
-                             int xmin, int ymin, int xmax, int ymax, uint8_t * ref_picture)
-{
-    int best[2]={0, 0};
-    int d, dmin; 
-    UINT8 *new_pic, *old_pic;
-    const int pic_stride= s->linesize;
-    const int pic_xy= (s->mb_y*pic_stride + s->mb_x)*16;
-    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    int quant= s->qscale; // qscale of the prev frame
-    const int shift= 1+s->quarter_sample;
-    uint32_t *map= s->me_map;
-    uint16_t *score_map= s->me_score_map;
-    int map_generation;
-
-    new_pic = s->new_picture.data[0] + pic_xy;
-    old_pic = ref_picture + pic_xy;
-    
-    map_generation= update_map_generation(s);
-
-    dmin = s->dsp.pix_abs16x16(new_pic, old_pic, pic_stride);
-    map[0]= map_generation;
-    score_map[0]= dmin;
-
-    /* first line */
-    if ((s->mb_y == 0 || s->first_slice_line)) {
-        CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
-        CHECK_MV(P_LAST[0]>>shift, P_LAST[1]>>shift)
-    }else{
-        if(dmin<256 && ( P_LEFT[0]    |P_LEFT[1]
-                        |P_TOP[0]     |P_TOP[1]
-                        |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){
-            *mx_ptr= 0;
-            *my_ptr= 0;
-            s->skip_me=1;
-            return dmin;
-        }
-        CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
-        if(dmin>256*2){
-            CHECK_MV(P_LAST[0]    >>shift, P_LAST[1]    >>shift)
-            CHECK_MV(P_LEFT[0]    >>shift, P_LEFT[1]    >>shift)
-            CHECK_MV(P_TOP[0]     >>shift, P_TOP[1]     >>shift)
-            CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
-        }
-    }
-    if(dmin>256*4){
-        CHECK_MV(P_LAST_RIGHT[0] >>shift, P_LAST_RIGHT[1] >>shift)
-        CHECK_MV(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift)
-    }
-#if 0 //doest only slow things down
-    if(dmin>512*3){
-        int step;
-        dmin= score_map[0];
-        best[0]= best[1]=0;
-        for(step=128; step>0; step>>=1){
-            const int step2= step;
-            int y;
-            for(y=-step2+best[1]; y<=step2+best[1]; y+=step){
-                int x;
-                if(y<ymin || y>ymax) continue;
-
-                for(x=-step2+best[0]; x<=step2+best[0]; x+=step){
-                    if(x<xmin || x>xmax) continue;
-                    if(x==best[0] && y==best[1]) continue;
-                    CHECK_MV(x,y)
-                }
-            }
-        }
-    }
-#endif
-//check(best[0],best[1],0, b0)
-    if(s->me_method==ME_EPZS)
-        dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
-				   shift, map, score_map, map_generation, s->dsp.pix_abs16x16);
-    else
-        dmin=         cross_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
-                                   shift, map, score_map, map_generation, s->dsp.pix_abs16x16);
-//check(best[0],best[1],0, b1)
-    *mx_ptr= best[0];
-    *my_ptr= best[1];    
-
-//    printf("%d %d %d \n", best[0], best[1], dmin);
-    return dmin;
-}
-
-static int epzs_motion_search4(MpegEncContext * s, int block,
-                             int *mx_ptr, int *my_ptr,
-                             int P[10][2], int pred_x, int pred_y,
-                             int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
-{
-    int best[2]={0, 0};
-    int d, dmin; 
-    UINT8 *new_pic, *old_pic;
-    const int pic_stride= s->linesize;
-    const int pic_xy= ((s->mb_y*2 + (block>>1))*pic_stride + s->mb_x*2 + (block&1))*8;
-    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    int quant= s->qscale; // qscale of the prev frame
-    const int shift= 1+s->quarter_sample;
-    uint32_t *map= s->me_map;
-    uint16_t *score_map= s->me_score_map;
-    int map_generation;
-
-    new_pic = s->new_picture.data[0] + pic_xy;
-    old_pic = ref_picture + pic_xy;
-
-    map_generation= update_map_generation(s);
-
-    dmin = 1000000;
-//printf("%d %d %d %d //",xmin, ymin, xmax, ymax); 
-    /* first line */
-    if ((s->mb_y == 0 || s->first_slice_line) && block<2) {
-	CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
-        CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift)
-        CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift)
-    }else{
-        CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift)
-        //FIXME try some early stop
-        if(dmin>64*2){
-            CHECK_MV4(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
-            CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
-            CHECK_MV4(P_TOP[0]>>shift, P_TOP[1]>>shift)
-            CHECK_MV4(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
-            CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift)
-        }
-    }
-    if(dmin>64*4){
-        CHECK_MV4(P_LAST_RIGHT[0]>>shift, P_LAST_RIGHT[1]>>shift)
-        CHECK_MV4(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift)
-    }
-
-    if(s->me_method==ME_EPZS)
-        dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
-				   shift, map, score_map, map_generation, s->dsp.pix_abs8x8);
-    else
-        dmin=         cross_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
-                                   shift, map, score_map, map_generation, s->dsp.pix_abs8x8);
-
-    *mx_ptr= best[0];
-    *my_ptr= best[1];    
-
-//    printf("%d %d %d \n", best[0], best[1], dmin);
-    return dmin;
-}
-
-#define CHECK_HALF_MV(suffix, x, y) \
+#define CHECK_SAD_HALF_MV(suffix, x, y) \
 {\
     d= pix_abs_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
-    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
+    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*penalty_factor;\
     COPY3_IF_LT(dminh, d, dx, x, dy, y)\
 }
 
-    
-/* The idea would be to make half pel ME after Inter/Intra decision to 
-   save time. */
-static inline int halfpel_motion_search(MpegEncContext * s,
+static inline int sad_hpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y, uint8_t *ref_picture,
-                                  op_pixels_abs_func pix_abs_x2, 
-                                  op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n)
+                                  int pred_x, int pred_y, Picture *picture,
+                                  int n, int size, uint16_t * const mv_penalty)
 {
-    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    const int quant= s->qscale;
+    uint8_t *ref_picture= picture->data[0];
+    uint32_t *score_map= s->me.score_map;
+    const int penalty_factor= s->me.sub_penalty_factor;
     int mx, my, xx, yy, dminh;
     UINT8 *pix, *ptr;
-
-    if(s->skip_me){
-        *mx_ptr = 0;
-        *my_ptr = 0;
-        return dmin;
-    }
-
-    xx = 16 * s->mb_x + 8*(n&1);
-    yy = 16 * s->mb_y + 8*(n>>1);
-    pix =  s->new_picture.data[0] + (yy * s->linesize) + xx;
-
-    mx = *mx_ptr;
-    my = *my_ptr;
-    ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx);
+    op_pixels_abs_func pix_abs_x2;
+    op_pixels_abs_func pix_abs_y2;
+    op_pixels_abs_func pix_abs_xy2;
     
-    dminh = dmin;
-
-    if (mx > xmin && mx < xmax && 
-        my > ymin && my < ymax) {
-        int dx=0, dy=0;
-        int d, pen_x, pen_y; 
-
-        mx<<=1;
-        my<<=1;
-        
-        pen_x= pred_x + mx;
-        pen_y= pred_y + my;
-
-        ptr-= s->linesize;
-        CHECK_HALF_MV(xy2, -1, -1)
-        CHECK_HALF_MV(y2 ,  0, -1)
-        CHECK_HALF_MV(xy2, +1, -1)
-        
-        ptr+= s->linesize;
-        CHECK_HALF_MV(x2 , -1,  0)
-        CHECK_HALF_MV(x2 , +1,  0)
-        CHECK_HALF_MV(xy2, -1, +1)
-        CHECK_HALF_MV(y2 ,  0, +1)
-        CHECK_HALF_MV(xy2, +1, +1)
-
-        mx+=dx;
-        my+=dy;
+    if(size==0){
+        pix_abs_x2 = s->dsp.pix_abs16x16_x2;
+        pix_abs_y2 = s->dsp.pix_abs16x16_y2;
+        pix_abs_xy2= s->dsp.pix_abs16x16_xy2;
     }else{
-        mx<<=1;
-        my<<=1;
+        pix_abs_x2 = s->dsp.pix_abs8x8_x2;
+        pix_abs_y2 = s->dsp.pix_abs8x8_y2;
+        pix_abs_xy2= s->dsp.pix_abs8x8_xy2;
     }
 
-    *mx_ptr = mx;
-    *my_ptr = my;
-    return dminh;
-}
-
-static inline int fast_halfpel_motion_search(MpegEncContext * s,
-				  int *mx_ptr, int *my_ptr, int dmin,
-				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y, uint8_t *ref_picture,
-                                  op_pixels_abs_func pix_abs_x2, 
-                                  op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n)
-{
-    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    uint16_t *score_map= s->me_score_map;
-    const int quant= s->qscale;
-    int mx, my, xx, yy, dminh;
-    UINT8 *pix, *ptr;
-
-    if(s->skip_me){
+    if(s->me.skip){
 //    printf("S");
         *mx_ptr = 0;
         *my_ptr = 0;
@@ -818,51 +666,51 @@ static inline int fast_halfpel_motion_search(MpegEncContext * s,
 
         ptr-= s->linesize;
         if(t<=b){
-            CHECK_HALF_MV(y2 ,  0, -1)
+            CHECK_SAD_HALF_MV(y2 , 0, -1)
             if(l<=r){
-                CHECK_HALF_MV(xy2, -1, -1)
+                CHECK_SAD_HALF_MV(xy2, -1, -1)
                 if(t+r<=b+l){
-                    CHECK_HALF_MV(xy2, +1, -1)
+                    CHECK_SAD_HALF_MV(xy2, +1, -1)
                     ptr+= s->linesize;
                 }else{
                     ptr+= s->linesize;
-                    CHECK_HALF_MV(xy2, -1, +1)
+                    CHECK_SAD_HALF_MV(xy2, -1, +1)
                 }
-                CHECK_HALF_MV(x2 , -1,  0)
+                CHECK_SAD_HALF_MV(x2 , -1,  0)
             }else{
-                CHECK_HALF_MV(xy2, +1, -1)
+                CHECK_SAD_HALF_MV(xy2, +1, -1)
                 if(t+l<=b+r){
-                    CHECK_HALF_MV(xy2, -1, -1)
+                    CHECK_SAD_HALF_MV(xy2, -1, -1)
                     ptr+= s->linesize;
                 }else{
                     ptr+= s->linesize;
-                    CHECK_HALF_MV(xy2, +1, +1)
+                    CHECK_SAD_HALF_MV(xy2, +1, +1)
                 }
-                CHECK_HALF_MV(x2 , +1,  0)
+                CHECK_SAD_HALF_MV(x2 , +1,  0)
             }
         }else{
             if(l<=r){
                 if(t+l<=b+r){
-                    CHECK_HALF_MV(xy2, -1, -1)
+                    CHECK_SAD_HALF_MV(xy2, -1, -1)
                     ptr+= s->linesize;
                 }else{
                     ptr+= s->linesize;
-                    CHECK_HALF_MV(xy2, +1, +1)
+                    CHECK_SAD_HALF_MV(xy2, +1, +1)
                 }
-                CHECK_HALF_MV(x2 , -1,  0)
-                CHECK_HALF_MV(xy2, -1, +1)
+                CHECK_SAD_HALF_MV(x2 , -1,  0)
+                CHECK_SAD_HALF_MV(xy2, -1, +1)
             }else{
                 if(t+r<=b+l){
-                    CHECK_HALF_MV(xy2, +1, -1)
+                    CHECK_SAD_HALF_MV(xy2, +1, -1)
                     ptr+= s->linesize;
                 }else{
                     ptr+= s->linesize;
-                    CHECK_HALF_MV(xy2, -1, +1)
+                    CHECK_SAD_HALF_MV(xy2, -1, +1)
                 }
-                CHECK_HALF_MV(x2 , +1,  0)
-                CHECK_HALF_MV(xy2, +1, +1)
+                CHECK_SAD_HALF_MV(x2 , +1,  0)
+                CHECK_SAD_HALF_MV(xy2, +1, +1)
             }
-            CHECK_HALF_MV(y2 ,  0, +1)
+            CHECK_SAD_HALF_MV(y2 ,  0, +1)
         }
         mx+=dx;
         my+=dy;
@@ -913,12 +761,10 @@ static inline void get_limits(MpegEncContext *s, int *range, int *xmin, int *ymi
         *ymin = -16;
         if (s->h263_plus)
             *range *= 2;
-        if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4){
+        if(s->avctx->codec->id!=CODEC_ID_MPEG4){
             *xmax = s->mb_width*16;
             *ymax = s->mb_height*16;
         }else {
-            /* XXX: dunno if this is correct but ffmpeg4 decoder wont like it otherwise 
-	            (cuz the drawn edge isnt large enough))*/
             *xmax = s->width;
             *ymax = s->height;
         }
@@ -936,6 +782,7 @@ static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, in
     int P[10][2];
     uint8_t *ref_picture= s->last_picture.data[0];
     int dmin_sum=0;
+    uint16_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
 
     for(block=0; block<4; block++){
         int mx4, my4;
@@ -957,21 +804,13 @@ static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, in
         const int rel_ymin4= ymin - block_y*8;
         const int rel_ymax4= ymax - block_y*8 + 8;
 #endif
-        P_LAST[0] = s->motion_val[mot_xy    ][0];
-        P_LAST[1] = s->motion_val[mot_xy    ][1];
         P_LEFT[0] = s->motion_val[mot_xy - 1][0];
         P_LEFT[1] = s->motion_val[mot_xy - 1][1];
-        P_LAST_RIGHT[0] = s->motion_val[mot_xy + 1][0];
-        P_LAST_RIGHT[1] = s->motion_val[mot_xy + 1][1];
-        P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 1*mot_stride][0];
-        P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 1*mot_stride][1];
 
         if(P_LEFT[0]       > (rel_xmax4<<shift)) P_LEFT[0]       = (rel_xmax4<<shift);
-        if(P_LAST_RIGHT[0] < (rel_xmin4<<shift)) P_LAST_RIGHT[0] = (rel_xmin4<<shift);
-        if(P_LAST_BOTTOM[1]< (rel_ymin4<<shift)) P_LAST_BOTTOM[1]= (rel_ymin4<<shift);
 
         /* special case for first line */
-        if ((s->mb_y == 0 || s->first_slice_line) && block<2) {
+        if (s->mb_y == 0 && block<2) {
             pred_x4= P_LEFT[0];
             pred_y4= P_LEFT[1];
         } else {
@@ -998,11 +837,11 @@ static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, in
         P_MV1[0]= mx;
         P_MV1[1]= my;
 
-        dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, ref_picture);
+        dmin4 = s->me.motion_search[1](s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
+                                       &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);
 
-        dmin4= fast_halfpel_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
-					  pred_x4, pred_y4, ref_picture, s->dsp.pix_abs8x8_x2,
-					  s->dsp.pix_abs8x8_y2, s->dsp.pix_abs8x8_xy2, block);
+        dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
+					  pred_x4, pred_y4, &s->last_picture, block, 1, mv_penalty);
  
         s->motion_val[ s->block_index[block] ][0]= mx4;
         s->motion_val[ s->block_index[block] ][1]= my4;
@@ -1024,13 +863,19 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     int mb_type=0;
     uint8_t *ref_picture= s->last_picture.data[0];
     Picture * const pic= &s->current_picture;
+    uint16_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
+    
+    assert(s->quarter_sample==0 || s->quarter_sample==1);
+
+    s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
+    s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
 
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
     rel_xmin= xmin - mb_x*16;
     rel_xmax= xmax - mb_x*16;
     rel_ymin= ymin - mb_y*16;
     rel_ymax= ymax - mb_y*16;
-    s->skip_me=0;
+    s->me.skip=0;
 
     switch(s->me_method) {
     case ME_ZERO:
@@ -1061,24 +906,12 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             const int mot_stride = s->block_wrap[0];
             const int mot_xy = s->block_index[0];
 
-            P_LAST[0]       = s->motion_val[mot_xy    ][0];
-            P_LAST[1]       = s->motion_val[mot_xy    ][1];
             P_LEFT[0]       = s->motion_val[mot_xy - 1][0];
             P_LEFT[1]       = s->motion_val[mot_xy - 1][1];
-            P_LAST_RIGHT[0] = s->motion_val[mot_xy + 2][0];
-            P_LAST_RIGHT[1] = s->motion_val[mot_xy + 2][1];
-            P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 2*mot_stride][0];
-            P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 2*mot_stride][1];
 
             if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
-            if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
-            if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
 
-            /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line)) {
-                pred_x= P_LEFT[0];
-                pred_y= P_LEFT[1];
-            } else {
+            if(mb_y) {
                 P_TOP[0]      = s->motion_val[mot_xy - mot_stride    ][0];
                 P_TOP[1]      = s->motion_val[mot_xy - mot_stride    ][1];
                 P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + 2][0];
@@ -1097,9 +930,14 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
                     pred_x= P_LEFT[0];
                     pred_y= P_LEFT[1];
                 }
+            }else{
+                pred_x= P_LEFT[0];
+                pred_y= P_LEFT[1];
             }
+
         }
-        dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture);
+        dmin = s->me.motion_search[0](s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                      &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);
  
         break;
     }
@@ -1115,8 +953,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     sum = s->dsp.pix_sum(pix, s->linesize);
     
     varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)(sum*sum))>>8) + 500 + 128)>>8;
-    // FIXME: MMX OPTIMIZE
-    vard = (s->dsp.pix_norm(pix, ppix, s->linesize)+128)>>8;
+    vard = (s->dsp.sse[0](NULL, pix, ppix, s->linesize)+128)>>8;
 
 //printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
     pic->mb_var   [s->mb_width * mb_y + mb_x] = varc;
@@ -1134,26 +971,20 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
         if (vard <= 64 || vard < varc)
             s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
         else
-            s->scene_change_score+= 20;
+            s->scene_change_score+= s->qscale;
 
         if (vard*2 + 200 > varc)
             mb_type|= MB_TYPE_INTRA;
         if (varc*2 + 200 > vard){
             mb_type|= MB_TYPE_INTER;
-            if(s->me_method >= ME_EPZS)
-                fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
-					   pred_x, pred_y, ref_picture, s->dsp.pix_abs16x16_x2,
-					   s->dsp.pix_abs16x16_y2, s->dsp.pix_abs16x16_xy2, 0);
-            else
-                halfpel_motion_search(     s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
-				           pred_x, pred_y, ref_picture, s->dsp.pix_abs16x16_x2,
-				           s->dsp.pix_abs16x16_y2, s->dsp.pix_abs16x16_xy2, 0);
+            s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
+				   pred_x, pred_y, &s->last_picture, 0, 0, mv_penalty);
         }else{
-            mx <<=1;
-            my <<=1;
+            mx <<=shift;
+            my <<=shift;
         }
         if((s->flags&CODEC_FLAG_4MV)
-           && !s->skip_me && varc>50 && vard>10){
+           && !s->me.skip && varc>50 && vard>10){
             mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
             mb_type|=MB_TYPE_INTER4V;
 
@@ -1162,19 +993,14 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             set_p_mv_tables(s, mx, my, 1);
     }else{
         if (vard <= 64 || vard < varc) {
+//        if (sadP <= 32 || sadP < sadI + 500) {
             s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
             mb_type|= MB_TYPE_INTER;
             if (s->me_method != ME_ZERO) {
-                if(s->me_method >= ME_EPZS)
-		    dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
-                                           pred_x, pred_y, ref_picture, s->dsp.pix_abs16x16_x2, s->dsp.pix_abs16x16_y2,
-                                           s->dsp.pix_abs16x16_xy2, 0);
-                else
-                    dmin= halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
-                                           pred_x, pred_y, ref_picture, s->dsp.pix_abs16x16_x2, s->dsp.pix_abs16x16_y2,
-                                           s->dsp.pix_abs16x16_xy2, 0);
+                dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
+                                            pred_x, pred_y, &s->last_picture, 0, 0, mv_penalty);
                 if((s->flags&CODEC_FLAG_4MV)
-                   && !s->skip_me && varc>50 && vard>10){
+                   && !s->me.skip && varc>50 && vard>10){
                     int dmin4= mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
                     if(dmin4 + 128 <dmin)
                         mb_type= MB_TYPE_INTER4V;
@@ -1182,8 +1008,8 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
                 set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V);
 
             } else {
-                mx <<=1;
-                my <<=1;
+                mx <<=shift;
+                my <<=shift;
             }
 #if 0
             if (vard < 10) {
@@ -1203,8 +1029,67 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
 }
 
+int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
+                                    int mb_x, int mb_y) /* pre-pass search for macroblock (mb_x, mb_y): stores the vector in s->p_mv_table and returns the score from s->me.pre_motion_search */
+{
+    int mx, my, range, dmin;
+    int xmin, ymin, xmax, ymax;
+    int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
+    int pred_x=0, pred_y=0;
+    int P[10][2];
+    const int shift= 1+s->quarter_sample; /* table entries are stored as (search result)<<shift, see the stores at the end */
+    uint16_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV;
+    const int mv_stride= s->mb_width + 2;
+    const int xy= mb_x + 1 + (mb_y + 1)*mv_stride; /* table index, offset by one row and one column */
+    
+    assert(s->quarter_sample==0 || s->quarter_sample==1);
+
+    s->me.pre_penalty_factor    = get_penalty_factor(s, s->avctx->me_pre_cmp);
+
+    get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
+    rel_xmin= xmin - mb_x*16; /* limits relative to this macroblock's position */
+    rel_xmax= xmax - mb_x*16;
+    rel_ymin= ymin - mb_y*16;
+    rel_ymax= ymax - mb_y*16;
+    s->me.skip=0;
+
+    P_LEFT[0]       = s->p_mv_table[xy + 1][0]; /* note: taken from xy + 1, the next entry in the same row */
+    P_LEFT[1]       = s->p_mv_table[xy + 1][1];
+
+    if(P_LEFT[0]       < (rel_xmin<<shift)) P_LEFT[0]       = (rel_xmin<<shift);
+
+    /* special case for the last macroblock row: there is no row at xy + mv_stride to predict from */
+    if (mb_y == s->mb_height-1) {
+        pred_x= P_LEFT[0];
+        pred_y= P_LEFT[1];
+        P_TOP[0]= P_TOPRIGHT[0]= P_MEDIAN[0]=
+        P_TOP[1]= P_TOPRIGHT[1]= P_MEDIAN[1]= 0; //FIXME 
+    } else {
+        P_TOP[0]      = s->p_mv_table[xy + mv_stride    ][0]; /* predictors come from the row at xy + mv_stride */
+        P_TOP[1]      = s->p_mv_table[xy + mv_stride    ][1];
+        P_TOPRIGHT[0] = s->p_mv_table[xy + mv_stride - 1][0];
+        P_TOPRIGHT[1] = s->p_mv_table[xy + mv_stride - 1][1];
+        if(P_TOP[1]      < (rel_ymin<<shift)) P_TOP[1]     = (rel_ymin<<shift);
+        if(P_TOPRIGHT[0] > (rel_xmax<<shift)) P_TOPRIGHT[0]= (rel_xmax<<shift);
+        if(P_TOPRIGHT[1] < (rel_ymin<<shift)) P_TOPRIGHT[1]= (rel_ymin<<shift);
+    
+        P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+        P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+        pred_x = P_MEDIAN[0]; /* the median of the three neighbours is the predictor */
+        pred_y = P_MEDIAN[1];
+    }
+    dmin = s->me.pre_motion_search(s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                   &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);
+
+    s->p_mv_table[xy][0] = mx<<shift; /* store the found vector scaled by shift */
+    s->p_mv_table[xy][1] = my<<shift;
+    
+    return dmin;
+}
+
 int ff_estimate_motion_b(MpegEncContext * s,
-                       int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *ref_picture, int f_code)
+                       int mb_x, int mb_y, int16_t (*mv_table)[2], Picture *picture, int f_code)
 {
     int mx, my, range, dmin;
     int xmin, ymin, xmax, ymax;
@@ -1214,7 +1099,13 @@ int ff_estimate_motion_b(MpegEncContext * s,
     const int shift= 1+s->quarter_sample;
     const int mot_stride = s->mb_width + 2;
     const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
-    
+    uint8_t * const ref_picture= picture->data[0];
+    uint16_t * const mv_penalty= s->me.mv_penalty[f_code] + MAX_MV;
+    int mv_scale;
+        
+    s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
+    s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp);
+
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, f_code);
     rel_xmin= xmin - mb_x*16;
     rel_xmax= xmax - mb_x*16;
@@ -1247,23 +1138,13 @@ int ff_estimate_motion_b(MpegEncContext * s,
     case ME_X1:
     case ME_EPZS:
        {
-
-            P_LAST[0]        = mv_table[mot_xy    ][0];
-            P_LAST[1]        = mv_table[mot_xy    ][1];
             P_LEFT[0]        = mv_table[mot_xy - 1][0];
             P_LEFT[1]        = mv_table[mot_xy - 1][1];
-            P_LAST_RIGHT[0]  = mv_table[mot_xy + 1][0];
-            P_LAST_RIGHT[1]  = mv_table[mot_xy + 1][1];
-            P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0];
-            P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1];
 
             if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
-            if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
-            if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
 
             /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line)) {
-            } else {
+            if (mb_y) {
                 P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
                 P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
                 P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
@@ -1278,22 +1159,29 @@ int ff_estimate_motion_b(MpegEncContext * s,
             pred_x= P_LEFT[0];
             pred_y= P_LEFT[1];
         }
-        dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture);
+        
+        if(mv_table == s->b_forw_mv_table){
+            mv_scale= (s->pb_time<<16) / (s->pp_time<<shift);
+        }else{
+            mv_scale= ((s->pb_time - s->pp_time)<<16) / (s->pp_time<<shift);
+        }
+        
+        dmin = s->me.motion_search[0](s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                      picture, s->p_mv_table, mv_scale, mv_penalty);
  
         break;
     }
     
-    dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
-                                pred_x, pred_y, ref_picture, s->dsp.pix_abs16x16_x2, s->dsp.pix_abs16x16_y2,
-                                s->dsp.pix_abs16x16_xy2, 0);
+    dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax,
+				   pred_x, pred_y, picture, 0, 0, mv_penalty);
 //printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
 //    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
     mv_table[mot_xy][0]= mx;
     mv_table[mot_xy][1]= my;
+
     return dmin;
 }
 
-
 static inline int check_bidir_mv(MpegEncContext * s,
                    int mb_x, int mb_y,
                    int motion_fx, int motion_fy,
@@ -1302,45 +1190,57 @@ static inline int check_bidir_mv(MpegEncContext * s,
                    int pred_bx, int pred_by)
 {
     //FIXME optimize?
-    //FIXME direct mode penalty
-    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
-    uint8_t *dest_y = s->me_scratchpad;
+    //FIXME move into template?
+    //FIXME better f_code prediction (max mv & distance)
+    UINT16 *mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    uint8_t *dest_y = s->me.scratchpad;
     uint8_t *ptr;
     int dxy;
     int src_x, src_y;
     int fbmin;
 
-    fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->qscale;
-
-    dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
-    src_x = mb_x * 16 + (motion_fx >> 1);
-    src_y = mb_y * 16 + (motion_fy >> 1);
-    src_x = clip(src_x, -16, s->width);
-    if (src_x == s->width)
-        dxy&= 2;
-    src_y = clip(src_y, -16, s->height);
-    if (src_y == s->height)
-        dxy&= 1;
-
-    ptr = s->last_picture.data[0] + (src_y * s->linesize) + src_x;
-    s->dsp.put_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
-
-    fbmin += (mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->qscale;
-
-    dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
-    src_x = mb_x * 16 + (motion_bx >> 1);
-    src_y = mb_y * 16 + (motion_by >> 1);
-    src_x = clip(src_x, -16, s->width);
-    if (src_x == s->width)
-        dxy&= 2;
-    src_y = clip(src_y, -16, s->height);
-    if (src_y == s->height)
-        dxy&= 1;
-
-    ptr = s->next_picture.data[0] + (src_y * s->linesize) + src_x;
-    s->dsp.avg_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
-
-    fbmin += s->dsp.pix_abs16x16(s->new_picture.data[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize);
+    if(s->quarter_sample){
+        dxy = ((motion_fy & 3) << 2) | (motion_fx & 3);
+        src_x = mb_x * 16 + (motion_fx >> 2);
+        src_y = mb_y * 16 + (motion_fy >> 2);
+        assert(src_x >=-16 && src_x<=s->width);
+        assert(src_y >=-16 && src_y<=s->height);
+
+        ptr = s->last_picture.data[0] + (src_y * s->linesize) + src_x;
+        s->dsp.put_qpel_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize);
+
+        dxy = ((motion_by & 3) << 2) | (motion_bx & 3);
+        src_x = mb_x * 16 + (motion_bx >> 2);
+        src_y = mb_y * 16 + (motion_by >> 2);
+        assert(src_x >=-16 && src_x<=s->width);
+        assert(src_y >=-16 && src_y<=s->height);
+    
+        ptr = s->next_picture.data[0] + (src_y * s->linesize) + src_x;
+        s->dsp.avg_qpel_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize);
+    }else{
+        dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
+        src_x = mb_x * 16 + (motion_fx >> 1);
+        src_y = mb_y * 16 + (motion_fy >> 1);
+        assert(src_x >=-16 && src_x<=s->width);
+        assert(src_y >=-16 && src_y<=s->height);
+
+        ptr = s->last_picture.data[0] + (src_y * s->linesize) + src_x;
+        s->dsp.put_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
+
+        dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
+        src_x = mb_x * 16 + (motion_bx >> 1);
+        src_y = mb_y * 16 + (motion_by >> 1);
+        assert(src_x >=-16 && src_x<=s->width);
+        assert(src_y >=-16 && src_y<=s->height);
+    
+        ptr = s->next_picture.data[0] + (src_y * s->linesize) + src_x;
+        s->dsp.avg_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
+    }
+
+    fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->me.sub_penalty_factor
+           +(mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->me.sub_penalty_factor
+           + s->dsp.me_sub_cmp[0](s, s->new_picture.data[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize); // one expression: penalties of both vectors plus the comparison score of the source block against the prediction in dest_y
+
     return fbmin;
 }
 
@@ -1377,146 +1277,102 @@ static inline int direct_search(MpegEncContext * s,
     int P[10][2];
     const int mot_stride = s->mb_width + 2;
     const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
-    int dmin, dmin2;
-    int motion_fx, motion_fy, motion_bx, motion_by, motion_bx0, motion_by0;
-    int motion_dx, motion_dy;
-    const int motion_px= s->p_mv_table[mot_xy][0];
-    const int motion_py= s->p_mv_table[mot_xy][1];
+    const int shift= 1+s->quarter_sample;
+    int dmin, i;
     const int time_pp= s->pp_time;
     const int time_pb= s->pb_time;
-    const int time_bp= time_pp - time_pb;
-    int bx, by;
-    int mx, my, mx2, my2;
-    uint8_t *ref_picture= s->me_scratchpad - (mb_x - 1 + (mb_y - 1)*s->linesize)*16;
+    int mx, my, xmin, xmax, ymin, ymax;
     int16_t (*mv_table)[2]= s->b_direct_mv_table;
-/*    uint16_t *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; */ // f_code of the prev frame
-
-    /* thanks to iso-mpeg the rounding is different for the zero vector, so we need to handle that ... */
-    motion_fx= (motion_px*time_pb)/time_pp;
-    motion_fy= (motion_py*time_pb)/time_pp;
-    motion_bx0= (-motion_px*time_bp)/time_pp;
-    motion_by0= (-motion_py*time_bp)/time_pp;
-    motion_dx= motion_dy=0;
-    dmin2= check_bidir_mv(s, mb_x, mb_y, 
-                          motion_fx, motion_fy,
-                          motion_bx0, motion_by0,
-                          motion_fx, motion_fy,
-                          motion_bx0, motion_by0) - s->qscale;
-
-    motion_bx= motion_fx - motion_px;
-    motion_by= motion_fy - motion_py;
-    for(by=-1; by<2; by++){
-        for(bx=-1; bx<2; bx++){
-            uint8_t *dest_y = s->me_scratchpad + (by+1)*s->linesize*16 + (bx+1)*16;
-            uint8_t *ptr;
-            int dxy;
-            int src_x, src_y;
-            const int width= s->width;
-            const int height= s->height;
-
-            dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
-            src_x = (mb_x + bx) * 16 + (motion_fx >> 1);
-            src_y = (mb_y + by) * 16 + (motion_fy >> 1);
-            src_x = clip(src_x, -16, width);
-            if (src_x == width) dxy &= ~1;
-            src_y = clip(src_y, -16, height);
-            if (src_y == height) dxy &= ~2;
-
-            ptr = s->last_picture.data[0] + (src_y * s->linesize) + src_x;
-            s->dsp.put_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
-
-            dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
-            src_x = (mb_x + bx) * 16 + (motion_bx >> 1);
-            src_y = (mb_y + by) * 16 + (motion_by >> 1);
-            src_x = clip(src_x, -16, width);
-            if (src_x == width) dxy &= ~1;
-            src_y = clip(src_y, -16, height);
-            if (src_y == height) dxy &= ~2;
-
-	    s->dsp.avg_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
-        }
+    uint16_t * const mv_penalty= s->me.mv_penalty[1] + MAX_MV;
+    
+    ymin= xmin=(-32)>>shift;
+    ymax= xmax=   31>>shift;
+
+    if(s->co_located_type_table[mb_x + mb_y*s->mb_width]==CO_LOCATED_TYPE_4MV){
+        s->mv_type= MV_TYPE_8X8;
+    }else{
+        s->mv_type= MV_TYPE_16X16;
     }
 
-    P_LAST[0]        = mv_table[mot_xy    ][0];
-    P_LAST[1]        = mv_table[mot_xy    ][1];
-    P_LEFT[0]        = mv_table[mot_xy - 1][0];
-    P_LEFT[1]        = mv_table[mot_xy - 1][1];
-    P_LAST_RIGHT[0]  = mv_table[mot_xy + 1][0];
-    P_LAST_RIGHT[1]  = mv_table[mot_xy + 1][1];
-    P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0];
-    P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1];
-/*
-    if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
-    if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
-    if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
-*/
+    for(i=0; i<4; i++){
+        int index= s->block_index[i];
+        int min, max;
+    
+        s->me.co_located_mv[i][0]= s->motion_val[index][0];
+        s->me.co_located_mv[i][1]= s->motion_val[index][1];
+        s->me.direct_basis_mv[i][0]= s->me.co_located_mv[i][0]*time_pb/time_pp + ((i& 1)<<(shift+3));
+        s->me.direct_basis_mv[i][1]= s->me.co_located_mv[i][1]*time_pb/time_pp + ((i>>1)<<(shift+3));
+//        s->me.direct_basis_mv[1][i][0]= s->me.co_located_mv[i][0]*(time_pb - time_pp)/time_pp + ((i &1)<<(shift+3);
+//        s->me.direct_basis_mv[1][i][1]= s->me.co_located_mv[i][1]*(time_pb - time_pp)/time_pp + ((i>>1)<<(shift+3);
+
+        max= FFMAX(s->me.direct_basis_mv[i][0], s->me.direct_basis_mv[i][0] - s->me.co_located_mv[i][0])>>shift;
+        min= FFMIN(s->me.direct_basis_mv[i][0], s->me.direct_basis_mv[i][0] - s->me.co_located_mv[i][0])>>shift;
+        max+= (2*mb_x + (i& 1))*8 - 1; // +-1 is for the simpler rounding
+        min+= (2*mb_x + (i& 1))*8 + 1;
+        xmax= FFMIN(xmax, s->width - max);
+        xmin= FFMAX(xmin, - 16     - min);
+
+        max= FFMAX(s->me.direct_basis_mv[i][1], s->me.direct_basis_mv[i][1] - s->me.co_located_mv[i][1])>>shift;
+        min= FFMIN(s->me.direct_basis_mv[i][1], s->me.direct_basis_mv[i][1] - s->me.co_located_mv[i][1])>>shift;
+        max+= (2*mb_y + (i>>1))*8 - 1; // +-1 is for the simpler rounding
+        min+= (2*mb_y + (i>>1))*8 + 1;
+        ymax= FFMIN(ymax, s->height - max);
+        ymin= FFMAX(ymin, - 16      - min);
+        
+        if(s->mv_type == MV_TYPE_16X16) break;
+    }
+    
+    assert(xmax <= 15 && ymax <= 15 && xmin >= -16 && ymin >= -16);
+    
+    if(xmax < 0 || xmin >0 || ymax < 0 || ymin > 0){
+        s->b_direct_mv_table[mot_xy][0]= 0;
+        s->b_direct_mv_table[mot_xy][1]= 0;
+
+        return 256*256*256*64;
+    }
+
+    P_LEFT[0]        = clip(mv_table[mot_xy - 1][0], xmin<<shift, xmax<<shift);
+    P_LEFT[1]        = clip(mv_table[mot_xy - 1][1], ymin<<shift, ymax<<shift);
+
     /* special case for first line */
-    if ((mb_y == 0 || s->first_slice_line)) {
-    } else {
-        P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
-        P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
-        P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
-        P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1         ][1];
+    if (mb_y) {
+        P_TOP[0]      = clip(mv_table[mot_xy - mot_stride             ][0], xmin<<shift, xmax<<shift);
+        P_TOP[1]      = clip(mv_table[mot_xy - mot_stride             ][1], ymin<<shift, ymax<<shift);
+        P_TOPRIGHT[0] = clip(mv_table[mot_xy - mot_stride + 1         ][0], xmin<<shift, xmax<<shift);
+        P_TOPRIGHT[1] = clip(mv_table[mot_xy - mot_stride + 1         ][1], ymin<<shift, ymax<<shift);
     
         P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
         P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
     }
-    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, -16, -16, 15, 15, ref_picture);
-    if(mx==0 && my==0) dmin=99999999; // not representable, due to rounding stuff
-    if(dmin2<dmin){ 
-        dmin= dmin2;
-        mx=0;
-        my=0;
-    }
-#if 1
-    mx2= mx= mx*2; 
-    my2= my= my*2;
-    for(by=-1; by<2; by++){
-        if(my2+by < -32) continue;
-        for(bx=-1; bx<2; bx++){
-            if(bx==0 && by==0) continue;
-            if(mx2+bx < -32) continue;
-            dmin2= check_bidir_mv(s, mb_x, mb_y, 
-                          mx2+bx+motion_fx, my2+by+motion_fy,
-                          mx2+bx+motion_bx, my2+by+motion_by,
-                          mx2+bx+motion_fx, my2+by+motion_fy,
-                          motion_bx, motion_by) - s->qscale;
-            
-            if(dmin2<dmin){
-                dmin=dmin2;
-                mx= mx2 + bx;
-                my= my2 + by;
-            }
-        }
-    }
-#else
-    mx*=2; my*=2;
-#endif
-    if(mx==0 && my==0){
-        motion_bx= motion_bx0;
-        motion_by= motion_by0;
+    
+    if(s->flags&CODEC_FLAG_QPEL){
+        dmin = simple_direct_qpel_epzs_motion_search(s, 0, &mx, &my, P, 0, 0, xmin, ymin, xmax, ymax, 
+                                                     &s->last_picture, mv_table, 1<<14, mv_penalty);
+        dmin = simple_direct_qpel_qpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax,
+                                                0, 0, &s->last_picture, 0, 0, mv_penalty);
+    }else{
+        dmin = simple_direct_hpel_epzs_motion_search(s, 0, &mx, &my, P, 0, 0, xmin, ymin, xmax, ymax, 
+                                                     &s->last_picture, mv_table, 1<<15, mv_penalty);
+        dmin = simple_direct_hpel_hpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax,
+                                                0, 0, &s->last_picture, 0, 0, mv_penalty);
     }
 
     s->b_direct_mv_table[mot_xy][0]= mx;
     s->b_direct_mv_table[mot_xy][1]= my;
-    s->b_direct_forw_mv_table[mot_xy][0]= motion_fx + mx;
-    s->b_direct_forw_mv_table[mot_xy][1]= motion_fy + my;
-    s->b_direct_back_mv_table[mot_xy][0]= motion_bx + mx;
-    s->b_direct_back_mv_table[mot_xy][1]= motion_by + my;
     return dmin;
 }
 
 void ff_estimate_b_frame_motion(MpegEncContext * s,
                              int mb_x, int mb_y)
 {
-    const int quant= s->qscale;
+    const int penalty_factor= s->me.penalty_factor;
     int fmin, bmin, dmin, fbmin;
     int type=0;
     
     dmin= direct_search(s, mb_x, mb_y);
 
-    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, s->last_picture.data[0], s->f_code);
-    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, s->next_picture.data[0], s->b_code) - quant;
+    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, &s->last_picture, s->f_code);
+    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, &s->next_picture, s->b_code) - penalty_factor;
 //printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
 
     fbmin= bidir_refine(s, mb_x, mb_y);
@@ -1544,22 +1400,10 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
 
     if(s->flags&CODEC_FLAG_HQ){
         type= MB_TYPE_FORWARD | MB_TYPE_BACKWARD | MB_TYPE_BIDIR | MB_TYPE_DIRECT; //FIXME something smarter
+        if(dmin>256*256*16) type&= ~MB_TYPE_DIRECT; //dont try direct mode if its invalid for this MB
     }
 
-/*
-{
-static int count=0;
-static int sum=0;
-if(type==MB_TYPE_DIRECT){
-  int diff= ABS(s->b_forw_mv_table)
-}
-}*/
-
     s->mb_type[mb_y*s->mb_width + mb_x]= type;
-/*    if(mb_y==0 && mb_x==0) printf("\n");
-    if(mb_x==0) printf("\n");
-    printf("%d", av_log2(type));
-*/
 }
 
 /* find best f_code for ME which do unlimited searches */
@@ -1572,7 +1416,7 @@ int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
         int best_fcode=-1;
         int best_score=-10000000;
 
-        for(i=0; i<8; i++) score[i]= s->mb_num*(8-i); //FIXME *2 and all other too so its the same but nicer
+        for(i=0; i<8; i++) score[i]= s->mb_num*(8-i);
 
         for(y=0; y<s->mb_height; y++){
             int x;
diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c
new file mode 100644
index 000000000..d1ca6e7fb
--- /dev/null
+++ b/src/libffmpeg/libavcodec/motion_est_template.c
@@ -0,0 +1,1006 @@
+/*
+ * Motion estimation 
+ * Copyright (c) 2002 Michael Niedermayer
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+//LOAD_COMMON declares the locals used by the search routines; it expects s, ref_picture and size in the enclosing scope. Not every routine uses all of them, so let's hope gcc removes the unused vars (gcc 3.2.2 seems to do it)
+//Note, the "unu" declaration is only there to kill the ugly unused var warnings
+#define LOAD_COMMON(x, y)\
+    uint32_t * const score_map= s->me.score_map;\
+    const int stride= s->linesize;\
+    const int uvstride= s->uvlinesize;\
+    const int time_pp= s->pp_time;\
+    const int time_pb= s->pb_time;\
+    uint8_t * const src_y= s->new_picture.data[0] + ((y) * stride) + (x);\
+    uint8_t * const src_u= s->new_picture.data[1] + (((y)>>1) * uvstride) + ((x)>>1);\
+    uint8_t * const src_v= s->new_picture.data[2] + (((y)>>1) * uvstride) + ((x)>>1);\
+    uint8_t * const ref_y= ref_picture->data[0] + ((y) * stride) + (x);\
+    uint8_t * const ref_u= ref_picture->data[1] + (((y)>>1) * uvstride) + ((x)>>1);\
+    uint8_t * const ref_v= ref_picture->data[2] + (((y)>>1) * uvstride) + ((x)>>1);\
+    uint8_t * const ref2_y= s->next_picture.data[0] + ((y) * stride) + (x);\
+    op_pixels_func (*hpel_put)[4];\
+    op_pixels_func (*hpel_avg)[4]= &s->dsp.avg_pixels_tab[size];\
+    op_pixels_func (*chroma_hpel_put)[4];\
+    qpel_mc_func (*qpel_put)[16];\
+    qpel_mc_func (*qpel_avg)[16]= &s->dsp.avg_qpel_pixels_tab[size];\
+    const __attribute__((unused)) int unu= time_pp + time_pb + (int)src_u + (int)src_v + (int)ref_u + (int)ref_v\
+                                           + (int)ref2_y + (int)hpel_avg + (int)qpel_avg;\
+    if(s->no_rounding /*FIXME b_type*/){\
+        hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];\
+        chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];\
+        qpel_put= &s->dsp.put_no_rnd_qpel_pixels_tab[size];\
+    }else{\
+        hpel_put=& s->dsp.put_pixels_tab[size];\
+        chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];\
+        qpel_put= &s->dsp.put_qpel_pixels_tab[size];\
+    }
+
+
+#ifdef CMP_HPEL
+    
+#define CHECK_HALF_MV(dx, dy, x, y) /* scores the half-pel candidate (2*(x)+(dx), 2*(y)+(dy)); uses d, dmin, bx, by, size, pred_x, pred_y, mv_penalty and penalty_factor from the caller */\
+{\
+    const int hx= 2*(x)+(dx);\
+    const int hy= 2*(y)+(dy);\
+    CMP_HPEL(d, dx, dy, x, y, size);\
+    d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor; /* add the vector cost relative to the predictor */\
+    COPY3_IF_LT(dmin, d, bx, hx, by, hy) /* presumably keeps (d, hx, hy) as the new best when d < dmin; COPY3_IF_LT is defined elsewhere */\
+}
+
+#if 0
+static int RENAME(hpel_motion_search)(MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, Picture *ref_picture, 
+                                  int n, int size, uint16_t * const mv_penalty) /* disabled exhaustive variant: tests all 8 half-pel neighbours of (*mx_ptr, *my_ptr) and returns the best score */
+{
+    const int xx = 16 * s->mb_x + 8*(n&1);
+    const int yy = 16 * s->mb_y + 8*(n>>1);
+    const int mx = *mx_ptr;
+    const int my = *my_ptr;
+    const int penalty_factor= s->me.sub_penalty_factor;
+    
+    LOAD_COMMON(xx, yy);
+    
+ //   INIT;
+ //FIXME factorize
+    me_cmp_func cmp, chroma_cmp, cmp_sub, chroma_cmp_sub;
+
+    if(s->no_rounding /*FIXME b_type*/){
+        hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];
+        chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];
+    }else{
+        hpel_put=& s->dsp.put_pixels_tab[size];
+        chroma_hpel_put= &s->dsp.put_pixels_tab[size+1];
+    }
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+    cmp_sub= s->dsp.me_sub_cmp[size];
+    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
+
+    if(s->me.skip){ //FIXME somehow move up (benchmark)
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }
+        
+    if(s->avctx->me_cmp != s->avctx->me_sub_cmp){ /* compare functions differ: re-evaluate dmin at the integer position via CMP_HPEL */
+        CMP_HPEL(dmin, 0, 0, mx, my, size);
+        if(mx || my)
+            dmin += (mv_penalty[2*mx - pred_x] + mv_penalty[2*my - pred_y])*penalty_factor;
+    }
+        
+    if (mx > xmin && mx < xmax && 
+        my > ymin && my < ymax) { /* refine only when strictly inside the search window */
+        int bx=2*mx, by=2*my;
+        int d= dmin;
+        
+        CHECK_HALF_MV(1, 1, mx-1, my-1)
+        CHECK_HALF_MV(0, 1, mx  , my-1)        
+        CHECK_HALF_MV(1, 1, mx  , my-1)
+        CHECK_HALF_MV(1, 0, mx-1, my  )
+        CHECK_HALF_MV(1, 0, mx  , my  )
+        CHECK_HALF_MV(1, 1, mx-1, my  )
+        CHECK_HALF_MV(0, 1, mx  , my  )        
+        CHECK_HALF_MV(1, 1, mx  , my  )
+
+        assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2); /* all four bounds must hold, as in the enabled variant */
+
+        *mx_ptr = bx;
+        *my_ptr = by;
+    }else{
+        *mx_ptr =2*mx; /* on the window border: keep the integer vector, in half-pel units */
+        *my_ptr =2*my;
+    }
+
+    return dmin;
+}
+
+#else
+static int RENAME(hpel_motion_search)(MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, Picture *ref_picture, 
+                                  int n, int size, uint16_t * const mv_penalty) /* refines the integer vector (*mx_ptr, *my_ptr) to half-pel units in place and returns the best score */
+{
+    const int xx = 16 * s->mb_x + 8*(n&1);
+    const int yy = 16 * s->mb_y + 8*(n>>1);
+    const int mx = *mx_ptr;
+    const int my = *my_ptr;   
+    const int penalty_factor= s->me.sub_penalty_factor;
+    me_cmp_func cmp_sub, chroma_cmp_sub;
+
+    LOAD_COMMON(xx, yy);
+    
+ //FIXME factorize
+
+    cmp_sub= s->dsp.me_sub_cmp[size];
+    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
+
+    if(s->me.skip){ //FIXME move out of hpel?
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }
+        
+    if(s->avctx->me_cmp != s->avctx->me_sub_cmp){ /* compare functions differ: re-evaluate dmin at the integer position via CMP_HPEL */
+        CMP_HPEL(dmin, 0, 0, mx, my, size);
+        if(mx || my)
+            dmin += (mv_penalty[2*mx - pred_x] + mv_penalty[2*my - pred_y])*penalty_factor;
+    }
+        
+    if (mx > xmin && mx < xmax && 
+        my > ymin && my < ymax) { /* refine only when strictly inside the search window */
+        int bx=2*mx, by=2*my;
+        int d= dmin;
+        const int index= (my<<ME_MAP_SHIFT) + mx;
+        const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] 
+                     + (mv_penalty[bx   - pred_x] + mv_penalty[by-2 - pred_y])*s->me.penalty_factor; /* t: score_map entry for (mx, my-1) plus its vector cost */
+        const int l= score_map[(index- 1               )&(ME_MAP_SIZE-1)]
+                     + (mv_penalty[bx-2 - pred_x] + mv_penalty[by   - pred_y])*s->me.penalty_factor; /* l: same for (mx-1, my) */
+        const int r= score_map[(index+ 1               )&(ME_MAP_SIZE-1)]
+                     + (mv_penalty[bx+2 - pred_x] + mv_penalty[by   - pred_y])*s->me.penalty_factor; /* r: same for (mx+1, my) */
+        const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)]
+                     + (mv_penalty[bx   - pred_x] + mv_penalty[by+2 - pred_y])*s->me.penalty_factor; /* b: same for (mx, my+1) */
+    
+#if 0
+        int key;
+        int map_generation= s->me.map_generation;
+        uint32_t *map= s->me.map;
+        key= ((my-1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
+        assert(map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        key= ((my+1)<<ME_MAP_MV_BITS) + (mx) + map_generation;
+        assert(map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] == key);
+        key= ((my)<<ME_MAP_MV_BITS) + (mx+1) + map_generation;
+        assert(map[(index+1)&(ME_MAP_SIZE-1)] == key);
+        key= ((my)<<ME_MAP_MV_BITS) + (mx-1) + map_generation;
+        assert(map[(index-1)&(ME_MAP_SIZE-1)] == key);
+#endif                
+        if(t<=b){ /* each path below tests 4 of the 8 half-pel neighbours, chosen by comparing t, l, r and b */
+            CHECK_HALF_MV(0, 1, mx  ,my-1)
+            if(l<=r){
+                CHECK_HALF_MV(1, 1, mx-1, my-1)
+                if(t+r<=b+l){
+                    CHECK_HALF_MV(1, 1, mx  , my-1)
+                }else{
+                    CHECK_HALF_MV(1, 1, mx-1, my  )
+                }
+                CHECK_HALF_MV(1, 0, mx-1, my  )
+            }else{
+                CHECK_HALF_MV(1, 1, mx  , my-1)
+                if(t+l<=b+r){
+                    CHECK_HALF_MV(1, 1, mx-1, my-1)
+                }else{
+                    CHECK_HALF_MV(1, 1, mx  , my  )
+                }
+                CHECK_HALF_MV(1, 0, mx  , my  )
+            }
+        }else{
+            if(l<=r){
+                if(t+l<=b+r){
+                    CHECK_HALF_MV(1, 1, mx-1, my-1)
+                }else{
+                    CHECK_HALF_MV(1, 1, mx  , my  )
+                }
+                CHECK_HALF_MV(1, 0, mx-1, my)
+                CHECK_HALF_MV(1, 1, mx-1, my)
+            }else{
+                if(t+r<=b+l){
+                    CHECK_HALF_MV(1, 1, mx  , my-1)
+                }else{
+                    CHECK_HALF_MV(1, 1, mx-1, my)
+                }
+                CHECK_HALF_MV(1, 0, mx  , my)
+                CHECK_HALF_MV(1, 1, mx  , my)
+            }
+            CHECK_HALF_MV(0, 1, mx  , my)
+        }
+        assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2);
+
+        *mx_ptr = bx; /* best vector, in half-pel units */
+        *my_ptr = by;
+    }else{
+        *mx_ptr =2*mx; /* on the window border: keep the integer vector, in half-pel units */
+        *my_ptr =2*my;
+    }
+
+    return dmin;
+}
+#endif
+
+#endif /* CMP_HPEL */
+
+#ifdef CMP_QPEL
+
+/*
+ * Score one quarter-pel candidate: full-pel position (x, y) plus the
+ * fractional offset (dx, dy), i.e. the vector (4*x+dx, 4*y+dy) in
+ * quarter-pel units.
+ * CMP_QPEL stores the comparison result in the caller's `d`, the
+ * motion-vector penalty relative to (pred_x, pred_y) is added, and
+ * COPY3_IF_LT is handed dmin/bx/by (presumably it updates them when
+ * d < dmin; that macro is defined elsewhere).
+ * Expects these names in the expansion scope: d, dmin, bx, by, size,
+ * mv_penalty, pred_x, pred_y, penalty_factor, plus whatever CMP_QPEL uses.
+ */
+#define CHECK_QUARTER_MV(dx, dy, x, y)\
+{\
+    const int hx= 4*(x)+(dx);\
+    const int hy= 4*(y)+(dy);\
+    CMP_QPEL(d, dx, dy, x, y, size);\
+    d += (mv_penalty[hx - pred_x] + mv_penalty[hy - pred_y])*penalty_factor;\
+    COPY3_IF_LT(dmin, d, bx, hx, by, hy)\
+}
+
+/*
+ * Quarter-pel refinement of a full-pel motion vector.
+ *
+ * On entry (*mx_ptr, *my_ptr) is the full-pel vector and dmin its score.
+ * On return the vector is written back in quarter-pel units:
+ *   - (0, 0) if s->me.skip is set,
+ *   - (4*mx, 4*my) if the full-pel vector lies on the border of
+ *     [xmin, xmax] x [ymin, ymax],
+ *   - otherwise the best candidate (bx, by) found by the refinement.
+ * The return value is the (possibly updated) dmin.
+ *
+ * n selects the 8x8 sub-block offset inside the macroblock, size indexes
+ * the comparison-function tables, and mv_penalty is indexed by the
+ * difference between a candidate and the predictor (pred_x, pred_y).
+ */
+static int RENAME(qpel_motion_search)(MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, Picture *ref_picture, 
+                                  int n, int size, uint16_t * const mv_penalty)
+{
+    const int xx = 16 * s->mb_x + 8*(n&1);
+    const int yy = 16 * s->mb_y + 8*(n>>1);
+    const int mx = *mx_ptr;
+    const int my = *my_ptr;   
+    const int penalty_factor= s->me.sub_penalty_factor;
+    const int map_generation= s->me.map_generation;
+    const int subpel_quality= s->avctx->me_subpel_quality;
+    uint32_t *map= s->me.map;
+    me_cmp_func cmp, chroma_cmp;
+    me_cmp_func cmp_sub, chroma_cmp_sub;
+
+    LOAD_COMMON(xx, yy);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1]; //factorize FIXME
+ //FIXME factorize
+
+    cmp_sub= s->dsp.me_sub_cmp[size];
+    chroma_cmp_sub= s->dsp.me_sub_cmp[size+1];
+
+    if(s->me.skip){ //FIXME somehow move up (benchmark)
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }
+        
+    /* the full-pel score was computed with me_cmp; if sub-pel uses another
+       metric, re-score the centre so the comparisons below are consistent */
+    if(s->avctx->me_cmp != s->avctx->me_sub_cmp){
+        CMP_QPEL(dmin, 0, 0, mx, my, size);
+        if(mx || my)
+            dmin += (mv_penalty[4*mx - pred_x] + mv_penalty[4*my - pred_y])*penalty_factor;
+    }
+        
+    if (mx > xmin && mx < xmax && 
+        my > ymin && my < ymax) {
+        int bx=4*mx, by=4*my;
+        int d= dmin;
+        int i, nx, ny;
+        const int index= (my<<ME_MAP_SHIFT) + mx;
+        /* full-pel scores of the centre (c) and its top/left/right/bottom neighbours */
+        const int t= score_map[(index-(1<<ME_MAP_SHIFT)  )&(ME_MAP_SIZE-1)];
+        const int l= score_map[(index- 1                 )&(ME_MAP_SIZE-1)];
+        const int r= score_map[(index+ 1                 )&(ME_MAP_SIZE-1)];
+        const int b= score_map[(index+(1<<ME_MAP_SHIFT)  )&(ME_MAP_SIZE-1)];
+        const int c= score_map[(index                    )&(ME_MAP_SIZE-1)];
+        /* the 8 lowest estimated scores, kept in ascending order, and their
+           quarter-pel positions */
+        int best[8];
+        int best_pos[8][2];
+        
+        /* every byte becomes 0x40, so each entry starts as a large positive sentinel */
+        memset(best, 64, sizeof(int)*8);
+#if 1
+        if(s->me.dia_size>=2){        
+            /* the four diagonal neighbours are read straight from score_map here */
+            const int tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            const int bl= score_map[(index+(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            const int tr= score_map[(index-(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
+            const int br= score_map[(index+(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
+
+            /* estimate a score for every offset in [-3,3]x[-3,3] from the nine
+               full-pel scores (second-degree polynomial in nx, then in ny) */
+            for(ny= -3; ny <= 3; ny++){
+                for(nx= -3; nx <= 3; nx++){
+                    const int t2= nx*nx*(tr + tl - 2*t) + 4*nx*(tr-tl) + 32*t;
+                    const int c2= nx*nx*( r +  l - 2*c) + 4*nx*( r- l) + 32*c;
+                    const int b2= nx*nx*(br + bl - 2*b) + 4*nx*(br-bl) + 32*b;
+                    int score= ny*ny*(b2 + t2 - 2*c2) + 4*ny*(b2 - t2) + 32*c2;
+                    int i;
+                    
+                    /* skip offsets that land on a full-pel position */
+                    if((nx&3)==0 && (ny&3)==0) continue;
+                    
+                    score += 1024*(mv_penalty[4*mx + nx - pred_x] + mv_penalty[4*my + ny - pred_y])*penalty_factor;
+                    
+//                    if(nx&1) score-=1024*s->me.penalty_factor;
+//                    if(ny&1) score-=1024*s->me.penalty_factor;
+                    
+                    /* insertion into the sorted best[] list; the last entry drops off */
+                    for(i=0; i<8; i++){
+                        if(score < best[i]){
+                            memmove(&best[i+1], &best[i], sizeof(int)*(7-i));
+                            memmove(&best_pos[i+1][0], &best_pos[i][0], sizeof(int)*2*(7-i));
+                            best[i]= score;
+                            best_pos[i][0]= nx + 4*mx;
+                            best_pos[i][1]= ny + 4*my;
+                            break;
+                        }
+                    }
+                }
+            }
+        }else{
+            int tl;
+            const int cx = 4*(r - l);
+            const int cx2= r + l - 2*c; 
+            const int cy = 4*(b - t);
+            const int cy2= b + t - 2*c;
+            int cxy;
+              
+            /* the trailing "&& 0" disables the map lookup, so tl is always recomputed with CMP */
+            if(map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)] == (my<<ME_MAP_MV_BITS) + mx + map_generation && 0){ //FIXME
+                tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            }else{
+                CMP(tl, mx-1, my-1, size); //FIXME wrong if chroma me is different
+            }
+            
+            cxy= 2*tl + (cx + cy)/4 - (cx2 + cy2) - 2*c; 
+           
+            /* the model below must reproduce the known scores at r, l, b, t and tl */
+            assert(16*cx2 + 4*cx + 32*c == 32*r);
+            assert(16*cx2 - 4*cx + 32*c == 32*l);
+            assert(16*cy2 + 4*cy + 32*c == 32*b);
+            assert(16*cy2 - 4*cy + 32*c == 32*t);
+            assert(16*cxy + 16*cy2 + 16*cx2 - 4*cy - 4*cx + 32*c == 32*tl);
+            
+            for(ny= -3; ny <= 3; ny++){
+                for(nx= -3; nx <= 3; nx++){
+                    int score= ny*nx*cxy + nx*nx*cx2 + ny*ny*cy2 + nx*cx + ny*cy + 32*c; //FIXME factor
+                    int i;
+                    
+                    /* skip offsets that land on a full-pel position */
+                    if((nx&3)==0 && (ny&3)==0) continue;
+                
+                    score += 32*(mv_penalty[4*mx + nx - pred_x] + mv_penalty[4*my + ny - pred_y])*penalty_factor;
+//                    if(nx&1) score-=32*s->me.penalty_factor;
+  //                  if(ny&1) score-=32*s->me.penalty_factor;
+                    
+                    /* insertion into the sorted best[] list; the last entry drops off */
+                    for(i=0; i<8; i++){
+                        if(score < best[i]){
+                            memmove(&best[i+1], &best[i], sizeof(int)*(7-i));
+                            memmove(&best_pos[i+1][0], &best_pos[i][0], sizeof(int)*2*(7-i));
+                            best[i]= score;
+                            best_pos[i][0]= nx + 4*mx;
+                            best_pos[i][1]= ny + 4*my;
+                            break;
+                        }
+                    }
+                }
+            }            
+        }
+        /* really evaluate the subpel_quality best-estimated candidates.
+           NOTE(review): best_pos has 8 entries, so this presumably expects
+           me_subpel_quality <= 8 -- confirm where the option is validated */
+        for(i=0; i<subpel_quality; i++){
+            nx= best_pos[i][0];
+            ny= best_pos[i][1];
+            CHECK_QUARTER_MV(nx&3, ny&3, nx>>2, ny>>2)
+        }
+
+#if 0
+            const int tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            const int bl= score_map[(index+(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
+            const int tr= score_map[(index-(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
+            const int br= score_map[(index+(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
+//            if(l < r && l < t && l < b && l < tl && l < bl && l < tr && l < br && bl < tl){
+            if(tl<br){
+
+//            nx= FFMAX(4*mx - bx, bx - 4*mx);
+//            ny= FFMAX(4*my - by, by - 4*my);
+            
+            static int stats[7][7], count;
+            count++;
+            stats[4*mx - bx + 3][4*my - by + 3]++;
+            if(256*256*256*64 % count ==0){
+                for(i=0; i<49; i++){
+                    if((i%7)==0) printf("\n");
+                    printf("%6d ", stats[0][i]);
+                }
+                printf("\n");
+            }
+            }
+#endif
+#else
+
+        CHECK_QUARTER_MV(2, 2, mx-1, my-1)
+        CHECK_QUARTER_MV(0, 2, mx  , my-1)        
+        CHECK_QUARTER_MV(2, 2, mx  , my-1)
+        CHECK_QUARTER_MV(2, 0, mx  , my  )
+        CHECK_QUARTER_MV(2, 2, mx  , my  )
+        CHECK_QUARTER_MV(0, 2, mx  , my  )
+        CHECK_QUARTER_MV(2, 2, mx-1, my  )
+        CHECK_QUARTER_MV(2, 0, mx-1, my  )
+        
+        nx= bx;
+        ny= by;
+        
+        for(i=0; i<8; i++){
+            int ox[8]= {0, 1, 1, 1, 0,-1,-1,-1};
+            int oy[8]= {1, 1, 0,-1,-1,-1, 0, 1};
+            CHECK_QUARTER_MV((nx + ox[i])&3, (ny + oy[i])&3, (nx + ox[i])>>2, (ny + oy[i])>>2)
+        }
+#endif
+#if 0
+        //outer ring
+        CHECK_QUARTER_MV(1, 3, mx-1, my-1)
+        CHECK_QUARTER_MV(1, 2, mx-1, my-1)
+        CHECK_QUARTER_MV(1, 1, mx-1, my-1)
+        CHECK_QUARTER_MV(2, 1, mx-1, my-1)
+        CHECK_QUARTER_MV(3, 1, mx-1, my-1)
+        CHECK_QUARTER_MV(0, 1, mx  , my-1)
+        CHECK_QUARTER_MV(1, 1, mx  , my-1)
+        CHECK_QUARTER_MV(2, 1, mx  , my-1)
+        CHECK_QUARTER_MV(3, 1, mx  , my-1)
+        CHECK_QUARTER_MV(3, 2, mx  , my-1)
+        CHECK_QUARTER_MV(3, 3, mx  , my-1)
+        CHECK_QUARTER_MV(3, 0, mx  , my  )
+        CHECK_QUARTER_MV(3, 1, mx  , my  )
+        CHECK_QUARTER_MV(3, 2, mx  , my  )
+        CHECK_QUARTER_MV(3, 3, mx  , my  )
+        CHECK_QUARTER_MV(2, 3, mx  , my  )
+        CHECK_QUARTER_MV(1, 3, mx  , my  )
+        CHECK_QUARTER_MV(0, 3, mx  , my  )
+        CHECK_QUARTER_MV(3, 3, mx-1, my  )
+        CHECK_QUARTER_MV(2, 3, mx-1, my  )
+        CHECK_QUARTER_MV(1, 3, mx-1, my  )
+        CHECK_QUARTER_MV(1, 2, mx-1, my  )
+        CHECK_QUARTER_MV(1, 1, mx-1, my  )
+        CHECK_QUARTER_MV(1, 0, mx-1, my  )
+#endif
+        assert(bx >= xmin*4 && bx <= xmax*4 && by >= ymin*4 && by <= ymax*4);
+
+        *mx_ptr = bx;
+        *my_ptr = by;
+    }else{
+        /* on the border of the search range: no refinement, just convert to quarter-pel units */
+        *mx_ptr =4*mx;
+        *my_ptr =4*my;
+    }
+
+    return dmin;
+}
+
+#endif /* CMP_QPEL */
+
+/*
+ * Evaluate the candidate (x, y) unless it was already evaluated in the
+ * current map generation.
+ * map[] caches a key built from the position and map_generation;
+ * score_map[] caches the raw comparison result (without the MV penalty).
+ * For a new candidate the score is computed with CMP, cached, the
+ * motion-vector penalty is added, and COPY3_IF_LT is handed dmin/best[]
+ * (presumably it updates them when d < dmin; defined elsewhere).
+ * Expects in the expansion scope: d, dmin, best, map, score_map,
+ * map_generation, size, shift, mv_penalty, pred_x, pred_y, penalty_factor.
+ * Note that x and y are expanded several times.
+ */
+#define CHECK_MV(x,y)\
+{\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+/*printf("check_mv %d %d\n", x, y);*/\
+    if(map[index]!=key){\
+        CMP(d, x, y, size);\
+        map[index]= key;\
+        score_map[index]= d;\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+/*printf("score:%d\n", d);*/\
+        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
+    }\
+}
+
+/*
+ * Like CHECK_MV, but first clamps the candidate (ax, ay) into
+ * [xmin, xmax] x [ymin, ymax], which must be visible in the expansion scope.
+ */
+#define CHECK_CLIPED_MV(ax,ay)\
+{\
+    const int x= FFMAX(xmin, FFMIN(ax, xmax));\
+    const int y= FFMAX(ymin, FFMIN(ay, ymax));\
+    CHECK_MV(x, y)\
+}
+
+/*
+ * Variant of CHECK_MV that also records a direction: when the candidate
+ * (x, y) has not been evaluated in this map generation and beats dmin,
+ * best[], dmin and the caller's next_dir (set to new_dir) are updated.
+ * Expects the same scope names as CHECK_MV plus next_dir.
+ */
+#define CHECK_MV_DIR(x,y,new_dir)\
+{\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+/*printf("check_mv_dir %d %d %d\n", x, y, new_dir);*/\
+    if(map[index]!=key){\
+        CMP(d, x, y, size);\
+        map[index]= key;\
+        score_map[index]= d;\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*penalty_factor;\
+/*printf("score:%d\n", d);*/\
+        if(d<dmin){\
+            best[0]=x;\
+            best[1]=y;\
+            dmin=d;\
+            next_dir= new_dir;\
+        }\
+    }\
+}
+
+/*
+ * Debug helper: prints a diagnostic when (x, y) lies outside the search
+ * range scaled by 1<<S; v is stringified and appended as a tag.
+ * Expands to four unbraced if statements, so it must not be used as the
+ * body of an if/else without braces.  The final line continuation also
+ * pulls the following (blank) line into the macro.
+ */
+#define check(x,y,S,v)\
+if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
+
+
+/*
+ * Diamond search with radius 1: repeatedly tests the four direct
+ * neighbours of the current best position and moves to the best one until
+ * no neighbour improves dmin.
+ *
+ * best[0]/best[1] hold the starting position on entry and the final
+ * position on return; the return value is the final dmin.  Candidates are
+ * kept inside [xmin, xmax] x [ymin, ymax].  The neighbour lying in the
+ * direction the search just came from is not re-tested.
+ */
+static inline int RENAME(small_diamond_search)(MpegEncContext * s, int *best, int dmin,
+                                       Picture *ref_picture,
+                                       int const pred_x, int const pred_y, int const penalty_factor,
+                                       int const xmin, int const ymin, int const xmax, int const ymax, int const shift,
+                                       uint32_t *map, int map_generation, int size, uint16_t * const mv_penalty
+                                       )
+{
+    me_cmp_func cmp, chroma_cmp;
+    int next_dir=-1;
+    LOAD_COMMON(s->mb_x*16, s->mb_y*16);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+
+    { /* ensure that the best point is in the MAP as h/qpel refinement needs it */
+        const int key= (best[1]<<ME_MAP_MV_BITS) + best[0] + map_generation;
+        const int index= ((best[1]<<ME_MAP_SHIFT) + best[0])&(ME_MAP_SIZE-1);
+        if(map[index]!=key){ //this will be executed only very rarely
+            CMP(score_map[index], best[0], best[1], size);
+            map[index]= key;
+        }
+    }
+
+    for(;;){
+        int d;
+        const int dir= next_dir;
+        const int x= best[0];
+        const int y= best[1];
+        next_dir=-1;
+
+//printf("%d", dir);
+        /* directions: 0 = left, 1 = up, 2 = right, 3 = down; the neighbour
+           opposite to the last move is the previous position, so skip it */
+        if(dir!=2 && x>xmin) CHECK_MV_DIR(x-1, y  , 0)
+        if(dir!=3 && y>ymin) CHECK_MV_DIR(x  , y-1, 1)
+        if(dir!=0 && x<xmax) CHECK_MV_DIR(x+1, y  , 2)
+        if(dir!=1 && y<ymax) CHECK_MV_DIR(x  , y+1, 3)
+
+        /* no neighbour improved dmin: local minimum reached */
+        if(next_dir==-1){
+            return dmin;
+        }
+    }
+}
+
+/*
+ * Sparse multi-radius diamond search: tries diamonds of radius 1, 2 and 4
+ * around the current best position, testing only every second point on
+ * each edge.  Whenever the best position moves, the search restarts from
+ * radius 1.
+ *
+ * best[0]/best[1] are updated in place; the return value is the final dmin.
+ * A radius is skipped entirely if the diamond would leave
+ * [xmin, xmax] x [ymin, ymax].
+ */
+static inline int RENAME(funny_diamond_search)(MpegEncContext * s, int *best, int dmin,
+                                       Picture *ref_picture,
+                                       int const pred_x, int const pred_y, int const penalty_factor,
+                                       int const xmin, int const ymin, int const xmax, int const ymax, int const shift,
+                                       uint32_t *map, int map_generation, int size, uint16_t * const mv_penalty
+                                       )
+{
+    me_cmp_func cmp, chroma_cmp;
+    int dia_size;
+    LOAD_COMMON(s->mb_x*16, s->mb_y*16);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+
+    for(dia_size=1; dia_size<=4; dia_size++){
+        int dir;
+        const int x= best[0];
+        const int y= best[1];
+        
+        /* only powers of two are used as radius (this skips 3) */
+        if(dia_size&(dia_size-1)) continue;
+
+        if(   x + dia_size > xmax
+           || x - dia_size < xmin
+           || y + dia_size > ymax
+           || y - dia_size < ymin)
+           continue;
+        
+        /* one point per diamond edge for each step of dir */
+        for(dir= 0; dir<dia_size; dir+=2){
+            int d;
+
+            CHECK_MV(x + dir           , y + dia_size - dir);
+            CHECK_MV(x + dia_size - dir, y - dir           );
+            CHECK_MV(x - dir           , y - dia_size + dir);
+            CHECK_MV(x - dia_size + dir, y + dir           );
+        }
+
+        /* best moved: the loop increment turns 0 into 1, restarting at the smallest radius */
+        if(x!=best[0] || y!=best[1])
+            dia_size=0;
+#if 0
+{
+int dx, dy, i;
+static int stats[8*8];
+dx= ABS(x-best[0]);
+dy= ABS(y-best[1]);
+if(dy>dx){
+    dx^=dy; dy^=dx; dx^=dy;
+}
+stats[dy*8 + dx] ++;
+if(256*256*256*64 % (stats[0]+1)==0){
+    for(i=0; i<64; i++){
+        if((i&7)==0) printf("\n");
+        printf("%8d ", stats[i]);
+    }
+    printf("\n");
+}
+}
+#endif
+    }
+    return dmin;    
+}
+
+/*
+ * Candidate check used by sab_diamond_search.  A candidate (ax, ay) that
+ * was not yet evaluated in this map generation is scored and cached like
+ * in CHECK_MV; if its score beats the last (worst) entry of the caller's
+ * minima[] list it is inserted at its sorted position, pushing the last
+ * entry out.
+ * After an insertion the macro sets the caller's loop index i to -1 and
+ * executes `continue`, so it must be expanded directly inside the
+ * for(i...) loop over minima[]; that loop then restarts from the first
+ * entry.
+ * Expects in the expansion scope: d, i, minima, minima_count and the
+ * names required by CHECK_MV.  The inner j is local to the macro.
+ */
+#define SAB_CHECK_MV(ax,ay)\
+{\
+    const int key= ((ay)<<ME_MAP_MV_BITS) + (ax) + map_generation;\
+    const int index= (((ay)<<ME_MAP_SHIFT) + (ax))&(ME_MAP_SIZE-1);\
+/*printf("sab check %d %d\n", ax, ay);*/\
+    if(map[index]!=key){\
+        CMP(d, ax, ay, size);\
+        map[index]= key;\
+        score_map[index]= d;\
+        d += (mv_penalty[((ax)<<shift)-pred_x] + mv_penalty[((ay)<<shift)-pred_y])*penalty_factor;\
+/*printf("score: %d\n", d);*/\
+        if(d < minima[minima_count-1].height){\
+            int j=0;\
+            \
+            while(d >= minima[j].height) j++;\
+\
+            memmove(&minima [j+1], &minima [j], (minima_count - j - 1)*sizeof(Minima));\
+\
+            minima[j].checked= 0;\
+            minima[j].height= d;\
+            minima[j].x= ax;\
+            minima[j].y= ay;\
+            \
+            i=-1;\
+            continue;\
+        }\
+    }\
+}
+
+#define MAX_SAB_SIZE 16
+/*
+ * "SAB" diamond search: keeps a sorted list of the best candidates found
+ * so far (minima[]) and expands the four direct neighbours of every list
+ * entry that has not been expanded yet, until no entry is left unchecked.
+ *
+ * The list is seeded from all map[] entries of the current generation
+ * (the predictors evaluated by the caller).  The number of tracked minima
+ * is ABS(s->me.dia_size), limited to MAX_SAB_SIZE because that is the
+ * capacity of the local array.
+ *
+ * On return best[0]/best[1] hold the position of the best list entry and
+ * the return value is its score.  If that position is strictly inside
+ * [xmin, xmax] x [ymin, ymax], its four neighbours are guaranteed to be in
+ * the map for the later sub-pel refinement.
+ *
+ * minima_cmp is defined elsewhere; SAB_CHECK_MV treats the last list entry
+ * as the worst one, so it presumably sorts by ascending height.
+ */
+static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int dmin,
+                                       Picture *ref_picture,
+                                       int const pred_x, int const pred_y, int const penalty_factor,
+                                       int const xmin, int const ymin, int const xmax, int const ymax, int const shift,
+                                       uint32_t *map, int map_generation, int size, uint16_t * const mv_penalty
+                                       )
+{
+    me_cmp_func cmp, chroma_cmp;
+    Minima minima[MAX_SAB_SIZE];
+    /* never track more minima than the array can hold, otherwise the
+       padding loop and SAB_CHECK_MV would write past the end of minima[] */
+    const int minima_count= FFMIN(ABS(s->me.dia_size), MAX_SAB_SIZE);
+    int i, j;
+    LOAD_COMMON(s->mb_x*16, s->mb_y*16);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+    
+    /* collect the already evaluated candidates; stop once minima[] is full
+       instead of relying on an assert that is compiled out with NDEBUG */
+    for(j=i=0; i<ME_MAP_SIZE && j<MAX_SAB_SIZE; i++){
+        uint32_t key= map[i];
+
+        /* bias both coordinate fields so that they become non-negative */
+        key += (1<<(ME_MAP_MV_BITS-1)) + (1<<(2*ME_MAP_MV_BITS-1));
+        
+        /* entries written in another map generation are stale */
+        if((key&((-1)<<(2*ME_MAP_MV_BITS))) != map_generation) continue;
+        
+        minima[j].height= score_map[i];
+        minima[j].x= key & ((1<<ME_MAP_MV_BITS)-1); key>>=ME_MAP_MV_BITS;
+        minima[j].y= key & ((1<<ME_MAP_MV_BITS)-1);
+        minima[j].x-= (1<<(ME_MAP_MV_BITS-1));
+        minima[j].y-= (1<<(ME_MAP_MV_BITS-1));
+        minima[j].checked=0;
+        /* score_map holds raw scores; add the MV penalty except for the zero vector */
+        if(minima[j].x || minima[j].y)
+            minima[j].height+= (mv_penalty[((minima[j].x)<<shift)-pred_x] + mv_penalty[((minima[j].y)<<shift)-pred_y])*penalty_factor;
+        
+        j++;
+    }
+    
+    qsort(minima, j, sizeof(Minima), minima_cmp);
+    
+    /* pad the list with sentinel entries that any real candidate beats */
+    for(; j<minima_count; j++){
+        minima[j].height=256*256*256*64;
+        minima[j].checked=0;
+        minima[j].x= minima[j].y=0;
+    }
+    
+    for(i=0; i<minima_count; i++){
+        const int x= minima[i].x;
+        const int y= minima[i].y;
+        int d;
+        
+        if(minima[i].checked) continue;
+        
+        /* entries on the border cannot be expanded in all four directions */
+        if(   x >= xmax || x <= xmin
+           || y >= ymax || y <= ymin)
+           continue;
+
+        /* each of these may insert into minima[] and restart the loop */
+        SAB_CHECK_MV(x-1, y)
+        SAB_CHECK_MV(x+1, y)
+        SAB_CHECK_MV(x  , y-1)
+        SAB_CHECK_MV(x  , y+1)
+        
+        minima[i].checked= 1;
+    }
+    
+    best[0]= minima[0].x;
+    best[1]= minima[0].y;
+    dmin= minima[0].height;
+    
+    if(   best[0] < xmax && best[0] > xmin
+       && best[1] < ymax && best[1] > ymin){
+        int d;
+        //ensure that the reference samples for hpel refinement are in the map
+        CHECK_MV(best[0]-1, best[1])
+        CHECK_MV(best[0]+1, best[1])
+        CHECK_MV(best[0], best[1]-1)
+        CHECK_MV(best[0], best[1]+1)
+    }
+    return dmin;    
+}
+
+/*
+ * Diamond search with growing radius: for radius 1..s->me.dia_size every
+ * point on the four edges of the diamond around the current best position
+ * is tested.  Whenever the best position moves, the search restarts from
+ * radius 1.
+ *
+ * The start/end values of each edge loop clip the edge so that all tested
+ * candidates stay inside [xmin, xmax] x [ymin, ymax].
+ * best[0]/best[1] are updated in place; the return value is the final dmin.
+ */
+static inline int RENAME(var_diamond_search)(MpegEncContext * s, int *best, int dmin,
+                                       Picture *ref_picture,
+                                       int const pred_x, int const pred_y, int const penalty_factor,
+                                       int const xmin, int const ymin, int const xmax, int const ymax, int const shift,
+                                       uint32_t *map, int map_generation, int size, uint16_t * const mv_penalty
+                                       )
+{
+    me_cmp_func cmp, chroma_cmp;
+    int dia_size;
+    LOAD_COMMON(s->mb_x*16, s->mb_y*16);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+
+    for(dia_size=1; dia_size<=s->me.dia_size; dia_size++){
+        int dir, start, end;
+        const int x= best[0];
+        const int y= best[1];
+
+        /* edge from (x, y+dia_size) towards (x+dia_size, y) */
+        start= FFMAX(0, y + dia_size - ymax);
+        end  = FFMIN(dia_size, xmax - x + 1);
+        for(dir= start; dir<end; dir++){
+            int d;
+
+//check(x + dir,y + dia_size - dir,0, a0)
+            CHECK_MV(x + dir           , y + dia_size - dir);
+        }
+
+        /* edge from (x+dia_size, y) towards (x, y-dia_size) */
+        start= FFMAX(0, x + dia_size - xmax);
+        end  = FFMIN(dia_size, y - ymin + 1);
+        for(dir= start; dir<end; dir++){
+            int d;
+
+//check(x + dia_size - dir, y - dir,0, a1)
+            CHECK_MV(x + dia_size - dir, y - dir           );
+        }
+
+        /* edge from (x, y-dia_size) towards (x-dia_size, y) */
+        start= FFMAX(0, -y + dia_size + ymin );
+        end  = FFMIN(dia_size, x - xmin + 1);
+        for(dir= start; dir<end; dir++){
+            int d;
+
+//check(x - dir,y - dia_size + dir,0, a2)
+            CHECK_MV(x - dir           , y - dia_size + dir);
+        }
+
+        /* edge from (x-dia_size, y) towards (x, y+dia_size) */
+        start= FFMAX(0, -x + dia_size + xmin );
+        end  = FFMIN(dia_size, ymax - y + 1);
+        for(dir= start; dir<end; dir++){
+            int d;
+
+//check(x - dia_size + dir, y + dir,0, a3)
+            CHECK_MV(x - dia_size + dir, y + dir           );
+        }
+
+        /* best moved: the loop increment turns 0 into 1, restarting at the smallest radius */
+        if(x!=best[0] || y!=best[1])
+            dia_size=0;
+#if 0
+{
+int dx, dy, i;
+static int stats[8*8];
+dx= ABS(x-best[0]);
+dy= ABS(y-best[1]);
+stats[dy*8 + dx] ++;
+if(256*256*256*64 % (stats[0]+1)==0){
+    for(i=0; i<64; i++){
+        if((i&7)==0) printf("\n");
+        printf("%6d ", stats[i]);
+    }
+    printf("\n");
+}
+}
+#endif
+    }
+    return dmin;    
+}
+
+/*
+ * Predictor-based (EPZS) full-pel motion search for a whole macroblock
+ * (comparison size index 0).
+ *
+ * The zero vector and a set of predictors are evaluated first, then one
+ * of the diamond searches (selected by s->me.dia_size) refines the best
+ * of them.  The resulting vector is stored in *mx_ptr / *my_ptr and its
+ * score is returned.
+ *
+ * P holds the spatial predictors (accessed through the P_* macros) and is
+ * shifted right by `shift` before use; last_mv holds vectors that are
+ * scaled by ref_mv_scale / 65536 with rounding.  NOTE(review): last_mv is
+ * presumably the motion field of a previously coded picture, stored with
+ * a one-entry border (stride mb_width+2) -- confirm against the caller.
+ */
+static int RENAME(epzs_motion_search)(MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, uint16_t * const mv_penalty)
+{
+    int best[2]={0, 0};
+    int d, dmin; 
+    const int shift= 1+s->quarter_sample;
+    uint32_t *map= s->me.map;
+    int map_generation;
+    const int penalty_factor= s->me.penalty_factor;
+    const int size=0;
+    const int ref_mv_stride= s->mb_width+2;
+    const int ref_mv_xy= 1 + s->mb_x + (s->mb_y + 1)*ref_mv_stride;
+    me_cmp_func cmp, chroma_cmp;
+    LOAD_COMMON(s->mb_x*16, s->mb_y*16);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+    
+    map_generation= update_map_generation(s);
+
+    /* the zero vector is the initial best candidate (no MV penalty added) */
+    CMP(dmin, 0, 0, size);
+    map[0]= map_generation;
+    score_map[0]= dmin;
+
+    /* first line */
+    if (s->mb_y == 0) {
+        CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+        CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
+                        (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
+    }else{
+        /* early termination: the zero vector scores well and all spatial
+           predictors are zero; flag the skip for the sub-pel stage */
+        if(dmin<256 && ( P_LEFT[0]    |P_LEFT[1]
+                        |P_TOP[0]     |P_TOP[1]
+                        |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){
+            *mx_ptr= 0;
+            *my_ptr= 0;
+            s->me.skip=1;
+            return dmin;
+        }
+        CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
+        /* only try further predictors while the score is still high */
+        if(dmin>256*2){
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
+            CHECK_MV(P_LEFT[0]    >>shift, P_LEFT[1]    >>shift)
+            CHECK_MV(P_TOP[0]     >>shift, P_TOP[1]     >>shift)
+            CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
+        }
+    }
+    if(dmin>256*4){
+        /* neighbouring last_mv entries: left and top in the pre-pass,
+           right and bottom otherwise */
+        if(s->me.pre_pass){
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy-1][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy-1][1]*ref_mv_scale + (1<<15))>>16)
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy-ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy-ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+        }else{
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy+1][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy+1][1]*ref_mv_scale + (1<<15))>>16)
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+        }
+    }
+
+    /* optionally try the last_mv entries of all macroblocks within
+       last_predictor_count of the current one (clipped to the picture) */
+    if(s->avctx->last_predictor_count){
+        const int count= s->avctx->last_predictor_count;
+        const int xstart= FFMAX(0, s->mb_x - count);
+        const int ystart= FFMAX(0, s->mb_y - count);
+        const int xend= FFMIN(s->mb_width , s->mb_x + count + 1);
+        const int yend= FFMIN(s->mb_height, s->mb_y + count + 1);
+        int mb_y;
+
+        for(mb_y=ystart; mb_y<yend; mb_y++){
+            int mb_x;
+            for(mb_x=xstart; mb_x<xend; mb_x++){
+                const int xy= mb_x + 1 + (mb_y + 1)*ref_mv_stride;
+                int mx= (last_mv[xy][0]*ref_mv_scale + (1<<15))>>16;
+                int my= (last_mv[xy][1]*ref_mv_scale + (1<<15))>>16;
+
+                /* out-of-range predictors are dropped here, not clipped */
+                if(mx>xmax || mx<xmin || my>ymax || my<ymin) continue;
+                CHECK_MV(mx,my)
+            }
+        }
+    }
+
+//check(best[0],best[1],0, b0)
+    /* dia_size selects the refinement: -1 funny, < -1 SAB, 0..1 small, >= 2 variable-size diamond */
+    if(s->me.dia_size==-1)
+        dmin= RENAME(funny_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+    else if(s->me.dia_size<-1)
+        dmin= RENAME(sab_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+    else if(s->me.dia_size<2)
+        dmin= RENAME(small_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+    else
+        dmin= RENAME(var_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+
+//check(best[0],best[1],0, b1)
+    *mx_ptr= best[0];
+    *my_ptr= best[1];    
+
+//    printf("%d %d %d \n", best[0], best[1], dmin);
+    return dmin;
+}
+
+#ifndef CMP_DIRECT /* no 4mv search needed in direct mode */
+/*
+ * Predictor-based (EPZS) full-pel motion search for one of the four
+ * sub-blocks of a macroblock (comparison size index 1); `block` selects
+ * the sub-block (bit 0: right column, bit 1: lower row).
+ *
+ * Unlike epzs_motion_search the zero vector is not evaluated up front:
+ * dmin starts at a large constant and best[] at (0, 0), and there is no
+ * early skip.  Predictors are tested, then the diamond search selected by
+ * s->me.dia_size refines the best one.  The resulting vector is stored in
+ * *mx_ptr / *my_ptr and its score is returned.
+ */
+static int RENAME(epzs_motion_search4)(MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, uint16_t * const mv_penalty)
+{
+    int best[2]={0, 0};
+    int d, dmin; 
+    const int shift= 1+s->quarter_sample;
+    uint32_t *map= s->me.map;
+    int map_generation;
+    const int penalty_factor= s->me.penalty_factor;
+    const int size=1;
+    const int ref_mv_stride= s->mb_width+2;
+    const int ref_mv_xy= 1 + s->mb_x + (s->mb_y + 1)*ref_mv_stride;
+    me_cmp_func cmp, chroma_cmp;
+    LOAD_COMMON((s->mb_x*2 + (block&1))*8, (s->mb_y*2 + (block>>1))*8);
+    
+    cmp= s->dsp.me_cmp[size];
+    chroma_cmp= s->dsp.me_cmp[size+1];
+
+    map_generation= update_map_generation(s);
+
+    /* large sentinel so that the first evaluated candidate is accepted */
+    dmin = 1000000;
+//printf("%d %d %d %d //",xmin, ymin, xmax, ymax); 
+    /* first line */
+    if (s->mb_y == 0 && block<2) {
+	CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+        CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
+                        (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
+        CHECK_MV(P_MV1[0]>>shift, P_MV1[1]>>shift)
+    }else{
+        CHECK_MV(P_MV1[0]>>shift, P_MV1[1]>>shift)
+        //FIXME try some early stop
+        /* only try further predictors while the score is still high */
+        if(dmin>64*2){
+            CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
+            CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+            CHECK_MV(P_TOP[0]>>shift, P_TOP[1]>>shift)
+            CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
+            CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
+                            (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
+        }
+    }
+    if(dmin>64*4){
+        /* right and bottom neighbours in last_mv, scaled with rounding */
+        CHECK_CLIPED_MV((last_mv[ref_mv_xy+1][0]*ref_mv_scale + (1<<15))>>16, 
+                        (last_mv[ref_mv_xy+1][1]*ref_mv_scale + (1<<15))>>16)
+        CHECK_CLIPED_MV((last_mv[ref_mv_xy+ref_mv_stride][0]*ref_mv_scale + (1<<15))>>16, 
+                        (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
+    }
+
+    /* dia_size selects the refinement: -1 funny, < -1 SAB, 0..1 small, >= 2 variable-size diamond */
+    if(s->me.dia_size==-1)
+        dmin= RENAME(funny_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+    else if(s->me.dia_size<-1)
+        dmin= RENAME(sab_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+    else if(s->me.dia_size<2)
+        dmin= RENAME(small_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+    else
+        dmin= RENAME(var_diamond_search)(s, best, dmin, ref_picture,
+                                   pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
+				   shift, map, map_generation, size, mv_penalty);
+
+    *mx_ptr= best[0];
+    *my_ptr= best[1];    
+
+//    printf("%d %d %d \n", best[0], best[1], dmin);
+    return dmin;
+}
+#endif /* !CMP_DIRECT */
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index e625a7262..10abf1024 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -66,9 +66,14 @@ static inline int mpeg2_decode_block_intra(MpegEncContext *s,
                                     int n);
 static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred);
 
+#ifdef CONFIG_ENCODERS
 static UINT16 mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
 static UINT8 fcode_tab[MAX_MV*2+1];
 
+/* Code bits and code lengths, 64*64*2 entries each.
+ * NOTE(review): presumably filled by init_uni_ac_vlc() and indexed with
+ * UNI_AC_ENC_INDEX(run, level+64) -- confirm at the call site. */
+static uint32_t uni_mpeg1_ac_vlc_bits[64*64*2];
+static uint8_t  uni_mpeg1_ac_vlc_len [64*64*2];
+#endif
+
 static inline int get_bits_diff(MpegEncContext *s){
     int bits,ret;
     
@@ -118,6 +123,51 @@ static void init_2d_vlc_rl(RLTable *rl)
     }
 }
 
+static void init_uni_ac_vlc(RLTable *rl, uint32_t *uni_ac_vlc_bits, uint8_t *uni_ac_vlc_len){
+    int i;
+    /* fill both tables for every run 0..63 and every level -64..63 (i = level + 64) */
+    for(i=0; i<128; i++){
+        int level= i-64;
+        int run;
+        for(run=0; run<64; run++){
+            int len, bits, code;
+            
+            int alevel= ABS(level);
+            int sign= (level>>31)&1; /* 1 for a negative level (assumes 32-bit int) */
+            /* NOTE(review): level==0 gives code = index_run-1; confirm those entries are never read */
+            if (alevel > rl->max_level[0][run])
+                code= 111; /*rl->n*/
+            else
+                code= rl->index_run[0][run] + alevel - 1;
+
+            if (code < 111 /* rl->n */) {
+	    	/* store the vlc & sign at once */
+                len=   mpeg1_vlc[code][1]+1;
+                bits= (mpeg1_vlc[code][0]<<1) + sign;
+            } else {
+                len=  mpeg1_vlc[111/*rl->n*/][1]+6; /* escape: entry 111 followed by 6 bits of run */
+                bits= mpeg1_vlc[111/*rl->n*/][0]<<6;
+
+                bits|= run;
+                if (alevel < 128) { /* always true here, since |level| <= 64 */
+                    bits<<=8; len+=8;
+                    bits|= level & 0xff;
+                } else { /* not reachable with the level range of this loop */
+                    bits<<=16; len+=16;
+                    bits|= level & 0xff;
+                    if (level < 0) {
+                        bits|= 0x8001 + level + 255;
+                    } else {
+                        bits|= level & 0xffff;
+                    }
+                }
+            }
+            /* one entry per (run, i): the code bits and their total length */
+            uni_ac_vlc_bits[UNI_AC_ENC_INDEX(run, i)]= bits;
+            uni_ac_vlc_len [UNI_AC_ENC_INDEX(run, i)]= len;
+        }
+    }
+}
 
 static void put_header(MpegEncContext *s, int header)
 {
@@ -131,8 +181,12 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
 {
         unsigned int vbv_buffer_size;
         unsigned int fps, v;
-        int n;
+        int n, i;
         UINT64 time_code;
+        float best_aspect_error= 1E10;
+        float aspect_ratio= s->avctx->aspect_ratio;
+        
+        if(aspect_ratio==0.0) aspect_ratio= s->width / (float)s->height; //pixel aspect 1:1 (VGA)
         
         if (s->current_picture.key_frame) {
             /* mpeg1 header repeated every gop */
@@ -154,7 +208,18 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
  
             put_bits(&s->pb, 12, s->width);
             put_bits(&s->pb, 12, s->height);
-            put_bits(&s->pb, 4, 1); /* 1/1 aspect ratio */
+            
+            for(i=1; i<15; i++){
+                float error= mpeg1_aspect[i] - s->width/(s->height*aspect_ratio);
+                error= ABS(error);
+                
+                if(error < best_aspect_error){
+                    best_aspect_error= error;
+                    s->aspect_ratio_info= i;
+                }
+            }
+            
+            put_bits(&s->pb, 4, s->aspect_ratio_info);
             put_bits(&s->pb, 4, s->frame_rate_index);
             v = s->bit_rate / 400;
             if (v > 0x3ffff)
@@ -439,6 +504,7 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val)
 
 void ff_mpeg1_encode_init(MpegEncContext *s)
 {
+#ifdef CONFIG_ENCODERS
     static int done=0;
 
     common_init(s);
@@ -450,12 +516,14 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
 
         done=1;
         init_rl(&rl_mpeg1);
-	
+
 	for(i=0; i<64; i++)
 	{
 		mpeg1_max_level[0][i]= rl_mpeg1.max_level[0][i];
 		mpeg1_index_run[0][i]= rl_mpeg1.index_run[0][i];
 	}
+        
+        init_uni_ac_vlc(&rl_mpeg1, uni_mpeg1_ac_vlc_bits, uni_mpeg1_ac_vlc_len);
 
 	/* build unified dc encoding tables */
 	for(i=-255; i<256; i++)
@@ -511,12 +579,15 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
             }
         }
     }
-    s->mv_penalty= mv_penalty;
+    s->me.mv_penalty= mv_penalty;
     s->fcode_tab= fcode_tab;
     s->min_qcoeff=-255;
     s->max_qcoeff= 255;
     s->intra_quant_bias= 3<<(QUANT_BIAS_SHIFT-3); //(a + x*3/8)/x
     s->inter_quant_bias= 0;
+    s->intra_ac_vlc_length=
+    s->inter_ac_vlc_length= uni_mpeg1_ac_vlc_len;
+#endif
 }
 
 static inline void encode_dc(MpegEncContext *s, int diff, int component)
@@ -587,12 +658,8 @@ static void mpeg1_encode_block(MpegEncContext *s,
             sign&=1;
 
 //            code = get_rl_index(rl, 0, run, alevel);
-            if (alevel > mpeg1_max_level[0][run])
-                code= 111; /*rl->n*/
-            else
+            if (alevel <= mpeg1_max_level[0][run]){
                 code= mpeg1_index_run[0][run] + alevel - 1;
-
-            if (code < 111 /* rl->n */) {
 	    	/* store the vlc & sign at once */
                 put_bits(&s->pb, mpeg1_vlc[code][1]+1, (mpeg1_vlc[code][0]<<1) + sign);
             } else {
@@ -721,6 +788,7 @@ static int mpeg_decode_mb(MpegEncContext *s,
             s->mv[1][0][0] = s->last_mv[1][0][0];
             s->mv[1][0][1] = s->last_mv[1][0][1];
         }
+
         s->mb_skiped = 1;
         return 0;
     }
@@ -1431,8 +1499,9 @@ static int mpeg1_decode_picture(AVCodecContext *avctx,
 static void mpeg_decode_sequence_extension(MpegEncContext *s)
 {
     int horiz_size_ext, vert_size_ext;
-    int bit_rate_ext, vbv_buf_ext, low_delay;
+    int bit_rate_ext, vbv_buf_ext;
     int frame_rate_ext_n, frame_rate_ext_d;
+    float aspect;
 
     skip_bits(&s->gb, 8); /* profil and level */
     s->progressive_sequence = get_bits1(&s->gb); /* progressive_sequence */
@@ -1445,7 +1514,7 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
     s->bit_rate = ((s->bit_rate / 400) | (bit_rate_ext << 12)) * 400;
     skip_bits1(&s->gb); /* marker */
     vbv_buf_ext = get_bits(&s->gb, 8);
-    low_delay = get_bits1(&s->gb);
+    s->low_delay = get_bits1(&s->gb);
     frame_rate_ext_n = get_bits(&s->gb, 2);
     frame_rate_ext_d = get_bits(&s->gb, 5);
     if (frame_rate_ext_d >= 1)
@@ -1453,6 +1522,10 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
     dprintf("sequence extension\n");
     s->mpeg2 = 1;
     s->avctx->sub_id = 2; /* indicates mpeg2 found */
+
+    aspect= mpeg2_aspect[s->aspect_ratio_info];
+    if(aspect>0.0)      s->avctx->aspect_ratio= s->width/(aspect*s->height);
+    else if(aspect<0.0) s->avctx->aspect_ratio= -1.0/aspect;
 }
 
 static void mpeg_decode_quant_matrix_extension(MpegEncContext *s)
@@ -1575,7 +1648,7 @@ static void mpeg_decode_extension(AVCodecContext *avctx,
  *         DECODE_SLICE_EOP if the end of the picture is reached
  */
 static int mpeg_decode_slice(AVCodecContext *avctx, 
-                              AVVideoFrame *pict,
+                              AVFrame *pict,
                               int start_code,
                               UINT8 *buf, int buf_size)
 {
@@ -1597,6 +1670,15 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
         s->first_slice = 0;
         if(MPV_frame_start(s, avctx) < 0)
             return DECODE_SLICE_FATAL_ERROR;
+            
+        if(s->avctx->debug&FF_DEBUG_PICT_INFO){
+             printf("qp:%d fc:%d%d%d%d %s %s %s %s dc:%d pstruct:%d fdct:%d cmv:%d qtype:%d ivlc:%d rff:%d %s\n", 
+                 s->qscale, s->mpeg_f_code[0][0],s->mpeg_f_code[0][1],s->mpeg_f_code[1][0],s->mpeg_f_code[1][1],
+                 s->pict_type == I_TYPE ? "I" : (s->pict_type == P_TYPE ? "P" : (s->pict_type == B_TYPE ? "B" : "S")), 
+                 s->progressive_sequence ? "pro" :"", s->alternate_scan ? "alt" :"", s->top_field_first ? "top" :"", 
+                 s->intra_dc_precision, s->picture_structure, s->frame_pred_frame_dct, s->concealment_motion_vectors,
+                 s->q_scale_type, s->intra_vlc_format, s->repeat_first_field, s->chroma_420_type ? "420" :"");
+        }
     }
 
     init_get_bits(&s->gb, buf, buf_size);
@@ -1682,16 +1764,16 @@ eos: //end of slice
 
         MPV_frame_end(s);
 
-        if (s->pict_type == B_TYPE) {
-            *pict= *(AVVideoFrame*)&s->current_picture;
+        if (s->pict_type == B_TYPE || s->low_delay) {
+            *pict= *(AVFrame*)&s->current_picture;
         } else {
             s->picture_number++;
             /* latency of 1 frame for I and P frames */
             /* XXX: use another variable than picture_number */
-            if (s->picture_number == 1) {
+            if (s->last_picture.data[0] == NULL) {
                 return DECODE_SLICE_OK;
             } else {
-                *pict= *(AVVideoFrame*)&s->last_picture;
+                *pict= *(AVFrame*)&s->last_picture;
             }
         }
         return DECODE_SLICE_EOP;
@@ -1706,12 +1788,18 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     Mpeg1Context *s1 = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
     int width, height, i, v, j;
+    float aspect;
 
     init_get_bits(&s->gb, buf, buf_size);
 
     width = get_bits(&s->gb, 12);
     height = get_bits(&s->gb, 12);
-    skip_bits(&s->gb, 4);
+    s->aspect_ratio_info= get_bits(&s->gb, 4);
+    if(!s->mpeg2){
+        aspect= mpeg1_aspect[s->aspect_ratio_info];
+        if(aspect!=0.0) avctx->aspect_ratio= width/(aspect*height);
+    }
+
     s->frame_rate_index = get_bits(&s->gb, 4);
     if (s->frame_rate_index == 0)
         return -1;
@@ -1730,7 +1818,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         }
         s->width = width;
         s->height = height;
-        avctx->has_b_frames= s->has_b_frames = 1;
+        avctx->has_b_frames= 1;
         s->avctx = avctx;
         avctx->width = width;
         avctx->height = height;
@@ -1813,7 +1901,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
     Mpeg1Context *s = avctx->priv_data;
     UINT8 *buf_end, *buf_ptr, *buf_start;
     int len, start_code_found, ret, code, start_code, input_size;
-    AVVideoFrame *picture = data;
+    AVFrame *picture = data;
     MpegEncContext *s2 = &s->mpeg_enc_ctx;
             
     dprintf("fill_buffer\n");
@@ -1823,9 +1911,9 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
     /* special case for last picture */
     if (buf_size == 0) {
         if (s2->picture_number > 0) {
-            *picture= *(AVVideoFrame*)&s2->next_picture;
+            *picture= *(AVFrame*)&s2->next_picture;
 
-            *data_size = sizeof(AVVideoFrame);
+            *data_size = sizeof(AVFrame);
         }
         return 0;
     }
@@ -1896,15 +1984,16 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
                                           s->buffer, input_size);
                     break;
                 default:
-                    /* skip b frames if we dont have reference frames */
-                    if(s2->last_picture.data[0]==NULL && s2->pict_type==B_TYPE) break;
-                    /* skip b frames if we are in a hurry */
-                    if(avctx->hurry_up && s2->pict_type==B_TYPE) break;
-                    /* skip everything if we are in a hurry>=5 */
-                    if(avctx->hurry_up>=5) break;
-
                     if (start_code >= SLICE_MIN_START_CODE &&
                         start_code <= SLICE_MAX_START_CODE) {
+                        
+                        /* skip b frames if we dont have reference frames */
+                        if(s2->last_picture.data[0]==NULL && s2->pict_type==B_TYPE) break;
+                        /* skip b frames if we are in a hurry */
+                        if(avctx->hurry_up && s2->pict_type==B_TYPE) break;
+                        /* skip everything if we are in a hurry>=5 */
+                        if(avctx->hurry_up>=5) break;
+
                         ret = mpeg_decode_slice(avctx, picture,
                                                 start_code, s->buffer, input_size);
                         if (ret == DECODE_SLICE_EOP) {
diff --git a/src/libffmpeg/libavcodec/mpeg12data.h b/src/libffmpeg/libavcodec/mpeg12data.h
index a1a7166be..8bf063ec1 100644
--- a/src/libffmpeg/libavcodec/mpeg12data.h
+++ b/src/libffmpeg/libavcodec/mpeg12data.h
@@ -411,4 +411,32 @@ UINT8 ff_mpeg1_dc_scale_table[128]={ // MN: mpeg2 really can have such large qsc
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 };
 
+static const float mpeg1_aspect[16]={ /* indexed by the 4-bit aspect_ratio_info */
+    0.0000, /* 0.0: the mpeg1 sequence decoder leaves aspect_ratio unchanged */
+    1.0000, /* non-zero entries: decoder sets aspect_ratio = width/(entry*height) */
+    0.6735,
+    0.7031,
+    
+    0.7615,
+    0.8055,
+    0.8437,
+    0.8935,
+
+    0.9157,
+    0.9815,
+    1.0255,
+    1.0695,
+
+    1.0950,
+    1.1575,
+    1.2015,
+}; /* 15 initializers for 16 slots: entry 15 is implicitly 0.0 */
+
+static const float mpeg2_aspect[16]={ /* indexed by aspect_ratio_info; unlisted entries are 0 */
+    0, /* 0: the sequence extension decoder leaves aspect_ratio unchanged */
+    1.0, /* positive: aspect_ratio = width/(entry*height) */
+    -3.0/4.0, /* negative: aspect_ratio = -1.0/entry, i.e. 4/3 here */
+    -9.0/16.0, /* gives 16/9 */
+    -1.0/2.21, /* gives 2.21 */
+};
 
diff --git a/src/libffmpeg/libavcodec/mpeg4data.h b/src/libffmpeg/libavcodec/mpeg4data.h
index ef612c2fa..54a280d07 100644
--- a/src/libffmpeg/libavcodec/mpeg4data.h
+++ b/src/libffmpeg/libavcodec/mpeg4data.h
@@ -4,8 +4,13 @@
 #define BIN_ONLY_SHAPE   2
 #define GRAY_SHAPE       3
 
-#define SIMPLE_VO_TYPE 1
-#define CORE_VO_TYPE   3
+#define SIMPLE_VO_TYPE             1
+#define CORE_VO_TYPE               3
+#define MAIN_VO_TYPE               4
+#define NBIT_VO_TYPE               5
+#define ARTS_VO_TYPE               10
+#define ACE_VO_TYPE                12
+#define ADV_SIMPLE_VO_TYPE         17
 
 // aspect_ratio_info
 #define EXTENDED_PAR 15
@@ -22,6 +27,12 @@
 #define MB_TYPE_B_BACKW  2
 #define MB_TYPE_B_FORW   3
 
+#define VOS_STARTCODE        0x1B0
+#define USER_DATA_STARTCODE  0x1B2
+#define GOP_STARTCODE        0x1B3
+#define VISUAL_OBJ_STARTCODE 0x1B5
+#define VOP_STARTCODE        0x1B6
+
 /* dc encoding for mpeg4 */
 const UINT8 DCtab_lum[13][2] =
 {
diff --git a/src/libffmpeg/libavcodec/mpegaudio.h b/src/libffmpeg/libavcodec/mpegaudio.h
new file mode 100644
index 000000000..3381dbed2
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mpegaudio.h
@@ -0,0 +1,27 @@
+/* mpeg audio declarations for both encoder and decoder */
+
+/* max frame size, in samples */
+#define MPA_FRAME_SIZE 1152 
+
+/* max compressed frame size */
+#define MPA_MAX_CODED_FRAME_SIZE 1792
+
+#define MPA_MAX_CHANNELS 2
+
+#define SBLIMIT 32 /* number of subbands */
+
+#define MPA_STEREO  0
+#define MPA_JSTEREO 1
+#define MPA_DUAL    2
+#define MPA_MONO    3
+
+int l2_select_table(int bitrate, int nb_channels, int freq, int lsf);
+
+extern const UINT16 mpa_bitrate_tab[2][3][15];
+extern const UINT16 mpa_freq_tab[3];
+extern const unsigned char *alloc_tables[5];
+extern const double enwindow[512];
+extern const int sblimit_table[5];
+extern const int quant_steps[17];
+extern const int quant_bits[17];
+extern const INT32 mpa_enwindow[257];
diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c
new file mode 100644
index 000000000..b2c0966aa
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mpegaudiodec.c
@@ -0,0 +1,2499 @@
+/*
+ * MPEG Audio decoder
+ * Copyright (c) 2001, 2002 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+//#define DEBUG
+#include "avcodec.h"
+#include "mpegaudio.h"
+
+/*
+ * TODO:
+ *  - in low precision mode, use more 16 bit multiplies in synth filter
+ *  - test lsf / mpeg25 extensively.
+ */
+
+/* define USE_HIGHPRECISION to have a bit exact (but slower) mpeg
+   audio decoder */
+#ifdef CONFIG_MPEGAUDIO_HP
+#define USE_HIGHPRECISION
+#endif
+
+#ifdef USE_HIGHPRECISION
+#define FRAC_BITS   23   /* fractional bits for sb_samples and dct */
+#define WFRAC_BITS  16   /* fractional bits for window */
+#else
+#define FRAC_BITS   15   /* fractional bits for sb_samples and dct */
+#define WFRAC_BITS  14   /* fractional bits for window */
+#endif
+
+#define FRAC_ONE    (1 << FRAC_BITS)
+
+#define MULL(a,b) (((INT64)(a) * (INT64)(b)) >> FRAC_BITS)
+#define MUL64(a,b) ((INT64)(a) * (INT64)(b))
+#define FIX(a)   ((int)((a) * FRAC_ONE))
+/* WARNING: only correct for positive numbers */
+#define FIXR(a)   ((int)((a) * FRAC_ONE + 0.5))
+#define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS)
+
+#if FRAC_BITS <= 15
+typedef INT16 MPA_INT;
+#else
+typedef INT32 MPA_INT;
+#endif
+
+/****************/
+
+#define HEADER_SIZE 4
+#define BACKSTEP_SIZE 512
+
+typedef struct MPADecodeContext {
+    UINT8 inbuf1[2][MPA_MAX_CODED_FRAME_SIZE + BACKSTEP_SIZE];	/* input buffer */
+    int inbuf_index; /* row of inbuf1 that inbuf points into (0 after decode_init) */
+    UINT8 *inbuf_ptr, *inbuf; /* decode_init: inbuf = BACKSTEP_SIZE bytes into the row, inbuf_ptr = inbuf */
+    int frame_size;
+    int free_format_frame_size; /* frame size in case of free format
+                                   (zero if currently unknown) */
+    /* next header (used in free format parsing) */
+    UINT32 free_format_next_header; 
+    int error_protection;
+    int layer;
+    int sample_rate;
+    int sample_rate_index; /* between 0 and 8 */
+    int bit_rate;
+    int old_frame_size;
+    GetBitContext gb;
+    int nb_channels;
+    int mode;
+    int mode_ext;
+    int lsf;
+    MPA_INT synth_buf[MPA_MAX_CHANNELS][512 * 2]; /* MPA_INT is 16 or 32 bits depending on FRAC_BITS */
+    int synth_buf_offset[MPA_MAX_CHANNELS];
+    INT32 sb_samples[MPA_MAX_CHANNELS][36][SBLIMIT];
+    INT32 mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */
+#ifdef DEBUG
+    int frame_count; /* reset to 0 by decode_init */
+#endif
+} MPADecodeContext;
+
+/* layer 3 "granule" */
+typedef struct GranuleDef {
+    UINT8 scfsi;
+    int part2_3_length;
+    int big_values;
+    int global_gain;
+    int scalefac_compress;
+    UINT8 block_type;
+    UINT8 switch_point;
+    int table_select[3];
+    int subblock_gain[3];
+    UINT8 scalefac_scale;
+    UINT8 count1table_select;
+    int region_size[3]; /* number of huffman codes in each region */
+    int preflag;
+    int short_start, long_end; /* long/short band indexes */
+    UINT8 scale_factors[40];
+    INT32 sb_hybrid[SBLIMIT * 18]; /* 576 samples (SBLIMIT = 32 subbands, 18 each) */
+} GranuleDef;
+
+#define MODE_EXT_MS_STEREO 2
+#define MODE_EXT_I_STEREO  1
+
+/* layer 3 huffman tables */
+typedef struct HuffTable {
+    int xsize; /* decode_init builds xsize * xsize codes per table */
+    const UINT8 *bits; /* 1-byte entries handed to init_vlc; presumably code lengths - confirm */
+    const UINT16 *codes; /* 2-byte entries handed to init_vlc alongside bits */
+} HuffTable;
+
+#include "mpegaudiodectab.h"
+
+/* vlc structure for decoding layer 3 huffman tables */
+static VLC huff_vlc[16]; 
+static UINT8 *huff_code_table[16];
+static VLC huff_quad_vlc[2];
+/* computed from band_size_long */
+static UINT16 band_index_long[9][23];
+/* XXX: free when all decoders are closed */
+#define TABLE_4_3_SIZE (8191 + 16)
+static INT8  *table_4_3_exp;
+#if FRAC_BITS <= 15
+static UINT16 *table_4_3_value;
+#else
+static UINT32 *table_4_3_value;
+#endif
+/* intensity stereo coef table */
+static INT32 is_table[2][16];
+static INT32 is_table_lsf[2][2][16];
+static INT32 csa_table[8][2];
+static INT32 mdct_win[8][36];
+
+/* lower 2 bits: modulo 3, higher bits: shift */
+static UINT16 scale_factor_modshift[64];
+/* [i][j]:  2^(-j/3) * FRAC_ONE * 2^(i+2) / (2^(i+2) - 1) */
+static INT32 scale_factor_mult[15][3];
+/* mult table for layer 2 group quantization */
+
+#define SCALE_GEN(v) \
+{ FIXR(1.0 * (v)), FIXR(0.7937005259 * (v)), FIXR(0.6299605249 * (v)) }
+
+static INT32 scale_factor_mult2[3][3] = {
+    SCALE_GEN(4.0 / 3.0), /* 3 steps */
+    SCALE_GEN(4.0 / 5.0), /* 5 steps */
+    SCALE_GEN(4.0 / 9.0), /* 9 steps */
+};
+
+/* 2^(n/4) */
+static UINT32 scale_factor_mult3[4] = {
+    FIXR(1.0),
+    FIXR(1.18920711500272106671),
+    FIXR(1.41421356237309504880),
+    FIXR(1.68179283050742908605),
+};
+
+static MPA_INT window[512];
+    
+/* layer 1 unscaling: returns mantissa 'mant' rescaled by scale factor index 'scale_factor' */
+/* n = number of bits of the mantissa minus 1 */
+static inline int l1_unscale(int n, int mant, int scale_factor)
+{
+    int shift, mod;
+    INT64 val;
+
+    shift = scale_factor_modshift[scale_factor]; /* packed as mod | (shift << 2) */
+    mod = shift & 3;
+    shift >>= 2;
+    val = MUL64(mant - (1 << n) + 1, scale_factor_mult[n-1][mod]); /* not (-1 << n): shifting a negative value is undefined */
+    shift += n;
+    /* NOTE: at this point, 1 <= shift <= 21 + 15 */
+    return (int)((val + (1LL << (shift - 1))) >> shift); /* round to nearest */
+}
+
+static inline int l2_unscale_group(int steps, int mant, int scale_factor)
+{
+    /* table entry is packed: low 2 bits select the multiplier, upper bits are the shift */
+    const int packed = scale_factor_modshift[scale_factor];
+    const int rem = packed & 3;
+    const int rshift = packed >> 2;
+    /* center the mantissa on zero, then scale it for this number of steps */
+    int result = (mant - (steps >> 1)) * scale_factor_mult2[steps >> 2][rem];
+    /* NOTE: at this point, 0 <= rshift <= 21 */
+    if (rshift <= 0)
+        return result;
+    /* right shift with rounding to nearest */
+    return (result + (1 << (rshift - 1))) >> rshift;
+}
+
+/* compute value^(4/3) * 2^(exponent/4). The result is normalized to FRAC_BITS */
+static inline int l3_unscale(int value, int exponent)
+{
+#if FRAC_BITS <= 15    
+    unsigned int m;
+#else
+    UINT64 m;
+#endif
+    int e;
+
+    e = table_4_3_exp[value]; /* binary exponent of value^(4/3); value indexes the TABLE_4_3_SIZE tables */
+    e += (exponent >> 2); /* integer part of exponent/4 */
+    e = FRAC_BITS - e; /* final right-shift amount for the mantissa product */
+#if FRAC_BITS <= 15    
+    if (e > 31) /* keep the shift below the width of the 32-bit m */
+        e = 31;
+#endif
+    m = table_4_3_value[value]; /* mantissa of value^(4/3) */
+#if FRAC_BITS <= 15    
+    m = (m * scale_factor_mult3[exponent & 3]); /* times 2^((exponent & 3)/4) */
+    m = (m + (1 << (e-1))) >> e; /* rounding shift; NOTE(review): needs e >= 1 - confirm callers */
+    return m;
+#else
+    m = MUL64(m, scale_factor_mult3[exponent & 3]); /* times 2^((exponent & 3)/4) */
+    m = (m + (UINT64_C(1) << (e-1))) >> e; /* rounding shift; NOTE(review): needs e >= 1 - confirm callers */
+    return m;
+#endif
+}
+
+/* all integer n^(4/3) computation code */
+#define DEV_ORDER 13
+
+#define POW_FRAC_BITS 24
+#define POW_FRAC_ONE    (1 << POW_FRAC_BITS)
+#define POW_FIX(a)   ((int)((a) * POW_FRAC_ONE))
+#define POW_MULL(a,b) (((INT64)(a) * (INT64)(b)) >> POW_FRAC_BITS)
+
+static int dev_4_3_coefs[DEV_ORDER];
+
+static int pow_mult3[3] = { /* 2^(k/3) for k = 0..2, in POW_FRAC_BITS fixed point */
+    POW_FIX(1.0),
+    POW_FIX(1.25992104989487316476),
+    POW_FIX(1.58740105196819947474),
+};
+
+static void int_pow_init(void)
+{
+    int k, coef = POW_FIX(1.0);
+
+    /* coef[k] = coef[k-1] * (4/3 - k) / (k + 1), kept in POW_FRAC_BITS fixed point */
+    for (k = 0; k < DEV_ORDER; k++) {
+        coef = POW_MULL(coef, POW_FIX(4.0 / 3.0) - k * POW_FIX(1.0)) / (k + 1);
+        dev_4_3_coefs[k] = coef;
+    }
+}
+
+/* integer i^(4/3): return the mantissa and store the binary exponent in *exp_ptr */
+static int int_pow(int i, int *exp_ptr)
+{
+    int e, er, eq, j;
+    int a, a1;
+    /* i must be > 0: with i == 0 the renormalize loop below never terminates */
+    /* renormalize */
+    a = i;
+    e = POW_FRAC_BITS;
+    while (a < (1 << (POW_FRAC_BITS - 1))) { /* afterwards i = f * 2^e with f = a / 2^POW_FRAC_BITS in [0.5, 1) */
+        a = a << 1;
+        e--;
+    }
+    a -= (1 << POW_FRAC_BITS); /* a = f - 1, the polynomial variable */
+    a1 = 0;
+    for(j = DEV_ORDER - 1; j >= 0; j--) /* Horner evaluation of the dev_4_3_coefs polynomial */
+        a1 = POW_MULL(a, dev_4_3_coefs[j] + a1);
+    a = (1 << POW_FRAC_BITS) + a1; /* 1 + polynomial value */
+    /* exponent compute (exact) */
+    e = e * 4;
+    er = e % 3; /* 4e/3 split into integer part eq and remainder er */
+    eq = e / 3;
+    a = POW_MULL(a, pow_mult3[er]); /* fold the remainder in as 2^(er/3) */
+    while (a >= 2 * POW_FRAC_ONE) {
+        a = a >> 1;
+        eq++;
+    }
+    /* convert to float */
+    while (a < POW_FRAC_ONE) {
+        a = a << 1;
+        eq--;
+    }
+    /* now POW_FRAC_ONE <= a < 2 * POW_FRAC_ONE */
+#if POW_FRAC_BITS > FRAC_BITS
+    a = (a + (1 << (POW_FRAC_BITS - FRAC_BITS - 1))) >> (POW_FRAC_BITS - FRAC_BITS); /* round to FRAC_BITS fractional bits */
+    /* correct overflow */
+    if (a >= 2 * (1 << FRAC_BITS)) {
+        a = a >> 1;
+        eq++;
+    }
+#endif
+    *exp_ptr = eq;
+    return a;
+}
+
+static int decode_init(AVCodecContext * avctx)
+{
+    MPADecodeContext *s = avctx->priv_data;
+    static int init=0; /* set once the shared static tables have been built */
+    int i, j, k;
+    /* Build the file-scope tables on the first call, then reset this context's input buffer; returns 0, or -1 if an allocation fails. */
+    if(!init) {
+        /* scale factors table for layer 1/2 */
+        for(i=0;i<64;i++) {
+            int shift, mod;
+            /* 1.0 (i = 3) is normalized to 2 ^ FRAC_BITS */
+            shift = (i / 3);
+            mod = i % 3;
+            scale_factor_modshift[i] = mod | (shift << 2);
+        }
+
+        /* scale factor multiply for layer 1 */
+        for(i=0;i<15;i++) {
+            int n, norm;
+            n = i + 2;
+            norm = ((INT64_C(1) << n) * FRAC_ONE) / ((1 << n) - 1);
+            scale_factor_mult[i][0] = MULL(FIXR(1.0 * 2.0), norm);
+            scale_factor_mult[i][1] = MULL(FIXR(0.7937005259 * 2.0), norm);
+            scale_factor_mult[i][2] = MULL(FIXR(0.6299605249 * 2.0), norm);
+            dprintf("%d: norm=%x s=%x %x %x\n",
+                    i, norm, 
+                    scale_factor_mult[i][0],
+                    scale_factor_mult[i][1],
+                    scale_factor_mult[i][2]);
+        }
+        
+        /* window */
+        /* max = 18760, max sum over all 16 coefs : 44736 */
+        for(i=0;i<257;i++) {
+            int v;
+            v = mpa_enwindow[i];
+#if WFRAC_BITS < 16
+            v = (v + (1 << (16 - WFRAC_BITS - 1))) >> (16 - WFRAC_BITS);
+#endif
+            window[i] = v;
+            if ((i & 63) != 0)
+                v = -v;
+            if (i != 0)
+                window[512 - i] = v; /* second half is mirrored, negated except where i is a multiple of 64 */
+        }
+        
+        /* huffman decode tables */
+        huff_code_table[0] = NULL;
+        for(i=1;i<16;i++) {
+            const HuffTable *h = &mpa_huff_tables[i];
+            int xsize, n, x, y;
+            UINT8 *code_table;
+
+            xsize = h->xsize;
+            n = xsize * xsize;
+            /* XXX: fail test */
+            init_vlc(&huff_vlc[i], 8, n, 
+                     h->bits, 1, 1, h->codes, 2, 2);
+            code_table = av_mallocz(n);
+            if (!code_table) return -1; /* out of memory: fail like the table_4_3 allocations below */
+            j = 0;
+            for(x=0;x<xsize;x++) {
+                for(y=0;y<xsize;y++)
+                    code_table[j++] = (x << 4) | y; /* x in the high nibble, y in the low nibble */
+            }
+            huff_code_table[i] = code_table;
+        }
+        for(i=0;i<2;i++) {
+            init_vlc(&huff_quad_vlc[i], i == 0 ? 7 : 4, 16, 
+                     mpa_quad_bits[i], 1, 1, mpa_quad_codes[i], 1, 1);
+        }
+
+        for(i=0;i<9;i++) {
+            k = 0;
+            for(j=0;j<22;j++) {
+                band_index_long[i][j] = k; /* running sum of band_size_long[i][0..j-1] */
+                k += band_size_long[i][j];
+            }
+            band_index_long[i][22] = k;
+        }
+
+	/* compute n ^ (4/3) and store it in mantissa/exp format */
+	if (!av_mallocz_static(&table_4_3_exp,
+			       TABLE_4_3_SIZE * sizeof(table_4_3_exp[0])))
+	    return -1;
+	if (!av_mallocz_static(&table_4_3_value,
+			       TABLE_4_3_SIZE * sizeof(table_4_3_value[0])))
+            return -1;
+        
+        int_pow_init();
+        for(i=1;i<TABLE_4_3_SIZE;i++) {
+            int e, m;
+            m = int_pow(i, &e);
+#if 0
+            /* test code */
+            {
+                double f, fm;
+                int e1, m1;
+                f = pow((double)i, 4.0 / 3.0);
+                fm = frexp(f, &e1);
+                m1 = FIXR(2 * fm);
+#if FRAC_BITS <= 15
+                if ((unsigned short)m1 != m1) {
+                    m1 = m1 >> 1;
+                    e1++;
+                }
+#endif
+                e1--;
+                if (m != m1 || e != e1) {
+                    printf("%4d: m=%x m1=%x e=%d e1=%d\n",
+                           i, m, m1, e, e1);
+                }
+            }
+#endif
+            /* normalized to FRAC_BITS */
+            table_4_3_value[i] = m;
+            table_4_3_exp[i] = e;
+        }
+        
+        for(i=0;i<7;i++) {
+            float f;
+            int v;
+            if (i != 6) {
+                f = tan((double)i * M_PI / 12.0);
+                v = FIXR(f / (1.0 + f));
+            } else {
+                v = FIXR(1.0);
+            }
+            is_table[0][i] = v;
+            is_table[1][6 - i] = v;
+        }
+        /* invalid values */
+        for(i=7;i<16;i++)
+            is_table[0][i] = is_table[1][i] = 0;
+
+        for(i=0;i<16;i++) {
+            double f;
+            int e, k;
+
+            for(j=0;j<2;j++) {
+                e = -(j + 1) * ((i + 1) >> 1);
+                f = pow(2.0, e / 4.0);
+                k = i & 1;
+                is_table_lsf[j][k ^ 1][i] = FIXR(f);
+                is_table_lsf[j][k][i] = FIXR(1.0);
+                dprintf("is_table_lsf %d %d: %x %x\n", 
+                        i, j, is_table_lsf[j][0][i], is_table_lsf[j][1][i]);
+            }
+        }
+
+        for(i=0;i<8;i++) {
+            float ci, cs, ca;
+            ci = ci_table[i];
+            cs = 1.0 / sqrt(1.0 + ci * ci);
+            ca = cs * ci;
+            csa_table[i][0] = FIX(cs);
+            csa_table[i][1] = FIX(ca);
+        }
+
+        /* compute mdct windows */
+        for(i=0;i<36;i++) {
+            int v;
+            v = FIXR(sin(M_PI * (i + 0.5) / 36.0));
+            mdct_win[0][i] = v;
+            mdct_win[1][i] = v;
+            mdct_win[3][i] = v;
+        }
+        for(i=0;i<6;i++) {
+            mdct_win[1][18 + i] = FIXR(1.0);
+            mdct_win[1][24 + i] = FIXR(sin(M_PI * ((i + 6) + 0.5) / 12.0));
+            mdct_win[1][30 + i] = FIXR(0.0);
+
+            mdct_win[3][i] = FIXR(0.0);
+            mdct_win[3][6 + i] = FIXR(sin(M_PI * (i + 0.5) / 12.0));
+            mdct_win[3][12 + i] = FIXR(1.0);
+        }
+
+        for(i=0;i<12;i++)
+            mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0));
+        
+        /* NOTE: we do frequency inversion after the MDCT by changing
+           the sign of the right window coefs */
+        for(j=0;j<4;j++) {
+            for(i=0;i<36;i+=2) {
+                mdct_win[j + 4][i] = mdct_win[j][i];
+                mdct_win[j + 4][i + 1] = -mdct_win[j][i + 1];
+            }
+        }
+
+#if defined(DEBUG)
+        for(j=0;j<8;j++) {
+            printf("win%d=\n", j);
+            for(i=0;i<36;i++)
+                printf("%f, ", (double)mdct_win[j][i] / FRAC_ONE);
+            printf("\n");
+        }
+#endif
+        init = 1;
+    }
+
+    s->inbuf_index = 0; /* per-context state: start in the first input buffer */
+    s->inbuf = &s->inbuf1[s->inbuf_index][BACKSTEP_SIZE]; /* leave BACKSTEP_SIZE bytes in front of the data */
+    s->inbuf_ptr = s->inbuf;
+#ifdef DEBUG
+    s->frame_count = 0;
+#endif
+    return 0;
+}
+
+/* tab[i][j] = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))) */;
+
+/* cos(i*pi/64) */
+
+#define COS0_0  FIXR(0.50060299823519630134)
+#define COS0_1  FIXR(0.50547095989754365998)
+#define COS0_2  FIXR(0.51544730992262454697)
+#define COS0_3  FIXR(0.53104259108978417447)
+#define COS0_4  FIXR(0.55310389603444452782)
+#define COS0_5  FIXR(0.58293496820613387367)
+#define COS0_6  FIXR(0.62250412303566481615)
+#define COS0_7  FIXR(0.67480834145500574602)
+#define COS0_8  FIXR(0.74453627100229844977)
+#define COS0_9  FIXR(0.83934964541552703873)
+#define COS0_10 FIXR(0.97256823786196069369)
+#define COS0_11 FIXR(1.16943993343288495515)
+#define COS0_12 FIXR(1.48416461631416627724)
+#define COS0_13 FIXR(2.05778100995341155085)
+#define COS0_14 FIXR(3.40760841846871878570)
+#define COS0_15 FIXR(10.19000812354805681150)
+
+#define COS1_0 FIXR(0.50241928618815570551)
+#define COS1_1 FIXR(0.52249861493968888062)
+#define COS1_2 FIXR(0.56694403481635770368)
+#define COS1_3 FIXR(0.64682178335999012954)
+#define COS1_4 FIXR(0.78815462345125022473)
+#define COS1_5 FIXR(1.06067768599034747134)
+#define COS1_6 FIXR(1.72244709823833392782)
+#define COS1_7 FIXR(5.10114861868916385802)
+
+#define COS2_0 FIXR(0.50979557910415916894)
+#define COS2_1 FIXR(0.60134488693504528054)
+#define COS2_2 FIXR(0.89997622313641570463)
+#define COS2_3 FIXR(2.56291544774150617881)
+
+#define COS3_0 FIXR(0.54119610014619698439)
+#define COS3_1 FIXR(1.30656296487637652785)
+
+#define COS4_0 FIXR(0.70710678118654752439)
+
+/* butterfly operator */
+#define BF(a, b, c)\
+{\
+    tmp0 = tab[a] + tab[b];\
+    tmp1 = tab[a] - tab[b];\
+    tab[a] = tmp0;\
+    tab[b] = MULL(tmp1, c);\
+}
+
+#define BF1(a, b, c, d)\
+{\
+    BF(a, b, COS4_0);\
+    BF(c, d, -COS4_0);\
+    tab[c] += tab[d];\
+}
+
+#define BF2(a, b, c, d)\
+{\
+    BF(a, b, COS4_0);\
+    BF(c, d, -COS4_0);\
+    tab[c] += tab[d];\
+    tab[a] += tab[c];\
+    tab[c] += tab[b];\
+    tab[b] += tab[d];\
+}
+
+#define ADD(a, b) tab[a] += tab[b]
+
+/* DCT32 without 1/sqrt(2) coef zero scaling. The 32 values of tab
+   are the input and are overwritten as working storage; the 32
+   results are stored in out. */
+static void dct32(INT32 *out, INT32 *tab)
+{
+    int tmp0, tmp1;
+
+    /* pass 1: tab[i] against tab[31 - i] */
+    BF(0, 31, COS0_0);
+    BF(1, 30, COS0_1);
+    BF(2, 29, COS0_2);
+    BF(3, 28, COS0_3);
+    BF(4, 27, COS0_4);
+    BF(5, 26, COS0_5);
+    BF(6, 25, COS0_6);
+    BF(7, 24, COS0_7);
+    BF(8, 23, COS0_8);
+    BF(9, 22, COS0_9);
+    BF(10, 21, COS0_10);
+    BF(11, 20, COS0_11);
+    BF(12, 19, COS0_12);
+    BF(13, 18, COS0_13);
+    BF(14, 17, COS0_14);
+    BF(15, 16, COS0_15);
+
+    /* pass 2: same pairing inside each half of 16 entries; the upper
+       half uses the negated coefficients */
+    BF(0, 15, COS1_0);
+    BF(1, 14, COS1_1);
+    BF(2, 13, COS1_2);
+    BF(3, 12, COS1_3);
+    BF(4, 11, COS1_4);
+    BF(5, 10, COS1_5);
+    BF(6,  9, COS1_6);
+    BF(7,  8, COS1_7);
+    
+    BF(16, 31, -COS1_0);
+    BF(17, 30, -COS1_1);
+    BF(18, 29, -COS1_2);
+    BF(19, 28, -COS1_3);
+    BF(20, 27, -COS1_4);
+    BF(21, 26, -COS1_5);
+    BF(22, 25, -COS1_6);
+    BF(23, 24, -COS1_7);
+    
+    /* pass 3: same pairing inside each group of 8 entries, the sign
+       of the coefficients alternating from group to group */
+    BF(0, 7, COS2_0);
+    BF(1, 6, COS2_1);
+    BF(2, 5, COS2_2);
+    BF(3, 4, COS2_3);
+    
+    BF(8, 15, -COS2_0);
+    BF(9, 14, -COS2_1);
+    BF(10, 13, -COS2_2);
+    BF(11, 12, -COS2_3);
+    
+    BF(16, 23, COS2_0);
+    BF(17, 22, COS2_1);
+    BF(18, 21, COS2_2);
+    BF(19, 20, COS2_3);
+    
+    BF(24, 31, -COS2_0);
+    BF(25, 30, -COS2_1);
+    BF(26, 29, -COS2_2);
+    BF(27, 28, -COS2_3);
+
+    /* pass 4: same pairing inside each group of 4 entries */
+    BF(0, 3, COS3_0);
+    BF(1, 2, COS3_1);
+    
+    BF(4, 7, -COS3_0);
+    BF(5, 6, -COS3_1);
+    
+    BF(8, 11, COS3_0);
+    BF(9, 10, COS3_1);
+    
+    BF(12, 15, -COS3_0);
+    BF(13, 14, -COS3_1);
+    
+    BF(16, 19, COS3_0);
+    BF(17, 18, COS3_1);
+    
+    BF(20, 23, -COS3_0);
+    BF(21, 22, -COS3_1);
+    
+    BF(24, 27, COS3_0);
+    BF(25, 26, COS3_1);
+    
+    BF(28, 31, -COS3_0);
+    BF(29, 30, -COS3_1);
+    
+    /* pass 5: last butterflies (COS4_0), BF1 and BF2 alternating on
+       consecutive groups of 4 entries */
+    BF1(0, 1, 2, 3);
+    BF2(4, 5, 6, 7);
+    BF1(8, 9, 10, 11);
+    BF2(12, 13, 14, 15);
+    BF1(16, 17, 18, 19);
+    BF2(20, 21, 22, 23);
+    BF1(24, 25, 26, 27);
+    BF2(28, 29, 30, 31);
+    
+    /* pass 6: final additions. The even outputs are tab[0..15] stored
+       at bit reversed positions; the odd outputs are sums of two
+       entries of tab[16..31], except out[31] which is tab[31] alone */
+    
+    ADD( 8, 12);
+    ADD(12, 10);
+    ADD(10, 14);
+    ADD(14,  9);
+    ADD( 9, 13);
+    ADD(13, 11);
+    ADD(11, 15);
+
+    out[ 0] = tab[0];
+    out[16] = tab[1];
+    out[ 8] = tab[2];
+    out[24] = tab[3];
+    out[ 4] = tab[4];
+    out[20] = tab[5];
+    out[12] = tab[6];
+    out[28] = tab[7];
+    out[ 2] = tab[8];
+    out[18] = tab[9];
+    out[10] = tab[10];
+    out[26] = tab[11];
+    out[ 6] = tab[12];
+    out[22] = tab[13];
+    out[14] = tab[14];
+    out[30] = tab[15];
+    
+    ADD(24, 28);
+    ADD(28, 26);
+    ADD(26, 30);
+    ADD(30, 25);
+    ADD(25, 29);
+    ADD(29, 27);
+    ADD(27, 31);
+
+    out[ 1] = tab[16] + tab[24];
+    out[17] = tab[17] + tab[25];
+    out[ 9] = tab[18] + tab[26];
+    out[25] = tab[19] + tab[27];
+    out[ 5] = tab[20] + tab[28];
+    out[21] = tab[21] + tab[29];
+    out[13] = tab[22] + tab[30];
+    out[29] = tab[23] + tab[31];
+    out[ 3] = tab[24] + tab[20];
+    out[19] = tab[25] + tab[21];
+    out[11] = tab[26] + tab[22];
+    out[27] = tab[27] + tab[23];
+    out[ 7] = tab[28] + tab[18];
+    out[23] = tab[29] + tab[19];
+    out[15] = tab[30] + tab[17];
+    out[31] = tab[31];
+}
+
+#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
+/* OUT_SAMPLE(sum) rounds sum, shifts it right by OUT_SHIFT, clamps
+   the result to the 16 bit range [-32768, 32767], stores it in
+   *samples and advances samples by incr. SUM8(off, op) applies op
+   (used with += and -=) to sum for the 8 products
+   w[k * 64 + off] * p[k * 64], k = 0..7. Both rely on variables of
+   the function they are expanded in (sum, samples, incr, w, p). Two
+   variants exist: plain int arithmetic when FRAC_BITS <= 15, 64 bit
+   arithmetic (INT64_C, MUL64) otherwise. */
+#if FRAC_BITS <= 15
+
+#define OUT_SAMPLE(sum)\
+{\
+    int sum1;\
+    sum1 = (sum + (1 << (OUT_SHIFT - 1))) >> OUT_SHIFT;\
+    if (sum1 < -32768)\
+        sum1 = -32768;\
+    else if (sum1 > 32767)\
+        sum1 = 32767;\
+    *samples = sum1;\
+    samples += incr;\
+}
+
+#define SUM8(off, op)                           \
+{                                               \
+    sum op w[0 * 64 + off] * p[0 * 64];\
+    sum op w[1 * 64 + off] * p[1 * 64];\
+    sum op w[2 * 64 + off] * p[2 * 64];\
+    sum op w[3 * 64 + off] * p[3 * 64];\
+    sum op w[4 * 64 + off] * p[4 * 64];\
+    sum op w[5 * 64 + off] * p[5 * 64];\
+    sum op w[6 * 64 + off] * p[6 * 64];\
+    sum op w[7 * 64 + off] * p[7 * 64];\
+}
+
+#else
+
+#define OUT_SAMPLE(sum)\
+{\
+    int sum1;\
+    sum1 = (int)((sum + (INT64_C(1) << (OUT_SHIFT - 1))) >> OUT_SHIFT);\
+    if (sum1 < -32768)\
+        sum1 = -32768;\
+    else if (sum1 > 32767)\
+        sum1 = 32767;\
+    *samples = sum1;\
+    samples += incr;\
+}
+
+#define SUM8(off, op)                           \
+{                                               \
+    sum op MUL64(w[0 * 64 + off], p[0 * 64]);\
+    sum op MUL64(w[1 * 64 + off], p[1 * 64]);\
+    sum op MUL64(w[2 * 64 + off], p[2 * 64]);\
+    sum op MUL64(w[3 * 64 + off], p[3 * 64]);\
+    sum op MUL64(w[4 * 64 + off], p[4 * 64]);\
+    sum op MUL64(w[5 * 64 + off], p[5 * 64]);\
+    sum op MUL64(w[6 * 64 + off], p[6 * 64]);\
+    sum op MUL64(w[7 * 64 + off], p[7 * 64]);\
+}
+
+#endif
+
+/* 32 sub band synthesis filter. Input: 32 sub band samples, Output:
+   32 samples. The output samples of channel ch are written to
+   samples[0], samples[incr], samples[2 * incr], ... sb_samples is
+   handed to dct32() as its work array and is therefore modified. The
+   per channel ring buffer position s1->synth_buf_offset[ch] is moved
+   back by 32 (modulo 512) on each call. */
+/* XXX: optimize by avoiding ring buffer usage */
+static void synth_filter(MPADecodeContext *s1,
+                         int ch, INT16 *samples, int incr, 
+                         INT32 sb_samples[SBLIMIT])
+{
+    INT32 tmp[32];
+    register MPA_INT *synth_buf, *p;
+    register MPA_INT *w;
+    int j, offset, v;
+#if FRAC_BITS <= 15
+    int sum;
+#else
+    INT64 sum;
+#endif
+
+    dct32(tmp, sb_samples);
+    
+    offset = s1->synth_buf_offset[ch];
+    synth_buf = s1->synth_buf[ch] + offset;
+
+    for(j=0;j<32;j++) {
+        v = tmp[j];
+#if FRAC_BITS <= 15
+        /* NOTE: can cause a loss in precision if very high amplitude
+           sound */
+        if (v > 32767)
+            v = 32767;
+        else if (v < -32768)
+            v = -32768;
+#endif
+        synth_buf[j] = v;
+    }
+    /* copy to avoid wrap: the 32 new values are mirrored 512 entries
+       further, the reads below reach up to synth_buf + 495 */
+    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(MPA_INT));
+
+    w = window;
+    for(j=0;j<16;j++) {
+        sum = 0;
+        p = synth_buf + 16 + j;    /* 0-15  */
+        SUM8(0, +=);
+        p = synth_buf + 48 - j;    /* 32-47 */
+        SUM8(32, -=);
+        OUT_SAMPLE(sum);
+        w++;
+    }
+    
+    p = synth_buf + 32; /* 48 */
+    sum = 0;
+    SUM8(32, -=);
+    OUT_SAMPLE(sum);
+    w++;
+
+    for(j=17;j<32;j++) {
+        sum = 0;
+        p = synth_buf + 48 - j; /* 17-31 */
+        SUM8(0, -=);
+        p = synth_buf + 16 + j; /* 49-63 */
+        SUM8(32, -=);
+        OUT_SAMPLE(sum);
+        w++;
+    }
+    offset = (offset - 32) & 511;
+    s1->synth_buf_offset[ch] = offset;
+}
+
+/* cos(pi*i/24), for the odd values i = 1, 3, ..., 11 */
+#define C1  FIXR(0.99144486137381041114)
+#define C3  FIXR(0.92387953251128675612)
+#define C5  FIXR(0.79335334029123516458)
+#define C7  FIXR(0.60876142900872063941)
+#define C9  FIXR(0.38268343236508977173)
+#define C11 FIXR(0.13052619222005159154)
+
+/* 12 points IMDCT. We compute it "by hand" by factorizing obvious
+   cases. Reads in[0..5] and writes out[0..11]; only six values are
+   actually computed, the others follow by symmetry
+   (out[5 - i] = -out[i] and out[11 - i] = out[6 + i] for i < 3). */
+static void imdct12(int *out, int *in)
+{
+    int tmp;
+    INT64 in1_3, in1_9, in4_3, in4_9;
+
+    in1_3 = MUL64(in[1], C3);
+    in1_9 = MUL64(in[1], C9);
+    in4_3 = MUL64(in[4], C3);
+    in4_9 = MUL64(in[4], C9);
+    
+    tmp = FRAC_RND(MUL64(in[0], C7) - in1_3 - MUL64(in[2], C11) + 
+                   MUL64(in[3], C1) - in4_9 - MUL64(in[5], C5));
+    out[0] = tmp;
+    out[5] = -tmp;
+    tmp = FRAC_RND(MUL64(in[0] - in[3], C9) - in1_3 + 
+                   MUL64(in[2] + in[5], C3) - in4_9);
+    out[1] = tmp;
+    out[4] = -tmp;
+    tmp = FRAC_RND(MUL64(in[0], C11) - in1_9 + MUL64(in[2], C7) -
+                   MUL64(in[3], C5) + in4_3 - MUL64(in[5], C1));
+    out[2] = tmp;
+    out[3] = -tmp;
+    tmp = FRAC_RND(MUL64(-in[0], C5) + in1_9 + MUL64(in[2], C1) + 
+                   MUL64(in[3], C11) - in4_3 - MUL64(in[5], C7));
+    out[6] = tmp;
+    out[11] = tmp;
+    tmp = FRAC_RND(MUL64(-in[0] + in[3], C3) - in1_9 + 
+                   MUL64(in[2] + in[5], C9) + in4_3);
+    out[7] = tmp;
+    out[10] = tmp;
+    tmp = FRAC_RND(-MUL64(in[0], C1) - in1_3 - MUL64(in[2], C5) -
+                   MUL64(in[3], C7) - in4_9 - MUL64(in[5], C11));
+    out[8] = tmp;
+    out[9] = tmp;
+}
+
+#undef C1
+#undef C3
+#undef C5
+#undef C7
+#undef C9
+#undef C11
+
+/* cos(pi*i/18), i = 1..8: coefficients of the 9 points DCT used by
+   imdct36() */
+#define C1 FIXR(0.98480775301220805936)
+#define C2 FIXR(0.93969262078590838405)
+#define C3 FIXR(0.86602540378443864676)
+#define C4 FIXR(0.76604444311897803520)
+#define C5 FIXR(0.64278760968653932632)
+#define C6 FIXR(0.5)
+#define C7 FIXR(0.34202014332566873304)
+#define C8 FIXR(0.17364817766693034885)
+
+/* 0.5 / cos(pi*(2*i+1)/36), i = 0..8 */
+static const int icos36[9] = {
+    FIXR(0.50190991877167369479),
+    FIXR(0.51763809020504152469),
+    FIXR(0.55168895948124587824),
+    FIXR(0.61038729438072803416),
+    FIXR(0.70710678118654752439),
+    FIXR(0.87172339781054900991),
+    FIXR(1.18310079157624925896),
+    FIXR(1.93185165257813657349),
+    FIXR(5.73685662283492756461),
+};
+
+static const int icos72[18] = {
+    /* 0.5 / cos(pi*(2*i+19)/72), entries 0..8 */
+    FIXR(0.74009361646113053152),
+    FIXR(0.82133981585229078570),
+    FIXR(0.93057949835178895673),
+    FIXR(1.08284028510010010928),
+    FIXR(1.30656296487637652785),
+    FIXR(1.66275476171152078719),
+    FIXR(2.31011315767264929558),
+    FIXR(3.83064878777019433457),
+    FIXR(11.46279281302667383546),
+
+    /* 0.5 / cos(pi*(2*(i + 18) +19)/72), entries 9..17 (all negative) */
+    FIXR(-0.67817085245462840086),
+    FIXR(-0.63023620700513223342),
+    FIXR(-0.59284452371708034528),
+    FIXR(-0.56369097343317117734),
+    FIXR(-0.54119610014619698439),
+    FIXR(-0.52426456257040533932),
+    FIXR(-0.51213975715725461845),
+    FIXR(-0.50431448029007636036),
+    FIXR(-0.50047634258165998492),
+};
+
+/* using Lee like decomposition followed by hand coded 9 points DCT.
+   Reads in[0..17] and writes out[0..35]. NOTE: in[] is modified in
+   place by the two accumulation loops at the start. */
+static void imdct36(int *out, int *in)
+{
+    int i, j, t0, t1, t2, t3, s0, s1, s2, s3;
+    int tmp[18], *tmp1, *in1;
+    INT64 in3_3, in6_6;
+
+    for(i=17;i>=1;i--)
+        in[i] += in[i-1];
+    for(i=17;i>=3;i-=2)
+        in[i] += in[i-2];
+
+    for(j=0;j<2;j++) { /* j = 0: even entries of in/tmp, j = 1: odd ones */
+        tmp1 = tmp + j;
+        in1 = in + j;
+
+        in3_3 = MUL64(in1[2*3], C3);
+        in6_6 = MUL64(in1[2*6], C6);
+
+        tmp1[0] = FRAC_RND(MUL64(in1[2*1], C1) + in3_3 + 
+                           MUL64(in1[2*5], C5) + MUL64(in1[2*7], C7));
+        tmp1[2] = in1[2*0] + FRAC_RND(MUL64(in1[2*2], C2) + 
+                                      MUL64(in1[2*4], C4) + in6_6 + 
+                                      MUL64(in1[2*8], C8));
+        tmp1[4] = FRAC_RND(MUL64(in1[2*1] - in1[2*5] - in1[2*7], C3));
+        tmp1[6] = FRAC_RND(MUL64(in1[2*2] - in1[2*4] - in1[2*8], C6)) - 
+            in1[2*6] + in1[2*0];
+        tmp1[8] = FRAC_RND(MUL64(in1[2*1], C5) - in3_3 - 
+                           MUL64(in1[2*5], C7) + MUL64(in1[2*7], C1));
+        tmp1[10] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C8) - 
+                                       MUL64(in1[2*4], C2) + in6_6 + 
+                                       MUL64(in1[2*8], C4));
+        tmp1[12] = FRAC_RND(MUL64(in1[2*1], C7) - in3_3 + 
+                            MUL64(in1[2*5], C1) - 
+                            MUL64(in1[2*7], C5));
+        tmp1[14] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C4) + 
+                                       MUL64(in1[2*4], C8) + in6_6 - 
+                                       MUL64(in1[2*8], C2));
+        tmp1[16] = in1[2*0] - in1[2*2] + in1[2*4] - in1[2*6] + in1[2*8];
+    }
+
+    i = 0;
+    for(j=0;j<4;j++) { /* each pass consumes tmp[i..i+3] and produces 8 outputs */
+        t0 = tmp[i];
+        t1 = tmp[i + 2];
+        s0 = t1 + t0;
+        s2 = t1 - t0;
+
+        t2 = tmp[i + 1];
+        t3 = tmp[i + 3];
+        s1 = MULL(t3 + t2, icos36[j]);
+        s3 = MULL(t3 - t2, icos36[8 - j]);
+        
+        t0 = MULL(s0 + s1, icos72[9 + 8 - j]);
+        t1 = MULL(s0 - s1, icos72[8 - j]);
+        out[18 + 9 + j] = t0;
+        out[18 + 8 - j] = t0;
+        out[9 + j] = -t1;
+        out[8 - j] = t1;
+        
+        t0 = MULL(s2 + s3, icos72[9+j]);
+        t1 = MULL(s2 - s3, icos72[j]);
+        out[18 + 9 + (8 - j)] = t0;
+        out[18 + j] = t0;
+        out[9 + (8 - j)] = -t1;
+        out[j] = t1;
+        i += 4;
+    }
+
+    s0 = tmp[16]; /* remaining middle pair tmp[16], tmp[17] */
+    s1 = MULL(tmp[17], icos36[4]);
+    t0 = MULL(s0 + s1, icos72[9 + 4]);
+    t1 = MULL(s0 - s1, icos72[4]);
+    out[18 + 9 + 4] = t0;
+    out[18 + 8 - 4] = t0;
+    out[9 + 4] = -t1;
+    out[8 - 4] = t1;
+}
+
+/* fast header check for resync.
+   Return 0 if the 32 bit word can be a frame header, -1 if one of
+   its fields holds a forbidden value. Only the sync word, version,
+   layer, bit rate and sampling frequency fields are examined. */
+static int check_header(UINT32 header)
+{
+    /* header: the 11 most significant bits (sync word) must be set */
+    if ((header & 0xffe00000) != 0xffe00000)
+        return -1;
+    /* version check: ID 1 is reserved. Without this test such a word
+       would be decoded as an MPEG 2.5 header */
+    if (((header >> 19) & 3) == 1)
+        return -1;
+    /* layer check: 0 is reserved */
+    if (((header >> 17) & 3) == 0)
+        return -1;
+    /* bit rate: index 15 is forbidden */
+    if (((header >> 12) & 0xf) == 0xf)
+        return -1;
+    /* frequency: index 3 is reserved */
+    if (((header >> 10) & 3) == 3)
+        return -1;
+    return 0;
+}
+
+/* header + layer + bitrate + freq + lsf/mpeg25: selects the sync word
+   (bits 31..21), the layer (bits 18..17), the bit rate index (bits
+   15..12), the sampling frequency index (bits 11..10) and the version
+   bits (20..19) of a frame header. NOTE(review): presumably the
+   fields that must be equal in consecutive headers - confirm against
+   the users of this mask. */
+#define SAME_HEADER_MASK \
+   (0xffe00000 | (3 << 17) | (0xf << 12) | (3 << 10) | (3 << 19))
+
+/* header decoding. MUST check the header before because no
+   consistency check is done there. Return 1 if free format found and
+   that the frame size must be computed externally, 0 otherwise.
+   Fills lsf, layer, sample_rate, sample_rate_index, error_protection,
+   mode, mode_ext and nb_channels of s. When 0 is returned frame_size
+   (in bytes) and bit_rate are set too; bit_rate is always expressed
+   in bits/s, for free format streams as well. */
+static int decode_header(MPADecodeContext *s, UINT32 header)
+{
+    int sample_rate, frame_size, mpeg25, padding;
+    int sample_rate_index, bitrate_index;
+
+    /* bit 20 clear: MPEG 2.5. Otherwise bit 19 set means MPEG 1 and
+       bit 19 clear means MPEG 2 (low sampling frequencies) */
+    if (header & (1<<20)) {
+        s->lsf = (header & (1<<19)) ? 0 : 1;
+        mpeg25 = 0;
+    } else {
+        s->lsf = 1;
+        mpeg25 = 1;
+    }
+
+    s->layer = 4 - ((header >> 17) & 3);
+    /* extract frequency: the base rate is halved for LSF and halved
+       again for MPEG 2.5; the stored index grows by 3 for each */
+    sample_rate_index = (header >> 10) & 3;
+    sample_rate = mpa_freq_tab[sample_rate_index] >> (s->lsf + mpeg25);
+    sample_rate_index += 3 * (s->lsf + mpeg25);
+    s->sample_rate_index = sample_rate_index;
+    /* the protection bit is 0 when a CRC follows the header */
+    s->error_protection = ((header >> 16) & 1) ^ 1;
+    s->sample_rate = sample_rate;
+
+    bitrate_index = (header >> 12) & 0xf;
+    padding = (header >> 9) & 1;
+    //extension = (header >> 8) & 1;
+    s->mode = (header >> 6) & 3;
+    s->mode_ext = (header >> 4) & 3;
+    //copyright = (header >> 3) & 1;
+    //original = (header >> 2) & 1;
+    //emphasis = header & 3;
+
+    if (s->mode == MPA_MONO)
+        s->nb_channels = 1;
+    else
+        s->nb_channels = 2;
+
+    if (bitrate_index != 0) {
+        /* the table gives kbit/s; frame_size is reused for the size */
+        frame_size = mpa_bitrate_tab[s->lsf][s->layer - 1][bitrate_index];
+        s->bit_rate = frame_size * 1000;
+        switch(s->layer) {
+        case 1:
+            /* layer 1 frames are made of 4 byte slots */
+            frame_size = (frame_size * 12000) / sample_rate;
+            frame_size = (frame_size + padding) * 4;
+            break;
+        case 2:
+            frame_size = (frame_size * 144000) / sample_rate;
+            frame_size += padding;
+            break;
+        default:
+        case 3:
+            frame_size = (frame_size * 144000) / (sample_rate << s->lsf);
+            frame_size += padding;
+            break;
+        }
+        s->frame_size = frame_size;
+    } else {
+        /* if no frame size computed, signal it */
+        if (!s->free_format_frame_size)
+            return 1;
+        /* free format: compute bitrate and real frame size from the
+           frame size we extracted by reading the bitstream. The bit
+           rate is stored in bits/s like in the regular case above (it
+           used to be stored in kbit/s here, which broke the
+           bit_rate / 1000 done by the layer 2 table selection) */
+        s->frame_size = s->free_format_frame_size;
+        switch(s->layer) {
+        case 1:
+            s->frame_size += padding  * 4;
+            /* 384 samples per frame: bytes * 8 * rate / 384 */
+            s->bit_rate = (s->frame_size * sample_rate) / 48;
+            break;
+        case 2:
+            s->frame_size += padding;
+            /* 1152 samples per frame: bytes * 8 * rate / 1152 */
+            s->bit_rate = (s->frame_size * sample_rate) / 144;
+            break;
+        default:
+        case 3:
+            s->frame_size += padding;
+            s->bit_rate = (s->frame_size * (sample_rate << s->lsf)) / 144;
+            break;
+        }
+    }
+
+#if defined(DEBUG)
+    /* bit_rate is in bits/s, the message is in kbits/s */
+    printf("layer%d, %d Hz, %d kbits/s, ",
+           s->layer, s->sample_rate, s->bit_rate / 1000);
+    if (s->nb_channels == 2) {
+        if (s->layer == 3) {
+            if (s->mode_ext & MODE_EXT_MS_STEREO)
+                printf("ms-");
+            if (s->mode_ext & MODE_EXT_I_STEREO)
+                printf("i-");
+        }
+        printf("stereo");
+    } else {
+        printf("mono");
+    }
+    printf("\n");
+#endif
+    return 0;
+}
+
+/* return the number of decoded frames: always 12, one for each set
+   of SBLIMIT sub band samples stored in s->sb_samples[ch][0..11][].
+   Reads the bit allocation, then the scale factors, then the samples
+   from s->gb. Sub bands from 'bound' upwards (joint stereo only)
+   carry a single allocation and a single mantissa used for both
+   channels, each with its own scale factor. */
+static int mp_decode_layer1(MPADecodeContext *s)
+{
+    int bound, i, v, n, ch, j, mant;
+    UINT8 allocation[MPA_MAX_CHANNELS][SBLIMIT];
+    UINT8 scale_factors[MPA_MAX_CHANNELS][SBLIMIT];
+
+    if (s->mode == MPA_JSTEREO) 
+        bound = (s->mode_ext + 1) * 4;
+    else
+        bound = SBLIMIT;
+
+    /* allocation bits: 4 bits per sub band and channel */
+    for(i=0;i<bound;i++) {
+        for(ch=0;ch<s->nb_channels;ch++) {
+            allocation[ch][i] = get_bits(&s->gb, 4);
+        }
+    }
+    for(i=bound;i<SBLIMIT;i++) {
+        allocation[0][i] = get_bits(&s->gb, 4);
+    }
+
+    /* scale factors: 6 bits each, only present for sub bands with a
+       non zero allocation (other entries stay uninitialized and are
+       not read below) */
+    for(i=0;i<bound;i++) {
+        for(ch=0;ch<s->nb_channels;ch++) {
+            if (allocation[ch][i])
+                scale_factors[ch][i] = get_bits(&s->gb, 6);
+        }
+    }
+    for(i=bound;i<SBLIMIT;i++) {
+        if (allocation[0][i]) {
+            scale_factors[0][i] = get_bits(&s->gb, 6);
+            scale_factors[1][i] = get_bits(&s->gb, 6);
+        }
+    }
+    
+    /* compute samples: each mantissa is coded on allocation + 1 bits */
+    for(j=0;j<12;j++) {
+        for(i=0;i<bound;i++) {
+            for(ch=0;ch<s->nb_channels;ch++) {
+                n = allocation[ch][i];
+                if (n) {
+                    mant = get_bits(&s->gb, n + 1);
+                    v = l1_unscale(n, mant, scale_factors[ch][i]);
+                } else {
+                    v = 0;
+                }
+                s->sb_samples[ch][j][i] = v;
+            }
+        }
+        for(i=bound;i<SBLIMIT;i++) {
+            n = allocation[0][i];
+            if (n) {
+                mant = get_bits(&s->gb, n + 1);
+                v = l1_unscale(n, mant, scale_factors[0][i]);
+                s->sb_samples[0][j][i] = v;
+                v = l1_unscale(n, mant, scale_factors[1][i]);
+                s->sb_samples[1][j][i] = v;
+            } else {
+                s->sb_samples[0][j][i] = 0;
+                s->sb_samples[1][j][i] = 0;
+            }
+        }
+    }
+    return 12;
+}
+
+/* Pick the layer 2 allocation table number (0..4). bitrate is in
+   kb/s for all channels together, freq is the sampling frequency in
+   Hz and lsf is non zero for low sampling frequency streams, which
+   always use table 4. */
+int l2_select_table(int bitrate, int nb_channels, int freq, int lsf)
+{
+    const int per_channel = bitrate / nb_channels;
+
+    if (lsf)
+        return 4;
+
+    /* 56..80 kb/s per channel at any rate, or 56 kb/s and more at 48 kHz */
+    if (per_channel >= 56 && (freq == 48000 || per_channel <= 80))
+        return 0;
+    if (per_channel >= 96 && freq != 48000)
+        return 1;
+    if (per_channel <= 48 && freq != 32000)
+        return 2;
+    return 3;
+}
+/*
+ * Decode the bit allocation, the scale factor selection codes, the
+ * scale factors and the samples of a layer 2 frame from s->gb into
+ * s->sb_samples. Sub bands from 'bound' upwards (joint stereo) share
+ * their allocation and mantissas between both channels; sub bands
+ * from sblimit upwards are set to zero. Always returns 3 * 12.
+ */
+static int mp_decode_layer2(MPADecodeContext *s)
+{
+    int sblimit; /* number of used subbands */
+    const unsigned char *alloc_table;
+    int table, bit_alloc_bits, i, j, ch, bound, v;
+    unsigned char bit_alloc[MPA_MAX_CHANNELS][SBLIMIT];
+    unsigned char scale_code[MPA_MAX_CHANNELS][SBLIMIT];
+    unsigned char scale_factors[MPA_MAX_CHANNELS][SBLIMIT][3], *sf;
+    int scale, qindex, bits, steps, k, l, m, b;
+
+    /* select decoding table (s->bit_rate is divided by 1000 because
+       l2_select_table() takes kb/s) */
+    table = l2_select_table(s->bit_rate / 1000, s->nb_channels, 
+                            s->sample_rate, s->lsf);
+    sblimit = sblimit_table[table];
+    alloc_table = alloc_tables[table];
+
+    if (s->mode == MPA_JSTEREO) 
+        bound = (s->mode_ext + 1) * 4;
+    else
+        bound = sblimit;
+
+    dprintf("bound=%d sblimit=%d\n", bound, sblimit);
+    /* parse bit allocation: alloc_table[j] is the width of the field
+       for the current sub band, whose table entries span
+       1 << width bytes */
+    j = 0;
+    for(i=0;i<bound;i++) {
+        bit_alloc_bits = alloc_table[j];
+        for(ch=0;ch<s->nb_channels;ch++) {
+            bit_alloc[ch][i] = get_bits(&s->gb, bit_alloc_bits);
+        }
+        j += 1 << bit_alloc_bits;
+    }
+    for(i=bound;i<sblimit;i++) {
+        bit_alloc_bits = alloc_table[j];
+        v = get_bits(&s->gb, bit_alloc_bits);
+        bit_alloc[0][i] = v;
+        bit_alloc[1][i] = v;
+        j += 1 << bit_alloc_bits;
+    }
+
+#ifdef DEBUG
+    {
+        for(ch=0;ch<s->nb_channels;ch++) {
+            for(i=0;i<sblimit;i++)
+                printf(" %d", bit_alloc[ch][i]);
+            printf("\n");
+        }
+    }
+#endif
+
+    /* scale codes: 2 bits for each allocated sub band */
+    for(i=0;i<sblimit;i++) {
+        for(ch=0;ch<s->nb_channels;ch++) {
+            if (bit_alloc[ch][i]) 
+                scale_code[ch][i] = get_bits(&s->gb, 2);
+        }
+    }
+    
+    /* scale factors: the scale code tells how many of the three 6 bit
+       values are transmitted and which ones are repeated */
+    for(i=0;i<sblimit;i++) {
+        for(ch=0;ch<s->nb_channels;ch++) {
+            if (bit_alloc[ch][i]) {
+                sf = scale_factors[ch][i];
+                switch(scale_code[ch][i]) {
+                default:
+                case 0:
+                    sf[0] = get_bits(&s->gb, 6);
+                    sf[1] = get_bits(&s->gb, 6);
+                    sf[2] = get_bits(&s->gb, 6);
+                    break;
+                case 2:
+                    sf[0] = get_bits(&s->gb, 6);
+                    sf[1] = sf[0];
+                    sf[2] = sf[0];
+                    break;
+                case 1:
+                    sf[0] = get_bits(&s->gb, 6);
+                    sf[2] = get_bits(&s->gb, 6);
+                    sf[1] = sf[0];
+                    break;
+                case 3:
+                    sf[0] = get_bits(&s->gb, 6);
+                    sf[2] = get_bits(&s->gb, 6);
+                    sf[1] = sf[2];
+                    break;
+                }
+            }
+        }
+    }
+
+#ifdef DEBUG
+    for(ch=0;ch<s->nb_channels;ch++) {
+        for(i=0;i<sblimit;i++) {
+            if (bit_alloc[ch][i]) {
+                sf = scale_factors[ch][i];
+                printf(" %d %d %d", sf[0], sf[1], sf[2]);
+            } else {
+                printf(" -");
+            }
+        }
+        printf("\n");
+    }
+#endif
+
+    /* samples: 3 parts (k, each with its own scale factor) of 4
+       groups (l) of 3 consecutive samples */
+    for(k=0;k<3;k++) {
+        for(l=0;l<12;l+=3) {
+            j = 0;
+            for(i=0;i<bound;i++) {
+                bit_alloc_bits = alloc_table[j];
+                for(ch=0;ch<s->nb_channels;ch++) {
+                    b = bit_alloc[ch][i];
+                    if (b) {
+                        scale = scale_factors[ch][i][k];
+                        qindex = alloc_table[j+b];
+                        bits = quant_bits[qindex];
+                        if (bits < 0) {
+                            /* 3 values at the same time, packed in
+                               one code of -bits bits as digits in
+                               base 'steps' */
+                            v = get_bits(&s->gb, -bits);
+                            steps = quant_steps[qindex];
+                            s->sb_samples[ch][k * 12 + l + 0][i] = 
+                                l2_unscale_group(steps, v % steps, scale);
+                            v = v / steps;
+                            s->sb_samples[ch][k * 12 + l + 1][i] = 
+                                l2_unscale_group(steps, v % steps, scale);
+                            v = v / steps;
+                            s->sb_samples[ch][k * 12 + l + 2][i] = 
+                                l2_unscale_group(steps, v, scale);
+                        } else {
+                            for(m=0;m<3;m++) {
+                                v = get_bits(&s->gb, bits);
+                                v = l1_unscale(bits - 1, v, scale);
+                                s->sb_samples[ch][k * 12 + l + m][i] = v;
+                            }
+                        }
+                    } else {
+                        s->sb_samples[ch][k * 12 + l + 0][i] = 0;
+                        s->sb_samples[ch][k * 12 + l + 1][i] = 0;
+                        s->sb_samples[ch][k * 12 + l + 2][i] = 0;
+                    }
+                }
+                /* next subband in alloc table */
+                j += 1 << bit_alloc_bits; 
+            }
+            /* XXX: find a way to avoid this duplication of code */
+            for(i=bound;i<sblimit;i++) {
+                bit_alloc_bits = alloc_table[j];
+                b = bit_alloc[0][i];
+                if (b) {
+                    int mant, scale0, scale1;
+                    scale0 = scale_factors[0][i][k];
+                    scale1 = scale_factors[1][i][k];
+                    qindex = alloc_table[j+b];
+                    bits = quant_bits[qindex];
+                    if (bits < 0) {
+                        /* 3 values at the same time */
+                        v = get_bits(&s->gb, -bits);
+                        steps = quant_steps[qindex];
+                        mant = v % steps;
+                        v = v / steps;
+                        s->sb_samples[0][k * 12 + l + 0][i] = 
+                            l2_unscale_group(steps, mant, scale0);
+                        s->sb_samples[1][k * 12 + l + 0][i] = 
+                            l2_unscale_group(steps, mant, scale1);
+                        mant = v % steps;
+                        v = v / steps;
+                        s->sb_samples[0][k * 12 + l + 1][i] = 
+                            l2_unscale_group(steps, mant, scale0);
+                        s->sb_samples[1][k * 12 + l + 1][i] = 
+                            l2_unscale_group(steps, mant, scale1);
+                        s->sb_samples[0][k * 12 + l + 2][i] = 
+                            l2_unscale_group(steps, v, scale0);
+                        s->sb_samples[1][k * 12 + l + 2][i] = 
+                            l2_unscale_group(steps, v, scale1);
+                    } else {
+                        for(m=0;m<3;m++) {
+                            mant = get_bits(&s->gb, bits);
+                            s->sb_samples[0][k * 12 + l + m][i] = 
+                                l1_unscale(bits - 1, mant, scale0);
+                            s->sb_samples[1][k * 12 + l + m][i] = 
+                                l1_unscale(bits - 1, mant, scale1);
+                        }
+                    }
+                } else {
+                    s->sb_samples[0][k * 12 + l + 0][i] = 0;
+                    s->sb_samples[0][k * 12 + l + 1][i] = 0;
+                    s->sb_samples[0][k * 12 + l + 2][i] = 0;
+                    s->sb_samples[1][k * 12 + l + 0][i] = 0;
+                    s->sb_samples[1][k * 12 + l + 1][i] = 0;
+                    s->sb_samples[1][k * 12 + l + 2][i] = 0;
+                }
+                /* next subband in alloc table */
+                j += 1 << bit_alloc_bits; 
+            }
+            /* fill remaining samples to zero */
+            for(i=sblimit;i<SBLIMIT;i++) {
+                for(ch=0;ch<s->nb_channels;ch++) {
+                    s->sb_samples[ch][k * 12 + l + 0][i] = 0;
+                    s->sb_samples[ch][k * 12 + l + 1][i] = 0;
+                    s->sb_samples[ch][k * 12 + l + 2][i] = 0;
+                }
+            }
+        }
+    }
+    return 3 * 12;
+}
+
+/*
+ * Step the bit reader back by backstep bytes (at most 511 bytes).
+ * The bytes in front of the current read position are filled from
+ * the other input buffer, the bit reader is restarted there, and the
+ * input buffers are then swapped for the next frame.
+ */
+static void seek_to_maindata(MPADecodeContext *s, long backstep)
+{
+    const int other = s->inbuf_index ^ 1;
+    UINT8 *start;
+
+    /* byte reached so far by the bit reader, moved back by backstep */
+    start = s->gb.buffer + (get_bits_count(&s->gb)>>3) - backstep;
+
+    /* old data goes right before the current one */
+    memcpy(start,
+           s->inbuf1[other] + BACKSTEP_SIZE + s->old_frame_size - backstep,
+           backstep);
+
+    /* restart the bit reader on the extended range */
+    init_get_bits(&s->gb, start, s->frame_size + backstep);
+
+    /* the other buffer receives the next frame */
+    s->inbuf_index = other;
+    s->inbuf = &s->inbuf1[other][BACKSTEP_SIZE];
+    s->old_frame_size = s->frame_size;
+}
+
+/* Split sf into four mixed radix digits slen[0..3], least significant
+   first: slen[3] uses radix n3, slen[2] radix n2 and slen[1] radix
+   n1; slen[0] receives what is left. A radix n2 or n3 of zero yields
+   a zero digit and leaves sf untouched. n1 must not be zero. */
+static inline void lsf_sf_expand(int *slen,
+                                 int sf, int n1, int n2, int n3)
+{
+    int rest = sf;
+
+    slen[3] = 0;
+    if (n3 != 0) {
+        slen[3] = rest % n3;
+        rest /= n3;
+    }
+
+    slen[2] = 0;
+    if (n2 != 0) {
+        slen[2] = rest % n2;
+        rest /= n2;
+    }
+
+    slen[1] = rest % n1;
+    slen[0] = rest / n1;
+}
+
+/* Fill exponents[] with one value per coefficient of granule g: each
+   long band, then each window of each short band, gets its band
+   width worth of copies of (gain - scaled scale factor). */
+static void exponents_from_scale_factors(MPADecodeContext *s, 
+                                         GranuleDef *g,
+                                         INT16 *exponents)
+{
+    const UINT8 *widths, *pre;
+    INT16 *dst = exponents;
+    int band, win, n, sf_idx, value, win_gain[3];
+    const int base_gain = g->global_gain - 210;
+    const int sf_shift = g->scalefac_scale + 1;
+
+    /* long bands: the scale factor is combined with the pretab entry */
+    widths = band_size_long[s->sample_rate_index];
+    pre = mpa_pretab[g->preflag];
+    for (band = 0; band < g->long_end; band++) {
+        value = base_gain - ((g->scale_factors[band] + pre[band]) << sf_shift);
+        for (n = 0; n < widths[band]; n++)
+            *dst++ = value;
+    }
+
+    if (g->short_start >= 13)
+        return;
+
+    /* short bands: three windows per band, each with its own gain */
+    widths = band_size_short[s->sample_rate_index];
+    for (win = 0; win < 3; win++)
+        win_gain[win] = base_gain - (g->subblock_gain[win] << 3);
+
+    /* the short block scale factors follow the long ones */
+    sf_idx = g->long_end;
+    for (band = g->short_start; band < 13; band++) {
+        for (win = 0; win < 3; win++) {
+            value = win_gain[win] - (g->scale_factors[sf_idx++] << sf_shift);
+            for (n = 0; n < widths[band]; n++)
+                *dst++ = value;
+        }
+    }
+}
+
+/* same as get_bits(), but a request for n = 0 bits gives 0 without
+   calling the bit reader */
+static inline int get_bitsz(GetBitContext *s, int n)
+{
+    if (n != 0)
+        return get_bits(s, n);
+    return 0;
+}
+/*
+ * Decode the Huffman coded values of granule g from s->gb into
+ * g->sb_hybrid[0..575]. Every non zero value is scaled by
+ * l3_unscale() with the entry of exponents[] at the same index and
+ * negated when its sign bit is set. Decoding stops once the bit
+ * position reaches end_pos; the remaining values are set to zero.
+ * Return 0 on success and -1 when get_vlc() returns a negative code.
+ */
+static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
+                          INT16 *exponents, int end_pos)
+{
+    int s_index;
+    int linbits, code, x, y, l, v, i, j, k, pos;
+    GetBitContext last_gb;
+    VLC *vlc;
+    UINT8 *code_table;
+
+    /* low frequencies (called big values): three regions, each with
+       its own table; every code gives a pair of values (x, y) */
+    s_index = 0;
+    for(i=0;i<3;i++) {
+        j = g->region_size[i];
+        if (j == 0)
+            continue;
+        /* select vlc table */
+        k = g->table_select[i];
+        l = mpa_huff_data[k][0];
+        linbits = mpa_huff_data[k][1];
+        vlc = &huff_vlc[l];
+        code_table = huff_code_table[l];
+
+        /* read huffcode and compute each couple: x is the high and y
+           the low nibble of the code table entry; without a code
+           table no bit is read and both are zero */
+        for(;j>0;j--) {
+            if (get_bits_count(&s->gb) >= end_pos)
+                break;
+            if (code_table) {
+                code = get_vlc(&s->gb, vlc);
+                if (code < 0)
+                    return -1;
+                y = code_table[code];
+                x = y >> 4;
+                y = y & 0x0f;
+            } else {
+                x = 0;
+                y = 0;
+            }
+            dprintf("region=%d n=%d x=%d y=%d exp=%d\n", 
+                    i, g->region_size[i] - j, x, y, exponents[s_index]);
+            if (x) {
+                if (x == 15) /* escape: linbits more bits are added */
+                    x += get_bitsz(&s->gb, linbits);
+                v = l3_unscale(x, exponents[s_index]);
+                if (get_bits1(&s->gb))
+                    v = -v;
+            } else {
+                v = 0;
+            }
+            g->sb_hybrid[s_index++] = v;
+            if (y) {
+                if (y == 15)
+                    y += get_bitsz(&s->gb, linbits);
+                v = l3_unscale(y, exponents[s_index]);
+                if (get_bits1(&s->gb))
+                    v = -v;
+            } else {
+                v = 0;
+            }
+            g->sb_hybrid[s_index++] = v;
+        }
+    }
+            
+    /* high frequencies: every code gives 4 values, each zero or of
+       magnitude one before scaling (bit 8 >> i of the code) */
+    vlc = &huff_quad_vlc[g->count1table_select];
+    last_gb.buffer = NULL;
+    while (s_index <= 572) {
+        pos = get_bits_count(&s->gb);
+        if (pos >= end_pos) {
+            if (pos > end_pos && last_gb.buffer != NULL) {
+                /* some encoders generate an incorrect size for this
+                   part. We must go back into the data: the last group
+                   of 4 values is dropped and the bit reader restored
+                   to its state before that group */
+                s_index -= 4;
+                s->gb = last_gb;
+            }
+            break;
+        }
+        last_gb= s->gb;
+
+        code = get_vlc(&s->gb, vlc);
+        dprintf("t=%d code=%d\n", g->count1table_select, code);
+        if (code < 0)
+            return -1;
+        for(i=0;i<4;i++) {
+            if (code & (8 >> i)) {
+                /* non zero value. Could use a hand coded function for
+                   'one' value */
+                v = l3_unscale(1, exponents[s_index]);
+                if(get_bits1(&s->gb))
+                    v = -v;
+            } else {
+                v = 0;
+            }
+            g->sb_hybrid[s_index++] = v;
+        }
+    }
+    while (s_index < 576)
+        g->sb_hybrid[s_index++] = 0;
+    return 0;
+}
+
+/* Reorder short blocks from bitstream order to interleaved order. It
+   would be faster to do it in parsing, but the code would be far more
+   complicated. Within each short band the three windows are stored
+   one after the other; afterwards the samples of the windows
+   alternate. Granules whose block type is not 2 are left untouched. */
+static void reorder_block(MPADecodeContext *s, GranuleDef *g)
+{
+    INT32 scratch[576];
+    INT32 *band_base;
+    int band, win, n, width;
+
+    if (g->block_type != 2)
+        return;
+
+    /* with a switch point the first coefficients are skipped */
+    band_base = g->sb_hybrid;
+    if (g->switch_point)
+        band_base += (s->sample_rate_index != 8) ? 36 : 48;
+
+    for (band = g->short_start; band < 13; band++) {
+        width = band_size_short[s->sample_rate_index][band];
+        /* sample n of window win moves to position 3 * n + win */
+        for (win = 0; win < 3; win++) {
+            for (n = 0; n < width; n++)
+                scratch[3 * n + win] = band_base[win * width + n];
+        }
+        memcpy(band_base, scratch, width * 3 * sizeof(INT32));
+        band_base += 3 * width;
+    }
+}
+
+/* 1/sqrt(2) passed through FIXR (presumably the float to fixed point
+   conversion macro of this file -- confirm) */
+#define ISQRT2 FIXR(0.70710678118654752440)
+
+/* Joint stereo processing of one granule, done in place on the
+   sb_hybrid arrays of g0 and g1, driven by the flags in s->mode_ext:
+
+   - MODE_EXT_I_STEREO set: the spectrum is walked from its end
+     (index 576) downwards, first over the short bands 12..short_start
+     (3 runs each), then over the long bands long_end-1..0. As long as
+     no non zero value has been met in g1 and the scale factor read
+     from g1 is below sf_max, the g0 value is multiplied by the two
+     is_tab entries selected by that scale factor to produce both
+     channels. Otherwise the band gets MS stereo processing (scaled by
+     ISQRT2) when MODE_EXT_MS_STEREO is also set, and is left unchanged
+     when it is not.
+   - only MODE_EXT_MS_STEREO set: plain sum/difference over all 576
+     values, without scaling.
+   - neither flag set: nothing is done. */
 static void compute_stereo(MPADecodeContext *s,
                            GranuleDef *g0, GranuleDef *g1)
 {
     int i, j, k, l;
     INT32 v1, v2;
     int sf_max, tmp0, tmp1, sf, len, non_zero_found;
     INT32 (*is_tab)[16];
     INT32 *tab0, *tab1;
     int non_zero_found_short[3];
 
     /* intensity stereo */
     if (s->mode_ext & MODE_EXT_I_STEREO) {
+        /* pick the ratio table and the first scale factor value that
+           disables intensity stereo for a band */
         if (!s->lsf) {
             is_tab = is_table;
             sf_max = 7;
         } else {
             is_tab = is_table_lsf[g1->scalefac_compress & 1];
             sf_max = 16;
         }
             
+        /* both pointers start one past the last coefficient and are
+           moved down by one band length before each band is handled */
         tab0 = g0->sb_hybrid + 576;
         tab1 = g1->sb_hybrid + 576;
 
+        /* one "non zero value seen" flag per run index l of the short bands */
         non_zero_found_short[0] = 0;
         non_zero_found_short[1] = 0;
         non_zero_found_short[2] = 0;
+        /* k indexes g1->scale_factors; it is lowered by 3 per short band */
         k = (13 - g1->short_start) * 3 + g1->long_end - 3;
         for(i = 12;i >= g1->short_start;i--) {
             /* for last band, use previous scale factor */
             if (i != 11)
                 k -= 3;
             len = band_size_short[s->sample_rate_index][i];
             for(l=2;l>=0;l--) {
                 tab0 -= len;
                 tab1 -= len;
                 if (!non_zero_found_short[l]) {
                     /* test if non zero band. if so, stop doing i-stereo */
                     for(j=0;j<len;j++) {
                         if (tab1[j] != 0) {
                             non_zero_found_short[l] = 1;
                             goto found1;
                         }
                     }
+                    /* a scale factor >= sf_max sends this band to the
+                       MS path without setting the flag, so lower bands
+                       are still tested */
                     sf = g1->scale_factors[k + l];
                     if (sf >= sf_max)
                         goto found1;
 
+                    /* both channels are derived from the g0 value */
                     v1 = is_tab[0][sf];
                     v2 = is_tab[1][sf];
                     for(j=0;j<len;j++) {
                         tmp0 = tab0[j];
                         tab0[j] = MULL(tmp0, v1);
                         tab1[j] = MULL(tmp0, v2);
                     }
                 } else {
                 found1:
                     if (s->mode_ext & MODE_EXT_MS_STEREO) {
                         /* lower part of the spectrum : do ms stereo
                            if enabled */
                         for(j=0;j<len;j++) {
                             tmp0 = tab0[j];
                             tmp1 = tab1[j];
                             tab0[j] = MULL(tmp0 + tmp1, ISQRT2);
                             tab1[j] = MULL(tmp0 - tmp1, ISQRT2);
                         }
                     }
                 }
             }
         }
 
+        /* the long bands start with the flag set if any of the three
+           short runs met a non zero value */
         non_zero_found = non_zero_found_short[0] | 
             non_zero_found_short[1] | 
             non_zero_found_short[2];
 
         for(i = g1->long_end - 1;i >= 0;i--) {
             len = band_size_long[s->sample_rate_index][i];
             tab0 -= len;
             tab1 -= len;
             /* test if non zero band. if so, stop doing i-stereo */
             if (!non_zero_found) {
                 for(j=0;j<len;j++) {
                     if (tab1[j] != 0) {
                         non_zero_found = 1;
                         goto found2;
                     }
                 }
                 /* for last band, use previous scale factor */
                 k = (i == 21) ? 20 : i;
                 sf = g1->scale_factors[k];
                 if (sf >= sf_max)
                     goto found2;
                 v1 = is_tab[0][sf];
                 v2 = is_tab[1][sf];
                 for(j=0;j<len;j++) {
                     tmp0 = tab0[j];
                     tab0[j] = MULL(tmp0, v1);
                     tab1[j] = MULL(tmp0, v2);
                 }
             } else {
             found2:
                 if (s->mode_ext & MODE_EXT_MS_STEREO) {
                     /* lower part of the spectrum : do ms stereo
                        if enabled */
                     for(j=0;j<len;j++) {
                         tmp0 = tab0[j];
                         tmp1 = tab1[j];
                         tab0[j] = MULL(tmp0 + tmp1, ISQRT2);
                         tab1[j] = MULL(tmp0 - tmp1, ISQRT2);
                     }
                 }
             }
         }
     } else if (s->mode_ext & MODE_EXT_MS_STEREO) {
         /* ms stereo ONLY */
         /* NOTE: the 1/sqrt(2) normalization factor is included in the
            global gain */
         tab0 = g0->sb_hybrid;
         tab1 = g1->sb_hybrid;
         for(i=0;i<576;i++) {
             tmp0 = tab0[i];
             tmp1 = tab1[i];
             tab0[i] = tmp0 + tmp1;
             tab1[i] = tmp0 - tmp1;
         }
     }
 }
+
+/* Apply the antialiasing butterflies, in place, across the boundaries
+   between consecutive groups of 18 values of g->sb_hybrid. Each
+   boundary combines the 8 values below it with the 8 values above it
+   using the coefficient pairs of csa_table. Only "long" bands are
+   processed: nothing is done for block type 2 without switch point,
+   only the first boundary is done for block type 2 with switch point,
+   and SBLIMIT - 1 boundaries are done otherwise. */
+static void compute_antialias(MPADecodeContext *s,
+                              GranuleDef *g)
+{
+    INT32 *boundary, *lo, *hi, *coef;
+    int nb_boundaries, below, above, sb, tap;
+
+    if (g->block_type != 2) {
+        nb_boundaries = SBLIMIT - 1;
+    } else if (g->switch_point) {
+        /* XXX: check this for 8000Hz case */
+        nb_boundaries = 1;
+    } else {
+        return;
+    }
+
+    boundary = g->sb_hybrid + 18;
+    for (sb = 0; sb < nb_boundaries; sb++) {
+        lo = boundary - 1;      /* walks downwards from the boundary */
+        hi = boundary;          /* walks upwards from the boundary */
+        coef = &csa_table[0][0];
+        for (tap = 0; tap < 8; tap++) {
+            below = *lo;
+            above = *hi;
+            *lo = FRAC_RND(MUL64(below, coef[0]) - MUL64(above, coef[1]));
+            *hi = FRAC_RND(MUL64(below, coef[1]) + MUL64(above, coef[0]));
+            lo--;
+            hi++;
+            coef += 2;
+        }
+        boundary += 18;
+    }
+}
+
+/* Inverse MDCT of one granule. g->sb_hybrid is processed in groups of
+   18 coefficients (one group per subband): each group is transformed,
+   windowed, added to the 18 values saved for that subband in mdct_buf
+   and stored in sb_samples (subband j goes to sb_samples[j], then
+   every SBLIMIT entries). The second half of each windowed transform
+   replaces the saved values in mdct_buf. Subbands above the last non
+   zero coefficient only flush the saved values and clear them. */
+static void compute_imdct(MPADecodeContext *s,
+                          GranuleDef *g,
+                          INT32 *sb_samples,
+                          INT32 *mdct_buf)
+{
+    INT32 *hyb, *overlap, *dst, *window, *acc, *src;
+    INT32 short_in[6];
+    INT32 block[36];
+    INT32 short_out[12];
+    int n, sb, w, pos, nb_long, nb_active;
+
+    /* find the last group of 6 coefficients that is not all zero; the
+       search stops at coefficient 30, so at least 2 subbands are kept */
+    pos = 576;
+    while (pos >= 2 * 18) {
+        pos -= 6;
+        hyb = g->sb_hybrid + pos;
+        if ((hyb[0] | hyb[1] | hyb[2] | hyb[3] | hyb[4] | hyb[5]) != 0)
+            break;
+    }
+    nb_active = pos / 18 + 1;
+
+    /* number of leading subbands using the 36 point transform */
+    if (g->block_type != 2) {
+        nb_long = nb_active;
+    } else {
+        /* XXX: check for 8000 Hz */
+        nb_long = g->switch_point ? 2 : 0;
+    }
+
+    overlap = mdct_buf;
+    hyb = g->sb_hybrid;
+
+    /* subbands handled with the 36 point transform */
+    for (sb = 0; sb < nb_long; sb++) {
+        imdct36(block, hyb);
+        /* select window, then frequency inversion for odd subbands */
+        window = (g->switch_point && sb < 2) ? mdct_win[0]
+                                             : mdct_win[g->block_type];
+        window += (4 * 36) & -(sb & 1);
+        /* apply window & overlap with previous buffer */
+        dst = sb_samples + sb;
+        for (n = 0; n < 18; n++) {
+            dst[n * SBLIMIT] = MULL(block[n], window[n]) + overlap[n];
+            overlap[n] = MULL(block[n + 18], window[n + 18]);
+        }
+        hyb += 18;
+        overlap += 18;
+    }
+
+    /* subbands handled with three 12 point transforms */
+    for (sb = nb_long; sb < nb_active; sb++) {
+        for (n = 0; n < 6; n++) {
+            block[n] = 0;
+            block[n + 6] = 0;
+            block[n + 30] = 0;
+        }
+        /* select frequency inversion */
+        window = mdct_win[2] + ((4 * 36) & -(sb & 1));
+        for (w = 0; w < 3; w++) {
+            /* transform w accumulates into block[6 * (w + 1) ...] */
+            acc = block + 6 * (w + 1);
+            /* reorder input for short mdct: every third coefficient */
+            src = hyb + w;
+            for (n = 0; n < 6; n++)
+                short_in[n] = src[3 * n];
+            imdct12(short_out, short_in);
+            /* apply 12 point window and do small overlap */
+            for (n = 0; n < 6; n++) {
+                acc[n] = MULL(short_out[n], window[n]) + acc[n];
+                acc[n + 6] = MULL(short_out[n + 6], window[n + 6]);
+            }
+        }
+        /* overlap */
+        dst = sb_samples + sb;
+        for (n = 0; n < 18; n++) {
+            dst[n * SBLIMIT] = block[n] + overlap[n];
+            overlap[n] = block[n + 18];
+        }
+        hyb += 18;
+        overlap += 18;
+    }
+
+    /* zero bands: output what was saved and clear it */
+    for (sb = nb_active; sb < SBLIMIT; sb++) {
+        dst = sb_samples + sb;
+        for (n = 0; n < 18; n++) {
+            dst[n * SBLIMIT] = overlap[n];
+            overlap[n] = 0;
+        }
+        overlap += 18;
+    }
+}
+
+#if defined(DEBUG)
+/* Debug helper: append n samples of tab to dump file number fnum.
+   The file "/tmp/out<fnum>.<hp|lp>.pcm" is opened on the first call
+   for a given fnum and kept open. Each sample is rescaled to 23
+   fractional bits and written as a raw INT32. For fnum 0 the samples
+   are also printed on stdout as fractional values, 18 per line,
+   together with a running position.
+   The call is ignored when fnum is outside the table of open files or
+   when the file cannot be opened. */
+void sample_dump(int fnum, INT32 *tab, int n)
+{
+    static FILE *files[16];
+    FILE *f;
+    char buf[512];
+    int i;
+    INT32 v;
+
+    /* fnum indexes files[]: refuse anything outside the table */
+    if (fnum < 0 || fnum >= (int)(sizeof(files) / sizeof(files[0])))
+        return;
+
+    f = files[fnum];
+    if (!f) {
+        sprintf(buf, "/tmp/out%d.%s.pcm", 
+                fnum, 
+#ifdef USE_HIGHPRECISION
+                "hp"
+#else
+                "lp"
+#endif
+                );
+        /* raw samples are written: open in binary mode */
+        f = fopen(buf, "wb");
+        if (!f)
+            return;
+        files[fnum] = f;
+    }
+
+    if (fnum == 0) {
+        static int pos = 0;
+        printf("pos=%d\n", pos);
+        for(i=0;i<n;i++) {
+            printf(" %0.4f", (double)tab[i] / FRAC_ONE);
+            if ((i % 18) == 17)
+                printf("\n");
+        }
+        pos += n;
+    }
+    for(i=0;i<n;i++) {
+        /* normalize to 23 frac bits; a multiplication is used because
+           left shifting a negative value is undefined */
+        v = tab[i] * (1 << (23 - FRAC_BITS));
+        fwrite(&v, 1, sizeof(INT32), f);
+    }
+}
+#endif
+
+
+/* main layer3 decoding function.
+   Steps, in order:
+   1. read the side info of every granule/channel from s->gb;
+   2. call seek_to_maindata() with the main_data_begin offset;
+   3. for every granule and channel: read the scale factors, derive
+      the exponents, Huffman-decode the values into g->sb_hybrid and
+      skip what remains of the part2_3_length bits;
+   4. for every granule: stereo processing (2 channels only), then per
+      channel reordering, antialiasing and IMDCT into s->sb_samples.
+   Returns nb_granules * 18 on success, -1 when a block split with
+   block type 0 is signalled, when huffman_decode() fails or when more
+   bits than part2_3_length were consumed. */
 static int mp_decode_layer3(MPADecodeContext *s)
 {
     int nb_granules, main_data_begin, private_bits;
     int gr, ch, blocksplit_flag, i, j, k, n, bits_pos, bits_left;
     GranuleDef granules[2][2], *g;
+    /* scratch array, recomputed for every granule/channel */
     INT16 exponents[576];
 
     /* read side info */
+    /* private_bits is read to advance the bit reader; its value is
+       not used in this function */
     if (s->lsf) {
         main_data_begin = get_bits(&s->gb, 8);
         if (s->nb_channels == 2)
             private_bits = get_bits(&s->gb, 2);
         else
             private_bits = get_bits(&s->gb, 1);
         nb_granules = 1;
     } else {
         main_data_begin = get_bits(&s->gb, 9);
         if (s->nb_channels == 2)
             private_bits = get_bits(&s->gb, 3);
         else
             private_bits = get_bits(&s->gb, 5);
         nb_granules = 2;
         for(ch=0;ch<s->nb_channels;ch++) {
             granules[ch][0].scfsi = 0; /* all scale factors are transmitted */
             granules[ch][1].scfsi = get_bits(&s->gb, 4);
         }
     }
     
     for(gr=0;gr<nb_granules;gr++) {
         for(ch=0;ch<s->nb_channels;ch++) {
             dprintf("gr=%d ch=%d: side_info\n", gr, ch);
             g = &granules[ch][gr];
             g->part2_3_length = get_bits(&s->gb, 12);
             g->big_values = get_bits(&s->gb, 9);
             g->global_gain = get_bits(&s->gb, 8);
             /* if MS stereo only is selected, we precompute the
                1/sqrt(2) renormalization factor */
             if ((s->mode_ext & (MODE_EXT_MS_STEREO | MODE_EXT_I_STEREO)) == 
                 MODE_EXT_MS_STEREO)
                 g->global_gain -= 2;
             if (s->lsf)
                 g->scalefac_compress = get_bits(&s->gb, 9);
             else
                 g->scalefac_compress = get_bits(&s->gb, 4);
             blocksplit_flag = get_bits(&s->gb, 1);
             if (blocksplit_flag) {
                 g->block_type = get_bits(&s->gb, 2);
+                /* block type 0 together with the block split flag is
+                   rejected */
                 if (g->block_type == 0)
                     return -1;
                 g->switch_point = get_bits(&s->gb, 1);
+                /* only two table selectors are read in this case;
+                   table_select[2] is not written here */
                 for(i=0;i<2;i++)
                     g->table_select[i] = get_bits(&s->gb, 5);
                 for(i=0;i<3;i++) 
                     g->subblock_gain[i] = get_bits(&s->gb, 3);
                 /* compute huffman coded region sizes */
+                /* at this point region_size[] holds end offsets,
+                   counted in pairs of values (hence the / 2) */
                 if (g->block_type == 2)
                     g->region_size[0] = (36 / 2);
                 else {
                     if (s->sample_rate_index <= 2) 
                         g->region_size[0] = (36 / 2);
                     else if (s->sample_rate_index != 8) 
                         g->region_size[0] = (54 / 2);
                     else
                         g->region_size[0] = (108 / 2);
                 }
                 g->region_size[1] = (576 / 2);
             } else {
                 int region_address1, region_address2, l;
                 g->block_type = 0;
                 g->switch_point = 0;
                 for(i=0;i<3;i++)
                     g->table_select[i] = get_bits(&s->gb, 5);
                 /* compute huffman coded region sizes */
                 region_address1 = get_bits(&s->gb, 4);
                 region_address2 = get_bits(&s->gb, 3);
                 dprintf("region1=%d region2=%d\n", 
                         region_address1, region_address2);
+                /* region ends are taken from the long band index
+                   table and halved (counted in pairs) */
                 g->region_size[0] = 
                     band_index_long[s->sample_rate_index][region_address1 + 1] >> 1;
                 l = region_address1 + region_address2 + 2;
                 /* should not overflow */
                 if (l > 22)
                     l = 22;
                 g->region_size[1] = 
                     band_index_long[s->sample_rate_index][l] >> 1;
             }
             /* convert region offsets to region sizes and truncate
                size to big_values */
             g->region_size[2] = (576 / 2);
             j = 0;
             for(i=0;i<3;i++) {
                 k = g->region_size[i];
                 if (k > g->big_values)
                     k = g->big_values;
                 g->region_size[i] = k - j;
                 j = k;
             }
 
             /* compute band indexes */
             if (g->block_type == 2) {
                 if (g->switch_point) {
                     /* if switched mode, we handle the 36 first samples as
                        long blocks.  For 8000Hz, we handle the 48 first
                        exponents as long blocks (XXX: check this!) */
                     if (s->sample_rate_index <= 2)
                         g->long_end = 8;
                     else if (s->sample_rate_index != 8)
                         g->long_end = 6;
                     else
                         g->long_end = 4; /* 8000 Hz */
                     
                     if (s->sample_rate_index != 8)
                         g->short_start = 3;
                     else
                         g->short_start = 2; 
                 } else {
+                    /* short blocks only */
                     g->long_end = 0;
                     g->short_start = 0;
                 }
             } else {
+                /* long blocks only */
                 g->short_start = 13;
                 g->long_end = 22;
             }
             
+            /* the preflag bit is only read when s->lsf is not set;
+               it may be forced to 1 later while reading the LSF scale
+               factors */
             g->preflag = 0;
             if (!s->lsf)
                 g->preflag = get_bits(&s->gb, 1);
             g->scalefac_scale = get_bits(&s->gb, 1);
             g->count1table_select = get_bits(&s->gb, 1);
             dprintf("block_type=%d switch_point=%d\n",
                     g->block_type, g->switch_point);
         }
     }
 
     /* now we get bits from the main_data_begin offset */
     dprintf("seekback: %d\n", main_data_begin);
     seek_to_maindata(s, main_data_begin);
 
     for(gr=0;gr<nb_granules;gr++) {
         for(ch=0;ch<s->nb_channels;ch++) {
             g = &granules[ch][gr];
             
+            /* remember where this granule starts so that the bits
+               consumed can be checked against part2_3_length */
             bits_pos = get_bits_count(&s->gb);
             
             if (!s->lsf) {
                 UINT8 *sc;
                 int slen, slen1, slen2;
 
                 /* MPEG1 scale factors */
                 slen1 = slen_table[0][g->scalefac_compress];
                 slen2 = slen_table[1][g->scalefac_compress];
                 dprintf("slen1=%d slen2=%d\n", slen1, slen2);
                 if (g->block_type == 2) {
+                    /* 17 or 18 values of slen1 bits, 18 values of
+                       slen2 bits, then 3 zero entries */
                     n = g->switch_point ? 17 : 18;
                     j = 0;
                     for(i=0;i<n;i++)
                         g->scale_factors[j++] = get_bitsz(&s->gb, slen1);
                     for(i=0;i<18;i++)
                         g->scale_factors[j++] = get_bitsz(&s->gb, slen2);
                     for(i=0;i<3;i++)
                         g->scale_factors[j++] = 0;
                 } else {
+                    /* 4 groups of 6, 5, 5 and 5 scale factors; a group
+                       whose scfsi bit is set is copied from granule 0
+                       of the same channel instead of being read */
                     sc = granules[ch][0].scale_factors;
                     j = 0;
                     for(k=0;k<4;k++) {
                         n = (k == 0 ? 6 : 5);
                         if ((g->scfsi & (0x8 >> k)) == 0) {
                             slen = (k < 2) ? slen1 : slen2;
                             for(i=0;i<n;i++)
                                 g->scale_factors[j++] = get_bitsz(&s->gb, slen);
                         } else {
                             /* simply copy from last granule */
                             for(i=0;i<n;i++) {
                                 g->scale_factors[j] = sc[j];
                                 j++;
                             }
                         }
                     }
                     g->scale_factors[j++] = 0;
                 }
 #if defined(DEBUG)
                 {
                     printf("scfsi=%x gr=%d ch=%d scale_factors:\n", 
                            g->scfsi, gr, ch);
                     for(i=0;i<j;i++)
                         printf(" %d", g->scale_factors[i]);
                     printf("\n");
                 }
 #endif
             } else {
                 int tindex, tindex2, slen[4], sl, sf;
 
                 /* LSF scale factors */
+                /* tindex selects the block layout: 0 = not block type
+                   2, 1 = block type 2, 2 = block type 2 with switch
+                   point */
                 if (g->block_type == 2) {
                     tindex = g->switch_point ? 2 : 1;
                 } else {
                     tindex = 0;
                 }
+                /* scalefac_compress is expanded into the 4 bit lengths
+                   slen[]; tindex2 records which range it fell in */
                 sf = g->scalefac_compress;
                 if ((s->mode_ext & MODE_EXT_I_STEREO) && ch == 1) {
                     /* intensity stereo case */
                     sf >>= 1;
                     if (sf < 180) {
                         lsf_sf_expand(slen, sf, 6, 6, 0);
                         tindex2 = 3;
                     } else if (sf < 244) {
                         lsf_sf_expand(slen, sf - 180, 4, 4, 0);
                         tindex2 = 4;
                     } else {
                         lsf_sf_expand(slen, sf - 244, 3, 0, 0);
                         tindex2 = 5;
                     }
                 } else {
                     /* normal case */
                     if (sf < 400) {
                         lsf_sf_expand(slen, sf, 5, 4, 4);
                         tindex2 = 0;
                     } else if (sf < 500) {
                         lsf_sf_expand(slen, sf - 400, 5, 4, 0);
                         tindex2 = 1;
                     } else {
                         lsf_sf_expand(slen, sf - 500, 3, 0, 0);
                         tindex2 = 2;
                         g->preflag = 1;
                     }
                 }
 
+                /* read 4 groups of scale factors; group k has
+                   lsf_nsf_table[tindex2][tindex][k] entries of slen[k]
+                   bits each */
                 j = 0;
                 for(k=0;k<4;k++) {
                     n = lsf_nsf_table[tindex2][tindex][k];
                     sl = slen[k];
                     for(i=0;i<n;i++)
                         g->scale_factors[j++] = get_bitsz(&s->gb, sl);
                 }
                 /* XXX: should compute exact size */
                 for(;j<40;j++)
                     g->scale_factors[j] = 0;
 #if defined(DEBUG)
                 {
                     printf("gr=%d ch=%d scale_factors:\n", 
                            gr, ch);
                     for(i=0;i<40;i++)
                         printf(" %d", g->scale_factors[i]);
                     printf("\n");
                 }
 #endif
             }
 
             exponents_from_scale_factors(s, g, exponents);
 
             /* read Huffman coded residue */
             if (huffman_decode(s, g, exponents,
                                bits_pos + g->part2_3_length) < 0)
                 return -1;
 #if defined(DEBUG)
             sample_dump(0, g->sb_hybrid, 576);
 #endif
 
             /* skip extension bits */
             bits_left = g->part2_3_length - (get_bits_count(&s->gb) - bits_pos);
+            /* more bits consumed than part2_3_length announced */
             if (bits_left < 0) {
                 dprintf("bits_left=%d\n", bits_left);
                 return -1;
             }
+            /* NOTE(review): skipped in chunks of 16, presumably
+               because of a limit on the skip_bits() count -- confirm */
             while (bits_left >= 16) {
                 skip_bits(&s->gb, 16);
                 bits_left -= 16;
             }
             if (bits_left > 0)
                 skip_bits(&s->gb, bits_left);
         } /* ch */
 
+        /* stereo processing needs both channels of the granule, so it
+           runs once all channels have been decoded */
         if (s->nb_channels == 2)
             compute_stereo(s, &granules[0][gr], &granules[1][gr]);
 
         for(ch=0;ch<s->nb_channels;ch++) {
             g = &granules[ch][gr];
 
             reorder_block(s, g);
 #if defined(DEBUG)
             sample_dump(0, g->sb_hybrid, 576);
 #endif
             compute_antialias(s, g);
 #if defined(DEBUG)
             sample_dump(1, g->sb_hybrid, 576);
 #endif
+            /* granule gr fills rows 18 * gr .. 18 * gr + 17 of
+               s->sb_samples[ch] */
             compute_imdct(s, g, &s->sb_samples[ch][18 * gr][0], s->mdct_buf[ch]); 
 #if defined(DEBUG)
             sample_dump(2, &s->sb_samples[ch][18 * gr][0], 576);
 #endif
         }
     } /* gr */
     return nb_granules * 18;
 }
+
+/* Decode the frame stored in s->inbuf (header included) and write the
+   resulting PCM samples, interleaved by channel, to 'samples'.
+   The layer specific decoder fills s->sb_samples and returns the
+   number of rows of 32 subband values it produced; each row is then
+   run through the synthesis filter, giving 32 samples per channel.
+   Returns the number of bytes written to 'samples', or -1 when the
+   layer decoder reports an error (nothing is written in that case). */
+static int mp_decode_frame(MPADecodeContext *s, 
+                           short *samples)
+{
+    int i, nb_frames, ch;
+    short *samples_ptr;
+
+    init_get_bits(&s->gb, s->inbuf + HEADER_SIZE, 
+                  s->inbuf_ptr - s->inbuf - HEADER_SIZE);
+    
+    /* skip error protection field */
+    if (s->error_protection)
+        get_bits(&s->gb, 16);
+
+    dprintf("frame %d:\n", s->frame_count);
+    switch(s->layer) {
+    case 1:
+        nb_frames = mp_decode_layer1(s);
+        break;
+    case 2:
+        nb_frames = mp_decode_layer2(s);
+        break;
+    case 3:
+    default:
+        nb_frames = mp_decode_layer3(s);
+        break;
+    }
+    /* report a decoding error explicitly: multiplying a negative count
+       by sizeof() below would go through unsigned arithmetic and
+       return a meaningless size */
+    if (nb_frames < 0)
+        return -1;
+#if defined(DEBUG)
+    for(i=0;i<nb_frames;i++) {
+        for(ch=0;ch<s->nb_channels;ch++) {
+            int j;
+            printf("%d-%d:", i, ch);
+            for(j=0;j<SBLIMIT;j++)
+                printf(" %0.6f", (double)s->sb_samples[ch][i][j] / FRAC_ONE);
+            printf("\n");
+        }
+    }
+#endif
+    /* apply the synthesis filter */
+    for(ch=0;ch<s->nb_channels;ch++) {
+        /* channel ch starts at offset ch and advances by nb_channels
+           per sample, i.e. the output is interleaved */
+        samples_ptr = samples + ch;
+        for(i=0;i<nb_frames;i++) {
+            synth_filter(s, ch, samples_ptr, s->nb_channels,
+                         s->sb_samples[ch][i]);
+            samples_ptr += 32 * s->nb_channels;
+        }
+    }
+#ifdef DEBUG
+    s->frame_count++;
+#endif
+    return nb_frames * 32 * (int)sizeof(short) * s->nb_channels;
+}
+
+/* Decoder entry point: accumulate input bytes in s->inbuf until one
+   complete frame is available, then decode it.
+   - buf / buf_size: input bytes; any amount may be passed, the frame
+     is reassembled across calls.
+   - data: output buffer for the decoded 16 bit samples.
+   - *data_size: set to the number of bytes written to data, or left
+     at 0 when no frame was decoded during this call or when decoding
+     the frame failed.
+   Returns the number of bytes consumed from buf. At most one frame is
+   decoded per call; the caller passes the remaining bytes again.
+
+   s->frame_size drives the state machine:
+     0  : no header seen yet, look for one;
+     -1 : free format, look for the next header to get the frame size;
+     >0 : header parsed, collect frame_size bytes and then decode. */
+static int decode_frame(AVCodecContext * avctx,
+                        void *data, int *data_size,
+                        UINT8 * buf, int buf_size)
+{
+    MPADecodeContext *s = avctx->priv_data;
+    UINT32 header;
+    UINT8 *buf_ptr;
+    int len, out_size;
+    short *out_samples = data;
+
+    *data_size = 0;
+    buf_ptr = buf;
+    while (buf_size > 0) {
+        /* number of bytes already stored in inbuf */
+        len = s->inbuf_ptr - s->inbuf;
+        if (s->frame_size == 0) {
+            /* special case for next header for first frame in free
+               format case (XXX: find a simpler method) */
+            if (s->free_format_next_header != 0) {
+                s->inbuf[0] = s->free_format_next_header >> 24;
+                s->inbuf[1] = s->free_format_next_header >> 16;
+                s->inbuf[2] = s->free_format_next_header >> 8;
+                s->inbuf[3] = s->free_format_next_header;
+                s->inbuf_ptr = s->inbuf + 4;
+                s->free_format_next_header = 0;
+                goto got_header;
+            }
+            /* no header seen : find one. We need at least HEADER_SIZE
+               bytes to parse it */
+            len = HEADER_SIZE - len;
+            if (len > buf_size)
+                len = buf_size;
+            if (len > 0) {
+                memcpy(s->inbuf_ptr, buf_ptr, len);
+                buf_ptr += len;
+                buf_size -= len;
+                s->inbuf_ptr += len;
+            }
+            if ((s->inbuf_ptr - s->inbuf) >= HEADER_SIZE) {
+            got_header:
+                header = (s->inbuf[0] << 24) | (s->inbuf[1] << 16) |
+                    (s->inbuf[2] << 8) | s->inbuf[3];
+
+                if (check_header(header) < 0) {
+                    /* no sync found : move by one byte (inefficient, but
+                       simple!). Source and destination overlap, so
+                       memmove is required */
+                    memmove(s->inbuf, s->inbuf + 1, s->inbuf_ptr - s->inbuf - 1);
+                    s->inbuf_ptr--;
+                    dprintf("skip %x\n", header);
+                    /* reset free format frame size to give a chance
+                       to get a new bitrate */
+                    s->free_format_frame_size = 0;
+                } else {
+                    if (decode_header(s, header) == 1) {
+                        /* free format: prepare to compute frame size */
+                        s->frame_size = -1;
+                    }
+                    /* update codec info */
+                    avctx->sample_rate = s->sample_rate;
+                    avctx->channels = s->nb_channels;
+                    avctx->bit_rate = s->bit_rate;
+                    avctx->frame_size = s->frame_size;
+                }
+            }
+        } else if (s->frame_size == -1) {
+            /* free format : find next sync to compute frame size */
+            len = MPA_MAX_CODED_FRAME_SIZE - len;
+            if (len > buf_size)
+                len = buf_size;
+            if (len == 0) {
+                /* frame too long: resync (overlapping move, hence
+                   memmove) */
+                s->frame_size = 0;
+                memmove(s->inbuf, s->inbuf + 1, s->inbuf_ptr - s->inbuf - 1);
+                s->inbuf_ptr--;
+            } else {
+                UINT8 *p, *pend;
+                UINT32 header1;
+                int padding;
+
+                memcpy(s->inbuf_ptr, buf_ptr, len);
+                /* check for header; the scan starts 3 bytes before the
+                   new data so that a header straddling the previous
+                   chunk is found too */
+                p = s->inbuf_ptr - 3;
+                pend = s->inbuf_ptr + len - 4;
+                while (p <= pend) {
+                    header = (p[0] << 24) | (p[1] << 16) |
+                        (p[2] << 8) | p[3];
+                    header1 = (s->inbuf[0] << 24) | (s->inbuf[1] << 16) |
+                        (s->inbuf[2] << 8) | s->inbuf[3];
+                    /* check with high probability that we have a
+                       valid header */
+                    if ((header & SAME_HEADER_MASK) ==
+                        (header1 & SAME_HEADER_MASK)) {
+                        /* header found: update pointers */
+                        len = (p + 4) - s->inbuf_ptr;
+                        buf_ptr += len;
+                        buf_size -= len;
+                        s->inbuf_ptr = p;
+                        /* compute frame size */
+                        s->free_format_next_header = header;
+                        s->free_format_frame_size = s->inbuf_ptr - s->inbuf;
+                        padding = (header1 >> 9) & 1;
+                        if (s->layer == 1)
+                            s->free_format_frame_size -= padding * 4;
+                        else
+                            s->free_format_frame_size -= padding;
+                        dprintf("free frame size=%d padding=%d\n", 
+                                s->free_format_frame_size, padding);
+                        decode_header(s, header1);
+                        goto next_data;
+                    }
+                    p++;
+                }
+                /* not found: simply increase pointers */
+                buf_ptr += len;
+                s->inbuf_ptr += len;
+                buf_size -= len;
+            }
+        } else if (len < s->frame_size) {
+            /* frame incomplete: copy as much as is available */
+            if (s->frame_size > MPA_MAX_CODED_FRAME_SIZE)
+                s->frame_size = MPA_MAX_CODED_FRAME_SIZE;
+            len = s->frame_size - len;
+            if (len > buf_size)
+                len = buf_size;
+            memcpy(s->inbuf_ptr, buf_ptr, len);
+            buf_ptr += len;
+            s->inbuf_ptr += len;
+            buf_size -= len;
+        } else {
+            /* frame complete: decode it and restart header search */
+            out_size = mp_decode_frame(s, out_samples);
+            s->inbuf_ptr = s->inbuf;
+            s->frame_size = 0;
+            /* a negative size signals a decoding error: keep
+               *data_size at 0 instead of passing it to the caller */
+            if (out_size >= 0)
+                *data_size = out_size;
+            break;
+        }
+    next_data:
+        ;
+    }
+    return buf_ptr - buf;
+}
+
+/* Codec descriptor named "mp2" (audio, CODEC_ID_MP2). It uses the
+   same context size, decode_init and decode_frame as mp3_decoder.
+   NOTE(review): positional initializer -- the field order is
+   presumably name, type, id, priv_data_size, init, encode, close,
+   decode; confirm against the AVCodec definition in avcodec.h. */
 AVCodec mp2_decoder =
 {
     "mp2",
     CODEC_TYPE_AUDIO,
     CODEC_ID_MP2,
     sizeof(MPADecodeContext),
     decode_init,
     NULL,
     NULL,
     decode_frame,
 };
+
+/* Codec descriptor named "mp3" (audio, CODEC_ID_MP3LAME). It uses the
+   same context size, decode_init and decode_frame as mp2_decoder.
+   NOTE(review): positional initializer -- the field order is
+   presumably name, type, id, priv_data_size, init, encode, close,
+   decode; confirm against the AVCodec definition in avcodec.h. */
 AVCodec mp3_decoder =
 {
     "mp3",
     CODEC_TYPE_AUDIO,
     CODEC_ID_MP3LAME,
     sizeof(MPADecodeContext),
     decode_init,
     NULL,
     NULL,
     decode_frame,
 };
+
+/* Undefine the macros C1..C8, FRAC_BITS and HEADER_SIZE at the end of
+   the file. NOTE(review): presumably done so that these short names
+   cannot clash with other sources built together with this one --
+   confirm. */
 #undef C1
 #undef C2
 #undef C3
 #undef C4
 #undef C5
 #undef C6
 #undef C7
 #undef C8
 #undef FRAC_BITS
 #undef HEADER_SIZE
diff --git a/src/libffmpeg/libavcodec/mpegaudiodectab.h b/src/libffmpeg/libavcodec/mpegaudiodectab.h
new file mode 100644
index 000000000..7cec73ebe
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mpegaudiodectab.h
@@ -0,0 +1,770 @@
+
+const UINT16 mpa_bitrate_tab[2][3][15] = {
+    { {0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448 },
+      {0, 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384 },
+      {0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 } },
+    { {0, 32, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 192, 224, 256},
+      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160},
+      {0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160}
+    }
+};
+
+const UINT16 mpa_freq_tab[3] = { 44100, 48000, 32000 };
+
+/*******************************************************/
+/* half mpeg encoding window (full precision) */
+const INT32 mpa_enwindow[257] = {
+     0,    -1,    -1,    -1,    -1,    -1,    -1,    -2,
+    -2,    -2,    -2,    -3,    -3,    -4,    -4,    -5,
+    -5,    -6,    -7,    -7,    -8,    -9,   -10,   -11,
+   -13,   -14,   -16,   -17,   -19,   -21,   -24,   -26,
+   -29,   -31,   -35,   -38,   -41,   -45,   -49,   -53,
+   -58,   -63,   -68,   -73,   -79,   -85,   -91,   -97,
+  -104,  -111,  -117,  -125,  -132,  -139,  -147,  -154,
+  -161,  -169,  -176,  -183,  -190,  -196,  -202,  -208,
+   213,   218,   222,   225,   227,   228,   228,   227,
+   224,   221,   215,   208,   200,   189,   177,   163,
+   146,   127,   106,    83,    57,    29,    -2,   -36,
+   -72,  -111,  -153,  -197,  -244,  -294,  -347,  -401,
+  -459,  -519,  -581,  -645,  -711,  -779,  -848,  -919,
+  -991, -1064, -1137, -1210, -1283, -1356, -1428, -1498,
+ -1567, -1634, -1698, -1759, -1817, -1870, -1919, -1962,
+ -2001, -2032, -2057, -2075, -2085, -2087, -2080, -2063,
+  2037,  2000,  1952,  1893,  1822,  1739,  1644,  1535,
+  1414,  1280,  1131,   970,   794,   605,   402,   185,
+   -45,  -288,  -545,  -814, -1095, -1388, -1692, -2006,
+ -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788,
+ -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597,
+ -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585,
+ -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750,
+ -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134,
+  6574,  5959,  5288,  4561,  3776,  2935,  2037,  1082,
+    70,  -998, -2122, -3300, -4533, -5818, -7154, -8540,
+ -9975,-11455,-12980,-14548,-16155,-17799,-19478,-21189,
+-22929,-24694,-26482,-28289,-30112,-31947,-33791,-35640,
+-37489,-39336,-41176,-43006,-44821,-46617,-48390,-50137,
+-51853,-53534,-55178,-56778,-58333,-59838,-61289,-62684,
+-64019,-65290,-66494,-67629,-68692,-69679,-70590,-71420,
+-72169,-72835,-73415,-73908,-74313,-74630,-74856,-74992,
+ 75038,
+};
+
+/*******************************************************/
+/* layer 2 tables */
+
+const int sblimit_table[5] = { 27 , 30 , 8, 12 , 30 };
+
+const int quant_steps[17] = {
+    3,     5,    7,    9,    15,
+    31,    63,  127,  255,   511,
+    1023,  2047, 4095, 8191, 16383,
+    32767, 65535
+};
+
+/* we use a negative value if grouped */
+const int quant_bits[17] = {
+    -5,  -7,  3, -10, 4, 
+     5,  6,  7,  8,  9,
+    10, 11, 12, 13, 14,
+    15, 16 
+};
+
+/* encoding tables which give the quantization index. Note how it is
+   possible to store them efficiently ! */
+static const unsigned char alloc_table_0[] = {
+ 4,  0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 
+ 4,  0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 
+ 4,  0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+};
+
+static const unsigned char alloc_table_1[] = {
+ 4,  0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 
+ 4,  0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 
+ 4,  0,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 3,  0,  1,  2,  3,  4,  5, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+ 2,  0,  1, 16, 
+};
+
+static const unsigned char alloc_table_2[] = {
+ 4,  0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 
+ 4,  0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+};
+
+static const unsigned char alloc_table_3[] = {
+ 4,  0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 
+ 4,  0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+};
+
+static const unsigned char alloc_table_4[] = {
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 
+ 4,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 3,  0,  1,  3,  4,  5,  6,  7, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+ 2,  0,  1,  3, 
+};
+
+const unsigned char *alloc_tables[5] = 
+{ alloc_table_0, alloc_table_1, alloc_table_2, alloc_table_3, alloc_table_4, };
+
+/*******************************************************/
+/* layer 3 tables */
+
+/* layer3 scale factor size */
+static const UINT8 slen_table[2][16] = {
+    { 0, 0, 0, 0, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+    { 0, 1, 2, 3, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3 },
+};
+
+/* number of lsf scale factors for a given size */
+static const UINT8 lsf_nsf_table[6][3][4] = {
+    { {  6,  5,  5, 5 }, {  9,  9,  9, 9 }, {  6,  9,  9, 9 } },
+    { {  6,  5,  7, 3 }, {  9,  9, 12, 6 }, {  6,  9, 12, 6 } },
+    { { 11, 10,  0, 0 }, { 18, 18,  0, 0 }, { 15, 18,  0, 0 } },
+    { {  7,  7,  7, 0 }, { 12, 12, 12, 0 }, {  6, 15, 12, 0 } }, 
+    { {  6,  6,  6, 3 }, { 12,  9,  9, 6 }, {  6, 12,  9, 6 } },
+    { {  8,  8,  5, 0 }, { 15, 12,  9, 0 }, {  6, 18,  9, 0 } },
+};
+
+/* mpegaudio layer 3 huffman tables */
+
+const UINT16 mpa_huffcodes_1[4] = {
+ 0x0001, 0x0001, 0x0001, 0x0000,
+};
+
+const UINT8 mpa_huffbits_1[4] = {
+  1,  3,  2,  3,
+};
+
+const UINT16 mpa_huffcodes_2[9] = {
+ 0x0001, 0x0002, 0x0001, 0x0003, 0x0001, 0x0001, 0x0003, 0x0002,
+ 0x0000,
+};
+
+const UINT8 mpa_huffbits_2[9] = {
+  1,  3,  6,  3,  3,  5,  5,  5,
+  6,
+};
+
+const UINT16 mpa_huffcodes_3[9] = {
+ 0x0003, 0x0002, 0x0001, 0x0001, 0x0001, 0x0001, 0x0003, 0x0002,
+ 0x0000,
+};
+
+const UINT8 mpa_huffbits_3[9] = {
+  2,  2,  6,  3,  2,  5,  5,  5,
+  6,
+};
+
+const UINT16 mpa_huffcodes_5[16] = {
+ 0x0001, 0x0002, 0x0006, 0x0005, 0x0003, 0x0001, 0x0004, 0x0004,
+ 0x0007, 0x0005, 0x0007, 0x0001, 0x0006, 0x0001, 0x0001, 0x0000,
+};
+
+const UINT8 mpa_huffbits_5[16] = {
+  1,  3,  6,  7,  3,  3,  6,  7,
+  6,  6,  7,  8,  7,  6,  7,  8,
+};
+
+const UINT16 mpa_huffcodes_6[16] = {
+ 0x0007, 0x0003, 0x0005, 0x0001, 0x0006, 0x0002, 0x0003, 0x0002,
+ 0x0005, 0x0004, 0x0004, 0x0001, 0x0003, 0x0003, 0x0002, 0x0000,
+};
+
+const UINT8 mpa_huffbits_6[16] = {
+  3,  3,  5,  7,  3,  2,  4,  5,
+  4,  4,  5,  6,  6,  5,  6,  7,
+};
+
+const UINT16 mpa_huffcodes_7[36] = {
+ 0x0001, 0x0002, 0x000a, 0x0013, 0x0010, 0x000a, 0x0003, 0x0003,
+ 0x0007, 0x000a, 0x0005, 0x0003, 0x000b, 0x0004, 0x000d, 0x0011,
+ 0x0008, 0x0004, 0x000c, 0x000b, 0x0012, 0x000f, 0x000b, 0x0002,
+ 0x0007, 0x0006, 0x0009, 0x000e, 0x0003, 0x0001, 0x0006, 0x0004,
+ 0x0005, 0x0003, 0x0002, 0x0000,
+};
+
+const UINT8 mpa_huffbits_7[36] = {
+  1,  3,  6,  8,  8,  9,  3,  4,
+  6,  7,  7,  8,  6,  5,  7,  8,
+  8,  9,  7,  7,  8,  9,  9,  9,
+  7,  7,  8,  9,  9, 10,  8,  8,
+  9, 10, 10, 10,
+};
+
+const UINT16 mpa_huffcodes_8[36] = {
+ 0x0003, 0x0004, 0x0006, 0x0012, 0x000c, 0x0005, 0x0005, 0x0001,
+ 0x0002, 0x0010, 0x0009, 0x0003, 0x0007, 0x0003, 0x0005, 0x000e,
+ 0x0007, 0x0003, 0x0013, 0x0011, 0x000f, 0x000d, 0x000a, 0x0004,
+ 0x000d, 0x0005, 0x0008, 0x000b, 0x0005, 0x0001, 0x000c, 0x0004,
+ 0x0004, 0x0001, 0x0001, 0x0000,
+};
+
+const UINT8 mpa_huffbits_8[36] = {
+  2,  3,  6,  8,  8,  9,  3,  2,
+  4,  8,  8,  8,  6,  4,  6,  8,
+  8,  9,  8,  8,  8,  9,  9, 10,
+  8,  7,  8,  9, 10, 10,  9,  8,
+  9,  9, 11, 11,
+};
+
+const UINT16 mpa_huffcodes_9[36] = {
+ 0x0007, 0x0005, 0x0009, 0x000e, 0x000f, 0x0007, 0x0006, 0x0004,
+ 0x0005, 0x0005, 0x0006, 0x0007, 0x0007, 0x0006, 0x0008, 0x0008,
+ 0x0008, 0x0005, 0x000f, 0x0006, 0x0009, 0x000a, 0x0005, 0x0001,
+ 0x000b, 0x0007, 0x0009, 0x0006, 0x0004, 0x0001, 0x000e, 0x0004,
+ 0x0006, 0x0002, 0x0006, 0x0000,
+};
+
+const UINT8 mpa_huffbits_9[36] = {
+  3,  3,  5,  6,  8,  9,  3,  3,
+  4,  5,  6,  8,  4,  4,  5,  6,
+  7,  8,  6,  5,  6,  7,  7,  8,
+  7,  6,  7,  7,  8,  9,  8,  7,
+  8,  8,  9,  9,
+};
+
+const UINT16 mpa_huffcodes_10[64] = {
+ 0x0001, 0x0002, 0x000a, 0x0017, 0x0023, 0x001e, 0x000c, 0x0011,
+ 0x0003, 0x0003, 0x0008, 0x000c, 0x0012, 0x0015, 0x000c, 0x0007,
+ 0x000b, 0x0009, 0x000f, 0x0015, 0x0020, 0x0028, 0x0013, 0x0006,
+ 0x000e, 0x000d, 0x0016, 0x0022, 0x002e, 0x0017, 0x0012, 0x0007,
+ 0x0014, 0x0013, 0x0021, 0x002f, 0x001b, 0x0016, 0x0009, 0x0003,
+ 0x001f, 0x0016, 0x0029, 0x001a, 0x0015, 0x0014, 0x0005, 0x0003,
+ 0x000e, 0x000d, 0x000a, 0x000b, 0x0010, 0x0006, 0x0005, 0x0001,
+ 0x0009, 0x0008, 0x0007, 0x0008, 0x0004, 0x0004, 0x0002, 0x0000,
+};
+
+const UINT8 mpa_huffbits_10[64] = {
+  1,  3,  6,  8,  9,  9,  9, 10,
+  3,  4,  6,  7,  8,  9,  8,  8,
+  6,  6,  7,  8,  9, 10,  9,  9,
+  7,  7,  8,  9, 10, 10,  9, 10,
+  8,  8,  9, 10, 10, 10, 10, 10,
+  9,  9, 10, 10, 11, 11, 10, 11,
+  8,  8,  9, 10, 10, 10, 11, 11,
+  9,  8,  9, 10, 10, 11, 11, 11,
+};
+
+const UINT16 mpa_huffcodes_11[64] = {
+ 0x0003, 0x0004, 0x000a, 0x0018, 0x0022, 0x0021, 0x0015, 0x000f,
+ 0x0005, 0x0003, 0x0004, 0x000a, 0x0020, 0x0011, 0x000b, 0x000a,
+ 0x000b, 0x0007, 0x000d, 0x0012, 0x001e, 0x001f, 0x0014, 0x0005,
+ 0x0019, 0x000b, 0x0013, 0x003b, 0x001b, 0x0012, 0x000c, 0x0005,
+ 0x0023, 0x0021, 0x001f, 0x003a, 0x001e, 0x0010, 0x0007, 0x0005,
+ 0x001c, 0x001a, 0x0020, 0x0013, 0x0011, 0x000f, 0x0008, 0x000e,
+ 0x000e, 0x000c, 0x0009, 0x000d, 0x000e, 0x0009, 0x0004, 0x0001,
+ 0x000b, 0x0004, 0x0006, 0x0006, 0x0006, 0x0003, 0x0002, 0x0000,
+};
+
+const UINT8 mpa_huffbits_11[64] = {
+  2,  3,  5,  7,  8,  9,  8,  9,
+  3,  3,  4,  6,  8,  8,  7,  8,
+  5,  5,  6,  7,  8,  9,  8,  8,
+  7,  6,  7,  9,  8, 10,  8,  9,
+  8,  8,  8,  9,  9, 10,  9, 10,
+  8,  8,  9, 10, 10, 11, 10, 11,
+  8,  7,  7,  8,  9, 10, 10, 10,
+  8,  7,  8,  9, 10, 10, 10, 10,
+};
+
+const UINT16 mpa_huffcodes_12[64] = {
+ 0x0009, 0x0006, 0x0010, 0x0021, 0x0029, 0x0027, 0x0026, 0x001a,
+ 0x0007, 0x0005, 0x0006, 0x0009, 0x0017, 0x0010, 0x001a, 0x000b,
+ 0x0011, 0x0007, 0x000b, 0x000e, 0x0015, 0x001e, 0x000a, 0x0007,
+ 0x0011, 0x000a, 0x000f, 0x000c, 0x0012, 0x001c, 0x000e, 0x0005,
+ 0x0020, 0x000d, 0x0016, 0x0013, 0x0012, 0x0010, 0x0009, 0x0005,
+ 0x0028, 0x0011, 0x001f, 0x001d, 0x0011, 0x000d, 0x0004, 0x0002,
+ 0x001b, 0x000c, 0x000b, 0x000f, 0x000a, 0x0007, 0x0004, 0x0001,
+ 0x001b, 0x000c, 0x0008, 0x000c, 0x0006, 0x0003, 0x0001, 0x0000,
+};
+
+const UINT8 mpa_huffbits_12[64] = {
+  4,  3,  5,  7,  8,  9,  9,  9,
+  3,  3,  4,  5,  7,  7,  8,  8,
+  5,  4,  5,  6,  7,  8,  7,  8,
+  6,  5,  6,  6,  7,  8,  8,  8,
+  7,  6,  7,  7,  8,  8,  8,  9,
+  8,  7,  8,  8,  8,  9,  8,  9,
+  8,  7,  7,  8,  8,  9,  9, 10,
+  9,  8,  8,  9,  9,  9,  9, 10,
+};
+
+const UINT16 mpa_huffcodes_13[256] = {
+ 0x0001, 0x0005, 0x000e, 0x0015, 0x0022, 0x0033, 0x002e, 0x0047,
+ 0x002a, 0x0034, 0x0044, 0x0034, 0x0043, 0x002c, 0x002b, 0x0013,
+ 0x0003, 0x0004, 0x000c, 0x0013, 0x001f, 0x001a, 0x002c, 0x0021,
+ 0x001f, 0x0018, 0x0020, 0x0018, 0x001f, 0x0023, 0x0016, 0x000e,
+ 0x000f, 0x000d, 0x0017, 0x0024, 0x003b, 0x0031, 0x004d, 0x0041,
+ 0x001d, 0x0028, 0x001e, 0x0028, 0x001b, 0x0021, 0x002a, 0x0010,
+ 0x0016, 0x0014, 0x0025, 0x003d, 0x0038, 0x004f, 0x0049, 0x0040,
+ 0x002b, 0x004c, 0x0038, 0x0025, 0x001a, 0x001f, 0x0019, 0x000e,
+ 0x0023, 0x0010, 0x003c, 0x0039, 0x0061, 0x004b, 0x0072, 0x005b,
+ 0x0036, 0x0049, 0x0037, 0x0029, 0x0030, 0x0035, 0x0017, 0x0018,
+ 0x003a, 0x001b, 0x0032, 0x0060, 0x004c, 0x0046, 0x005d, 0x0054,
+ 0x004d, 0x003a, 0x004f, 0x001d, 0x004a, 0x0031, 0x0029, 0x0011,
+ 0x002f, 0x002d, 0x004e, 0x004a, 0x0073, 0x005e, 0x005a, 0x004f,
+ 0x0045, 0x0053, 0x0047, 0x0032, 0x003b, 0x0026, 0x0024, 0x000f,
+ 0x0048, 0x0022, 0x0038, 0x005f, 0x005c, 0x0055, 0x005b, 0x005a,
+ 0x0056, 0x0049, 0x004d, 0x0041, 0x0033, 0x002c, 0x002b, 0x002a,
+ 0x002b, 0x0014, 0x001e, 0x002c, 0x0037, 0x004e, 0x0048, 0x0057,
+ 0x004e, 0x003d, 0x002e, 0x0036, 0x0025, 0x001e, 0x0014, 0x0010,
+ 0x0035, 0x0019, 0x0029, 0x0025, 0x002c, 0x003b, 0x0036, 0x0051,
+ 0x0042, 0x004c, 0x0039, 0x0036, 0x0025, 0x0012, 0x0027, 0x000b,
+ 0x0023, 0x0021, 0x001f, 0x0039, 0x002a, 0x0052, 0x0048, 0x0050,
+ 0x002f, 0x003a, 0x0037, 0x0015, 0x0016, 0x001a, 0x0026, 0x0016,
+ 0x0035, 0x0019, 0x0017, 0x0026, 0x0046, 0x003c, 0x0033, 0x0024,
+ 0x0037, 0x001a, 0x0022, 0x0017, 0x001b, 0x000e, 0x0009, 0x0007,
+ 0x0022, 0x0020, 0x001c, 0x0027, 0x0031, 0x004b, 0x001e, 0x0034,
+ 0x0030, 0x0028, 0x0034, 0x001c, 0x0012, 0x0011, 0x0009, 0x0005,
+ 0x002d, 0x0015, 0x0022, 0x0040, 0x0038, 0x0032, 0x0031, 0x002d,
+ 0x001f, 0x0013, 0x000c, 0x000f, 0x000a, 0x0007, 0x0006, 0x0003,
+ 0x0030, 0x0017, 0x0014, 0x0027, 0x0024, 0x0023, 0x0035, 0x0015,
+ 0x0010, 0x0017, 0x000d, 0x000a, 0x0006, 0x0001, 0x0004, 0x0002,
+ 0x0010, 0x000f, 0x0011, 0x001b, 0x0019, 0x0014, 0x001d, 0x000b,
+ 0x0011, 0x000c, 0x0010, 0x0008, 0x0001, 0x0001, 0x0000, 0x0001,
+};
+
+const UINT8 mpa_huffbits_13[256] = {
+  1,  4,  6,  7,  8,  9,  9, 10,
+  9, 10, 11, 11, 12, 12, 13, 13,
+  3,  4,  6,  7,  8,  8,  9,  9,
+  9,  9, 10, 10, 11, 12, 12, 12,
+  6,  6,  7,  8,  9,  9, 10, 10,
+  9, 10, 10, 11, 11, 12, 13, 13,
+  7,  7,  8,  9,  9, 10, 10, 10,
+ 10, 11, 11, 11, 11, 12, 13, 13,
+  8,  7,  9,  9, 10, 10, 11, 11,
+ 10, 11, 11, 12, 12, 13, 13, 14,
+  9,  8,  9, 10, 10, 10, 11, 11,
+ 11, 11, 12, 11, 13, 13, 14, 14,
+  9,  9, 10, 10, 11, 11, 11, 11,
+ 11, 12, 12, 12, 13, 13, 14, 14,
+ 10,  9, 10, 11, 11, 11, 12, 12,
+ 12, 12, 13, 13, 13, 14, 16, 16,
+  9,  8,  9, 10, 10, 11, 11, 12,
+ 12, 12, 12, 13, 13, 14, 15, 15,
+ 10,  9, 10, 10, 11, 11, 11, 13,
+ 12, 13, 13, 14, 14, 14, 16, 15,
+ 10, 10, 10, 11, 11, 12, 12, 13,
+ 12, 13, 14, 13, 14, 15, 16, 17,
+ 11, 10, 10, 11, 12, 12, 12, 12,
+ 13, 13, 13, 14, 15, 15, 15, 16,
+ 11, 11, 11, 12, 12, 13, 12, 13,
+ 14, 14, 15, 15, 15, 16, 16, 16,
+ 12, 11, 12, 13, 13, 13, 14, 14,
+ 14, 14, 14, 15, 16, 15, 16, 16,
+ 13, 12, 12, 13, 13, 13, 15, 14,
+ 14, 17, 15, 15, 15, 17, 16, 16,
+ 12, 12, 13, 14, 14, 14, 15, 14,
+ 15, 15, 16, 16, 19, 18, 19, 16,
+};
+
+const UINT16 mpa_huffcodes_15[256] = {
+ 0x0007, 0x000c, 0x0012, 0x0035, 0x002f, 0x004c, 0x007c, 0x006c,
+ 0x0059, 0x007b, 0x006c, 0x0077, 0x006b, 0x0051, 0x007a, 0x003f,
+ 0x000d, 0x0005, 0x0010, 0x001b, 0x002e, 0x0024, 0x003d, 0x0033,
+ 0x002a, 0x0046, 0x0034, 0x0053, 0x0041, 0x0029, 0x003b, 0x0024,
+ 0x0013, 0x0011, 0x000f, 0x0018, 0x0029, 0x0022, 0x003b, 0x0030,
+ 0x0028, 0x0040, 0x0032, 0x004e, 0x003e, 0x0050, 0x0038, 0x0021,
+ 0x001d, 0x001c, 0x0019, 0x002b, 0x0027, 0x003f, 0x0037, 0x005d,
+ 0x004c, 0x003b, 0x005d, 0x0048, 0x0036, 0x004b, 0x0032, 0x001d,
+ 0x0034, 0x0016, 0x002a, 0x0028, 0x0043, 0x0039, 0x005f, 0x004f,
+ 0x0048, 0x0039, 0x0059, 0x0045, 0x0031, 0x0042, 0x002e, 0x001b,
+ 0x004d, 0x0025, 0x0023, 0x0042, 0x003a, 0x0034, 0x005b, 0x004a,
+ 0x003e, 0x0030, 0x004f, 0x003f, 0x005a, 0x003e, 0x0028, 0x0026,
+ 0x007d, 0x0020, 0x003c, 0x0038, 0x0032, 0x005c, 0x004e, 0x0041,
+ 0x0037, 0x0057, 0x0047, 0x0033, 0x0049, 0x0033, 0x0046, 0x001e,
+ 0x006d, 0x0035, 0x0031, 0x005e, 0x0058, 0x004b, 0x0042, 0x007a,
+ 0x005b, 0x0049, 0x0038, 0x002a, 0x0040, 0x002c, 0x0015, 0x0019,
+ 0x005a, 0x002b, 0x0029, 0x004d, 0x0049, 0x003f, 0x0038, 0x005c,
+ 0x004d, 0x0042, 0x002f, 0x0043, 0x0030, 0x0035, 0x0024, 0x0014,
+ 0x0047, 0x0022, 0x0043, 0x003c, 0x003a, 0x0031, 0x0058, 0x004c,
+ 0x0043, 0x006a, 0x0047, 0x0036, 0x0026, 0x0027, 0x0017, 0x000f,
+ 0x006d, 0x0035, 0x0033, 0x002f, 0x005a, 0x0052, 0x003a, 0x0039,
+ 0x0030, 0x0048, 0x0039, 0x0029, 0x0017, 0x001b, 0x003e, 0x0009,
+ 0x0056, 0x002a, 0x0028, 0x0025, 0x0046, 0x0040, 0x0034, 0x002b,
+ 0x0046, 0x0037, 0x002a, 0x0019, 0x001d, 0x0012, 0x000b, 0x000b,
+ 0x0076, 0x0044, 0x001e, 0x0037, 0x0032, 0x002e, 0x004a, 0x0041,
+ 0x0031, 0x0027, 0x0018, 0x0010, 0x0016, 0x000d, 0x000e, 0x0007,
+ 0x005b, 0x002c, 0x0027, 0x0026, 0x0022, 0x003f, 0x0034, 0x002d,
+ 0x001f, 0x0034, 0x001c, 0x0013, 0x000e, 0x0008, 0x0009, 0x0003,
+ 0x007b, 0x003c, 0x003a, 0x0035, 0x002f, 0x002b, 0x0020, 0x0016,
+ 0x0025, 0x0018, 0x0011, 0x000c, 0x000f, 0x000a, 0x0002, 0x0001,
+ 0x0047, 0x0025, 0x0022, 0x001e, 0x001c, 0x0014, 0x0011, 0x001a,
+ 0x0015, 0x0010, 0x000a, 0x0006, 0x0008, 0x0006, 0x0002, 0x0000,
+};
+
+const UINT8 mpa_huffbits_15[256] = {
+  3,  4,  5,  7,  7,  8,  9,  9,
+  9, 10, 10, 11, 11, 11, 12, 13,
+  4,  3,  5,  6,  7,  7,  8,  8,
+  8,  9,  9, 10, 10, 10, 11, 11,
+  5,  5,  5,  6,  7,  7,  8,  8,
+  8,  9,  9, 10, 10, 11, 11, 11,
+  6,  6,  6,  7,  7,  8,  8,  9,
+  9,  9, 10, 10, 10, 11, 11, 11,
+  7,  6,  7,  7,  8,  8,  9,  9,
+  9,  9, 10, 10, 10, 11, 11, 11,
+  8,  7,  7,  8,  8,  8,  9,  9,
+  9,  9, 10, 10, 11, 11, 11, 12,
+  9,  7,  8,  8,  8,  9,  9,  9,
+  9, 10, 10, 10, 11, 11, 12, 12,
+  9,  8,  8,  9,  9,  9,  9, 10,
+ 10, 10, 10, 10, 11, 11, 11, 12,
+  9,  8,  8,  9,  9,  9,  9, 10,
+ 10, 10, 10, 11, 11, 12, 12, 12,
+  9,  8,  9,  9,  9,  9, 10, 10,
+ 10, 11, 11, 11, 11, 12, 12, 12,
+ 10,  9,  9,  9, 10, 10, 10, 10,
+ 10, 11, 11, 11, 11, 12, 13, 12,
+ 10,  9,  9,  9, 10, 10, 10, 10,
+ 11, 11, 11, 11, 12, 12, 12, 13,
+ 11, 10,  9, 10, 10, 10, 11, 11,
+ 11, 11, 11, 11, 12, 12, 13, 13,
+ 11, 10, 10, 10, 10, 11, 11, 11,
+ 11, 12, 12, 12, 12, 12, 13, 13,
+ 12, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 13, 13, 12, 13,
+ 12, 11, 11, 11, 11, 11, 11, 12,
+ 12, 12, 12, 12, 13, 13, 13, 13,
+};
+
+const UINT16 mpa_huffcodes_16[256] = {
+ 0x0001, 0x0005, 0x000e, 0x002c, 0x004a, 0x003f, 0x006e, 0x005d,
+ 0x00ac, 0x0095, 0x008a, 0x00f2, 0x00e1, 0x00c3, 0x0178, 0x0011,
+ 0x0003, 0x0004, 0x000c, 0x0014, 0x0023, 0x003e, 0x0035, 0x002f,
+ 0x0053, 0x004b, 0x0044, 0x0077, 0x00c9, 0x006b, 0x00cf, 0x0009,
+ 0x000f, 0x000d, 0x0017, 0x0026, 0x0043, 0x003a, 0x0067, 0x005a,
+ 0x00a1, 0x0048, 0x007f, 0x0075, 0x006e, 0x00d1, 0x00ce, 0x0010,
+ 0x002d, 0x0015, 0x0027, 0x0045, 0x0040, 0x0072, 0x0063, 0x0057,
+ 0x009e, 0x008c, 0x00fc, 0x00d4, 0x00c7, 0x0183, 0x016d, 0x001a,
+ 0x004b, 0x0024, 0x0044, 0x0041, 0x0073, 0x0065, 0x00b3, 0x00a4,
+ 0x009b, 0x0108, 0x00f6, 0x00e2, 0x018b, 0x017e, 0x016a, 0x0009,
+ 0x0042, 0x001e, 0x003b, 0x0038, 0x0066, 0x00b9, 0x00ad, 0x0109,
+ 0x008e, 0x00fd, 0x00e8, 0x0190, 0x0184, 0x017a, 0x01bd, 0x0010,
+ 0x006f, 0x0036, 0x0034, 0x0064, 0x00b8, 0x00b2, 0x00a0, 0x0085,
+ 0x0101, 0x00f4, 0x00e4, 0x00d9, 0x0181, 0x016e, 0x02cb, 0x000a,
+ 0x0062, 0x0030, 0x005b, 0x0058, 0x00a5, 0x009d, 0x0094, 0x0105,
+ 0x00f8, 0x0197, 0x018d, 0x0174, 0x017c, 0x0379, 0x0374, 0x0008,
+ 0x0055, 0x0054, 0x0051, 0x009f, 0x009c, 0x008f, 0x0104, 0x00f9,
+ 0x01ab, 0x0191, 0x0188, 0x017f, 0x02d7, 0x02c9, 0x02c4, 0x0007,
+ 0x009a, 0x004c, 0x0049, 0x008d, 0x0083, 0x0100, 0x00f5, 0x01aa,
+ 0x0196, 0x018a, 0x0180, 0x02df, 0x0167, 0x02c6, 0x0160, 0x000b,
+ 0x008b, 0x0081, 0x0043, 0x007d, 0x00f7, 0x00e9, 0x00e5, 0x00db,
+ 0x0189, 0x02e7, 0x02e1, 0x02d0, 0x0375, 0x0372, 0x01b7, 0x0004,
+ 0x00f3, 0x0078, 0x0076, 0x0073, 0x00e3, 0x00df, 0x018c, 0x02ea,
+ 0x02e6, 0x02e0, 0x02d1, 0x02c8, 0x02c2, 0x00df, 0x01b4, 0x0006,
+ 0x00ca, 0x00e0, 0x00de, 0x00da, 0x00d8, 0x0185, 0x0182, 0x017d,
+ 0x016c, 0x0378, 0x01bb, 0x02c3, 0x01b8, 0x01b5, 0x06c0, 0x0004,
+ 0x02eb, 0x00d3, 0x00d2, 0x00d0, 0x0172, 0x017b, 0x02de, 0x02d3,
+ 0x02ca, 0x06c7, 0x0373, 0x036d, 0x036c, 0x0d83, 0x0361, 0x0002,
+ 0x0179, 0x0171, 0x0066, 0x00bb, 0x02d6, 0x02d2, 0x0166, 0x02c7,
+ 0x02c5, 0x0362, 0x06c6, 0x0367, 0x0d82, 0x0366, 0x01b2, 0x0000,
+ 0x000c, 0x000a, 0x0007, 0x000b, 0x000a, 0x0011, 0x000b, 0x0009,
+ 0x000d, 0x000c, 0x000a, 0x0007, 0x0005, 0x0003, 0x0001, 0x0003,
+};
+
+const UINT8 mpa_huffbits_16[256] = {
+  1,  4,  6,  8,  9,  9, 10, 10,
+ 11, 11, 11, 12, 12, 12, 13,  9,
+  3,  4,  6,  7,  8,  9,  9,  9,
+ 10, 10, 10, 11, 12, 11, 12,  8,
+  6,  6,  7,  8,  9,  9, 10, 10,
+ 11, 10, 11, 11, 11, 12, 12,  9,
+  8,  7,  8,  9,  9, 10, 10, 10,
+ 11, 11, 12, 12, 12, 13, 13, 10,
+  9,  8,  9,  9, 10, 10, 11, 11,
+ 11, 12, 12, 12, 13, 13, 13,  9,
+  9,  8,  9,  9, 10, 11, 11, 12,
+ 11, 12, 12, 13, 13, 13, 14, 10,
+ 10,  9,  9, 10, 11, 11, 11, 11,
+ 12, 12, 12, 12, 13, 13, 14, 10,
+ 10,  9, 10, 10, 11, 11, 11, 12,
+ 12, 13, 13, 13, 13, 15, 15, 10,
+ 10, 10, 10, 11, 11, 11, 12, 12,
+ 13, 13, 13, 13, 14, 14, 14, 10,
+ 11, 10, 10, 11, 11, 12, 12, 13,
+ 13, 13, 13, 14, 13, 14, 13, 11,
+ 11, 11, 10, 11, 12, 12, 12, 12,
+ 13, 14, 14, 14, 15, 15, 14, 10,
+ 12, 11, 11, 11, 12, 12, 13, 14,
+ 14, 14, 14, 14, 14, 13, 14, 11,
+ 12, 12, 12, 12, 12, 13, 13, 13,
+ 13, 15, 14, 14, 14, 14, 16, 11,
+ 14, 12, 12, 12, 13, 13, 14, 14,
+ 14, 16, 15, 15, 15, 17, 15, 11,
+ 13, 13, 11, 12, 14, 14, 13, 14,
+ 14, 15, 16, 15, 17, 15, 14, 11,
+  9,  8,  8,  9,  9, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11,  8,
+};
+
+const UINT16 mpa_huffcodes_24[256] = {
+ 0x000f, 0x000d, 0x002e, 0x0050, 0x0092, 0x0106, 0x00f8, 0x01b2,
+ 0x01aa, 0x029d, 0x028d, 0x0289, 0x026d, 0x0205, 0x0408, 0x0058,
+ 0x000e, 0x000c, 0x0015, 0x0026, 0x0047, 0x0082, 0x007a, 0x00d8,
+ 0x00d1, 0x00c6, 0x0147, 0x0159, 0x013f, 0x0129, 0x0117, 0x002a,
+ 0x002f, 0x0016, 0x0029, 0x004a, 0x0044, 0x0080, 0x0078, 0x00dd,
+ 0x00cf, 0x00c2, 0x00b6, 0x0154, 0x013b, 0x0127, 0x021d, 0x0012,
+ 0x0051, 0x0027, 0x004b, 0x0046, 0x0086, 0x007d, 0x0074, 0x00dc,
+ 0x00cc, 0x00be, 0x00b2, 0x0145, 0x0137, 0x0125, 0x010f, 0x0010,
+ 0x0093, 0x0048, 0x0045, 0x0087, 0x007f, 0x0076, 0x0070, 0x00d2,
+ 0x00c8, 0x00bc, 0x0160, 0x0143, 0x0132, 0x011d, 0x021c, 0x000e,
+ 0x0107, 0x0042, 0x0081, 0x007e, 0x0077, 0x0072, 0x00d6, 0x00ca,
+ 0x00c0, 0x00b4, 0x0155, 0x013d, 0x012d, 0x0119, 0x0106, 0x000c,
+ 0x00f9, 0x007b, 0x0079, 0x0075, 0x0071, 0x00d7, 0x00ce, 0x00c3,
+ 0x00b9, 0x015b, 0x014a, 0x0134, 0x0123, 0x0110, 0x0208, 0x000a,
+ 0x01b3, 0x0073, 0x006f, 0x006d, 0x00d3, 0x00cb, 0x00c4, 0x00bb,
+ 0x0161, 0x014c, 0x0139, 0x012a, 0x011b, 0x0213, 0x017d, 0x0011,
+ 0x01ab, 0x00d4, 0x00d0, 0x00cd, 0x00c9, 0x00c1, 0x00ba, 0x00b1,
+ 0x00a9, 0x0140, 0x012f, 0x011e, 0x010c, 0x0202, 0x0179, 0x0010,
+ 0x014f, 0x00c7, 0x00c5, 0x00bf, 0x00bd, 0x00b5, 0x00ae, 0x014d,
+ 0x0141, 0x0131, 0x0121, 0x0113, 0x0209, 0x017b, 0x0173, 0x000b,
+ 0x029c, 0x00b8, 0x00b7, 0x00b3, 0x00af, 0x0158, 0x014b, 0x013a,
+ 0x0130, 0x0122, 0x0115, 0x0212, 0x017f, 0x0175, 0x016e, 0x000a,
+ 0x028c, 0x015a, 0x00ab, 0x00a8, 0x00a4, 0x013e, 0x0135, 0x012b,
+ 0x011f, 0x0114, 0x0107, 0x0201, 0x0177, 0x0170, 0x016a, 0x0006,
+ 0x0288, 0x0142, 0x013c, 0x0138, 0x0133, 0x012e, 0x0124, 0x011c,
+ 0x010d, 0x0105, 0x0200, 0x0178, 0x0172, 0x016c, 0x0167, 0x0004,
+ 0x026c, 0x012c, 0x0128, 0x0126, 0x0120, 0x011a, 0x0111, 0x010a,
+ 0x0203, 0x017c, 0x0176, 0x0171, 0x016d, 0x0169, 0x0165, 0x0002,
+ 0x0409, 0x0118, 0x0116, 0x0112, 0x010b, 0x0108, 0x0103, 0x017e,
+ 0x017a, 0x0174, 0x016f, 0x016b, 0x0168, 0x0166, 0x0164, 0x0000,
+ 0x002b, 0x0014, 0x0013, 0x0011, 0x000f, 0x000d, 0x000b, 0x0009,
+ 0x0007, 0x0006, 0x0004, 0x0007, 0x0005, 0x0003, 0x0001, 0x0003,
+};
+
+const UINT8 mpa_huffbits_24[256] = {
+  4,  4,  6,  7,  8,  9,  9, 10,
+ 10, 11, 11, 11, 11, 11, 12,  9,
+  4,  4,  5,  6,  7,  8,  8,  9,
+  9,  9, 10, 10, 10, 10, 10,  8,
+  6,  5,  6,  7,  7,  8,  8,  9,
+  9,  9,  9, 10, 10, 10, 11,  7,
+  7,  6,  7,  7,  8,  8,  8,  9,
+  9,  9,  9, 10, 10, 10, 10,  7,
+  8,  7,  7,  8,  8,  8,  8,  9,
+  9,  9, 10, 10, 10, 10, 11,  7,
+  9,  7,  8,  8,  8,  8,  9,  9,
+  9,  9, 10, 10, 10, 10, 10,  7,
+  9,  8,  8,  8,  8,  9,  9,  9,
+  9, 10, 10, 10, 10, 10, 11,  7,
+ 10,  8,  8,  8,  9,  9,  9,  9,
+ 10, 10, 10, 10, 10, 11, 11,  8,
+ 10,  9,  9,  9,  9,  9,  9,  9,
+  9, 10, 10, 10, 10, 11, 11,  8,
+ 10,  9,  9,  9,  9,  9,  9, 10,
+ 10, 10, 10, 10, 11, 11, 11,  8,
+ 11,  9,  9,  9,  9, 10, 10, 10,
+ 10, 10, 10, 11, 11, 11, 11,  8,
+ 11, 10,  9,  9,  9, 10, 10, 10,
+ 10, 10, 10, 11, 11, 11, 11,  8,
+ 11, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 11, 11, 11, 11, 11,  8,
+ 11, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11,  8,
+ 12, 10, 10, 10, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 11, 11,  8,
+  8,  7,  7,  7,  7,  7,  7,  7,
+  7,  7,  7,  8,  8,  8,  8,  4,
+};
+
+const HuffTable mpa_huff_tables[16] = {  /* each entry: { n, code-length array, code array }; the referenced arrays hold n*n entries */
+{ 1, NULL, NULL },  /* no bits/codes arrays for this entry */
+{ 2, mpa_huffbits_1, mpa_huffcodes_1 },
+{ 3, mpa_huffbits_2, mpa_huffcodes_2 },
+{ 3, mpa_huffbits_3, mpa_huffcodes_3 },
+{ 4, mpa_huffbits_5, mpa_huffcodes_5 },  /* array names skip 4: there is no mpa_huffbits_4 in this file */
+{ 4, mpa_huffbits_6, mpa_huffcodes_6 },
+{ 6, mpa_huffbits_7, mpa_huffcodes_7 },
+{ 6, mpa_huffbits_8, mpa_huffcodes_8 },
+{ 6, mpa_huffbits_9, mpa_huffcodes_9 },
+{ 8, mpa_huffbits_10, mpa_huffcodes_10 },
+{ 8, mpa_huffbits_11, mpa_huffcodes_11 },
+{ 8, mpa_huffbits_12, mpa_huffcodes_12 },
+{ 16, mpa_huffbits_13, mpa_huffcodes_13 },
+{ 16, mpa_huffbits_15, mpa_huffcodes_15 },  /* array names skip 14 as well */
+{ 16, mpa_huffbits_16, mpa_huffcodes_16 },
+{ 16, mpa_huffbits_24, mpa_huffcodes_24 },
+};
+
+const UINT8 mpa_huff_data[32][2] = {  /* 32 pairs; the first value stays within 0..15, so it presumably indexes mpa_huff_tables[16] -- TODO confirm at the use site; the meaning of the second value is not visible here */
+{ 0, 0 },
+{ 1, 0 },
+{ 2, 0 },
+{ 3, 0 },
+{ 0, 0 },
+{ 4, 0 },
+{ 5, 0 },
+{ 6, 0 },
+{ 7, 0 },
+{ 8, 0 },
+{ 9, 0 },
+{ 10, 0 },
+{ 11, 0 },
+{ 12, 0 },
+{ 0, 0 },
+{ 13, 0 },
+{ 14, 1 },
+{ 14, 2 },
+{ 14, 3 },
+{ 14, 4 },
+{ 14, 6 },
+{ 14, 8 },
+{ 14, 10 },
+{ 14, 13 },
+{ 15, 4 },
+{ 15, 5 },
+{ 15, 6 },
+{ 15, 7 },
+{ 15, 8 },
+{ 15, 9 },
+{ 15, 11 },
+{ 15, 13 },
+};
+
+
+/* huffman codes for quadruples: two tables of 16 codes each; the per-code lengths are in mpa_quad_bits. Read-only lookup data, hence const like the other tables in this header. */
+static const UINT8 mpa_quad_codes[2][16] = {
+    {  1,  5,  4,  5,  6,  5,  4,  4, 7,  3,  6,  0,  7,  2,  3,  1, },
+    { 15, 14, 13, 12, 11, 10,  9,  8, 7,  6,  5,  4,  3,  2,  1,  0, },
+};
+
+static const UINT8 mpa_quad_bits[2][16] = {  /* code lengths in bits, parallel to mpa_quad_codes (same [table][symbol] layout); read-only lookup data, hence const */
+    { 1, 4, 4, 5, 4, 6, 5, 6, 4, 5, 5, 6, 5, 6, 6, 6, },  /* variable-length table */
+    { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, },  /* every code is 4 bits long */
+};
+
+/* band size tables */
+const UINT8 band_size_long[9][22] = {
+{ 4, 4, 4, 4, 4, 4, 6, 6, 8, 8, 10,
+  12, 16, 20, 24, 28, 34, 42, 50, 54, 76, 158, }, /* 44100 */
+{ 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 10,
+  12, 16, 18, 22, 28, 34, 40, 46, 54, 54, 192, }, /* 48000 */
+{ 4, 4, 4, 4, 4, 4, 6, 6, 8, 10, 12,
+  16, 20, 24, 30, 38, 46, 56, 68, 84, 102, 26, }, /* 32000 */
+{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16,
+  20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 22050 */
+{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16,
+  18, 22, 26, 32, 38, 46, 52, 64, 70, 76, 36, }, /* 24000 */
+{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16,
+  20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 16000 */
+{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16,
+  20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 11025 */
+{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16,
+  20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 12000 */
+{ 12, 12, 12, 12, 12, 12, 16, 20, 24, 28, 32,
+  40, 48, 56, 64, 76, 90, 2, 2, 2, 2, 2, }, /* 8000 */
+};
+
+const UINT8 band_size_short[9][13] = {
+{ 4, 4, 4, 4, 6, 8, 10, 12, 14, 18, 22, 30, 56, }, /* 44100 */
+{ 4, 4, 4, 4, 6, 6, 10, 12, 14, 16, 20, 26, 66, }, /* 48000 */
+{ 4, 4, 4, 4, 6, 8, 12, 16, 20, 26, 34, 42, 12, }, /* 32000 */
+{ 4, 4, 4, 6, 6, 8, 10, 14, 18, 26, 32, 42, 18, }, /* 22050 */
+{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 32, 44, 12, }, /* 24000 */
+{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 16000 */
+{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 11025 */
+{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 12000 */
+{ 8, 8, 8, 12, 16, 20, 24, 28, 36, 2, 2, 2, 26, }, /* 8000 */
+};
+
+const UINT8 mpa_pretab[2][22] = {
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 2, 0 },
+};
+
+/* table for alias reduction (XXX: store it as integer !) */
+const float ci_table[8] = {
+    -0.6, -0.535, -0.33, -0.185, -0.095, -0.041, -0.0142, -0.0037,
+};
diff --git a/src/libffmpeg/libavcodec/mpegaudiotab.h b/src/libffmpeg/libavcodec/mpegaudiotab.h
new file mode 100644
index 000000000..41fb0fdbd
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mpegaudiotab.h
@@ -0,0 +1,92 @@
+/*
+ * mpeg audio layer 2 tables. Most of them come from the mpeg audio
+ * specification.
+ * 
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ *
+ * The licence of this code is contained in file LICENCE found in the
+ * same archive 
+ */
+
+#define SQRT2 1.41421356237309514547
+
+static const int costab32[30] = { /* FIX() of 1/(2*cos(k*pi/N)), odd k; the four groups below use N = 8, 16, 32, 64 */
+    FIX(0.54119610014619701222),
+    FIX(1.3065629648763763537),
+    
+    FIX(0.50979557910415917998),
+    FIX(2.5629154477415054814),
+    FIX(0.89997622313641556513),
+    FIX(0.60134488693504528634),
+    
+    FIX(0.5024192861881556782),
+    FIX(5.1011486186891552563),
+    FIX(0.78815462345125020249),
+    FIX(0.64682178335999007679),
+    FIX(0.56694403481635768927),
+    FIX(1.0606776859903470633),
+    FIX(1.7224470982383341955),
+    FIX(0.52249861493968885462),
+    
+    FIX(10.19000812354803287),
+    FIX(0.674808341455005678),
+    FIX(1.1694399334328846596),
+    FIX(0.53104259108978413284),
+    FIX(2.0577810099534108446),
+    FIX(0.58293496820613388554),
+    FIX(0.83934964541552681272),
+    FIX(0.50547095989754364798),
+    FIX(3.4076084184687189804),
+    FIX(0.62250412303566482475),
+    FIX(0.97256823786196078263),
+    FIX(0.51544730992262455249),
+    FIX(1.4841646163141661852),
+    FIX(0.5531038960344445421),
+    FIX(0.74453627100229857749),
+    FIX(0.5006029982351962726),
+};
+
+static const int bitinv32[32] = { /* bit-reversal permutation: bitinv32[i] is i with its 5 bits reversed */
+    0,  16,  8, 24,  4,  20,  12,  28,
+    2,  18, 10, 26,  6,  22,  14,  30,
+    1,  17,  9, 25,  5,  21,  13,  29,
+    3,  19, 11, 27,  7,  23,  15,  31
+};
+
+
+static INT16 filter_bank[512];
+
+static int scale_factor_table[64];
+#ifdef USE_FLOATS
+static float scale_factor_inv_table[64];
+#else
+static INT8 scale_factor_shift[64];
+static unsigned short scale_factor_mult[64];
+#endif
+static unsigned char scale_diff_table[128];
+
+/* total number of bits per allocation group */
+static unsigned short total_quant_bits[17];
+
+/* signal-to-noise ratio of each quantization step (could be
+   computed from quant_steps[]). The values are in dB multiplied by 10
+*/
+static unsigned short quant_snr[17] = { 
+     70, 110, 160, 208,
+    253, 316, 378, 439,
+    499, 559, 620, 680, 
+    740, 800, 861, 920, 
+    980
+};
+
+/* fixed psychoacoustic model. Values of SNR taken from the 'toolame'
+   project */
+static const float fixed_smr[SBLIMIT] =  {
+    30, 17, 16, 10, 3, 12, 8, 2.5,
+    5, 5, 6, 6, 5, 6, 10, 6,
+    -4, -10, -21, -30, -42, -55, -68, -75,
+    -75, -75, -75, -75, -91, -107, -110, -108
+};
+
+static const unsigned char nb_scale_factors[4] = { 3, 2, 1, 2 };
+
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index c1a0b0b55..8206df470 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -41,10 +41,9 @@ static void dct_unquantize_h263_c(MpegEncContext *s,
                                   DCTELEM *block, int n, int qscale);
 static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w);
 static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
+static int dct_quantize_trellis_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
 
 void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edges_c;
-static void emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, 
-                                    int src_x, int src_y, int w, int h);
 
 
 /* enable all paranoid tests for rounding, overflows, etc... */
@@ -64,8 +63,8 @@ static const uint16_t aanscales[64] = {
     19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
     16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
     12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
-    8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
-    4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
+    8867 , 12299, 11585, 10426,  8867,  6967,  4799,  2446,
+    4520 ,  6270,  5906,  5315,  4520,  3552,  2446,  1247
 };
 
 /* Input permutation for the simple_idct_mmx */
@@ -87,9 +86,6 @@ static const uint8_t h263_chroma_roundtab[16] = {
 static UINT16 (*default_mv_penalty)[MAX_MV*2+1]=NULL;
 static UINT8 default_fcode_tab[MAX_MV*2+1];
 
-/* default motion estimation */
-int motion_estimation_method = ME_EPZS;
-
 static void convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64],
                            const UINT16 *quant_matrix, int bias, int qmin, int qmax)
 {
@@ -127,7 +123,8 @@ static void convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16
                    so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905
                    so 32768        >= (1<<19) / (qscale * quant_matrix[i]) >= 67
                 */
-                qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
+                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / (qscale * quant_matrix[j]));
+//                qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
                 qmat16[qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[j]);
 
                 if(qmat16[qscale][i]==0 || qmat16[qscale][i]==128*256) qmat16[qscale][i]=128*256-1;
@@ -233,6 +230,10 @@ int DCT_common_init(MpegEncContext *s)
     MPV_common_init_ppc(s);
 #endif
 
+    if(s->flags&CODEC_FLAG_TRELLIS_QUANT){
+        s->dct_quantize= dct_quantize_trellis_c; //move before MPV_common_init_*
+    }
+
     switch(s->idct_permutation_type){
     case FF_NO_IDCT_PERM:
         for(i=0; i<64; i++)
@@ -268,40 +269,87 @@ int DCT_common_init(MpegEncContext *s)
 }
 
 /**
- * allocates various arrays for a Picture structure, except the pixels themself.
- * The pixels are allocated/set in te get_buffer()
+ * allocates a Picture
+ * The pixels are allocated/set by calling get_buffer() if shared=0
  */
-static int alloc_picture(MpegEncContext *s, Picture *pic){
-    if (s->encoding) {        
-        CHECKED_ALLOCZ(pic->mb_var   , s->mb_num * sizeof(INT16))
-        CHECKED_ALLOCZ(pic->mc_mb_var, s->mb_num * sizeof(INT16))
-        CHECKED_ALLOCZ(pic->mb_mean  , s->mb_num * sizeof(INT8))
+static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){
+    
+    if(shared){
+        assert(pic->data[0]);
+        assert(pic->type == 0 || pic->type == FF_BUFFER_TYPE_SHARED);
+        pic->type= FF_BUFFER_TYPE_SHARED;
+    }else{
+        int r;
+        
+        assert(!pic->data[0]);
+        
+        r= s->avctx->get_buffer(s->avctx, (AVFrame*)pic);
+        
+        if(r<0 || !pic->age || !pic->type || !pic->data[0]){
+            fprintf(stderr, "get_buffer() failed (%d %d %d %X)\n", r, pic->age, pic->type, (int)pic->data[0]);
+            return -1;
+        }
+
+        if(s->linesize && (s->linesize != pic->linesize[0] || s->uvlinesize != pic->linesize[1])){
+            fprintf(stderr, "get_buffer() failed (stride changed)\n");
+            return -1;
+        }
+
+        if(pic->linesize[1] != pic->linesize[2]){
+            fprintf(stderr, "get_buffer() failed (uv stride missmatch)\n");
+            return -1;
+        }
+
+        s->linesize  = pic->linesize[0];
+        s->uvlinesize= pic->linesize[1];
     }
+    
+    if(pic->qscale_table==NULL){
+        if (s->encoding) {        
+            CHECKED_ALLOCZ(pic->mb_var   , s->mb_num * sizeof(INT16))
+            CHECKED_ALLOCZ(pic->mc_mb_var, s->mb_num * sizeof(INT16))
+            CHECKED_ALLOCZ(pic->mb_mean  , s->mb_num * sizeof(INT8))
+        }
 
-    CHECKED_ALLOCZ(pic->mbskip_table , s->mb_num * sizeof(UINT8)+1) //the +1 is for the slice end check
-    CHECKED_ALLOCZ(pic->qscale_table , s->mb_num * sizeof(UINT8))
-    pic->qstride= s->mb_width;
+        CHECKED_ALLOCZ(pic->mbskip_table , s->mb_num * sizeof(UINT8)+1) //the +1 is for the slice end check
+        CHECKED_ALLOCZ(pic->qscale_table , s->mb_num * sizeof(UINT8))
+        pic->qstride= s->mb_width;
+    }
     
     return 0;
 fail: //for the CHECKED_ALLOCZ macro
     return -1;
 }
 
+/**
+ * deallocates a picture
+ */
 static void free_picture(MpegEncContext *s, Picture *pic){
     int i;
-    
+
+    if(pic->data[0] && pic->type!=FF_BUFFER_TYPE_SHARED){
+        s->avctx->release_buffer(s->avctx, (AVFrame*)pic);
+    }
+
     av_freep(&pic->mb_var);
     av_freep(&pic->mc_mb_var);
     av_freep(&pic->mb_mean);
     av_freep(&pic->mbskip_table);
     av_freep(&pic->qscale_table);
     
-    if(s->avctx->get_buffer == avcodec_default_get_buffer){
+    if(pic->type == FF_BUFFER_TYPE_INTERNAL){
         for(i=0; i<4; i++){
             av_freep(&pic->base[i]);
             pic->data[i]= NULL;
         }
         av_freep(&pic->opaque);
+        pic->type= 0;
+    }else if(pic->type == FF_BUFFER_TYPE_SHARED){
+        for(i=0; i<4; i++){
+            pic->base[i]=
+            pic->data[i]= NULL;
+        }
+        pic->type= 0;        
     }
 }
 
@@ -334,9 +382,10 @@ int MPV_common_init(MpegEncContext *s)
                      + (toupper((s->avctx->fourcc>>16)&0xFF)<<16) 
                      + (toupper((s->avctx->fourcc>>24)&0xFF)<<24);
 
-    CHECKED_ALLOCZ(s->edge_emu_buffer, (s->width+64)*2*17*2); //(width + edge + align)*interlaced*MBsize*tolerance
+    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*17*2); //(width + edge + align)*interlaced*MBsize*tolerance
+    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*17;
 
-    s->avctx->coded_picture= (AVVideoFrame*)&s->current_picture;
+    s->avctx->coded_frame= (AVFrame*)&s->current_picture;
 
     if (s->encoding) {
         int mv_table_size= (s->mb_width+2)*(s->mb_height+2);
@@ -347,15 +396,13 @@ int MPV_common_init(MpegEncContext *s)
         CHECKED_ALLOCZ(s->b_back_mv_table       , mv_table_size * 2 * sizeof(INT16))
         CHECKED_ALLOCZ(s->b_bidir_forw_mv_table , mv_table_size * 2 * sizeof(INT16))
         CHECKED_ALLOCZ(s->b_bidir_back_mv_table , mv_table_size * 2 * sizeof(INT16))
-        CHECKED_ALLOCZ(s->b_direct_forw_mv_table, mv_table_size * 2 * sizeof(INT16))
-        CHECKED_ALLOCZ(s->b_direct_back_mv_table, mv_table_size * 2 * sizeof(INT16))
         CHECKED_ALLOCZ(s->b_direct_mv_table     , mv_table_size * 2 * sizeof(INT16))
 
         //FIXME should be linesize instead of s->width*2 but that isnt known before get_buffer()
-        CHECKED_ALLOCZ(s->me_scratchpad,  s->width*2*16*3*sizeof(uint8_t)) 
+        CHECKED_ALLOCZ(s->me.scratchpad,  s->width*2*16*3*sizeof(uint8_t)) 
         
-        CHECKED_ALLOCZ(s->me_map      , ME_MAP_SIZE*sizeof(uint32_t))
-        CHECKED_ALLOCZ(s->me_score_map, ME_MAP_SIZE*sizeof(uint16_t))
+        CHECKED_ALLOCZ(s->me.map      , ME_MAP_SIZE*sizeof(uint32_t))
+        CHECKED_ALLOCZ(s->me.score_map, ME_MAP_SIZE*sizeof(uint32_t))
 
         if(s->codec_id==CODEC_ID_MPEG4){
             CHECKED_ALLOCZ(s->tex_pb_buffer, PB_BUFFER_SIZE);
@@ -445,20 +492,12 @@ void MPV_common_end(MpegEncContext *s)
 {
     int i;
 
-    for(i=0; i<MAX_PICTURE_COUNT; i++){
-        if(s->picture[i].data[0]){
-            s->avctx->release_buffer(s->avctx, (AVVideoFrame*)&s->picture[i]);
-        }
-    }
-    
     av_freep(&s->mb_type);
     av_freep(&s->p_mv_table);
     av_freep(&s->b_forw_mv_table);
     av_freep(&s->b_back_mv_table);
     av_freep(&s->b_bidir_forw_mv_table);
     av_freep(&s->b_bidir_back_mv_table);
-    av_freep(&s->b_direct_forw_mv_table);
-    av_freep(&s->b_direct_back_mv_table);
     av_freep(&s->b_direct_mv_table);
     av_freep(&s->motion_val);
     av_freep(&s->dc_val[0]);
@@ -467,15 +506,15 @@ void MPV_common_end(MpegEncContext *s)
     av_freep(&s->mbintra_table);
     av_freep(&s->cbp_table);
     av_freep(&s->pred_dir_table);
-    av_freep(&s->me_scratchpad);
-    av_freep(&s->me_map);
-    av_freep(&s->me_score_map);
+    av_freep(&s->me.scratchpad);
+    av_freep(&s->me.map);
+    av_freep(&s->me.score_map);
     
     av_freep(&s->mbskip_table);
     av_freep(&s->bitstream_buffer);
     av_freep(&s->tex_pb_buffer);
     av_freep(&s->pb2_buffer);
-    av_freep(&s->edge_emu_buffer);
+    av_freep(&s->allocated_edge_emu_buffer); s->edge_emu_buffer= NULL;
     av_freep(&s->co_located_type_table);
     av_freep(&s->field_mv_table);
     av_freep(&s->field_select_table);
@@ -525,6 +564,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->chroma_elim_threshold= avctx->chroma_elim_threshold;
     s->strict_std_compliance= avctx->strict_std_compliance;
     s->data_partitioning= avctx->flags & CODEC_FLAG_PART;
+    s->quarter_sample= (avctx->flags & CODEC_FLAG_QPEL)!=0;
     s->mpeg_quant= avctx->mpeg_quant;
 
     if (s->gop_size <= 1) {
@@ -534,12 +574,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->intra_only = 0;
     }
 
-    /* ME algorithm */
-    if (avctx->me_method == 0)
-        /* For compatibility */
-        s->me_method = motion_estimation_method;
-    else
-        s->me_method = avctx->me_method;
+    s->me_method = avctx->me_method;
 
     /* Fixed QSCALE */
     s->fixed_qscale = (avctx->flags & CODEC_FLAG_QSCALE);
@@ -572,6 +607,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         if (mjpeg_init(s) < 0)
             return -1;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_H263:
         if (h263_get_picture_format(s->width, s->height) == 7) {
@@ -580,6 +616,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         }
         s->out_format = FMT_H263;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_H263P:
         s->out_format = FMT_H263;
@@ -591,18 +628,19 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->umvplus = 0;
         s->umvplus_dec = 0;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_RV10:
         s->out_format = FMT_H263;
         s->h263_rv10 = 1;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_MPEG4:
         s->out_format = FMT_H263;
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
-        s->has_b_frames= s->max_b_frames ? 1 : 0;
-        s->low_delay= !s->has_b_frames;
+        s->low_delay= s->max_b_frames ? 0 : 1;
         avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1);
         break;
     case CODEC_ID_MSMPEG4V1:
@@ -612,6 +650,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 1;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_MSMPEG4V2:
         s->out_format = FMT_H263;
@@ -620,6 +659,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 2;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_MSMPEG4V3:
         s->out_format = FMT_H263;
@@ -628,6 +668,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 3;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_WMV1:
         s->out_format = FMT_H263;
@@ -636,6 +677,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 4;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     case CODEC_ID_WMV2:
         s->out_format = FMT_H263;
@@ -644,6 +686,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 5;
         avctx->delay=0;
+        s->low_delay=1;
         break;
     default:
         return -1;
@@ -664,13 +707,14 @@ int MPV_encode_init(AVCodecContext *avctx)
             }
         }
     }
-    s->mv_penalty= default_mv_penalty;
+    s->me.mv_penalty= default_mv_penalty;
     s->fcode_tab= default_fcode_tab;
     s->y_dc_scale_table=
     s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
  
     /* dont use mv_penalty table for crap MV as it would be confused */
-    if (s->me_method < ME_EPZS) s->mv_penalty = default_mv_penalty;
+    //FIXME remove after fixing / removing old ME
+    if (s->me_method < ME_EPZS) s->me.mv_penalty = default_mv_penalty;
 
     s->encoding = 1;
 
@@ -678,6 +722,8 @@ int MPV_encode_init(AVCodecContext *avctx)
     if (MPV_common_init(s) < 0)
         return -1;
     
+    ff_init_me(s);
+
 #ifdef CONFIG_ENCODERS
     if (s->out_format == FMT_H263)
         h263_encode_init(s);
@@ -770,21 +816,41 @@ static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w)
     }
 }
 
+/* Returns the index of a free entry (data[0]==NULL) in s->picture[]; asserts that one exists. */
+static int find_unused_picture(MpegEncContext *s, int shared){
+    int i;
+    if(shared){ /* shared pictures only take a slot that has no buffer type attached */
+        for(i=0; i<MAX_PICTURE_COUNT; i++){
+            if(s->picture[i].data[0]==NULL && s->picture[i].type==0) break;
+        }
+    }else{
+        for(i=0; i<MAX_PICTURE_COUNT; i++){ /* first choice: empty slot that still has a buffer type */
+            if(s->picture[i].data[0]==NULL && s->picture[i].type!=0) return i; /* was "break": result got overwritten by the loop below */
+        }
+        for(i=0; i<MAX_PICTURE_COUNT; i++){ /* fallback: any empty slot */
+            if(s->picture[i].data[0]==NULL) break;
+        }
+    }
+
+    assert(i<MAX_PICTURE_COUNT);
+    return i;
+}
+
 /* generic function for encode/decode called before a frame is coded/decoded */
 int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
 {
-    int i, r;
-    AVVideoFrame *pic;
+    int i;
+    AVFrame *pic;
 
     s->mb_skiped = 0;
     
     /* mark&release old frames */
     if (s->pict_type != B_TYPE && s->last_picture.data[0]) {
-        Picture *pic= NULL;
         for(i=0; i<MAX_PICTURE_COUNT; i++){
+//printf("%8X %d %d %X %X\n", s->picture[i].data[0], s->picture[i].type, i, s->next_picture.data[0], s->last_picture.data[0]);
             if(s->picture[i].data[0] == s->last_picture.data[0]){
 //                s->picture[i].reference=0;
-                avctx->release_buffer(avctx, (AVVideoFrame*)&s->picture[i]);
+                avctx->release_buffer(avctx, (AVFrame*)&s->picture[i]);
                 break;
             }    
         }
@@ -796,35 +862,20 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
             for(i=0; i<MAX_PICTURE_COUNT; i++){
                 if(s->picture[i].data[0] && s->picture[i].data[0] != s->next_picture.data[0] && s->picture[i].reference){
                     fprintf(stderr, "releasing zombie picture\n");
-                    avctx->release_buffer(avctx, (AVVideoFrame*)&s->picture[i]);                
+                    avctx->release_buffer(avctx, (AVFrame*)&s->picture[i]);                
                 }
             }
         }
     }
 alloc:
     if(!s->encoding){
-        /* find unused Picture */
-        for(i=0; i<MAX_PICTURE_COUNT; i++){
-            if(s->picture[i].data[0]==NULL) break;
-        }
-        assert(i<MAX_PICTURE_COUNT);
+        i= find_unused_picture(s, 0);
     
-        pic= (AVVideoFrame*)&s->picture[i];
+        pic= (AVFrame*)&s->picture[i];
         pic->reference= s->pict_type != B_TYPE;
         pic->coded_picture_number= s->current_picture.coded_picture_number+1;
         
-        r= avctx->get_buffer(avctx, pic);
-    
-        if(r<0 || (s->linesize && (s->linesize != pic->linesize[0] || s->uvlinesize != pic->linesize[1]))){
-            fprintf(stderr, "get_buffer() failed (stride changed), bye bye\n");
-            return -1;
-        }
-
-        s->linesize  = pic->linesize[0];
-        s->uvlinesize= pic->linesize[1];
-    
-        if(pic->qscale_table==NULL)
-            alloc_picture(s, (Picture*)pic);
+        alloc_picture(s, (Picture*)pic, 0);
 
         s->current_picture= s->picture[i];
     }
@@ -860,7 +911,6 @@ alloc:
 void MPV_frame_end(MpegEncContext *s)
 {
     int i;
-
     /* draw edge for correct motion prediction if outside */
     if(s->codec_id!=CODEC_ID_SVQ1){
         if (s->pict_type != B_TYPE && !s->intra_only && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
@@ -874,8 +924,6 @@ void MPV_frame_end(MpegEncContext *s)
     s->last_pict_type    = s->pict_type;
     if(s->pict_type!=B_TYPE){
         s->last_non_b_pict_type= s->pict_type;
-        s->num_available_buffers++;
-        if(s->num_available_buffers>2) s->num_available_buffers= 2;
     }
     
     s->current_picture.quality= s->qscale; //FIXME get average of qscale_table
@@ -893,86 +941,132 @@ void MPV_frame_end(MpegEncContext *s)
 
     /* release non refernce frames */
     for(i=0; i<MAX_PICTURE_COUNT; i++){
-        if(s->picture[i].data[0] && !s->picture[i].reference)
-            s->avctx->release_buffer(s->avctx, (AVVideoFrame*)&s->picture[i]);
+        if(s->picture[i].data[0] && !s->picture[i].reference /*&& s->picture[i].type!=FF_BUFFER_TYPE_SHARED*/)
+            s->avctx->release_buffer(s->avctx, (AVFrame*)&s->picture[i]);
+    }
+    if(s->avctx->debug&FF_DEBUG_SKIP){
+        int x,y;        
+        for(y=0; y<s->mb_height; y++){
+            for(x=0; x<s->mb_width; x++){
+                int count= s->mbskip_table[x + y*s->mb_width];
+                if(count>9) count=9;
+                printf(" %1d", count);
+            }
+            printf("\n");
+        }
+        printf("pict type: %d\n", s->pict_type);
     }
 }
 
-static int load_input_picture(MpegEncContext *s, AVVideoFrame *pic_arg){
-    AVVideoFrame *pic;
-    int i,r;
-    const int encoding_delay= s->max_b_frames;
-
-    /* find unused Picture */
-    for(i=0; i<MAX_PICTURE_COUNT; i++){
-        if(s->picture[i].data[0]==NULL) break;
+static int get_sae(uint8_t *src, int ref, int stride){
+    int x,y;
+    int acc=0;
+    
+    for(y=0; y<16; y++){
+        for(x=0; x<16; x++){
+            acc+= ABS(src[x+y*stride] - ref);
+        }
     }
-    assert(i<MAX_PICTURE_COUNT);
-        
-    pic= (AVVideoFrame*)&s->picture[i];
-    pic->reference= 1;
     
-//    assert(avctx->get_buffer == default_get_buffer || avctx->get_buffer==NULL);
-    r= s->avctx->get_buffer(s->avctx, pic);
+    return acc;
+}
 
-    if(r<0 || (s->linesize && (s->linesize != pic->linesize[0] || s->uvlinesize != pic->linesize[1]))){
-        fprintf(stderr, "get_buffer() failed (stride changed), bye bye\n");
-        return -1;
-    }
+static int get_intra_count(MpegEncContext *s, uint8_t *src, uint8_t *ref, int stride){
+    int x, y, w, h;
+    int acc=0;
     
-    assert(s->linesize==0   || s->linesize  ==pic->linesize[0]);
-    assert(s->uvlinesize==0 || s->uvlinesize==pic->linesize[1]);
-    assert(pic->linesize[1] == pic->linesize[2]);
-    s->linesize  = pic->linesize[0];
-    s->uvlinesize= pic->linesize[1];
+    w= s->width &~15;
+    h= s->height&~15;
     
-    if(pic->qscale_table==NULL)
-        alloc_picture(s, (Picture*)pic);
+    for(y=0; y<h; y+=16){
+        for(x=0; x<w; x+=16){
+            int offset= x + y*stride;
+            int sad = s->dsp.pix_abs16x16(src + offset, ref + offset, stride);
+            int mean= (s->dsp.pix_sum(src + offset, stride) + 128)>>8;
+            int sae = get_sae(src + offset, mean, stride);
+            
+            acc+= sae + 500 < sad;
+        }
+    }
+    return acc;
+}
+
 
-//    assert(s->input_picture[0]==NULL || s->input_picture[0]->data[0]==NULL);
+static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
+    AVFrame *pic;
+    int i;
+    const int encoding_delay= s->max_b_frames;
+    int direct=1;
+
+    if(encoding_delay && !(s->flags&CODEC_FLAG_INPUT_PRESERVED)) direct=0;
+    if(pic_arg->linesize[0] != s->linesize) direct=0;
+    if(pic_arg->linesize[1] != s->uvlinesize) direct=0;
+    if(pic_arg->linesize[2] != s->uvlinesize) direct=0;
+  
+//    printf("%d %d %d %d\n",pic_arg->linesize[0], pic_arg->linesize[1], s->linesize, s->uvlinesize);
     
-    if(s->input_picture[encoding_delay])
-        pic->display_picture_number= s->input_picture[encoding_delay]->display_picture_number + 1;
-//printf("dpn2:%d\n", pic->display_picture_number);
+    if(direct){
+        i= find_unused_picture(s, 1);
 
-    /* shift buffer entries */
-    for(i=1; i<MAX_PICTURE_COUNT /*s->encoding_delay+1*/; i++)
-        s->input_picture[i-1]= s->input_picture[i];
-        
-    s->input_picture[encoding_delay]= (Picture*)pic;
-    pic->pict_type= pic_arg->pict_type;
-    pic->quality= pic_arg->quality;
+        pic= (AVFrame*)&s->picture[i];
+        pic->reference= 1;
     
-    if(   pic->data[0] == pic_arg->data[0] 
-       && pic->data[1] == pic_arg->data[1]
-       && pic->data[2] == pic_arg->data[2]){
-       // empty
+        for(i=0; i<4; i++){
+            pic->data[i]= pic_arg->data[i];
+            pic->linesize[i]= pic_arg->linesize[i];
+        }
+        alloc_picture(s, (Picture*)pic, 1);
     }else{
-        int h_chroma_shift, v_chroma_shift;
+        i= find_unused_picture(s, 0);
+
+        pic= (AVFrame*)&s->picture[i];
+        pic->reference= 1;
+
+        alloc_picture(s, (Picture*)pic, 0);
+
+        if(   pic->data[0] == pic_arg->data[0] 
+           && pic->data[1] == pic_arg->data[1]
+           && pic->data[2] == pic_arg->data[2]){
+       // empty
+        }else{
+            int h_chroma_shift, v_chroma_shift;
         
-        avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
+            avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
         
-        for(i=0; i<3; i++){
-            int src_stride= pic_arg->linesize[i];
-            int dst_stride= i ? s->uvlinesize : s->linesize;
-            int h_shift= i ? h_chroma_shift : 0;
-            int v_shift= i ? v_chroma_shift : 0;
-            int w= s->width >>h_shift;
-            int h= s->height>>v_shift;
-            uint8_t *src= pic_arg->data[i];
-            uint8_t *dst= pic->data[i] + 16;
+            for(i=0; i<3; i++){
+                int src_stride= pic_arg->linesize[i];
+                int dst_stride= i ? s->uvlinesize : s->linesize;
+                int h_shift= i ? h_chroma_shift : 0;
+                int v_shift= i ? v_chroma_shift : 0;
+                int w= s->width >>h_shift;
+                int h= s->height>>v_shift;
+                uint8_t *src= pic_arg->data[i];
+                uint8_t *dst= pic->data[i];
             
-            if(src_stride==dst_stride)
-                memcpy(dst, src, src_stride*h);
-            else{
-                while(h--){
-                    memcpy(dst, src, w);
-                    dst += dst_stride;
-                    src += src_stride;
+                if(src_stride==dst_stride)
+                    memcpy(dst, src, src_stride*h);
+                else{
+                    while(h--){
+                        memcpy(dst, src, w);
+                        dst += dst_stride;
+                        src += src_stride;
+                    }
                 }
             }
         }
     }
+    pic->quality= pic_arg->quality;
+    pic->pict_type= pic_arg->pict_type;
+    pic->pts = pic_arg->pts;
+    
+    if(s->input_picture[encoding_delay])
+        pic->display_picture_number= s->input_picture[encoding_delay]->display_picture_number + 1;
+
+    /* shift buffer entries */
+    for(i=1; i<MAX_PICTURE_COUNT /*s->encoding_delay+1*/; i++)
+        s->input_picture[i-1]= s->input_picture[i];
+        
+    s->input_picture[encoding_delay]= (Picture*)pic;
 
     return 0;
 }
@@ -984,70 +1078,119 @@ static void select_input_picture(MpegEncContext *s){
 
     if(s->reordered_input_picture[0])
         coded_pic_num= s->reordered_input_picture[0]->coded_picture_number + 1;
-//printf("cpn:%d\n", coded_pic_num);
+
     for(i=1; i<MAX_PICTURE_COUNT; i++)
         s->reordered_input_picture[i-1]= s->reordered_input_picture[i];
     s->reordered_input_picture[MAX_PICTURE_COUNT-1]= NULL;
 
     /* set next picture types & ordering */
     if(s->reordered_input_picture[0]==NULL && s->input_picture[0]){
-        if(s->input_picture[0]->pict_type){
-            /* user selected pict_type */
-            if(s->input_picture[0]->pict_type == I_TYPE){
-                s->reordered_input_picture[0]= s->input_picture[0];
-                s->reordered_input_picture[0]->coded_picture_number= coded_pic_num;
-            }else{
-                int b_frames;
+        if(/*s->picture_in_gop_number >= s->gop_size ||*/ s->next_picture.data[0]==NULL || s->intra_only){
+            s->reordered_input_picture[0]= s->input_picture[0];
+            s->reordered_input_picture[0]->pict_type= I_TYPE;
+            s->reordered_input_picture[0]->coded_picture_number= coded_pic_num;
+        }else{
+            int b_frames;
+            
+            if(s->flags&CODEC_FLAG_PASS2){
+                for(i=0; i<s->max_b_frames+1; i++){
+                    int pict_num= s->input_picture[0]->display_picture_number + i;
+                    int pict_type= s->rc_context.entry[pict_num].new_pict_type;
+                    s->input_picture[i]->pict_type= pict_type;
+                    
+                    if(i + 1 >= s->rc_context.num_entries) break;
+                }
+            }
 
+            if(s->input_picture[0]->pict_type){
+                /* user selected pict_type */
                 for(b_frames=0; b_frames<s->max_b_frames+1; b_frames++){
                     if(s->input_picture[b_frames]->pict_type!=B_TYPE) break;
                 }
-                
+            
                 if(b_frames > s->max_b_frames){
                     fprintf(stderr, "warning, too many bframes in a row\n");
                     b_frames = s->max_b_frames;
-                    s->input_picture[b_frames]->pict_type= I_TYPE;
                 }
+            }else if(s->b_frame_strategy==0){
+                b_frames= s->max_b_frames;
+            }else if(s->b_frame_strategy==1){
+                for(i=1; i<s->max_b_frames+1; i++){
+                    if(s->input_picture[i]->b_frame_score==0){
+                        s->input_picture[i]->b_frame_score= 
+                            get_intra_count(s, s->input_picture[i  ]->data[0], 
+                                               s->input_picture[i-1]->data[0], s->linesize) + 1;
+                    }
+                }
+                for(i=0; i<s->max_b_frames; i++){
+                    if(s->input_picture[i]->b_frame_score - 1 > s->mb_num/40) break;
+                }
+                                
+                b_frames= FFMAX(0, i-1);
                 
-                s->reordered_input_picture[0]= s->input_picture[b_frames];
-                s->reordered_input_picture[0]->coded_picture_number= coded_pic_num;
-                for(i=0; i<b_frames; i++){
-                    coded_pic_num++;
-                    s->reordered_input_picture[i+1]= s->input_picture[i];
-                    s->reordered_input_picture[i+1]->coded_picture_number= coded_pic_num;
-                }    
+                /* reset scores */
+                for(i=0; i<b_frames+1; i++){
+                    s->input_picture[i]->b_frame_score=0;
+                }
+            }else{
+                fprintf(stderr, "illegal b frame strategy\n");
+                b_frames=0;
             }
-        }else{
-            if(/*s->picture_in_gop_number >= s->gop_size ||*/ s->next_picture.data[0]==NULL || s->intra_only){
-                s->reordered_input_picture[0]= s->input_picture[0];
+
+            emms_c();
+//static int b_count=0;
+//b_count+= b_frames;
+//printf("b_frames: %d\n", b_count);
+                        
+            s->reordered_input_picture[0]= s->input_picture[b_frames];
+            if(   s->picture_in_gop_number + b_frames >= s->gop_size 
+               || s->reordered_input_picture[0]->pict_type== I_TYPE)
                 s->reordered_input_picture[0]->pict_type= I_TYPE;
-                s->reordered_input_picture[0]->coded_picture_number= coded_pic_num;
-            }else{
-                s->reordered_input_picture[0]= s->input_picture[s->max_b_frames];
-                if(s->picture_in_gop_number + s->max_b_frames >= s->gop_size)
-                    s->reordered_input_picture[0]->pict_type= I_TYPE;
-                else
-                    s->reordered_input_picture[0]->pict_type= P_TYPE;
-                s->reordered_input_picture[0]->coded_picture_number= coded_pic_num;
-                for(i=0; i<s->max_b_frames; i++){
-                    coded_pic_num++;
-                    s->reordered_input_picture[i+1]= s->input_picture[i];
-                    s->reordered_input_picture[i+1]->pict_type= B_TYPE;
-                    s->reordered_input_picture[i+1]->coded_picture_number= coded_pic_num;
-                }    
+            else
+                s->reordered_input_picture[0]->pict_type= P_TYPE;
+            s->reordered_input_picture[0]->coded_picture_number= coded_pic_num;
+            for(i=0; i<b_frames; i++){
+                coded_pic_num++;
+                s->reordered_input_picture[i+1]= s->input_picture[i];
+                s->reordered_input_picture[i+1]->pict_type= B_TYPE;
+                s->reordered_input_picture[i+1]->coded_picture_number= coded_pic_num;
             }
         }
     }
     
     if(s->reordered_input_picture[0]){
-        if(s->reordered_input_picture[0]->pict_type==B_TYPE){
-            s->reordered_input_picture[0]->reference=0;
+       s->reordered_input_picture[0]->reference= s->reordered_input_picture[0]->pict_type!=B_TYPE;
+
+        if(s->reordered_input_picture[0]->type == FF_BUFFER_TYPE_SHARED){
+            int i= find_unused_picture(s, 0);
+            Picture *pic= &s->picture[i];
+
+            s->new_picture= *s->reordered_input_picture[0];
+
+            /* mark us unused / free shared pic */
+            for(i=0; i<4; i++)
+                s->reordered_input_picture[0]->data[i]= NULL;
+            s->reordered_input_picture[0]->type= 0;
+            
+            pic->pict_type = s->reordered_input_picture[0]->pict_type;
+            pic->quality   = s->reordered_input_picture[0]->quality;
+            pic->coded_picture_number = s->reordered_input_picture[0]->coded_picture_number;
+            pic->reference = s->reordered_input_picture[0]->reference;
+            
+            alloc_picture(s, pic, 0);
+
+            s->current_picture= *pic;
+        }else{
+            assert(   s->reordered_input_picture[0]->type==FF_BUFFER_TYPE_USER 
+                   || s->reordered_input_picture[0]->type==FF_BUFFER_TYPE_INTERNAL);
+            
+            s->new_picture= *s->reordered_input_picture[0];
+
+            for(i=0; i<4; i++){
+                s->reordered_input_picture[0]->data[i]-=16; //FIXME dirty
+            }
+            s->current_picture= *s->reordered_input_picture[0];
         }
-        s->current_picture= *s->reordered_input_picture[0];
-        s->new_picture= s->current_picture;
-        s->new_picture.data[0]+=16;
-        s->new_picture.data[1]+=16;
-        s->new_picture.data[2]+=16;
     
         s->picture_number= s->new_picture.display_picture_number;
 //printf("dpn:%d\n", s->picture_number);
@@ -1060,7 +1203,7 @@ int MPV_encode_picture(AVCodecContext *avctx,
                        unsigned char *buf, int buf_size, void *data)
 {
     MpegEncContext *s = avctx->priv_data;
-    AVVideoFrame *pic_arg = data;
+    AVFrame *pic_arg = data;
     int i;
 
     init_put_bits(&s->pb, buf, buf_size, NULL, NULL);
@@ -1151,7 +1294,7 @@ static inline void gmc1_motion(MpegEncContext *s,
     if(s->flags&CODEC_FLAG_EMU_EDGE){
         if(src_x<0 || src_y<0 || src_x + (motion_x&15) + 16 > s->h_edge_pos
                               || src_y + (motion_y&15) + 16 > s->v_edge_pos){
-            emulated_edge_mc(s, ptr, linesize, 17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
+            ff_emulated_edge_mc(s, ptr, linesize, 17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
             ptr= s->edge_emu_buffer;
             emu=1;
         }
@@ -1189,14 +1332,14 @@ static inline void gmc1_motion(MpegEncContext *s,
     offset = (src_y * uvlinesize) + src_x + (src_offset>>1);
     ptr = ref_picture[1] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
+        ff_emulated_edge_mc(s, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
         ptr= s->edge_emu_buffer;
     }
     s->dsp.gmc1(dest_cb + (dest_offset>>1), ptr, uvlinesize, 8, motion_x&15, motion_y&15, 128 - s->no_rounding);
     
     ptr = ref_picture[2] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
+        ff_emulated_edge_mc(s, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
         ptr= s->edge_emu_buffer;
     }
     s->dsp.gmc1(dest_cr + (dest_offset>>1), ptr, uvlinesize, 8, motion_x&15, motion_y&15, 128 - s->no_rounding);
@@ -1268,7 +1411,7 @@ static inline void gmc_motion(MpegEncContext *s,
 }
 
 
-static void emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, 
+void ff_emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, 
                                     int src_x, int src_y, int w, int h){
     int x, y;
     int start_y, start_x, end_y, end_x;
@@ -1368,7 +1511,7 @@ if(s->quarter_sample)
     if(s->flags&CODEC_FLAG_EMU_EDGE){
         if(src_x<0 || src_y<0 || src_x + (motion_x&1) + 16 > s->h_edge_pos
                               || src_y + (motion_y&1) + h  > v_edge_pos){
-            emulated_edge_mc(s, ptr - src_offset, s->linesize, 17, 17+field_based, 
+            ff_emulated_edge_mc(s, ptr - src_offset, s->linesize, 17, 17+field_based, 
                              src_x, src_y<<field_based, s->h_edge_pos, s->v_edge_pos);
             ptr= s->edge_emu_buffer + src_offset;
             emu=1;
@@ -1405,7 +1548,7 @@ if(s->quarter_sample)
     offset = (src_y * uvlinesize) + src_x + (src_offset >> 1);
     ptr = ref_picture[1] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9+field_based, 
+        ff_emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9+field_based, 
                          src_x, src_y<<field_based, s->h_edge_pos>>1, s->v_edge_pos>>1);
         ptr= s->edge_emu_buffer + (src_offset >> 1);
     }
@@ -1413,7 +1556,7 @@ if(s->quarter_sample)
 
     ptr = ref_picture[2] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9+field_based, 
+        ff_emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9+field_based, 
                          src_x, src_y<<field_based, s->h_edge_pos>>1, s->v_edge_pos>>1);
         ptr= s->edge_emu_buffer + (src_offset >> 1);
     }
@@ -1453,7 +1596,7 @@ static inline void qpel_motion(MpegEncContext *s,
     if(s->flags&CODEC_FLAG_EMU_EDGE){
         if(src_x<0 || src_y<0 || src_x + (motion_x&3) + 16 > s->h_edge_pos
                               || src_y + (motion_y&3) + h  > v_edge_pos){
-            emulated_edge_mc(s, ptr - src_offset, s->linesize, 17, 17+field_based, 
+            ff_emulated_edge_mc(s, ptr - src_offset, s->linesize, 17, 17+field_based, 
                              src_x, src_y<<field_based, s->h_edge_pos, s->v_edge_pos);
             ptr= s->edge_emu_buffer + src_offset;
             emu=1;
@@ -1498,7 +1641,7 @@ static inline void qpel_motion(MpegEncContext *s,
     offset = (src_y * uvlinesize) + src_x + (src_offset >> 1);
     ptr = ref_picture[1] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9 + field_based, 
+        ff_emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9 + field_based, 
                          src_x, src_y<<field_based, s->h_edge_pos>>1, s->v_edge_pos>>1);
         ptr= s->edge_emu_buffer + (src_offset >> 1);
     }
@@ -1506,7 +1649,7 @@ static inline void qpel_motion(MpegEncContext *s,
     
     ptr = ref_picture[2] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9 + field_based, 
+        ff_emulated_edge_mc(s, ptr - (src_offset >> 1), s->uvlinesize, 9, 9 + field_based, 
                          src_x, src_y<<field_based, s->h_edge_pos>>1, s->v_edge_pos>>1);
         ptr= s->edge_emu_buffer + (src_offset >> 1);
     }
@@ -1542,6 +1685,10 @@ static inline void MPV_motion(MpegEncContext *s,
                         ref_picture, 0,
                         0, pix_op, qpix_op,
                         s->mv[dir][0][0], s->mv[dir][0][1], 16);
+        }else if(s->mspel){
+            ff_mspel_motion(s, dest_y, dest_cb, dest_cr,
+                        ref_picture, pix_op,
+                        s->mv[dir][0][0], s->mv[dir][0][1], 16);
         }else{
             mpeg_motion(s, dest_y, dest_cb, dest_cr, 0,
                         ref_picture, 0,
@@ -1573,7 +1720,7 @@ static inline void MPV_motion(MpegEncContext *s,
                 if(s->flags&CODEC_FLAG_EMU_EDGE){
                     if(src_x<0 || src_y<0 || src_x + (motion_x&3) + 8 > s->h_edge_pos
                                           || src_y + (motion_y&3) + 8 > s->v_edge_pos){
-                        emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
+                        ff_emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
                         ptr= s->edge_emu_buffer;
                     }
                 }
@@ -1604,7 +1751,7 @@ static inline void MPV_motion(MpegEncContext *s,
                 if(s->flags&CODEC_FLAG_EMU_EDGE){
                     if(src_x<0 || src_y<0 || src_x + (motion_x&1) + 8 > s->h_edge_pos
                                           || src_y + (motion_y&1) + 8 > s->v_edge_pos){
-                        emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
+                        ff_emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
                         ptr= s->edge_emu_buffer;
                     }
                 }
@@ -1651,7 +1798,7 @@ static inline void MPV_motion(MpegEncContext *s,
         if(s->flags&CODEC_FLAG_EMU_EDGE){
                 if(src_x<0 || src_y<0 || src_x + (dxy &1) + 8 > s->h_edge_pos>>1
                                       || src_y + (dxy>>1) + 8 > s->v_edge_pos>>1){
-                    emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
+                    ff_emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
                     ptr= s->edge_emu_buffer;
                     emu=1;
                 }
@@ -1660,7 +1807,7 @@ static inline void MPV_motion(MpegEncContext *s,
 
         ptr = ref_picture[2] + offset;
         if(emu){
-            emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
+            ff_emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
             ptr= s->edge_emu_buffer;
         }
         pix_op[1][dxy](dest_cr, ptr, s->uvlinesize, 8);
@@ -1795,7 +1942,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
 
     /* update motion predictor, not for B-frames as they need the motion_val from the last P/S-Frame */
     if (s->out_format == FMT_H263 && s->pict_type!=B_TYPE) { //FIXME move into h263.c if possible, format specific stuff shouldnt be here
-        
+        //FIXME a lot of that is only needed for !low_delay
         const int wrap = s->block_wrap[0];
         const int xy = s->block_index[0];
         const int mb_index= s->mb_x + s->mb_y*s->mb_width;
@@ -1931,7 +2078,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
                     add_dequant_dct(s, block[4], 4, dest_cb, s->uvlinesize);
                     add_dequant_dct(s, block[5], 5, dest_cr, s->uvlinesize);
                 }
-            } else {
+            } else if(s->codec_id != CODEC_ID_WMV2){
                 add_dct(s, block[0], 0, dest_y, dct_linesize);
                 add_dct(s, block[1], 1, dest_y + 8, dct_linesize);
                 add_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize);
@@ -1941,6 +2088,8 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
                     add_dct(s, block[4], 4, dest_cb, s->uvlinesize);
                     add_dct(s, block[5], 5, dest_cr, s->uvlinesize);
                 }
+            } else{
+                ff_wmv2_add_mb(s, block, dest_y, dest_cb, dest_cr);
             }
         } else {
             /* dct only in intra block */
@@ -2174,7 +2323,7 @@ static int pix_diff_vcmp16x8(UINT8 *s1, UINT8*s2, int stride){ //FIXME move to d
 
 void ff_draw_horiz_band(MpegEncContext *s){
     if (    s->avctx->draw_horiz_band 
-        && (s->num_available_buffers>=1 || (!s->has_b_frames)) ) {
+        && (s->last_picture.data[0] || s->low_delay) ) {
         UINT8 *src_ptr[3];
         int y, h, offset;
         y = s->mb_y * 16;
@@ -2187,7 +2336,7 @@ void ff_draw_horiz_band(MpegEncContext *s){
         else
             offset = y * s->linesize;
 
-        if(s->pict_type==B_TYPE || (!s->has_b_frames)){
+        if(s->pict_type==B_TYPE || s->low_delay){
             src_ptr[0] = s->current_picture.data[0] + offset;
             src_ptr[1] = s->current_picture.data[1] + (offset >> 2);
             src_ptr[2] = s->current_picture.data[2] + (offset >> 2);
@@ -2243,7 +2392,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         ptr = s->new_picture.data[0] + (mb_y * 16 * wrap_y) + mb_x * 16;
 
         if(mb_x*16+16 > s->width || mb_y*16+16 > s->height){
-            emulated_edge_mc(s, ptr, wrap_y, 16, 16, mb_x*16, mb_y*16, s->width, s->height);
+            ff_emulated_edge_mc(s, ptr, wrap_y, 16, 16, mb_x*16, mb_y*16, s->width, s->height);
             ptr= s->edge_emu_buffer;
             emu=1;
         }
@@ -2275,14 +2424,14 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             int wrap_c = s->uvlinesize;
             ptr = s->new_picture.data[1] + (mb_y * 8 * wrap_c) + mb_x * 8;
             if(emu){
-                emulated_edge_mc(s, ptr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+                ff_emulated_edge_mc(s, ptr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
                 ptr= s->edge_emu_buffer;
             }
 	    s->dsp.get_pixels(s->block[4], ptr, wrap_c);
 
             ptr = s->new_picture.data[2] + (mb_y * 8 * wrap_c) + mb_x * 8;
             if(emu){
-                emulated_edge_mc(s, ptr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+                ff_emulated_edge_mc(s, ptr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
                 ptr= s->edge_emu_buffer;
             }
             s->dsp.get_pixels(s->block[5], ptr, wrap_c);
@@ -2322,7 +2471,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         }
 
         if(mb_x*16+16 > s->width || mb_y*16+16 > s->height){
-            emulated_edge_mc(s, ptr_y, wrap_y, 16, 16, mb_x*16, mb_y*16, s->width, s->height);
+            ff_emulated_edge_mc(s, ptr_y, wrap_y, 16, 16, mb_x*16, mb_y*16, s->width, s->height);
             ptr_y= s->edge_emu_buffer;
             emu=1;
         }
@@ -2354,12 +2503,12 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             skip_dct[5]= 1;
         }else{
             if(emu){
-                emulated_edge_mc(s, ptr_cb, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+                ff_emulated_edge_mc(s, ptr_cb, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
                 ptr_cb= s->edge_emu_buffer;
             }
             s->dsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
             if(emu){
-                emulated_edge_mc(s, ptr_cr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+                ff_emulated_edge_mc(s, ptr_cr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
                 ptr_cr= s->edge_emu_buffer;
             }
             s->dsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
@@ -2441,21 +2590,25 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
 #ifdef CONFIG_ENCODERS
     /* huffman encode */
-    switch(s->out_format) {
-    case FMT_MPEG1:
-        mpeg1_encode_mb(s, s->block, motion_x, motion_y);
-        break;
-    case FMT_H263:
-        if (s->h263_msmpeg4)
-            msmpeg4_encode_mb(s, s->block, motion_x, motion_y);
-        else if(s->h263_pred)
-            mpeg4_encode_mb(s, s->block, motion_x, motion_y);
-        else
-            h263_encode_mb(s, s->block, motion_x, motion_y);
-        break;
-    case FMT_MJPEG:
-        mjpeg_encode_mb(s, s->block);
-        break;
+    switch(s->codec_id){ //FIXME funct ptr could be slightly faster
+    case CODEC_ID_MPEG1VIDEO:
+        mpeg1_encode_mb(s, s->block, motion_x, motion_y); break;
+    case CODEC_ID_MPEG4:
+        mpeg4_encode_mb(s, s->block, motion_x, motion_y); break;
+    case CODEC_ID_MSMPEG4V2:
+    case CODEC_ID_MSMPEG4V3:
+    case CODEC_ID_WMV1:
+        msmpeg4_encode_mb(s, s->block, motion_x, motion_y); break;
+    case CODEC_ID_WMV2:
+         ff_wmv2_encode_mb(s, s->block, motion_x, motion_y); break;
+    case CODEC_ID_MJPEG:
+        mjpeg_encode_mb(s, s->block); break;
+    case CODEC_ID_H263:
+    case CODEC_ID_H263P:
+    case CODEC_ID_RV10:
+        h263_encode_mb(s, s->block, motion_x, motion_y); break;
+    default:
+        assert(0);
     }
 #endif
 }
@@ -2571,13 +2724,18 @@ static inline int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, in
     int x,y;
     
     if(w==16 && h==16) 
-        return s->dsp.pix_norm(src1, src2, stride);
+        return s->dsp.sse[0](NULL, src1, src2, stride);
+    else if(w==8 && h==8)
+        return s->dsp.sse[1](NULL, src1, src2, stride);
     
     for(y=0; y<h; y++){
         for(x=0; x<w; x++){
             acc+= sq[src1[x + y*stride] - src2[x + y*stride]];
         } 
     }
+    
+    assert(acc>=0);
+    
     return acc;
 }
 
@@ -2618,9 +2776,37 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     s->scene_change_score=0;
     
     s->qscale= (int)(s->frame_qscale + 0.5); //FIXME qscale / ... stuff for ME ratedistoration
-
+    
+    if(s->msmpeg4_version){
+        if(s->pict_type==I_TYPE)
+            s->no_rounding=1;
+        else if(s->flipflop_rounding)
+            s->no_rounding ^= 1;          
+    }else if(s->out_format == FMT_H263){
+        if(s->pict_type==I_TYPE)
+            s->no_rounding=0;
+        else if(s->pict_type!=B_TYPE)
+            s->no_rounding ^= 1;          
+    }
     /* Estimate motion for every MB */
     if(s->pict_type != I_TYPE){
+        if(s->pict_type != B_TYPE){
+            if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
+                s->me.pre_pass=1;
+                s->me.dia_size= s->avctx->pre_dia_size;
+
+                for(mb_y=s->mb_height-1; mb_y >=0 ; mb_y--) {
+                    for(mb_x=s->mb_width-1; mb_x >=0 ; mb_x--) {
+                        s->mb_x = mb_x;
+                        s->mb_y = mb_y;
+                        ff_pre_estimate_p_frame_motion(s, mb_x, mb_y);
+                    }
+                }
+                s->me.pre_pass=0;
+            }
+        }
+
+        s->me.dia_size= s->avctx->dia_size;
         for(mb_y=0; mb_y < s->mb_height; mb_y++) {
             s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
             s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
@@ -2633,13 +2819,12 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                 s->block_index[1]+=2;
                 s->block_index[2]+=2;
                 s->block_index[3]+=2;
-
+                
                 /* compute motion vector & mb_type and store in context */
                 if(s->pict_type==B_TYPE)
                     ff_estimate_b_frame_motion(s, mb_x, mb_y);
                 else
                     ff_estimate_p_frame_motion(s, mb_x, mb_y);
-//                s->mb_type[mb_y*s->mb_width + mb_x]=MB_TYPE_INTER;
             }
         }
     }else /* if(s->pict_type == I_TYPE) */{
@@ -2734,7 +2919,9 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         mjpeg_picture_header(s);
         break;
     case FMT_H263:
-        if (s->h263_msmpeg4) 
+        if (s->codec_id == CODEC_ID_WMV2) 
+            ff_wmv2_encode_picture_header(s, picture_number);
+        else if (s->h263_msmpeg4) 
             msmpeg4_encode_picture_header(s, picture_number);
         else if (s->h263_pred)
             mpeg4_encode_picture_header(s, picture_number);
@@ -2916,15 +3103,14 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                                  &dmin, &next_block, 0, 0);
                 }
                 if(mb_type&MB_TYPE_DIRECT){
+                    int mx= s->b_direct_mv_table[xy][0];
+                    int my= s->b_direct_mv_table[xy][1];
+                    
                     s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
-                    s->mv_type = MV_TYPE_16X16; //FIXME
                     s->mb_intra= 0;
-                    s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0];
-                    s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1];
-                    s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0];
-                    s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1];
+                    ff_mpeg4_set_direct_mv(s, mx, my);
                     encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_DIRECT, pb, pb2, tex_pb, 
-                                 &dmin, &next_block, s->b_direct_mv_table[xy][0], s->b_direct_mv_table[xy][1]);
+                                 &dmin, &next_block, mx, my);
                 }
                 if(mb_type&MB_TYPE_INTRA){
                     s->mv_dir = MV_DIR_FORWARD;
@@ -2989,10 +3175,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                     s->mb_intra= 0;
                     motion_x=s->b_direct_mv_table[xy][0];
                     motion_y=s->b_direct_mv_table[xy][1];
-                    s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0];
-                    s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1];
-                    s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0];
-                    s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1];
+                    ff_mpeg4_set_direct_mv(s, motion_x, motion_y);
                     break;
                 case MB_TYPE_BIDIR:
                     s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
@@ -3037,7 +3220,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
 
                 if(s->mb_x*16 + 16 > s->width ) w= s->width - s->mb_x*16;
                 if(s->mb_y*16 + 16 > s->height) h= s->height- s->mb_y*16;
-                
+
                 s->current_picture.error[0] += sse(
                     s,
                     s->new_picture    .data[0] + s->mb_x*16 + s->mb_y*s->linesize*16,
@@ -3094,6 +3277,254 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     }
 }
 
+static int dct_quantize_trellis_c(MpegEncContext *s, 
+                        DCTELEM *block, int n,
+                        int qscale, int *overflow){
+    const int *qmat;
+    const UINT8 *scantable= s->intra_scantable.scantable;
+    int max=0;
+    unsigned int threshold1, threshold2;
+    int bias=0;
+    int run_tab[65];
+    int level_tab[65];
+    int score_tab[65];
+    int last_run=0;
+    int last_level=0;
+    int last_score= 0;
+    int last_i= 0;
+    int coeff[3][64];
+    int coeff_count[64];
+    int lambda, qmul, qadd, start_i, last_non_zero, i;
+    const int esc_length= s->ac_esc_length;
+    uint8_t * length;
+    uint8_t * last_length;
+    int score_limit=0;
+    int left_limit= 0;
+        
+    s->fdct (block);
+
+    qmul= qscale*16;
+    qadd= ((qscale-1)|1)*8;
+
+    if (s->mb_intra) {
+        int q;
+        if (!s->h263_aic) {
+            if (n < 4)
+                q = s->y_dc_scale;
+            else
+                q = s->c_dc_scale;
+            q = q << 3;
+        } else{
+            /* For AIC we skip quant/dequant of INTRADC */
+            q = 1 << 3;
+            qadd=0;
+        }
+            
+        /* note: block[0] is assumed to be positive */
+        block[0] = (block[0] + (q >> 1)) / q;
+        start_i = 1;
+        last_non_zero = 0;
+        qmat = s->q_intra_matrix[qscale];
+        if(s->mpeg_quant || s->codec_id== CODEC_ID_MPEG1VIDEO)
+            bias= 1<<(QMAT_SHIFT-1);
+        length     = s->intra_ac_vlc_length;
+        last_length= s->intra_ac_vlc_last_length;
+    } else {
+        start_i = 0;
+        last_non_zero = -1;
+        qmat = s->q_inter_matrix[qscale];
+        length     = s->inter_ac_vlc_length;
+        last_length= s->inter_ac_vlc_last_length;
+    }
+
+    threshold1= (1<<QMAT_SHIFT) - bias - 1;
+    threshold2= (threshold1<<1);
+
+    for(i=start_i; i<64; i++) {
+        const int j = scantable[i];
+        const int k= i-start_i;
+        int level = block[j];
+        level = level * qmat[j];
+
+//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
+//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
+        if(((unsigned)(level+threshold1))>threshold2){
+            if(level>0){
+                level= (bias + level)>>QMAT_SHIFT;
+                coeff[0][k]= level;
+                coeff[1][k]= level-1;
+//                coeff[2][k]= level-2;
+            }else{
+                level= (bias - level)>>QMAT_SHIFT;
+                coeff[0][k]= -level;
+                coeff[1][k]= -level+1;
+//                coeff[2][k]= -level+2;
+            }
+            coeff_count[k]= FFMIN(level, 2);
+            max |=level;
+            last_non_zero = i;
+        }else{
+            coeff[0][k]= (level>>31)|1;
+            coeff_count[k]= 1;
+        }
+    }
+    
+    *overflow= s->max_qcoeff < max; //overflow might have happened
+    
+    if(last_non_zero < start_i){
+        memset(block + start_i, 0, (64-start_i)*sizeof(DCTELEM));
+        return last_non_zero;
+    }
+
+    lambda= (qscale*qscale*64*82 + 50)/100; //FIXME finetune
+        
+    score_tab[0]= 0;
+    for(i=0; i<=last_non_zero - start_i; i++){
+        int level_index, run, j;
+        const int dct_coeff= block[ scantable[i + start_i] ];
+        const int zero_distoration= dct_coeff*dct_coeff;
+        int best_score=256*256*256*120;
+
+        last_score += zero_distoration;
+        for(level_index=0; level_index < coeff_count[i]; level_index++){
+            int distoration;
+            int level= coeff[level_index][i];
+            int unquant_coeff;
+            
+            assert(level);
+
+            if(s->out_format == FMT_H263){
+                if(level>0){
+                    unquant_coeff= level*qmul + qadd;
+                }else{
+                    unquant_coeff= level*qmul - qadd;
+                }
+            }else{ //MPEG1
+                j= s->idct_permutation[ scantable[i + start_i] ]; //FIXME optimize
+                if(s->mb_intra){
+                    if (level < 0) {
+                        unquant_coeff = (int)((-level) * qscale * s->intra_matrix[j]) >> 3;
+                        unquant_coeff = -((unquant_coeff - 1) | 1);
+                    } else {
+                        unquant_coeff = (int)(  level  * qscale * s->intra_matrix[j]) >> 3;
+                        unquant_coeff =   (unquant_coeff - 1) | 1;
+                    }
+                }else{
+                    if (level < 0) {
+                        unquant_coeff = ((((-level) << 1) + 1) * qscale * ((int) s->inter_matrix[j])) >> 4;
+                        unquant_coeff = -((unquant_coeff - 1) | 1);
+                    } else {
+                        unquant_coeff = (((  level  << 1) + 1) * qscale * ((int) s->inter_matrix[j])) >> 4;
+                        unquant_coeff =   (unquant_coeff - 1) | 1;
+                    }
+                }
+                unquant_coeff<<= 3;
+            }
+
+            distoration= (unquant_coeff - dct_coeff) * (unquant_coeff - dct_coeff);
+            level+=64;
+            if((level&(~127)) == 0){
+                for(run=0; run<=i - left_limit; run++){
+                    int score= distoration + length[UNI_AC_ENC_INDEX(run, level)]*lambda;
+                    score += score_tab[i-run];
+                    
+                    if(score < best_score){
+                        best_score= 
+                        score_tab[i+1]= score;
+                        run_tab[i+1]= run;
+                        level_tab[i+1]= level-64;
+                    }
+                }
+
+                if(s->out_format == FMT_H263){
+                    for(run=0; run<=i - left_limit; run++){
+                        int score= distoration + last_length[UNI_AC_ENC_INDEX(run, level)]*lambda;
+                        score += score_tab[i-run];
+                        if(score < last_score){
+                            last_score= score;
+                            last_run= run;
+                            last_level= level-64;
+                            last_i= i+1;
+                        }
+                    }
+                }
+            }else{
+                distoration += esc_length*lambda;
+                for(run=0; run<=i - left_limit; run++){
+                    int score= distoration + score_tab[i-run];
+                    
+                    if(score < best_score){
+                        best_score= 
+                        score_tab[i+1]= score;
+                        run_tab[i+1]= run;
+                        level_tab[i+1]= level-64;
+                    }
+                }
+
+                if(s->out_format == FMT_H263){
+                    for(run=0; run<=i - left_limit; run++){
+                        int score= distoration + score_tab[i-run];
+                        if(score < last_score){
+                            last_score= score;
+                            last_run= run;
+                            last_level= level-64;
+                            last_i= i+1;
+                        }
+                    }
+                }
+            }
+        }
+
+        for(j=left_limit; j<=i; j++){
+            score_tab[j] += zero_distoration;
+        }
+        score_limit+= zero_distoration;
+        if(score_tab[i+1] < score_limit)
+            score_limit= score_tab[i+1];
+        
+        //Note: there is a vlc code in mpeg4 which is 1 bit shorter than another one with a shorter run and the same level
+        while(score_tab[ left_limit ] > score_limit + lambda) left_limit++;
+    }
+
+        //FIXME add some cbp penalty
+
+    if(s->out_format != FMT_H263){
+        last_score= 256*256*256*120;
+        for(i= left_limit; i<=last_non_zero - start_i + 1; i++){
+            int score= score_tab[i];
+            if(i) score += lambda*2; //FIXME exacter?
+
+            if(score < last_score){
+                last_score= score;
+                last_i= i;
+                last_level= level_tab[i];
+                last_run= run_tab[i];
+            }
+        }
+    }
+    
+    last_non_zero= last_i - 1 + start_i;
+    memset(block + start_i, 0, (64-start_i)*sizeof(DCTELEM));
+    
+    if(last_non_zero < start_i)
+        return last_non_zero;
+    
+    i= last_i;
+    assert(last_level);
+//FIXME use permutated scantable
+    block[ s->idct_permutation[ scantable[last_non_zero] ] ]= last_level;
+    i -= last_run + 1;
+    
+    for(;i>0 ; i -= run_tab[i] + 1){
+        const int j= s->idct_permutation[ scantable[i - 1 + start_i] ];
+    
+        block[j]= level_tab[i];
+        assert(block[j]);
+    }
+
+    return last_non_zero;
+}
+
 static int dct_quantize_c(MpegEncContext *s, 
                         DCTELEM *block, int n,
                         int qscale, int *overflow)
@@ -3332,6 +3763,16 @@ static void dct_unquantize_h263_c(MpegEncContext *s,
     }
 }
 
+char ff_get_pict_type_char(int pict_type){
+    switch(pict_type){
+    case I_TYPE: return 'I'; 
+    case P_TYPE: return 'P'; 
+    case B_TYPE: return 'B'; 
+    case S_TYPE: return 'S'; 
+    default:     return '?';
+    }
+}
+
 AVCodec mpeg1video_encoder = {
     "mpeg1video",
     CODEC_TYPE_VIDEO,
@@ -3432,12 +3873,3 @@ AVCodec wmv1_encoder = {
     MPV_encode_end,
 };
 
-AVCodec wmv2_encoder = {
-    "wmv2",
-    CODEC_TYPE_VIDEO,
-    CODEC_ID_WMV2,
-    sizeof(MpegEncContext),
-    MPV_encode_init,
-    MPV_encode_picture,
-    MPV_encode_end,
-};
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 46e19d4b1..e6909817a 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -38,7 +38,7 @@ enum OutputFormat {
 #define MAX_FCODE 7
 #define MAX_MV 2048
 
-#define MAX_PICTURE_COUNT 7
+#define MAX_PICTURE_COUNT 15
 
 #define ME_MAP_SIZE 64
 #define ME_MAP_SHIFT 3
@@ -110,13 +110,14 @@ typedef struct ScanTable{
 } ScanTable;
 
 typedef struct Picture{
-    FF_COMMON_PICTURE    
+    FF_COMMON_FRAME
 
     int mb_var_sum;             /* sum of MB variance for current frame */
     int mc_mb_var_sum;          /* motion compensated MB variance for current frame */
     uint16_t *mb_var;           /* Table for MB variances */
     uint16_t *mc_mb_var;        /* Table for motion compensated MB variances */
     uint8_t *mb_mean;           /* Table for MB luminance */
+    int b_frame_score;          /* */
 } Picture;
 
 typedef struct ParseContext{
@@ -128,6 +129,39 @@ typedef struct ParseContext{
     int frame_start_found;
 } ParseContext;
 
+struct MpegEncContext;
+
+typedef struct MotionEstContext{
+    int skip;                          /* set if ME is skipped for the current MB */
+    int co_located_mv[4][2];           /* mv from last p frame for direct mode ME */
+    int direct_basis_mv[4][2];
+    uint8_t *scratchpad;               /* data area for the me algo, so that the ME doesn't need to malloc/free */
+    uint32_t *map;                     /* map to avoid duplicate evaluations */
+    uint32_t *score_map;               /* map to store the scores */
+    int map_generation;  
+    int pre_penalty_factor;
+    int penalty_factor;
+    int sub_penalty_factor;
+    int pre_pass;                      /* = 1 for the pre pass */
+    int dia_size;
+    UINT16 (*mv_penalty)[MAX_MV*2+1];  /* amount of bits needed to encode a MV */
+    int (*sub_motion_search)(struct MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, Picture *ref_picture, 
+                                  int n, int size, uint16_t * const mv_penalty);
+    int (*motion_search[7])(struct MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, uint16_t * const mv_penalty);
+    int (*pre_motion_search)(struct MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, uint16_t * const mv_penalty);
+}MotionEstContext;
+
 typedef struct MpegEncContext {
     struct AVCodecContext *avctx;
     /* the following parameters must be initialized before encoding */
@@ -184,7 +218,6 @@ typedef struct MpegEncContext {
     Picture next_picture;       /* previous picture (for bidir pred) */
     Picture new_picture;        /* source picture for encoding */
     Picture current_picture;    /* buffer to store the decompressed current picture */
-    int num_available_buffers;   /* is 0 at the start & after seeking, after the first I frame its 1 after next I/P 2 */
     int last_dc[3];              /* last DC values for MPEG1 */
     INT16 *dc_val[3];            /* used for mpeg4 DC prediction, all 3 arrays must be continuous */
     int y_dc_scale, c_dc_scale;
@@ -199,7 +232,8 @@ typedef struct MpegEncContext {
     UINT8 *mbintra_table;       /* used to avoid setting {ac, dc, cbp}-pred stuff to zero on inter MB decoding */
     UINT8 *cbp_table;           /* used to store cbp, ac_pred for partitioned decoding */
     UINT8 *pred_dir_table;      /* used to store pred_dir for partitioned decoding */
-    UINT8 *edge_emu_buffer;
+    uint8_t *allocated_edge_emu_buffer;
+    uint8_t *edge_emu_buffer;     /* points into the middle of allocated_edge_emu_buffer */ 
 
     int qscale;                 /* QP */
     float frame_qscale;         /* qscale from the frame level rc FIXME remove*/
@@ -222,15 +256,8 @@ typedef struct MpegEncContext {
     INT16 (*b_back_mv_table)[2];       /* MV table (1MV per MB) backward mode b-frame encoding */
     INT16 (*b_bidir_forw_mv_table)[2]; /* MV table (1MV per MB) bidir mode b-frame encoding */
     INT16 (*b_bidir_back_mv_table)[2]; /* MV table (1MV per MB) bidir mode b-frame encoding */
-    INT16 (*b_direct_forw_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */
-    INT16 (*b_direct_back_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */
     INT16 (*b_direct_mv_table)[2];     /* MV table (1MV per MB) direct mode b-frame encoding */
     int me_method;                     /* ME algorithm */
-    uint8_t *me_scratchpad;            /* data area for the me algo, so that the ME doesnt need to malloc/free */
-    uint32_t *me_map;                  /* map to avoid duplicate evaluations */
-    uint16_t *me_score_map;            /* map to store the SADs */
-    int me_map_generation;
-    int skip_me;                       /* set if ME is skiped for the current MB */
     int scene_change_score;
     int mv_dir;
 #define MV_DIR_BACKWARD  1
@@ -250,10 +277,10 @@ typedef struct MpegEncContext {
     int mv[2][4][2];
     int field_select[2][2];
     int last_mv[2][2][2];             /* last MV, used for MV prediction in MPEG1 & B-frame MPEG4 */
-    UINT16 (*mv_penalty)[MAX_MV*2+1]; /* amount of bits needed to encode a MV, used for ME */
     UINT8 *fcode_tab; /* smallest fcode needed for each MV */
+    
+    MotionEstContext me;
 
-    int has_b_frames;
     int no_rounding; /* apply no rounding to motion compensation (MPEG4, msmpeg4, ...) 
                         for b-frames rounding mode is allways 0 */
 
@@ -289,6 +316,13 @@ typedef struct MpegEncContext {
     int inter_quant_bias;    /* bias for the quantizer */
     int min_qcoeff;          /* minimum encodable coefficient */
     int max_qcoeff;          /* maximum encodable coefficient */
+    int ac_esc_length;       /* num of bits needed to encode the longest esc */
+    uint8_t *intra_ac_vlc_length;
+    uint8_t *intra_ac_vlc_last_length;
+    uint8_t *inter_ac_vlc_length;
+    uint8_t *inter_ac_vlc_last_length;
+#define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level))
+
     /* precomputed matrix (combine qscale and DCT renorm) */
     int __align8 q_intra_matrix[32][64];
     int __align8 q_inter_matrix[32][64];
@@ -344,7 +378,7 @@ typedef struct MpegEncContext {
     
     int resync_mb_x;                 /* x position of last resync marker */
     int resync_mb_y;                 /* y position of last resync marker */
-    GetBitContext last_resync_gb;    /* used to serach for the next resync marker */
+    GetBitContext last_resync_gb;    /* used to search for the next resync marker */
     int mb_num_left;                 /* number of MBs left in this video packet (for partitioned Slices only)*/
     int next_p_frame_damaged;        /* set if the next p frame is damaged, to avoid showing trashed b frames */
     int error_resilience;
@@ -459,6 +493,7 @@ typedef struct MpegEncContext {
     /* [mb_intra][isChroma][level][run][last] */
     int (*ac_stats)[2][MAX_LEVEL+1][MAX_RUN+1][2];
     int inter_intra_pred;
+    int mspel;
 
     /* decompression specific */
     GetBitContext gb;
@@ -520,6 +555,7 @@ typedef struct MpegEncContext {
     void (*fdct)(DCTELEM *block/* align 16*/);
     void (*idct_put)(UINT8 *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
     void (*idct_add)(UINT8 *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+    //FIXME move above funcs into dspContext perhaps
 } MpegEncContext;
 
 
@@ -529,6 +565,9 @@ void MPV_common_end(MpegEncContext *s);
 void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx);
 void MPV_frame_end(MpegEncContext *s);
+int MPV_encode_init(AVCodecContext *avctx);
+int MPV_encode_end(AVCodecContext *avctx);
+int MPV_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data);
 #ifdef HAVE_MMX
 void MPV_common_init_mmx(MpegEncContext *s);
 #endif
@@ -554,6 +593,10 @@ void ff_clean_intra_table_entries(MpegEncContext *s);
 void ff_init_scantable(MpegEncContext *s, ScanTable *st, const UINT8 *src_scantable);
 void ff_error_resilience(MpegEncContext *s);
 void ff_draw_horiz_band(MpegEncContext *s);
+void ff_emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, 
+                                    int src_x, int src_y, int w, int h);
+char ff_get_pict_type_char(int pict_type);
+
 
 extern int ff_bit_exact;
 
@@ -584,6 +627,8 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
 int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type);
 void ff_fix_long_p_mvs(MpegEncContext * s);
 void ff_fix_long_b_mvs(MpegEncContext * s, int16_t (*mv_table)[2], int f_code, int type);
+void ff_init_me(MpegEncContext *s);
+int ff_pre_estimate_p_frame_motion(MpegEncContext * s, int mb_x, int mb_y);
 
 
 /* mpeg12.c */
@@ -630,6 +675,11 @@ extern UINT8 ff_mpeg4_y_dc_scale_table[32];
 extern UINT8 ff_mpeg4_c_dc_scale_table[32];
 extern const INT16 ff_mpeg4_default_intra_matrix[64];
 extern const INT16 ff_mpeg4_default_non_intra_matrix[64];
+int ff_h263_decode_init(AVCodecContext *avctx);
+int ff_h263_decode_frame(AVCodecContext *avctx, 
+                             void *data, int *data_size,
+                             UINT8 *buf, int buf_size);
+int ff_h263_decode_end(AVCodecContext *avctx);
 void h263_encode_mb(MpegEncContext *s, 
                     DCTELEM block[6][64],
                     int motion_x, int motion_y);
@@ -666,6 +716,7 @@ int ff_mpeg4_decode_partitions(MpegEncContext *s);
 int ff_mpeg4_get_video_packet_prefix_length(MpegEncContext *s);
 int ff_h263_resync(MpegEncContext *s);
 int ff_h263_get_gob_height(MpegEncContext *s);
+void ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my);
 
 
 /* rv10.c */
@@ -683,7 +734,16 @@ int msmpeg4_decode_picture_header(MpegEncContext * s);
 int msmpeg4_decode_ext_header(MpegEncContext * s, int buf_size);
 int ff_msmpeg4_decode_init(MpegEncContext *s);
 void ff_msmpeg4_encode_init(MpegEncContext *s);
-
+int ff_wmv2_decode_picture_header(MpegEncContext * s);
+void ff_wmv2_add_mb(MpegEncContext *s, DCTELEM block[6][64], uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr);
+void ff_mspel_motion(MpegEncContext *s,
+                               UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
+                               UINT8 **ref_picture, op_pixels_func (*pix_op)[4],
+                               int motion_x, int motion_y, int h);
+int ff_wmv2_encode_picture_header(MpegEncContext * s, int picture_number);
+void ff_wmv2_encode_mb(MpegEncContext * s, 
+                       DCTELEM block[6][64],
+                       int motion_x, int motion_y);
 
 /* mjpegenc.c */
 int mjpeg_init(MpegEncContext *s);
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 76fa3c2d4..a08418874 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -48,12 +48,14 @@
 #define II_BITRATE 128*1024
 #define MBAC_BITRATE 50*1024
 
+#define DEFAULT_INTER_INDEX 3
+
 static UINT32 v2_dc_lum_table[512][2];
 static UINT32 v2_dc_chroma_table[512][2];
 
 static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n);
 static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
-                                       int n, int coded);
+                                       int n, int coded, const uint8_t *scantable);
 static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr);
 static int msmpeg4_decode_motion(MpegEncContext * s, 
                                  int *mx_ptr, int *my_ptr);
@@ -63,6 +65,7 @@ static inline void msmpeg4_memsetw(short *tab, int val, int n);
 static int get_size_of_code(MpegEncContext * s, RLTable *rl, int last, int run, int level, int intra);
 static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
+static int wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 
 extern UINT32 inverse[256];
 
@@ -160,13 +163,14 @@ static void common_init(MpegEncContext * s)
         }
         break;
     case 4:
+    case 5:
         s->y_dc_scale_table= wmv1_y_dc_scale_table;
         s->c_dc_scale_table= wmv1_c_dc_scale_table;
         break;
     }
 
     
-    if(s->msmpeg4_version==4){
+    if(s->msmpeg4_version>=4){
         ff_init_scantable(s, &s->intra_scantable  , wmv1_scantable[1]);
         ff_init_scantable(s, &s->intra_h_scantable, wmv1_scantable[2]);
         ff_init_scantable(s, &s->intra_v_scantable, wmv1_scantable[3]);
@@ -370,9 +374,9 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     s->per_mb_rl_table = 0;
     if(s->msmpeg4_version==4)
         s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE && s->pict_type==P_TYPE);
+//printf("%d %d %d %d %d\n", s->pict_type, s->bit_rate, s->inter_intra_pred, s->width, s->height);
 
     if (s->pict_type == I_TYPE) {
-        s->no_rounding = 1;
         s->slice_height= s->mb_height/1;
         put_bits(&s->pb, 5, 0x16 + s->mb_height/s->slice_height);
         
@@ -404,12 +408,6 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
             put_bits(&s->pb, 1, s->mv_table_index);
         }
-
-	if(s->flipflop_rounding){
-	    s->no_rounding ^= 1;
-	}else{
-	    s->no_rounding = 0;
-	}
     }
 
     s->esc3_level_length= 0;
@@ -923,7 +921,7 @@ static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int
     }
 
     /* recalculate block_last_index for M$ wmv1 */
-    if(s->msmpeg4_version==4 && s->block_last_index[n]>0){
+    if(s->msmpeg4_version>=4 && s->block_last_index[n]>0){
         for(last_index=63; last_index>=0; last_index--){
             if(block[scantable[last_index]]) break;
         }
@@ -975,7 +973,7 @@ else
                         /* third escape */
                         put_bits(&s->pb, 1, 0);
                         put_bits(&s->pb, 1, last);
-                        if(s->msmpeg4_version==4){
+                        if(s->msmpeg4_version>=4){
                             if(s->esc3_level_length==0){
                                 s->esc3_level_length=8;
                                 s->esc3_run_length= 6;
@@ -1014,7 +1012,7 @@ else
 /****************************************/
 /* decoding stuff */
 
-static VLC mb_non_intra_vlc;
+static VLC mb_non_intra_vlc[4];
 static VLC mb_intra_vlc;
 static VLC dc_lum_vlc[2];
 static VLC dc_chroma_vlc[2];
@@ -1139,9 +1137,12 @@ int ff_msmpeg4_decode_init(MpegEncContext *s)
                  &mvtab[0][1], 2, 1,
                  &mvtab[0][0], 2, 1);
 
-        init_vlc(&mb_non_intra_vlc, MB_NON_INTRA_VLC_BITS, 128, 
-                 &table_mb_non_intra[0][1], 8, 4,
-                 &table_mb_non_intra[0][0], 8, 4);
+        for(i=0; i<4; i++){
+            init_vlc(&mb_non_intra_vlc[i], MB_NON_INTRA_VLC_BITS, 128, 
+                     &wmv2_inter_table[i][0][1], 8, 4,
+                     &wmv2_inter_table[i][0][0], 8, 4); //FIXME name?
+        }
+        
         init_vlc(&mb_intra_vlc, MB_INTRA_VLC_BITS, 64, 
                  &table_mb_intra[0][1], 4, 2,
                  &table_mb_intra[0][0], 4, 2);
@@ -1167,6 +1168,9 @@ int ff_msmpeg4_decode_init(MpegEncContext *s)
     case 4:
         s->decode_mb= msmpeg4v34_decode_mb;
         break;
+    case 5:
+        s->decode_mb= wmv2_decode_mb;
+        break;
     }
     
     s->slice_height= s->mb_height; //to avoid 1/0 if the first frame isnt a keyframe
@@ -1334,6 +1338,7 @@ return -1;
 	    s->no_rounding = 0;
 	}
     }
+//printf("%d %d %d %d %d\n", s->pict_type, s->bit_rate, s->inter_intra_pred, s->width, s->height);
 
     s->esc3_level_length= 0;
     s->esc3_run_length= 0;
@@ -1523,7 +1528,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
     }
 
     for (i = 0; i < 6; i++) {
-        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0)
+        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
 	{
              fprintf(stderr,"\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
              return -1;
@@ -1566,7 +1571,7 @@ printf("S ");
             }
         }
         
-        code = get_vlc2(&s->gb, mb_non_intra_vlc.table, MB_NON_INTRA_VLC_BITS, 3);
+        code = get_vlc2(&s->gb, mb_non_intra_vlc[DEFAULT_INTER_INDEX].table, MB_NON_INTRA_VLC_BITS, 3);
         if (code < 0)
             return -1;
 	//s->mb_intra = (code & 0x40) ? 0 : 1;
@@ -1628,7 +1633,7 @@ printf("%c", s->ac_pred ? 'A' : 'I');
     }
 
     for (i = 0; i < 6; i++) {
-        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0)
+        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
 	{
 	    fprintf(stderr,"\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
 	    return -1;
@@ -1639,13 +1644,12 @@ printf("%c", s->ac_pred ? 'A' : 'I');
 }
 //#define ERROR_DETAILS
 static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
-                              int n, int coded)
+                              int n, int coded, const uint8_t *scan_table)
 {
     int level, i, last, run, run_diff;
     int dc_pred_dir;
     RLTable *rl;
     RL_VLC_ELEM *rl_vlc;
-    const UINT8 *scan_table;
     int qmul, qadd;
 
     if (s->mb_intra) {
@@ -1713,7 +1717,8 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
             s->block_last_index[n] = i;
             return 0;
         }
-        scan_table = s->inter_scantable.permutated;
+        if(!scan_table)
+            scan_table = s->inter_scantable.permutated;
         set_stat(ST_INTER_AC);
         rl_vlc= rl->rl_vlc[s->qscale];
     }
@@ -1889,7 +1894,7 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
             i = 63; /* XXX: not optimal */
         }
     }
-    if(s->msmpeg4_version==4 && i>0) i=63; //FIXME/XXX optimize
+    if(s->msmpeg4_version>=4 && i>0) i=63; //FIXME/XXX optimize
     s->block_last_index[n] = i;
     
     return 0;
@@ -1990,3 +1995,9 @@ static int msmpeg4_decode_motion(MpegEncContext * s,
     *my_ptr = my;
     return 0;
 }
+
+/* cleanest way to support it:
+ * there is too much shared between versions, so we can't have 1 file per version & 1 common file,
+ * as almost everything would end up in the common file
+ */
+#include "wmv2.c"
diff --git a/src/libffmpeg/libavcodec/msmpeg4data.h b/src/libffmpeg/libavcodec/msmpeg4data.h
index 93a72c54c..3490fc08c 100644
--- a/src/libffmpeg/libavcodec/msmpeg4data.h
+++ b/src/libffmpeg/libavcodec/msmpeg4data.h
@@ -3,7 +3,7 @@
  */
 
 /* intra picture macro block coded block pattern */
-static const UINT16 table_mb_intra[64][2] = {
+static const uint16_t table_mb_intra[64][2] = {
 { 0x1, 1 },{ 0x17, 6 },{ 0x9, 5 },{ 0x5, 5 },
 { 0x6, 5 },{ 0x47, 9 },{ 0x20, 7 },{ 0x10, 7 },
 { 0x2, 5 },{ 0x7c, 9 },{ 0x3a, 7 },{ 0x1d, 7 },
@@ -23,7 +23,7 @@ static const UINT16 table_mb_intra[64][2] = {
 };
 
 /* non intra picture macro block coded block pattern + mb type */
-static const UINT32 table_mb_non_intra[128][2] = {
+static const uint32_t table_mb_non_intra[128][2] = {
 { 0x40, 7 },{ 0x13c9, 13 },{ 0x9fd, 12 },{ 0x1fc, 15 },
 { 0x9fc, 12 },{ 0xa83, 18 },{ 0x12d34, 17 },{ 0x83bc, 16 },
 { 0x83a, 12 },{ 0x7f8, 17 },{ 0x3fd, 16 },{ 0x3ff, 16 },
@@ -60,7 +60,7 @@ static const UINT32 table_mb_non_intra[128][2] = {
 
 /* dc table 0 */
 
-static const UINT32 table0_dc_lum[120][2] = {
+static const uint32_t table0_dc_lum[120][2] = {
 { 0x1, 1 },{ 0x1, 2 },{ 0x1, 4 },{ 0x1, 5 },
 { 0x5, 5 },{ 0x7, 5 },{ 0x8, 6 },{ 0xc, 6 },
 { 0x0, 7 },{ 0x2, 7 },{ 0x12, 7 },{ 0x1a, 7 },
@@ -93,7 +93,7 @@ static const UINT32 table0_dc_lum[120][2] = {
 { 0x6078c, 24 },{ 0x6078d, 24 },{ 0x6078e, 24 },{ 0x6078f, 24 },
 };
 
-static const UINT32 table0_dc_chroma[120][2] = {
+static const uint32_t table0_dc_chroma[120][2] = {
 { 0x0, 2 },{ 0x1, 2 },{ 0x5, 3 },{ 0x9, 4 },
 { 0xd, 4 },{ 0x11, 5 },{ 0x1d, 5 },{ 0x1f, 5 },
 { 0x21, 6 },{ 0x31, 6 },{ 0x38, 6 },{ 0x33, 6 },
@@ -128,7 +128,7 @@ static const UINT32 table0_dc_chroma[120][2] = {
 
 /* dc table 1 */
 
-static const UINT32 table1_dc_lum[120][2] = {
+static const uint32_t table1_dc_lum[120][2] = {
 { 0x2, 2 },{ 0x3, 2 },{ 0x3, 3 },{ 0x2, 4 },
 { 0x5, 4 },{ 0x1, 5 },{ 0x3, 5 },{ 0x8, 5 },
 { 0x0, 6 },{ 0x5, 6 },{ 0xd, 6 },{ 0xf, 6 },
@@ -161,7 +161,7 @@ static const UINT32 table1_dc_lum[120][2] = {
 { 0x1e6964, 26 },{ 0x1e6965, 26 },{ 0x1e6966, 26 },{ 0x1e6967, 26 },
 };
 
-static const UINT32 table1_dc_chroma[120][2] = {
+static const uint32_t table1_dc_chroma[120][2] = {
 { 0x0, 2 },{ 0x1, 2 },{ 0x4, 3 },{ 0x7, 3 },
 { 0xb, 4 },{ 0xd, 4 },{ 0x15, 5 },{ 0x28, 6 },
 { 0x30, 6 },{ 0x32, 6 },{ 0x52, 7 },{ 0x62, 7 },
@@ -196,7 +196,7 @@ static const UINT32 table1_dc_chroma[120][2] = {
 
 /* vlc table 0, for intra luma */
 
-static const UINT16 table0_vlc[133][2] = {
+static const uint16_t table0_vlc[133][2] = {
 { 0x1, 2 },{ 0x6, 3 },{ 0xf, 4 },{ 0x16, 5 },
 { 0x20, 6 },{ 0x18, 7 },{ 0x8, 8 },{ 0x9a, 8 },
 { 0x56, 9 },{ 0x13e, 9 },{ 0xf0, 10 },{ 0x3a5, 10 },
@@ -233,7 +233,7 @@ static const UINT16 table0_vlc[133][2] = {
 { 0x16, 7 },
 };
 
-static const INT8 table0_level[132] = {
+static const int8_t table0_level[132] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14, 15, 16,
   1,  2,  3,  4,  5,  6,  7,  8,
@@ -253,7 +253,7 @@ static const INT8 table0_level[132] = {
   1,  1,  1,  1,
 };
 
-static const INT8 table0_run[132] = {
+static const int8_t table0_run[132] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   1,  1,  1,  1,  1,  1,  1,  1,
@@ -275,7 +275,7 @@ static const INT8 table0_run[132] = {
 
 /* vlc table 1, for intra chroma and P macro blocks */
 
-static const UINT16 table1_vlc[149][2] = {
+static const uint16_t table1_vlc[149][2] = {
 { 0x4, 3 },{ 0x14, 5 },{ 0x17, 7 },{ 0x7f, 8 },
 { 0x154, 9 },{ 0x1f2, 10 },{ 0xbf, 11 },{ 0x65, 12 },
 { 0xaaa, 12 },{ 0x630, 13 },{ 0x1597, 13 },{ 0x3b7, 14 },
@@ -316,7 +316,7 @@ static const UINT16 table1_vlc[149][2] = {
 { 0xd, 9 },
 };
 
-static const INT8 table1_level[148] = {
+static const int8_t table1_level[148] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14,  1,  2,
   3,  4,  5,  6,  7,  8,  9,  1,
@@ -338,7 +338,7 @@ static const INT8 table1_level[148] = {
   1,  1,  1,  1,
 };
 
-static const INT8 table1_run[148] = {
+static const int8_t table1_run[148] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  1,  1,
   1,  1,  1,  1,  1,  1,  1,  2,
@@ -362,7 +362,7 @@ static const INT8 table1_run[148] = {
 
 /* third vlc table */
 
-static const UINT16 table2_vlc[186][2] = {
+static const uint16_t table2_vlc[186][2] = {
 { 0x1, 2 },{ 0x5, 3 },{ 0xd, 4 },{ 0x12, 5 },
 { 0xe, 6 },{ 0x15, 7 },{ 0x13, 8 },{ 0x3f, 8 },
 { 0x4b, 9 },{ 0x11f, 9 },{ 0xb8, 10 },{ 0x3e3, 10 },
@@ -412,7 +412,7 @@ static const UINT16 table2_vlc[186][2] = {
 { 0x23dc, 14 },{ 0x4a, 9 },
 };
 
-static const INT8 table2_level[185] = {
+static const int8_t table2_level[185] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14, 15, 16,
  17, 18, 19,  1,  2,  3,  4,  5,
@@ -439,7 +439,7 @@ static const INT8 table2_level[185] = {
   1,
 };
 
-static const INT8 table2_run[185] = {
+static const int8_t table2_run[185] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  1,  1,  1,  1,  1,
@@ -467,7 +467,7 @@ static const INT8 table2_run[185] = {
 };
 
 /* second non intra vlc table */
-static const UINT16 table4_vlc[169][2] = {
+static const uint16_t table4_vlc[169][2] = {
 { 0x0, 3 },{ 0x3, 4 },{ 0xb, 5 },{ 0x14, 6 },
 { 0x3f, 6 },{ 0x5d, 7 },{ 0xa2, 8 },{ 0xac, 9 },
 { 0x16e, 9 },{ 0x20a, 10 },{ 0x2e2, 10 },{ 0x432, 11 },
@@ -513,7 +513,7 @@ static const UINT16 table4_vlc[169][2] = {
 { 0x169, 9 },
 };
 
-static const INT8 table4_level[168] = {
+static const int8_t table4_level[168] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14, 15, 16,
  17, 18, 19, 20, 21, 22, 23,  1,
@@ -537,7 +537,7 @@ static const INT8 table4_level[168] = {
   1,  1,  1,  1,  1,  1,  1,  1,
 };
 
-static const INT8 table4_run[168] = {
+static const int8_t table4_run[168] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  1,
@@ -561,25 +561,25 @@ static const INT8 table4_run[168] = {
  29, 30, 31, 32, 33, 34, 35, 36,
 };
 
-extern const UINT16 inter_vlc[103][2];
-extern const INT8 inter_level[102];
-extern const INT8 inter_run[102];
+extern const uint16_t inter_vlc[103][2];
+extern const int8_t inter_level[102];
+extern const int8_t inter_run[102];
 
-extern const UINT16 intra_vlc[103][2];
-extern const INT8 intra_level[102];
-extern const INT8 intra_run[102];
+extern const uint16_t intra_vlc[103][2];
+extern const int8_t intra_level[102];
+extern const int8_t intra_run[102];
 
-extern const UINT8 DCtab_lum[13][2];
-extern const UINT8 DCtab_chrom[13][2];
+extern const uint8_t DCtab_lum[13][2];
+extern const uint8_t DCtab_chrom[13][2];
 
-extern const UINT8 cbpy_tab[16][2];
-extern const UINT8 mvtab[33][2];
+extern const uint8_t cbpy_tab[16][2];
+extern const uint8_t mvtab[33][2];
 
-extern const UINT8 intra_MCBPC_code[8];
-extern const UINT8 intra_MCBPC_bits[8];
+extern const uint8_t intra_MCBPC_code[8];
+extern const uint8_t intra_MCBPC_bits[8];
 
-extern const UINT8 inter_MCBPC_code[25];
-extern const UINT8 inter_MCBPC_bits[25];
+extern const uint8_t inter_MCBPC_code[25];
+extern const uint8_t inter_MCBPC_bits[25];
 
 #define NB_RL_TABLES  6
 
@@ -632,7 +632,7 @@ static RLTable rl_table[NB_RL_TABLES] = {
 
 /* motion vector table 0 */
 
-static const UINT16 table0_mv_code[1100] = {
+static const uint16_t table0_mv_code[1100] = {
  0x0001, 0x0003, 0x0005, 0x0007, 0x0003, 0x0008, 0x000c, 0x0001,
  0x0002, 0x001b, 0x0006, 0x000b, 0x0015, 0x0002, 0x000e, 0x000f,
  0x0014, 0x0020, 0x0022, 0x0025, 0x0027, 0x0029, 0x002d, 0x004b,
@@ -773,7 +773,7 @@ static const UINT16 table0_mv_code[1100] = {
  0x5f0d, 0x5f0e, 0x5f0f, 0x0000,
 };
 
-static const UINT8 table0_mv_bits[1100] = {
+static const uint8_t table0_mv_bits[1100] = {
   1,  4,  4,  4,  5,  5,  5,  6,
   6,  6,  7,  7,  7,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,
@@ -914,7 +914,7 @@ static const UINT8 table0_mv_bits[1100] = {
  17, 17, 17,  8,
 };
 
-static const UINT8 table0_mvx[1099] = {
+static const uint8_t table0_mvx[1099] = {
  32, 32, 31, 32, 33, 31, 33, 31,
  33, 32, 34, 32, 30, 32, 31, 34,
  35, 32, 34, 33, 29, 33, 30, 30,
@@ -1055,7 +1055,7 @@ static const UINT8 table0_mvx[1099] = {
  61, 19, 19,
 };
 
-static const UINT8 table0_mvy[1099] = {
+static const uint8_t table0_mvy[1099] = {
  32, 31, 32, 33, 32, 31, 31, 33,
  33, 34, 32, 30, 32, 35, 34, 31,
  32, 29, 33, 30, 32, 34, 33, 31,
@@ -1197,7 +1197,7 @@ static const UINT8 table0_mvy[1099] = {
 };
 
 /* motion vector table 1 */
-static const UINT16 table1_mv_code[1100] = {
+static const uint16_t table1_mv_code[1100] = {
  0x0000, 0x0007, 0x0009, 0x000f, 0x000a, 0x0011, 0x001a, 0x001c,
  0x0011, 0x0031, 0x0025, 0x002d, 0x002f, 0x006f, 0x0075, 0x0041,
  0x004c, 0x004e, 0x005c, 0x0060, 0x0062, 0x0066, 0x0068, 0x0069,
@@ -1338,7 +1338,7 @@ static const UINT16 table1_mv_code[1100] = {
  0x2473, 0x26a2, 0x26a3, 0x000b,
 };
 
-static const UINT8 table1_mv_bits[1100] = {
+static const uint8_t table1_mv_bits[1100] = {
   2,  4,  4,  4,  5,  5,  5,  5,
   6,  6,  7,  7,  7,  7,  7,  8,
   8,  8,  8,  8,  8,  8,  8,  8,
@@ -1479,7 +1479,7 @@ static const UINT8 table1_mv_bits[1100] = {
  15, 15, 15,  4,
 };
 
-static const UINT8 table1_mvx[1099] = {
+static const uint8_t table1_mvx[1099] = {
  32, 31, 32, 31, 33, 32, 33, 33,
  31, 34, 30, 32, 32, 34, 35, 32,
  34, 33, 29, 30, 30, 32, 31, 31,
@@ -1620,7 +1620,7 @@ static const UINT8 table1_mvx[1099] = {
   0, 12, 27,
 };
 
-static const UINT8 table1_mvy[1099] = {
+static const uint8_t table1_mvy[1099] = {
  32, 32, 31, 31, 32, 33, 31, 33,
  33, 32, 32, 30, 34, 31, 32, 29,
  33, 30, 32, 33, 31, 35, 34, 30,
@@ -1764,11 +1764,11 @@ static const UINT8 table1_mvy[1099] = {
 /* motion vector table */
 typedef struct MVTable {
     int n;
-    const UINT16 *table_mv_code;
-    const UINT8 *table_mv_bits;
-    const UINT8 *table_mvx;
-    const UINT8 *table_mvy;
-    UINT16 *table_mv_index; /* encoding: convert mv to index in table_mv */
+    const uint16_t *table_mv_code;
+    const uint8_t *table_mv_bits;
+    const uint8_t *table_mvx;
+    const uint8_t *table_mvy;
+    uint16_t *table_mv_index; /* encoding: convert mv to index in table_mv */
     VLC vlc;                /* decoding: vlc */
 } MVTable;
 
@@ -1789,29 +1789,29 @@ static MVTable mv_tables[2] = {
     }
 };
 
-static const UINT8 v2_mb_type[8][2] = {
+static const uint8_t v2_mb_type[8][2] = {
  {1, 1}, {0   , 2}, {3   , 3}, {9   , 5},
  {5, 4}, {0x21, 7}, {0x20, 7}, {0x11, 6},
 };
 
-static const UINT8 v2_intra_cbpc[4][2] = {
+static const uint8_t v2_intra_cbpc[4][2] = {
  {1, 1}, {0, 3}, {1, 3}, {1, 2},
 };
 
-static UINT8 wmv1_y_dc_scale_table[32]={
+static uint8_t wmv1_y_dc_scale_table[32]={
 //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
     0, 8, 8, 8, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21
 };
-static UINT8 wmv1_c_dc_scale_table[32]={
+static uint8_t wmv1_c_dc_scale_table[32]={
 //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
     0, 8, 8, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22
 };
 
-static UINT8 old_ff_y_dc_scale_table[32]={
+static uint8_t old_ff_y_dc_scale_table[32]={
 //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
     0, 8, 8, 8, 8,10,12,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
 };
-static UINT8 old_ff_c_dc_scale_table[32]={
+static uint8_t old_ff_c_dc_scale_table[32]={
 //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
     0, 8, 8, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22
 };
@@ -1819,7 +1819,7 @@ static UINT8 old_ff_c_dc_scale_table[32]={
 
 #define WMV1_SCANTABLE_COUNT 4
 
-static const UINT8 wmv1_scantable00[64]= {
+static const uint8_t wmv1_scantable00[64]= {
 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11, 
 0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28, 
 0x30, 0x38, 0x29, 0x21, 0x1A, 0x13, 0x0C, 0x05, 
@@ -1829,7 +1829,7 @@ static const UINT8 wmv1_scantable00[64]= {
 0x2C, 0x25, 0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x35, 
 0x3D, 0x3E, 0x36, 0x2E, 0x27, 0x2F, 0x37, 0x3F, 
 };
-static const UINT8 wmv1_scantable01[64]= {
+static const uint8_t wmv1_scantable01[64]= {
 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11, 
 0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28, 
 0x21, 0x30, 0x1A, 0x13, 0x0C, 0x05, 0x06, 0x0D, 
@@ -1839,7 +1839,7 @@ static const UINT8 wmv1_scantable01[64]= {
 0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3C, 0x35, 
 0x3D, 0x2E, 0x27, 0x2F, 0x36, 0x3E, 0x37, 0x3F, 
 };
-static const UINT8 wmv1_scantable02[64]= {
+static const uint8_t wmv1_scantable02[64]= {
 0x00, 0x01, 0x08, 0x02, 0x03, 0x09, 0x10, 0x18, 
 0x11, 0x0A, 0x04, 0x05, 0x0B, 0x12, 0x19, 0x20, 
 0x28, 0x30, 0x21, 0x1A, 0x13, 0x0C, 0x06, 0x07, 
@@ -1849,7 +1849,7 @@ static const UINT8 wmv1_scantable02[64]= {
 0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3B, 0x3C, 0x35, 
 0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, 
 };
-static const UINT8 wmv1_scantable03[64]= {
+static const uint8_t wmv1_scantable03[64]= {
 0x00, 0x08, 0x10, 0x01, 0x18, 0x20, 0x28, 0x09, 
 0x02, 0x03, 0x0A, 0x11, 0x19, 0x30, 0x38, 0x29, 
 0x21, 0x1A, 0x12, 0x0B, 0x04, 0x05, 0x0C, 0x13, 
@@ -1860,14 +1860,141 @@ static const UINT8 wmv1_scantable03[64]= {
 0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, 
 };
 
-static const UINT8 *wmv1_scantable[WMV1_SCANTABLE_COUNT+1]={
+static const uint8_t *wmv1_scantable[WMV1_SCANTABLE_COUNT+1]={
     wmv1_scantable00,
     wmv1_scantable01,
     wmv1_scantable02,
     wmv1_scantable03,
 };
 
-static UINT8 table_inter_intra[4][2]={
+static const uint8_t table_inter_intra[4][2]={
     {0,1},{2,2},{6,3},{7,3}
 };
 
+#define WMV2_INTER_CBP_TABLE_COUNT 4
+
+static const uint32_t table_mb_non_intra2[128][2] = {
+{0x0000A7, 14}, {0x01B2B8, 18}, {0x01B28E, 18}, {0x036575, 19}, 
+{0x006CAC, 16}, {0x000A69, 18}, {0x002934, 20}, {0x00526B, 21}, 
+{0x006CA1, 16}, {0x01B2B9, 18}, {0x0029AD, 20}, {0x029353, 24}, 
+{0x006CA7, 16}, {0x006CAB, 16}, {0x01B2BB, 18}, {0x00029B, 16}, 
+{0x00D944, 17}, {0x000A6A, 18}, {0x0149A8, 23}, {0x03651F, 19}, 
+{0x006CAF, 16}, {0x000A4C, 18}, {0x03651E, 19}, {0x000A48, 18}, 
+{0x00299C, 20}, {0x00299F, 20}, {0x029352, 24}, {0x0029AC, 20}, 
+{0x000296, 16}, {0x00D946, 17}, {0x000A68, 18}, {0x000298, 16}, 
+{0x000527, 17}, {0x00D94D, 17}, {0x0014D7, 19}, {0x036574, 19}, 
+{0x000A5C, 18}, {0x01B299, 18}, {0x00299D, 20}, {0x00299E, 20}, 
+{0x000525, 17}, {0x000A66, 18}, {0x00A4D5, 22}, {0x00149B, 19}, 
+{0x000295, 16}, {0x006CAD, 16}, {0x000A49, 18}, {0x000521, 17}, 
+{0x006CAA, 16}, {0x00D945, 17}, {0x01B298, 18}, {0x00052F, 17}, 
+{0x003654, 15}, {0x006CA0, 16}, {0x000532, 17}, {0x000291, 16}, 
+{0x003652, 15}, {0x000520, 17}, {0x000A5D, 18}, {0x000294, 16}, 
+{0x00009B, 11}, {0x0006E2, 12}, {0x000028, 12}, {0x0001B0, 10}, 
+{0x000001,  3}, {0x000010,  8}, {0x00002F,  6}, {0x00004C, 10}, 
+{0x00000D,  4}, {0x000000, 10}, {0x000006,  9}, {0x000134, 12}, 
+{0x00000C,  4}, {0x000007, 10}, {0x000007,  9}, {0x0006E1, 12}, 
+{0x00000E,  5}, {0x0000DA,  9}, {0x000022,  9}, {0x000364, 11}, 
+{0x00000F,  4}, {0x000006, 10}, {0x00000F,  9}, {0x000135, 12}, 
+{0x000014,  5}, {0x0000DD,  9}, {0x000004,  9}, {0x000015, 11}, 
+{0x00001A,  6}, {0x0001B3, 10}, {0x000005, 10}, {0x0006E3, 12}, 
+{0x00000C,  5}, {0x0000B9,  8}, {0x000004,  8}, {0x0000DB,  9}, 
+{0x00000E,  4}, {0x00000B, 10}, {0x000023,  9}, {0x0006CB, 12}, 
+{0x000005,  6}, {0x0001B1, 10}, {0x000001, 10}, {0x0006E0, 12}, 
+{0x000011,  5}, {0x0000DF,  9}, {0x00000E,  9}, {0x000373, 11}, 
+{0x000003,  5}, {0x0000B8,  8}, {0x000006,  8}, {0x000175,  9}, 
+{0x000015,  5}, {0x000174,  9}, {0x000027,  9}, {0x000372, 11}, 
+{0x000010,  5}, {0x0000BB,  8}, {0x000005,  8}, {0x0000DE,  9}, 
+{0x00000F,  5}, {0x000001,  9}, {0x000012,  8}, {0x000004, 10}, 
+{0x000002,  3}, {0x000016,  5}, {0x000009,  4}, {0x000001,  5}, 
+};
+
+static const uint32_t table_mb_non_intra3[128][2] = {
+{0x0002A1, 10}, {0x005740, 15}, {0x01A0BF, 18}, {0x015D19, 17}, 
+{0x001514, 13}, {0x00461E, 15}, {0x015176, 17}, {0x015177, 17}, 
+{0x0011AD, 13}, {0x00682E, 16}, {0x0682F9, 20}, {0x03417D, 19}, 
+{0x001A36, 14}, {0x002A2D, 14}, {0x00D05E, 17}, {0x006824, 16}, 
+{0x001515, 13}, {0x00545C, 15}, {0x0230E9, 18}, {0x011AFA, 17}, 
+{0x0015D7, 13}, {0x005747, 15}, {0x008D79, 16}, {0x006825, 16}, 
+{0x002BA2, 14}, {0x00A8BA, 16}, {0x0235F6, 18}, {0x015D18, 17}, 
+{0x0011AE, 13}, {0x00346F, 15}, {0x008C3B, 16}, {0x00346E, 15}, 
+{0x000D1A, 13}, {0x00461F, 15}, {0x0682F8, 20}, {0x011875, 17}, 
+{0x002BA1, 14}, {0x008D61, 16}, {0x0235F7, 18}, {0x0230E8, 18}, 
+{0x001513, 13}, {0x008D7B, 16}, {0x011AF4, 17}, {0x011AF5, 17}, 
+{0x001185, 13}, {0x0046BF, 15}, {0x008D60, 16}, {0x008D7C, 16}, 
+{0x001512, 13}, {0x00461C, 15}, {0x00AE8D, 16}, {0x008D78, 16}, 
+{0x000D0E, 13}, {0x003413, 15}, {0x0046B1, 15}, {0x003416, 15}, 
+{0x000AEA, 12}, {0x002A2C, 14}, {0x005741, 15}, {0x002A2F, 14}, 
+{0x000158,  9}, {0x0008D2, 12}, {0x00054C, 11}, {0x000686, 12}, 
+{0x000000,  2}, {0x000069,  8}, {0x00006B,  8}, {0x00068C, 12}, 
+{0x000007,  3}, {0x00015E,  9}, {0x0002A3, 10}, {0x000AE9, 12}, 
+{0x000006,  3}, {0x000231, 10}, {0x0002B8, 10}, {0x001A08, 14}, 
+{0x000010,  5}, {0x0001A9, 10}, {0x000342, 11}, {0x000A88, 12}, 
+{0x000004,  4}, {0x0001A2, 10}, {0x0002A4, 10}, {0x001184, 13}, 
+{0x000012,  5}, {0x000232, 10}, {0x0002B2, 10}, {0x000680, 12}, 
+{0x00001B,  6}, {0x00046A, 11}, {0x00068E, 12}, {0x002359, 14}, 
+{0x000016,  5}, {0x00015F,  9}, {0x0002A0, 10}, {0x00054D, 11}, 
+{0x000005,  4}, {0x000233, 10}, {0x0002B9, 10}, {0x0015D6, 13}, 
+{0x000022,  6}, {0x000468, 11}, {0x000683, 12}, {0x001A0A, 14}, 
+{0x000013,  5}, {0x000236, 10}, {0x0002BB, 10}, {0x001186, 13}, 
+{0x000017,  5}, {0x0001AB, 10}, {0x0002A7, 10}, {0x0008D3, 12}, 
+{0x000014,  5}, {0x000237, 10}, {0x000460, 11}, {0x000D0F, 13}, 
+{0x000019,  6}, {0x0001AA, 10}, {0x0002B3, 10}, {0x000681, 12}, 
+{0x000018,  6}, {0x0001A8, 10}, {0x0002A5, 10}, {0x00068F, 12}, 
+{0x000007,  4}, {0x000055,  7}, {0x000047,  7}, {0x0000AD,  8}, 
+};
+
+static const uint32_t table_mb_non_intra4[128][2] = {
+{0x0000D4,  8}, {0x0021C5, 14}, {0x00F18A, 16}, {0x00D5BC, 16}, 
+{0x000879, 12}, {0x00354D, 14}, {0x010E3F, 17}, {0x010F54, 17}, 
+{0x000866, 12}, {0x00356E, 14}, {0x010F55, 17}, {0x010E3E, 17}, 
+{0x0010CE, 13}, {0x003C84, 14}, {0x00D5BD, 16}, {0x00F18B, 16}, 
+{0x000868, 12}, {0x00438C, 15}, {0x0087AB, 16}, {0x00790B, 15}, 
+{0x000F10, 12}, {0x00433D, 15}, {0x006AD3, 15}, {0x00790A, 15}, 
+{0x001AA7, 13}, {0x0043D4, 15}, {0x00871E, 16}, {0x006ADF, 15}, 
+{0x000D7C, 12}, {0x003C94, 14}, {0x00438D, 15}, {0x006AD2, 15}, 
+{0x0006BC, 11}, {0x0021E9, 14}, {0x006ADA, 15}, {0x006A99, 15}, 
+{0x0010F7, 13}, {0x004389, 15}, {0x006ADB, 15}, {0x0078C4, 15}, 
+{0x000D56, 12}, {0x0035F7, 14}, {0x00438E, 15}, {0x006A98, 15}, 
+{0x000D52, 12}, {0x003C95, 14}, {0x004388, 15}, {0x00433C, 15}, 
+{0x000D54, 12}, {0x001E4B, 13}, {0x003C63, 14}, {0x003C83, 14}, 
+{0x000861, 12}, {0x0021EB, 14}, {0x00356C, 14}, {0x0035F6, 14}, 
+{0x000863, 12}, {0x00219F, 14}, {0x003568, 14}, {0x003C82, 14}, 
+{0x0001AE,  9}, {0x0010C0, 13}, {0x000F11, 12}, {0x001AFA, 13}, 
+{0x000000,  1}, {0x0000F0,  8}, {0x0001AD,  9}, {0x0010C1, 13}, 
+{0x00000A,  4}, {0x0003C5, 10}, {0x000789, 11}, {0x001AB5, 13}, 
+{0x000009,  4}, {0x000435, 11}, {0x000793, 11}, {0x001E40, 13}, 
+{0x00001D,  5}, {0x0003CB, 10}, {0x000878, 12}, {0x001AAF, 13}, 
+{0x00000B,  4}, {0x0003C7, 10}, {0x000791, 11}, {0x001AAB, 13}, 
+{0x00001F,  5}, {0x000436, 11}, {0x0006BF, 11}, {0x000F19, 12}, 
+{0x00003D,  6}, {0x000D51, 12}, {0x0010C4, 13}, {0x0021E8, 14}, 
+{0x000036,  6}, {0x000437, 11}, {0x0006AF, 11}, {0x0010C5, 13}, 
+{0x00000C,  4}, {0x000432, 11}, {0x000794, 11}, {0x001E30, 13}, 
+{0x000042,  7}, {0x000870, 12}, {0x000F24, 12}, {0x001E43, 13}, 
+{0x000020,  6}, {0x00043E, 11}, {0x000795, 11}, {0x001AAA, 13}, 
+{0x000037,  6}, {0x0006AC, 11}, {0x0006AE, 11}, {0x0010F6, 13}, 
+{0x000034,  6}, {0x00043A, 11}, {0x000D50, 12}, {0x001AAE, 13}, 
+{0x000039,  6}, {0x00043F, 11}, {0x00078D, 11}, {0x0010D2, 13}, 
+{0x000038,  6}, {0x00043B, 11}, {0x0006BD, 11}, {0x0010D3, 13}, 
+{0x000011,  5}, {0x0001AC,  9}, {0x0000F3,  8}, {0x000439, 11}, 
+};
+
+static const uint32_t (*wmv2_inter_table[WMV2_INTER_CBP_TABLE_COUNT])[2]={
+    table_mb_non_intra2,
+    table_mb_non_intra3,
+    table_mb_non_intra4,
+    table_mb_non_intra,
+};
+
+static const uint8_t wmv2_scantableA[64]={
+0x00, 0x01, 0x02, 0x08, 0x03, 0x09, 0x0A, 0x10,
+0x04, 0x0B, 0x11, 0x18, 0x12, 0x0C, 0x05, 0x13,
+0x19, 0x0D, 0x14, 0x1A, 0x1B, 0x06, 0x15, 0x1C,
+0x0E, 0x16, 0x1D, 0x07, 0x1E, 0x0F, 0x17, 0x1F,
+};
+
+static const uint8_t wmv2_scantableB[64]={
+0x00, 0x08, 0x01, 0x10, 0x09, 0x18, 0x11, 0x02,
+0x20, 0x0A, 0x19, 0x28, 0x12, 0x30, 0x21, 0x1A, 
+0x38, 0x29, 0x22, 0x03, 0x31, 0x39, 0x0B, 0x2A, 
+0x13, 0x32, 0x1B, 0x3A, 0x23, 0x2B, 0x33, 0x3B,
+};
diff --git a/src/libffmpeg/libavcodec/ppc/Makefile.am b/src/libffmpeg/libavcodec/ppc/Makefile.am
index d791fe4a8..a623a96f3 100644
--- a/src/libffmpeg/libavcodec/ppc/Makefile.am
+++ b/src/libffmpeg/libavcodec/ppc/Makefile.am
@@ -14,6 +14,7 @@ noinst_LTLIBRARIES = libavcodec_ppc.la
 
 libavcodec_ppc_src =  dsputil_altivec.c \
 		      dsputil_ppc.c \
+		      fft_altivec.c \
 		      idct_altivec.c \
 		      mpegvideo_altivec.c \
 		      mpegvideo_ppc.c
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
index ed34a2d92..5f14ed0eb 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
@@ -343,48 +343,6 @@ int pix_norm1_altivec(uint8_t *pix, int line_size)
     return s;
 }
 
-
-int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
-{
-    int s, i;
-    vector unsigned char *tv, zero;
-    vector unsigned char pix1v, pix2v, t5;
-    vector unsigned int sv;
-    vector signed int sum;
-
-    zero = vec_splat_u8(0);
-    sv = vec_splat_u32(0);
-    s = 0;
-    for (i = 0; i < 16; i++) {
-        /* Read in the potentially unaligned pixels */
-        tv = (vector unsigned char *) pix1;
-        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
-
-        tv = (vector unsigned char *) pix2;
-        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix2));
-
-        /*
-           Since we want to use unsigned chars, we can take advantage
-           of the fact that abs(a-b)^2 = (a-b)^2.
-        */
-        
-        /* Calculate a sum of abs differences vector */
-        t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
-
-        /* Square the values and add them to our sum */
-        sv = vec_msum(t5, t5, sv);
-        
-        pix1 += line_size;
-        pix2 += line_size;
-    }
-    /* Sum up the four partial sums, and put the result into s */
-    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
-    sum = vec_splat(sum, 3);
-    vec_ste(sum, 0, &s);
-    return s;
-}
-
-
 int pix_sum_altivec(UINT8 * pix, int line_size)
 {
 
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
index 94fe3a023..d4d259d9e 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
@@ -23,7 +23,6 @@ extern int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 extern int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
 extern int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
 extern int pix_norm1_altivec(uint8_t *pix, int line_size);
-extern int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
 extern int pix_sum_altivec(UINT8 * pix, int line_size);
 extern void diff_pixels_altivec(DCTELEM* block, const UINT8* s1, const UINT8* s2, int stride);
 extern void get_pixels_altivec(DCTELEM* block, const UINT8 * pixels, int line_size);
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
index ffe3ce063..733d0c156 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
@@ -42,7 +42,6 @@ void dsputil_init_ppc(DSPContext* c, unsigned mask)
         c->pix_abs16x16 = pix_abs16x16_altivec;
         c->pix_abs8x8 = pix_abs8x8_altivec;
         c->pix_norm1 = pix_norm1_altivec;
-        c->pix_norm = pix_norm_altivec;
         c->pix_sum = pix_sum_altivec;
         c->diff_pixels = diff_pixels_altivec;
         c->get_pixels = get_pixels_altivec;
diff --git a/src/libffmpeg/libavcodec/ppc/fft_altivec.c b/src/libffmpeg/libavcodec/ppc/fft_altivec.c
new file mode 100644
index 000000000..1a926b77c
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/fft_altivec.c
@@ -0,0 +1,166 @@
+/*
+ * FFT/IFFT transforms
+ * AltiVec-enabled
+ * Copyright (c) 2002 Romain Dolbeau <romain@dolbeau.org>
+ * Based on code Copyright (c) 2002 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "../dsputil.h"
+
+#include "dsputil_altivec.h"
+
+// used to build register permutation vectors (vcprm)
+// the 's' prefix marks words taken from the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
+
+// vcprmle is used to keep the same indices as in the SSE version.
+// It is the same as vcprm, with the order of the indices reversed
+// ('le' stands for Little Endian).
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
+
+/**
+ * Do a complex FFT with the parameters defined in fft_init(). The
+ * input data must be permuted before with s->revtab table. No
+ * 1.0/sqrt(n) normalization is done.
+ * AltiVec-enabled
+ * This code assumes that the 'z' pointer is 16-byte aligned.
+ * It also assumes every FFTComplex is an 8-byte-aligned pair of floats.
+ * The code is exactly the same as the SSE version, except
+ * that successive MUL + ADD/SUB operations have been merged into
+ * fused multiply-adds ('vec_madd' in AltiVec).
+ *
+ * To test this code you can use fft-test in libavcodec; use
+ * the following command line in libavcodec to compile (Mac OS X):
+ * #####
+ * gcc -I. -Ippc -no-cpp-precomp -pipe -O3 -fomit-frame-pointer -mdynamic-no-pic -Wall
+ *     -faltivec -DARCH_POWERPC -DHAVE_ALTIVEC -DCONFIG_DARWIN fft-test.c fft.c
+ *     ppc/fft_altivec.c ppc/dsputil_altivec.c mdct.c -DHAVE_LRINTF -o fft-test
+ * #####
+ */
+void fft_calc_altivec(FFTContext *s, FFTComplex *z)
+{
+    register const vector float vczero = (vector float)( 0., 0., 0., 0.);
+    
+    int ln = s->nbits;
+    int	j, np, np2;
+    int	nblocks, nloops;
+    register FFTComplex *p, *q;
+    FFTComplex *cptr, *cptr1;
+    int k;
+
+    np = 1 << ln;
+
+    {
+        vector float *r, a, b, a1, c1, c2;
+
+        r = (vector float *)&z[0];
+
+        c1 = vcii(p,p,n,n);
+        
+        if (s->inverse)
+            {
+                c2 = vcii(p,p,n,p);
+            }
+        else
+            {
+                c2 = vcii(p,p,p,n);
+            }
+        
+        j = (np >> 2);
+        do {
+            a = vec_ld(0, r);
+            a1 = vec_ld(sizeof(vector float), r);
+            
+            b = vec_perm(a,a,vcprmle(1,0,3,2));
+            a = vec_madd(a,c1,b);
+            /* do the pass 0 butterfly */
+            
+            b = vec_perm(a1,a1,vcprmle(1,0,3,2));
+            b = vec_madd(a1,c1,b);
+            /* do the pass 0 butterfly */
+            
+            /* multiply third by -i */
+            b = vec_perm(b,b,vcprmle(2,3,1,0));
+            
+            /* do the pass 1 butterfly */
+            vec_st(vec_madd(b,c2,a), 0, r);
+            vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
+            
+            r += 2;
+        } while (--j != 0);
+    }
+    /* pass 2 .. ln-1 */
+
+    nblocks = np >> 3;
+    nloops = 1 << 2;
+    np2 = np >> 1;
+
+    cptr1 = s->exptab1;
+    do {
+        p = z;
+        q = z + nloops;
+        j = nblocks;
+        do {
+            cptr = cptr1;
+            k = nloops >> 1;
+            do {
+                vector float a,b,c,t1;
+
+                a = vec_ld(0, (float*)p);
+                b = vec_ld(0, (float*)q);
+                
+                /* complex mul */
+                c = vec_ld(0, (float*)cptr);
+                /*  cre*re cim*re */
+                t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
+                c = vec_ld(sizeof(vector float), (float*)cptr);
+                /*  -cim*im cre*im */
+                b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
+                
+                /* butterfly */
+                vec_st(vec_add(a,b), 0, (float*)p);
+                vec_st(vec_sub(a,b), 0, (float*)q);
+                
+                p += 2;
+                q += 2;
+                cptr += 4;
+            } while (--k);
+            
+            p += nloops;
+            q += nloops;
+        } while (--j);
+        cptr1 += nloops * 2;
+        nblocks = nblocks >> 1;
+        nloops = nloops << 1;
+    } while (nblocks != 0);
+}
+
diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
index 18888c8f4..94d608b63 100644
--- a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
@@ -16,10 +16,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
  
-#include <time.h>
-#include "../../config.h"
 #include "../dsputil.h"
 #include "../mpegvideo.h"
+#include <time.h>
 
 #ifdef HAVE_ALTIVEC
 #include "dsputil_altivec.h"
diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c
index 63c45c82a..bda408dfe 100644
--- a/src/libffmpeg/libavcodec/ratecontrol.c
+++ b/src/libffmpeg/libavcodec/ratecontrol.c
@@ -17,9 +17,6 @@
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-#include <math.h>
-#include <alloca.h>
-#include "common.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
@@ -27,10 +24,6 @@
 #undef NDEBUG // allways check asserts, the speed effect is far too small to disable them
 #include <assert.h>
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
 #ifndef M_E
 #define M_E 2.718281828
 #endif
@@ -230,8 +223,33 @@ static double get_qscale(MpegEncContext *s, RateControlEntry *rce, double rate_f
     const double mb_num= s->mb_num;  
     int i;
 
-    double const_values[20];
-
+    double const_values[]={
+        M_PI,
+        M_E,
+        rce->i_tex_bits*rce->qscale,
+        rce->p_tex_bits*rce->qscale,
+        (rce->i_tex_bits + rce->p_tex_bits)*(double)rce->qscale,
+        rce->mv_bits/mb_num,
+        rce->pict_type == B_TYPE ? (rce->f_code + rce->b_code)*0.5 : rce->f_code,
+        rce->i_count/mb_num,
+        rce->mc_mb_var_sum/mb_num,
+        rce->mb_var_sum/mb_num,
+        rce->pict_type == I_TYPE,
+        rce->pict_type == P_TYPE,
+        rce->pict_type == B_TYPE,
+        rcc->qscale_sum[pict_type] / (double)rcc->frame_count[pict_type],
+        s->qcompress,
+/*        rcc->last_qscale_for[I_TYPE],
+        rcc->last_qscale_for[P_TYPE],
+        rcc->last_qscale_for[B_TYPE],
+        rcc->next_non_b_qscale,*/
+        rcc->i_cplx_sum[I_TYPE] / (double)rcc->frame_count[I_TYPE],
+        rcc->i_cplx_sum[P_TYPE] / (double)rcc->frame_count[P_TYPE],
+        rcc->p_cplx_sum[P_TYPE] / (double)rcc->frame_count[P_TYPE],
+        rcc->p_cplx_sum[B_TYPE] / (double)rcc->frame_count[B_TYPE],
+        (rcc->i_cplx_sum[pict_type] + rcc->p_cplx_sum[pict_type]) / (double)rcc->frame_count[pict_type],
+        0
+    };
     char *const_names[]={
         "PI",
         "E",
@@ -270,32 +288,6 @@ static double get_qscale(MpegEncContext *s, RateControlEntry *rce, double rate_f
         NULL
     };
 
-    const_values[0] = M_PI;
-    const_values[1] = M_E;
-    const_values[2] = rce->i_tex_bits*rce->qscale;
-    const_values[3] = rce->p_tex_bits*rce->qscale;
-    const_values[4] = (rce->i_tex_bits + rce->p_tex_bits)*(double)rce->qscale;
-    const_values[5] = rce->mv_bits/mb_num;
-    const_values[6] = rce->pict_type == B_TYPE ? (rce->f_code + rce->b_code)*0.5 : rce->f_code;
-    const_values[7] = rce->i_count/mb_num;
-    const_values[8] = rce->mc_mb_var_sum/mb_num;
-    const_values[9] = rce->mb_var_sum/mb_num;
-    const_values[10] = rce->pict_type == I_TYPE;
-    const_values[11] = rce->pict_type == P_TYPE;
-    const_values[12] = rce->pict_type == B_TYPE;
-    const_values[13] = rcc->qscale_sum[pict_type] / (double)rcc->frame_count[pict_type];
-    const_values[14] = s->qcompress;
-    /*const_values[] = rcc->last_qscale_for[I_TYPE];
-    const_values[] = rcc->last_qscale_for[P_TYPE];
-    const_values[] = rcc->last_qscale_for[B_TYPE];
-    const_values[] = rcc->next_non_b_qscale;*/
-    const_values[15] = rcc->i_cplx_sum[I_TYPE] / (double)rcc->frame_count[I_TYPE];
-    const_values[16] = rcc->i_cplx_sum[P_TYPE] / (double)rcc->frame_count[P_TYPE];
-    const_values[17] = rcc->p_cplx_sum[P_TYPE] / (double)rcc->frame_count[P_TYPE];
-    const_values[18] = rcc->p_cplx_sum[B_TYPE] / (double)rcc->frame_count[B_TYPE];
-    const_values[19] = (rcc->i_cplx_sum[pict_type] + rcc->p_cplx_sum[pict_type]) / (double)rcc->frame_count[pict_type];
-    const_values[20] = 0;
-
     bits= ff_eval(s->avctx->rc_eq, const_values, const_names, func1, func1_names, NULL, NULL, rce);
     
     rcc->pass1_rc_eq_output_sum+= bits;
@@ -332,7 +324,7 @@ static double get_diff_limited_q(MpegEncContext *s, RateControlEntry *rce, doubl
     const int pict_type= rce->new_pict_type;
     const double last_p_q    = rcc->last_qscale_for[P_TYPE];
     const double last_non_b_q= rcc->last_qscale_for[rcc->last_non_b_pict_type];
-
+    
     if     (pict_type==I_TYPE && (a->i_quant_factor>0.0 || rcc->last_non_b_pict_type==P_TYPE))
         q= last_p_q    *ABS(a->i_quant_factor) + a->i_quant_offset;
     else if(pict_type==B_TYPE && a->b_quant_factor>0.0)
@@ -341,6 +333,7 @@ static double get_diff_limited_q(MpegEncContext *s, RateControlEntry *rce, doubl
     /* last qscale / qdiff stuff */
     if(rcc->last_non_b_pict_type==pict_type || pict_type!=I_TYPE){
         double last_q= rcc->last_qscale_for[pict_type];
+
         if     (q > last_q + a->max_qdiff) q= last_q + a->max_qdiff;
         else if(q < last_q - a->max_qdiff) q= last_q - a->max_qdiff;
     }
@@ -447,11 +440,13 @@ static double predict_size(Predictor *p, double q, double var)
      return p->coeff*var / (q*p->count);
 }
 
+/*
 static double predict_qp(Predictor *p, double size, double var)
 {
 //printf("coeff:%f, count:%f, var:%f, size:%f//\n", p->coeff, p->count, var, size);
      return p->coeff*var / (size*p->count);
 }
+*/
 
 static void update_predictor(Predictor *p, double q, double var, double size)
 {
@@ -473,15 +468,12 @@ static void adaptive_quantization(MpegEncContext *s, double q){
     const float p_masking = s->avctx->p_masking;
     float bits_sum= 0.0;
     float cplx_sum= 0.0;
-    float *cplx_tab;
-    float *bits_tab;
-    const int qmin= 2; //s->avctx->mb_qmin;
-    const int qmax= 31; //s->avctx->mb_qmax;
+    float cplx_tab[s->mb_num];
+    float bits_tab[s->mb_num];
+    const int qmin= s->avctx->mb_qmin;
+    const int qmax= s->avctx->mb_qmax;
     Picture * const pic= &s->current_picture;
-
-    cplx_tab = alloca(s->mb_num * sizeof(float));
-    bits_tab = alloca(s->mb_num * sizeof(float));
-
+    
     for(i=0; i<s->mb_num; i++){
         float temp_cplx= sqrt(pic->mc_mb_var[i]);
         float spat_cplx= sqrt(pic->mb_var[i]);
@@ -663,17 +655,16 @@ float ff_rate_estimate_qscale(MpegEncContext *s)
 
         assert(q>0.0);
     }
-//printf("qmin:%d, qmax:%d, q:%f\n", qmin, qmax, q);
-    
+
+    if(s->avctx->debug&FF_DEBUG_RC){
+        printf("%c qp:%d<%2.1f<%d %d want:%d total:%d comp:%f st_q:%2.2f size:%d var:%d/%d br:%d fps:%d\n",
+        ff_get_pict_type_char(pict_type), qmin, q, qmax, picture_number, (int)wanted_bits/1000, (int)s->total_bits/1000,
+        br_compensation, short_term_q, s->frame_bits, pic->mb_var_sum, pic->mc_mb_var_sum, s->bit_rate/1000, (int)fps
+        );
+    }
 
     if     (q<qmin) q=qmin; 
     else if(q>qmax) q=qmax;
-        
-//    printf("%f %d %d %d\n", q, picture_number, (int)wanted_bits, (int)s->total_bits);
-       
-//printf("diff:%d comp:%f st_q:%f last_size:%d type:%d\n", (int)diff, br_compensation, 
-//       short_term_q, s->frame_bits, pict_type);
-//printf("%d %d\n", s->bit_rate, (int)fps);
 
     if(s->adaptive_quant)
         adaptive_quantization(s, q);
@@ -710,7 +701,7 @@ static int init_pass2(MpegEncContext *s)
     uint64_t all_available_bits= (uint64_t)(s->bit_rate*(double)rcc->num_entries/fps);
     double rate_factor=0;
     double step;
-    int last_i_frame=-10000000;
+    //int last_i_frame=-10000000;
     const int filter_size= (int)(s->qblur*4) | 1;  
     double expected_bits;
     double *qscale, *blured_qscale;
@@ -719,38 +710,7 @@ static int init_pass2(MpegEncContext *s)
     for(i=0; i<rcc->num_entries; i++){
         RateControlEntry *rce= &rcc->entry[i];
         
-        if(s->b_frame_strategy==0 || s->max_b_frames==0){
-            rce->new_pict_type= rce->pict_type;
-        }else{
-            int j;
-            int next_non_b_type=P_TYPE;
-
-            switch(rce->pict_type){
-            case I_TYPE:
-                if(i-last_i_frame>s->gop_size/2){ //FIXME this is not optimal
-                    rce->new_pict_type= I_TYPE;
-                    last_i_frame= i;
-                }else{
-                    rce->new_pict_type= P_TYPE; // will be caught by the scene detection anyway
-                }
-                break;
-            case P_TYPE:
-                rce->new_pict_type= P_TYPE;
-                break;
-            case B_TYPE:
-                for(j=i+1; j<i+s->max_b_frames+2 && j<rcc->num_entries; j++){
-                    if(rcc->entry[j].pict_type != B_TYPE){
-                        next_non_b_type= rcc->entry[j].pict_type;
-                        break;
-                    }
-                }
-                if(next_non_b_type==I_TYPE)
-                    rce->new_pict_type= P_TYPE;
-                else
-                    rce->new_pict_type= B_TYPE;
-                break;
-            }
-        }
+        rce->new_pict_type= rce->pict_type;
         rcc->i_cplx_sum [rce->pict_type] += rce->i_tex_bits*rce->qscale;
         rcc->p_cplx_sum [rce->pict_type] += rce->p_tex_bits*rce->qscale;
         rcc->mv_bits_sum[rce->pict_type] += rce->mv_bits;
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 8039cdb1e..4907c2347 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -472,7 +472,7 @@ static int rv10_decode_frame(AVCodecContext *avctx,
 {
     MpegEncContext *s = avctx->priv_data;
     int i;
-    AVVideoFrame *pict = data; 
+    AVFrame *pict = data; 
 
 #ifdef DEBUG
     printf("*****frame %d size=%d\n", avctx->frame_number, buf_size);
@@ -505,9 +505,9 @@ static int rv10_decode_frame(AVCodecContext *avctx,
     if(s->mb_y>=s->mb_height){
         MPV_frame_end(s);
         
-        *pict= *(AVVideoFrame*)&s->current_picture;
+        *pict= *(AVFrame*)&s->current_picture;
     
-        *data_size = sizeof(AVVideoFrame);
+        *data_size = sizeof(AVFrame);
     }else{
         *data_size = 0;
     }
diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c
index ad27ac594..8c9ce7b93 100644
--- a/src/libffmpeg/libavcodec/simple_idct.c
+++ b/src/libffmpeg/libavcodec/simple_idct.c
@@ -473,3 +473,93 @@ void simple_idct248_put(UINT8 *dest, int line_size, INT16 *block)
         idct4col(dest + line_size + i, 2 * line_size, block + 8 + i);
     }
 }
+
+/* 8x4 & 4x8 WMV2 IDCT */
+#undef CN_SHIFT
+#undef C_SHIFT
+#undef C_FIX
+#undef C1
+#undef C2
+#define CN_SHIFT 12
+#define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
+#define C1 C_FIX(0.6532814824)
+#define C2 C_FIX(0.2705980501)
+#define C3 C_FIX(0.5)
+#define C_SHIFT (4+1+12)
+static inline void idct4col_add(UINT8 *dest, int line_size, const INT16 *col)
+{
+    int c0, c1, c2, c3, a0, a1, a2, a3;
+    const UINT8 *cm = cropTbl + MAX_NEG_CROP;
+
+    a0 = col[8*0];
+    a1 = col[8*1];
+    a2 = col[8*2];
+    a3 = col[8*3];
+    c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1));
+    c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1));
+    c1 = a1 * C1 + a3 * C2;
+    c3 = a1 * C2 - a3 * C1;
+    dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)];
+    dest += line_size;
+    dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)];
+    dest += line_size;
+    dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)];
+    dest += line_size;
+    dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)];
+}
+
+#define RN_SHIFT 15
+#define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
+#define R1 R_FIX(0.6532814824)
+#define R2 R_FIX(0.2705980501)
+#define R3 R_FIX(0.5)
+#define R_SHIFT 11
+static inline void idct4row(INT16 *row)
+{
+    int c0, c1, c2, c3, a0, a1, a2, a3;
+    const UINT8 *cm = cropTbl + MAX_NEG_CROP;
+
+    a0 = row[0];
+    a1 = row[1];
+    a2 = row[2];
+    a3 = row[3];
+    c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1));
+    c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1));
+    c1 = a1 * R1 + a3 * R2;
+    c3 = a1 * R2 - a3 * R1;
+    row[0]= (c0 + c1) >> R_SHIFT;
+    row[1]= (c2 + c3) >> R_SHIFT;
+    row[2]= (c2 - c3) >> R_SHIFT;
+    row[3]= (c0 - c1) >> R_SHIFT;
+}
+
+void simple_idct84_add(UINT8 *dest, int line_size, INT16 *block)
+{
+    int i;
+
+    /* IDCT8 on each line */
+    for(i=0; i<4; i++) {
+        idctRowCondDC(block + i*8);
+    }
+
+    /* IDCT4 and store */
+    for(i=0;i<8;i++) {
+        idct4col_add(dest + i, line_size, block + i);
+    }
+}
+
+void simple_idct48_add(UINT8 *dest, int line_size, INT16 *block)
+{
+    int i;
+
+    /* IDCT4 on each line */
+    for(i=0; i<8; i++) {
+        idct4row(block + i*8);
+    }
+
+    /* IDCT8 and store */
+    for(i=0; i<4; i++){
+        idctSparseColAdd(dest + i, line_size, block + i);
+    }
+}
+
diff --git a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h
index 6c6b4f011..428c6072c 100644
--- a/src/libffmpeg/libavcodec/simple_idct.h
+++ b/src/libffmpeg/libavcodec/simple_idct.h
@@ -26,3 +26,6 @@ void ff_simple_idct_put_mmx(UINT8 *dest, int line_size, INT16 *block);
 void simple_idct(short *block);
 
 void simple_idct248_put(UINT8 *dest, int line_size, INT16 *block);
+
+void simple_idct84_add(UINT8 *dest, int line_size, INT16 *block);
+void simple_idct48_add(UINT8 *dest, int line_size, INT16 *block);
diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c
index 6abccf403..77035f1f9 100644
--- a/src/libffmpeg/libavcodec/svq1.c
+++ b/src/libffmpeg/libavcodec/svq1.c
@@ -26,7 +26,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <alloca.h>
 
 #include "common.h"
 #include "avcodec.h"
@@ -1064,7 +1063,7 @@ static int svq1_decode_frame(AVCodecContext *avctx,
   MpegEncContext *s=avctx->priv_data;
   uint8_t      *current, *previous;
   int		result, i, x, y, width, height;
-  AVVideoFrame *pict = data; 
+  AVFrame *pict = data; 
 
   /* initialize bit buffer */
   init_get_bits(&s->gb,buf,buf_size);
@@ -1137,9 +1136,7 @@ static int svq1_decode_frame(AVCodecContext *avctx,
 	current += 16*linesize;
       }
     } else {
-      svq1_pmv_t *pmv;
-
-      pmv = alloca((width/8+3) * sizeof(svq1_pmv_t));
+      svq1_pmv_t pmv[width/8+3];
       /* delta frame */
       memset (pmv, 0, ((width / 8) + 3) * sizeof(svq1_pmv_t));
 
@@ -1164,12 +1161,12 @@ static int svq1_decode_frame(AVCodecContext *avctx,
     }
   }
   
-  *pict = *(AVVideoFrame*)&s->current_picture;
+  *pict = *(AVFrame*)&s->current_picture;
 
 
   MPV_frame_end(s);
   
-  *data_size=sizeof(AVVideoFrame);
+  *data_size=sizeof(AVFrame);
   return buf_size;
 }
 
@@ -1182,7 +1179,7 @@ static int svq1_decode_init(AVCodecContext *avctx)
     s->height = (avctx->height+3)&~3;
     s->codec_id= avctx->codec->id;
     avctx->pix_fmt = PIX_FMT_YUV410P;
-    avctx->has_b_frames= s->has_b_frames=1; // not true, but DP frames and these behave like unidirectional b frames
+    avctx->has_b_frames= 1; // not true, but DP frames and these behave like unidirectional b frames
     s->flags= avctx->flags;
     if (MPV_common_init(s) < 0) return -1;
     return 0;
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index bc7da83ef..af6ba986b 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -120,11 +120,14 @@ typedef struct DefaultPicOpaque{
     uint8_t *data[4];
 }DefaultPicOpaque;
 
-int avcodec_default_get_buffer(AVCodecContext *s, AVVideoFrame *pic){
+int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
     int i;
     const int width = s->width;
     const int height= s->height;
     DefaultPicOpaque *opaque;
+    
+    assert(pic->data[0]==NULL);
+    /* assert(pic->type==0 || pic->type==FF_TYPE_INTERNAL); */
 
     if(pic->opaque){
         opaque= (DefaultPicOpaque *)pic->opaque;
@@ -186,21 +189,24 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVVideoFrame *pic){
             memset(pic->base[i], 128, pic->linesize[i]*h>>v_shift);
         
             if(s->flags&CODEC_FLAG_EMU_EDGE)
-                pic->data[i] = pic->base[i];
+                pic->data[i] = pic->base[i] + 16; //FIXME 16
             else
-                pic->data[i] = pic->base[i] + (pic->linesize[i]*EDGE_WIDTH>>v_shift) + (EDGE_WIDTH>>h_shift);
+                pic->data[i] = pic->base[i] + (pic->linesize[i]*EDGE_WIDTH>>v_shift) + (EDGE_WIDTH>>h_shift) + 16; //FIXME 16
             
             opaque->data[i]= pic->data[i];
         }
         pic->age= 256*256*256*64;
+        pic->type= FF_BUFFER_TYPE_INTERNAL;
     }
 
     return 0;
 }
 
-void avcodec_default_release_buffer(AVCodecContext *s, AVVideoFrame *pic){
+void avcodec_default_release_buffer(AVCodecContext *s, AVFrame *pic){
     int i;
     
+    assert(pic->type==FF_BUFFER_TYPE_INTERNAL);
+    
     for(i=0; i<3; i++)
         pic->data[i]=NULL;
 //printf("R%X\n", pic->opaque);
@@ -211,6 +217,8 @@ void avcodec_get_context_defaults(AVCodecContext *s){
     s->bit_rate_tolerance= s->bit_rate*10;
     s->qmin= 2;
     s->qmax= 31;
+    s->mb_qmin= 2;
+    s->mb_qmax= 31;
     s->rc_eq= "tex^qComp";
     s->qcompress= 0.5;
     s->max_qdiff= 3;
@@ -226,6 +234,7 @@ void avcodec_get_context_defaults(AVCodecContext *s){
     s->me_method= ME_EPZS;
     s->get_buffer= avcodec_default_get_buffer;
     s->release_buffer= avcodec_default_release_buffer;
+    s->me_subpel_quality=8;
 }
 
 /**
@@ -243,11 +252,11 @@ AVCodecContext *avcodec_alloc_context(void){
 }
 
 /**
- * allocates a AVPicture and set it to defaults.
+ * allocates an AVFrame and sets it to defaults.
  * this can be deallocated by simply calling free() 
  */
-AVVideoFrame *avcodec_alloc_picture(void){
-    AVVideoFrame *pic= av_mallocz(sizeof(AVVideoFrame));
+AVFrame *avcodec_alloc_frame(void){
+    AVFrame *pic= av_mallocz(sizeof(AVFrame));
     
     return pic;
 }
@@ -257,6 +266,7 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec)
     int ret;
 
     avctx->codec = codec;
+    avctx->codec_id = codec->id;
     avctx->frame_number = 0;
     if (codec->priv_data_size > 0) {
         avctx->priv_data = av_mallocz(codec->priv_data_size);
@@ -284,7 +294,7 @@ int avcodec_encode_audio(AVCodecContext *avctx, UINT8 *buf, int buf_size,
 }
 
 int avcodec_encode_video(AVCodecContext *avctx, UINT8 *buf, int buf_size, 
-                         const AVVideoFrame *pict)
+                         const AVFrame *pict)
 {
     int ret;
 
@@ -299,7 +309,7 @@ int avcodec_encode_video(AVCodecContext *avctx, UINT8 *buf, int buf_size,
 /* decode a frame. return -1 if error, otherwise return the number of
    bytes used. If no frame could be decompressed, *got_picture_ptr is
    zero. Otherwise, it is non zero */
-int avcodec_decode_video(AVCodecContext *avctx, AVVideoFrame *picture, 
+int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture, 
                          int *got_picture_ptr,
                          UINT8 *buf, int buf_size)
 {
@@ -642,14 +652,40 @@ void avcodec_init(void)
     //dsputil_init();
 }
 
-/* this should be called after seeking and before trying to decode the next frame */
+/* this can be called after seeking and before trying to decode the next keyframe */
 void avcodec_flush_buffers(AVCodecContext *avctx)
 {
+    int i;
     MpegEncContext *s = avctx->priv_data;
-    s->num_available_buffers=0;
+    
+    switch(avctx->codec_id){ /* only the codecs listed below have their priv_data used as MpegEncContext here */
+    case CODEC_ID_MPEG1VIDEO:
+    case CODEC_ID_H263:
+    case CODEC_ID_RV10:
+    case CODEC_ID_MJPEG:
+    case CODEC_ID_MJPEGB:
+    case CODEC_ID_MPEG4:
+    case CODEC_ID_MSMPEG4V1:
+    case CODEC_ID_MSMPEG4V2:
+    case CODEC_ID_MSMPEG4V3:
+    case CODEC_ID_WMV1:
+    case CODEC_ID_WMV2:
+    case CODEC_ID_H263P:
+    case CODEC_ID_H263I:
+    case CODEC_ID_SVQ1:
+        for(i=0; i<MAX_PICTURE_COUNT; i++){ /* hand back every allocated internal/user picture via release_buffer() */
+           if(s->picture[i].data[0] && (   s->picture[i].type == FF_BUFFER_TYPE_INTERNAL
+                                        || s->picture[i].type == FF_BUFFER_TYPE_USER))
+            avctx->release_buffer(avctx, (AVFrame*)&s->picture[i]);
+	}
+	s->last_picture.data[0] = s->next_picture.data[0] = NULL; /* drop the reference pictures as well */
+        break;
+    default:
+        //FIXME other codecs: nothing is flushed
+        break;
+    }
 }
 
-
 static int raw_encode_init(AVCodecContext *s)
 {
     return 0;
diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c
index 7505a9be3..a6fa2f8b2 100644
--- a/src/libffmpeg/libavcodec/wmadec.c
+++ b/src/libffmpeg/libavcodec/wmadec.c
@@ -87,15 +87,15 @@ typedef struct WMADecodeContext {
     int block_pos; /* current position in frame */
     uint8_t ms_stereo; /* true if mid/side stereo mode */
     uint8_t channel_coded[MAX_CHANNELS]; /* true if channel is coded */
-    float exponents[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    float exponents[MAX_CHANNELS][BLOCK_MAX_SIZE] __attribute__((aligned(16)));
     float max_exponent[MAX_CHANNELS];
     int16_t coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    float coefs[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    float coefs[MAX_CHANNELS][BLOCK_MAX_SIZE] __attribute__((aligned(16)));
     MDCTContext mdct_ctx[BLOCK_NB_SIZES];
-    float *windows[BLOCK_NB_SIZES];
-    FFTSample mdct_tmp[BLOCK_MAX_SIZE]; /* temporary storage for imdct */
+    float *windows[BLOCK_NB_SIZES] __attribute__((aligned(16)));
+    FFTSample mdct_tmp[BLOCK_MAX_SIZE] __attribute__((aligned(16))); /* temporary storage for imdct */
     /* output buffer for one frame and the last for IMDCT windowing */
-    float frame_out[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
+    float frame_out[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] __attribute__((aligned(16)));
     /* last frame info */
     uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
     int last_bitoffset;
@@ -1118,7 +1118,7 @@ static int wma_decode_block(WMADecodeContext *s)
     
     for(ch = 0; ch < s->nb_channels; ch++) {
         if (s->channel_coded[ch]) {
-            FFTSample output[BLOCK_MAX_SIZE * 2];
+            FFTSample output[BLOCK_MAX_SIZE * 2] __attribute__((aligned(16)));
             float *ptr;
             int i, n4, index, n;
 
@@ -1240,7 +1240,7 @@ static int wma_decode_superframe(AVCodecContext *avctx,
             /* add bit_offset bits to last frame */
             if ((s->last_superframe_len + ((bit_offset + 7) >> 3)) > 
                 MAX_CODED_SUPERFRAME_SIZE)
-                return -1;
+                goto fail;
             q = s->last_superframe + s->last_superframe_len;
             len = bit_offset;
             while (len > 0) {
@@ -1259,7 +1259,7 @@ static int wma_decode_superframe(AVCodecContext *avctx,
             /* this frame is stored in the last superframe and in the
                current one */
             if (wma_decode_frame(s, samples) < 0)
-                return -1;
+                goto fail;
             samples += s->nb_channels * s->frame_len;
         }
 
@@ -1273,7 +1273,7 @@ static int wma_decode_superframe(AVCodecContext *avctx,
         s->reset_block_lengths = 1;
         for(i=0;i<nb_frames;i++) {
             if (wma_decode_frame(s, samples) < 0)
-                return -1;
+                goto fail;
             samples += s->nb_channels * s->frame_len;
         }
 
@@ -1283,18 +1283,22 @@ static int wma_decode_superframe(AVCodecContext *avctx,
         pos >>= 3;
         len = buf_size - pos;
         if (len > MAX_CODED_SUPERFRAME_SIZE || len < 0) {
-            return -1;
+            goto fail;
         }
         s->last_superframe_len = len;
         memcpy(s->last_superframe, buf + pos, len);
     } else {
         /* single frame decode */
         if (wma_decode_frame(s, samples) < 0)
-            return -1;
+            goto fail;
         samples += s->nb_channels * s->frame_len;
     }
     *data_size = (int8_t *)samples - (int8_t *)data;
     return s->block_align;
+ fail:
+    /* when error, we reset the bit reservoir */
+    s->last_superframe_len = 0;
+    return -1;
 }
 
 static int wma_decode_end(AVCodecContext *avctx)
diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c
new file mode 100644
index 000000000..d25b7a5f1
--- /dev/null
+++ b/src/libffmpeg/libavcodec/wmv2.c
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2002 The FFmpeg Project.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "simple_idct.h"
+ 
+#define SKIP_TYPE_NONE 0 /* parse_mb_skip(): no macroblock is marked skipped */
+#define SKIP_TYPE_MPEG 1 /* parse_mb_skip(): one skip bit per macroblock */
+#define SKIP_TYPE_ROW  2 /* parse_mb_skip(): one bit per MB row skips the whole row, otherwise per-MB bits follow */
+#define SKIP_TYPE_COL  3 /* parse_mb_skip(): same scheme as ROW, applied per MB column */
+
+
+typedef struct Wmv2Context{ /* WMV2 state; MpegEncContext has to stay the first member, this file casts MpegEncContext* to Wmv2Context* */
+    MpegEncContext s;
+    int j_type_bit;        /* ext header flag: I picture headers carry a j_type bit */
+    int j_type;            /* current picture is "J type"; the decoder rejects such pictures */
+    int flag3;             /* ext header bit; only stored and printed in the visible code */
+    int flag63;            /* not referenced in the visible part of this file */
+    int abt_flag;          /* ext header flag: P picture headers carry ABT information */
+    int abt_type;          /* 0: s->idct_add, 1: two simple_idct84_add halves, 2: two simple_idct48_add halves */
+    int abt_type_table[6]; /* abt_type recorded for each of the 6 blocks of the current macroblock */
+    int per_mb_abt;        /* abt type is signalled per macroblock rather than once per picture */
+    int per_block_abt;     /* abt type is signalled for every coded block of the current macroblock */
+    int mspel_bit;         /* ext header flag: P picture headers carry an mspel bit */
+    int cbp_table_index;   /* selects wmv2_inter_table[] / mb_non_intra_vlc[] */
+    int top_left_mv_flag;  /* ext header flag: the MV predictor may be chosen explicitly (see wmv2_pred_motion) */
+    int per_mb_rl_bit;     /* ext header flag: picture headers carry a per_mb_rl_table bit */
+    int skip_type;         /* SKIP_TYPE_* value read by parse_mb_skip() */
+    int hshift;            /* bit read after the MV when mspel is on; lowest bit of the put_mspel_pixels_tab index */
+    
+    ScanTable abt_scantable[2];         /* scan tables for abt_type 1 and 2, indexed by abt_type-1 */
+    DCTELEM abt_block2[6][64] __align8; /* coefficients of the second sub-block of each ABT coded block */
+}Wmv2Context;
+
+static void wmv2_common_init(Wmv2Context * w){ /* fills w->abt_scantable[0] and [1] from wmv2_scantableA and wmv2_scantableB */
+    MpegEncContext * const s= &w->s;
+        
+    ff_init_scantable(s, &w->abt_scantable[0], wmv2_scantableA);
+    ff_init_scantable(s, &w->abt_scantable[1], wmv2_scantableB);
+}
+
+static int encode_ext_header(Wmv2Context *w){ /* writes the stream-level header into avctx->extradata and stores the chosen flags in w; always returns 0 */
+    MpegEncContext * const s= &w->s;
+    PutBitContext pb;
+    int code;
+        
+    init_put_bits(&pb, s->avctx->extradata, s->avctx->extradata_size, NULL, NULL);
+
+    put_bits(&pb, 5, s->frame_rate / FRAME_RATE_BASE); //yes 29.97 -> 29
+    put_bits(&pb, 11, FFMIN(s->bit_rate/1024, 2047)); /* clamped so it fits the 11 bit field */
+    
+    put_bits(&pb, 1, w->mspel_bit=1); /* each flag is stored in w and written in one go */
+    put_bits(&pb, 1, w->flag3=1);
+    put_bits(&pb, 1, w->abt_flag=1);
+    put_bits(&pb, 1, w->j_type_bit=1);
+    put_bits(&pb, 1, w->top_left_mv_flag=0);
+    put_bits(&pb, 1, w->per_mb_rl_bit=1);
+    put_bits(&pb, 3, code=1); /* 25 bits in total; decode_ext_header() reads the same layout */
+    
+    flush_put_bits(&pb);
+
+    s->slice_height = s->mb_height / code; /* code is 1 here, so slice_height equals mb_height */
+    
+    return 0;
+}
+
+static int wmv2_encode_init(AVCodecContext *avctx){ /* MPV encoder init + ABT scan tables + ext header in extradata; returns 0, or -1 on failure */
+    Wmv2Context * const w= avctx->priv_data;
+    
+    if(MPV_encode_init(avctx) < 0)
+        return -1;
+    
+    wmv2_common_init(w);
+    /* 4 byte ext header (written by encode_ext_header) plus 10 zeroed slack bytes */
+    avctx->extradata= av_mallocz(4 + 10);
+    if(!avctx->extradata){ avctx->extradata_size= 0; return -1; } /* never hand a NULL buffer to init_put_bits(); NOTE(review): MPV state is not torn down on this path */
+    avctx->extradata_size= 4;
+    encode_ext_header(w);
+    return 0;
+}
+
+static int wmv2_encode_end(AVCodecContext *avctx){
+    /* Shuts the MPV encoder down and frees the ext header allocated by wmv2_encode_init(). */
+    int ret= MPV_encode_end(avctx);
+    
+    /* free extradata even when MPV_encode_end() failed, otherwise it leaks */
+    
+    avctx->extradata_size= 0;
+    av_freep(&avctx->extradata);
+    
+    return ret < 0 ? -1 : 0; /* same return values as before: -1 on MPV failure, else 0 */
+}
+
+int ff_wmv2_encode_picture_header(MpegEncContext * s, int picture_number)
+{ /* writes the WMV2 picture header to s->pb and sets the per-picture coding state; picture_number is unused; always returns 0 */
+    Wmv2Context * const w= (Wmv2Context*)s;
+
+    put_bits(&s->pb, 1, s->pict_type - 1);
+    if(s->pict_type == I_TYPE){
+        put_bits(&s->pb, 7, 0); /* 7 bit field that the decoder reads and only prints */
+    }
+    put_bits(&s->pb, 5, s->qscale);
+
+    s->dc_table_index = 1;
+    s->mv_table_index = 1; /* only if P frame */
+//    s->use_skip_mb_code = 1; /* only if P frame */
+    s->per_mb_rl_table = 0;
+    s->mspel= 0;
+    w->per_mb_abt=0;
+    w->abt_type=0;
+    w->j_type=0;
+
+    if (s->pict_type == I_TYPE) {
+        if(w->j_type_bit) put_bits(&s->pb, 1, w->j_type); /* optional fields are only written when the ext header enabled them */
+        
+        if(w->per_mb_rl_bit) put_bits(&s->pb, 1, s->per_mb_rl_table);
+        
+        if(!s->per_mb_rl_table){
+            code012(&s->pb, s->rl_chroma_table_index);
+            code012(&s->pb, s->rl_table_index);
+        }
+
+        put_bits(&s->pb, 1, s->dc_table_index);
+
+        s->inter_intra_pred= 0;
+        s->no_rounding = 1;
+    }else{
+        int cbp_index;
+
+        put_bits(&s->pb, 2, SKIP_TYPE_NONE); /* the encoder never signals skipped macroblocks in the header */
+        
+        code012(&s->pb, cbp_index=0);
+        if(s->qscale <= 10){ /* same qscale dependent cbp_index -> table mapping as in the decoder */
+            int map[3]= {0,2,1};
+            w->cbp_table_index= map[cbp_index];
+        }else if(s->qscale <= 20){
+            int map[3]= {1,0,2};
+            w->cbp_table_index= map[cbp_index];
+        }else{
+            int map[3]= {2,1,0};
+            w->cbp_table_index= map[cbp_index];
+        }
+
+        if(w->mspel_bit) put_bits(&s->pb, 1, s->mspel);
+    
+        if(w->abt_flag){
+            put_bits(&s->pb, 1, w->per_mb_abt^1); /* the bit is stored inverted */
+            if(!w->per_mb_abt){
+                code012(&s->pb, w->abt_type);
+            }
+        }
+
+        if(w->per_mb_rl_bit) put_bits(&s->pb, 1, s->per_mb_rl_table);
+        
+        if(!s->per_mb_rl_table){
+            code012(&s->pb, s->rl_table_index);
+            s->rl_chroma_table_index = s->rl_table_index;
+        }
+        put_bits(&s->pb, 1, s->dc_table_index);
+        put_bits(&s->pb, 1, s->mv_table_index);
+    
+        s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
+        s->no_rounding ^= 1; /* toggles on every P picture; I pictures set it to 1 */
+    }
+    s->esc3_level_length= 0;
+    s->esc3_run_length= 0;
+
+    return 0;
+}
+
+// Nearly identical to the wmv1 macroblock encoder, but only because the WMV2-specific features are not used here.
+// It is duplicated here in case someone wants to add support for those features.
+void ff_wmv2_encode_mb(MpegEncContext * s, 
+                       DCTELEM block[6][64],
+                       int motion_x, int motion_y)
+{ /* writes one macroblock: cbp code, then MV difference (inter) or AC-pred/inter-intra bits (intra), then the 6 blocks */
+    Wmv2Context * const w= (Wmv2Context*)s;
+    int cbp, coded_cbp, i;
+    int pred_x, pred_y;
+    UINT8 *coded_block;
+
+    handle_slices(s);
+    
+    if (!s->mb_intra) {
+	/* compute cbp */
+        set_stat(ST_INTER_MB);
+	cbp = 0;
+	for (i = 0; i < 6; i++) {
+	    if (s->block_last_index[i] >= 0)
+		cbp |= 1 << (5 - i);
+	}
+        
+        put_bits(&s->pb, /* inter macroblocks use entries 64..127 of the table */
+                 wmv2_inter_table[w->cbp_table_index][cbp + 64][1], 
+                 wmv2_inter_table[w->cbp_table_index][cbp + 64][0]);
+
+        /* motion vector */
+        h263_pred_motion(s, 0, &pred_x, &pred_y);
+        msmpeg4_encode_motion(s, motion_x - pred_x, 
+                              motion_y - pred_y);
+    } else {
+	/* compute cbp */
+	cbp = 0;
+        coded_cbp = 0;
+	for (i = 0; i < 6; i++) {
+            int val, pred;
+            val = (s->block_last_index[i] >= 1); /* note: >= 1 here, >= 0 in the inter branch */
+            cbp |= val << (5 - i);
+            if (i < 4) {
+                /* predict value for close blocks only for luma */
+                pred = coded_block_pred(s, i, &coded_block);
+                *coded_block = val;
+                val = val ^ pred;
+            }
+            coded_cbp |= val << (5 - i);
+	}
+#if 0
+        if (coded_cbp)
+            printf("cbp=%x %x\n", cbp, coded_cbp);
+#endif
+
+        if (s->pict_type == I_TYPE) {
+            set_stat(ST_INTRA_MB);
+            put_bits(&s->pb, /* I pictures code the predicted cbp */
+                     table_mb_intra[coded_cbp][1], table_mb_intra[coded_cbp][0]);
+        } else {
+            put_bits(&s->pb, /* intra MB in a P picture: unpredicted cbp, entries 0..63 */
+                     wmv2_inter_table[w->cbp_table_index][cbp][1], 
+                     wmv2_inter_table[w->cbp_table_index][cbp][0]);
+        }
+        set_stat(ST_INTRA_MB);
+        put_bits(&s->pb, 1, 0);	/* no AC prediction yet */
+        if(s->inter_intra_pred){
+            s->h263_aic_dir=0;
+            put_bits(&s->pb, table_inter_intra[s->h263_aic_dir][1], table_inter_intra[s->h263_aic_dir][0]);
+        }
+    }
+
+    for (i = 0; i < 6; i++) {
+        msmpeg4_encode_block(s, block[i], i);
+    }
+}
+
+static void parse_mb_skip(Wmv2Context * w){ /* reads the 2 bit skip type and fills s->mb_type[] with MB_TYPE_SKIPED or 0 for every macroblock */
+    int mb_x, mb_y;
+    MpegEncContext * const s= &w->s;
+
+    w->skip_type= get_bits(&s->gb, 2); /* 2 bits, so all four SKIP_TYPE_* cases below are exhaustive */
+    switch(w->skip_type){
+    case SKIP_TYPE_NONE: /* nothing coded: clear the whole map */
+        for(mb_y=0; mb_y<s->mb_height; mb_y++){
+            for(mb_x=0; mb_x<s->mb_width; mb_x++){
+                s->mb_type[mb_y*s->mb_width + mb_x]= 0;
+            }
+        }
+        break;
+    case SKIP_TYPE_MPEG: /* one bit per macroblock, in raster order */
+        for(mb_y=0; mb_y<s->mb_height; mb_y++){
+            for(mb_x=0; mb_x<s->mb_width; mb_x++){
+                s->mb_type[mb_y*s->mb_width + mb_x]= get_bits1(&s->gb) ? MB_TYPE_SKIPED : 0;
+            }
+        }
+        break;
+    case SKIP_TYPE_ROW: /* per row: 1 = whole row skipped, 0 = one bit per macroblock follows */
+        for(mb_y=0; mb_y<s->mb_height; mb_y++){
+            if(get_bits1(&s->gb)){
+                for(mb_x=0; mb_x<s->mb_width; mb_x++){
+                    s->mb_type[mb_y*s->mb_width + mb_x]=  MB_TYPE_SKIPED;
+                }
+            }else{
+                for(mb_x=0; mb_x<s->mb_width; mb_x++){
+                    s->mb_type[mb_y*s->mb_width + mb_x]= get_bits1(&s->gb) ? MB_TYPE_SKIPED : 0;
+                }
+            }
+        }
+        break;
+    case SKIP_TYPE_COL: /* per column: 1 = whole column skipped, 0 = one bit per macroblock follows */
+        for(mb_x=0; mb_x<s->mb_width; mb_x++){
+            if(get_bits1(&s->gb)){
+                for(mb_y=0; mb_y<s->mb_height; mb_y++){
+                    s->mb_type[mb_y*s->mb_width + mb_x]=  MB_TYPE_SKIPED;
+                }
+            }else{
+                for(mb_y=0; mb_y<s->mb_height; mb_y++){
+                    s->mb_type[mb_y*s->mb_width + mb_x]= get_bits1(&s->gb) ? MB_TYPE_SKIPED : 0;
+                }
+            }
+        }
+        break;
+    }
+}
+
+static int decode_ext_header(Wmv2Context *w){ /* parses avctx->extradata into w/s; returns -1 if it is shorter than 4 bytes or the slice code is 0, else 0 */
+    MpegEncContext * const s= &w->s;
+    GetBitContext gb;
+    int fps;
+    int code;
+
+    if(s->avctx->extradata_size<4) return -1;
+    
+    init_get_bits(&gb, s->avctx->extradata, s->avctx->extradata_size);
+
+    fps                = get_bits(&gb, 5); /* only used for the debug print below */
+    s->bit_rate        = get_bits(&gb, 11)*1024;
+    w->mspel_bit       = get_bits1(&gb);
+    w->flag3           = get_bits1(&gb);
+    w->abt_flag        = get_bits1(&gb);
+    w->j_type_bit      = get_bits1(&gb);
+    w->top_left_mv_flag= get_bits1(&gb);
+    w->per_mb_rl_bit   = get_bits1(&gb);
+    code               = get_bits(&gb, 3);
+    
+    if(code==0) return -1; /* would divide by zero below */
+            
+    s->slice_height = s->mb_height / code;
+
+    if(s->avctx->debug&FF_DEBUG_PICT_INFO){ /* note: the value printed as "qpbit" is w->mspel_bit */
+        printf("fps:%d, br:%d, qpbit:%d, abt_flag:%d, j_type_bit:%d, tl_mv_flag:%d, mbrl_bit:%d, code:%d, flag3:%d\n", 
+        fps, s->bit_rate, w->mspel_bit, w->abt_flag, w->j_type_bit, w->top_left_mv_flag, w->per_mb_rl_bit, code, w->flag3);
+    }
+    return 0;
+}
+
+int ff_wmv2_decode_picture_header(MpegEncContext * s)
+{
+    Wmv2Context * const w= (Wmv2Context*)s;
+    int code, i;
+
+    /*
+     * Parses one WMV2 picture header from s->gb.
+     * On the first picture (picture_number==0) the stream-level header in
+     * avctx->extradata is parsed first; a malformed one now aborts decoding
+     * instead of continuing with unset flags and an unset slice_height.
+     * Fills pict_type, qscale, the table indices and the ABT/mspel state, and
+     * for P pictures the per-macroblock skip map (parse_mb_skip).
+     * Returns 0 on success, -1 on a bad ext header or an unsupported J-type
+     * picture.
+     */
+    if(s->picture_number==0 && decode_ext_header(w) < 0)
+        return -1;
+
+    s->pict_type = get_bits(&s->gb, 1) + 1;
+    if(s->pict_type == I_TYPE){
+        code = get_bits(&s->gb, 7);
+        if(s->avctx->debug&FF_DEBUG_PICT_INFO) printf("I7:%X/\n", code); /* debug output only on request */
+    }
+    s->qscale = get_bits(&s->gb, 5);
+
+    if (s->pict_type == I_TYPE) {
+        if(w->j_type_bit) w->j_type= get_bits1(&s->gb);
+        else              w->j_type= 0; //FIXME check
+        
+        if(!w->j_type){
+            if(w->per_mb_rl_bit) s->per_mb_rl_table= get_bits1(&s->gb);
+            else                 s->per_mb_rl_table= 0;
+        
+            if(!s->per_mb_rl_table){
+                s->rl_chroma_table_index = decode012(&s->gb);
+                s->rl_table_index = decode012(&s->gb);
+            }
+
+            s->dc_table_index = get_bits1(&s->gb);
+        }
+        s->inter_intra_pred= 0;
+        s->no_rounding = 1;
+        if(s->avctx->debug&FF_DEBUG_PICT_INFO){
+	    printf("qscale:%d rlc:%d rl:%d dc:%d mbrl:%d j_type:%d \n", 
+		s->qscale,
+		s->rl_chroma_table_index,
+		s->rl_table_index, 
+		s->dc_table_index,
+                s->per_mb_rl_table,
+                w->j_type);
+        }
+    }else{
+        int cbp_index;
+        w->j_type=0;
+
+        parse_mb_skip(w);
+        cbp_index= decode012(&s->gb);
+        if(s->qscale <= 10){ /* the cbp table actually used depends on the quantizer range */
+            int map[3]= {0,2,1};
+            w->cbp_table_index= map[cbp_index];
+        }else if(s->qscale <= 20){
+            int map[3]= {1,0,2};
+            w->cbp_table_index= map[cbp_index];
+        }else{
+            int map[3]= {2,1,0};
+            w->cbp_table_index= map[cbp_index];
+        }
+
+        if(w->mspel_bit) s->mspel= get_bits1(&s->gb);
+        else             s->mspel= 0; //FIXME check
+    
+        if(w->abt_flag){
+            w->per_mb_abt= get_bits1(&s->gb)^1; /* the bit is stored inverted */
+            if(!w->per_mb_abt){
+                w->abt_type= decode012(&s->gb);
+            }
+        }
+
+        if(w->per_mb_rl_bit) s->per_mb_rl_table= get_bits1(&s->gb);
+        else                 s->per_mb_rl_table= 0;
+        
+        if(!s->per_mb_rl_table){
+            s->rl_table_index = decode012(&s->gb);
+            s->rl_chroma_table_index = s->rl_table_index;
+        }
+
+        s->dc_table_index = get_bits1(&s->gb);
+        s->mv_table_index = get_bits1(&s->gb);
+    
+        s->inter_intra_pred= (s->width*s->height < 320*240 && s->bit_rate<=II_BITRATE);
+        s->no_rounding ^= 1; /* toggles on every P picture; I pictures set it to 1 */
+        
+        if(s->avctx->debug&FF_DEBUG_PICT_INFO){
+            printf("rl:%d rlc:%d dc:%d mv:%d mbrl:%d qp:%d mspel:%d per_mb_abt:%d abt_type:%d cbp:%d ii:%d\n", 
+		s->rl_table_index, 
+		s->rl_chroma_table_index, 
+		s->dc_table_index,
+		s->mv_table_index,
+                s->per_mb_rl_table,
+                s->qscale,
+                s->mspel,
+                w->per_mb_abt,
+                w->abt_type,
+                w->cbp_table_index,
+                s->inter_intra_pred);
+        }
+    }
+    s->esc3_level_length= 0;
+    s->esc3_run_length= 0;
+    
+    if(s->avctx->debug&FF_DEBUG_SKIP){ /* dump the macroblock type map, one MB row per line */
+        for(i=0; i<s->mb_num; i++){
+            if(i%s->mb_width==0) printf("\n");
+            printf("%d", s->mb_type[i]);
+        }
+    }
+    s->picture_number++; //FIXME ?
+
+
+//    if(w->j_type)
+//        return wmv2_decode_j_picture(w); //FIXME
+
+    if(w->j_type){
+        printf("J-type picture isnt supported\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+void ff_wmv2_decode_init(MpegEncContext *s){ /* currently a no-op; the ext header is parsed from ff_wmv2_decode_picture_header() */
+}
+
+static inline int wmv2_decode_motion(Wmv2Context *w, int *mx_ptr, int *my_ptr){ /* decodes the MV via msmpeg4_decode_motion() and then w->hshift; returns 0, or -1 on error */
+    MpegEncContext * const s= &w->s;
+    int ret;
+   
+    ret= msmpeg4_decode_motion(s, mx_ptr, my_ptr);
+   
+    if(ret<0) return -1;
+   
+    if((((*mx_ptr)|(*my_ptr)) & 1) && s->mspel) /* the hshift bit is only coded when a MV component is odd and mspel is on */
+        w->hshift= get_bits1(&s->gb);
+    else 
+        w->hshift= 0;
+
+//printf("%d %d  ", *mx_ptr, *my_ptr);
+   
+    return 0;
+}
+
+static int16_t *wmv2_pred_motion(Wmv2Context *w, int *px, int *py){ /* stores the MV predictor in *px/*py and returns s->motion_val[] of the current block */
+    MpegEncContext * const s= &w->s;
+    int xy, wrap, diff, type;
+    INT16 *A, *B, *C, *mot_val;
+
+    wrap = s->block_wrap[0];
+    xy = s->block_index[0];
+
+    mot_val = s->motion_val[xy];
+
+    A = s->motion_val[xy - 1];        /* candidate one entry before the current one */
+    B = s->motion_val[xy - wrap];     /* candidate one row (wrap entries) before */
+    C = s->motion_val[xy + 2 - wrap]; /* candidate one row before, two entries ahead */
+    
+    diff= FFMAX(ABS(A[0] - B[0]), ABS(A[1] - B[1]));
+    
+    if(s->mb_x && s->mb_y && !s->mspel && w->top_left_mv_flag && diff >= 8) /* explicit selector bit only when A and B differ by >= 8 in a component */
+        //FIXME top/left bit too if y=!0 && first_slice_line?
+        type= get_bits1(&s->gb);
+    else
+        type= 2;
+    
+    if(type == 0){ /* 0: use A, 1: use B, 2: default prediction */
+        *px= A[0];
+        *py= A[1];
+    }else if(type == 1){
+        *px= B[0];
+        *py= B[1];
+    }else{
+        /* special case for first (slice) line */
+        if (s->first_slice_line) {
+            *px = A[0];
+            *py = A[1];
+        } else {
+            *px = mid_pred(A[0], B[0], C[0]);
+            *py = mid_pred(A[1], B[1], C[1]);
+        }
+    }
+
+    return mot_val;
+}
+
+static inline int wmv2_decode_inter_block(Wmv2Context *w, DCTELEM *block, int n, int cbp){ /* decodes inter block n (cbp: block is coded); returns <0 on error */
+    MpegEncContext * const s= &w->s;
+    static const int sub_cbp_table[3]= {2,3,1}; /* decode012() value -> bit0: first sub-block coded, bit1: second sub-block coded */
+    int sub_cbp;
+
+    if(!cbp){ 
+        s->block_last_index[n] = -1;
+
+        return 0;
+    }
+    
+    if(w->per_block_abt)
+        w->abt_type= decode012(&s->gb);
+#if 0
+    if(w->per_block_abt)
+        printf("B%d", w->abt_type);
+#endif
+    w->abt_type_table[n]= w->abt_type; /* remembered for wmv2_add_block() */
+
+    if(w->abt_type){
+//        const uint8_t *scantable= w->abt_scantable[w->abt_type-1].permutated;
+        const uint8_t *scantable= w->abt_scantable[w->abt_type-1].scantable;
+//        const uint8_t *scantable= w->abt_type-1 ? w->abt_scantable[1].permutated : w->abt_scantable[0].scantable;
+
+        sub_cbp= sub_cbp_table[ decode012(&s->gb) ];
+//        printf("S%d", sub_cbp);
+
+        if(sub_cbp&1){
+            if (msmpeg4_decode_block(s, block, n, 1, scantable) < 0)
+                return -1;
+        }
+        
+        if(sub_cbp&2){ /* the second sub-block goes into the side buffer abt_block2[n] */
+            if (msmpeg4_decode_block(s, w->abt_block2[n], n, 1, scantable) < 0)
+                return -1;
+        }
+        s->block_last_index[n] = 63; /* overrides whatever msmpeg4_decode_block() left there */
+
+        return 0;
+    }else{
+        return msmpeg4_decode_block(s, block, n, 1, s->inter_scantable.permutated);
+    }
+}
+
+static void wmv2_add_block(Wmv2Context *w, DCTELEM *block1, uint8_t *dst, int stride, int n){
+    MpegEncContext * const s= &w->s;
+    /* Inverse-transforms block n and adds it to dst, using the transform recorded in
+     * w->abt_type_table[n]; the unused locals temp[2][64] and i were dropped. */
+    
+    if(w->abt_type_table[n] && 0){ /* disabled on purpose by the "&& 0": DC butterfly between the two sub-blocks */
+        int a,b;
+        
+        a= block1[0];
+        b= w->abt_block2[n][0];
+        block1[0]= a+b;
+        w->abt_block2[n][0]= a-b;
+    }
+    
+    switch(w->abt_type_table[n]){
+    case 0: /* regular block: skipped entirely when nothing was coded */
+        if (s->block_last_index[n] >= 0) {
+            s->idct_add (dst, stride, block1);
+        }
+        break;
+    case 1: /* two halves stacked vertically: second one starts 4 rows down */
+        simple_idct84_add(dst           , stride, block1);
+        simple_idct84_add(dst + 4*stride, stride, w->abt_block2[n]);
+        memset(w->abt_block2[n], 0, 64*sizeof(DCTELEM)); /* side buffer must be zero for the next block */
+        break;
+    case 2: /* two halves side by side: second one starts 4 pixels to the right */
+        simple_idct48_add(dst           , stride, block1);
+        simple_idct48_add(dst + 4       , stride, w->abt_block2[n]);
+        memset(w->abt_block2[n], 0, 64*sizeof(DCTELEM));
+        break;
+    default: /* decode012() only yields 0..2, so this indicates corrupted state */
+        fprintf(stderr, "internal error in WMV2 abt\n");
+    }
+}
+
+void ff_wmv2_add_mb(MpegEncContext *s, DCTELEM block1[6][64], uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr){ /* adds the 6 residual blocks of one macroblock to the destination planes */
+    Wmv2Context * const w= (Wmv2Context*)s;
+
+    wmv2_add_block(w, block1[0], dest_y                    , s->linesize, 0); /* blocks 0-3: the four 8x8 quadrants of dest_y */
+    wmv2_add_block(w, block1[1], dest_y + 8                , s->linesize, 1);
+    wmv2_add_block(w, block1[2], dest_y +     8*s->linesize, s->linesize, 2);
+    wmv2_add_block(w, block1[3], dest_y + 8 + 8*s->linesize, s->linesize, 3);
+    
+    if(s->flags&CODEC_FLAG_GRAY) return; /* gray-only decoding: chroma blocks are not added */
+    
+    wmv2_add_block(w, block1[4], dest_cb                   , s->uvlinesize, 4);
+    wmv2_add_block(w, block1[5], dest_cr                   , s->uvlinesize, 5);
+}
+
+void ff_mspel_motion(MpegEncContext *s, /* motion compensation of one macroblock: luma through put_mspel_pixels_tab, chroma through pix_op[1] */
+                               UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
+                               UINT8 **ref_picture, op_pixels_func (*pix_op)[4],
+                               int motion_x, int motion_y, int h)
+{
+    Wmv2Context * const w= (Wmv2Context*)s;
+    UINT8 *ptr;
+    int dxy, offset, mx, my, src_x, src_y, v_edge_pos, linesize, uvlinesize;
+    int emu=0;
+    
+    dxy = ((motion_y & 1) << 1) | (motion_x & 1);
+    dxy = 2*dxy + w->hshift; /* table index: y half-pel bit, x half-pel bit, hshift bit */
+    src_x = s->mb_x * 16 + (motion_x >> 1);
+    src_y = s->mb_y * 16 + (motion_y >> 1);
+                
+    /* WARNING: do no forget half pels */
+    v_edge_pos = s->v_edge_pos;
+    src_x = clip(src_x, -16, s->width);
+    src_y = clip(src_y, -16, s->height);
+    linesize   = s->linesize;
+    uvlinesize = s->uvlinesize;
+    ptr = ref_picture[0] + (src_y * linesize) + src_x;
+
+    if(s->flags&CODEC_FLAG_EMU_EDGE){
+        if(src_x<1 || src_y<1 || src_x + 17  >= s->h_edge_pos
+                              || src_y + h+1 >= v_edge_pos){
+            ff_emulated_edge_mc(s, ptr - 1 - s->linesize, s->linesize, 19, 19, 
+                             src_x-1, src_y-1, s->h_edge_pos, s->v_edge_pos);
+            ptr= s->edge_emu_buffer + 1 + s->linesize; /* skip the extra column and row fetched on the top/left */
+            emu=1;
+        }
+    }
+
+    s->dsp.put_mspel_pixels_tab[dxy](dest_y             , ptr             , linesize); /* luma is done as four 8x8 quadrants */
+    s->dsp.put_mspel_pixels_tab[dxy](dest_y+8           , ptr+8           , linesize);
+    s->dsp.put_mspel_pixels_tab[dxy](dest_y  +8*linesize, ptr  +8*linesize, linesize);
+    s->dsp.put_mspel_pixels_tab[dxy](dest_y+8+8*linesize, ptr+8+8*linesize, linesize);
+
+    if(s->flags&CODEC_FLAG_GRAY) return;
+
+    if (s->out_format == FMT_H263) { /* derive the chroma vector and its half-pel index from the luma vector */
+        dxy = 0;
+        if ((motion_x & 3) != 0)
+            dxy |= 1;
+        if ((motion_y & 3) != 0)
+            dxy |= 2;
+        mx = motion_x >> 2;
+        my = motion_y >> 2;
+    } else {
+        mx = motion_x / 2;
+        my = motion_y / 2;
+        dxy = ((my & 1) << 1) | (mx & 1);
+        mx >>= 1;
+        my >>= 1;
+    }
+    
+    src_x = s->mb_x * 8 + mx;
+    src_y = s->mb_y * 8 + my;
+    src_x = clip(src_x, -8, s->width >> 1);
+    if (src_x == (s->width >> 1))
+        dxy &= ~1; /* at the right clip limit: drop horizontal half-pel interpolation */
+    src_y = clip(src_y, -8, s->height >> 1);
+    if (src_y == (s->height >> 1))
+        dxy &= ~2; /* at the bottom clip limit: drop vertical half-pel interpolation */
+    offset = (src_y * uvlinesize) + src_x;
+    ptr = ref_picture[1] + offset;
+    if(emu){ /* chroma edge emulation simply follows the luma decision */
+        ff_emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, 
+                         src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
+        ptr= s->edge_emu_buffer;
+    }
+    pix_op[1][dxy](dest_cb, ptr, uvlinesize, h >> 1);
+
+    ptr = ref_picture[2] + offset;
+    if(emu){ /* edge_emu_buffer is reused; dest_cb has already been written above */
+        ff_emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, 
+                         src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
+        ptr= s->edge_emu_buffer;
+    }
+    pix_op[1][dxy](dest_cr, ptr, uvlinesize, h >> 1);
+}
+
+
+/**
+ * Decode one WMV2 macroblock from the bitstream in s->gb.
+ *
+ * Nothing is read when w->j_type is set, or when the current macroblock of a
+ * P picture is flagged MB_TYPE_SKIPED (it is then set up as a zero-vector
+ * forward 16x16 macroblock with no coded coefficients).
+ *
+ * @param s      decoder context; also reinterpreted as the Wmv2Context
+ * @param block  coefficient storage for the six blocks of the macroblock
+ * @return 0 on success, -1 when a VLC lookup, the motion vector or one of
+ *         the blocks fails to decode
+ */
+static int wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
+{
+    Wmv2Context * const wctx = (Wmv2Context*)s;
+    int pattern, vlc, n;
+    int mb_xy;
+    int mx, my;
+    UINT8 *coded_ptr;
+
+    if (wctx->j_type)
+        return 0;
+
+    /* linear index of the current macroblock in the per-MB tables */
+    mb_xy = s->mb_x + s->mb_y * s->mb_width;
+    s->error_status_table[mb_xy] = 0;
+
+    if (s->pict_type != P_TYPE) {
+        /* non-P picture: every macroblock is intra */
+        s->mb_intra = 1;
+        vlc = get_vlc2(&s->gb, mb_intra_vlc.table, MB_INTRA_VLC_BITS, 2);
+        if (vlc < 0) {
+            fprintf(stderr, "II-cbp illegal at %d %d\n", s->mb_x, s->mb_y);
+            return -1;
+        }
+
+        /* rebuild the coded block pattern; the first four bits are XORed
+         * with their prediction and the result is stored back for later
+         * predictions */
+        pattern = 0;
+        for (n = 0; n < 6; n++) {
+            const int shift = 5 - n;
+            int bit = (vlc >> shift) & 1;
+
+            if (n < 4) {
+                const int pred = coded_block_pred(s, n, &coded_ptr);
+                bit ^= pred;
+                *coded_ptr = bit;
+            }
+            pattern |= bit << shift;
+        }
+    } else {
+        if (s->mb_type[mb_xy] & MB_TYPE_SKIPED) {
+            /* skipped macroblock: no bits are consumed */
+            s->mb_intra = 0;
+            for (n = 0; n < 6; n++)
+                s->block_last_index[n] = -1;
+            s->mv_dir = MV_DIR_FORWARD;
+            s->mv_type = MV_TYPE_16X16;
+            s->mv[0][0][0] = 0;
+            s->mv[0][0][1] = 0;
+            s->mb_skiped = 1;
+            return 0;
+        }
+
+        vlc = get_vlc2(&s->gb, mb_non_intra_vlc[wctx->cbp_table_index].table, MB_NON_INTRA_VLC_BITS, 3);
+        if (vlc < 0)
+            return -1;
+
+        /* bit 6 clear means intra; the low six bits are the block pattern */
+        s->mb_intra = (vlc & 0x40) ? 0 : 1;
+        pattern = vlc & 0x3f;
+    }
+
+    if (s->mb_intra) {
+        s->ac_pred = get_bits1(&s->gb);
+        if (s->inter_intra_pred)
+            s->h263_aic_dir = get_vlc2(&s->gb, inter_intra_vlc.table, INTER_INTRA_VLC_BITS, 1);
+
+        if (pattern && s->per_mb_rl_table) {
+            s->rl_table_index = decode012(&s->gb);
+            s->rl_chroma_table_index = s->rl_table_index;
+        }
+
+        for (n = 0; n < 6; n++) {
+            if (msmpeg4_decode_block(s, block[n], n, (pattern >> (5 - n)) & 1, NULL) < 0) {
+                fprintf(stderr,"\nerror while decoding intra block: %d x %d (%d)\n", s->mb_x, s->mb_y, n);
+                return -1;
+            }
+        }
+
+        return 0;
+    }
+
+    /* inter macroblock: predict the vector first, then read the per-MB
+     * table selectors (only present when something is coded), then the
+     * motion vector itself */
+    wmv2_pred_motion(wctx, &mx, &my);
+
+    if (pattern) {
+        if (s->per_mb_rl_table) {
+            s->rl_table_index = decode012(&s->gb);
+            s->rl_chroma_table_index = s->rl_table_index;
+        }
+
+        if (wctx->abt_flag && wctx->per_mb_abt) {
+            wctx->per_block_abt = get_bits1(&s->gb);
+            if (!wctx->per_block_abt)
+                wctx->abt_type = decode012(&s->gb);
+        } else {
+            wctx->per_block_abt = 0;
+        }
+    }
+
+    if (wmv2_decode_motion(wctx, &mx, &my) < 0)
+        return -1;
+
+    s->mv_dir = MV_DIR_FORWARD;
+    s->mv_type = MV_TYPE_16X16;
+    s->mv[0][0][0] = mx;
+    s->mv[0][0][1] = my;
+
+    for (n = 0; n < 6; n++) {
+        if (wmv2_decode_inter_block(wctx, block[n], n, (pattern >> (5 - n)) & 1) < 0) {
+            fprintf(stderr,"\nerror while decoding inter block: %d x %d (%d)\n", s->mb_x, s->mb_y, n);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Decoder init callback for WMV2.
+ *
+ * Calls ff_h263_decode_init() on the codec context and, if that succeeds,
+ * wmv2_common_init() on the private Wmv2Context.
+ *
+ * @return 0 on success, -1 if ff_h263_decode_init() reports an error
+ */
+static int wmv2_decode_init(AVCodecContext *avctx)
+{
+    Wmv2Context * const ctx = avctx->priv_data;
+
+    if (ff_h263_decode_init(avctx) >= 0) {
+        wmv2_common_init(ctx);
+        return 0;
+    }
+
+    return -1;
+}
+
+AVCodec wmv2_decoder = { /* WMV2 decoder entry; positional initialiser, slot order is fixed by the AVCodec declaration (not visible in this chunk) */
+    "wmv2",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_WMV2,
+    sizeof(Wmv2Context), /* same type wmv2_decode_init() obtains from avctx->priv_data */
+    wmv2_decode_init,
+    NULL, /* NOTE(review): presumably the encode slot, left empty for a decoder -- confirm against AVCodec */
+    ff_h263_decode_end,
+    ff_h263_decode_frame,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
+};
+
+AVCodec wmv2_encoder = { /* WMV2 encoder entry; positional initialiser, slot order is fixed by the AVCodec declaration (not visible in this chunk) */
+    "wmv2",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_WMV2,
+    sizeof(Wmv2Context),
+    wmv2_encode_init,
+    MPV_encode_picture,
+    MPV_encode_end, /* last explicit slot: wmv2_decoder fills nine, so the remaining AVCodec members are implicitly zero here */
+};
+
author	Miguel Freitas <miguelfreitas@users.sourceforge.net>	2003-01-08 13:18:42 +0000
committer	Miguel Freitas <miguelfreitas@users.sourceforge.net>	2003-01-08 13:18:42 +0000
commit	6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea (patch)
tree	e70be493d1222b10f96aa5efac01c0ec0d5bcc97
parent	1fb58a63872660424777d41389e426dc90f1b660 (diff)
download	xine-lib-6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea.tar.gz xine-lib-6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea.tar.bz2