Update to latest ffmpeg CVS; this should add MPEG-4 B-frame support :-) (BTW, ffmpeg guys: impressive work!!)

CVS patchset: 1690 CVS date: 2002/04/06 20:51:22
author: Guenter Bartsch <guenter@users.sourceforge.net> 2002-04-06 20:51:22 +0000
committer: Guenter Bartsch <guenter@users.sourceforge.net> 2002-04-06 20:51:22 +0000
commit: 55e772ec62ef638f8a0b44e379da663f78245355 (patch)
tree: 3b90a73ab2e800ed32f68e24f125164de7a655b3 /src
parent: 0176e107fd9b6672d87f75a9eb5d83e163e0179f (diff)
download: xine-lib-55e772ec62ef638f8a0b44e379da663f78245355.tar.gz
xine-lib-55e772ec62ef638f8a0b44e379da663f78245355.tar.bz2
18 files changed, 3807 insertions, 986 deletions
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 3c27d99ea..05b27d8c2 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -14,11 +14,14 @@ enum CodecID {
     CODEC_ID_MJPEG,
     CODEC_ID_MPEG4,
     CODEC_ID_RAWVIDEO,
-    CODEC_ID_MSMPEG4,
+    CODEC_ID_MSMPEG4V1,
+    CODEC_ID_MSMPEG4V2,
+    CODEC_ID_MSMPEG4V3,
     CODEC_ID_H263P,
     CODEC_ID_H263I,
 
 };
+#define CODEC_ID_MSMPEG4 CODEC_ID_MSMPEG4V3
 
 enum CodecType {
     CODEC_TYPE_VIDEO,
@@ -48,11 +51,19 @@ extern int motion_estimation_method;
 #define ME_FULL   1
 #define ME_LOG    2
 #define ME_PHODS  3
+#define ME_EPZS   4
+#define ME_X1     5
 
 /* encoding support */
+/* note: not everything is supported yet */
 
 #define CODEC_FLAG_HQ     0x0001 /* high quality (non real time) encoding */
 #define CODEC_FLAG_QSCALE 0x0002 /* use fixed qscale */
+#define CODEC_FLAG_4MV    0x0004 /* 4 MV per MB allowed */
+#define CODEC_FLAG_B      0x0008 /* use B frames */
+#define CODEC_FLAG_QPEL   0x0010 /* use qpel MC */
+#define CODEC_FLAG_GMC    0x0020 /* use GMC */
+#define CODEC_FLAG_TYPE   0x0040 /* fixed I/P frame type, from avctx->key_frame */
 
 /* codec capabilities */
 
@@ -63,12 +74,19 @@ extern int motion_estimation_method;
 
 typedef struct AVCodecContext {
     int bit_rate;
+    int bit_rate_tolerance; /* amount of +- bits (>0)*/
     int flags;
     int sub_id;    /* some codecs needs additionnal format info. It is
                       stored there */
     /* video only */
     int frame_rate; /* frames per sec multiplied by FRAME_RATE_BASE */
     int width, height;
+    int aspect_ratio_info;
+#define FF_ASPECT_SQUARE 1
+#define FF_ASPECT_4_3_625 2
+#define FF_ASPECT_4_3_525 3
+#define FF_ASPECT_16_9_625 4
+#define FF_ASPECT_16_9_525 5
     int gop_size; /* 0 = intra only */
     int pix_fmt;  /* pixel format, see PIX_FMT_xxx */
 
@@ -92,6 +110,12 @@ typedef struct AVCodecContext {
                          a key frame (intra, or seekable) */
     int quality;      /* quality of the previous encoded frame 
                          (between 1 (good) and 31 (bad)) */
+    float qcompress;  /* amount of qscale change between easy & hard scenes (0.0-1.0)*/
+    float qblur;      /* amount of qscale smoothing over time (0.0-1.0) */
+    int qmin;         /* min qscale */
+    int qmax;         /* max qscale */
+    int max_qdiff;    /* max qscale difference between frames */
+    
     struct AVCodec *codec;
     void *priv_data;
 
@@ -122,6 +146,17 @@ typedef struct AVCodecContext {
     float psnr_cb;
     float psnr_cr;
                  
+    /* statistics, used for 2-pass encoding */
+    int mv_bits;
+    int header_bits;
+    int i_tex_bits;
+    int p_tex_bits;
+    int i_count;
+    int p_count;
+    int skip_count;
+    int misc_bits; // cbp, mb_type
+    int frame_bits;
+
     /* the following fields are ignored */
     void *opaque;   /* can be used to carry app specific stuff */
     char codec_name[32];
@@ -152,7 +187,9 @@ typedef struct AVPicture {
 
 extern AVCodec h263_decoder;
 extern AVCodec mpeg4_decoder;
-extern AVCodec msmpeg4_decoder;
+extern AVCodec msmpeg4v1_decoder;
+extern AVCodec msmpeg4v2_decoder;
+extern AVCodec msmpeg4v3_decoder;
 extern AVCodec mpeg_decoder;
 extern AVCodec h263i_decoder;
 extern AVCodec rv10_decoder;
diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c
index 77f94689e..96d8a303a 100644
--- a/src/libffmpeg/libavcodec/common.c
+++ b/src/libffmpeg/libavcodec/common.c
@@ -128,6 +128,7 @@ void init_get_bits(GetBitContext *s,
         s->bit_cnt += 8;
     }
 #endif
+    s->size= buffer_size;
 }
 
 #ifndef ALT_BITSTREAM_READER
@@ -201,6 +202,14 @@ void align_get_bits(GetBitContext *s)
 #endif
 }
 
+int check_marker(GetBitContext *s, char *msg) /* read one bit with get_bits1() and complain on stdout if it is 0 */
+{
+    int bit= get_bits1(s); /* the bit is read (and consumed) by get_bits1() */
+    if(!bit) printf("Marker bit missing %s\n", msg); /* diagnostic only: msg is appended as context, nothing is aborted */
+
+    return bit; /* value of the bit that was read; 0 means the marker was missing */
+}
+
 #ifndef ALT_BITSTREAM_READER
 /* This function is identical to get_bits_long(), the */
 /* only diference is that it doesn't touch the buffer */
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index fd4bba129..707dbbc8f 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -13,6 +13,7 @@
 //#define ALT_BITSTREAM_READER
 //#define ALIGNED_BITSTREAM
 #define FAST_GET_FIRST_VLC
+//#define DUMP_STREAM // only works with the ALT_BITSTREAM_READER
 
 #ifdef HAVE_AV_CONFIG_H
 /* only include the following when compiling package */
@@ -197,8 +198,11 @@ typedef struct GetBitContext {
     int bit_cnt;
     UINT8 *buf, *buf_ptr, *buf_end;
 #endif
+    int size;
 } GetBitContext;
 
+static inline int get_bits_count(GetBitContext *s);
+
 typedef struct VLC {
     int bits;
     INT16 *table_codes;
@@ -466,6 +470,13 @@ static inline unsigned int get_bits(GetBitContext *s, int n){
     result>>= 32 - n;
     index+= n;
     s->index= index;
+#ifdef DUMP_STREAM
+    while(n){
+        printf("%d", (result>>(n-1))&1);
+        n--;
+    }
+    printf(" ");
+#endif
     
     return result;
 #endif //!ALIGNED_BITSTREAM
@@ -492,6 +503,9 @@ static inline unsigned int get_bits1(GetBitContext *s){
     result>>= 8 - 1;
     index++;
     s->index= index;
+#ifdef DUMP_STREAM
+    printf("%d ", result);
+#endif
     
     return result;
 #else
@@ -550,9 +564,54 @@ static inline unsigned int show_bits(GetBitContext *s, int n)
 #endif //!ALT_BITSTREAM_READER
 }
 
+static inline int show_aligned_bits(GetBitContext *s, int offset, int n) /* peek n bits starting at the first byte boundary at or after (current bit position + offset); no field of s is written here */
+{ /* NOTE(review): every branch shifts right by 32 - n, so n is presumably expected to be 1..32 (n == 0 would shift by 32) -- confirm with callers */
+#ifdef ALT_BITSTREAM_READER
+#ifdef ALIGNED_BITSTREAM
+    int index= (s->index + offset + 7)&(~7); /* bit index rounded up to a multiple of 8 */
+    uint32_t result1= be2me_32( ((uint32_t *)s->buffer)[index>>5] ); /* 32-bit word containing that bit index */
+    uint32_t result2= be2me_32( ((uint32_t *)s->buffer)[(index>>5) + 1] ); /* the following 32-bit word */
+#ifdef ARCH_X86 /* x86 double shift; presumably meant to give the same result as the portable #else branch */
+    asm ("shldl %%cl, %2, %0\n\t"
+         : "=r" (result1)
+	 : "0" (result1), "r" (result2), "c" (index));
+#else
+    result1<<= (index&0x1F); /* drop the bits that precede index inside the first word */
+    result2= (result2>>1) >> (31-(index&0x1F)); /* two-step shift: total is 32-(index&0x1F) without ever shifting by 32 */
+    result1|= result2; /* fill the vacated low bits from the second word */
+#endif
+    result1>>= 32 - n; /* keep only the top n bits */
+    
+    return result1;
+#else //ALIGNED_BITSTREAM
+    int index= (s->index + offset + 7)>>3; /* byte index, rounded up to the next whole byte */
+    uint32_t result= be2me_32( unaligned32( ((uint8_t *)s->buffer)+index ) ); /* 32 bits read starting at that byte */
+
+    result>>= 32 - n; /* keep only the top n bits */
+    
+    return result;
+#endif //!ALIGNED_BITSTREAM
+#else //ALT_BITSTREAM_READER
+    int index= (get_bits_count(s) + offset + 7)>>3; /* same rounding, with the bit position taken from get_bits_count() */
+    uint32_t result= be2me_32( unaligned32( ((uint8_t *)s->buf)+index ) ); /* 32 bits read starting at byte index of s->buf */
+
+    result>>= 32 - n; /* keep only the top n bits */
+//printf(" %X %X %d \n", (int)(((uint8_t *)s->buf)+index ), (int)s->buf_ptr, s->bit_cnt);    
+    return result;
+#endif //!ALT_BITSTREAM_READER
+}
+
 static inline void skip_bits(GetBitContext *s, int n){
 #ifdef ALT_BITSTREAM_READER
     s->index+= n;
+#ifdef DUMP_STREAM
+    {
+        int result;
+        s->index-= n;
+        result= get_bits(s, n);
+    }
+#endif
+
 #else
     if(s->bit_cnt>=n){
         /* most common case here */
@@ -570,6 +629,10 @@ static inline void skip_bits(GetBitContext *s, int n){
 static inline void skip_bits1(GetBitContext *s){
 #ifdef ALT_BITSTREAM_READER
     s->index++;
+#ifdef DUMP_STREAM
+    s->index--;
+    printf("%d ", get_bits1(s));
+#endif
 #else
     if(s->bit_cnt>0){
         /* most common case here */
@@ -593,6 +656,7 @@ static inline int get_bits_count(GetBitContext *s)
 #endif
 }
 
+int check_marker(GetBitContext *s, char *msg);
 void align_get_bits(GetBitContext *s);
 int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
              const void *bits, int bits_wrap, int bits_size,
@@ -694,6 +758,13 @@ static inline int get_vlc(GetBitContext *s, VLC *vlc)
     if (n > 0) {
         /* most common case (90%)*/
         FLUSH_BITS(n);
+#ifdef DUMP_STREAM
+        {
+            int n= bit_cnt - s->index;
+            skip_bits(s, n);
+            RESTORE_BITS(s);
+        }
+#endif
         RESTORE_BITS(s);
         return code;
     } else if (n == 0) {
@@ -728,6 +799,13 @@ static inline int get_vlc(GetBitContext *s, VLC *vlc)
             table_bits = vlc->table_bits + code;
         }
     }
+#ifdef DUMP_STREAM
+    {
+        int n= bit_cnt - s->index;
+        skip_bits(s, n);
+        RESTORE_BITS(s);
+    }
+#endif
     RESTORE_BITS(s);
     return code;
 }
@@ -786,6 +864,24 @@ static inline int av_log2(unsigned int v)
     return n;
 }
 
+/* median of 3: returns the middle value of a, b and c */
+static inline int mid_pred(int a, int b, int c)
+{
+    int vmin, vmax;
+    vmax = vmin = a; /* start with a as both the smallest and the largest value seen */
+    if (b < vmin)
+        vmin = b;
+    else
+	vmax = b; /* after this if/else: vmin = min(a,b), vmax = max(a,b) */
+
+    if (c < vmin)
+        vmin = c;
+    else if (c > vmax)
+        vmax = c; /* after this if/else: vmin and vmax are the extremes of all three */
+
+    return a + b + c - vmin - vmax; /* sum minus smallest and largest leaves the middle value */
+}
+
 /* memory */
 void *av_mallocz(int size);
 
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index 0e698f35c..dcfad05a5 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -30,12 +30,18 @@ void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
+void (*clear_blocks)(DCTELEM *blocks);
 
 op_pixels_abs_func pix_abs16x16;
 op_pixels_abs_func pix_abs16x16_x2;
 op_pixels_abs_func pix_abs16x16_y2;
 op_pixels_abs_func pix_abs16x16_xy2;
 
+op_pixels_abs_func pix_abs8x8;
+op_pixels_abs_func pix_abs8x8_x2;
+op_pixels_abs_func pix_abs8x8_y2;
+op_pixels_abs_func pix_abs8x8_xy2;
+
 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
 UINT32 squareTbl[512];
 
@@ -377,14 +383,14 @@ static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride,
     int i;
     for(i=0; i<h; i++)
     {
-        dst[0]= cm[(((src[0]+src[1])*160 - (src[0]+src[2])*48 + (src[1]+src[3])*24 - (src[2]+src[4])*8 + r)>>8)];
-        dst[1]= cm[(((src[1]+src[2])*160 - (src[0]+src[3])*48 + (src[0]+src[4])*24 - (src[1]+src[5])*8 + r)>>8)];
-        dst[2]= cm[(((src[2]+src[3])*160 - (src[1]+src[4])*48 + (src[0]+src[5])*24 - (src[0]+src[6])*8 + r)>>8)];
-        dst[3]= cm[(((src[3]+src[4])*160 - (src[2]+src[5])*48 + (src[1]+src[6])*24 - (src[0]+src[7])*8 + r)>>8)];
-        dst[4]= cm[(((src[4]+src[5])*160 - (src[3]+src[6])*48 + (src[2]+src[7])*24 - (src[1]+src[8])*8 + r)>>8)];
-        dst[5]= cm[(((src[5]+src[6])*160 - (src[4]+src[7])*48 + (src[3]+src[8])*24 - (src[2]+src[8])*8 + r)>>8)];
-        dst[6]= cm[(((src[6]+src[7])*160 - (src[5]+src[8])*48 + (src[4]+src[8])*24 - (src[3]+src[7])*8 + r)>>8)];
-        dst[7]= cm[(((src[7]+src[8])*160 - (src[6]+src[8])*48 + (src[5]+src[7])*24 - (src[4]+src[6])*8 + r)>>8)];
+        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
+        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
+        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
+        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
+        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
+        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
+        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
+        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
         dst+=dstStride;
         src+=srcStride;
     }
@@ -405,14 +411,14 @@ static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride,
         const int src6= src[6*srcStride];
         const int src7= src[7*srcStride];
         const int src8= src[8*srcStride];
-        dst[0*dstStride]= cm[(((src0+src1)*160 - (src0+src2)*48 + (src1+src3)*24 - (src2+src4)*8 + r)>>8)];
-        dst[1*dstStride]= cm[(((src1+src2)*160 - (src0+src3)*48 + (src0+src4)*24 - (src1+src5)*8 + r)>>8)];
-        dst[2*dstStride]= cm[(((src2+src3)*160 - (src1+src4)*48 + (src0+src5)*24 - (src0+src6)*8 + r)>>8)];
-        dst[3*dstStride]= cm[(((src3+src4)*160 - (src2+src5)*48 + (src1+src6)*24 - (src0+src7)*8 + r)>>8)];
-        dst[4*dstStride]= cm[(((src4+src5)*160 - (src3+src6)*48 + (src2+src7)*24 - (src1+src8)*8 + r)>>8)];
-        dst[5*dstStride]= cm[(((src5+src6)*160 - (src4+src7)*48 + (src3+src8)*24 - (src2+src8)*8 + r)>>8)];
-        dst[6*dstStride]= cm[(((src6+src7)*160 - (src5+src8)*48 + (src4+src8)*24 - (src3+src7)*8 + r)>>8)];
-        dst[7*dstStride]= cm[(((src7+src8)*160 - (src6+src8)*48 + (src5+src7)*24 - (src4+src6)*8 + r)>>8)];
+        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
+        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
+        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
+        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
+        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
+        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
+        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
+        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
         dst++;
         src++;
     }
@@ -485,38 +491,38 @@ static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_h_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 }\
 \
 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
-    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 128-r);\
+    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 }\
 \
 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_h_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
 }\
 \
 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_v_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 }\
 \
 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
-    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 128-r);\
+    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 }\
 \
 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 half[64];\
-    qpel_v_lowpass(half, src, 8, srcStride, 8, 128-r);\
+    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
     avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
 }\
 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -524,9 +530,9 @@ static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -534,9 +540,9 @@ static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -544,9 +550,9 @@ static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -554,25 +560,25 @@ static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 }\
 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 halfH[72];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 halfH[72];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -580,9 +586,9 @@ static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
@@ -590,16 +596,16 @@ static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcS
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 128-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
+    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 }\
 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 {\
     UINT8 halfH[72];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 128-r);\
-    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 128-r);\
+    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
+    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
 }\
 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
     qpel_mc00_c ## name,                                                                   \
@@ -623,12 +629,12 @@ qpel_mc_func qpel_mc ## name ## _tab[16]={ \
 QPEL_MC(0, _rnd)
 QPEL_MC(1, _no_rnd)
 
-int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - pix2[0]);
         s += abs(pix1[1] - pix2[1]);
         s += abs(pix1[2] - pix2[2]);
@@ -651,12 +657,12 @@ int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
@@ -679,13 +685,13 @@ int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
     UINT8 *pix3 = pix2 + line_size;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
@@ -709,13 +715,13 @@ int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
-int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
+int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
     int s, i;
     UINT8 *pix3 = pix2 + line_size;
 
     s = 0;
-    for(i=0;i<h;i++) {
+    for(i=0;i<16;i++) {
         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
@@ -739,6 +745,90 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
     return s;
 }
 
+int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size) /* sum of absolute differences over 8 rows of 8 pixels; rows are line_size bytes apart in both buffers */
+{
+    int s, i;
+
+    s = 0; /* running sum of |pix1 - pix2| */
+    for(i=0;i<8;i++) { /* one iteration per row, the 8 columns are unrolled by hand */
+        s += abs(pix1[0] - pix2[0]);
+        s += abs(pix1[1] - pix2[1]);
+        s += abs(pix1[2] - pix2[2]);
+        s += abs(pix1[3] - pix2[3]);
+        s += abs(pix1[4] - pix2[4]);
+        s += abs(pix1[5] - pix2[5]);
+        s += abs(pix1[6] - pix2[6]);
+        s += abs(pix1[7] - pix2[7]);
+        pix1 += line_size; /* advance both pointers to the next row */
+        pix2 += line_size;
+    }
+    return s;
+}
+
+int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size) /* like pix_abs8x8_c, but pix1 is compared against avg2() of each pix2 pixel and its right-hand neighbour (presumably a horizontal half-pel position -- confirm) */
+{
+    int s, i;
+
+    s = 0; /* running sum of absolute differences */
+    for(i=0;i<8;i++) { /* 8 rows; each row reads pix2[0..8], i.e. 9 pixels */
+        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
+        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
+        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
+        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
+        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
+        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
+        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
+        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
+        pix1 += line_size; /* advance both pointers to the next row */
+        pix2 += line_size;
+    }
+    return s;
+}
+
+int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size) /* like pix_abs8x8_c, but pix1 is compared against avg2() of each pix2 pixel and the pixel one row below it (presumably a vertical half-pel position -- confirm) */
+{
+    int s, i;
+    UINT8 *pix3 = pix2 + line_size; /* pix3 always points one row below pix2, so 9 rows of pix2 are read in total */
+
+    s = 0; /* running sum of absolute differences */
+    for(i=0;i<8;i++) { /* 8 rows, 8 hand-unrolled columns each */
+        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
+        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
+        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
+        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
+        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
+        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
+        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
+        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
+        pix1 += line_size; /* advance all three pointers to the next row */
+        pix2 += line_size;
+        pix3 += line_size;
+    }
+    return s;
+}
+
+int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size) /* like pix_abs8x8_c, but pix1 is compared against avg4() of the 2x2 pix2 neighbourhood right/below each pixel (presumably a diagonal half-pel position -- confirm) */
+{
+    int s, i;
+    UINT8 *pix3 = pix2 + line_size; /* pix3 always points one row below pix2 */
+
+    s = 0; /* running sum of absolute differences */
+    for(i=0;i<8;i++) { /* 8 rows; each row reads columns 0..8 of both pix2 and pix3 */
+        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
+        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
+        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
+        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
+        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
+        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
+        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
+        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
+        pix1 += line_size; /* advance all three pointers to the next row */
+        pix2 += line_size;
+        pix3 += line_size;
+    }
+    return s;
+}
+
 /* permute block according so that it corresponds to the MMX idct
    order */
 #ifdef SIMPLE_IDCT
@@ -777,6 +867,11 @@ void block_permute(INT16 *block)
 }
 #endif
 
+void clear_blocks_c(DCTELEM *blocks) /* plain C implementation installed into the clear_blocks function pointer by dsputil_init() */
+{
+    memset(blocks, 0, sizeof(DCTELEM)*6*64); /* zero 6*64 DCTELEMs starting at blocks -- presumably the six 64-coefficient blocks of one macroblock */
+}
+
 void dsputil_init(void)
 {
     int i, j;
@@ -801,11 +896,16 @@ void dsputil_init(void)
     put_pixels_clamped = put_pixels_clamped_c;
     add_pixels_clamped = add_pixels_clamped_c;
     gmc1= gmc1_c;
+    clear_blocks= clear_blocks_c;
 
-    pix_abs16x16 = pix_abs16x16_c;
-    pix_abs16x16_x2 = pix_abs16x16_x2_c;
-    pix_abs16x16_y2 = pix_abs16x16_y2_c;
+    pix_abs16x16     = pix_abs16x16_c;
+    pix_abs16x16_x2  = pix_abs16x16_x2_c;
+    pix_abs16x16_y2  = pix_abs16x16_y2_c;
     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
+    pix_abs8x8     = pix_abs8x8_c;
+    pix_abs8x8_x2  = pix_abs8x8_x2_c;
+    pix_abs8x8_y2  = pix_abs8x8_y2_c;
+    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
     av_fdct = jpeg_fdct_ifast;
 
     use_permuted_idct = 1;
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index d0a6e68ba..dc63f06f1 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -41,11 +41,13 @@ extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
 extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 extern void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
+extern void (*clear_blocks)(DCTELEM *blocks);
 
 
 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size);
 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
+void clear_blocks_c(DCTELEM *blocks);
 
 /* add and put pixel (decoding) */
 typedef void (*op_pixels_func)(UINT8 *block, const UINT8 *pixels, int line_size, int h);
@@ -67,17 +69,21 @@ extern void (*sub_pixels_tab[4])(DCTELEM *block, const UINT8 *pixels, int line_s
 
 /* motion estimation */
 
-typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size, int h);
+typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size);
 
 extern op_pixels_abs_func pix_abs16x16;
 extern op_pixels_abs_func pix_abs16x16_x2;
 extern op_pixels_abs_func pix_abs16x16_y2;
 extern op_pixels_abs_func pix_abs16x16_xy2;
+extern op_pixels_abs_func pix_abs8x8;
+extern op_pixels_abs_func pix_abs8x8_x2;
+extern op_pixels_abs_func pix_abs8x8_y2;
+extern op_pixels_abs_func pix_abs8x8_xy2;
 
-int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
+int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx);
 
 static inline int block_permute_op(int j)
 {
@@ -102,7 +108,8 @@ void block_permute(INT16 *block);
 
 extern int mm_flags;
 
-int mm_support(void);
+/* int mm_support(void); */
+#define mm_support() xine_mm_accel()
 
 #if 0
 static inline void emms(void)
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index 79b74631d..52127aaad 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -17,6 +17,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * ac prediction encoding by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "common.h"
 #include "dsputil.h"
@@ -28,19 +30,35 @@
 //rounded divison & shift
 #define RDIV(a,b) ((a) > 0 ? ((a)+((b)>>1))/(b) : ((a)-((b)>>1))/(b))
 #define RSHIFT(a,b) ((a) > 0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b))
+#define ABS(a) (((a)>=0)?(a):(-(a)))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
 
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block,
 			      int n);
 static void h263_encode_motion(MpegEncContext * s, int val);
 static void h263p_encode_umotion(MpegEncContext * s, int val);
 static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block,
-			       int n);
-static int h263_decode_motion(MpegEncContext * s, int pred);
+			       int n, int dc, UINT8 *scan_table);
+static int h263_decode_motion(MpegEncContext * s, int pred, int fcode);
 static int h263p_decode_umotion(MpegEncContext * s, int pred);
 static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
                              int n, int coded);
 static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                               int n, int coded);
+static inline int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr);
+static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n,
+                              int dir);
+static void mpeg4_decode_sprite_trajectory(MpegEncContext * s);
+
+extern UINT32 inverse[256];
+
+static UINT16 mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
+static UINT8 fcode_tab[MAX_MV*2+1];
+static UINT8 umv_fcode_tab[MAX_MV*2+1];
+
+static UINT16 uni_DCtab_lum  [512][2];
+static UINT16 uni_DCtab_chrom[512][2];
 
 int h263_get_picture_format(int width, int height)
 {
@@ -195,7 +213,213 @@ int h263_encode_gob_header(MpegEncContext * s, int mb_line)
    }
    return 0;
 }
+
+/* Returns 1 when AC prediction shrinks the summed magnitude of the predicted coefficients,
+ * 0 otherwise; also stores coefficients 1..7 and 8,16..56 of each block in its s->ac_val slot. */
+static inline int decide_ac_pred(MpegEncContext * s, DCTELEM block[6][64], int dir[6])
+{
+    int plain_cost=0, pred_cost=0;   /* summed |coef| without / with prediction */
+    int blk, k;
+
+    for(blk=0; blk<6; blk++){
+        INT16 *cur= s->ac_val[0][0] + s->block_index[blk] * 16;   /* slot of this block */
+        INT16 *ref;                                               /* slot of the predictor block */
+        if(dir[blk]){   /* dir!=0: predictor lies block_wrap slots back, indices 1..7 are compared */
+            ref= cur - s->block_wrap[blk]*16;
+            for(k=1; k<8; k++){
+                const int coef= block[blk][block_permute_op(k   )];
+                plain_cost+= ABS(coef);
+                pred_cost += ABS(coef - ref[k+8]);
+                cur[k  ]= block[blk][block_permute_op(k<<3)];
+                cur[k+8]= coef;
+            }
+        }else{          /* dir==0: predictor is the preceding slot, indices 8,16..56 are compared */
+            ref= cur - 16;
+            for(k=1; k<8; k++){
+                const int coef= block[blk][block_permute_op(k<<3)];
+                plain_cost+= ABS(coef);
+                pred_cost += ABS(coef - ref[k]);
+                cur[k  ]= coef;
+                cur[k+8]= block[blk][block_permute_op(k   )];
+            }
+        }
+    }
+
+    return plain_cost > pred_cost;
+}
+
+/* Writes one macroblock to s->pb (inter path when !s->mb_intra, intra path otherwise) and
+ * adds the bits spent to the misc/mv/texture statistics counters kept in s. */
+void mpeg4_encode_mb(MpegEncContext * s,
+		    DCTELEM block[6][64],
+		    int motion_x, int motion_y)
+{
+    int cbpc, cbpy, i, cbp, pred_x, pred_y;
+    int bits;
     
+    //    printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
+    if (!s->mb_intra) {
+        /* compute cbp */
+        cbp = 0;
+        for (i = 0; i < 6; i++) {
+        if (s->block_last_index[i] >= 0)
+            cbp |= 1 << (5 - i);   /* block i owns bit 5-i of cbp */
+        }
+        if ((cbp | motion_x | motion_y) == 0 && s->mv_type==MV_TYPE_16X16) {
+            /* skip macroblock */
+            put_bits(&s->pb, 1, 1);
+            s->misc_bits++;
+            s->last_bits++;   /* keep the running bit position in step with the single bit written */
+            s->skip_count++;
+            return;
+        }
+        put_bits(&s->pb, 1, 0);	/* mb coded */
+        if(s->mv_type==MV_TYPE_16X16){
+            cbpc = cbp & 3;    /* low two cbp bits: blocks 4 and 5 */
+            put_bits(&s->pb,
+                    inter_MCBPC_bits[cbpc],
+                    inter_MCBPC_code[cbpc]);
+            cbpy = cbp >> 2;   /* remaining four bits: blocks 0..3 */
+            cbpy ^= 0xf;       /* the inter path writes the inverted pattern */
+            put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+            bits= get_bit_count(&s->pb);
+            s->misc_bits+= bits - s->last_bits;   /* header bits written since the last checkpoint */
+            s->last_bits=bits;
+
+            /* motion vectors: 16x16 mode */
+            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            h263_encode_motion(s, motion_x - pred_x);   /* only the difference to the prediction is coded */
+            h263_encode_motion(s, motion_y - pred_y);
+        }else{
+            cbpc = (cbp & 3)+16;   /* entries 16.. of the inter MCBPC tables are used for this mv_type */
+            put_bits(&s->pb,
+                    inter_MCBPC_bits[cbpc],
+                    inter_MCBPC_code[cbpc]);
+            cbpy = cbp >> 2;
+            cbpy ^= 0xf;
+            put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+
+            bits= get_bit_count(&s->pb);
+            s->misc_bits+= bits - s->last_bits;
+            s->last_bits=bits;
+
+            for(i=0; i<4; i++){
+                /* motion vectors: 8x8 mode*/
+                h263_pred_motion(s, i, &pred_x, &pred_y);
+
+                /* the per-block vectors are read from s->motion_val, not from motion_x/motion_y */
+                h263_encode_motion(s, s->motion_val[ s->block_index[i] ][0] - pred_x);
+                h263_encode_motion(s, s->motion_val[ s->block_index[i] ][1] - pred_y);
+            }
+        }
+        bits= get_bit_count(&s->pb);
+        s->mv_bits+= bits - s->last_bits;   /* bits spent on motion vectors */
+        s->last_bits=bits;
+
+        /* encode each block */
+        for (i = 0; i < 6; i++) {
+            mpeg4_encode_block(s, block[i], i, 0, zigzag_direct);   /* the dc argument is passed as 0 here */
+        }
+        bits= get_bit_count(&s->pb);
+        s->p_tex_bits+= bits - s->last_bits;   /* texture bits of inter macroblocks */
+        s->last_bits=bits;
+        s->p_count++;
+    } else {
+        int dc_diff[6];   //dc values with the dc prediction subtracted 
+        int dir[6];  //prediction direction
+        int zigzag_last_index[6];   // only written and read when s->ac_pred is set
+        UINT8 *scan_table[6];
+
+        for(i=0; i<6; i++){
+            const int level= block[i][0];
+            UINT16 *dc_ptr;
+
+            dc_diff[i]= level - mpeg4_pred_dc(s, i, &dc_ptr, &dir[i]);
+            /* store this block's dc, multiplied by the dc scale, where mpeg4_pred_dc pointed dc_ptr */
+            if (i < 4) {
+                *dc_ptr = level * s->y_dc_scale;
+            } else {
+                *dc_ptr = level * s->c_dc_scale;
+            }
+        }
+
+        s->ac_pred= decide_ac_pred(s, block, dir);
+
+        if(s->ac_pred){
+            for(i=0; i<6; i++){
+                UINT8 *st;
+                int last_index;
+
+                mpeg4_inv_pred_ac(s, block[i], i, dir[i]);   /* subtracts the prediction in place */
+                if (dir[i]==0) st = ff_alternate_vertical_scan; /* left */
+                else           st = ff_alternate_horizontal_scan; /* top */
+
+                /* last nonzero coefficient along the chosen scan; ends at -1 for an all-zero block */
+                for(last_index=63; last_index>=0; last_index--) //FIXME optimize
+                    if(block[i][st[last_index]]) break;
+                zigzag_last_index[i]= s->block_last_index[i];   /* saved for the restore step below */
+                s->block_last_index[i]= last_index;
+                scan_table[i]= st;
+            }
+        }else{
+            for(i=0; i<6; i++)
+                scan_table[i]= zigzag_direct;
+        }
+
+        /* compute cbp */
+        cbp = 0;
+        for (i = 0; i < 6; i++) {
+            if (s->block_last_index[i] >= 1)   /* index 0 is the dc value, which is passed separately */
+                cbp |= 1 << (5 - i);
+        }
+
+        cbpc = cbp & 3;
+        if (s->pict_type == I_TYPE) {
+            put_bits(&s->pb,
+                intra_MCBPC_bits[cbpc],
+                intra_MCBPC_code[cbpc]);
+        } else {
+            put_bits(&s->pb, 1, 0);	/* mb coded */
+            /* intra macroblock in a non-I picture: entries 4..7 of the inter MCBPC tables */
+            put_bits(&s->pb,
+                inter_MCBPC_bits[cbpc + 4],
+                inter_MCBPC_code[cbpc + 4]);
+        }
+        put_bits(&s->pb, 1, s->ac_pred);
+        cbpy = cbp >> 2;   /* written without the inversion used on the inter path */
+        put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+
+        bits= get_bit_count(&s->pb);
+        s->misc_bits+= bits - s->last_bits;
+        s->last_bits=bits;
+
+        /* encode each block */
+        for (i = 0; i < 6; i++) {
+            mpeg4_encode_block(s, block[i], i, dc_diff[i], scan_table[i]);
+        }
+
+        bits= get_bit_count(&s->pb);
+        s->i_tex_bits+= bits - s->last_bits;   /* texture bits of intra macroblocks */
+        s->last_bits=bits;
+        s->i_count++;
+
+        /* restore ac coeffs & last_index stuff if we messed them up with the prediction */
+        if(s->ac_pred){
+            for(i=0; i<6; i++){
+                int j;    
+                INT16 *ac_val;
+
+                /* decide_ac_pred stored the unpredicted coefficients in this slot */
+                ac_val = s->ac_val[0][0] + s->block_index[i] * 16;
+
+                if(dir[i]){
+                    for(j=1; j<8; j++) 
+                        block[i][block_permute_op(j   )]= ac_val[j+8];
+                }else{
+                    for(j=1; j<8; j++) 
+                        block[i][block_permute_op(j<<3)]= ac_val[j  ];
+                }
+                s->block_last_index[i]= zigzag_last_index[i];
+            }
+        }
+    }
+}
+
 void h263_encode_mb(MpegEncContext * s,
 		    DCTELEM block[6][64],
 		    int motion_x, int motion_y)
@@ -266,18 +490,11 @@ void h263_encode_mb(MpegEncContext * s,
     }
 
     /* encode each block */
-    if (s->h263_pred) {
-	for (i = 0; i < 6; i++) {
-	    mpeg4_encode_block(s, block[i], i);
-	}
-    } else {
-	for (i = 0; i < 6; i++) {
-	    h263_encode_block(s, block[i], i);
-	}
+    for (i = 0; i < 6; i++) {
+        h263_encode_block(s, block[i], i);
     }
 }
 
-
 void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n)
 {
     int x, y, wrap, a, c, pred_dc, scale, i;
@@ -359,69 +576,33 @@ void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n)
         ac_val1[8 + i] = block[block_permute_op(i)];
 }
 
-
-static inline int mid_pred(int a, int b, int c)
-{
-    int vmin, vmax;
-    vmax = vmin = a;
-    if (b < vmin)
-        vmin = b;
-    else
-	vmax = b;
-
-    if (c < vmin)
-        vmin = c;
-    else if (c > vmax)
-        vmax = c;
-
-    return a + b + c - vmin - vmax;
-}
-
 INT16 *h263_pred_motion(MpegEncContext * s, int block, 
                         int *px, int *py)
 {
-    int xy, y, wrap;
+    int xy, wrap;
     INT16 *A, *B, *C, *mot_val;
+    static const int off[4]= {2, 1, 1, -1};
 
-    wrap = 2 * s->mb_width + 2;
-    y = xy = 2 * s->mb_y + 1 + ((block >> 1) & 1); // y
-    xy *= wrap; // y * wrap
-    xy += 2 * s->mb_x + 1 + (block & 1); // x + y * wrap
+    wrap = s->block_wrap[0];
+    xy = s->block_index[block];
 
     mot_val = s->motion_val[xy];
 
     /* special case for first line */
-    if (y == 1 || s->first_slice_line || s->first_gob_line) {
+    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
         A = s->motion_val[xy - 1];
         *px = A[0];
         *py = A[1];
     } else {
-        switch(block) {
-        default:
-        case 0:
-            A = s->motion_val[xy - 1];
-            B = s->motion_val[xy - wrap];
-            C = s->motion_val[xy + 2 - wrap];
-            break;
-        case 1:
-        case 2:
-            A = s->motion_val[xy - 1];
-            B = s->motion_val[xy - wrap];
-            C = s->motion_val[xy + 1 - wrap];
-            break;
-        case 3:
-            A = s->motion_val[xy - 1];
-            B = s->motion_val[xy - 1 - wrap];
-            C = s->motion_val[xy - wrap];
-            break;
-        }
+        A = s->motion_val[xy - 1];
+        B = s->motion_val[xy - wrap];
+        C = s->motion_val[xy + off[block] - wrap];
         *px = mid_pred(A[0], B[0], C[0]);
         *py = mid_pred(A[1], B[1], C[1]);
     }
     return mot_val;
 }
 
-
 static void h263_encode_motion(MpegEncContext * s, int val)
 {
     int range, l, m, bit_size, sign, code, bits;
@@ -501,15 +682,119 @@ static void h263p_encode_umotion(MpegEncContext * s, int val)
     }
 }
 
-void h263_encode_init_vlc(MpegEncContext *s)
+/* Fills the static mv_penalty, fcode_tab and umv_fcode_tab lookup tables.
+ * s is kept for the existing call signature; the tables do not depend on it. */
+static void init_mv_penalty_and_fcode(MpegEncContext *s)
+{
+    int f_code, mv;
+    for(f_code=1; f_code<=MAX_FCODE; f_code++){
+        for(mv=-MAX_MV; mv<=MAX_MV; mv++){
+            int len;   /* value stored as the penalty of mv for this f_code */
+
+            if(mv==0) len= mvtab[0][1];
+            else{
+                int val, bit_size, code;
+                /* was s->f_code - 1, which gave every row the costs of one single f_code */
+                bit_size = f_code - 1;
+
+                val=mv;
+                if (val < 0)
+                    val = -val;
+                val--;
+                code = (val >> bit_size) + 1;
+                if(code<33){
+                    len= mvtab[code][1] + 1 + bit_size;
+                }else{
+                    len= mvtab[32][1] + 2 + bit_size;
+                }
+            }
+
+            mv_penalty[f_code][mv+MAX_MV]= len;
+        }
+    }
+    /* descending f_code: each mv ends up with the smallest f_code whose loop range covers it */
+    for(f_code=MAX_FCODE; f_code>0; f_code--){
+        /* NOTE(review): assumes 16<<MAX_FCODE <= MAX_MV so mv+MAX_MV stays inside fcode_tab - confirm */
+        for(mv=-(16<<f_code); mv<(16<<f_code); mv++){
+            fcode_tab[mv+MAX_MV]= f_code;
+        }
+    }
+
+    for(mv=0; mv<MAX_MV*2+1; mv++){
+        umv_fcode_tab[mv]= 1;
+    }
+}
+
+/* Precomputes the complete DC code for every level in [-255,255]: for luminance and
+ * chrominance the packed code goes to [level+256][0] and its length to [level+256][1]
+ * of uni_DCtab_lum / uni_DCtab_chrom. */
+static void init_uni_dc_tab()
+{
+    int level;
+
+    for(level=-255; level<256; level++){
+        const int idx= level+256;
+        const int mag= abs(level);
+        int size= 0;            /* number of significant bits in |level| */
+        int rest;               /* scratch copy of mag while counting bits */
+        int bits;               /* value appended behind the size prefix */
+        int lum_code, lum_len;
+        int chr_code, chr_len;
+
+        for(rest=mag; rest; rest>>=1)
+            size++;
+
+        /* negative levels: the magnitude with its low size bits inverted */
+        bits= (level < 0) ? (mag ^ ((1 << size) - 1)) : level;
+
+        /* prefix looked up by size, separately for each table */
+        lum_code= DCtab_lum[size][0];
+        lum_len = DCtab_lum[size][1];
+        chr_code= DCtab_chrom[size][0];
+        chr_len = DCtab_chrom[size][1];
+
+        if (size > 0) {
+            /* append the size-bit value behind the prefix */
+            lum_code= (lum_code<<size) | bits;
+            chr_code= (chr_code<<size) | bits;
+            lum_len+= size;
+            chr_len+= size;
+
+            if (size > 8){
+                /* values wider than 8 bits get one more 1 bit appended */
+                lum_code= (lum_code<<1) | 1;
+                chr_code= (chr_code<<1) | 1;
+                lum_len++;
+                chr_len++;
+            }
+        }
+
+        /* the tables hold UINT16, so the values are narrowed on store */
+        uni_DCtab_lum  [idx][0]= lum_code;
+        uni_DCtab_lum  [idx][1]= lum_len;
+        uni_DCtab_chrom[idx][0]= chr_code;
+        uni_DCtab_chrom[idx][1]= chr_len;
+    }
+}
+
+void h263_encode_init(MpegEncContext *s)
 {
     static int done = 0;
 
     if (!done) {
         done = 1;
+
+        init_uni_dc_tab();
+
         init_rl(&rl_inter);
         init_rl(&rl_intra);
+
+        init_mv_penalty_and_fcode(s);
     }
+    s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p
+    
+    // use fcodes >1 only for mpeg4 & h263 & h263p FIXME
+    if(s->h263_plus) s->fcode_tab= umv_fcode_tab;
+    else if(s->h263_pred && !s->h263_msmpeg4) s->fcode_tab= fcode_tab;
 }
 
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n)
@@ -571,11 +856,90 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n)
 
 /***************************************************/
 
+static void mpeg4_stuffing(PutBitContext * pbc)
+{   /* pads the stream: one 0 bit, then 1 bits until get_bit_count() is a multiple of 8 */
+    int length;
+    put_bits(pbc, 1, 0);
+    length= (-get_bit_count(pbc))&7;   /* bits still missing up to the next multiple of 8 */
+    if(length) put_bits(pbc, length, (1<<length)-1);   /* already aligned: no zero-width write */
+}
+
+/* Writes each char of s to pbc as 8 bits, followed by a terminating zero byte. */
+static void put_string(PutBitContext * pbc, char *s)
+{
+    char *p;
+    for(p=s; *p; p++)
+        put_bits(pbc, 8, *p);
+    put_bits(pbc, 8, 0);   /* terminator */
+}
+
+/* Writes the video object and video object layer start codes, the VOL header fields and a
+ * user_data section holding "ffmpeg"; also sets several timing/sprite/rounding fields of s. */
+static void mpeg4_encode_vol_header(MpegEncContext * s)
+{
+    int vo_ver_id=1; //must be 2 if we want GMC or q-pel
+    if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb);   /* no stuffing when nothing has been written yet */
+    put_bits(&s->pb, 16, 0);
+    put_bits(&s->pb, 16, 0x100);        /* video obj */
+    put_bits(&s->pb, 16, 0);
+    put_bits(&s->pb, 16, 0x120);        /* video obj layer */
+
+    put_bits(&s->pb, 1, 0);		/* random access vol */
+    put_bits(&s->pb, 8, 1);		/* video obj type indication= simple obj */
+    put_bits(&s->pb, 1, 1);		/* is obj layer id= yes */
+      put_bits(&s->pb, 4, vo_ver_id);	/* obj layer ver id */
+      put_bits(&s->pb, 3, 1);		/* obj layer priority */
+    if(s->aspect_ratio_info) 
+        put_bits(&s->pb, 4, s->aspect_ratio_info);/* aspect ratio info */
+    else
+        put_bits(&s->pb, 4, 1);		/* aspect ratio info= square pixel */
+    put_bits(&s->pb, 1, 0);		/* vol control parameters= no */
+    put_bits(&s->pb, 2, RECT_SHAPE);	/* vol shape= rectangle */
+    put_bits(&s->pb, 1, 1);		/* marker bit */
+    put_bits(&s->pb, 16, s->time_increment_resolution=30000);   /* also stores 30000 in s */
+    s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;   /* later used as the width of the time increment field */
+    if (s->time_increment_bits < 1)
+        s->time_increment_bits = 1;
+    put_bits(&s->pb, 1, 1);		/* marker bit */
+    put_bits(&s->pb, 1, 0);		/* fixed vop rate=no */
+    put_bits(&s->pb, 1, 1);		/* marker bit */
+    put_bits(&s->pb, 13, s->width);	/* vol width */
+    put_bits(&s->pb, 1, 1);		/* marker bit */
+    put_bits(&s->pb, 13, s->height);	/* vol height */
+    put_bits(&s->pb, 1, 1);		/* marker bit */
+    put_bits(&s->pb, 1, 0);		/* interlace */
+    put_bits(&s->pb, 1, 1);		/* obmc disable */
+    if (vo_ver_id == 1) {
+        put_bits(&s->pb, 1, s->vol_sprite_usage=0);		/* sprite enable */
+    }else{ /* vo_ver_id == 2 */
+        put_bits(&s->pb, 2, s->vol_sprite_usage=0);		/* sprite enable */
+    }
+    put_bits(&s->pb, 1, 0);		/* not 8 bit */
+    put_bits(&s->pb, 1, 0);		/* quant type= h263 style*/
+    if (vo_ver_id != 1)
+        put_bits(&s->pb, 1, s->quarter_sample=0);   /* not reached while vo_ver_id is 1 */
+    put_bits(&s->pb, 1, 1);		/* complexity estimation disable */
+    put_bits(&s->pb, 1, 1);		/* resync marker disable */
+    put_bits(&s->pb, 1, 0);		/* data partitioned */
+    if (vo_ver_id != 1){
+        put_bits(&s->pb, 1, 0);		/* newpred */
+        put_bits(&s->pb, 1, 0);		/* reduced res vop */
+    }
+    put_bits(&s->pb, 1, 0);		/* scalability */
+
+    mpeg4_stuffing(&s->pb);
+    put_bits(&s->pb, 16, 0);
+    put_bits(&s->pb, 16, 0x1B2);	/* user_data */
+    put_string(&s->pb, "ffmpeg"); //FIXME append some version ...
+    s->no_rounding = 0;   /* reset whenever this header is written */
+}
+
 /* write mpeg4 VOP header */
 void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 {
-    align_put_bits(&s->pb);
+    if(s->pict_type==I_TYPE) mpeg4_encode_vol_header(s);
 
+    if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb);
     put_bits(&s->pb, 16, 0);	        /* vop header */
     put_bits(&s->pb, 16, 0x1B6);	/* vop header */
     put_bits(&s->pb, 2, s->pict_type - 1);	/* pict type: I = 0 , P = 1 */
@@ -584,26 +948,41 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     put_bits(&s->pb, 1, 0);
 
     put_bits(&s->pb, 1, 1);	/* marker */
-    put_bits(&s->pb, 4, 1);	/* XXX: correct time increment */
+    put_bits(&s->pb, s->time_increment_bits, 1);	/* XXX: correct time increment */
     put_bits(&s->pb, 1, 1);	/* marker */
     put_bits(&s->pb, 1, 1);	/* vop coded */
-    if (s->pict_type == P_TYPE) {
-        s->no_rounding = 0;
+    if (    s->pict_type == P_TYPE 
+        || (s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE)) {
+        s->no_rounding ^= 1;
 	put_bits(&s->pb, 1, s->no_rounding);	/* rounding type */
     }
     put_bits(&s->pb, 3, 0);	/* intra dc VLC threshold */
+    //FIXME sprite stuff
 
     put_bits(&s->pb, 5, s->qscale);
 
     if (s->pict_type != I_TYPE)
 	put_bits(&s->pb, 3, s->f_code);	/* fcode_for */
+    if (s->pict_type == B_TYPE)
+	put_bits(&s->pb, 3, s->b_code);	/* fcode_back */
     //    printf("****frame %d\n", picture_number);
 }
 
 void h263_dc_scale(MpegEncContext * s)
 {
+#if 1
+    const static UINT8 y_tab[32]={
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        0, 8, 8, 8, 8,10,12,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,40,42,44,46
+    };
+    const static UINT8 c_tab[32]={
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        0, 8, 8, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,20,21,22,23,24,25
+    };
+    s->y_dc_scale = y_tab[s->qscale];
+    s->c_dc_scale = c_tab[s->qscale];
+#else
     int quant;
-
     quant = s->qscale;
     /* luminance */
     if (quant < 5)
@@ -621,36 +1000,30 @@ void h263_dc_scale(MpegEncContext * s)
 	s->c_dc_scale = ((quant + 13) / 2);
     else
 	s->c_dc_scale = (quant - 6);
+#endif
 }
 
-static int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr)
+static inline int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr)
 {
-    int a, b, c, xy, wrap, pred, scale;
+    int a, b, c, wrap, pred, scale;
     UINT16 *dc_val;
+    int dummy;
 
     /* find prediction */
     if (n < 4) {
-	wrap = s->mb_width * 2 + 2;
-	xy = 2 * s->mb_y + 1 + ((n & 2) >> 1);
-        xy *= wrap;
-	xy += 2 * s->mb_x + 1 + (n & 1);
-	dc_val = s->dc_val[0];
 	scale = s->y_dc_scale;
     } else {
-	wrap = s->mb_width + 2;
-	xy = s->mb_y + 1;
-	xy *= wrap;
-	xy += s->mb_x + 1;
-	dc_val = s->dc_val[n - 4 + 1];
 	scale = s->c_dc_scale;
     }
+    wrap= s->block_wrap[n];
+    dc_val = s->dc_val[0] + s->block_index[n];
 
     /* B C
      * A X 
      */
-    a = dc_val[xy - 1];
-    b = dc_val[xy - 1 - wrap];
-    c = dc_val[xy - wrap];
+    a = dc_val[ - 1];
+    b = dc_val[ - 1 - wrap];
+    c = dc_val[ - wrap];
 
     if (abs(a - b) < abs(b - c)) {
 	pred = c;
@@ -660,10 +1033,19 @@ static int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *di
         *dir_ptr = 0; /* left */
     }
     /* we assume pred is positive */
+#ifdef ARCH_X86
+	asm volatile (
+		"xorl %%edx, %%edx	\n\t"
+		"mul %%ecx		\n\t"
+		: "=d" (pred), "=a"(dummy)
+		: "a" (pred + (scale >> 1)), "c" (inverse[scale])
+	);
+#else
     pred = (pred + (scale >> 1)) / scale;
+#endif
 
     /* prepare address for prediction update */
-    *dc_val_ptr = &dc_val[xy];
+    *dc_val_ptr = &dc_val[0];
 
     return pred;
 }
@@ -671,22 +1053,11 @@ static int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *di
 void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n,
                    int dir)
 {
-    int x, y, wrap, i;
+    int i;
     INT16 *ac_val, *ac_val1;
 
     /* find prediction */
-    if (n < 4) {
-	x = 2 * s->mb_x + 1 + (n & 1);
-	y = 2 * s->mb_y + 1 + ((n & 2) >> 1);
-	wrap = s->mb_width * 2 + 2;
-	ac_val = s->ac_val[0][0];
-    } else {
-	x = s->mb_x + 1;
-	y = s->mb_y + 1;
-	wrap = s->mb_width + 2;
-	ac_val = s->ac_val[n - 4 + 1][0];
-    }
-    ac_val += ((y) * wrap + (x)) * 16;
+    ac_val = s->ac_val[0][0] + s->block_index[n] * 16;
     ac_val1 = ac_val;
     if (s->ac_pred) {
         if (dir == 0) {
@@ -697,7 +1068,7 @@ void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n,
             }
         } else {
             /* top prediction */
-            ac_val -= 16 * wrap;
+            ac_val -= 16 * s->block_wrap[n];
             for(i=1;i<8;i++) {
                 block[block_permute_op(i)] += ac_val[i + 8];
             }
@@ -711,20 +1082,43 @@ void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n,
         ac_val1[8 + i] = block[block_permute_op(i)];
 }
 
-static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr)
+static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n,
+                              int dir)
 {
-    int size, v, pred;
-    UINT16 *dc_val;
+    int i;
+    INT16 *ac_val;
 
-    pred = mpeg4_pred_dc(s, n, &dc_val, dir_ptr);
-    if (n < 4) {
-        *dc_val = level * s->y_dc_scale;
+    /* find prediction */
+    ac_val = s->ac_val[0][0] + s->block_index[n] * 16;
+ 
+    if (dir == 0) {
+        /* left prediction */
+        ac_val -= 16;
+        for(i=1;i<8;i++) {
+            block[block_permute_op(i*8)] -= ac_val[i];
+        }
     } else {
-        *dc_val = level * s->c_dc_scale;
+        /* top prediction */
+        ac_val -= 16 * s->block_wrap[n];
+        for(i=1;i<8;i++) {
+            block[block_permute_op(i)] -= ac_val[i + 8];
+        }
     }
+}
 
-    /* do the prediction */
-    level -= pred;
+static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n)
+{
+#if 1
+    level+=256;
+    if (n < 4) {
+	/* luminance */
+	put_bits(&s->pb, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]);
+    } else {
+	/* chrominance */
+	put_bits(&s->pb, uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]);
+    }
+#else
+    int size, v;
     /* find number of bits */
     size = 0;
     v = abs(level);
@@ -749,17 +1143,18 @@ static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n, int *di
 	if (size > 8)
 	    put_bits(&s->pb, 1, 1);
     }
+#endif
 }
 
-static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
+static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, UINT8 *scan_table)
 {
     int level, run, last, i, j, last_index, last_non_zero, sign, slevel;
-    int code, dc_pred_dir;
+    int code;
     const RLTable *rl;
 
     if (s->mb_intra) {
 	/* mpeg4 based DC predictor */
-	mpeg4_encode_dc(s, block[0], n, &dc_pred_dir);
+	mpeg4_encode_dc(s, intra_dc, n);
 	i = 1;
         rl = &rl_intra;
     } else {
@@ -771,7 +1166,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
     last_index = s->block_last_index[n];
     last_non_zero = i - 1;
     for (; i <= last_index; i++) {
-	j = zigzag_direct[i];
+	j = scan_table[i];
 	level = block[j];
 	if (level) {
 	    run = i - last_non_zero - 1;
@@ -839,6 +1234,7 @@ static VLC cbpy_vlc;
 static VLC mv_vlc;
 static VLC dc_lum, dc_chrom;
 static VLC sprite_trajectory;
+static VLC mb_type_b_vlc;
 
 void init_rl(RLTable *rl)
 {
@@ -922,6 +1318,9 @@ void h263_decode_init_vlc(MpegEncContext *s)
         init_vlc(&sprite_trajectory, 9, 15,
                  &sprite_trajectory_tab[0][1], 4, 2,
                  &sprite_trajectory_tab[0][0], 4, 2);
+        init_vlc(&mb_type_b_vlc, 4, 4,
+                 &mb_type_b_tab[0][1], 2, 1,
+                 &mb_type_b_tab[0][0], 2, 1);
     }
 }
 
@@ -950,13 +1349,163 @@ int h263_decode_gob_header(MpegEncContext *s)
             
 }
 
+/* Stores val into the first n entries of tab; does nothing when n <= 0. */
+static inline void memsetw(short *tab, int val, int n)
+{
+    for(; n>0; n--)
+        *tab++ = val;
+}
+
+/* Scans s->gb for the next resync marker, parses the header that follows it and resets the
+ * DC/AC/MV predictors for the current macroblock row. Returns 0 on success (or when the scan is skipped), -1 on failure. */
+static int mpeg4_resync(MpegEncContext *s)
+{
+    int state, v, bits;
+    int mb_num_bits= av_log2(s->mb_num - 1) + 1;   /* width of the macroblock number field read below */
+    int header_extension=0, mb_num;
+    int c_wrap, c_xy, l_wrap, l_xy;
+//printf("resync at %d %d\n", s->mb_x, s->mb_y);
+//printf("%X\n", show_bits(&s->gb, 24));
+
+    /* too close to the end of the buffer: give up quietly (size is presumably in bytes - confirm) */
+    if( get_bits_count(&s->gb) > s->gb.size*8-32)
+        return 0;
+
+    align_get_bits(&s->gb);
+    state = 0xff;
+    /* read byte by byte until the last two bytes read are both zero */
+    for(;;) {
+        v = get_bits(&s->gb, 8);
+//printf("%X ", v);
+        state = ((state << 8) | v) & 0xffff;
+        if (state == 0) break;
+        if( get_bits_count(&s->gb) > s->gb.size*8-32){
+            printf("resync failed\n");
+            return -1;
+        }
+    }
+//printf("%X\n", show_bits(&s->gb, 24));
+    bits=0;
+    while(!get_bits1(&s->gb) && bits<30) bits++;   /* count further 0 bits up to the first 1 bit */
+    if(s->pict_type == P_TYPE && bits != s->f_code-1)
+        printf("marker does not match f_code\n");   /* only reported, parsing continues */
+    //FIXME check bits for B-frames
+//printf("%X\n", show_bits(&s->gb, 24));
+
+    if(s->shape != RECT_SHAPE){
+        header_extension= get_bits1(&s->gb);
+        //FIXME more stuff here
+    }
+
+    mb_num= get_bits(&s->gb, mb_num_bits);
+    if(mb_num != s->mb_x + s->mb_y*s->mb_width){
+        printf("MB-num change not supported %d %d\n", mb_num, s->mb_x + s->mb_y*s->mb_width);
+//        s->mb_x= mb_num % s->mb_width;
+//        s->mb_y= mb_num / s->mb_width;
+        //FIXME many vars are wrong now
+    } 
+
+    if(s->shape != BIN_ONLY_SHAPE){
+        s->qscale= get_bits(&s->gb, 5);
+        h263_dc_scale(s);   /* refresh the dc scales for the new qscale */
+    }
+
+    if(s->shape == RECT_SHAPE){
+        header_extension= get_bits1(&s->gb);
+    }
+    if(header_extension){
+        int time_incr=0;
+        printf("header extension not really supported\n");
+        while (get_bits1(&s->gb) != 0) 
+            time_incr++;   /* number of 1 bits before the terminating 0 bit */
+
+        check_marker(&s->gb, "before time_increment in video packed header");
+        s->time_increment= get_bits(&s->gb, s->time_increment_bits);
+        if(s->pict_type!=B_TYPE){
+            s->time_base+= time_incr;
+            s->last_non_b_time[1]= s->last_non_b_time[0];
+            s->last_non_b_time[0]= s->time_base*s->time_increment_resolution + s->time_increment;
+        }else{
+            s->time= (s->last_non_b_time[1]/s->time_increment_resolution + time_incr)*s->time_increment_resolution;
+            s->time+= s->time_increment;
+        }
+        check_marker(&s->gb, "before vop_coding_type in video packed header");
+        skip_bits(&s->gb, 2); /* vop coding type */
+        //FIXME not rect stuff here
+
+        if(s->shape != BIN_ONLY_SHAPE){
+            skip_bits(&s->gb, 3); /* intra dc vlc threshold */
+
+            if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE && s->num_sprite_warping_points){
+                mpeg4_decode_sprite_trajectory(s);
+            }
+
+            //FIXME reduced res stuff here
+            
+            if (s->pict_type != I_TYPE) {
+                s->f_code = get_bits(&s->gb, 3);	/* fcode_for */
+                if(s->f_code==0){
+                    printf("Error, video packet header damaged or not MPEG4 header (f_code=0)\n");
+                    return -1; // makes no sense to continue, as the MV decoding will break very quickly
+                }
+            }
+            if (s->pict_type == B_TYPE) {
+                s->b_code = get_bits(&s->gb, 3);
+            }       
+        }
+    }
+    //FIXME new-pred stuff
+
+    /* start offsets of the current macroblock row in the luma (two block rows per mb) and chroma arrays */
+    l_wrap= s->block_wrap[0];
+    l_xy= s->mb_y*l_wrap*2;
+    c_wrap= s->block_wrap[4];
+    c_xy= s->mb_y*c_wrap;
+
+    /* clean DC */
+    memsetw(s->dc_val[0] + l_xy, 1024, l_wrap*3);   /* three luma rows' worth of entries from l_xy on */
+    memsetw(s->dc_val[1] + c_xy, 1024, c_wrap*2);
+    memsetw(s->dc_val[2] + c_xy, 1024, c_wrap*2);
+
+    /* clean AC */
+    memset(s->ac_val[0] + l_xy, 0, l_wrap*3*16*sizeof(INT16));
+    memset(s->ac_val[1] + c_xy, 0, c_wrap*2*16*sizeof(INT16));
+    memset(s->ac_val[2] + c_xy, 0, c_wrap*2*16*sizeof(INT16));
+
+    /* clean MV */
+    memset(s->motion_val + l_xy, 0, l_wrap*3*2*sizeof(INT16));
+//    memset(s->motion_val, 0, 2*sizeof(INT16)*(2 + s->mb_width*2)*(2 + s->mb_height*2));
+    s->resync_x_pos= s->mb_x;   /* h263_decode_mb compares against this to clear first_slice_line again */
+    s->first_slice_line=1;
+
+    return 0;
+}
+
 int h263_decode_mb(MpegEncContext *s,
                    DCTELEM block[6][64])
 {
     int cbpc, cbpy, i, cbp, pred_x, pred_y, mx, my, dquant;
     INT16 *mot_val;
     static INT8 quant_tab[4] = { -1, -2, 1, 2 };
-    
+
+    if(s->resync_marker){
+        if(   s->resync_x_pos == s->mb_x+1
+           || s->resync_x_pos == s->mb_x){
+            /* f*ck mpeg4
+               this is here so we dont need to slowdown h263_pred_motion with it */
+            if(s->resync_x_pos == s->mb_x+1 && s->mb_x==0){
+                int xy= s->block_index[0] - s->block_wrap[0];
+                s->motion_val[xy][0]= s->motion_val[xy+2][0];
+                s->motion_val[xy][1]= s->motion_val[xy+2][1];
+            }
+
+            s->first_slice_line=0; 
+            s->resync_x_pos=0; // isnt needed but for cleanness sake ;)
+        }
+
+        if(show_aligned_bits(&s->gb, 1, 16) == 0){
+            if( mpeg4_resync(s) < 0 ) return -1;
+            
+        }
+    }
+
     if (s->pict_type == P_TYPE || s->pict_type==S_TYPE) {
         if (get_bits1(&s->gb)) {
             /* skip mb */
@@ -970,8 +1519,13 @@ int h263_decode_mb(MpegEncContext *s,
 //                int l = (1 << (s->f_code - 1)) * 32;
 
                 s->mcsel=1;
-                s->mv[0][0][0] = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
-                s->mv[0][0][1] = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
+                if(s->divx_version==500 && s->divx_build==413){
+                    s->mv[0][0][0] = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample));
+                    s->mv[0][0][1] = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample));
+                }else{
+                    s->mv[0][0][0] = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
+                    s->mv[0][0][1] = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
+                }
 /*                if (s->mv[0][0][0] < -l) s->mv[0][0][0]= -l;
                 else if (s->mv[0][0][0] >= l) s->mv[0][0][0]= l-1;
                 if (s->mv[0][0][1] < -l) s->mv[0][0][1]= -l;
@@ -997,15 +1551,8 @@ int h263_decode_mb(MpegEncContext *s,
         
         dquant = cbpc & 8;
         s->mb_intra = ((cbpc & 4) != 0);
-    } else {
-        cbpc = get_vlc(&s->gb, &intra_MCBPC_vlc);
-        if (cbpc < 0)
-            return -1;
-        dquant = cbpc & 4;
-        s->mb_intra = 1;
-    }
-
-    if (!s->mb_intra) {
+        if (s->mb_intra) goto intra;
+        
         if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE && (cbpc & 16) == 0)
             s->mcsel= get_bits1(&s->gb);
         else s->mcsel= 0;
@@ -1017,6 +1564,7 @@ int h263_decode_mb(MpegEncContext *s,
                 s->qscale = 1;
             else if (s->qscale > 31)
                 s->qscale = 31;
+            h263_dc_scale(s);
         }
         s->mv_dir = MV_DIR_FORWARD;
         if ((cbpc & 16) == 0) {
@@ -1026,13 +1574,17 @@ int h263_decode_mb(MpegEncContext *s,
             if (s->umvplus_dec)
                mx = h263p_decode_umotion(s, pred_x);
             else if(!s->mcsel)
-               mx = h263_decode_motion(s, pred_x);
+               mx = h263_decode_motion(s, pred_x, s->f_code);
             else {
                const int a= s->sprite_warping_accuracy;
 //        int l = (1 << (s->f_code - 1)) * 32;
-               mx= RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
-//        if (mx < -l) mx= -l;
-//        else if (mx >= l) mx= l-1;
+                if(s->divx_version==500 && s->divx_build==413){
+                    mx = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample));
+                }else{
+                    mx = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
+                }
+//        if (mx < -l) mx= -l, printf("C");
+//        else if (mx >= l) mx= l-1, printf("C");
             }
             if (mx >= 0xffff)
                 return -1;
@@ -1040,13 +1592,17 @@ int h263_decode_mb(MpegEncContext *s,
             if (s->umvplus_dec)
                my = h263p_decode_umotion(s, pred_y);
             else if(!s->mcsel)
-               my = h263_decode_motion(s, pred_y);
+               my = h263_decode_motion(s, pred_y, s->f_code);
             else{
                const int a= s->sprite_warping_accuracy;
 //       int l = (1 << (s->f_code - 1)) * 32;
-               my= RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
-//       if (my < -l) my= -l;
-//       else if (my >= l) my= l-1;
+                if(s->divx_version==500 && s->divx_build==413){
+                    my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample));
+                }else{
+                    my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
+                }
+//       if (my < -l) my= -l, printf("C");
+//       else if (my >= l) my= l-1, printf("C");
             }
             if (my >= 0xffff)
                 return -1;
@@ -1065,14 +1621,14 @@ int h263_decode_mb(MpegEncContext *s,
                 if (s->umvplus_dec)
                   mx = h263p_decode_umotion(s, pred_x);
                 else
-                  mx = h263_decode_motion(s, pred_x);
+                  mx = h263_decode_motion(s, pred_x, s->f_code);
                 if (mx >= 0xffff)
                     return -1;
                 
                 if (s->umvplus_dec)
                   my = h263p_decode_umotion(s, pred_y);
                 else    
-                  my = h263_decode_motion(s, pred_y);
+                  my = h263_decode_motion(s, pred_y, s->f_code);
                 if (my >= 0xffff)
                     return -1;
                 s->mv[0][i][0] = mx;
@@ -1083,7 +1639,126 @@ int h263_decode_mb(MpegEncContext *s,
                 mot_val[1] = my;
             }
         }
-    } else {
+    } else if(s->pict_type==B_TYPE) {
+        int modb1; // first bit of modb
+        int modb2; // second bit of modb
+        int mb_type;
+        int time_pp;
+        int time_pb;
+        int xy;
+
+        s->mb_intra = 0; //B-frames never contain intra blocks
+        s->mcsel=0;      //     ...               true gmc blocks
+
+        if(s->mb_x==0){
+            s->last_mv[0][0][0]= 
+            s->last_mv[0][0][1]= 
+            s->last_mv[1][0][0]= 
+            s->last_mv[1][0][1]= 0;
+        }
+
+        /* if we skipped it in the future P Frame then skip it now too */
+        s->mb_skiped= s->mbskip_table[s->mb_y * s->mb_width + s->mb_x]; // Note, skiptab=0 if last was GMC
+
+        if(s->mb_skiped){
+                /* skip mb */
+            for(i=0;i<6;i++)
+                s->block_last_index[i] = -1;
+
+            s->mv_dir = MV_DIR_FORWARD;
+            s->mv_type = MV_TYPE_16X16;
+            s->mv[0][0][0] = 0;
+            s->mv[0][0][1] = 0;
+            s->mv[1][0][0] = 0;
+            s->mv[1][0][1] = 0;
+//FIXME is this correct?
+/*            s->last_mv[0][0][0]=
+            s->last_mv[0][0][1]=0;*/
+            s->mb_skiped = 1;
+            return 0;
+        }
+
+        modb1= get_bits1(&s->gb);
+        if(modb1==0){
+            modb2= get_bits1(&s->gb);
+            mb_type= get_vlc(&s->gb, &mb_type_b_vlc);
+            if(modb2==0) cbp= get_bits(&s->gb, 6);
+            else cbp=0;
+            if (mb_type && cbp) {
+                if(get_bits1(&s->gb)){
+                    s->qscale +=get_bits1(&s->gb)*4 - 2;
+                    if (s->qscale < 1)
+                        s->qscale = 1;
+                    else if (s->qscale > 31)
+                        s->qscale = 31;
+                    h263_dc_scale(s);
+                }
+            }
+        }else{
+            mb_type=4; //like 0 but no vectors coded
+            cbp=0;
+        }
+        s->mv_type = MV_TYPE_16X16; // we'll switch to 8x8 only if the last P frame had 8x8 for this MB and mb_type=0 here
+        mx=my=0; //for case 4, we could put this to the mb_type=4 but then gcc complains about uninitialized mx/my
+        switch(mb_type)
+        {
+        case 0: 
+            mx = h263_decode_motion(s, 0, 1);
+            my = h263_decode_motion(s, 0, 1);
+        case 4: 
+            s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+            xy= s->block_index[0];
+            time_pp= s->last_non_b_time[0] - s->last_non_b_time[1];
+            time_pb= s->time - s->last_non_b_time[1];
+//if(time_pp>3000 )printf("%d %d  ", time_pp, time_pb);
+            //FIXME 4MV
+            //FIXME avoid divides
+            s->mv[0][0][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
+            s->mv[0][0][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
+            s->mv[1][0][0] = mx ? s->mv[0][0][0] - s->motion_val[xy][0]
+                                : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp + mx;
+            s->mv[1][0][1] = my ? s->mv[0][0][1] - s->motion_val[xy][1] 
+                                : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp + my;
+/*            s->mv[0][0][0] = 
+            s->mv[0][0][1] = 
+            s->mv[1][0][0] = 
+            s->mv[1][0][1] = 1000;*/
+            break;
+        case 1: 
+            s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
+            mx = h263_decode_motion(s, s->last_mv[0][0][0], s->f_code);
+            my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
+            s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
+            s->last_mv[0][0][1]= s->mv[0][0][1] = my;
+
+            mx = h263_decode_motion(s, s->last_mv[1][0][0], s->b_code);
+            my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
+            s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
+            s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+            break;
+        case 2: 
+            s->mv_dir = MV_DIR_BACKWARD;
+            mx = h263_decode_motion(s, s->last_mv[1][0][0], s->b_code);
+            my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
+            s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
+            s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+            break;
+        case 3:
+            s->mv_dir = MV_DIR_FORWARD;
+            mx = h263_decode_motion(s, s->last_mv[0][0][0], s->f_code);
+            my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
+            s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
+            s->last_mv[0][0][1]= s->mv[0][0][1] = my;
+            break;
+        default: return -1;
+        }
+    } else { /* I-Frame */
+        cbpc = get_vlc(&s->gb, &intra_MCBPC_vlc);
+        if (cbpc < 0)
+            return -1;
+        dquant = cbpc & 4;
+        s->mb_intra = 1;
+intra:
         s->ac_pred = 0;
         if (s->h263_pred || s->h263_aic) {
             s->ac_pred = get_bits1(&s->gb);
@@ -1102,6 +1777,7 @@ int h263_decode_mb(MpegEncContext *s,
                 s->qscale = 1;
             else if (s->qscale > 31)
                 s->qscale = 31;
+            h263_dc_scale(s);
         }
     }
 
@@ -1120,7 +1796,7 @@ int h263_decode_mb(MpegEncContext *s,
     return 0;
 }
 
-static int h263_decode_motion(MpegEncContext * s, int pred)
+static int h263_decode_motion(MpegEncContext * s, int pred, int f_code)
 {
     int code, val, sign, shift, l, m;
 
@@ -1131,7 +1807,7 @@ static int h263_decode_motion(MpegEncContext * s, int pred)
     if (code == 0)
         return pred;
     sign = get_bits1(&s->gb);
-    shift = s->f_code - 1;
+    shift = f_code - 1;
     val = (code - 1) << shift;
     if (shift > 0)
         val |= get_bits(&s->gb, shift);
@@ -1142,7 +1818,7 @@ static int h263_decode_motion(MpegEncContext * s, int pred)
     
     /* modulo decoding */
     if (!s->h263_long_vectors) {
-        l = (1 << (s->f_code - 1)) * 32;
+        l = (1 << (f_code - 1)) * 32;
         m = 2 * l;
         if (val < -l) {
             val += m;
@@ -1269,7 +1945,7 @@ static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
 not_coded:    
     if (s->mb_intra && s->h263_aic) {
         h263_pred_acdc(s, block, n);
-        i = 64;
+        i = 63;
     }
     s->block_last_index[n] = i;
     return 0;
@@ -1577,21 +2253,21 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
     h2= 1<<beta;
 
 // Note, the 4th point isnt used for GMC
-/*
-    sprite_ref[0][0]= (a>>1)*(2*vop_ref[0][0] + d[0][0]);
-    sprite_ref[0][1]= (a>>1)*(2*vop_ref[0][1] + d[0][1]);
-    sprite_ref[1][0]= (a>>1)*(2*vop_ref[1][0] + d[0][0] + d[1][0]);
-    sprite_ref[1][1]= (a>>1)*(2*vop_ref[1][1] + d[0][1] + d[1][1]);
-    sprite_ref[2][0]= (a>>1)*(2*vop_ref[2][0] + d[0][0] + d[2][0]);
-    sprite_ref[2][1]= (a>>1)*(2*vop_ref[2][1] + d[0][1] + d[2][1]);
-*/
-//FIXME DIVX5 vs. mpeg4 ?
-    sprite_ref[0][0]= a*vop_ref[0][0] + d[0][0];
-    sprite_ref[0][1]= a*vop_ref[0][1] + d[0][1];
-    sprite_ref[1][0]= a*vop_ref[1][0] + d[0][0] + d[1][0];
-    sprite_ref[1][1]= a*vop_ref[1][1] + d[0][1] + d[1][1];
-    sprite_ref[2][0]= a*vop_ref[2][0] + d[0][0] + d[2][0];
-    sprite_ref[2][1]= a*vop_ref[2][1] + d[0][1] + d[2][1];
+    if(s->divx_version==500 && s->divx_build==413){
+        sprite_ref[0][0]= a*vop_ref[0][0] + d[0][0];
+        sprite_ref[0][1]= a*vop_ref[0][1] + d[0][1];
+        sprite_ref[1][0]= a*vop_ref[1][0] + d[0][0] + d[1][0];
+        sprite_ref[1][1]= a*vop_ref[1][1] + d[0][1] + d[1][1];
+        sprite_ref[2][0]= a*vop_ref[2][0] + d[0][0] + d[2][0];
+        sprite_ref[2][1]= a*vop_ref[2][1] + d[0][1] + d[2][1];
+    } else {
+        sprite_ref[0][0]= (a>>1)*(2*vop_ref[0][0] + d[0][0]);
+        sprite_ref[0][1]= (a>>1)*(2*vop_ref[0][1] + d[0][1]);
+        sprite_ref[1][0]= (a>>1)*(2*vop_ref[1][0] + d[0][0] + d[1][0]);
+        sprite_ref[1][1]= (a>>1)*(2*vop_ref[1][1] + d[0][1] + d[1][1]);
+        sprite_ref[2][0]= (a>>1)*(2*vop_ref[2][0] + d[0][0] + d[2][0]);
+        sprite_ref[2][1]= (a>>1)*(2*vop_ref[2][1] + d[0][1] + d[2][1]);
+    }
 /*    sprite_ref[3][0]= (a>>1)*(2*vop_ref[3][0] + d[0][0] + d[1][0] + d[2][0] + d[3][0]);
     sprite_ref[3][1]= (a>>1)*(2*vop_ref[3][1] + d[0][1] + d[1][1] + d[2][1] + d[3][1]); */
     
@@ -1715,7 +2391,7 @@ printf("%d %d\n", s->sprite_delta[1][1][1], a<<s->sprite_shift[1][1]);*/
     else
         s->real_sprite_warping_points= s->num_sprite_warping_points;
 
-//FIXME convert stuff if accurace != 3
+//printf("%d %d %d %d\n", d[0][0], d[0][1], s->sprite_offset[0][0], s->sprite_offset[0][1]);
 }
 
 /* decode mpeg4 VOP header */
@@ -1735,13 +2411,14 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             break;
         }
         state = ((state << 8) | v) & 0xffffff;
-        /* XXX: really detect end of frame */
-        if (state == 0)
+        if( get_bits_count(&s->gb) > s->gb.size*8-32){
+            printf("no VOP startcode found\n");
             return -1;
+        }
     }
 //printf("startcode %X %d\n", startcode, get_bits_count(&s->gb));
     if (startcode == 0x120) { // Video Object Layer
-        int time_increment_resolution, width, height, vo_ver_id;
+        int width, height, vo_ver_id;
 
         /* vol header */
         skip_bits(&s->gb, 1); /* random access */
@@ -1758,11 +2435,13 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             skip_bits(&s->gb, 8); //par_width
             skip_bits(&s->gb, 8); // par_height
         }
+
         if(get_bits1(&s->gb)){ /* vol control parameter */
             printf("vol control parameter not supported\n");
             return -1;   
         }
         s->shape = get_bits(&s->gb, 2); /* vol shape */
+        if(s->shape != RECT_SHAPE) printf("only rectangular vol supported\n");
         if(s->shape == GRAY_SHAPE && vo_ver_id != 1){
             printf("Gray shape not supported\n");
             skip_bits(&s->gb, 4);  //video_object_layer_shape_extension
@@ -1770,8 +2449,8 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
 
         skip_bits1(&s->gb);   /* marker */
         
-        time_increment_resolution = get_bits(&s->gb, 16);
-        s->time_increment_bits = av_log2(time_increment_resolution - 1) + 1;
+        s->time_increment_resolution = get_bits(&s->gb, 16);
+        s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
         if (s->time_increment_bits < 1)
             s->time_increment_bits = 1;
         skip_bits1(&s->gb);   /* marker */
@@ -1787,9 +2466,14 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
                 skip_bits1(&s->gb);   /* marker */
                 height = get_bits(&s->gb, 13);
                 skip_bits1(&s->gb);   /* marker */
+                if(width && height){ /* they should be non zero but who knows ... */
+                    s->width = width;
+                    s->height = height;
+//                    printf("%d %d\n", width, height);
+                }
             }
             
-            skip_bits1(&s->gb);   /* interlaced */
+            if(get_bits1(&s->gb)) printf("interlaced not supported\n");   /* interlaced */
             if(!get_bits1(&s->gb)) printf("OBMC not supported\n");   /* OBMC Disable */
             if (vo_ver_id == 1) {
                 s->vol_sprite_usage = get_bits1(&s->gb); /* vol_sprite_usage */
@@ -1818,7 +2502,8 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             
             if (get_bits1(&s->gb) == 1) {   /* not_8_bit */
                 s->quant_precision = get_bits(&s->gb, 4); /* quant_precision */
-                skip_bits(&s->gb, 4); /* bits_per_pixel */
+                if(get_bits(&s->gb, 4)!=8) printf("N-bit not supported\n"); /* bits_per_pixel */
+                if(s->quant_precision!=5) printf("quant precission %d\n", s->quant_precision);
             } else {
                 s->quant_precision = 5;
             }
@@ -1828,13 +2513,11 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             if(vo_ver_id != 1)
                  s->quarter_sample= get_bits1(&s->gb);
             else s->quarter_sample=0;
-#if 0
-            if(get_bits1(&s->gb)) printf("Complexity est disabled\n");
-            if(get_bits1(&s->gb)) printf("resync disable\n");
-#else
-            skip_bits1(&s->gb);   /* complexity_estimation_disabled */
-            skip_bits1(&s->gb);   /* resync_marker_disabled */
-#endif
+
+            if(!get_bits1(&s->gb)) printf("Complexity estimation not supported\n");
+
+            s->resync_marker= !get_bits1(&s->gb); /* resync_marker_disabled */
+
             s->data_partioning= get_bits1(&s->gb);
             if(s->data_partioning){
                 printf("data partitioning not supported\n");
@@ -1858,8 +2541,7 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
 
             s->scalability= get_bits1(&s->gb);
             if (s->scalability) {
-                printf("bad scalability!!!\n");
-                return -1;
+                printf("scalability not supported\n");
             }
         }
 //printf("end Data %X %d\n", show_bits(&s->gb, 32), get_bits_count(&s->gb)&0x7);
@@ -1899,24 +2581,34 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
     }
 
     s->pict_type = get_bits(&s->gb, 2) + 1;	/* pict type: I = 0 , P = 1 */
-    if(s->pict_type == B_TYPE)
-    {
-        printf("B-VOP\n");
-	return -1;
-    }
- 
-    /* XXX: parse time base */
-    time_incr = 0;
+//printf("pic: %d, qpel:%d\n", s->pict_type, s->quarter_sample); 
+    time_incr=0;
     while (get_bits1(&s->gb) != 0) 
         time_incr++;
 
-    skip_bits1(&s->gb);   	/* marker */
-    skip_bits(&s->gb, s->time_increment_bits);
-    skip_bits1(&s->gb);   	/* marker */
+    check_marker(&s->gb, "before time_increment");
+    s->time_increment= get_bits(&s->gb, s->time_increment_bits);
+    if(s->pict_type!=B_TYPE){
+        s->time_base+= time_incr;
+        s->last_non_b_time[1]= s->last_non_b_time[0];
+        s->last_non_b_time[0]= s->time_base*s->time_increment_resolution + s->time_increment;
+    }else{
+        s->time= (s->last_non_b_time[1]/s->time_increment_resolution + time_incr)*s->time_increment_resolution;
+        s->time+= s->time_increment;
+    }
+
+    if(check_marker(&s->gb, "before vop_coded")==0 && s->picture_number==0){
+        printf("hmm, seems the headers arnt complete, trying to guess time_increment_bits\n");
+        for(s->time_increment_bits++ ;s->time_increment_bits<16; s->time_increment_bits++){
+            if(get_bits1(&s->gb)) break;
+        }
+        printf("my guess is %d bits ;)\n",s->time_increment_bits);
+    }
     /* vop coded */
     if (get_bits1(&s->gb) != 1)
         goto redo;
-    
+//printf("time %d %d %d || %d %d %d\n", s->time_increment_bits, s->time_increment, s->time_base,
+//s->time, s->last_non_b_time[0], s->last_non_b_time[1]);  
     if (s->shape != BIN_ONLY_SHAPE && ( s->pict_type == P_TYPE
                           || (s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE))) {
         /* rounding type for motion estimation */
@@ -1947,7 +2639,9 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
 //FIXME complexity estimation stuff
      
      if (s->shape != BIN_ONLY_SHAPE) {
-         skip_bits(&s->gb, 3); /* intra dc VLC threshold */
+         int t;
+         t=get_bits(&s->gb, 3); /* intra dc VLC threshold */
+//printf("threshold %d\n", t);
          //FIXME interlaced specific bits
      }
 
@@ -1964,12 +2658,21 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             MPEG4 vol header as it is found on some old opendivx
             movies */
          s->qscale = get_bits(&s->gb, 5);
+         if(s->qscale==0){
+             printf("Error, header damaged or not MPEG4 header (qscale=0)\n");
+             return -1; // makes no sense to continue, as there is nothing left from the image then
+         }
   
          if (s->pict_type != I_TYPE) {
              s->f_code = get_bits(&s->gb, 3);	/* fcode_for */
+             if(s->f_code==0){
+                 printf("Error, header damaged or not MPEG4 header (f_code=0)\n");
+                 return -1; // makes no sense to continue, as the MV decoding will break very quickly
+             }
          }
          if (s->pict_type == B_TYPE) {
              s->b_code = get_bits(&s->gb, 3);
+//printf("b-code %d\n", s->b_code);
          }
 //printf("quant:%d fcode:%d\n", s->qscale, s->f_code);
          if(!s->scalability){
@@ -1978,7 +2681,6 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
              }
          }
      }
-//printf("end Data %X %d\n", show_bits(&s->gb, 32), get_bits_count(&s->gb)&0x7);
      s->picture_number++; // better than pic number==0 allways ;)
      return 0;
 }
diff --git a/src/libffmpeg/libavcodec/h263data.h b/src/libffmpeg/libavcodec/h263data.h
index 88e456ba2..a129fd6bf 100644
--- a/src/libffmpeg/libavcodec/h263data.h
+++ b/src/libffmpeg/libavcodec/h263data.h
@@ -40,13 +40,13 @@ static const UINT8 inter_MCBPC_bits[20] = {
     3, 7, 7, 8,
 };*/
 
-static const UINT8 cbpy_tab[16][2] =
+const UINT8 cbpy_tab[16][2] =
 {
   {3,4}, {5,5}, {4,5}, {9,4}, {3,5}, {7,4}, {2,6}, {11,4},
   {2,5}, {3,6}, {5,4}, {10,4}, {4,4}, {8,4}, {6,4}, {3,2}
 };
 
-static const UINT8 mvtab[33][2] =
+const UINT8 mvtab[33][2] =
 {
   {1,1}, {1,2}, {1,3}, {1,4}, {3,6}, {5,7}, {4,7}, {3,7},
   {11,9}, {10,9}, {9,9}, {17,10}, {16,10}, {15,10}, {14,10}, {13,10},
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 3733ed565..e909ac56e 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -47,10 +47,22 @@ static int h263_decode_init(AVCodecContext *avctx)
     case CODEC_ID_MPEG4:
         s->time_increment_bits = 4; /* default value for broken headers */
         s->h263_pred = 1;
+        s->has_b_frames = 1;
         break;
-    case CODEC_ID_MSMPEG4:
+    case CODEC_ID_MSMPEG4V1:
         s->h263_msmpeg4 = 1;
         s->h263_pred = 1;
+        s->msmpeg4_version=1;
+        break;
+    case CODEC_ID_MSMPEG4V2:
+        s->h263_msmpeg4 = 1;
+        s->h263_pred = 1;
+        s->msmpeg4_version=2;
+        break;
+    case CODEC_ID_MSMPEG4V3:
+        s->h263_msmpeg4 = 1;
+        s->h263_pred = 1;
+        s->msmpeg4_version=3;
         break;
     case CODEC_ID_H263I:
         s->h263_intel = 1;
@@ -60,7 +72,7 @@ static int h263_decode_init(AVCodecContext *avctx)
     }
 
     /* for h263, we allocate the images after having read the header */
-    if (avctx->codec->id != CODEC_ID_H263)
+    if (avctx->codec->id != CODEC_ID_H263 && avctx->codec->id != CODEC_ID_MPEG4)
         if (MPV_common_init(s) < 0)
             return -1;
 
@@ -115,22 +127,25 @@ static int h263_decode_frame(AVCodecContext *avctx,
         ret = intel_h263_decode_picture_header(s);
     } else {
         ret = h263_decode_picture_header(s);
-        /* After H263 header decode we have the height, width,       */
+    }
+
+        /* After H263 & mpeg4 header decode we have the height, width,*/
         /* and other parameters. So then we could init the picture   */
         /* FIXME: By the way H263 decoder is evolving it should have */
         /* an H263EncContext                                         */
-        if (!s->context_initialized) {
-            avctx->width = s->width;
-            avctx->height = s->height;
-            if (MPV_common_init(s) < 0)
-                return -1;
-        } else if (s->width != avctx->width || s->height != avctx->height) {
-            /* H.263 could change picture size any time */
-            MPV_common_end(s);
-            if (MPV_common_init(s) < 0)
-                return -1;
-        }
+    if (!s->context_initialized) {
+        avctx->width = s->width;
+        avctx->height = s->height;
+        avctx->aspect_ratio_info= s->aspect_ratio_info;
+        if (MPV_common_init(s) < 0)
+            return -1;
+    } else if (s->width != avctx->width || s->height != avctx->height) {
+        /* H.263 could change picture size any time */
+        MPV_common_end(s);
+        if (MPV_common_init(s) < 0)
+            return -1;
     }
+
     if (ret < 0)
         return -1;
 
@@ -141,6 +156,12 @@ static int h263_decode_frame(AVCodecContext *avctx,
 #endif
 
     /* decode each macroblock */
+    s->block_wrap[0]=
+    s->block_wrap[1]=
+    s->block_wrap[2]=
+    s->block_wrap[3]= s->mb_width*2 + 2;
+    s->block_wrap[4]=
+    s->block_wrap[5]= s->mb_width + 2;
     for(s->mb_y=0; s->mb_y < s->mb_height; s->mb_y++) {
         /* Check for GOB headers on H.263 */
         /* FIXME: In the future H.263+ will have intra prediction */
@@ -148,7 +169,20 @@ static int h263_decode_frame(AVCodecContext *avctx,
         if (s->mb_y && !s->h263_pred) {
             s->first_gob_line = h263_decode_gob_header(s);
         }
+
+        s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1;
+        s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1);
+        s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1;
+        s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2);
+        s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2);
+        s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
         for(s->mb_x=0; s->mb_x < s->mb_width; s->mb_x++) {
+            s->block_index[0]+=2;
+            s->block_index[1]+=2;
+            s->block_index[2]+=2;
+            s->block_index[3]+=2;
+            s->block_index[4]++;
+            s->block_index[5]++;
 #ifdef DEBUG
             printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 #endif
@@ -163,28 +197,8 @@ static int h263_decode_frame(AVCodecContext *avctx,
                 s->y_dc_scale = 8;
                 s->c_dc_scale = 8;
             }
-
-#ifdef HAVE_MMX
-            if (mm_flags & MM_MMX) {
-                asm volatile(
-			"pxor %%mm7, %%mm7		\n\t"
-			"movl $-128*6, %%eax		\n\t"
-			"1:				\n\t"
-			"movq %%mm7, (%0, %%eax)	\n\t"
-			"movq %%mm7, 8(%0, %%eax)	\n\t"
-			"movq %%mm7, 16(%0, %%eax)	\n\t"
-			"movq %%mm7, 24(%0, %%eax)	\n\t"
-			"addl $32, %%eax		\n\t"
-			" js 1b				\n\t"
-			: : "r" (((int)s->block)+128*6)
-			: "%eax"
-                );
-            }else{
-                memset(s->block, 0, sizeof(s->block));
-            }
-#else
-            memset(s->block, 0, sizeof(s->block));
-#endif
+            clear_blocks(s->block[0]);
+            
             s->mv_dir = MV_DIR_FORWARD;
             s->mv_type = MV_TYPE_16X16; 
             if (s->h263_msmpeg4) {
@@ -208,9 +222,15 @@ static int h263_decode_frame(AVCodecContext *avctx,
             if (h > 16)
                 h = 16;
             offset = y * s->linesize;
-            src_ptr[0] = s->current_picture[0] + offset;
-            src_ptr[1] = s->current_picture[1] + (offset >> 2);
-            src_ptr[2] = s->current_picture[2] + (offset >> 2);
+            if(s->pict_type==B_TYPE || (!s->has_b_frames)){
+                src_ptr[0] = s->current_picture[0] + offset;
+                src_ptr[1] = s->current_picture[1] + (offset >> 2);
+                src_ptr[2] = s->current_picture[2] + (offset >> 2);
+            } else {
+                src_ptr[0] = s->last_picture[0] + offset;
+                src_ptr[1] = s->last_picture[1] + (offset >> 2);
+                src_ptr[2] = s->last_picture[2] + (offset >> 2);
+            }
             avctx->draw_horiz_band(avctx, src_ptr, s->linesize,
                                    y, s->width, h);
         }
@@ -221,9 +241,15 @@ static int h263_decode_frame(AVCodecContext *avctx,
 
     MPV_frame_end(s);
     
-    pict->data[0] = s->current_picture[0];
-    pict->data[1] = s->current_picture[1];
-    pict->data[2] = s->current_picture[2];
+    if(s->pict_type==B_TYPE || (!s->has_b_frames)){
+        pict->data[0] = s->current_picture[0];
+        pict->data[1] = s->current_picture[1];
+        pict->data[2] = s->current_picture[2];
+    } else {
+        pict->data[0] = s->last_picture[0];
+        pict->data[1] = s->last_picture[1];
+        pict->data[2] = s->last_picture[2];
+    }
     pict->linesize[0] = s->linesize;
     pict->linesize[1] = s->linesize / 2;
     pict->linesize[2] = s->linesize / 2;
@@ -262,10 +288,34 @@ AVCodec h263_decoder = {
     CODEC_CAP_DRAW_HORIZ_BAND,
 };
 
-AVCodec msmpeg4_decoder = {
+AVCodec msmpeg4v1_decoder = {
+    "msmpeg4v1",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_MSMPEG4V1,
+    sizeof(MpegEncContext),
+    h263_decode_init,
+    NULL,
+    h263_decode_end,
+    h263_decode_frame,
+    CODEC_CAP_DRAW_HORIZ_BAND,
+};
+
+AVCodec msmpeg4v2_decoder = {
+    "msmpeg4v2",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_MSMPEG4V2,
+    sizeof(MpegEncContext),
+    h263_decode_init,
+    NULL,
+    h263_decode_end,
+    h263_decode_frame,
+    CODEC_CAP_DRAW_HORIZ_BAND,
+};
+
+AVCodec msmpeg4v3_decoder = {
     "msmpeg4",
     CODEC_TYPE_VIDEO,
-    CODEC_ID_MSMPEG4,
+    CODEC_ID_MSMPEG4V3,
     sizeof(MpegEncContext),
     h263_decode_init,
     NULL,
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 37716a983..2c71850ee 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -25,22 +25,58 @@
 
 int mm_flags; /* multimedia extension flags */
 
-int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
+int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+
+int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+
+int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
+
+int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
+
 
 /* external functions, from idct_mmx.c */
 void ff_mmx_idct(DCTELEM *block);
 void ff_mmxext_idct(DCTELEM *block);
 
 /* pixel operations */
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
-static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
+static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
+static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
 
+#define JUMPALIGN() __asm __volatile (".balign 8"::)
+#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
+
+#ifndef PIC
+#define MOVQ_WONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
+#define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
+#else
+// for a shared library it's better to generate the constants this way
+// pcmpeqd -> -1
+#define MOVQ_WONE(regd) \
+    __asm __volatile ( \
+       "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+       "psrlw $15, %%" #regd ::)
+
+#define MOVQ_WTWO(regd) \
+    __asm __volatile ( \
+       "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+       "psrlw $15, %%" #regd " \n\t" \
+       "psllw $1, %%" #regd ::)
+#endif
+
 /***********************************/
 /* 3Dnow specific */
 
@@ -78,7 +114,7 @@ static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
     /* read the pixels */
     p = block;
     pix = pixels;
-    __asm __volatile("pxor %%mm7, %%mm7":);
+    MOVQ_ZERO(mm7);
     for(i=0;i<4;i++) {
 	__asm __volatile(
 		"movq	%1, %%mm0\n\t"
@@ -105,12 +141,11 @@ static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
 {
     const DCTELEM *p;
     UINT8 *pix;
-    int i;
 
     /* read the pixels */
     p = block;
     pix = pixels;
-    for(i=0;i<2;i++) {
+    /* unrolled loop */
 	__asm __volatile(
 		"movq	%3, %%mm0\n\t"
 		"movq	8%3, %%mm1\n\t"
@@ -132,7 +167,29 @@ static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
 		:"memory");
         pix += line_size*4;
         p += 32;
-    }
+
+    // if an exact copy of the code above were placed here, the
+    // compiler would generate some very strange code,
+    // thus using "r"
+    __asm __volatile(
+	    "movq	(%3), %%mm0\n\t"
+	    "movq	8(%3), %%mm1\n\t"
+	    "movq	16(%3), %%mm2\n\t"
+	    "movq	24(%3), %%mm3\n\t"
+	    "movq	32(%3), %%mm4\n\t"
+	    "movq	40(%3), %%mm5\n\t"
+	    "movq	48(%3), %%mm6\n\t"
+	    "movq	56(%3), %%mm7\n\t"
+	    "packuswb %%mm1, %%mm0\n\t"
+	    "packuswb %%mm3, %%mm2\n\t"
+	    "packuswb %%mm5, %%mm4\n\t"
+	    "packuswb %%mm7, %%mm6\n\t"
+	    "movq	%%mm0, (%0)\n\t"
+	    "movq	%%mm2, (%0, %1)\n\t"
+	    "movq	%%mm4, (%0, %1, 2)\n\t"
+	    "movq	%%mm6, (%0, %2)\n\t"
+	    ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
+	    :"memory");
 }
 
 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
@@ -144,8 +201,9 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
     /* read the pixels */
     p = block;
     pix = pixels;
-	__asm __volatile("pxor	%%mm7, %%mm7":);
-    for(i=0;i<4;i++) {
+    MOVQ_ZERO(mm7);
+    i = 4;
+    while (i) {
 	__asm __volatile(
 		"movq	%2, %%mm0\n\t"
 		"movq	8%2, %%mm1\n\t"
@@ -172,19 +230,47 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
 		:"memory");
         pix += line_size*2;
         p += 16;
-    }
+        i--;
+    };
 }
 
 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-    int dh, hh;
+    int hh;
     UINT8 *p;
     const UINT8 *pix;
+
     p   = block;
-    pix = pixels;
+    pix = pixels; // 2s
+#if 0
+    do {
+      __asm __volatile(
+	"movq	%1, %%mm0\n\t"
+	"movq	%%mm0, %0\n\t"
+	:"=m"(*p)
+	:"m"(*pix)
+	:"memory");
+	pix += line_size;
+	p += line_size;
+    } while (--h);
+#else
+    // This optimized code is not very useful:
+    // the loop above is definitely faster,
+    // at least on a Celeron 500MHz.
+    hh = h & 3;
+    while (hh) {
+      __asm __volatile(
+	  "movq	%1, %%mm0\n\t"
+	  "movq	%%mm0, %0\n\t"
+	  :"=m"(*p)
+	  :"m"(*pix)
+	  :"memory");
+	pix += line_size;
+	p += line_size;
+	hh--;
+    }
     hh=h>>2;
-    dh=h&3;
-    while(hh--) {
+    while (hh) {
     __asm __volatile(
 	"movq	(%1), %%mm0		\n\t"
 	"movq	(%1, %2), %%mm1		\n\t"
@@ -196,19 +282,11 @@ static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int
 	"movq	%%mm3, (%0, %3)		\n\t"
 	::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
 	:"memory");
-        pix = pix + line_size*4;
-        p =   p   + line_size*4;
-    }
-    while(dh--) {
-     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-        pix = pix + line_size;
-        p =   p   + line_size;
+        pix += line_size*4;
+	p += line_size*4;
+        hh--;
     }
+#endif
 }
 
 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
@@ -217,10 +295,9 @@ static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size,
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm4\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm4);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -252,10 +329,9 @@ static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size,
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm4\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm4);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -288,11 +364,10 @@ static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size,
   UINT8 *p;
   const UINT8 *pix;
   p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo));
+  pix = pixels; // 1s
+  MOVQ_ZERO(mm7);
+  MOVQ_WTWO(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -338,7 +413,7 @@ static void   put_no_rnd_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7\n\t":);
+  MOVQ_ZERO(mm7);
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -369,7 +444,8 @@ static void put_no_rnd_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int li
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7\n\t":);
+  MOVQ_ZERO(mm7);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -401,10 +477,9 @@ static void   put_no_rnd_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -450,10 +525,9 @@ static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%0, %%mm0\n\t"
@@ -487,10 +561,9 @@ static void   avg_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_si
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm1\n\t"
@@ -533,10 +606,9 @@ static void   avg_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_si
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm1\n\t"
@@ -579,10 +651,10 @@ static void   avg_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_s
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo));
+  MOVQ_ZERO(mm7);
+  // this doesn't seem to be used often, so
+  // its internal use of mm_wone is not optimized
+  MOVQ_WTWO(mm6);
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -639,7 +711,7 @@ static void avg_no_rnd_pixels_mmx( UINT8  *block, const UINT8 *pixels, int line_
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7\n\t":);
+  MOVQ_ZERO(mm7);
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -670,8 +742,7 @@ static void   avg_no_rnd_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t":);
+  MOVQ_ZERO(mm7);
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -710,8 +781,7 @@ static void   avg_no_rnd_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t":);
+  MOVQ_ZERO(mm7);
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -750,10 +820,9 @@ static void   avg_no_rnd_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -807,7 +876,7 @@ static void sub_pixels_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size,
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7":);
+  MOVQ_ZERO(mm7);
   do {
     __asm __volatile(
 	"movq	%0, %%mm0\n\t"
@@ -834,10 +903,9 @@ static void sub_pixels_x2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_si
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t"
-      "movq	%0, %%mm6"
-      ::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%0, %%mm0\n\t"
@@ -874,10 +942,8 @@ static void sub_pixels_y2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_si
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t"
-      "movq	%0, %%mm6"
-      ::"m"(mm_wone));
+  MOVQ_ZERO(mm7);
+  MOVQ_WONE(mm6);
   do {
     __asm __volatile(
 	"movq	%0, %%mm0\n\t"
@@ -914,10 +980,9 @@ static void   sub_pixels_xy2_mmx( DCTELEM  *block, const UINT8 *pixels, int line
   const UINT8 *pix;
   p = block;
   pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo));
+  MOVQ_ZERO(mm7);
+  MOVQ_WTWO(mm6);
+  JUMPALIGN();
   do {
     __asm __volatile(
 	"movq	%1, %%mm0\n\t"
@@ -961,11 +1026,30 @@ static void   sub_pixels_xy2_mmx( DCTELEM  *block, const UINT8 *pixels, int line
   } while(--h);
 }
 
+static void clear_blocks_mmx(DCTELEM *blocks) /* zero the 128*6 bytes that start at blocks */
+{
+        asm volatile(
+                "pxor %%mm7, %%mm7		\n\t" /* mm7 = 0, the value being stored */
+                "movl $-128*6, %%eax		\n\t" /* eax = negative byte offset, counts up to 0 */
+                "1:				\n\t"
+                "movq %%mm7, (%0, %%eax)	\n\t" /* four 8-byte stores = 32 bytes per pass */
+                "movq %%mm7, 8(%0, %%eax)	\n\t"
+                "movq %%mm7, 16(%0, %%eax)	\n\t"
+                "movq %%mm7, 24(%0, %%eax)	\n\t"
+                "addl $32, %%eax		\n\t"
+                " js 1b				\n\t" /* repeat while the offset is still negative */
+                : : "r" (((int)blocks)+128*6) /* %0 = address one past the last byte cleared */
+                : "%eax", "memory" /* "memory": the stores go through %0 and are otherwise invisible to the compiler */
+        );
+}
+
+static void just_return() { return; } /* no-op stub; referenced by the disabled (#if 0) speed-testing assignments in dsputil_init_mmx */
+
 void dsputil_init_mmx(void)
 {
-    mm_flags = xine_mm_accel();
-#if 0
-    printf("CPU flags:");
+    mm_flags = mm_support();
+#if 1
+    printf("libavcodec: CPU flags:");
     if (mm_flags & MM_MMX)
         printf(" mmx");
     if (mm_flags & MM_MMXEXT)
@@ -983,11 +1067,16 @@ void dsputil_init_mmx(void)
         get_pixels = get_pixels_mmx;
         put_pixels_clamped = put_pixels_clamped_mmx;
         add_pixels_clamped = add_pixels_clamped_mmx;
-        
-        pix_abs16x16 = pix_abs16x16_mmx;
-        pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
-        pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
+        clear_blocks= clear_blocks_mmx;
+       
+        pix_abs16x16     = pix_abs16x16_mmx;
+        pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
+        pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
         pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
+        pix_abs8x8    = pix_abs8x8_mmx;
+        pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
+        pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
+        pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
         av_fdct = fdct_mmx;
 
         put_pixels_tab[0] = put_pixels_mmx;
@@ -1016,10 +1105,16 @@ void dsputil_init_mmx(void)
         sub_pixels_tab[3] = sub_pixels_xy2_mmx;
 
         if (mm_flags & MM_MMXEXT) {
-            pix_abs16x16 = pix_abs16x16_sse;
-        }
-
-        if (mm_flags & MM_SSE) {
+            pix_abs16x16    = pix_abs16x16_mmx2;
+            pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
+            pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
+            pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
+            
+            pix_abs8x8    = pix_abs8x8_mmx2;
+            pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
+            pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
+            pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
+            
             put_pixels_tab[1] = put_pixels_x2_sse;
             put_pixels_tab[2] = put_pixels_y2_sse;
             
@@ -1054,4 +1149,44 @@ void dsputil_init_mmx(void)
 	ff_idct = simple_idct_mmx;
 #endif
     }
+
+#if 0
+    // for speed testing
+    get_pixels = just_return;
+    put_pixels_clamped = just_return;
+    add_pixels_clamped = just_return;
+
+    pix_abs16x16 = just_return;
+    pix_abs16x16_x2 = just_return;
+    pix_abs16x16_y2 = just_return;
+    pix_abs16x16_xy2 = just_return;
+
+    put_pixels_tab[0] = just_return;
+    put_pixels_tab[1] = just_return;
+    put_pixels_tab[2] = just_return;
+    put_pixels_tab[3] = just_return;
+
+    put_no_rnd_pixels_tab[0] = just_return;
+    put_no_rnd_pixels_tab[1] = just_return;
+    put_no_rnd_pixels_tab[2] = just_return;
+    put_no_rnd_pixels_tab[3] = just_return;
+
+    avg_pixels_tab[0] = just_return;
+    avg_pixels_tab[1] = just_return;
+    avg_pixels_tab[2] = just_return;
+    avg_pixels_tab[3] = just_return;
+
+    avg_no_rnd_pixels_tab[0] = just_return;
+    avg_no_rnd_pixels_tab[1] = just_return;
+    avg_no_rnd_pixels_tab[2] = just_return;
+    avg_no_rnd_pixels_tab[3] = just_return;
+
+    sub_pixels_tab[0] = just_return;
+    sub_pixels_tab[1] = just_return;
+    sub_pixels_tab[2] = just_return;
+    sub_pixels_tab[3] = just_return;
+
+    //av_fdct = just_return;
+    //ff_idct = just_return;
+#endif
 }
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index 35b16b711..e704c4219 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -16,229 +16,347 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
+ * mostly by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "../dsputil.h"
-#include "mmx.h"
 
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
-static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
+static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={
+0x0000000000000000,
+0x0001000100010001,
+0x0002000200020002,
+};
 
-/* mm7 is accumulator, mm6 is zero */
-static inline void sad_add(const UINT8 *p1, const UINT8 *p2)
+static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
 {
-    movq_m2r(*p1, mm0);
-    movq_m2r(*p2, mm1);
-    movq_r2r(mm0, mm2);
-    psubusb_r2r(mm1, mm0);
-    psubusb_r2r(mm2, mm1);
-    por_r2r(mm1, mm0); /* mm0 is absolute value */
-
-    movq_r2r(mm0, mm1);
-    punpcklbw_r2r(mm6, mm0);
-    punpckhbw_r2r(mm6, mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
+    int len= -(stride<<h);
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq (%2, %%eax), %%mm2	\n\t"
+        "movq (%2, %%eax), %%mm4	\n\t"
+        "addl %3, %%eax			\n\t"
+        "psubusb %%mm0, %%mm2		\n\t"
+        "psubusb %%mm4, %%mm0		\n\t"
+        "movq (%1, %%eax), %%mm1	\n\t"
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "movq (%2, %%eax), %%mm5	\n\t"
+        "psubusb %%mm1, %%mm3		\n\t"
+        "psubusb %%mm5, %%mm1		\n\t"
+        "por %%mm2, %%mm0		\n\t"
+        "por %%mm1, %%mm3		\n\t"
+        "movq %%mm0, %%mm1		\n\t"
+        "movq %%mm3, %%mm2		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpckhbw %%mm7, %%mm1		\n\t"
+        "punpcklbw %%mm7, %%mm3		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "paddw %%mm1, %%mm0		\n\t"
+        "paddw %%mm3, %%mm2		\n\t"
+        "paddw %%mm2, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "addl %3, %%eax			\n\t"
+        " js 1b				\n\t"
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-/* convert mm7 to value */
-static inline int sad_end(void)
+static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
 {
-    int res;
-
-    movq_r2r(mm7, mm0);
-    psrlq_i2r(32, mm7);
-    paddusw_r2r(mm0, mm7);
-
-    movq_r2r(mm7, mm0);
-    psrlq_i2r(16, mm7);
-    paddusw_r2r(mm0, mm7);
-    __asm __volatile ("movd %%mm7, %0" : "=a" (res));
-    return res & 0xffff;
+    int len= -(stride<<h);
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq (%2, %%eax), %%mm2	\n\t"
+        "psadbw %%mm2, %%mm0		\n\t"
+        "addl %3, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm1	\n\t"
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "psadbw %%mm1, %%mm3		\n\t"
+        "paddw %%mm3, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "addl %3, %%eax			\n\t"
+        " js 1b				\n\t"
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
 {
-    const UINT8 *p1, *p2;
-
-    h >>= 1;
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    do {
-        sad_add(p1, p2);
-        sad_add(p1 + 8, p2 + 8);
-        p1 += lx;
-        p2 += lx;
-        sad_add(p1, p2);
-        sad_add(p1 + 8, p2 + 8);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int len= -(stride<<h);
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq (%2, %%eax), %%mm2	\n\t"
+        "pavgb %%mm2, %%mm0		\n\t"
+        "movq (%3, %%eax), %%mm2	\n\t"
+        "psadbw %%mm2, %%mm0		\n\t"
+        "addl %4, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm1	\n\t"
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "pavgb %%mm1, %%mm3		\n\t"
+        "movq (%3, %%eax), %%mm1	\n\t"
+        "psadbw %%mm1, %%mm3		\n\t"
+        "paddw %%mm3, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "addl %4, %%eax			\n\t"
+        " js 1b				\n\t"
+        : "+a" (len)
+        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-/* please test it ! */
-static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2)
-{
-    movq_m2r(*(p1 + 0), mm0);
-    movq_m2r(*(p1 + 8), mm1);
-    psadbw_m2r(*(p2 + 0), mm0);
-    psadbw_m2r(*(p2 + 8), mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
+static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
+{ //FIXME reuse src
+    int len= -(stride<<h);
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t" 
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq (%2, %%eax), %%mm2	\n\t"
+        "movq 1(%1, %%eax), %%mm1	\n\t"
+        "movq 1(%2, %%eax), %%mm3	\n\t"
+        "pavgb %%mm2, %%mm0		\n\t"
+        "pavgb %%mm1, %%mm3		\n\t"
+        "pavgb %%mm3, %%mm0		\n\t"
+        "movq (%3, %%eax), %%mm2	\n\t"
+        "psadbw %%mm2, %%mm0		\n\t"
+        "addl %4, %%eax			\n\t"
+        "movq (%1, %%eax), %%mm1	\n\t"
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "movq 1(%1, %%eax), %%mm2	\n\t"
+        "movq 1(%2, %%eax), %%mm4	\n\t"
+        "pavgb %%mm3, %%mm1		\n\t"
+        "pavgb %%mm4, %%mm2		\n\t"
+        "pavgb %%mm1, %%mm2		\n\t"
+        "movq (%3, %%eax), %%mm1	\n\t"
+        "psadbw %%mm1, %%mm2		\n\t"
+        "paddw %%mm2, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "addl %4, %%eax			\n\t"
+        " js 1b				\n\t"
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
 {
-    const UINT8 *p1, *p2;
-
-    h >>= 1;
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    do {
-        sad_add_sse(p1, p2);
-        p1 += lx;
-        p2 += lx;
-        sad_add_sse(p1, p2);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int len= -(stride<<h);
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq (%2, %%eax), %%mm1	\n\t"
+        "movq (%1, %%eax), %%mm2	\n\t"
+        "movq (%2, %%eax), %%mm3	\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpcklbw %%mm7, %%mm1		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "punpckhbw %%mm7, %%mm3		\n\t"
+        "paddw %%mm0, %%mm1		\n\t"
+        "paddw %%mm2, %%mm3		\n\t"
+        "movq (%3, %%eax), %%mm4	\n\t" 
+        "movq (%3, %%eax), %%mm2	\n\t"
+        "paddw %%mm5, %%mm1		\n\t"
+        "paddw %%mm5, %%mm3		\n\t"
+        "psrlw $1, %%mm1		\n\t"
+        "psrlw $1, %%mm3		\n\t"
+        "packuswb %%mm3, %%mm1		\n\t"
+        "psubusb %%mm1, %%mm4		\n\t"
+        "psubusb %%mm2, %%mm1		\n\t"
+        "por %%mm4, %%mm1		\n\t"
+        "movq %%mm1, %%mm0		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpckhbw %%mm7, %%mm1		\n\t"
+        "paddw %%mm1, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "addl %4, %%eax			\n\t"
+        " js 1b				\n\t"
+        : "+a" (len)
+        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-#define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); }
-
-/* mm7 is accumulator, mm6 is zero */
-static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3)
+static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
 {
-    movq_m2r(*(p2 + 0), mm0);
-    movq_m2r(*(p3 + 0), mm1);
-    movq_r2r(mm0, mm2);
-    movq_r2r(mm1, mm3);
-    punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */
-    punpcklbw_r2r(mm6, mm1);
-    punpckhbw_r2r(mm6, mm2); /* high */
-    punpckhbw_r2r(mm6, mm3); 
-    paddusw_r2r(mm1, mm0);
-    paddusw_r2r(mm3, mm2);
-    movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */
-    paddusw_r2r(mm5, mm0); /* + 1 */
-    paddusw_r2r(mm5, mm2); /* + 1 */
-    psrlw_i2r(1, mm0);
-    psrlw_i2r(1, mm2);
-    packuswb_r2r(mm2, mm0); /* average is in mm0 */
-
-    movq_r2r(mm1, mm2); 
-    psubusb_r2r(mm0, mm1);
-    psubusb_r2r(mm2, mm0);
-    por_r2r(mm1, mm0); /* mm0 is absolute value */
-
-    movq_r2r(mm0, mm1);
-    punpcklbw_r2r(mm6, mm0);
-    punpckhbw_r2r(mm6, mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
+    int len= -(stride<<h);
+    asm volatile(
+        ".balign 16			\n\t"
+        "1:				\n\t"
+        "movq (%1, %%eax), %%mm0	\n\t"
+        "movq (%2, %%eax), %%mm1	\n\t"
+        "movq %%mm0, %%mm4		\n\t"
+        "movq %%mm1, %%mm2		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpcklbw %%mm7, %%mm1		\n\t"
+        "punpckhbw %%mm7, %%mm4		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "paddw %%mm1, %%mm0		\n\t"
+        "paddw %%mm2, %%mm4		\n\t"
+        "movq 1(%1, %%eax), %%mm2	\n\t"
+        "movq 1(%2, %%eax), %%mm3	\n\t"
+        "movq %%mm2, %%mm1		\n\t"
+        "punpcklbw %%mm7, %%mm2		\n\t"
+        "punpckhbw %%mm7, %%mm1		\n\t"
+        "paddw %%mm0, %%mm2		\n\t"
+        "paddw %%mm4, %%mm1		\n\t"
+        "movq %%mm3, %%mm4		\n\t"
+        "punpcklbw %%mm7, %%mm3		\n\t"
+        "punpckhbw %%mm7, %%mm4		\n\t"
+        "paddw %%mm3, %%mm2		\n\t"
+        "paddw %%mm4, %%mm1		\n\t"
+        "movq (%3, %%eax), %%mm3	\n\t" 
+        "movq (%3, %%eax), %%mm4	\n\t" 
+        "paddw %%mm5, %%mm2		\n\t"
+        "paddw %%mm5, %%mm1		\n\t"
+        "psrlw $2, %%mm2		\n\t"
+        "psrlw $2, %%mm1		\n\t"
+        "packuswb %%mm1, %%mm2		\n\t"
+        "psubusb %%mm2, %%mm3		\n\t"
+        "psubusb %%mm4, %%mm2		\n\t"
+        "por %%mm3, %%mm2		\n\t"
+        "movq %%mm2, %%mm0		\n\t"
+        "punpcklbw %%mm7, %%mm0		\n\t"
+        "punpckhbw %%mm7, %%mm2		\n\t"
+        "paddw %%mm2, %%mm0		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "addl %4, %%eax			\n\t"
+        " js 1b				\n\t"
+        : "+a" (len)
+        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
+    );
 }
 
-int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline int sum_mmx()
 {
-    const UINT8 *p1, *p2;
-
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    movq_m2r(mm_wone, mm5); /* one constant */
-    do {
-        sad_add_x2(p1, p2, p2 + 1);
-        sad_add_x2(p1 + 8, p2 + 8, p2 + 9);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int ret;
+    asm volatile(
+        "movq %%mm6, %%mm0		\n\t"
+        "psrlq $32, %%mm6		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "movq %%mm6, %%mm0		\n\t"
+        "psrlq $16, %%mm6		\n\t"
+        "paddw %%mm0, %%mm6		\n\t"
+        "movd %%mm6, %0			\n\t"
+        : "=r" (ret)
+    );
+    return ret&0xFFFF;
 }
 
-int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
+static inline int sum_mmx2()
 {
-    const UINT8 *p1, *p2;
-
-    p1 = blk1;
-    p2 = blk2;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    movq_m2r(mm_wone, mm5); /* one constant */
-    do {
-        sad_add_x2(p1, p2, p2 + lx);
-        sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
+    int ret;
+    asm volatile(
+        "movd %%mm6, %0			\n\t"
+        : "=r" (ret)
+    );
+    return ret;
 }
 
-/* mm7 is accumulator, mm6 is zero */
-static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3)
-{
-    movq_m2r(*(p2 + 0), mm0);
-    movq_m2r(*(p3 + 0), mm1);
-    movq_r2r(mm0, mm2);
-    movq_r2r(mm1, mm3);
-    punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */
-    punpcklbw_r2r(mm6, mm1);
-    punpckhbw_r2r(mm6, mm2); /* high */
-    punpckhbw_r2r(mm6, mm3); 
-    paddusw_r2r(mm1, mm0);
-    paddusw_r2r(mm3, mm2);
-
-    movq_m2r(*(p2 + 1), mm1);
-    movq_m2r(*(p3 + 1), mm3);
-    movq_r2r(mm1, mm4);
-    punpcklbw_r2r(mm6, mm1); /* low */
-    punpckhbw_r2r(mm6, mm4); /* high */
-    paddusw_r2r(mm1, mm0);
-    paddusw_r2r(mm4, mm2);
-    movq_r2r(mm3, mm4);
-    punpcklbw_r2r(mm6, mm3); /* low */
-    punpckhbw_r2r(mm6, mm4); /* high */
-    paddusw_r2r(mm3, mm0);
-    paddusw_r2r(mm4, mm2);
-    
-    movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */
-    paddusw_r2r(mm5, mm0); /* + 2 */
-    paddusw_r2r(mm5, mm2); /* + 2 */
-    psrlw_i2r(2, mm0);
-    psrlw_i2r(2, mm2);
-    packuswb_r2r(mm2, mm0); /* average is in mm0 */
+#define PIX_SAD(suf) /* emits pix_abs8x8 and pix_abs16x16 (plain, _x2, _y2, _xy2) with the given suffix */\
+int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t":);\
+    /* mm7 = 0 and mm6 = 0: the sad8 helpers accumulate into mm6, sum_<suf>() reads it back */\
+    sad8_ ## suf(blk1, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+/* x2: blk2 against the average of blk1 and blk1+1; mm5 = round_tab[1] */\
+int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+    /* both averaged sources come from blk1, exactly as in pix_abs16x16_x2 */\
+    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+/* y2: blk2 against the average of blk1 and blk1+stride; mm5 = round_tab[1] */\
+int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+/* xy2: blk2 against the average of blk1, blk1+1, blk1+stride, blk1+stride+1; mm5 = round_tab[2] */\
+int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[2]) \
+                 );\
+\
+    sad8_4_ ## suf(blk1, blk2, stride, 3);\
+\
+    return sum_ ## suf();\
+}\
+/* 16x16 variants: two 8-byte-wide columns; last argument 4 makes the helpers cover stride<<4 bytes */\
+int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t":);\
+\
+    sad8_ ## suf(blk1  , blk2  , stride, 4);\
+    sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
+int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, 4);\
+    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
+int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[1]) \
+                 );\
+\
+    sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, 4);\
+    sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
+int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+{\
+    asm volatile("pxor %%mm7, %%mm7		\n\t"\
+                 "pxor %%mm6, %%mm6		\n\t"\
+                 "movq %0, %%mm5		\n\t"\
+                 :: "m"(round_tab[2]) \
+                 );\
+\
+    sad8_4_ ## suf(blk1  , blk2  , stride, 4);\
+    sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\
+\
+    return sum_ ## suf();\
+}\
 
-    movq_r2r(mm1, mm2); 
-    psubusb_r2r(mm0, mm1);
-    psubusb_r2r(mm2, mm0);
-    por_r2r(mm1, mm0); /* mm0 is absolute value */
-
-    movq_r2r(mm0, mm1);
-    punpcklbw_r2r(mm6, mm0);
-    punpckhbw_r2r(mm6, mm1);
-    paddusw_r2r(mm0, mm7);
-    paddusw_r2r(mm1, mm7);
-}
-
-int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
-{
-    const UINT8 *p1, *p2, *p3;
-
-    p1 = blk1;
-    p2 = blk2;
-    p3 = blk2 + lx;
-    pxor_r2r(mm7, mm7); /* mm7 is accumulator */
-    pxor_r2r(mm6, mm6); /* mm7 is zero constant */
-    movq_m2r(mm_wtwo, mm5); /* one constant */
-    do {
-        sad_add_xy2(p1, p2, p2 + lx);
-        sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx);
-        p1 += lx;
-        p2 += lx;
-    } while (--h);
-    return sad_end();
-}
+PIX_SAD(mmx)
+PIX_SAD(mmx2)
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 084eb6038..92724ac87 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -16,6 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * new Motion Estimation (X1/EPZS) by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "config.h"
 #include "xine-utils/xineutils.h"
@@ -25,9 +27,14 @@
 #include "dsputil.h"
 #include "mpegvideo.h"
 
+#define ABS(a) ((a)>0 ? (a) : -(a))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define INTER_BIAS	257
+
 static void halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-				  int xmin, int ymin, int xmax, int ymax);
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y);
 
 /* config it to test motion vector encoding (send random vectors) */
 //#define CONFIG_TEST_MV_ENCODE
@@ -54,6 +61,28 @@ static int pix_sum(UINT8 * pix, int line_size)
     return s;
 }
 
+/*
+ * Sum of absolute deviations from mean over a 16x16 pixel block.
+ *
+ * pix points at the top-left pixel, consecutive rows are line_size
+ * bytes apart, and mean is the value each pixel is compared against.
+ */
+static int pix_dev(UINT8 * pix, int line_size, int mean)
+{
+    int total = 0;
+    int row, col;
+
+    for (row = 0; row < 16; row++) {
+        const UINT8 *line = pix + row * line_size;
+
+        /* only the first 16 bytes of each row are read */
+        for (col = 0; col < 16; col++) {
+            total += ABS(line[col] - mean);
+        }
+    }
+    return total;
+}
+
 static int pix_norm1(UINT8 * pix, int line_size)
 {
     int s, i, j;
@@ -138,7 +167,7 @@ static int full_motion_search(MpegEncContext * s,
     for (y = y1; y <= y2; y++) {
 	for (x = x1; x <= x2; x++) {
 	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x,
-			     s->linesize, 16);
+			     s->linesize);
 	    if (d < dmin ||
 		(d == dmin &&
 		 (abs(x - xx) + abs(y - yy)) <
@@ -202,7 +231,7 @@ static int log_motion_search(MpegEncContext * s,
     do {
 	for (y = y1; y <= y2; y += range) {
 	    for (x = x1; x <= x2; x += range) {
-		d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16);
+		d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
 		if (d < dmin || (d == dmin && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		    dmin = d;
 		    mx = x;
@@ -282,7 +311,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 	lastx = x;
 	for (x = x1; x <= x2; x += range) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16);
+	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
 	    if (d < dminx || (d == dminx && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		dminx = d;
 		mx = x;
@@ -291,7 +320,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 	x = lastx;
 	for (y = y1; y <= y2; y += range) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize, 16);
+	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
 	    if (d < dminy || (d == dminy && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		dminy = d;
 		my = y;
@@ -330,78 +359,474 @@ static int phods_motion_search(MpegEncContext * s,
     return dminy;
 }
 
+
+#define Z_THRESHOLD 256 /* score below which the searches in this file stop early and keep the current candidate */
+/* CHECK_MV(x,y): scores full-pel candidate (x,y) as pix_abs16x16() plus mv_penalty cost times quant and, if below dmin, stores it in best[]/dmin. Relies on d, dmin, best, new_pic, old_pic, pic_stride, mv_penalty, pred_x, pred_y, shift and quant being in scope; x and y are evaluated more than once. */
+#define CHECK_MV(x,y)\
+{\
+    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+    if(d<dmin){\
+        best[0]=x;\
+        best[1]=y;\
+        dmin=d;\
+    }\
+}
+/* CHECK_MV_DIR(x,y,new_dir): same as CHECK_MV, and additionally stores new_dir in next_dir when the candidate wins. */
+#define CHECK_MV_DIR(x,y,new_dir)\
+{\
+    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+    if(d<dmin){\
+        best[0]=x;\
+        best[1]=y;\
+        dmin=d;\
+        next_dir= new_dir;\
+    }\
+}
+/* CHECK_MV4(x,y): variant of CHECK_MV that scores with pix_abs8x8() instead of pix_abs16x16(). */
+#define CHECK_MV4(x,y)\
+{\
+    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+    if(d<dmin){\
+        best[0]=x;\
+        best[1]=y;\
+        dmin=d;\
+    }\
+}
+/* CHECK_MV4_DIR(x,y,new_dir): variant of CHECK_MV_DIR that scores with pix_abs8x8(). */
+#define CHECK_MV4_DIR(x,y,new_dir)\
+{\
+    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+    if(d<dmin){\
+        best[0]=x;\
+        best[1]=y;\
+        dmin=d;\
+        next_dir= new_dir;\
+    }\
+}
+
+/* check(x,y,S,v): debug aid that prints x, y, s->mb_x, s->mb_y and a tag when (x,y) lies outside xmin..xmax / ymin..ymax shifted left by S; in the code visible here it is only referenced from commented-out lines. NOTE(review): it expands to four bare if statements and its last line ends in a backslash, so the line that follows is spliced into the macro. */
+#define check(x,y,S,v)\
+if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) printf("%d %d %d %d ymin" #v, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) printf("%d %d %d %d ymax" #v, (x), (y), s->mb_x, s->mb_y);\
+
+/* Greedy full-pel descent: tests the 4 direct neighbours of best[] with CHECK_MV_DIR, moves to the cheapest improving one and repeats until no neighbour lowers dmin. Updates best[] in place and returns the final dmin. */
+static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
+                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
+                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
+                                       int xmin, int ymin, int xmax, int ymax, int shift)
+{
+    int next_dir=-1; /* direction code of the move made in the current pass, -1 = no move */
+    /* direction codes: 0 = x-1, 1 = y-1, 2 = x+1, 3 = y+1 */
+    for(;;){
+        int d;
+        const int dir= next_dir;
+        const int x= best[0];
+        const int y= best[1];
+        next_dir=-1;
+        /* x,y stay fixed for the whole pass, so all tests are around the same centre; the neighbour opposite to the previous move (the point we came from) is skipped */
+//printf("%d", dir);
+        if(dir!=2 && x>xmin) CHECK_MV_DIR(x-1, y  , 0)
+        if(dir!=3 && y>ymin) CHECK_MV_DIR(x  , y-1, 1)
+        if(dir!=0 && x<xmax) CHECK_MV_DIR(x+1, y  , 2)
+        if(dir!=1 && y<ymax) CHECK_MV_DIR(x  , y+1, 3)
+        /* no neighbour improved on dmin: best[] is a local minimum */
+        if(next_dir==-1){
+            return dmin;
+        }
+    }
+    /* disabled alternative with a 16-point pattern; NOTE(review): several of its tests compare y against xmin/xmax */
+/*    for(;;){
+        int d;
+        const int x= best[0];
+        const int y= best[1];
+        const int last_min=dmin;
+        if(x>xmin) CHECK_MV(x-1, y  )
+        if(y>xmin) CHECK_MV(x  , y-1)
+        if(x<xmax) CHECK_MV(x+1, y  )
+        if(y<xmax) CHECK_MV(x  , y+1)
+        if(x>xmin && y>ymin) CHECK_MV(x-1, y-1)
+        if(x>xmin && y<ymax) CHECK_MV(x-1, y+1)
+        if(x<xmax && y>ymin) CHECK_MV(x+1, y-1)
+        if(x<xmax && y<ymax) CHECK_MV(x+1, y+1)
+        if(x-1>xmin) CHECK_MV(x-2, y  )
+        if(y-1>xmin) CHECK_MV(x  , y-2)
+        if(x+1<xmax) CHECK_MV(x+2, y  )
+        if(y+1<xmax) CHECK_MV(x  , y+2)
+        if(x-1>xmin && y-1>ymin) CHECK_MV(x-2, y-2)
+        if(x-1>xmin && y+1<ymax) CHECK_MV(x-2, y+2)
+        if(x+1<xmax && y-1>ymin) CHECK_MV(x+2, y-2)
+        if(x+1<xmax && y+1<ymax) CHECK_MV(x+2, y+2)
+        if(dmin==last_min) return dmin;
+    }
+    */
+}
+/* Same greedy 4-neighbour descent as small_diamond_search(), but candidates are scored with CHECK_MV4_DIR (pix_abs8x8). Updates best[] in place and returns the final dmin. */
+static inline int small_diamond_search4MV(MpegEncContext * s, int *best, int dmin,
+                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
+                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
+                                       int xmin, int ymin, int xmax, int ymax, int shift)
+{
+    int next_dir=-1; /* direction code of the move made in the current pass, -1 = no move */
+    /* direction codes: 0 = x-1, 1 = y-1, 2 = x+1, 3 = y+1 */
+    for(;;){
+        int d;
+        const int dir= next_dir;
+        const int x= best[0];
+        const int y= best[1];
+        next_dir=-1;
+        /* the neighbour opposite to the previous move is skipped; x,y stay fixed during the pass */
+//printf("%d", dir);
+        if(dir!=2 && x>xmin) CHECK_MV4_DIR(x-1, y  , 0)
+        if(dir!=3 && y>ymin) CHECK_MV4_DIR(x  , y-1, 1)
+        if(dir!=0 && x<xmax) CHECK_MV4_DIR(x+1, y  , 2)
+        if(dir!=1 && y<ymax) CHECK_MV4_DIR(x  , y+1, 3)
+        /* no neighbour improved on dmin: stop */
+        if(next_dir==-1){
+            return dmin;
+        }
+    }
+}
+/* Full-pel refinement that walks one step at a time in one of 8 directions, turning after every step; it gives up after a run of consecutive non-improving steps. Updates best[] in place and returns the final dmin. */
+static inline int snake_search(MpegEncContext * s, int *best, int dmin,
+                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
+                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
+                                       int xmin, int ymin, int xmax, int ymax, int shift)
+{
+    int dir=0; /* index into x_dir/y_dir, kept in 0..7 by the mask at the end of the loop */
+    int c=1; /* turning sense, +1 or -1 */
+    static int x_dir[8]= {1,1,0,-1,-1,-1, 0, 1};
+    static int y_dir[8]= {0,1,1, 1, 0,-1,-1,-1};
+    int fails=0; /* number of consecutive steps that did not lower dmin */
+    int last_d[2]={dmin, dmin}; /* scores of the two most recent accepted positions, newest first */
+
+/*static int good=0;
+static int bad=0;
+static int point=0;
+
+point++;
+if(256*256*256*64%point==0)
+{
+    printf("%d %d %d\n", good, bad, point);
+}*/
+
+    for(;;){
+        int x= best[0];
+        int y= best[1];
+        int d;
+        x+=x_dir[dir];
+        y+=y_dir[dir];
+        if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
+            d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
+            d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
+        }else{
+            d = dmin + 10000; //FIXME smarter boundary handling
+        }
+        if(d<dmin){
+            best[0]=x;
+            best[1]=y;
+            dmin=d;
+
+            if(last_d[1] - last_d[0] > last_d[0] - d) c= -c; /* this gain is smaller than the previous one: reverse the turning sense */
+            dir+=c;
+
+            fails=0;
+//good++;
+            last_d[1]=last_d[0];
+            last_d[0]=d;
+        }else{
+//bad++;
+            if(fails){
+                if(fails>=3) return dmin; /* three failures already counted and this step failed too: stop */
+            }else{
+                c= -c; /* first failure after a success: reverse the turning sense */
+            }
+            dir+=c*2;
+            fails++;
+        }
+        dir&=7;
+    }
+}
+/* Predictor-based full-pel search for the current 16x16 macroblock: scores the zero vector and the candidate vectors in P[], then refines the winner. Writes the displacement relative to the macroblock to *mx_ptr/*my_ptr and returns its score. */
+static int epzs_motion_search(MpegEncContext * s,
+                             int *mx_ptr, int *my_ptr,
+                             int P[5][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax)
+{
+    int best[2]={0, 0}; /* start from the zero vector */
+    int d, dmin; 
+    UINT8 *new_pic, *old_pic;
+    const int pic_stride= s->linesize;
+    const int pic_xy= (s->mb_y*pic_stride + s->mb_x)*16; /* offset of the macroblock's top-left pixel */
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    int quant= s->qscale; // qscale of the prev frame
+    const int shift= 1+s->quarter_sample; /* P[] entries are shifted right by this before being used as full-pel candidates */
+
+    new_pic = s->new_picture[0] + pic_xy;
+    old_pic = s->last_picture[0] + pic_xy;
+    /* zero vector good enough: skip the rest of the search */
+    dmin = pix_abs16x16(new_pic, old_pic, pic_stride);
+    if(dmin<Z_THRESHOLD){
+        *mx_ptr= 0;
+        *my_ptr= 0;
+//printf("Z");
+        return dmin;
+    }
+    /* NOTE(review): the P[] candidates are not range-checked against xmin..ymax here; confirm the caller keeps them inside the picture */
+    /* first line */
+    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
+        CHECK_MV(P[1][0]>>shift, P[1][1]>>shift)
+    }else{
+        CHECK_MV(P[4][0]>>shift, P[4][1]>>shift)
+        if(dmin<Z_THRESHOLD){
+            *mx_ptr= P[4][0]>>shift;
+            *my_ptr= P[4][1]>>shift;
+//printf("M\n");
+            return dmin;
+        }
+        CHECK_MV(P[1][0]>>shift, P[1][1]>>shift)
+        CHECK_MV(P[2][0]>>shift, P[2][1]>>shift)
+        CHECK_MV(P[3][0]>>shift, P[3][1]>>shift)
+    }
+    CHECK_MV(P[0][0]>>shift, P[0][1]>>shift)
+    /* refine the best candidate: diamond search for ME_EPZS, snake search for any other setting */
+//check(best[0],best[1],0, b0)
+    if(s->full_search==ME_EPZS)
+        dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+    else
+        dmin=         snake_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+//check(best[0],best[1],0, b1)
+    *mx_ptr= best[0];
+    *my_ptr= best[1];    
+
+//    printf("%d %d %d \n", best[0], best[1], dmin);
+    return dmin;
+}
+/* Predictor-based full-pel search for one 8x8 block (block = 0..3) of the current macroblock: scores the zero vector and the candidates P[0..5] with pix_abs8x8, then refines with small_diamond_search4MV(). Writes the block-relative displacement to *mx_ptr/*my_ptr and returns its score. */
+static int epzs_motion_search4(MpegEncContext * s, int block,
+                             int *mx_ptr, int *my_ptr,
+                             int P[6][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax)
+{
+    int best[2]={0, 0}; /* start from the zero vector */
+    int d, dmin; 
+    UINT8 *new_pic, *old_pic;
+    const int pic_stride= s->linesize;
+    const int pic_xy= ((s->mb_y*2 + (block>>1))*pic_stride + s->mb_x*2 + (block&1))*8; /* offset of this 8x8 block: block&1 selects the column, block>>1 the row */
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    int quant= s->qscale; // qscale of the prev frame
+    const int shift= 1+s->quarter_sample; /* P[] entries are shifted right by this before being used as full-pel candidates */
+
+    new_pic = s->new_picture[0] + pic_xy;
+    old_pic = s->last_picture[0] + pic_xy;
+    /* unlike epzs_motion_search() there is no early exit on the zero vector */
+    dmin = pix_abs8x8(new_pic, old_pic, pic_stride);
+
+    /* first line */
+    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
+        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
+    }else{
+        CHECK_MV4(P[4][0]>>shift, P[4][1]>>shift)
+        if(dmin<Z_THRESHOLD){
+            *mx_ptr= P[4][0]>>shift;
+            *my_ptr= P[4][1]>>shift;
+//printf("M\n");
+            return dmin;
+        }
+        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
+        CHECK_MV4(P[2][0]>>shift, P[2][1]>>shift)
+        CHECK_MV4(P[3][0]>>shift, P[3][1]>>shift)
+    }
+    CHECK_MV4(P[0][0]>>shift, P[0][1]>>shift)
+    CHECK_MV4(P[5][0]>>shift, P[5][1]>>shift)
+    /* refine the best candidate around its 4 direct neighbours */
+//check(best[0],best[1],0, b0)
+    dmin= small_diamond_search4MV(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+//check(best[0],best[1],0, b1)
+    *mx_ptr= best[0];
+    *my_ptr= best[1];    
+
+//    printf("%d %d %d \n", best[0], best[1], dmin);
+    return dmin;
+}
+/* CHECK_HALF_MV(suffix,x,y): scores the half-pel offset (x,y) with pix_abs16x16_<suffix>() at ptr+((x)>>1) plus mv_penalty cost times quant and, if below dminh, stores dminh and mx/my = mx1+x / my1+y. Expands to bare statements (no enclosing braces) and relies on d, dminh, mx, my, mx1, my1, pix, ptr, s, mv_penalty, pen_x, pen_y and quant being in scope. */
+#define CHECK_HALF_MV(suffix, x, y) \
+    d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
+    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
+    if(d<dminh){\
+        dminh= d;\
+        mx= mx1 + x;\
+        my= my1 + y;\
+    }
+/* CHECK_HALF_MV4(suffix,x,y): variant of CHECK_HALF_MV that scores with pix_abs8x8_<suffix>(). */
+#define CHECK_HALF_MV4(suffix, x, y) \
+    d= pix_abs8x8_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
+    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
+    if(d<dminh){\
+        dminh= d;\
+        mx= mx1 + x;\
+        my= my1 + y;\
+    }
+    
 /* The idea would be to make half pel ME after Inter/Intra decision to 
    save time. */
-static void halfpel_motion_search(MpegEncContext * s,
+static inline void halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
-				  int xmin, int ymin, int xmax, int ymax)
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y)
 {
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    const int quant= s->qscale;
+    int pen_x, pen_y;
     int mx, my, mx1, my1, d, xx, yy, dminh;
-    UINT8 *pix;
+    UINT8 *pix, *ptr;
 
-    mx = *mx_ptr << 1;
-    my = *my_ptr << 1;
+    mx = *mx_ptr;
+    my = *my_ptr;
+    ptr = s->last_picture[0] + (my * s->linesize) + mx;
 
     xx = 16 * s->mb_x;
     yy = 16 * s->mb_y;
-
+    pix =  s->new_picture[0] + (yy * s->linesize) + xx;
+    
     dminh = dmin;
 
-    /* Half pixel search */
-    mx1 = mx;
-    my1 = my;
+    if (mx > xmin && mx < xmax && 
+        my > ymin && my < ymax) {
 
-    pix = s->new_picture[0] + (yy * s->linesize) + xx;
+        mx= mx1= 2*(mx - xx);
+        my= my1= 2*(my - yy);
+        if(dmin < Z_THRESHOLD && mx==0 && my==0){
+            *mx_ptr = 0;
+            *my_ptr = 0;
+            return;
+        }
+        
+        pen_x= pred_x + mx;
+        pen_y= pred_y + my;
+
+        ptr-= s->linesize;
+        CHECK_HALF_MV(xy2, -1, -1)
+        CHECK_HALF_MV(y2 ,  0, -1)
+        CHECK_HALF_MV(xy2, +1, -1)
+        
+        ptr+= s->linesize;
+        CHECK_HALF_MV(x2 , -1,  0)
+        CHECK_HALF_MV(x2 , +1,  0)
+        CHECK_HALF_MV(xy2, -1, +1)
+        CHECK_HALF_MV(y2 ,  0, +1)
+        CHECK_HALF_MV(xy2, +1, +1)
+
+    }else{
+        mx= 2*(mx - xx);
+        my= 2*(my - yy);
+    }
 
-    if ((mx > (xmin << 1)) && mx < (xmax << 1) && 
-        (my > (ymin << 1)) && my < (ymax << 1)) {
-	    int dx, dy, px, py;
-	    UINT8 *ptr;
-        for (dy = -1; dy <= 1; dy++) {
-            for (dx = -1; dx <= 1; dx++) {
-                if (dx != 0 || dy != 0) {
-                    px = mx1 + dx;
-                    py = my1 + dy;
-                    ptr = s->last_picture[0] + ((py >> 1) * s->linesize) + (px >> 1);
-                    switch (((py & 1) << 1) | (px & 1)) {
-                    default:
-                    case 0:
-                        d = pix_abs16x16(pix, ptr, s->linesize, 16);
-                        break;
-                    case 1:
-                        d = pix_abs16x16_x2(pix, ptr, s->linesize, 16);
-                        break;
-                    case 2:
-                        d = pix_abs16x16_y2(pix, ptr, s->linesize, 16);
-                        break;
-                    case 3:
-                        d = pix_abs16x16_xy2(pix, ptr, s->linesize, 16);
-                        break;
-                    }
-                    if (d < dminh) {
-                        dminh = d;
-                        mx = px;
-                        my = py;
-                    }
-                }
-            }
+    *mx_ptr = mx;
+    *my_ptr = my;
+}
+
+static inline void halfpel_motion_search4(MpegEncContext * s,
+				  int *mx_ptr, int *my_ptr, int dmin,
+				  int xmin, int ymin, int xmax, int ymax,
+                                  int pred_x, int pred_y, int block_x, int block_y)
+{
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    const int quant= s->qscale;
+    int pen_x, pen_y;
+    int mx, my, mx1, my1, d, xx, yy, dminh;
+    UINT8 *pix, *ptr;
+
+    xx = 8 * block_x;
+    yy = 8 * block_y;
+    pix =  s->new_picture[0] + (yy * s->linesize) + xx;
+    
+    mx = *mx_ptr;
+    my = *my_ptr;
+    ptr = s->last_picture[0] + ((yy+my) * s->linesize) + xx + mx;
+
+    dminh = dmin;
+
+    if (mx > xmin && mx < xmax && 
+        my > ymin && my < ymax) {
+
+        mx= mx1= 2*mx;
+        my= my1= 2*my;
+        if(dmin < Z_THRESHOLD && mx==0 && my==0){
+            *mx_ptr = 0;
+            *my_ptr = 0;
+            return;
         }
+        
+        pen_x= pred_x + mx;
+        pen_y= pred_y + my;
+
+        ptr-= s->linesize;
+        CHECK_HALF_MV4(xy2, -1, -1)
+        CHECK_HALF_MV4(y2 ,  0, -1)
+        CHECK_HALF_MV4(xy2, +1, -1)
+        
+        ptr+= s->linesize;
+        CHECK_HALF_MV4(x2 , -1,  0)
+        CHECK_HALF_MV4(x2 , +1,  0)
+        CHECK_HALF_MV4(xy2, -1, +1)
+        CHECK_HALF_MV4(y2 ,  0, +1)
+        CHECK_HALF_MV4(xy2, +1, +1)
+
+    }else{
+        mx*=2;
+        my*=2;
     }
 
-    *mx_ptr = mx - (xx << 1);
-    *my_ptr = my - (yy << 1);
-    //fprintf(stderr,"half  - MX: %d\tMY: %d\n",*mx_ptr ,*my_ptr);
+    *mx_ptr = mx;
+    *my_ptr = my;
+}
+/* Stores the vector (mx,my) for the current macroblock in s->mv_table and, unless CODEC_FLAG_4MV is set, copies it into the 2x2 group of s->motion_val entries that starts at s->block_index[0]. */
+static inline void set_mv_tables(MpegEncContext * s, int mx, int my)
+{
+    const int xy= s->mb_x + s->mb_y*s->mb_width;
+    /* one entry per macroblock: [0] takes mx, [1] takes my */
+    s->mv_table[0][xy] = mx;
+    s->mv_table[1][xy] = my;
+
+    /* has already been set to the 4 MV if 4MV is done */
+    if(!(s->flags&CODEC_FLAG_4MV)){
+        int mot_xy= s->block_index[0];
+        /* first row of the 2x2 group */
+        s->motion_val[mot_xy  ][0]= mx;
+        s->motion_val[mot_xy  ][1]= my;
+        s->motion_val[mot_xy+1][0]= mx;
+        s->motion_val[mot_xy+1][1]= my;
+        /* second row: advance by s->block_wrap[0] entries */
+        mot_xy += s->block_wrap[0];
+        s->motion_val[mot_xy  ][0]= mx;
+        s->motion_val[mot_xy  ][1]= my;
+        s->motion_val[mot_xy+1][0]= mx;
+        s->motion_val[mot_xy+1][1]= my;
+    }
 }
 
 #ifndef CONFIG_TEST_MV_ENCODE
 
-int estimate_motion(MpegEncContext * s,
-		    int mb_x, int mb_y,
-		    int *mx_ptr, int *my_ptr)
+void estimate_motion(MpegEncContext * s,
+		    int mb_x, int mb_y)
 {
     UINT8 *pix, *ppix;
     int sum, varc, vard, mx, my, range, dmin, xx, yy;
     int xmin, ymin, xmax, ymax;
+    int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
+    int pred_x=0, pred_y=0;
+    int P[6][2];
+    const int shift= 1+s->quarter_sample;
+    int mb_type=0;
     
     range = 8 * (1 << (s->f_code - 1));
     /* XXX: temporary kludge to avoid overflow for msmpeg4 */
@@ -411,6 +836,8 @@ int estimate_motion(MpegEncContext * s,
     if (s->unrestricted_mv) {
         xmin = -16;
         ymin = -16;
+        if (s->h263_plus)
+            range *= 2;
         if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4){
             xmax = s->mb_width*16;
             ymax = s->mb_height*16;
@@ -426,7 +853,6 @@ int estimate_motion(MpegEncContext * s,
         xmax = s->mb_width*16 - 16;
         ymax = s->mb_height*16 - 16;
     }
-
     switch(s->full_search) {
     case ME_ZERO:
     default:
@@ -442,8 +868,116 @@ int estimate_motion(MpegEncContext * s,
     case ME_PHODS:
 	dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax);
         break;
+    case ME_X1:
+    case ME_EPZS:
+       {
+            const int mot_stride = s->block_wrap[0];
+            const int mot_xy = s->block_index[0];
+
+            rel_xmin= xmin - mb_x*16;
+            rel_xmax= xmax - mb_x*16;
+            rel_ymin= ymin - mb_y*16;
+            rel_ymax= ymax - mb_y*16;
+
+            P[0][0] = s->motion_val[mot_xy    ][0];
+            P[0][1] = s->motion_val[mot_xy    ][1];
+            P[1][0] = s->motion_val[mot_xy - 1][0];
+            P[1][1] = s->motion_val[mot_xy - 1][1];
+            if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift);
+
+            /* special case for first line */
+            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
+                P[4][0] = P[1][0];
+                P[4][1] = P[1][1];
+            } else {
+                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
+                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
+                P[3][0] = s->motion_val[mot_xy - mot_stride + 2         ][0];
+                P[3][1] = s->motion_val[mot_xy - mot_stride + 2         ][1];
+                if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift);
+                if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift);
+                if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift);
+        
+                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
+                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
+            }
+            if(s->out_format == FMT_H263){
+                pred_x = P[4][0];
+                pred_y = P[4][1];
+            }else { /* mpeg1 at least */
+                pred_x= P[1][0];
+                pred_y= P[1][1];
+            }
+        }
+        dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax);
+ 
+        mx+= mb_x*16;
+        my+= mb_y*16;
+        break;
+    }
+    
+    if(s->flags&CODEC_FLAG_4MV){
+        int block;
+
+        mb_type|= MB_TYPE_INTER4V;
+
+        for(block=0; block<4; block++){
+            int mx4, my4;
+            int pred_x4, pred_y4;
+            int dmin4;
+            static const int off[4]= {2, 1, 1, -1};
+            const int mot_stride = s->block_wrap[0];
+            const int mot_xy = s->block_index[block];
+            const int block_x= mb_x*2 + (block&1);
+            const int block_y= mb_y*2 + (block>>1);
+
+            const int rel_xmin4= xmin - block_x*8;
+            const int rel_xmax4= xmax - block_x*8 + 8;
+            const int rel_ymin4= ymin - block_y*8;
+            const int rel_ymax4= ymax - block_y*8 + 8;
+
+            P[0][0] = s->motion_val[mot_xy    ][0];
+            P[0][1] = s->motion_val[mot_xy    ][1];
+            P[1][0] = s->motion_val[mot_xy - 1][0];
+            P[1][1] = s->motion_val[mot_xy - 1][1];
+            if(P[1][0] > (rel_xmax4<<shift)) P[1][0]= (rel_xmax4<<shift);
+
+            /* special case for first line */
+            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
+                P[4][0] = P[1][0];
+                P[4][1] = P[1][1];
+            } else {
+                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
+                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
+                P[3][0] = s->motion_val[mot_xy - mot_stride + off[block]][0];
+                P[3][1] = s->motion_val[mot_xy - mot_stride + off[block]][1];
+                if(P[2][1] > (rel_ymax4<<shift)) P[2][1]= (rel_ymax4<<shift);
+                if(P[3][0] < (rel_xmin4<<shift)) P[3][0]= (rel_xmin4<<shift);
+                if(P[3][0] > (rel_xmax4<<shift)) P[3][0]= (rel_xmax4<<shift);
+                if(P[3][1] > (rel_ymax4<<shift)) P[3][1]= (rel_ymax4<<shift);
+        
+                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
+                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
+            }
+            if(s->out_format == FMT_H263){
+                pred_x4 = P[4][0];
+                pred_y4 = P[4][1];
+            }else { /* mpeg1 at least */
+                pred_x4= P[1][0];
+                pred_y4= P[1][1];
+            }
+            P[5][0]= mx - mb_x*16;
+            P[5][1]= my - mb_y*16;
+
+            dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4);
+
+            halfpel_motion_search4(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
+                                   pred_x4, pred_y4, block_x, block_y);
+     
+            s->motion_val[ s->block_index[block] ][0]= mx4;
+            s->motion_val[ s->block_index[block] ][1]= my4;
+        }
     }
-    emms_c();
 
     /* intra / predictive decision */
     xx = mb_x * 16;
@@ -452,36 +986,53 @@ int estimate_motion(MpegEncContext * s,
     pix = s->new_picture[0] + (yy * s->linesize) + xx;
     /* At this point (mx,my) are full-pell and the absolute displacement */
     ppix = s->last_picture[0] + (my * s->linesize) + mx;
-
+    
     sum = pix_sum(pix, s->linesize);
-    varc = pix_norm1(pix, s->linesize);
-    vard = pix_norm(pix, ppix, s->linesize);
+#if 0
+    varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS;
+    vard = pix_abs16x16(pix, ppix, s->linesize);
+#else
+    sum= (sum+8)>>4;
+    varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8);
+    vard = (pix_norm(pix, ppix, s->linesize)+128)>>8;
+#endif
 
-    vard = vard >> 8;
-    sum = sum >> 8;
-    varc = (varc >> 8) - (sum * sum);
     s->mb_var[s->mb_width * mb_y + mb_x] = varc;
-    s->avg_mb_var += varc;
-     
+    s->avg_mb_var+= varc;
+    s->mc_mb_var += vard;
+
 #if 0
     printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n",
 	   varc, s->avg_mb_var, sum, vard, mx - xx, my - yy);
 #endif
-    if (vard <= 64 || vard < varc) {
-        if (s->full_search != ME_ZERO) {
-            halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax);
-        } else {
-            mx -= 16 * s->mb_x;
-            my -= 16 * s->mb_y;
+    if(s->flags&CODEC_FLAG_HQ){
+        if (vard*2 + 200 > varc)
+            mb_type|= MB_TYPE_INTRA;
+        if (varc*2 + 200 > vard){
+            mb_type|= MB_TYPE_INTER;
+            halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y);
+        }else{
+            mx = mx*2 - mb_x*32;
+            my = my*2 - mb_y*32;
+        }
+    }else{
+        if (vard <= 64 || vard < varc) {
+            mb_type|= MB_TYPE_INTER;
+            if (s->full_search != ME_ZERO) {
+                halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y);
+            } else {
+                mx -= 16 * mb_x;
+                my -= 16 * mb_y;
+            }
+        }else{
+            mb_type|= MB_TYPE_INTRA;
+            mx = 0;//mx*2 - 32 * mb_x;
+            my = 0;//my*2 - 32 * mb_y;
         }
-	*mx_ptr = mx;
-	*my_ptr = my;
-	return 0;
-    } else {
-	*mx_ptr = 0;
-	*my_ptr = 0;
-	return 1;
     }
+
+    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
+    set_mv_tables(s, mx, my);
 }
 
 #else
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index 41bf524e4..ac614d5ce 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -20,6 +20,7 @@
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
+#include "xineutils.h"
 
 #include "mpeg12data.h"
 
@@ -51,6 +52,9 @@ static int mpeg2_decode_block_intra(MpegEncContext *s,
                                     int n);
 static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred);
 
+static UINT16 mv_penalty[MAX_FCODE+1][MAX_MV*2+1]; /* [f_code][mv+MAX_MV]; filled once by mpeg1_encode_init() */
+static UINT8 fcode_tab[MAX_MV*2+1]; /* [mv+MAX_MV] -> f_code; filled once by mpeg1_encode_init() */
+
 static void put_header(MpegEncContext *s, int header)
 {
     align_put_bits(&s->pb);
@@ -66,7 +70,7 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         int n;
         UINT64 time_code;
         
-        if ((s->picture_number % s->gop_size) == 0) {
+        if (s->picture_in_gop_number == 0) {
             /* mpeg1 header repeated every gop */
             put_header(s, SEQ_START_CODE);
             
@@ -129,7 +133,6 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
             }
 
         }
-        s->fake_picture_number++;
 }
 
 
@@ -226,6 +229,7 @@ void mpeg1_encode_picture_header(MpegEncContext *s, int picture_number)
     /* temporal reference */
     put_bits(&s->pb, 10, (s->fake_picture_number - 
                           s->gop_picture_number) & 0x3ff); 
+    s->fake_picture_number++;
     
     put_bits(&s->pb, 3, s->pict_type);
     put_bits(&s->pb, 16, 0xffff); /* non constant bit rate */
@@ -353,6 +357,53 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val)
     }
 }
 
+/* Builds the shared mv_penalty and fcode_tab tables on the first call and attaches both to s on every call. */
+void mpeg1_encode_init(MpegEncContext *s)
+{
+    static int done=0;
+    if(!done){
+        int f_code, mv;
+
+        done=1;
+        /* mv_penalty[f_code][mv+MAX_MV]: cost derived from mbMotionVectorTable[..][1] (presumably the code length in bits) plus 1 + (f_code-1) extra bits */
+        for(f_code=1; f_code<=MAX_FCODE; f_code++){
+            for(mv=-MAX_MV; mv<=MAX_MV; mv++){
+                int len;
+
+                if(mv==0) len= mbMotionVectorTable[0][1];
+                else{
+                    int val, bit_size, code;
+                    /* use the f_code of the row being filled, not s->f_code: the table is built only once,
+                       and s->f_code - 1 would give every row the same cost (and a negative shift if it is 0) */
+                    bit_size = f_code - 1;
+                    val=mv;
+                    if (val < 0) 
+                        val = -val;
+                    val--;
+                    code = (val >> bit_size) + 1;
+                    if(code<17){
+                        len= mbMotionVectorTable[code][1] + 1 + bit_size;
+                    }else{
+                        len= mbMotionVectorTable[16][1] + 2 + bit_size;
+                    }
+                }
+
+                mv_penalty[f_code][mv+MAX_MV]= len;
+            }
+        }
+
+        /* fcode_tab[mv+MAX_MV]: smallest f_code with -(8<<f_code) <= mv < (8<<f_code); filled from the largest f_code down so smaller ones overwrite the inner range. Assumes (8<<MAX_FCODE) <= MAX_MV -- TODO confirm */
+        for(f_code=MAX_FCODE; f_code>0; f_code--){
+            for(mv=-(8<<f_code); mv<(8<<f_code); mv++){
+                fcode_tab[mv+MAX_MV]= f_code;
+            }
+        }
+    }
+    s->mv_penalty= mv_penalty;
+    
+    s->fcode_tab= fcode_tab;
+}
+ 
 static inline void encode_dc(MpegEncContext *s, int diff, int component)
 {
     if (component == 0) {
@@ -1119,6 +1170,7 @@ typedef struct Mpeg1Context {
     UINT8 *buf_ptr;
     int buffer_size;
     int mpeg_enc_ctx_allocated; /* true if decoding context allocated */
+    int repeat_field; /* true if we must repeat the field */
 } Mpeg1Context;
 
 static int mpeg_decode_init(AVCodecContext *avctx)
@@ -1131,6 +1183,7 @@ static int mpeg_decode_init(AVCodecContext *avctx)
     s->start_code = -1;
     s->buf_ptr = s->buffer;
     s->mpeg_enc_ctx.picture_number = 0;
+    s->repeat_field = 0;
     return 0;
 }
 
@@ -1203,7 +1256,7 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
     int frame_rate_ext_n, frame_rate_ext_d;
 
     skip_bits(&s->gb, 8); /* profil and level */
-    skip_bits(&s->gb, 1); /* progressive_sequence */
+    s->progressive_sequence = get_bits1(&s->gb); /* progressive_sequence */
     skip_bits(&s->gb, 2); /* chroma_format */
     horiz_size_ext = get_bits(&s->gb, 2);
     vert_size_ext = get_bits(&s->gb, 2);
@@ -1279,12 +1332,13 @@ static void mpeg_decode_picture_coding_extension(MpegEncContext *s)
     s->chroma_420_type = get_bits1(&s->gb);
     s->progressive_frame = get_bits1(&s->gb);
     /* composite display not parsed */
-    dprintf("intra_dc_precion=%d\n", s->intra_dc_precision);
+    dprintf("intra_dc_precision=%d\n", s->intra_dc_precision);
     dprintf("picture_structure=%d\n", s->picture_structure);
     dprintf("conceal=%d\n", s->concealment_motion_vectors);
     dprintf("intra_vlc_format=%d\n", s->intra_vlc_format);
     dprintf("alternate_scan=%d\n", s->alternate_scan);
     dprintf("frame_pred_frame_dct=%d\n", s->frame_pred_frame_dct);
+    dprintf("progressive_frame=%d\n", s->progressive_frame);
 }
 
 static void mpeg_decode_extension(AVCodecContext *avctx, 
@@ -1349,7 +1403,7 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
     }
 
     for(;;) {
-        memset(s->block, 0, sizeof(s->block));
+        clear_blocks(s->block[0]);
         ret = mpeg_decode_mb(s, s->block);
         dprintf("ret=%d\n", ret);
         if (ret < 0)
@@ -1358,7 +1412,8 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
             break;
         MPV_decode_mb(s, s->block);
     }
-    
+    emms_c();
+
     /* end of slice reached */
     if (s->mb_x == (s->mb_width - 1) &&
         s->mb_y == (s->mb_height - 1)) {
@@ -1434,6 +1489,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         avctx->width = width;
         avctx->height = height;
         avctx->frame_rate = frame_rate_tab[s->frame_rate_index];
+        s->frame_rate = avctx->frame_rate;
         avctx->bit_rate = s->bit_rate;
         
         if (MPV_common_init(s) < 0)
@@ -1505,13 +1561,14 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
     UINT8 *buf_end, *buf_ptr, *buf_start;
     int len, start_code_found, ret, code, start_code, input_size;
     AVPicture *picture = data;
-
+    MpegEncContext *s2 = &s->mpeg_enc_ctx;
+            
     dprintf("fill_buffer\n");
 
     *data_size = 0;
+    
     /* special case for last picture */
     if (buf_size == 0) {
-        MpegEncContext *s2 = &s->mpeg_enc_ctx;
         if (s2->picture_number > 0) {
             picture->data[0] = s2->next_picture[0];
             picture->data[1] = s2->next_picture[1];
@@ -1526,6 +1583,15 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
 
     buf_ptr = buf;
     buf_end = buf + buf_size;
+    
+    if (s->repeat_field % 2 == 1) {
+        s->repeat_field++;
+        //fprintf(stderr,"\nRepeating last frame: %d -> %d! pict: %d %d", avctx->frame_number-1, avctx->frame_number,
+        //                                                         s2->picture_number, s->repeat_field);
+        *data_size = sizeof(AVPicture);
+        goto the_end;
+    }
+        
     while (buf_ptr < buf_end) {
         buf_start = buf_ptr;
         /* find start next code */
@@ -1574,6 +1640,14 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
                                                 start_code, s->buffer, input_size);
                         if (ret == 1) {
                             /* got a picture: exit */
+                            /* first check if we must repeat the frame */
+                            if (s2->progressive_frame && s2->repeat_first_field) {
+                                //fprintf(stderr,"\nRepeat this frame: %d! pict: %d",avctx->frame_number,s2->picture_number);
+                                s2->repeat_first_field = 0;
+                                s2->progressive_frame = 0;
+                                if (++s->repeat_field > 2)
+                                    s->repeat_field = 0;
+                            }
                             *data_size = sizeof(AVPicture);
                             goto the_end;
                         }
diff --git a/src/libffmpeg/libavcodec/mpeg4data.h b/src/libffmpeg/libavcodec/mpeg4data.h
index 4eed75654..91b99625f 100644
--- a/src/libffmpeg/libavcodec/mpeg4data.h
+++ b/src/libffmpeg/libavcodec/mpeg4data.h
@@ -12,13 +12,13 @@
 #define GMC_SPRITE 2
 
 /* dc encoding for mpeg4 */
-static const UINT8 DCtab_lum[13][2] =
+const UINT8 DCtab_lum[13][2] = /* 13 two-byte entries, presumably {code, length} — TODO confirm; no longer static, so it now has external linkage. NOTE(review): confirm this header is compiled into only one .c file */
 {
     {3,3}, {3,2}, {2,2}, {2,3}, {1,3}, {1,4}, {1,5}, {1,6}, {1,7},
     {1,8}, {1,9}, {1,10}, {1,11},
 }; 
 
-static const UINT8 DCtab_chrom[13][2] =
+const UINT8 DCtab_chrom[13][2] =
 {
     {3,2}, {2,2}, {1,2}, {1,3}, {1,4}, {1,5}, {1,6}, {1,7}, {1,8},
     {1,9}, {1,10}, {1,11}, {1,12},
@@ -99,3 +99,26 @@ static const UINT16 sprite_trajectory_tab[15][2] = {
  {0x0E, 4}, {0x1E, 5},  {0x3E, 6},  {0x7E, 7}, {0xFE, 8}, 
  {0x1FE, 9},{0x3FE, 10},{0x7FE, 11},{0xFFE, 12},
 };
+
+static const UINT8 mb_type_b_tab[4][2] = { /* four two-byte entries; presumably {code, length} VLC pairs for B-frame macroblock types — TODO confirm against the code that reads this table */
+ {1, 1}, {1, 2}, {1, 3}, {1, 4},
+};
+
+static const UINT16 pixel_aspect[16][2]={ /* 16 pairs; presumably {width, height} of one pixel, indexed by the 4-bit aspect_ratio_info code — TODO confirm against the reader of this table */
+ {0, 0}, /* index 0: no ratio stored */
+ {1, 1},
+ {12, 11},
+ {10, 11},
+ {16, 11},
+ {40, 33},
+ {0, 0}, /* indices 6..15: all {0, 0}; presumably reserved or signalled some other way — TODO confirm */
+ {0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0},
+ {0, 0},
+};
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index a8297a2c1..9f572c3d9 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -15,19 +15,20 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * 4MV & hq encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  */
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
-
-#include "config.h"
-#include "xine-utils/xineutils.h"
-
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
 
+#include "config.h"
+#include "xine-utils/xineutils.h"
+
 #ifdef USE_FASTMEMCPY
 #include "fastmemcpy.h"
 #endif
@@ -71,6 +72,9 @@ static UINT8 h263_chroma_roundtab[16] = {
     0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
 };
 
+static UINT16 default_mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
+static UINT8 default_fcode_tab[MAX_MV*2+1];
+
 /* default motion estimation */
 int motion_estimation_method = ME_LOG;
 
@@ -175,7 +179,7 @@ int MPV_common_init(MpegEncContext *s)
         }
     }
     
-    if (s->out_format == FMT_H263) {
+    if (s->out_format == FMT_H263 || s->encoding) {
         int size;
         /* MV prediction */
         size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
@@ -228,6 +232,8 @@ int MPV_common_init(MpegEncContext *s)
         if (!s->mbskip_table)
             goto fail;
     }
+    
+    s->block= s->intra_block;
 
     s->context_initialized = 1;
     return 0;
@@ -280,6 +286,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     int i;
 
     s->bit_rate = avctx->bit_rate;
+    s->bit_rate_tolerance = avctx->bit_rate_tolerance;
     s->frame_rate = avctx->frame_rate;
     s->width = avctx->width;
     s->height = avctx->height;
@@ -288,7 +295,14 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->rtp_payload_size = avctx->rtp_payload_size;
     if (avctx->rtp_callback)
         s->rtp_callback = avctx->rtp_callback;
+    s->qmin= avctx->qmin;
+    s->qmax= avctx->qmax;
+    s->max_qdiff= avctx->max_qdiff;
+    s->qcompress= avctx->qcompress;
+    s->qblur= avctx->qblur;
     s->avctx = avctx;
+    s->aspect_ratio_info= avctx->aspect_ratio_info;
+    s->flags= avctx->flags;
     
     if (s->gop_size <= 1) {
         s->intra_only = 1;
@@ -344,18 +358,59 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
         break;
-    case CODEC_ID_MSMPEG4:
+    case CODEC_ID_MSMPEG4V1:
+        s->out_format = FMT_H263;
+        s->h263_msmpeg4 = 1;
+        s->h263_pred = 1;
+        s->unrestricted_mv = 1;
+        s->msmpeg4_version= 1;
+        break;
+    case CODEC_ID_MSMPEG4V2:
         s->out_format = FMT_H263;
         s->h263_msmpeg4 = 1;
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
+        s->msmpeg4_version= 2;
+        break;
+    case CODEC_ID_MSMPEG4V3:
+        s->out_format = FMT_H263;
+        s->h263_msmpeg4 = 1;
+        s->h263_pred = 1;
+        s->unrestricted_mv = 1;
+        s->msmpeg4_version= 3;
         break;
     default:
         return -1;
     }
+    
+    if((s->flags&CODEC_FLAG_4MV) && !(s->flags&CODEC_FLAG_HQ)){
+        printf("4MV is currently only supported in HQ mode\n");
+        return -1;
+    }
+
+    { /* set up some safe defaults, some codecs might override them later */
+        static int done=0;
+        if(!done){
+            int i;
+            done=1;
+            memset(default_mv_penalty, 0, sizeof(UINT16)*(MAX_FCODE+1)*(2*MAX_MV+1));
+            memset(default_fcode_tab , 0, sizeof(UINT8)*(2*MAX_MV+1));
+
+            for(i=-16; i<16; i++){
+                default_fcode_tab[i + MAX_MV]= 1;
+            }
+        }
+    }
+    s->mv_penalty= default_mv_penalty;
+    s->fcode_tab= default_fcode_tab;
 
     if (s->out_format == FMT_H263)
-        h263_encode_init_vlc(s);
+        h263_encode_init(s);
+    else if (s->out_format == FMT_MPEG1)
+        mpeg1_encode_init(s);
+
+    /* don't use mv_penalty table for crap MV as it would be confused */
+    if(s->full_search<4) s->mv_penalty= default_mv_penalty;
 
     s->encoding = 1;
 
@@ -373,6 +428,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     rate_control_init(s);
 
     s->picture_number = 0;
+    s->picture_in_gop_number = 0;
     s->fake_picture_number = 0;
     /* motion detector init */
     s->f_code = 1;
@@ -434,6 +490,7 @@ void MPV_frame_start(MpegEncContext *s)
             s->current_picture[i] = s->aux_picture[i];
         }
     } else {
+        s->last_non_b_pict_type= s->pict_type;
         for(i=0;i<3;i++) {
             /* swap next and last */
             tmp = s->last_picture[i];
@@ -475,16 +532,18 @@ int MPV_encode_picture(AVCodecContext *avctx,
 
     init_put_bits(&s->pb, buf, buf_size, NULL, NULL);
 
+    s->force_type= (avctx->flags&CODEC_FLAG_TYPE) ?
+	(avctx->key_frame ? I_TYPE : P_TYPE) : 0;
     if (!s->intra_only) {
         /* first picture of GOP is intra */
-        if ((s->picture_number % s->gop_size) == 0)
+        if (s->picture_in_gop_number % s->gop_size==0 || s->force_type==I_TYPE){
+            s->picture_in_gop_number=0;
             s->pict_type = I_TYPE;
-        else
+        }else
             s->pict_type = P_TYPE;
     } else {
         s->pict_type = I_TYPE;
     }
-    avctx->key_frame = (s->pict_type == I_TYPE);
     
     MPV_frame_start(s);
     
@@ -515,15 +574,30 @@ int MPV_encode_picture(AVCodecContext *avctx,
     }
 
     encode_picture(s, s->picture_number);
-    
+    avctx->key_frame = (s->pict_type == I_TYPE);
+    avctx->header_bits = s->header_bits;
+    avctx->mv_bits     = s->mv_bits;
+    avctx->misc_bits   = s->misc_bits;
+    avctx->i_tex_bits  = s->i_tex_bits;
+    avctx->p_tex_bits  = s->p_tex_bits;
+    avctx->i_count     = s->i_count;
+    avctx->p_count     = s->p_count;
+    avctx->skip_count  = s->skip_count;
+
     MPV_frame_end(s);
     s->picture_number++;
+    s->picture_in_gop_number++;
 
     if (s->out_format == FMT_MJPEG)
         mjpeg_picture_trailer(s);
 
     flush_put_bits(&s->pb);
-    s->total_bits += (pbBufPtr(&s->pb) - s->pb.buf) * 8;
+    s->last_frame_bits= s->frame_bits;
+    s->frame_bits  = (pbBufPtr(&s->pb) - s->pb.buf) * 8;
+    s->total_bits += s->frame_bits;
+    avctx->frame_bits  = s->frame_bits;
+//printf("fcode: %d, type: %d, head: %d, mv: %d, misc: %d, frame: %d, itex: %d, ptex: %d\n", 
+//s->f_code, avctx->key_frame, s->header_bits, s->mv_bits, s->misc_bits, s->frame_bits, s->i_tex_bits, s->p_tex_bits);
 
     avctx->quality = s->qscale;
     if (avctx->get_psnr) {
@@ -555,7 +629,7 @@ static inline void gmc1_motion(MpegEncContext *s,
     int dxy, offset, mx, my, src_x, src_y, height, linesize;
     int motion_x, motion_y;
 
-    if(s->real_sprite_warping_points>1) printf("Oops, thats bad, contact the developers\n");
+    if(s->real_sprite_warping_points>1) printf("more than 1 warp point isnt supported\n");
     motion_x= s->sprite_offset[0][0];
     motion_y= s->sprite_offset[0][1];
     src_x = s->mb_x * 16 + (motion_x >> (s->sprite_warping_accuracy+1));
@@ -749,7 +823,7 @@ static inline void MPV_motion(MpegEncContext *s,
                         ref_picture, 0,
                         16);
 #endif
-        }else if(s->quarter_sample){
+        }else if(s->quarter_sample && dir==0){ //FIXME
             qpel_motion(s, dest_y, dest_cb, dest_cr, 0,
                         ref_picture, 0,
                         0, pix_op, qpix_op,
@@ -768,7 +842,7 @@ static inline void MPV_motion(MpegEncContext *s,
 
             dxy = ((motion_y & 1) << 1) | (motion_x & 1);
             src_x = mb_x * 16 + (motion_x >> 1) + (i & 1) * 8;
-            src_y = mb_y * 16 + (motion_y >> 1) + ((i >> 1) & 1) * 8;
+            src_y = mb_y * 16 + (motion_y >> 1) + (i >>1) * 8;
                     
             /* WARNING: do no forget half pels */
             src_x = clip(src_x, -16, s->width);
@@ -934,8 +1008,9 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
     else if (s->h263_pred || s->h263_aic)
         s->mbintra_table[mb_x + mb_y*s->mb_width]=1;
 
-    /* update motion predictor */
+    /* update motion predictor, not for B-frames as they need the motion_val from the last P/S-Frame */
     if (s->out_format == FMT_H263) {
+      if(s->pict_type!=B_TYPE){
         int xy, wrap, motion_x, motion_y;
         
         wrap = 2 * s->mb_width + 2;
@@ -958,6 +1033,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             s->motion_val[xy + 1 + wrap][0] = motion_x;
             s->motion_val[xy + 1 + wrap][1] = motion_y;
         }
+      }
     }
     
     if (!s->intra_only) {
@@ -1031,16 +1107,326 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         }
     }
  the_end:
-    emms_c();
+    emms_c(); //FIXME remove
 }
 
-static void encode_picture(MpegEncContext *s, int picture_number)
+static void encode_mb(MpegEncContext *s) /* Encode the macroblock at (s->mb_x, s->mb_y); the caller picks the mode via s->mb_intra, s->mv_type and s->mv[0][]. */
 {
-    int mb_x, mb_y, wrap, last_gob, pdif = 0;
+    int wrap;
+    const int mb_x= s->mb_x;
+    const int mb_y= s->mb_y;
     UINT8 *ptr;
-    int i, motion_x, motion_y;
+    const int motion_x= s->mv[0][0][0]; /* first forward vector; also what the bitstream writers at the end receive */
+    const int motion_y= s->mv[0][0][1];
+    int i;
+
+    /* get the pixels: four 8x8 blocks from new_picture[0], then one 8x8 block from each of new_picture[1] and new_picture[2] */
+    wrap = s->linesize;
+    ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16;
+    get_pixels(s->block[0], ptr, wrap);
+    get_pixels(s->block[1], ptr + 8, wrap);
+    get_pixels(s->block[2], ptr + 8 * wrap, wrap);
+    get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap);
+    wrap = s->linesize >> 1;
+    ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8;
+    get_pixels(s->block[4], ptr, wrap);
+
+    wrap = s->linesize >> 1;
+    ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8;
+    get_pixels(s->block[5], ptr, wrap);
+
+    /* subtract previous frame if non intra */
+    if (!s->mb_intra) {
+        int dxy, offset, mx, my;
+        
+        if(s->mv_type==MV_TYPE_16X16){ /* one vector for the whole macroblock */
+            dxy = ((motion_y & 1) << 1) | (motion_x & 1); /* bit 0 / bit 1 = low bit of motion_x / motion_y, i.e. the part dropped by the >>1 below */
+            ptr = s->last_picture[0] + 
+                ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + 
+                (mb_x * 16 + (motion_x >> 1));
+
+            sub_pixels_2(s->block[0], ptr, s->linesize, dxy);
+            sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy);
+            sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy);
+            sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy);
+
+            if (s->out_format == FMT_H263) {
+                /* special rounding for h263 */
+                dxy = 0;
+                if ((motion_x & 3) != 0)
+                    dxy |= 1;
+                if ((motion_y & 3) != 0)
+                    dxy |= 2;
+                mx = motion_x >> 2;
+                my = motion_y >> 2;
+            } else {
+                mx = motion_x / 2;
+                my = motion_y / 2;
+                dxy = ((my & 1) << 1) | (mx & 1);
+                mx >>= 1;
+                my >>= 1;
+            }
+            offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx);
+            ptr = s->last_picture[1] + offset;
+            sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
+            ptr = s->last_picture[2] + offset;
+            sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
+        }else{ /* otherwise four vectors, s->mv[0][0..3], one per 8x8 block of plane 0 */
+            int src_x, src_y;
+
+            for(i=0;i<4;i++) {
+                int motion_x = s->mv[0][i][0]; /* shadows the function-level motion_x / motion_y */
+                int motion_y = s->mv[0][i][1];
+
+                dxy = ((motion_y & 1) << 1) | (motion_x & 1);
+                src_x = mb_x * 16 + (motion_x >> 1) + (i & 1) * 8;
+                src_y = mb_y * 16 + (motion_y >> 1) + (i >>1) * 8;
+                        
+                ptr = s->last_picture[0] + (src_y * s->linesize) + (src_x); /* NOTE(review): src_x/src_y are not clipped here, unlike the position computed for blocks 4 and 5 below — confirm these vectors cannot point outside the reference buffer */
+                sub_pixels_2(s->block[i], ptr, s->linesize, dxy);
+            }
+            /* In case of 8X8, we construct a single chroma motion vector
+               with a special rounding */
+            mx = 0;
+            my = 0;
+            for(i=0;i<4;i++) {
+                mx += s->mv[0][i][0];
+                my += s->mv[0][i][1];
+            }
+            if (mx >= 0)
+                mx = (h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
+            else {
+                mx = -mx;
+                mx = -(h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
+            }
+            if (my >= 0)
+                my = (h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1));
+            else {
+                my = -my;
+                my = -(h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1));
+            }
+            dxy = ((my & 1) << 1) | (mx & 1);
+            mx >>= 1;
+            my >>= 1;
+
+            src_x = mb_x * 8 + mx;
+            src_y = mb_y * 8 + my;
+            src_x = clip(src_x, -8, s->width/2);
+            if (src_x == s->width/2)
+                dxy &= ~1;
+            src_y = clip(src_y, -8, s->height/2);
+            if (src_y == s->height/2)
+                dxy &= ~2;
+            
+            offset = (src_y * (s->linesize >> 1)) + src_x;
+            ptr = s->last_picture[1] + offset;
+            sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
+            ptr = s->last_picture[2] + offset;
+            sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
+        }
+    }
+            
+#if 0
+            {
+                float adap_parm;
+                
+                adap_parm = ((s->avg_mb_var << 1) + s->mb_var[s->mb_width*mb_y+mb_x] + 1.0) /
+                            ((s->mb_var[s->mb_width*mb_y+mb_x] << 1) + s->avg_mb_var + 1.0);
+            
+                printf("\ntype=%c qscale=%2d adap=%0.2f dquant=%4.2f var=%4d avgvar=%4d", 
+                        (s->mb_type[s->mb_width*mb_y+mb_x] > 0) ? 'I' : 'P', 
+                        s->qscale, adap_parm, s->qscale*adap_parm,
+                        s->mb_var[s->mb_width*mb_y+mb_x], s->avg_mb_var);
+            }
+#endif
+    /* DC scale selection (codec-specific helper, else fixed 8), then DCT & quantize all six blocks */
+    if (s->h263_msmpeg4) {
+        msmpeg4_dc_scale(s);
+    } else if (s->h263_pred) {
+        h263_dc_scale(s);
+    } else {
+        /* default quantization values */
+        s->y_dc_scale = 8;
+        s->c_dc_scale = 8;
+    }
+    for(i=0;i<6;i++) {
+        s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale);
+    }
+
+    /* huffman encode; only s->mv[0][0] (motion_x/motion_y) is passed on, even when s->mv_type is MV_TYPE_8X8 */
+    switch(s->out_format) {
+    case FMT_MPEG1:
+        mpeg1_encode_mb(s, s->block, motion_x, motion_y);
+        break;
+    case FMT_H263:
+        if (s->h263_msmpeg4)
+            msmpeg4_encode_mb(s, s->block, motion_x, motion_y);
+        else if(s->h263_pred)
+            mpeg4_encode_mb(s, s->block, motion_x, motion_y);
+        else
+            h263_encode_mb(s, s->block, motion_x, motion_y);
+        break;
+    case FMT_MJPEG:
+        mjpeg_encode_mb(s, s->block);
+        break;
+    }
+}
+
+static void copy_bits(PutBitContext *pb, UINT8 *src, int length)
+{   /* append the first `length` bits of src[] to pb: whole bytes first, then the top bits of the next byte */
+    int bytes= length>>3; /* number of complete bytes */
+    int bits= length&7;   /* leftover bits, taken from the high end of src[bytes] */
+    int i;
+
+    for(i=0; i<bytes; i++) put_bits(pb, 8, src[i]);
+    if(bits) put_bits(pb, bits, src[i]>>(8-bits)); /* byte-aligned length: nothing left, and src[bytes] lies past the data, so do not read it */
+}
+
+static void encode_picture(MpegEncContext *s, int picture_number)
+{
+    int mb_x, mb_y, last_gob, pdif = 0;
+    int i;
+    int bits;
+    MpegEncContext best_s;
+    UINT8 bit_buf[4][3000]; //FIXME check that this is ALWAYS large enough for a MB
 
     s->picture_number = picture_number;
+
+    s->block_wrap[0]=
+    s->block_wrap[1]=
+    s->block_wrap[2]=
+    s->block_wrap[3]= s->mb_width*2 + 2;
+    s->block_wrap[4]=
+    s->block_wrap[5]= s->mb_width + 2;
+    
+    s->last_mc_mb_var = s->mc_mb_var;
+    /* Reset the average MB variance */
+    s->avg_mb_var = 0;
+    s->mc_mb_var = 0;
+    /* Estimate motion for every MB */
+    if(s->pict_type == P_TYPE){
+        for(mb_y=0; mb_y < s->mb_height; mb_y++) {
+            s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
+            s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
+            s->block_index[2]= s->block_wrap[0]*(mb_y*2 + 2) - 1;
+            s->block_index[3]= s->block_wrap[0]*(mb_y*2 + 2);
+            for(mb_x=0; mb_x < s->mb_width; mb_x++) {
+                s->mb_x = mb_x;
+                s->mb_y = mb_y;
+                s->block_index[0]+=2;
+                s->block_index[1]+=2;
+                s->block_index[2]+=2;
+                s->block_index[3]+=2;
+
+                /* compute motion vector & mb_type and store in context */
+                estimate_motion(s, mb_x, mb_y);
+//                s->mb_type[mb_y*s->mb_width + mb_x]=MB_TYPE_INTER;
+            }
+        }
+        emms_c();
+    }else{
+        /* I-Frame */
+        //FIXME do we need to zero them?
+        memset(s->motion_val[0], 0, sizeof(INT16)*(s->mb_width*2 + 2)*(s->mb_height*2 + 2)*2);
+        memset(s->mv_table[0]  , 0, sizeof(INT16)*s->mb_width*s->mb_height);
+        memset(s->mv_table[1]  , 0, sizeof(INT16)*s->mb_width*s->mb_height);
+        memset(s->mb_type      , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
+    }
+
+    if(s->avg_mb_var < s->mc_mb_var && s->pict_type != B_TYPE && (!s->force_type)){ //FIXME subtract MV bits
+        s->pict_type= I_TYPE;
+        s->picture_in_gop_number=0;
+        memset(s->mb_type   , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
+//printf("Scene change detected, encoding as I Frame\n");
+    }
+
+    /* find best f_code for ME which do unlimited searches */
+    if(s->pict_type==P_TYPE && s->full_search>3){
+        int mv_num[8];
+        int i;
+        int loose=0;
+        UINT8 * fcode_tab= s->fcode_tab;
+
+        for(i=0; i<8; i++) mv_num[i]=0;
+
+        for(i=0; i<s->mb_num; i++){
+            if(s->mb_type[i] & MB_TYPE_INTER){
+                mv_num[ fcode_tab[s->mv_table[0][i] + MAX_MV] ]++;
+                mv_num[ fcode_tab[s->mv_table[1][i] + MAX_MV] ]++;
+//printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i);
+            }
+//else printf("I");
+        }
+
+        for(i=MAX_FCODE; i>1; i--){
+            loose+= mv_num[i];
+            if(loose > 10) break; //FIXME this is pretty ineffective
+        }
+        s->f_code= i;
+/*        for(i=0; i<=MAX_FCODE; i++){
+            printf("%d ", mv_num[i]);
+        }
+        printf("\n");*/
+    }else{
+        s->f_code= 1;
+    }
+
+//printf("f_code %d ///\n", s->f_code);
+    /* convert MBs with too long MVs to I-Blocks */
+    if(s->pict_type==P_TYPE){
+        int i, x, y;
+        const int f_code= s->f_code;
+        UINT8 * fcode_tab= s->fcode_tab;
+//FIXME try to clip instead of intra izing ;)
+        /* clip / convert to intra 16x16 type MVs */
+        for(i=0; i<s->mb_num; i++){
+            if(s->mb_type[i]&MB_TYPE_INTER){
+                if(   fcode_tab[s->mv_table[0][i] + MAX_MV] > f_code
+                   || fcode_tab[s->mv_table[0][i] + MAX_MV] == 0
+                   || fcode_tab[s->mv_table[1][i] + MAX_MV] > f_code
+                   || fcode_tab[s->mv_table[1][i] + MAX_MV] == 0 ){
+                    s->mb_type[i] &= ~MB_TYPE_INTER;
+                    s->mb_type[i] |= MB_TYPE_INTRA;
+                    s->mv_table[0][i] = 0;
+                    s->mv_table[1][i] = 0;
+                }
+            }
+        }
+
+        if(s->flags&CODEC_FLAG_4MV){ /* 4MV allowed: also vet the per-8x8-block vectors kept in s->motion_val */
+            int wrap= 2+ s->mb_width*2; /* row stride used to index s->motion_val */
+
+            /* clip / convert to intra 8x8 type MVs */
+            for(y=0; y<s->mb_height; y++){
+                int xy= (y*2 + 1)*wrap + 1; /* motion_val index of the first 8x8 block of macroblock (0, y) */
+                i= y*s->mb_width;           /* mb_type index of macroblock (0, y) */
+
+                for(x=0; x<s->mb_width; x++){
+                    if(s->mb_type[i]&MB_TYPE_INTER4V){
+                        int block;
+                        for(block=0; block<4; block++){
+                            int off= (block& 1) + (block>>1)*wrap;
+                            int mx= s->motion_val[ xy + off ][0];
+                            int my= s->motion_val[ xy + off ][1];
+                            /* drop the 4MV candidate if a component needs a larger f_code than chosen, or maps to 0 in fcode_tab */
+                            if(   fcode_tab[mx + MAX_MV] > f_code
+                               || fcode_tab[mx + MAX_MV] == 0
+                               || fcode_tab[my + MAX_MV] > f_code
+                               || fcode_tab[my + MAX_MV] == 0 ){
+                                s->mb_type[i] &= ~MB_TYPE_INTER4V;
+                                s->mb_type[i] |= MB_TYPE_INTRA;
+                            }
+                        }
+                    }
+                    xy+=2; /* advance for EVERY macroblock; inside the if above, the scan stalled on the first non-4MV MB of a row */
+                    i++;
+                }
+            }
+        }
+    }
+
+//    printf("%d %d\n", s->avg_mb_var, s->mc_mb_var);
+
     if (!s->fixed_qscale) 
         s->qscale = rate_estimate_qscale(s);
 
@@ -1056,6 +1442,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         convert_matrix(s->q_non_intra_matrix, s->q_non_intra_matrix16, s->non_intra_matrix, s->qscale);
     }
 
+    s->last_bits= get_bit_count(&s->pb);
     switch(s->out_format) {
     case FMT_MJPEG:
         mjpeg_picture_header(s);
@@ -1074,7 +1461,17 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         mpeg1_encode_picture_header(s, picture_number);
         break;
     }
-        
+    bits= get_bit_count(&s->pb);
+    s->header_bits= bits - s->last_bits;
+    s->last_bits= bits;
+    s->mv_bits=0;
+    s->misc_bits=0;
+    s->i_tex_bits=0;
+    s->p_tex_bits=0;
+    s->i_count=0;
+    s->p_count=0;
+    s->skip_count=0;
+
     /* init last dc values */
     /* note: quant matrix value (8) is implied here */
     s->last_dc[0] = 128;
@@ -1083,8 +1480,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     s->mb_incr = 1;
     s->last_mv[0][0][0] = 0;
     s->last_mv[0][0][1] = 0;
-    s->mv_type = MV_TYPE_16X16;
-    s->mv_dir = MV_DIR_FORWARD;
 
     /* Get the GOB height based on picture height */
     if (s->out_format == FMT_H263 && !s->h263_pred && !s->h263_msmpeg4) {
@@ -1095,33 +1490,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         else
             s->gob_index = 4;
     }
-    
-    /* Reset the average MB variance */
-    s->avg_mb_var = 0;
-    
-    /* Estimate motion for every MB */
-    for(mb_y=0; mb_y < s->mb_height; mb_y++) {
-        for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-            s->mb_x = mb_x;
-            s->mb_y = mb_y;
-
-            /* compute motion vector and macro block type (intra or non intra) */
-            motion_x = 0;
-            motion_y = 0;
-            if (s->pict_type == P_TYPE) {
-                s->mb_intra = estimate_motion(s, mb_x, mb_y,
-                                              &motion_x,
-                                              &motion_y);
-            } else {
-                s->mb_intra = 1;
-            }
-            /* Store MB type and MV */
-            s->mb_type[mb_y * s->mb_width + mb_x] = s->mb_intra;
-            s->mv_table[0][mb_y * s->mb_width + mb_x] = motion_x;
-            s->mv_table[1][mb_y * s->mb_width + mb_x] = motion_y;
-        }
-    }
-    
+        
     s->avg_mb_var = s->avg_mb_var / s->mb_num;        
     
     for(mb_y=0; mb_y < s->mb_height; mb_y++) {
@@ -1139,127 +1508,134 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             }
         }
         
+        s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
+        s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
+        s->block_index[2]= s->block_wrap[0]*(mb_y*2 + 2) - 1;
+        s->block_index[3]= s->block_wrap[0]*(mb_y*2 + 2);
+        s->block_index[4]= s->block_wrap[4]*(mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2);
+        s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
+            const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
+            PutBitContext pb;
+            int d;
+            int dmin=10000000;
+            int best=0;
 
             s->mb_x = mb_x;
             s->mb_y = mb_y;
-#if 0
-            /* compute motion vector and macro block type (intra or non intra) */
-            motion_x = 0;
-            motion_y = 0;
-            if (s->pict_type == P_TYPE) {
-                s->mb_intra = estimate_motion(s, mb_x, mb_y,
-                                              &motion_x,
-                                              &motion_y);
-            } else {
-                s->mb_intra = 1;
-            }
-#endif
-
-            s->mb_intra = s->mb_type[mb_y * s->mb_width + mb_x];
-            motion_x = s->mv_table[0][mb_y * s->mb_width + mb_x];
-            motion_y = s->mv_table[1][mb_y * s->mb_width + mb_x];
-            
-            /* get the pixels */
-            wrap = s->linesize;
-            ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16;
-            get_pixels(s->block[0], ptr, wrap);
-            get_pixels(s->block[1], ptr + 8, wrap);
-            get_pixels(s->block[2], ptr + 8 * wrap, wrap);
-            get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap);
-            wrap = s->linesize >> 1;
-            ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8;
-            get_pixels(s->block[4], ptr, wrap);
-
-            wrap = s->linesize >> 1;
-            ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8;
-            get_pixels(s->block[5], ptr, wrap);
-
-            /* subtract previous frame if non intra */
-            if (!s->mb_intra) {
-                int dxy, offset, mx, my;
-
-                dxy = ((motion_y & 1) << 1) | (motion_x & 1);
-                ptr = s->last_picture[0] + 
-                    ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + 
-                    (mb_x * 16 + (motion_x >> 1));
-
-                sub_pixels_2(s->block[0], ptr, s->linesize, dxy);
-                sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy);
-                sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy);
-                sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy);
-
-                if (s->out_format == FMT_H263) {
-                    /* special rounding for h263 */
-                    dxy = 0;
-                    if ((motion_x & 3) != 0)
-                        dxy |= 1;
-                    if ((motion_y & 3) != 0)
-                        dxy |= 2;
-                    mx = motion_x >> 2;
-                    my = motion_y >> 2;
-                } else {
-                    mx = motion_x / 2;
-                    my = motion_y / 2;
-                    dxy = ((my & 1) << 1) | (mx & 1);
-                    mx >>= 1;
-                    my >>= 1;
+            s->block_index[0]+=2;
+            s->block_index[1]+=2;
+            s->block_index[2]+=2;
+            s->block_index[3]+=2;
+            s->block_index[4]++;
+            s->block_index[5]++;
+
+            s->mv_dir = MV_DIR_FORWARD;
+            if(mb_type & (mb_type-1)){ // more than 1 MB type possible
+                pb= s->pb;
+                if(mb_type&MB_TYPE_INTER){
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x];
+                    s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x];
+                    init_put_bits(&s->pb, bit_buf[1], 3000, NULL, NULL);
+                    s->block= s->inter_block;
+
+                    encode_mb(s);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        best_s.mv[0][0][0]= s->mv[0][0][0];
+                        best_s.mv[0][0][1]= s->mv[0][0][1];
+                        best_s.mb_intra= 0;
+                        best_s.mv_type = MV_TYPE_16X16;
+                        best_s.pb=s->pb;
+                        best_s.block= s->block;
+                        best=1;
+                        for(i=0; i<6; i++)
+                            best_s.block_last_index[i]= s->block_last_index[i];
+                    }
                 }
-                offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx);
-                ptr = s->last_picture[1] + offset;
-                sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
-                ptr = s->last_picture[2] + offset;
-                sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
-            }
-            emms_c();
-            
-#if 0
-            {
-                float adap_parm;
-                
-                adap_parm = ((s->avg_mb_var << 1) + s->mb_var[s->mb_width*mb_y+mb_x] + 1.0) /
-                            ((s->mb_var[s->mb_width*mb_y+mb_x] << 1) + s->avg_mb_var + 1.0);
-            
-                printf("\ntype=%c qscale=%2d adap=%0.2f dquant=%4.2f var=%4d avgvar=%4d", 
-                        (s->mb_type[s->mb_width*mb_y+mb_x] > 0) ? 'I' : 'P', 
-                        s->qscale, adap_parm, s->qscale*adap_parm,
-                        s->mb_var[s->mb_width*mb_y+mb_x], s->avg_mb_var);
-            }
-#endif
-            /* DCT & quantize */
-            if (s->h263_msmpeg4) {
-                msmpeg4_dc_scale(s);
-            } else if (s->h263_pred) {
-                h263_dc_scale(s);
+                if(mb_type&MB_TYPE_INTER4V){
+                    s->mv_type = MV_TYPE_8X8;
+                    s->mb_intra= 0;
+                    for(i=0; i<4; i++){
+                        s->mv[0][i][0] = s->motion_val[s->block_index[i]][0];
+                        s->mv[0][i][1] = s->motion_val[s->block_index[i]][1];
+                    }
+                    init_put_bits(&s->pb, bit_buf[2], 3000, NULL, NULL);
+                    s->block= s->inter4v_block;
+
+                    encode_mb(s);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        for(i=0; i<4; i++){
+                            best_s.mv[0][i][0] = s->mv[0][i][0];
+                            best_s.mv[0][i][1] = s->mv[0][i][1];
+                        }
+                        best_s.mb_intra= 0;
+                        best_s.mv_type = MV_TYPE_8X8;
+                        best_s.pb=s->pb;
+                        best_s.block= s->block;
+                        best=2;
+                        for(i=0; i<6; i++)
+                            best_s.block_last_index[i]= s->block_last_index[i];
+                    }
+                }
+                if(mb_type&MB_TYPE_INTRA){
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 1;
+                    s->mv[0][0][0] = 0;
+                    s->mv[0][0][1] = 0;
+                    init_put_bits(&s->pb, bit_buf[0], 3000, NULL, NULL);
+                    s->block= s->intra_block;
+                   
+                    encode_mb(s);
+                    d= get_bit_count(&s->pb);
+                    if(d<dmin){
+                        flush_put_bits(&s->pb);
+                        dmin=d;
+                        best_s.mv[0][0][0]= 0;
+                        best_s.mv[0][0][1]= 0;
+                        best_s.mb_intra= 1;
+                        best_s.mv_type = MV_TYPE_16X16;
+                        best_s.pb=s->pb;
+                        best_s.block= s->block;
+                        for(i=0; i<6; i++)
+                            best_s.block_last_index[i]= s->block_last_index[i];
+                        best=0;
+                    }
+                    /* force cleaning of ac/dc if needed ... */
+                    s->mbintra_table[mb_x + mb_y*s->mb_width]=1;
+                }
+                for(i=0; i<4; i++){
+                   s->mv[0][i][0] =  best_s.mv[0][i][0];
+                   s->mv[0][i][1] =  best_s.mv[0][i][1];
+                }
+                s->mb_intra= best_s.mb_intra;
+                s->mv_type= best_s.mv_type;
+                for(i=0; i<6; i++)
+                   s->block_last_index[i]= best_s.block_last_index[i];
+                copy_bits(&pb, bit_buf[best], dmin);
+                s->block= best_s.block;
+                s->pb= pb;
             } else {
-                /* default quantization values */
-                s->y_dc_scale = 8;
-                s->c_dc_scale = 8;
-            }
-            for(i=0;i<6;i++) {
-                s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale);
-            }
-
-            /* huffman encode */
-            switch(s->out_format) {
-            case FMT_MPEG1:
-                mpeg1_encode_mb(s, s->block, motion_x, motion_y);
-                break;
-            case FMT_H263:
-                if (s->h263_msmpeg4)
-                    msmpeg4_encode_mb(s, s->block, motion_x, motion_y);
-                else
-                    h263_encode_mb(s, s->block, motion_x, motion_y);
-                break;
-            case FMT_MJPEG:
-                mjpeg_encode_mb(s, s->block);
-                break;
+                // only one MB-Type possible
+                if(mb_type&MB_TYPE_INTRA){
+                    s->mb_intra= 1;
+                    s->mv[0][0][0] = 0;
+                    s->mv[0][0][1] = 0;
+                }else{
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x];
+                    s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x];
+                }
+                encode_mb(s);
             }
 
-            /* decompress blocks so that we keep the state of the decoder */
-            s->mv[0][0][0] = motion_x;
-            s->mv[0][0][1] = motion_y;
-
             MPV_decode_mb(s, s->block);
         }
 
@@ -1277,7 +1653,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->first_gob_line = 0;
         }
     }
-    
+    emms_c();
+
     if (s->h263_msmpeg4 && s->pict_type == I_TYPE)
         msmpeg4_encode_ext_header(s);
 
@@ -1294,7 +1671,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         s->ptr_lastgob = pbBufPtr(&s->pb);
         //fprintf(stderr,"\nGOB: %2d size: %d (last)", s->gob_number, pdif);
     }
-
 }
 
 static int dct_quantize_c(MpegEncContext *s, 
@@ -1523,6 +1899,22 @@ static void dct_unquantize_h263_c(MpegEncContext *s,
 
 static void rate_control_init(MpegEncContext *s)
 {
+#if 1
+    emms_c();
+
+    //initial values, they don't really matter as they will be totally different within a few frames
+    s->i_pred.coeff= s->p_pred.coeff= 7.0;
+    s->i_pred.count= s->p_pred.count= 1.0;
+    
+    s->i_pred.decay= s->p_pred.decay= 0.4;
+    
+    // use more bits at the beginning, otherwise high motion at the begin will look like shit
+    s->qsum=100;
+    s->qcount=100;
+
+    s->short_term_qsum=0.001;
+    s->short_term_qcount=0.001;
+#else
     s->wanted_bits = 0;
 
     if (s->intra_only) {
@@ -1533,24 +1925,123 @@ static void rate_control_init(MpegEncContext *s)
                                  (float)((float)s->frame_rate / FRAME_RATE_BASE * (I_FRAME_SIZE_RATIO + s->gop_size - 1)));
         s->I_frame_bits = (int)(s->P_frame_bits * I_FRAME_SIZE_RATIO);
     }
-    
+
 #if defined(DEBUG)
     printf("I_frame_size=%d P_frame_size=%d\n",
            s->I_frame_bits, s->P_frame_bits);
 #endif
+#endif
+}
+
+static double predict(Predictor *p, double q, double var) /* estimate for quantizer q and variance var; rate_estimate_qscale compares the result with bit_rate/fps */
+{
+    return p->coeff*var / (q*p->count); /* (coeff/count) * var / q; no guard here against q == 0 or count == 0 */
 }
 
+static void update_predictor(Predictor *p, double q, double var, double size) /* fold one observed (q, var, size) sample into p */
+{
+    double new_coeff= size*q / (var + 1); /* inverse of predict(): size*q/var; the +1 avoids dividing by zero for var == 0 */
+    if(var<1000) return; /* samples with var below 1000 leave the predictor unchanged */
+/*{
+int pred= predict(p, q, var);
+int error= abs(pred-size);
+static double sum=0;
+static int count=0;
+if(count>5) sum+=error;
+count++;
+if(256*256*256*64%count==0){
+    printf("%d %f %f\n", count, sum/count, p->coeff);
+}
+}*/
+    p->count*= p->decay; /* scale down the weight of everything accumulated so far ... */
+    p->coeff*= p->decay; /* ... in both the sample count and the coefficient sum */
+    p->count++; /* the new sample is added with weight 1 */
+    p->coeff+= new_coeff;
+}
 
-/*
- * This heuristic is rather poor, but at least we do not have to
- * change the qscale at every macroblock.
- */
 static int rate_estimate_qscale(MpegEncContext *s)
 {
-    INT64 diff, total_bits = s->total_bits;
+#if 1
+    int qmin= s->qmin;
+    int qmax= s->qmax;
+    int rate_q=5;
     float q;
-    int qscale, qmin;
+    int qscale;
+    float br_compensation;
+    double diff;
+    double short_term_q;
+    double long_term_q;
+    int last_qscale= s->qscale;
+    double fps;
+    INT64 wanted_bits;
+    emms_c();
+
+    fps= (double)s->frame_rate / FRAME_RATE_BASE;
+    wanted_bits= s->bit_rate*(double)s->picture_number/fps;
+
+    
+    if(s->picture_number>2){
+        /* update predictors */
+        if(s->last_pict_type == I_TYPE){
+        //FIXME
+        }else{ //P Frame
+//printf("%d %d %d %f\n", s->qscale, s->last_mc_mb_var, s->frame_bits, s->p_pred.coeff);
+            update_predictor(&s->p_pred, s->qscale, s->last_mc_mb_var, s->frame_bits);
+        }
+    }
+
+    if(s->pict_type == I_TYPE){
+        //FIXME
+        rate_q= s->qsum/s->qcount;
+    }else{ //P Frame
+        int i;
+        int diff, best_diff=1000000000;
+        for(i=1; i<=31; i++){
+            diff= predict(&s->p_pred, i, s->mc_mb_var) - (double)s->bit_rate/fps;
+            if(diff<0) diff= -diff;
+            if(diff<best_diff){
+                best_diff= diff;
+                rate_q= i;
+            }
+        }
+    }
+
+    s->short_term_qsum*=s->qblur;
+    s->short_term_qcount*=s->qblur;
+
+    s->short_term_qsum+= rate_q;
+    s->short_term_qcount++;
+    short_term_q= s->short_term_qsum/s->short_term_qcount;
+    
+    long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0
+
+//    q= (long_term_q - short_term_q)*s->qcompress + short_term_q;
+    q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q);
+
+    diff= s->total_bits - wanted_bits;
+    br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance;
+    if(br_compensation<=0.0) br_compensation=0.001;
+    q/=br_compensation;
 
+    qscale= (int)(q + 0.5);
+    if     (qscale<qmin) qscale=qmin;
+    else if(qscale>qmax) qscale=qmax;
+    
+    if     (qscale<last_qscale-s->max_qdiff) qscale=last_qscale-s->max_qdiff;
+    else if(qscale>last_qscale+s->max_qdiff) qscale=last_qscale+s->max_qdiff;
+
+    s->qsum+= qscale;
+    s->qcount++;
+
+    s->last_pict_type= s->pict_type;
+//printf("q:%d diff:%d comp:%f rate_q:%d st_q:%f fvar:%d last_size:%d\n", qscale, (int)diff, br_compensation, 
+//       rate_q, short_term_q, s->mc_mb_var, s->frame_bits);
+//printf("%d %d\n", s->bit_rate, (int)fps);
+    return qscale;
+#else
+    INT64 diff, total_bits = s->total_bits;
+    float q;
+    int qscale;
     if (s->pict_type == I_TYPE) {
         s->wanted_bits += s->I_frame_bits;
     } else {
@@ -1581,6 +2072,7 @@ static int rate_estimate_qscale(MpegEncContext *s)
            (int)diff, q);
 #endif
     return qscale;
+#endif
 }
 
 AVCodec mpeg1video_encoder = {
@@ -1643,10 +2135,30 @@ AVCodec mpeg4_encoder = {
     MPV_encode_end,
 };
 
-AVCodec msmpeg4_encoder = {
+AVCodec msmpeg4v1_encoder = { /* encoder descriptor for CODEC_ID_MSMPEG4V1; same MPV_encode_* callbacks as msmpeg4v2_encoder and msmpeg4v3_encoder */
+    "msmpeg4v1",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_MSMPEG4V1,
+    sizeof(MpegEncContext), /* presumably the size of the per-codec private context -- confirm against the AVCodec declaration */
+    MPV_encode_init,
+    MPV_encode_picture,
+    MPV_encode_end,
+};
+
+AVCodec msmpeg4v2_encoder = { /* encoder descriptor for CODEC_ID_MSMPEG4V2; same MPV_encode_* callbacks as msmpeg4v1_encoder and msmpeg4v3_encoder */
+    "msmpeg4v2",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_MSMPEG4V2,
+    sizeof(MpegEncContext), /* presumably the size of the per-codec private context -- confirm against the AVCodec declaration */
+    MPV_encode_init,
+    MPV_encode_picture,
+    MPV_encode_end,
+};
+
+AVCodec msmpeg4v3_encoder = {
     "msmpeg4",
     CODEC_TYPE_VIDEO,
-    CODEC_ID_MSMPEG4,
+    CODEC_ID_MSMPEG4V3,
     sizeof(MpegEncContext),
     MPV_encode_init,
     MPV_encode_picture,
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 01e477865..f809a1255 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -34,6 +34,15 @@ enum OutputFormat {
 #define QMAT_SHIFT_MMX 19
 #define QMAT_SHIFT 25
 
+#define MAX_FCODE 7
+#define MAX_MV 2048
+
+typedef struct Predictor{ /* state read by predict() and updated by update_predictor() in the rate control code */
+    double coeff; /* decayed sum of size*q/(var+1) samples; predict() divides it by count */
+    double count; /* decayed number of samples accumulated in coeff */
+    double decay; /* factor applied to coeff and count before each new sample (rate_control_init sets 0.4) */
+} Predictor;
+
 typedef struct MpegEncContext {
     struct AVCodecContext *avctx;
     /* the following parameters must be initialized before encoding */
@@ -42,6 +51,7 @@ typedef struct MpegEncContext {
     int frame_rate; /* number of frames per second */
     int intra_only; /* if true, only intra pictures are generated */
     int bit_rate;        /* wanted bit rate */
+    int bit_rate_tolerance; /* amount of +- bits (>0)*/
     enum OutputFormat out_format; /* output format */
     int h263_plus; /* h263 plus headers */
     int h263_rv10; /* use RV10 variation for H263 */
@@ -49,7 +59,14 @@ typedef struct MpegEncContext {
     int h263_msmpeg4; /* generate MSMPEG4 compatible stream */
     int h263_intel; /* use I263 intel h263 header */
     int fixed_qscale; /* fixed qscale if non zero */
+    float qcompress;  /* amount of qscale change between easy & hard scenes (0.0-1.0) */
+    float qblur;      /* amount of qscale smoothing over time (0.0-1.0) */
+    int qmin;         /* min qscale */
+    int qmax;         /* max qscale */
+    int max_qdiff;    /* max qscale difference between frames */
     int encoding;     /* true if we are encoding (vs decoding) */
+    int flags;        /* AVCodecContext.flags (HQ, MV4, ...) */
+    int force_type;   /* 0= no force, otherwise I_TYPE, P_TYPE, ... */
     /* the following fields are managed internally by the encoder */
 
     /* bit output */
@@ -59,7 +76,8 @@ typedef struct MpegEncContext {
     int context_initialized;
     int picture_number;
     int fake_picture_number; /* picture number at the bitstream frame rate */
-    int gop_picture_number;  /* index of the first picture of a GOP */
+    int gop_picture_number;  /* index of the first picture of a GOP based on fake_pic_num & mpeg1 specific */
+    int picture_in_gop_number; /* 0-> first pic in gop, ... */
     int mb_width, mb_height;
     int mb_num;                /* number of MBs of a picture */
     int linesize;              /* line size, in bytes, may be different from width */
@@ -72,10 +90,10 @@ typedef struct MpegEncContext {
     UINT8 *aux_picture_base[3]; /* real start of the picture */
     UINT8 *current_picture[3]; /* buffer to store the decompressed current picture */
     int last_dc[3]; /* last DC values for MPEG1 */
-    INT16 *dc_val[3]; /* used for mpeg4 DC prediction */
+    INT16 *dc_val[3]; /* used for mpeg4 DC prediction, all 3 arrays must be contiguous */
     int y_dc_scale, c_dc_scale;
     UINT8 *coded_block; /* used for coded block pattern prediction */
-    INT16 (*ac_val[3])[16]; /* used for for mpeg4 AC prediction */
+    INT16 (*ac_val[3])[16]; /* used for mpeg4 AC prediction, all 3 arrays must be contiguous */
     int ac_pred;
     int mb_skiped;              /* MUST BE SET only during DECODING */
     UINT8 *mbskip_table;        /* used to avoid copy if macroblock
@@ -84,22 +102,25 @@ typedef struct MpegEncContext {
 
     int qscale;
     int pict_type;
+    int last_non_b_pict_type; /* used for mpeg4 gmc b-frames */
+    int last_pict_type; /* used for bit rate stuff (needs that to update the right predictor) */
     int frame_rate_index;
     /* motion compensation */
     int unrestricted_mv;
     int h263_long_vectors; /* use horrible h263v1 long vector mode */
 
     int f_code; /* resolution */
-    int b_code; /* resolution for B Frames*/
-    INT16 *mv_table[2];    /* MV table */
-    INT16 (*motion_val)[2]; /* used for MV prediction */
+    int b_code; /* backward resolution for B Frames (mpeg4) */
+    INT16 *mv_table[2];    /* MV table (1MV per MB)*/
+    INT16 (*motion_val)[2]; /* used for MV prediction (4MV per MB)*/
     int full_search;
     int mv_dir;
 #define MV_DIR_BACKWARD  1
 #define MV_DIR_FORWARD   2
+#define MV_DIRECT        4 // bidirectional mode where the difference equals the MV of the last P/S/I-Frame (mpeg4)
     int mv_type;
 #define MV_TYPE_16X16       0   /* 1 vector for the whole mb */
-#define MV_TYPE_8X8         1   /* 4 vectors (h263) */
+#define MV_TYPE_8X8         1   /* 4 vectors (h263, mpeg4 4MV) */
 #define MV_TYPE_16X8        2   /* 2 vectors, one per 16x8 block */ 
 #define MV_TYPE_FIELD       3   /* 2 vectors, one per field */ 
 #define MV_TYPE_DMV         4   /* 2 vectors, special mpeg2 Dual Prime Vectors */
@@ -111,6 +132,8 @@ typedef struct MpegEncContext {
     int mv[2][4][2];
     int field_select[2][2];
     int last_mv[2][2][2];
+    UINT16 (*mv_penalty)[MAX_MV*2+1]; /* amount of bits needed to encode a MV, used for ME */
+    UINT8 *fcode_tab; /* smallest fcode needed for each MV */
 
     int has_b_frames;
     int no_rounding; /* apply no rounding to motion estimation (MPEG4) */
@@ -119,9 +142,20 @@ typedef struct MpegEncContext {
     int mb_x, mb_y;
     int mb_incr;
     int mb_intra;
-    INT16 *mb_var;      /* Table for MB variances */
-    char *mb_type;    /* Table for MB type */
-    
+    UINT16 *mb_var;    /* Table for MB variances */
+    UINT8 *mb_type;    /* Table for MB type */
+#define MB_TYPE_INTRA    0x01
+#define MB_TYPE_INTER    0x02
+#define MB_TYPE_INTER4V  0x04
+#define MB_TYPE_SKIPED   0x08
+#define MB_TYPE_DIRECT   0x10
+#define MB_TYPE_FORWARD  0x20
+#define MB_TYPE_BACKWAD  0x40
+#define MB_TYPE_BIDIR    0x80
+
+    int block_index[6];
+    int block_wrap[6];
+
     /* matrix transmitted in the bitstream */
     UINT16 intra_matrix[64];
     UINT16 chroma_intra_matrix[64];
@@ -141,9 +175,30 @@ typedef struct MpegEncContext {
     int I_frame_bits;    /* wanted number of bits per I frame */
     int P_frame_bits;    /* same for P frame */
     int avg_mb_var;        /* average MB variance for current frame */
+    int mc_mb_var;     /* motion compensated MB variance for current frame */
+    int last_mc_mb_var;     /* motion compensated MB variance for last frame */
     INT64 wanted_bits;
     INT64 total_bits;
-    
+    int frame_bits;      /* bits used for the current frame */
+    int last_frame_bits; /* bits used for the last frame */
+    Predictor i_pred;
+    Predictor p_pred;
+    double qsum;         /* sum of qscales */
+    double qcount;       /* count of qscales */
+    double short_term_qsum;   /* sum of recent qscales */
+    double short_term_qcount; /* count of recent qscales */
+
+    /* statistics, used for 2-pass encoding */
+    int mv_bits;
+    int header_bits;
+    int i_tex_bits;
+    int p_tex_bits;
+    int i_count;
+    int p_count;
+    int skip_count;
+    int misc_bits; // cbp, mb_type
+    int last_bits; //temp var used for calculating the above vars
+
     /* H.263 specific */
     int gob_number;
     int gob_index;
@@ -156,7 +211,12 @@ typedef struct MpegEncContext {
     int h263_aic_dir; /* AIC direction: 0 = left, 1 = top */
     
     /* mpeg4 specific */
+    int time_increment_resolution;
     int time_increment_bits;
+    int time_increment;
+    int time_base;
+    int time;
+    int last_non_b_time[2];
     int shape;
     int vol_sprite_usage;
     int sprite_width;
@@ -179,6 +239,8 @@ typedef struct MpegEncContext {
     int sprite_warping_accuracy;
     int low_latency_sprite;
     int data_partioning;
+    int resync_marker;
+    int resync_x_pos;
 
     /* divx specific, used to workaround (many) bugs in divx5 */
     int divx_version;
@@ -202,9 +264,10 @@ typedef struct MpegEncContext {
     int dc_table_index;
     int use_skip_mb_code;
     int slice_height;      /* in macroblocks */
-    int first_slice_line;  
+    int first_slice_line;  /* used in mpeg4 too to handle resync markers */
     int flipflop_rounding;
     int bitrate;
+    int msmpeg4_version;   /* 1=mp41, 2=mp42, 3=mp43/divx3 */
     /* decompression specific */
     GetBitContext gb;
 
@@ -242,7 +305,10 @@ typedef struct MpegEncContext {
     UINT8 *ptr_last_mb_line;
     UINT32 mb_line_avgsize;
     
-    DCTELEM block[6][64] __align8;
+    DCTELEM (*block)[64]; /* points to one of the following blocks */
+    DCTELEM intra_block[6][64] __align8;
+    DCTELEM inter_block[6][64] __align8;
+    DCTELEM inter4v_block[6][64] __align8;
     void (*dct_unquantize)(struct MpegEncContext *s, 
                            DCTELEM *block, int n, int qscale);
 } MpegEncContext;
@@ -258,9 +324,8 @@ void MPV_common_init_mmx(MpegEncContext *s);
 
 /* motion_est.c */
 
-int estimate_motion(MpegEncContext *s, 
-                    int mb_x, int mb_y,
-                    int *mx_ptr, int *my_ptr);
+void estimate_motion(MpegEncContext *s, 
+                    int mb_x, int mb_y);
 
 /* mpeg12.c */
 extern INT16 default_intra_matrix[64];
@@ -270,6 +335,7 @@ void mpeg1_encode_picture_header(MpegEncContext *s, int picture_number);
 void mpeg1_encode_mb(MpegEncContext *s,
                      DCTELEM block[6][64],
                      int motion_x, int motion_y);
+void mpeg1_encode_init(MpegEncContext *s);
 
 /* h263enc.c */
 
@@ -306,6 +372,9 @@ static inline int get_rl_index(const RLTable *rl, int last, int run, int level)
 void h263_encode_mb(MpegEncContext *s, 
                     DCTELEM block[6][64],
                     int motion_x, int motion_y);
+void mpeg4_encode_mb(MpegEncContext *s, 
+                    DCTELEM block[6][64],
+                    int motion_x, int motion_y);
 void h263_encode_picture_header(MpegEncContext *s, int picture_number);
 int h263_encode_gob_header(MpegEncContext * s, int mb_line);
 void h263_dc_scale(MpegEncContext *s);
@@ -314,7 +383,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block,
 void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, 
                    int dir);
 void mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
-void h263_encode_init_vlc(MpegEncContext *s);
+void h263_encode_init(MpegEncContext *s);
 
 void h263_decode_init_vlc(MpegEncContext *s);
 int h263_decode_picture_header(MpegEncContext *s);
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 8fa9aefaa..66fc5255e 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -29,7 +29,6 @@
  * TODO: 
  *        - (encoding) select best mv table (two choices)
  *        - (encoding) select best vlc/dc table 
- *        - (decoding) handle slice indication
  */
 //#define DEBUG
 
@@ -44,12 +43,18 @@ typedef struct MVTable {
     VLC vlc;                /* decoding: vlc */
 } MVTable;
 
+static UINT32 v2_dc_lum_table[512][2];
+static UINT32 v2_dc_chroma_table[512][2];
+
 static void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n);
 static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                                 int n, int coded);
 static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr);
 static int msmpeg4_decode_motion(MpegEncContext * s, 
                                  int *mx_ptr, int *my_ptr);
+static void msmpeg4v2_encode_motion(MpegEncContext * s, int val);
+static void init_h263_dc_for_msmpeg4();
+
 
 extern UINT32 inverse[256];
 
@@ -166,7 +171,11 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     put_bits(&s->pb, 5, s->qscale);
 
     s->rl_table_index = 2;
-    s->rl_chroma_table_index = 1; /* only for I frame */
+    if(s->msmpeg4_version==2)
+        s->rl_chroma_table_index = 2; /* only for I frame */
+    else
+        s->rl_chroma_table_index = 1; /* only for I frame */
+
     s->dc_table_index = 1;
     s->mv_table_index = 1; /* only if P frame */
     s->use_skip_mb_code = 1; /* only if P frame */
@@ -174,21 +183,25 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     if (s->pict_type == I_TYPE) {
         put_bits(&s->pb, 5, 0x17); /* indicate only one "slice" */
 
-        code012(&s->pb, s->rl_chroma_table_index);
-        code012(&s->pb, s->rl_table_index);
+        if(s->msmpeg4_version!=2){
+            code012(&s->pb, s->rl_chroma_table_index);
+            code012(&s->pb, s->rl_table_index);
 
-        put_bits(&s->pb, 1, s->dc_table_index);
+            put_bits(&s->pb, 1, s->dc_table_index);
+        }
         s->no_rounding = 1;
     } else {
         put_bits(&s->pb, 1, s->use_skip_mb_code);
         
         s->rl_chroma_table_index = s->rl_table_index;
-        code012(&s->pb, s->rl_table_index);
+        if(s->msmpeg4_version!=2){
+            code012(&s->pb, s->rl_table_index);
 
-        put_bits(&s->pb, 1, s->dc_table_index);
+            put_bits(&s->pb, 1, s->dc_table_index);
+
+            put_bits(&s->pb, 1, s->mv_table_index);
+        }
 
-        put_bits(&s->pb, 1, s->mv_table_index);
-	
 	if(s->flipflop_rounding){
 	    s->no_rounding ^= 1;
 	}else{
@@ -203,6 +216,8 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
         init_mv_table(&mv_tables[1]);
         for(i=0;i<NB_RL_TABLES;i++)
             init_rl(&rl_table[i]);
+
+        init_h263_dc_for_msmpeg4();
     }
 
 #ifdef DEBUG
@@ -226,18 +241,17 @@ void msmpeg4_encode_ext_header(MpegEncContext * s)
 /* predict coded block */
 static inline int coded_block_pred(MpegEncContext * s, int n, UINT8 **coded_block_ptr)
 {
-    int x, y, wrap, pred, a, b, c;
+    int xy, wrap, pred, a, b, c;
 
-    x = 2 * s->mb_x + 1 + (n & 1);
-    y = 2 * s->mb_y + 1 + ((n & 2) >> 1);
-    wrap = s->mb_width * 2 + 2;
+    xy = s->block_index[n];
+    wrap = s->block_wrap[0];
 
     /* B C
      * A X 
      */
-    a = s->coded_block[(x - 1) + (y) * wrap];
-    b = s->coded_block[(x - 1) + (y - 1) * wrap];
-    c = s->coded_block[(x) + (y - 1) * wrap];
+    a = s->coded_block[xy - 1       ];
+    b = s->coded_block[xy - 1 - wrap];
+    c = s->coded_block[xy     - wrap];
     
     if (b == c) {
         pred = a;
@@ -246,7 +260,7 @@ static inline int coded_block_pred(MpegEncContext * s, int n, UINT8 **coded_bloc
     }
     
     /* store value */
-    *coded_block_ptr = &s->coded_block[(x) + (y) * wrap];
+    *coded_block_ptr = &s->coded_block[xy];
 
     return pred;
 }
@@ -314,14 +328,30 @@ void msmpeg4_encode_mb(MpegEncContext * s,
         if (s->use_skip_mb_code)
             put_bits(&s->pb, 1, 0);	/* mb coded */
         
-        put_bits(&s->pb, 
-                 table_mb_non_intra[cbp + 64][1], 
-                 table_mb_non_intra[cbp + 64][0]);
-
-        /* motion vector */
-        h263_pred_motion(s, 0, &pred_x, &pred_y);
-        msmpeg4_encode_motion(s, motion_x - pred_x, 
-                              motion_y - pred_y);
+        if(s->msmpeg4_version==2){
+            put_bits(&s->pb, 
+                     v2_mb_type[cbp&3][1], 
+                     v2_mb_type[cbp&3][0]);
+            if((cbp&3) != 3) coded_cbp= cbp ^ 0x3C;
+            else             coded_cbp= cbp;
+
+            put_bits(&s->pb, 
+                     cbpy_tab[coded_cbp>>2][1], 
+                     cbpy_tab[coded_cbp>>2][0]);
+                        
+            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            msmpeg4v2_encode_motion(s, motion_x - pred_x);
+            msmpeg4v2_encode_motion(s, motion_y - pred_y);
+        }else{
+            put_bits(&s->pb, 
+                     table_mb_non_intra[cbp + 64][1], 
+                     table_mb_non_intra[cbp + 64][0]);
+
+            /* motion vector */
+            h263_pred_motion(s, 0, &pred_x, &pred_y);
+            msmpeg4_encode_motion(s, motion_x - pred_x, 
+                                  motion_y - pred_y);
+        }
     } else {
 	/* compute cbp */
 	cbp = 0;
@@ -343,19 +373,36 @@ void msmpeg4_encode_mb(MpegEncContext * s,
             printf("cbp=%x %x\n", cbp, coded_cbp);
 #endif
 
-	if (s->pict_type == I_TYPE) {
-            set_stat(ST_INTRA_MB);
-            put_bits(&s->pb, 
-                     table_mb_intra[coded_cbp][1], table_mb_intra[coded_cbp][0]);
-        } else {
-            if (s->use_skip_mb_code)
-                put_bits(&s->pb, 1, 0);	/* mb coded */
+        if(s->msmpeg4_version==2){
+            if (s->pict_type == I_TYPE) {
+                put_bits(&s->pb, 
+                         v2_intra_cbpc[cbp&3][1], v2_intra_cbpc[cbp&3][0]);
+            } else {
+                if (s->use_skip_mb_code)
+                    put_bits(&s->pb, 1, 0);	/* mb coded */
+                put_bits(&s->pb, 
+                         v2_mb_type[(cbp&3) + 4][1], 
+                         v2_mb_type[(cbp&3) + 4][0]);
+            }
+            put_bits(&s->pb, 1, 0);	/* no AC prediction yet */
             put_bits(&s->pb, 
-                     table_mb_non_intra[cbp][1], 
-                     table_mb_non_intra[cbp][0]);
+                     cbpy_tab[cbp>>2][1], 
+                     cbpy_tab[cbp>>2][0]);
+        }else{
+            if (s->pict_type == I_TYPE) {
+                set_stat(ST_INTRA_MB);
+                put_bits(&s->pb, 
+                         table_mb_intra[coded_cbp][1], table_mb_intra[coded_cbp][0]);
+            } else {
+                if (s->use_skip_mb_code)
+                    put_bits(&s->pb, 1, 0);	/* mb coded */
+                put_bits(&s->pb, 
+                         table_mb_non_intra[cbp][1], 
+                         table_mb_non_intra[cbp][0]);
+            }
+            set_stat(ST_INTRA_MB);
+            put_bits(&s->pb, 1, 0);	/* no AC prediction yet */
         }
-        set_stat(ST_INTRA_MB);
-        put_bits(&s->pb, 1, 0);	/* no AC prediction yet */
     }
 
     for (i = 0; i < 6; i++) {
@@ -367,10 +414,9 @@ void msmpeg4_encode_mb(MpegEncContext * s,
 /* strongly inspirated from MPEG4, but not exactly the same ! */
 void msmpeg4_dc_scale(MpegEncContext * s)
 {
-    if (s->qscale < 5){
+    if (s->qscale < 5 || s->msmpeg4_version==2){
         s->y_dc_scale = 8;
         s->c_dc_scale = 8;
-//        s->c_dc_scale = (s->qscale + 13)>>1;
     }else if (s->qscale < 9){
         s->y_dc_scale = 2 * s->qscale;
         s->c_dc_scale = (s->qscale + 13)>>1;
@@ -378,56 +424,30 @@ void msmpeg4_dc_scale(MpegEncContext * s)
         s->y_dc_scale = s->qscale + 8;
         s->c_dc_scale = (s->qscale + 13)>>1;
     }
-    // this differs for quant >24 from mpeg4 
-    
-//    if(s->qscale==13) s->c_dc_scale=14;
-    
-//    if(s->qscale>=6)
-//     printf("%d", s->qscale);
-    
-    /* s->c_dc_scale values (found by Michael Nidermayer)
-     qscale=2 -> 8 (yes iam sure about that)
-     qscale=3 -> 8
-     qscale=4 -> 8
-     qscale=5 -> 9
-     qscale=6 -> 9 
-     qscale=7 -> 10
-     qscale=8 -> 10
-     qscale=9 -> 11
-     qscale=10-> 11
-    */
 }
 
 /* dir = 0: left, dir = 1: top prediction */
 static int msmpeg4_pred_dc(MpegEncContext * s, int n, 
                            INT16 **dc_val_ptr, int *dir_ptr)
 {
-    int a, b, c, xy, wrap, pred, scale;
+    int a, b, c, wrap, pred, scale;
     INT16 *dc_val;
 
     /* find prediction */
     if (n < 4) {
-	wrap = s->mb_width * 2 + 2;
-	xy = 2 * s->mb_y + 1 + ((n & 2) >> 1);
-        xy *= wrap;
-	xy += 2 * s->mb_x + 1 + (n & 1);
-	dc_val = s->dc_val[0];
 	scale = s->y_dc_scale;
     } else {
-	wrap = s->mb_width + 2;
-	xy = s->mb_y + 1;
-        xy *= wrap;
-	xy += s->mb_x + 1;
-	dc_val = s->dc_val[n - 4 + 1];
 	scale = s->c_dc_scale;
     }
+    wrap = s->block_wrap[n];
+    dc_val= s->dc_val[0] + s->block_index[n];
 
     /* B C
      * A X 
      */
-    a = dc_val[xy - 1];
-    b = dc_val[xy - 1 - wrap];
-    c = dc_val[xy - wrap];
+    a = dc_val[ - 1];
+    b = dc_val[ - 1 - wrap];
+    c = dc_val[ - wrap];
 
     /* XXX: the following solution consumes divisions, but it does not
        necessitate to modify mpegvideo.c. The problem comes from the
@@ -478,7 +498,7 @@ static int msmpeg4_pred_dc(MpegEncContext * s, int n,
     }
 
     /* update predictor */
-    *dc_val_ptr = &dc_val[xy];
+    *dc_val_ptr = &dc_val[0];
     return pred;
 }
 
@@ -502,35 +522,46 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
     /* do the prediction */
     level -= pred;
 
-    sign = 0;
-    if (level < 0) {
-        level = -level;
-        sign = 1;
-    }
-    
-    code = level;
-    if (code > DC_MAX) 
-        code = DC_MAX;
-
-    if (s->dc_table_index == 0) {
+    if(s->msmpeg4_version==2){
         if (n < 4) {
-            put_bits(&s->pb, table0_dc_lum[code][1], table0_dc_lum[code][0]);
-        } else {
-            put_bits(&s->pb, table0_dc_chroma[code][1], table0_dc_chroma[code][0]);
+            put_bits(&s->pb, 
+                     v2_dc_lum_table[level+256][1],
+                     v2_dc_lum_table[level+256][0]);
+        }else{
+            put_bits(&s->pb, 
+                     v2_dc_chroma_table[level+256][1],
+                     v2_dc_chroma_table[level+256][0]);
         }
-    } else {
-        if (n < 4) {
-            put_bits(&s->pb, table1_dc_lum[code][1], table1_dc_lum[code][0]);
+    }else{
+        sign = 0;
+        if (level < 0) {
+            level = -level;
+            sign = 1;
+        }
+        code = level;
+        if (code > DC_MAX) 
+            code = DC_MAX;
+
+        if (s->dc_table_index == 0) {
+            if (n < 4) {
+                put_bits(&s->pb, table0_dc_lum[code][1], table0_dc_lum[code][0]);
+            } else {
+                put_bits(&s->pb, table0_dc_chroma[code][1], table0_dc_chroma[code][0]);
+            }
         } else {
-            put_bits(&s->pb, table1_dc_chroma[code][1], table1_dc_chroma[code][0]);
+            if (n < 4) {
+                put_bits(&s->pb, table1_dc_lum[code][1], table1_dc_lum[code][0]);
+            } else {
+                put_bits(&s->pb, table1_dc_chroma[code][1], table1_dc_chroma[code][0]);
+            }
+        }
+            
+        if (code == DC_MAX)
+            put_bits(&s->pb, 8, level);
+            
+        if (level != 0) {
+            put_bits(&s->pb, 1, sign);
         }
-    }
-        
-    if (code == DC_MAX)
-        put_bits(&s->pb, 8, level);
-        
-    if (level != 0) {
-        put_bits(&s->pb, 1, sign);
     }
 }
 
@@ -558,7 +589,10 @@ static void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
     } else {
         i = 0;
         rl = &rl_table[3 + s->rl_table_index];
-        run_diff = 1;
+        if(s->msmpeg4_version==2)
+            run_diff = 0;
+        else
+            run_diff = 1;
         set_stat(ST_INTER_AC);
     }
 
@@ -629,6 +663,72 @@ static VLC mb_non_intra_vlc;
 static VLC mb_intra_vlc;
 static VLC dc_lum_vlc[2];
 static VLC dc_chroma_vlc[2];
+static VLC v2_dc_lum_vlc;
+static VLC v2_dc_chroma_vlc;
+static VLC cbpy_vlc;
+static VLC v2_intra_cbpc_vlc;
+static VLC v2_mb_type_vlc;
+static VLC v2_mv_vlc;
+
+/* fills v2_dc_lum_table / v2_dc_chroma_table once; this table is practically identical to the one from h263 except that it is inverted */
+static void init_h263_dc_for_msmpeg4()
+{
+    static int inited=0; /* the tables are built on the first call only */
+    
+    if(!inited){
+        int level, uni_code, uni_len;
+        inited=1;
+
+        for(level=-256; level<256; level++){ /* one entry per DC difference, stored at index level+256 */
+            int size, v, l;
+            /* find number of bits */
+            size = 0;
+            v = abs(level);
+            while (v) {
+                v >>= 1;
+		    size++;
+            }
+
+            if (level < 0)
+                l= (-level) ^ ((1 << size) - 1); /* negative: magnitude with its low `size` bits inverted */
+            else
+                l= level;
+
+            /* luminance h263 */
+            uni_code= DCtab_lum[size][0];
+            uni_len = DCtab_lum[size][1];
+            uni_code ^= (1<<uni_len)-1; // invert all uni_len bits of the h263 code: M$ doesn't like compatibility
+
+            if (size > 0) {
+                uni_code<<=size; uni_code|=l; /* append the size-bit value l after the size code */
+                uni_len+=size;
+                if (size > 8){ /* values wider than 8 bits get one extra trailing 1 bit */
+                    uni_code<<=1; uni_code|=1;
+                    uni_len++;
+                }
+            }
+            v2_dc_lum_table[level+256][0]= uni_code; /* [0] = code bits */
+            v2_dc_lum_table[level+256][1]= uni_len; /* [1] = code length in bits, as passed to put_bits by msmpeg4_encode_dc */
+
+            /* chrominance h263 */
+            uni_code= DCtab_chrom[size][0];
+            uni_len = DCtab_chrom[size][1];
+            uni_code ^= (1<<uni_len)-1; // invert all uni_len bits of the h263 code: M$ doesn't like compatibility
+            
+            if (size > 0) {
+                uni_code<<=size; uni_code|=l; /* append the size-bit value l after the size code */
+                uni_len+=size;
+                if (size > 8){ /* values wider than 8 bits get one extra trailing 1 bit */
+                    uni_code<<=1; uni_code|=1;
+                    uni_len++;
+                }
+            }
+            v2_dc_chroma_table[level+256][0]= uni_code; /* [0] = code bits */
+            v2_dc_chroma_table[level+256][1]= uni_len; /* [1] = code length in bits */
+
+        }
+    }
+}
 
 /* init all vlc decoding tables */
 int msmpeg4_decode_init_vlc(MpegEncContext *s)
@@ -659,6 +759,27 @@ int msmpeg4_decode_init_vlc(MpegEncContext *s)
     init_vlc(&dc_chroma_vlc[1], 9, 120, 
              &table1_dc_chroma[0][1], 8, 4,
              &table1_dc_chroma[0][0], 8, 4);
+    
+    init_h263_dc_for_msmpeg4();
+    init_vlc(&v2_dc_lum_vlc, 9, 512, 
+             &v2_dc_lum_table[0][1], 8, 4,
+             &v2_dc_lum_table[0][0], 8, 4);
+    init_vlc(&v2_dc_chroma_vlc, 9, 512, 
+             &v2_dc_chroma_table[0][1], 8, 4,
+             &v2_dc_chroma_table[0][0], 8, 4);
+    
+    init_vlc(&cbpy_vlc, 6, 16,
+             &cbpy_tab[0][1], 2, 1,
+             &cbpy_tab[0][0], 2, 1);
+    init_vlc(&v2_intra_cbpc_vlc, 3, 4,
+             &v2_intra_cbpc[0][1], 2, 1,
+             &v2_intra_cbpc[0][0], 2, 1);
+    init_vlc(&v2_mb_type_vlc, 5, 8,
+             &v2_mb_type[0][1], 2, 1,
+             &v2_mb_type[0][0], 2, 1);
+    init_vlc(&v2_mv_vlc, 9, 33,
+             &mvtab[0][1], 2, 1,
+             &mvtab[0][0], 2, 1);
 
     init_vlc(&mb_non_intra_vlc, 9, 128, 
              &table_mb_non_intra[0][1], 8, 4,
@@ -692,16 +813,21 @@ int msmpeg4_decode_picture_header(MpegEncContext * s)
 
     if (s->pict_type == I_TYPE) {
         code = get_bits(&s->gb, 5); 
-        /* 0x17: one slice, 0x18: three slices */
-        /* XXX: implement it */
-	//printf("%d %d %d\n", code, s->slice_height, s->first_slice_line);
+        /* 0x17: one slice, 0x18: two slices */
         if (code < 0x17)
             return -1;
         s->slice_height = s->mb_height / (code - 0x16);
-        s->rl_chroma_table_index = decode012(&s->gb);
-        s->rl_table_index = decode012(&s->gb);
+        if(s->msmpeg4_version==2){
+            s->rl_chroma_table_index = 2;
+            s->rl_table_index = 2;
+
+            s->dc_table_index = 0; //not used
+        }else{
+            s->rl_chroma_table_index = decode012(&s->gb);
+            s->rl_table_index = decode012(&s->gb);
 
-        s->dc_table_index = get_bits1(&s->gb);
+            s->dc_table_index = get_bits1(&s->gb);
+        }
         s->no_rounding = 1;
 /*	printf(" %d %d %d %d     \n", 
 		s->qscale,
@@ -711,12 +837,21 @@ int msmpeg4_decode_picture_header(MpegEncContext * s)
     } else {
         s->use_skip_mb_code = get_bits1(&s->gb);
         
-        s->rl_table_index = decode012(&s->gb);
-        s->rl_chroma_table_index = s->rl_table_index;
+        if(s->msmpeg4_version==2){
+            s->rl_table_index = 2;
+            s->rl_chroma_table_index = s->rl_table_index;
+
+            s->dc_table_index = 0; //not used
 
-        s->dc_table_index = get_bits1(&s->gb);
+            s->mv_table_index = 0;
+        }else{
+            s->rl_table_index = decode012(&s->gb);
+            s->rl_chroma_table_index = s->rl_table_index;
 
-        s->mv_table_index = get_bits1(&s->gb);
+            s->dc_table_index = get_bits1(&s->gb);
+
+            s->mv_table_index = get_bits1(&s->gb);
+        }
 /*	printf(" %d %d %d %d %d     \n", 
 		s->use_skip_mb_code, 
 		s->rl_table_index, 
@@ -731,7 +866,18 @@ int msmpeg4_decode_picture_header(MpegEncContext * s)
 //	printf("%d", s->no_rounding);
     }
     
-   
+#if 0
+if(s->msmpeg4_version==2)
+{
+int i;
+for(i=0; i<s->gb.size*8; i++)
+//    printf("%d", get_bits1(&s->gb));
+    get_bits1(&s->gb);
+printf("END\n");
+return -1;
+}
+#endif
+
 #ifdef DEBUG
     printf("*****frame %d:\n", frame_count++);
 #endif
@@ -767,6 +913,127 @@ static inline void memsetw(short *tab, int val, int n)
         tab[i] = val;
 }
 
+static void msmpeg4v2_encode_motion(MpegEncContext * s, int val) /* write one motion vector difference component to s->pb using mvtab */
+{
+    int range, bit_size, sign, code, bits;
+
+    if (val == 0) {
+        /* zero vector */
+        code = 0;
+        put_bits(&s->pb, mvtab[code][1], mvtab[code][0]); /* entry 0 is written alone: no sign bit, no extra bits */
+    } else {
+        bit_size = s->f_code - 1; /* count of raw low-order bits written after the table code */
+        range = 1 << bit_size;
+        if (val <= -64) /* fold values at or beyond +/-64 back by 64 */
+            val += 64;
+        else if (val >= 64)
+            val -= 64; /* NOTE(review): exactly +/-64 folds to 0 and still takes this non-zero path - confirm callers never pass it */
+
+        if (val >= 0) {
+            sign = 0;
+        } else {
+            val = -val;
+            sign = 1;
+        }
+        val--; /* the magnitude is coded minus one */
+        code = (val >> bit_size) + 1; /* high part selects the mvtab entry (entry 0 is used for zero above) */
+        bits = val & (range - 1); /* low bit_size bits of the magnitude */
+
+        put_bits(&s->pb, mvtab[code][1] + 1, (mvtab[code][0] << 1) | sign); /* table code with the sign appended as its lowest bit */
+        if (bit_size > 0) {
+            put_bits(&s->pb, bit_size, bits);
+        }
+    }
+}
+
+/* this is identical to h263 except that its range is multiplied by 2 */
+static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code) /* returns pred when the code is 0, else pred plus the decoded difference folded by 64; 0xffff when get_vlc returns a negative value */
+{
+    int code, val, sign, shift;
+
+    code = get_vlc(&s->gb, &v2_mv_vlc);
+    if (code < 0)
+        return 0xffff; /* NOTE(review): msmpeg4v2_decode_mb stores this value without testing for it */
+
+    if (code == 0)
+        return pred; /* code 0: no difference; no sign or extra bits are read */
+    sign = get_bits1(&s->gb);
+    shift = f_code - 1; /* count of raw low-order bits that follow the sign */
+    val = (code - 1) << shift;
+    if (shift > 0)
+        val |= get_bits(&s->gb, shift);
+    val++; /* undo the minus-one applied to the magnitude by msmpeg4v2_encode_motion */
+    if (sign)
+        val = -val;
+    val += pred;
+
+    if (val <= -64) /* fold the sum back by 64 when it reaches +/-64 */
+        val += 64;
+    else if (val >= 64)
+        val -= 64;
+
+    return val;
+}
+
+
+int msmpeg4v2_decode_mb(MpegEncContext *s, 
+                      DCTELEM block[6][64]) /* decode one macroblock into block[]; returns 0, or -1 when msmpeg4_decode_block fails */
+{
+    int cbp, code, i;
+    if (s->pict_type == P_TYPE) {
+        if (s->use_skip_mb_code) {
+            if (get_bits1(&s->gb)) {
+                /* skip mb */
+                s->mb_intra = 0;
+                for(i=0;i<6;i++)
+                    s->block_last_index[i] = -1;
+                s->mv_dir = MV_DIR_FORWARD;
+                s->mv_type = MV_TYPE_16X16;
+                s->mv[0][0][0] = 0;
+                s->mv[0][0][1] = 0;
+                s->mb_skiped = 1;
+                return 0;
+            }
+        }
+
+        code = get_vlc(&s->gb, &v2_mb_type_vlc); /* NOTE(review): result is not checked for a negative (error) value */
+        s->mb_intra = code >>2; /* bits above the low two give the intra flag */
+    
+        cbp = code & 0x3; /* low two bits: coded flags for blocks 4 and 5 */
+    } else {
+        s->mb_intra = 1;
+        cbp= get_vlc(&s->gb, &v2_intra_cbpc_vlc);
+    }
+
+    if (!s->mb_intra) {
+        int mx, my;
+
+        cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; /* bits 2..5: coded flags for blocks 3..0 */
+        if((cbp&3) != 3) cbp^= 0x3C; /* flip bits 2..5 unless both low bits are set */
+        
+        h263_pred_motion(s, 0, &mx, &my);
+        mx= msmpeg4v2_decode_motion(s, mx, 1); /* f_code is fixed to 1 here */
+        my= msmpeg4v2_decode_motion(s, my, 1); /* NOTE(review): the 0xffff error return is not tested */
+        
+        s->mv_dir = MV_DIR_FORWARD;
+        s->mv_type = MV_TYPE_16X16;
+        s->mv[0][0][0] = mx;
+        s->mv[0][0][1] = my;
+    } else {
+        s->ac_pred = get_bits1(&s->gb);
+        cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; /* intra: bits 2..5 are used as read, without the flip */
+    }
+
+    for (i = 0; i < 6; i++) {
+        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0) /* bit (5 - i) of cbp is passed as the last argument for block i */
+	{
+             fprintf(stderr,"\nIgnoring error while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
+             return -1; /* NOTE(review): despite the "Ignoring" wording above, the error is returned to the caller */
+	}
+    }
+    return 0;
+}
+
 int msmpeg4_decode_mb(MpegEncContext *s, 
                       DCTELEM block[6][64])
 {
@@ -803,6 +1070,8 @@ int msmpeg4_decode_mb(MpegEncContext *s,
         }
     }
 
+    if(s->msmpeg4_version==2) return msmpeg4v2_decode_mb(s, block); //FIXME merge if possible
+    
     if (s->pict_type == P_TYPE) {
         set_stat(ST_INTER_MB);
         if (s->use_skip_mb_code) {
@@ -915,7 +1184,12 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         qadd = (s->qscale - 1) | 1;
 	i = 0;
         rl = &rl_table[3 + s->rl_table_index];
-        run_diff = 1;
+
+        if(s->msmpeg4_version==2)
+            run_diff = 0;
+        else
+            run_diff = 1;
+
         if (!coded) {
             s->block_last_index[n] = i - 1;
             return 0;
@@ -999,21 +1273,32 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
     int level, pred;
     INT16 *dc_val;
 
-    if (n < 4) {
-        level = get_vlc(&s->gb, &dc_lum_vlc[s->dc_table_index]);
-    } else {
-        level = get_vlc(&s->gb, &dc_chroma_vlc[s->dc_table_index]);
-    }
-    if (level < 0)
-        return -1;
+    if(s->msmpeg4_version==2){
+        if (n < 4) {
+            level = get_vlc(&s->gb, &v2_dc_lum_vlc);
+        } else {
+            level = get_vlc(&s->gb, &v2_dc_chroma_vlc);
+        }
+        if (level < 0) 
+            return -1;
+        level-=256;
+    }else{  //FIXME optimize use unified tables & index
+        if (n < 4) {
+            level = get_vlc(&s->gb, &dc_lum_vlc[s->dc_table_index]);
+        } else {
+            level = get_vlc(&s->gb, &dc_chroma_vlc[s->dc_table_index]);
+        }
+        if (level < 0)
+            return -1;
 
-    if (level == DC_MAX) {
-        level = get_bits(&s->gb, 8);
-        if (get_bits1(&s->gb))
-            level = -level;
-    } else if (level != 0) {
-        if (get_bits1(&s->gb))
-            level = -level;
+        if (level == DC_MAX) {
+            level = get_bits(&s->gb, 8);
+            if (get_bits1(&s->gb))
+                level = -level;
+        } else if (level != 0) {
+            if (get_bits1(&s->gb))
+                level = -level;
+        }
     }
 
     pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
diff --git a/src/libffmpeg/libavcodec/msmpeg4data.h b/src/libffmpeg/libavcodec/msmpeg4data.h
index 03a261211..9dcb8276f 100644
--- a/src/libffmpeg/libavcodec/msmpeg4data.h
+++ b/src/libffmpeg/libavcodec/msmpeg4data.h
@@ -569,6 +569,13 @@ extern const UINT16 intra_vlc[103][2];
 extern const INT8 intra_level[102];
 extern const INT8 intra_run[102];
 
+extern const UINT8 DCtab_lum[13][2];
+extern const UINT8 DCtab_chrom[13][2];
+
+extern const UINT8 cbpy_tab[16][2];
+extern const UINT8 mvtab[33][2];
+
+
 #define NB_RL_TABLES  6
 
 static RLTable rl_table[NB_RL_TABLES] = {
@@ -1765,3 +1772,12 @@ static MVTable mv_tables[2] = {
         table1_mvy,
     }
 };
+
+static const UINT8 v2_mb_type[8][2] = { /* source table for v2_mb_type_vlc; NOTE(review): entries look like {code, bit length} pairs - confirm against init_vlc */
+ {1, 1}, {0   , 2}, {3   , 3}, {9   , 5},
+ {5, 4}, {0x21, 7}, {0x20, 7}, {0x11, 6},
+};
+
+static const UINT8 v2_intra_cbpc[4][2] = { /* source table for v2_intra_cbpc_vlc; NOTE(review): entries look like {code, bit length} pairs - confirm against init_vlc */
+ {1, 1}, {0, 3}, {1, 3}, {1, 2},
+};
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index f84b17e63..180712314 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -113,7 +113,8 @@ int avcodec_decode_video(AVCodecContext *avctx, AVPicture *picture,
 
     ret = avctx->codec->decode(avctx, picture, got_picture_ptr, 
                                buf, buf_size);
-    avctx->frame_number++;
+    if (*got_picture_ptr)                           
+        avctx->frame_number++;
     return ret;
 }
 
@@ -367,21 +368,57 @@ int avpicture_get_size(int pix_fmt, int width, int height)
 /* must be called before any other functions */
 void avcodec_init(void)
 {
+    static int inited = 0; /* becomes 1 on the first call */
+
+    if (inited != 0) /* repeated calls return without running dsputil_init() again */
+	return;
+    inited = 1;
+
     dsputil_init();
 }
 
 /* simple call to use all the codecs */
 void avcodec_register_all(void)
 {
+    static int inited = 0; /* becomes 1 on the first call */
+    
+    if (inited != 0) /* repeated calls return without registering anything again */
+	return;
+    inited = 1;
+
+    /* encoders */
+#ifdef CONFIG_ENCODERS
+    register_avcodec(&ac3_encoder);
+    register_avcodec(&mp2_encoder);
+#ifdef CONFIG_MP3LAME
+    register_avcodec(&mp3lame_encoder);
+#endif
+    register_avcodec(&mpeg1video_encoder);
+    register_avcodec(&h263_encoder);
+    register_avcodec(&h263p_encoder);
+    register_avcodec(&rv10_encoder);
+    register_avcodec(&mjpeg_encoder);
+    register_avcodec(&mpeg4_encoder);
+    register_avcodec(&msmpeg4v1_encoder);
+    register_avcodec(&msmpeg4v2_encoder);
+    register_avcodec(&msmpeg4v3_encoder);
+#endif /* CONFIG_ENCODERS */
+    register_avcodec(&rawvideo_codec); /* sits outside both the CONFIG_ENCODERS and CONFIG_DECODERS guards */
+
     /* decoders */
 #ifdef CONFIG_DECODERS
     register_avcodec(&h263_decoder);
     register_avcodec(&mpeg4_decoder);
-    register_avcodec(&msmpeg4_decoder);
+    register_avcodec(&msmpeg4v1_decoder);
+    register_avcodec(&msmpeg4v2_decoder);
+    register_avcodec(&msmpeg4v3_decoder);
     register_avcodec(&mpeg_decoder);
     register_avcodec(&h263i_decoder);
     register_avcodec(&rv10_decoder);
     register_avcodec(&mjpeg_decoder);
+#ifdef CONFIG_AC3
+    register_avcodec(&ac3_decoder);
+#endif
 #endif /* CONFIG_DECODERS */
 
 }
author	Guenter Bartsch <guenter@users.sourceforge.net>	2002-04-06 20:51:22 +0000
committer	Guenter Bartsch <guenter@users.sourceforge.net>	2002-04-06 20:51:22 +0000
commit	55e772ec62ef638f8a0b44e379da663f78245355 (patch)
tree	3b90a73ab2e800ed32f68e24f125164de7a655b3 /src
parent	0176e107fd9b6672d87f75a9eb5d83e163e0179f (diff)
download	xine-lib-55e772ec62ef638f8a0b44e379da663f78245355.tar.gz xine-lib-55e772ec62ef638f8a0b44e379da663f78245355.tar.bz2