45 files changed, 4026 insertions, 1275 deletions
diff --git a/src/libffmpeg/diff_to_ffmpeg_cvs.txt b/src/libffmpeg/diff_to_ffmpeg_cvs.txt
index f3cc97794..79542110e 100644
--- a/src/libffmpeg/diff_to_ffmpeg_cvs.txt
+++ b/src/libffmpeg/diff_to_ffmpeg_cvs.txt
@@ -1,19 +1,31 @@
-diff -ur ./common.c ../../xine-lib/src/libffmpeg/libavcodec/common.c
---- ./common.c	Sun Jun  2 09:13:09 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/common.c	Sun Jun  9 20:23:23 2002
-@@ -166,7 +166,7 @@
-         buf_ptr += 4;
-         /* handle common case: we can read everything */
-         if (buf_ptr <= s->buf_end) {
--#if ARCH_X86
-+#ifdef ARCH_X86
- 	    bit_buf = bswap_32(*((unsigned long*)(&buf_ptr[-4])));
+Index: libavcodec/common.h
+===================================================================
+RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/common.h,v
+retrieving revision 1.56
+diff -u -r1.56 common.h
+--- libavcodec/common.h	22 Aug 2002 19:41:39 -0000	1.56
++++ libavcodec/common.h	16 Sep 2002 17:55:53 -0000
+@@ -147,11 +147,19 @@
+ 
+ #else
+ 
++#if __GNUC__
+ #ifdef DEBUG
+ #define dprintf(fmt,args...) printf(fmt, ## args)
  #else
- 	    bit_buf = (buf_ptr[-4] << 24) |
-diff -ur ./common.h ../../xine-lib/src/libffmpeg/libavcodec/common.h
---- ./common.h	Sun Jun  2 09:11:44 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/common.h	Sun Jun  9 20:28:19 2002
-@@ -913,6 +913,8 @@
+ #define dprintf(fmt,args...)
+ #endif
++#else
++#ifdef DEBUG
++#define dprintf(...) printf(__VA_ARGS__)
++#else
++#define dprintf(...)
++#endif
++#endif
+ 
+ #endif /* !CONFIG_WIN32 */
+ 
+@@ -922,6 +930,8 @@
      }
      return ret;
  }
@@ -22,30 +34,13 @@ diff -ur ./common.h ../../xine-lib/src/libffmpeg/libavcodec/common.h
  #if __CPU__ >= 686 && !defined(RUNTIME_CPUDETECT)
  #define COPY3_IF_LT(x,y,a,b,c,d)\
  asm volatile (\
-diff -ur ./dsputil.c ../../xine-lib/src/libffmpeg/libavcodec/dsputil.c
---- ./dsputil.c	Wed Jun  5 15:48:07 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/dsputil.c	Mon Jun 10 11:19:40 2002
-@@ -1314,7 +1314,7 @@
-     use_permuted_idct = 0;
- #endif
- 
--#ifdef SIMPLE_IDCT
-+//#ifdef SIMPLE_IDCT
-     if (ff_idct == NULL) {
-         ff_idct_put = simple_idct_put;
-         ff_idct_add = simple_idct_add;
-@@ -1323,7 +1323,7 @@
-         ff_idct_put = gen_idct_put;
-         ff_idct_add = gen_idct_add;
-     }
--#endif
-+//#endif
- 
-     if(use_permuted_idct)
- #ifdef SIMPLE_IDCT
-diff -ur ./dsputil.h ../../xine-lib/src/libffmpeg/libavcodec/dsputil.h
---- ./dsputil.h	Wed Jun  5 15:48:07 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/dsputil.h	Mon Jun 10 10:56:24 2002
+Index: libavcodec/dsputil.h
+===================================================================
+RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
+retrieving revision 1.28
+diff -u -r1.28 dsputil.h
+--- libavcodec/dsputil.h	14 Sep 2002 19:00:09 -0000	1.28
++++ libavcodec/dsputil.h	16 Sep 2002 17:55:53 -0000
 @@ -21,8 +21,9 @@
  
  #include "common.h"
@@ -57,7 +52,7 @@ diff -ur ./dsputil.h ../../xine-lib/src/libffmpeg/libavcodec/dsputil.h
  /* dct code */
  typedef short DCTELEM;
  
-@@ -105,23 +106,32 @@
+@@ -125,23 +126,32 @@
  }
  
  void block_permute(INT16 *block);
@@ -91,9 +86,13 @@ diff -ur ./dsputil.h ../../xine-lib/src/libffmpeg/libavcodec/dsputil.h
  
  #define emms_c() \
  {\
-diff -ur ./i386/fdct_mmx.c ../../xine-lib/src/libffmpeg/libavcodec/i386/fdct_mmx.c
---- ./i386/fdct_mmx.c	Sat May 25 19:45:33 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/i386/fdct_mmx.c	Sun Jun  9 20:32:35 2002
+Index: libavcodec/i386/fdct_mmx.c
+===================================================================
+RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/fdct_mmx.c,v
+retrieving revision 1.3
+diff -u -r1.3 fdct_mmx.c
+--- libavcodec/i386/fdct_mmx.c	1 Sep 2002 16:52:33 -0000	1.3
++++ libavcodec/i386/fdct_mmx.c	16 Sep 2002 17:55:53 -0000
 @@ -45,8 +45,8 @@
      23170, 23170, 23170, 23170,	//cos * (2<<15) + 0.5
  };
@@ -128,9 +127,13 @@ diff -ur ./i386/fdct_mmx.c ../../xine-lib/src/libffmpeg/libavcodec/i386/fdct_mmx
      psrad_i2r(SHIFT_FRW_ROW, mm1);
      paddd_r2r(mm6, mm5);
      psrad_i2r(SHIFT_FRW_ROW, mm0);
-diff -ur ./i386/mmx.h ../../xine-lib/src/libffmpeg/libavcodec/i386/mmx.h
---- ./i386/mmx.h	Mon May 27 05:31:54 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/i386/mmx.h	Thu Dec 27 18:02:23 2001
+Index: libavcodec/i386/mmx.h
+===================================================================
+RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/mmx.h,v
+retrieving revision 1.3
+diff -u -r1.3 mmx.h
+--- libavcodec/i386/mmx.h	27 May 2002 08:31:54 -0000	1.3
++++ libavcodec/i386/mmx.h	16 Sep 2002 17:55:53 -0000
 @@ -1,243 +1 @@
 -/*
 - * mmx.h
@@ -376,36 +379,157 @@ diff -ur ./i386/mmx.h ../../xine-lib/src/libffmpeg/libavcodec/i386/mmx.h
 -
 -#endif /* AVCODEC_I386MMX_H */
 +#include "xineutils.h"
-diff -ur ./utils.c ../../xine-lib/src/libffmpeg/libavcodec/utils.c
---- ./utils.c	Mon May 27 13:42:14 2002
-+++ ../../xine-lib/src/libffmpeg/libavcodec/utils.c	Sun Jun  9 20:36:59 2002
-@@ -485,29 +485,11 @@
-     register_avcodec(&h263i_decoder);
-     register_avcodec(&rv10_decoder);
-     register_avcodec(&mjpeg_decoder);
--    register_avcodec(&mp2_decoder);
--    register_avcodec(&mp3_decoder);
- #ifdef CONFIG_AC3
-     register_avcodec(&ac3_decoder);
- #endif
- #endif /* CONFIG_DECODERS */
+Index: libavcodec/i386/mpegvideo_mmx_template.c
+===================================================================
+RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/mpegvideo_mmx_template.c,v
+retrieving revision 1.10
+diff -u -r1.10 mpegvideo_mmx_template.c
+--- libavcodec/i386/mpegvideo_mmx_template.c	2 Sep 2002 16:56:29 -0000	1.10
++++ libavcodec/i386/mpegvideo_mmx_template.c	16 Sep 2002 17:55:53 -0000
+@@ -84,16 +84,25 @@
+     }
  
--    /* pcm codecs */
--
--#define PCM_CODEC(id, name) \
--    register_avcodec(& name ## _encoder); \
--    register_avcodec(& name ## _decoder); \
--
--PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le);
--PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be);
--PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le);
--PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be);
--PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8);
--PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8);
--PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw);
--PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw);
--
--#undef PCM_CODEC
- }
+     if(s->out_format == FMT_H263 && s->mpeg_quant==0){
+-    
++
++        /* the following code is patched using avifile's modifications
++           to enable -fpic compilation. this patch has not been accepted on
++           main ffmpeg cvs. */
++
+         asm volatile(
+             "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
+             SPREADW(%%mm3)
+             "pxor %%mm7, %%mm7			\n\t" // 0
+             "pxor %%mm4, %%mm4			\n\t" // 0
+-            "movq (%2), %%mm5			\n\t" // qmat[0]
++            "movq (%1), %%mm5			\n\t" // qmat[0]
+             "pxor %%mm6, %%mm6			\n\t"
+-            "psubw (%3), %%mm6			\n\t" // -bias[0]
++            "psubw (%2), %%mm6			\n\t" // -bias[0]
+             "movl $-128, %%eax			\n\t"
++            : "+a" (last_non_zero_p1)
++            : "r" (qmat), "r" (bias)
++            );
++	  /* CORE */
++	  asm volatile(
+             ".balign 16				\n\t"
+             "1:					\n\t"
+             "pxor %%mm1, %%mm1			\n\t" // 0
+@@ -106,7 +115,7 @@
+             "por %%mm0, %%mm4			\n\t" 
+             "pxor %%mm1, %%mm0			\n\t" 
+             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+-            "movq %%mm0, (%5, %%eax)		\n\t"
++            "movq %%mm0, (%3, %%eax)		\n\t"
+             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
+             "movq (%4, %%eax), %%mm1		\n\t" 
+             "movq %%mm7, (%1, %%eax)		\n\t" // 0
+@@ -114,6 +123,11 @@
+ 	    PMAXW(%%mm0, %%mm3)
+             "addl $8, %%eax			\n\t"
+             " js 1b				\n\t"
++            : "+a" (last_non_zero_p1)
++            : "r" (block+64), "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
++            );
++	  /* EPILOGUE */
++	  asm volatile(
+             "movq %%mm3, %%mm0			\n\t"
+             "psrlq $32, %%mm3			\n\t"
+ 	    PMAXW(%%mm0, %%mm3)
+@@ -122,48 +136,46 @@
+ 	    PMAXW(%%mm0, %%mm3)
+             "movd %%mm3, %%eax			\n\t"
+             "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
+-	    : "+a" (last_non_zero_p1)
+-            : "r" (block+64), "r" (qmat), "r" (bias),
+-              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+-        );
+-        // note the asm is split cuz gcc doesnt like that many operands ...
+-        asm volatile(
+-            "movd %1, %%mm1			\n\t" // max_qcoeff
++            "movd %2, %%mm1			\n\t" // max_qcoeff
+ 	    SPREADW(%%mm1)
+             "psubusw %%mm1, %%mm4		\n\t" 
+             "packuswb %%mm4, %%mm4		\n\t"
+-            "movd %%mm4, %0			\n\t" // *overflow
+-        : "=g" (*overflow)
+-        : "g" (s->max_qcoeff)
+-        );
++            "movd %%mm4, %1			\n\t" // *overflow
++            : "+a" (last_non_zero_p1), "=r" (*overflow)
++            : "r" (s->max_qcoeff)
++            );
+     }else{ // FMT_H263
+         asm volatile(
+-            "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
++            "pushl %%ebp				\n\t"
++            "pushl %%ebx				\n\t"
++            "movl %0, %%ebp				\n\t"
++            "movl (%%ebp), %%ebx		\n\t"
++            "movd %%ebx, %%mm3			\n\t" // last_non_zero_p1
+             SPREADW(%%mm3)
+             "pxor %%mm7, %%mm7			\n\t" // 0
+             "pxor %%mm4, %%mm4			\n\t" // 0
+-            "movl $-128, %%eax			\n\t"
++            "movl $-128, %%ebx			\n\t"
+             ".balign 16				\n\t"
+             "1:					\n\t"
+             "pxor %%mm1, %%mm1			\n\t" // 0
+-            "movq (%1, %%eax), %%mm0		\n\t" // block[i]
++            "movq (%1, %%ebx), %%mm0		\n\t" // block[i]
+             "pcmpgtw %%mm0, %%mm1		\n\t" // block[i] <= 0 ? 0xFF : 0x00
+             "pxor %%mm1, %%mm0			\n\t" 
+             "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
+-            "movq (%3, %%eax), %%mm6		\n\t" // bias[0]
++            "movq (%3, %%ebx), %%mm6		\n\t" // bias[0]
+             "paddusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[0]
+-            "movq (%2, %%eax), %%mm5		\n\t" // qmat[i]
++            "movq (%2, %%ebx), %%mm5		\n\t" // qmat[i]
+             "pmulhw %%mm5, %%mm0		\n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
+             "por %%mm0, %%mm4			\n\t" 
+             "pxor %%mm1, %%mm0			\n\t" 
+             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+-            "movq %%mm0, (%5, %%eax)		\n\t"
++            "movq %%mm0, (%5, %%ebx)		\n\t"
+             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
+-            "movq (%4, %%eax), %%mm1		\n\t" 
+-            "movq %%mm7, (%1, %%eax)		\n\t" // 0
++            "movq (%4, %%ebx), %%mm1		\n\t" 
++            "movq %%mm7, (%1, %%ebx)		\n\t" // 0
+             "pandn %%mm1, %%mm0			\n\t"
+ 	    PMAXW(%%mm0, %%mm3)
+-            "addl $8, %%eax			\n\t"
++            "addl $8, %%ebx			\n\t"
+             " js 1b				\n\t"
+             "movq %%mm3, %%mm0			\n\t"
+             "psrlq $32, %%mm3			\n\t"
+@@ -171,10 +183,14 @@
+             "movq %%mm3, %%mm0			\n\t"
+             "psrlq $16, %%mm3			\n\t"
+ 	    PMAXW(%%mm0, %%mm3)
+-            "movd %%mm3, %%eax			\n\t"
+-            "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
+-	    : "+a" (last_non_zero_p1)
+-            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
++            "movd %%mm3, %%ebx			\n\t"
++            "movzbl %%bl, %%ebx			\n\t" // last_non_zero_p1
++            "movl %%ebx, (%%ebp)		\n\t"
++            "popl %%ebx					\n\t"
++            "popl %%ebp					\n\t"
++            :
++			: "m" (last_non_zero_p1),
++              "r" (block+64), "r" (qmat+64), "r" (bias+64),
+               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+         );
+         // note the asm is split cuz gcc doesnt like that many operands ...
+@@ -184,8 +200,8 @@
+             "psubusw %%mm1, %%mm4		\n\t" 
+             "packuswb %%mm4, %%mm4		\n\t"
+             "movd %%mm4, %0			\n\t" // *overflow
+-        : "=g" (*overflow)
+-        : "g" (s->max_qcoeff)
++        : "=r" (*overflow)
++        : "r" (s->max_qcoeff)
+         );
+     }
  
- /* this should be called after seeking and before trying to decode the next frame */
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index e388643a8..fe6b04a02 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -16,10 +16,10 @@ LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic
 noinst_LTLIBRARIES = libavcodec.la
 
 libavcodec_la_SOURCES = common.c utils.c mpegvideo.c h263.c jrevdct.c jfdctfst.c \
-			mjpeg.c dsputil.c svq1.c \
+			mjpeg.c dsputil.c svq1.c jfdctint.c \
 			motion_est.c imgconvert.c msmpeg4.c \
 			mpeg12.c h263dec.c rv10.c simple_idct.c \
-			ratecontrol.c mem.c 
+			ratecontrol.c mem.c eval.c
 #imgresample.c
 
 libavcodec_la_LDFLAGS = \
diff --git a/src/libffmpeg/libavcodec/alpha/asm.h b/src/libffmpeg/libavcodec/alpha/asm.h
index ceaf0be4a..2fdbdf13d 100644
--- a/src/libffmpeg/libavcodec/alpha/asm.h
+++ b/src/libffmpeg/libavcodec/alpha/asm.h
@@ -52,7 +52,7 @@ struct unaligned_long { uint64_t l; } __attribute__((packed));
 #define ldq_u(p)     (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
 #define uldq(a)	     (((const struct unaligned_long *) (a))->l)
 
-#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 2
+#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 3
 #define cmpbge	__builtin_alpha_cmpbge
 /* Avoid warnings.  */
 #define extql(a, b)	__builtin_alpha_extql(a, (uint64_t) (b))
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 7ed015521..fef86fe64 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -224,16 +224,24 @@ static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
         } while (--h);                                                      \
     } while (0)
 
-#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                            \
-static void OPNAME ## _pixels ## SUFF ## _axp                           \
-        (uint8_t *restrict block, const uint8_t *restrict pixels,       \
-         int line_size, int h)                                          \
-{                                                                       \
-    if ((size_t) pixels & 0x7) {                                        \
-        OPKIND(uldq, STORE);                                            \
-    } else {                                                            \
-        OPKIND(ldq, STORE);                                             \
-    }                                                                   \
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
+static void OPNAME ## _pixels ## SUFF ## _axp                               \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         int line_size, int h)                                              \
+{                                                                           \
+    if ((size_t) pixels & 0x7) {                                            \
+        OPKIND(uldq, STORE);                                                \
+    } else {                                                                \
+        OPKIND(ldq, STORE);                                                 \
+    }                                                                       \
+}                                                                           \
+                                                                            \
+static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         int line_size, int h)                                              \
+{                                                                           \
+    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
+    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
 }
 
 #define PIXOP(OPNAME, STORE)                    \
@@ -268,27 +276,54 @@ PIXOP(put_no_rnd, STORE);
 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
 PIXOP(avg_no_rnd, STORE);
 
+void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
+                          int line_size, int h)
+{
+    put_pixels_axp_asm(block,     pixels,     line_size, h);
+    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
+}
+
 void dsputil_init_alpha(void)
 {
-    put_pixels_tab[0] = put_pixels_axp_asm;
-    put_pixels_tab[1] = put_pixels_x2_axp;
-    put_pixels_tab[2] = put_pixels_y2_axp;
-    put_pixels_tab[3] = put_pixels_xy2_axp;
-
-    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
-    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
-    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
-    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
-
-    avg_pixels_tab[0] = avg_pixels_axp;
-    avg_pixels_tab[1] = avg_pixels_x2_axp;
-    avg_pixels_tab[2] = avg_pixels_y2_axp;
-    avg_pixels_tab[3] = avg_pixels_xy2_axp;
-
-    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
-    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
-    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
-    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;
+    put_pixels_tab[0][0] = put_pixels16_axp_asm;
+    put_pixels_tab[0][1] = put_pixels16_x2_axp;
+    put_pixels_tab[0][2] = put_pixels16_y2_axp;
+    put_pixels_tab[0][3] = put_pixels16_xy2_axp;
+
+    put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
+    put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
+    put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
+    put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
+
+    avg_pixels_tab[0][0] = avg_pixels16_axp;
+    avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
+    avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
+    avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
+
+    avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
+    avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
+    avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
+    avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;
+
+    put_pixels_tab[1][0] = put_pixels_axp_asm;
+    put_pixels_tab[1][1] = put_pixels_x2_axp;
+    put_pixels_tab[1][2] = put_pixels_y2_axp;
+    put_pixels_tab[1][3] = put_pixels_xy2_axp;
+
+    put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
+    put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
+    put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
+    put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
+
+    avg_pixels_tab[1][0] = avg_pixels_axp;
+    avg_pixels_tab[1][1] = avg_pixels_x2_axp;
+    avg_pixels_tab[1][2] = avg_pixels_y2_axp;
+    avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
+
+    avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
+    avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
+    avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
+    avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;
 
     clear_blocks = clear_blocks_axp;
 
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
index 5349e443c..7ec6757d7 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
@@ -122,41 +122,21 @@ $aligned:
         ldq     t2, 0(a1)
         addq    a1, a2, a1
         ldq     t3, 0(a1)
-        addq    a1, a2, a1
-
-        ldq     t4, 0(a1)
-        addq    a1, a2, a1
-        ldq     t5, 0(a1)
-        addq    a1, a2, a1
-
-        ldq     t6, 0(a1)
-        addq    a1, a2, a1
-        ldq     t7, 0(a1)
-        addq    a1, a2, a1
-
-        addq    a0, a2, t8
-        stq     t0, 0(a0)
-        addq    t8, a2, t9
-        stq     t1, 0(t8)
-
-        addq    t9, a2, ta
-        stq     t2, 0(t9)
-        addq    ta, a2, tb
-        stq     t3, 0(ta)
-
-        addq    tb, a2, tc
-        stq     t4, 0(tb)
-        addq    tc, a2, td
-        stq     t5, 0(tc)
 
-        addq    td, a2, te
-        stq     t6, 0(td)
-        addq    te, a2, a0
-        stq     t7, 0(te)
+	addq	a0, a2, t4
+	addq    a1, a2, a1
+	addq	t4, a2, t5
+	subq    a3, 4, a3
 
-        subq    a3, 8, a3
-        bne     a3, $aligned
+	stq	t0, 0(a0)
+	addq	t5, a2, t6
+	stq	t1, 0(t4)
+	addq	t6, a2, a0
 
+	stq	t2, 0(t5)
+	stq	t3, 0(t6)
+	
+	bne     a3, $aligned
         ret
         .end put_pixels_axp_asm
 
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 7dbcc46af..acedebaf3 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -5,8 +5,8 @@
 
 #define LIBAVCODEC_VERSION_INT 0x000406
 #define LIBAVCODEC_VERSION     "0.4.6"
-#define LIBAVCODEC_BUILD       4619
-#define LIBAVCODEC_BUILD_STR   "4619"
+#define LIBAVCODEC_BUILD       4623
+#define LIBAVCODEC_BUILD_STR   "4623"
 
 enum CodecID {
     CODEC_ID_NONE, 
@@ -15,6 +15,7 @@ enum CodecID {
     CODEC_ID_RV10,
     CODEC_ID_MP2,
     CODEC_ID_MP3LAME,
+    CODEC_ID_VORBIS,
     CODEC_ID_AC3,
     CODEC_ID_MJPEG,
     CODEC_ID_MPEG4,
@@ -82,6 +83,13 @@ enum Motion_Est_ID {
     ME_X1
 };
 
+typedef struct RcOverride{
+    int start_frame;
+    int end_frame;
+    int qscale; // if this is 0 then quality_factor will be used instead
+    float quality_factor;
+} RcOverride;
+
 /* only for ME compatiblity with old apps */
 extern int motion_estimation_method;
 
@@ -89,6 +97,7 @@ extern int motion_estimation_method;
 static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG, 
                                        ME_X1, ME_EPZS, ME_FULL };
 
+
 #define FF_MAX_B_FRAMES 4
 
 /* encoding support */
@@ -110,6 +119,7 @@ static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG,
 #define CODEC_FLAG_GRAY  0x2000  /* only decode/encode grayscale */
 #define CODEC_FLAG_EMU_EDGE 0x4000/* dont draw edges */
 #define CODEC_FLAG_DR1    0x8000 /* dr1 */
+#define CODEC_FLAG_NOT_TRUNCATED  0x00010000 /* input bitstream is not truncated, except before a startcode */
 /* codec capabilities */
 
 /* decoder can use draw_horiz_band callback */
@@ -145,6 +155,7 @@ typedef struct AVCodecContext {
 #define FF_ASPECT_4_3_525 3
 #define FF_ASPECT_16_9_625 4
 #define FF_ASPECT_16_9_525 5
+#define FF_ASPECT_EXTENDED 15
     int gop_size; /* 0 = intra only */
     enum PixelFormat pix_fmt;  /* pixel format, see PIX_FMT_xxx */
     int repeat_pict; /* when decoding, this signal how much the picture */
@@ -171,7 +182,7 @@ typedef struct AVCodecContext {
     int key_frame;      /* true if the previous compressed frame was 
                            a key frame (intra, or seekable) */
     int pict_type;      /* picture type of the previous 
-                           encoded frame */
+                           en/decoded frame */
 /* FIXME: these should have FF_ */
 #define I_TYPE 1 // Intra
 #define P_TYPE 2 // Predicted
@@ -194,8 +205,8 @@ typedef struct AVCodecContext {
     int qmax;         /* max qscale */
     int max_qdiff;    /* max qscale difference between frames */
     int max_b_frames; /* maximum b frames, the output will be delayed by max_b_frames+1 relative to the input */
-    float b_quant_factor;/* qscale factor between ips and b frames */
-    int rc_strategy;
+    float b_quant_factor;/* qscale factor between p and b frames */
+    int rc_strategy;  /* obsolete FIXME remove */
     int b_frame_strategy;
 
     int hurry_up;     /* when set to 1 during decoding, b frames will be skiped
@@ -274,13 +285,46 @@ typedef struct AVCodecContext {
     int dr_uvstride;
     int dr_ip_buffer_count;
     int block_align; /* currently only for adpcm codec in wav/avi */
-
+    
     int parse_only; /* decoding only: if true, only parsing is done
                        (function avcodec_parse_frame()). The frame
                        data is returned. Only MPEG codecs support this now. */
+    
     int mpeg_quant; /* 0-> h263 quant 1-> mpeg quant */
+    
+    char *stats_out; /* encoding statistics output buffer */
+    char *stats_in;  /* encoding statistics input buffer (concatenated stuff from stats_out of pass1 should be placed here)*/
+    float rc_qsquish;
+    float rc_qmod_amp;
+    int rc_qmod_freq;
+    RcOverride *rc_override;
+    int rc_override_count;
+    char *rc_eq;
+    int rc_max_rate;
+    int rc_min_rate;
+    int rc_buffer_size;
+    float rc_buffer_aggressivity;
+    float i_quant_factor;/* qscale factor between i and p frames */
+    float i_quant_offset;/* qscale offset between i and p frames */
+    float rc_initial_cplx;
+
+    int aspected_width;
+    int aspected_height;
+
+    int dct_algo;
+#define FF_DCT_AUTO    0
+#define FF_DCT_FASTINT 1
+#define FF_DCT_INT     2
+#define FF_DCT_MMX     3
+#define FF_DCT_MLIB    4
+
+    long long int pts; /* timestamp in microseconds
+                          for decoding: the timestamp from the stream or 0
+                          for encoding: the timestamp which will be stored in the stream
+                                        if 0 then the frame_rate will be used */   
 
     //FIXME this should be reordered after kabis API is finished ...
+    //TODO kill kabi
     /*
 	Note: Below are located reserved fields for further usage
 	It requires for ABI !!!
@@ -291,13 +335,13 @@ typedef struct AVCodecContext {
     */
     unsigned long long int
 	    ull_res0,ull_res1,ull_res2,ull_res3,ull_res4,ull_res5,
-	    ull_res6,ull_res7,ull_res8,ull_res9,ull_res10,ull_res11,ull_res12;
+	    ull_res6,ull_res7,ull_res8,ull_res9,ull_res10,ull_res11;
     float
 	    flt_res0,flt_res1,flt_res2,flt_res3,flt_res4,flt_res5,
-	    flt_res6,flt_res7,flt_res8,flt_res9,flt_res10,flt_res11;
+	    flt_res6,flt_res7,flt_res8,flt_res9,flt_res10,flt_res11,flt_res12;
     void
 	    *ptr_res0,*ptr_res1,*ptr_res2,*ptr_res3,*ptr_res4,*ptr_res5,
-	    *ptr_res6;
+            *ptr_res6,*ptr_res7,*ptr_res8,*ptr_res9,*ptr_res10,*ptr_res11,*ptr_res12;
     unsigned long int
 	    ul_res0,ul_res1,ul_res2,ul_res3,ul_res4,ul_res5,
 	    ul_res6,ul_res7,ul_res8,ul_res9,ul_res10,ul_res11,ul_res12;
@@ -307,6 +351,9 @@ typedef struct AVCodecContext {
     unsigned char
 	    uc_res0,uc_res1,uc_res2,uc_res3,uc_res4,uc_res5,
 	    uc_res6,uc_res7,uc_res8,uc_res9,uc_res10,uc_res11,uc_res12;
+    unsigned int
+	    ui_res0,ui_res1,ui_res2,ui_res3,ui_res4,ui_res5,ui_res6,ui_res7,ui_res8,ui_res9,
+	    ui_res10,ui_res11,ui_res12,ui_res13,ui_res14,ui_res15,ui_res16;
 } AVCodecContext;
 
 typedef struct AVCodec {
@@ -349,6 +396,7 @@ typedef struct AVPicture {
 extern AVCodec ac3_encoder;
 extern AVCodec mp2_encoder;
 extern AVCodec mp3lame_encoder;
+extern AVCodec oggvorbis_encoder;
 extern AVCodec mpeg1video_encoder;
 extern AVCodec h263_encoder;
 extern AVCodec h263p_encoder;
@@ -423,6 +471,12 @@ typedef struct ImgReSampleContext ImgReSampleContext;
 
 ImgReSampleContext *img_resample_init(int output_width, int output_height,
                                       int input_width, int input_height);
+
+ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
+                                      int iwidth, int iheight,
+                                      int topBand, int bottomBand,
+                                      int leftBand, int rightBand);
+
 void img_resample(ImgReSampleContext *s, 
                   AVPicture *output, AVPicture *input);
 
diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c
index 63d17b6c2..fde12d927 100644
--- a/src/libffmpeg/libavcodec/common.c
+++ b/src/libffmpeg/libavcodec/common.c
@@ -20,6 +20,13 @@
  */
 #include "avcodec.h"
 
+const UINT8 ff_sqrt_tab[128]={
+        0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+        9, 9, 9, 9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11
+};
+
 void init_put_bits(PutBitContext *s, 
                    UINT8 *buffer, int buffer_size,
                    void *opaque,
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 8cc781edb..e887b9c62 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -909,6 +909,8 @@ static inline int clip(int a, int amin, int amax)
 }
 
 /* math */
+extern const UINT8 ff_sqrt_tab[128];
+
 int ff_gcd(int a, int b);
 
 static inline int ff_sqrt(int a)
@@ -916,7 +918,9 @@ static inline int ff_sqrt(int a)
     int ret=0;
     int s;
     int ret_sq=0;
-
+    
+    if(a<128) return ff_sqrt_tab[a];
+    
     for(s=15; s>=0; s--){
         int b= ret_sq + (1<<(s*2)) + (ret<<s)*2;
         if(b<=a){
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index d65bfc39a..eb8592352 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -25,13 +25,14 @@
 void (*ff_idct)(DCTELEM *block);
 void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
 void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
-void (*av_fdct)(DCTELEM *block);
 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
 void (*clear_blocks)(DCTELEM *blocks);
+int (*pix_sum)(UINT8 * pix, int line_size);
+int (*pix_norm1)(UINT8 * pix, int line_size);
 
 op_pixels_abs_func pix_abs16x16;
 op_pixels_abs_func pix_abs16x16_x2;
@@ -43,6 +44,8 @@ op_pixels_abs_func pix_abs8x8_x2;
 op_pixels_abs_func pix_abs8x8_y2;
 op_pixels_abs_func pix_abs8x8_xy2;
 
+int ff_bit_exact=0;
+
 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
 UINT32 squareTbl[512];
 
@@ -159,6 +162,52 @@ static void build_zigzag_end(void)
     }
 }
 
+int pix_sum_c(UINT8 * pix, int line_size)
+{
+    int s, i, j;
+
+    s = 0;
+    for (i = 0; i < 16; i++) {
+	for (j = 0; j < 16; j += 8) {
+	    s += pix[0];
+	    s += pix[1];
+	    s += pix[2];
+	    s += pix[3];
+	    s += pix[4];
+	    s += pix[5];
+	    s += pix[6];
+	    s += pix[7];
+	    pix += 8;
+	}
+	pix += line_size - 16;
+    }
+    return s;
+}
+
+int pix_norm1_c(UINT8 * pix, int line_size)
+{
+    int s, i, j;
+    UINT32 *sq = squareTbl + 256;
+
+    s = 0;
+    for (i = 0; i < 16; i++) {
+	for (j = 0; j < 16; j += 8) {
+	    s += sq[pix[0]];
+	    s += sq[pix[1]];
+	    s += sq[pix[2]];
+	    s += sq[pix[3]];
+	    s += sq[pix[4]];
+	    s += sq[pix[5]];
+	    s += sq[pix[6]];
+	    s += sq[pix[7]];
+	    pix += 8;
+	}
+	pix += line_size - 16;
+    }
+    return s;
+}
+
+
 void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
 {
     int i;
@@ -241,11 +290,10 @@ void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
         block += 8;
     }
 }
-
 #if 0
 
 #define PIXOP2(OPNAME, OP) \
-void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
@@ -255,7 +303,7 @@ void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int
     }\
 }\
 \
-void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
@@ -267,7 +315,7 @@ void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line
     }\
 }\
 \
-void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
@@ -279,7 +327,7 @@ void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size,
     }\
 }\
 \
-void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
@@ -291,7 +339,7 @@ void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line
     }\
 }\
 \
-void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
@@ -303,7 +351,7 @@ void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size,
     }\
 }\
 \
-void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
         int i;\
         const uint64_t a= LD64(pixels  );\
@@ -339,7 +387,7 @@ void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size,
         }\
 }\
 \
-void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
         int i;\
         const uint64_t a= LD64(pixels  );\
@@ -375,26 +423,45 @@ void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int lin
         }\
 }\
 \
-void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
-    OPNAME ## _pixels,\
-    OPNAME ## _pixels_x2,\
-    OPNAME ## _pixels_y2,\
-    OPNAME ## _pixels_xy2,\
+CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels    , 8)\
+CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
+\
+void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    {\
+        OPNAME ## _pixels,\
+        OPNAME ## _pixels_x2,\
+        OPNAME ## _pixels_y2,\
+        OPNAME ## _pixels_xy2},\
+    {\
+        OPNAME ## _pixels16,\
+        OPNAME ## _pixels16_x2,\
+        OPNAME ## _pixels16_y2,\
+        OPNAME ## _pixels16_xy2}\
 };\
 \
-void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
-    OPNAME ## _pixels,\
-    OPNAME ## _no_rnd_pixels_x2,\
-    OPNAME ## _no_rnd_pixels_y2,\
-    OPNAME ## _no_rnd_pixels_xy2,\
+void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    {\
+        OPNAME ## _pixels,\
+        OPNAME ## _no_rnd_pixels_x2,\
+        OPNAME ## _no_rnd_pixels_y2,\
+        OPNAME ## _no_rnd_pixels_xy2},\
+    {\
+        OPNAME ## _pixels16,\
+        OPNAME ## _no_rnd_pixels16_x2,\
+        OPNAME ## _no_rnd_pixels16_y2,\
+        OPNAME ## _no_rnd_pixels16_xy2}\
 };
 
 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 #else // 64 bit variant
 
 #define PIXOP2(OPNAME, OP) \
-void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
+static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
     int i;\
     for(i=0; i<h; i++){\
         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
@@ -403,76 +470,148 @@ void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int
         block +=line_size;\
     }\
 }\
+static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels8(block, pixels, line_size, h);\
+}\
 \
-void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
+static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
     int i;\
     for(i=0; i<h; i++){\
-        int j;\
-        for(j=0; j<2; j++){\
-            const uint32_t a= LD32(pixels  );\
-            const uint32_t b= LD32(pixels+1);\
-            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
-            pixels+=4;\
-            block +=4;\
-        }\
-        pixels+=line_size-8;\
-        block +=line_size-8;\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1  ]);\
+        b= LD32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
     }\
 }\
 \
-void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
+static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
     int i;\
     for(i=0; i<h; i++){\
-        int j;\
-        for(j=0; j<2; j++){\
-            const uint32_t a= LD32(pixels  );\
-            const uint32_t b= LD32(pixels+1);\
-            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
-            pixels+=4;\
-            block +=4;\
-        }\
-        pixels+=line_size-8;\
-        block +=line_size-8;\
+        uint32_t a,b;\
+        a= LD32(&src1[i*src_stride1  ]);\
+        b= LD32(&src2[i*src_stride2  ]);\
+        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
     }\
 }\
 \
-void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
+static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
+                                                int src_stride1, int src_stride2, int h){\
+    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
+    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
+}\
+\
+static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
     int i;\
     for(i=0; i<h; i++){\
-        int j;\
-        for(j=0; j<2; j++){\
-            const uint32_t a= LD32(pixels          );\
-            const uint32_t b= LD32(pixels+line_size);\
-            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
-            pixels+=4;\
-            block +=4;\
-        }\
-        pixels+=line_size-8;\
-        block +=line_size-8;\
+        uint32_t a, b, c, d, l0, l1, h0, h1;\
+        a= LD32(&src1[i*src_stride1]);\
+        b= LD32(&src2[i*src_stride2]);\
+        c= LD32(&src3[i*src_stride3]);\
+        d= LD32(&src4[i*src_stride4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x02020202UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        c= LD32(&src3[i*src_stride3+4]);\
+        d= LD32(&src4[i*src_stride4+4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x02020202UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
     }\
 }\
-\
-void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
-{\
+static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
     int i;\
     for(i=0; i<h; i++){\
-        int j;\
-        for(j=0; j<2; j++){\
-            const uint32_t a= LD32(pixels          );\
-            const uint32_t b= LD32(pixels+line_size);\
-            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
-            pixels+=4;\
-            block +=4;\
-        }\
-        pixels+=line_size-8;\
-        block +=line_size-8;\
+        uint32_t a, b, c, d, l0, l1, h0, h1;\
+        a= LD32(&src1[i*src_stride1]);\
+        b= LD32(&src2[i*src_stride2]);\
+        c= LD32(&src3[i*src_stride3]);\
+        d= LD32(&src4[i*src_stride4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x01010101UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+        a= LD32(&src1[i*src_stride1+4]);\
+        b= LD32(&src2[i*src_stride2+4]);\
+        c= LD32(&src3[i*src_stride3+4]);\
+        d= LD32(&src4[i*src_stride4+4]);\
+        l0=  (a&0x03030303UL)\
+           + (b&0x03030303UL)\
+           + 0x01010101UL;\
+        h0= ((a&0xFCFCFCFCUL)>>2)\
+          + ((b&0xFCFCFCFCUL)>>2);\
+        l1=  (c&0x03030303UL)\
+           + (d&0x03030303UL);\
+        h1= ((c&0xFCFCFCFCUL)>>2)\
+          + ((d&0xFCFCFCFCUL)>>2);\
+        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
     }\
 }\
+static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
+    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+}\
+static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
+                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
+    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
+}\
 \
-void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int j;\
     for(j=0; j<2; j++){\
@@ -513,7 +652,7 @@ void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size,
     }\
 }\
 \
-void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int j;\
     for(j=0; j<2; j++){\
@@ -554,22 +693,43 @@ void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int lin
     }\
 }\
 \
-void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
-    OPNAME ## _pixels,\
-    OPNAME ## _pixels_x2,\
-    OPNAME ## _pixels_y2,\
-    OPNAME ## _pixels_xy2,\
+CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels8    , 8)\
+CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16    , OPNAME ## _pixels8    , 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
+CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
+\
+void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    {\
+        OPNAME ## _pixels16,\
+        OPNAME ## _pixels16_x2,\
+        OPNAME ## _pixels16_y2,\
+        OPNAME ## _pixels16_xy2},\
+    {\
+        OPNAME ## _pixels8,\
+        OPNAME ## _pixels8_x2,\
+        OPNAME ## _pixels8_y2,\
+        OPNAME ## _pixels8_xy2},\
 };\
 \
-void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
-    OPNAME ## _pixels,\
-    OPNAME ## _no_rnd_pixels_x2,\
-    OPNAME ## _no_rnd_pixels_y2,\
-    OPNAME ## _no_rnd_pixels_xy2,\
+void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    {\
+        OPNAME ## _pixels16,\
+        OPNAME ## _no_rnd_pixels16_x2,\
+        OPNAME ## _no_rnd_pixels16_y2,\
+        OPNAME ## _no_rnd_pixels16_xy2},\
+    {\
+        OPNAME ## _pixels8,\
+        OPNAME ## _no_rnd_pixels8_x2,\
+        OPNAME ## _no_rnd_pixels8_y2,\
+        OPNAME ## _no_rnd_pixels8_xy2},\
 };
+
 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 #endif
-
 #define op_put(a, b) a = b
 
 PIXOP2(avg, op_avg)
@@ -684,8 +844,11 @@ void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_siz
 
 #define op_avg(a, b) a = avg2(a, b)
 #define op_sub(a, b) a -= b
+#define op_put(a, b) a = b
 
 PIXOP(DCTELEM, sub, op_sub, 8)
+PIXOP(uint8_t, avg, op_avg, line_size)
+PIXOP(uint8_t, put, op_put, line_size)
 
 /* not rounding primitives */
 #undef avg2
@@ -693,6 +856,8 @@ PIXOP(DCTELEM, sub, op_sub, 8)
 #define avg2(a,b) ((a+b)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
 
+PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
+PIXOP(uint8_t, put_no_rnd, op_put, line_size)
 /* motion estimation */
 
 #undef avg2
@@ -702,7 +867,7 @@ PIXOP(DCTELEM, sub, op_sub, 8)
 #define avg2(a,b) ((a+b+1)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 
-static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
+static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
 {
     const int A=(16-x16)*(16-y16);
     const int B=(   x16)*(16-y16);
@@ -713,270 +878,465 @@ static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y1
 
     for(i=0; i<h; i++)
     {
-        dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
-        dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
-        dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
-        dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
-        dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
-        dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
-        dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
-        dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
-        dst+= srcStride;
-        src+= srcStride;
+        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
+        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
+        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
+        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
+        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
+        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
+        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
+        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
+        dst+= stride;
+        src+= stride;
     }
 }
 
-static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
+static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
 {
-    UINT8 *cm = cropTbl + MAX_NEG_CROP;
     int i;
     for(i=0; i<h; i++)
     {
-        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
-        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
-        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
-        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
-        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
-        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
-        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
-        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
+        ST32(dst   , LD32(src   )); /* copies one row per iteration from src to dst, h rows in total */
+        ST32(dst+4 , LD32(src+4 )); /* LD32/ST32 are defined elsewhere; presumably a 32-bit load and store */
+        ST32(dst+8 , LD32(src+8 )); /* the four pairs start at offsets 0, 4, 8 and 12 of the row */
+        ST32(dst+12, LD32(src+12));
+        dst[16]= src[16]; /* element 16 of the row is copied on its own */
         dst+=dstStride;
         src+=srcStride;
     }
 }
 
-static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
-{
-    UINT8 *cm = cropTbl + MAX_NEG_CROP;
-    int i;
-    for(i=0; i<w; i++)
-    {
-        const int src0= src[0*srcStride];
-        const int src1= src[1*srcStride];
-        const int src2= src[2*srcStride];
-        const int src3= src[3*srcStride];
-        const int src4= src[4*srcStride];
-        const int src5= src[5*srcStride];
-        const int src6= src[6*srcStride];
-        const int src7= src[7*srcStride];
-        const int src8= src[8*srcStride];
-        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
-        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
-        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
-        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
-        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
-        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
-        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
-        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
-        dst++;
-        src++;
-    }
-}
-
-static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
+static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
 {
     int i;
-    for(i=0; i<8; i++)
+    for(i=0; i<h; i++) /* one iteration per row; the row count now comes from h */
     {
-        dst[0]= src[0];
-        dst[1]= src[1];
-        dst[2]= src[2];
-        dst[3]= src[3];
-        dst[4]= src[4];
-        dst[5]= src[5];
-        dst[6]= src[6];
-        dst[7]= src[7];
+        ST32(dst   , LD32(src   )); /* LD32/ST32 are defined elsewhere; presumably a 32-bit load and store */
+        ST32(dst+4 , LD32(src+4 )); /* the two pairs start at offsets 0 and 4 of the row */
+        dst[8]= src[8]; /* element 8 of the row is copied on its own */
         dst+=dstStride;
         src+=srcStride;
     }
 }
 
-static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
-{
-    int i;
-    for(i=0; i<8; i++)
-    {
-        dst[0]= (src1[0] + src2[0] + r)>>1;
-        dst[1]= (src1[1] + src2[1] + r)>>1;
-        dst[2]= (src1[2] + src2[2] + r)>>1;
-        dst[3]= (src1[3] + src2[3] + r)>>1;
-        dst[4]= (src1[4] + src2[4] + r)>>1;
-        dst[5]= (src1[5] + src2[5] + r)>>1;
-        dst[6]= (src1[6] + src2[6] + r)>>1;
-        dst[7]= (src1[7] + src2[7] + r)>>1;
-        dst+=dstStride;
-        src1+=srcStride;
-        src2+=8;
-    }
-}
-
-static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
-{
-    int i;
-    for(i=0; i<8; i++)
-    {
-        dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
-        dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
-        dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
-        dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
-        dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
-        dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
-        dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
-        dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
-        dst+=dstStride;
-        src1+=srcStride;
-        src2+=8;
-        src3+=8;
-        src4+=8;
-    }
-}
-
-#define QPEL_MC(r, name) \
-static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
-    put_block(dst, src, dstStride, srcStride);\
+#define QPEL_MC(r, OPNAME, RND, OP) \
+static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
+    int i;\
+    for(i=0; i<h; i++)\
+    {\
+        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
+        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
+        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
+        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
+        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
+        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
+        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
+        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
+        dst+=dstStride;\
+        src+=srcStride;\
+    }\
 }\
 \
-static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
+    int i;\
+    for(i=0; i<w; i++)\
+    {\
+        const int src0= src[0*srcStride];\
+        const int src1= src[1*srcStride];\
+        const int src2= src[2*srcStride];\
+        const int src3= src[3*srcStride];\
+        const int src4= src[4*srcStride];\
+        const int src5= src[5*srcStride];\
+        const int src6= src[6*srcStride];\
+        const int src7= src[7*srcStride];\
+        const int src8= src[8*srcStride];\
+        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
+        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
+        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
+        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
+        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
+        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
+        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
+        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
+        dst++;\
+        src++;\
+    }\
+}\
+\
+static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
+    int i;\
+    for(i=0; i<h; i++)\
+    {\
+        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
+        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
+        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
+        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
+        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
+        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
+        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
+        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
+        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
+        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
+        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
+        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
+        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
+        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
+        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
+        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
+        dst+=dstStride;\
+        src+=srcStride;\
+    }\
+}\
+\
+static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
+    int i;\
+    for(i=0; i<w; i++)\
+    {\
+        const int src0= src[0*srcStride];\
+        const int src1= src[1*srcStride];\
+        const int src2= src[2*srcStride];\
+        const int src3= src[3*srcStride];\
+        const int src4= src[4*srcStride];\
+        const int src5= src[5*srcStride];\
+        const int src6= src[6*srcStride];\
+        const int src7= src[7*srcStride];\
+        const int src8= src[8*srcStride];\
+        const int src9= src[9*srcStride];\
+        const int src10= src[10*srcStride];\
+        const int src11= src[11*srcStride];\
+        const int src12= src[12*srcStride];\
+        const int src13= src[13*srcStride];\
+        const int src14= src[14*srcStride];\
+        const int src15= src[15*srcStride];\
+        const int src16= src[16*srcStride];\
+        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
+        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
+        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
+        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
+        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
+        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
+        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
+        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
+        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
+        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
+        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
+        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
+        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
+        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
+        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
+        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
+        dst++;\
+        src++;\
+    }\
+}\
+\
+static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
+    OPNAME ## pixels8(dst, src, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 half[64];\
-    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
-    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
+    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
 }\
 \
-static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
-    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
+static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
+    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 }\
 \
-static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 half[64];\
-    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
-    avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
+    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
 }\
 \
-static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 half[64];\
-    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
-    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
+    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
 }\
 \
-static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
-    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
+static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
+    copy_block9(full, src, 16, stride, 9);\
+    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
 }\
 \
-static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 half[64];\
-    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
-    avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
+    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
 }\
-static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 }\
-static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 }\
-static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 }\
-static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 }\
-static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfH[72];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
 }\
-static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfH[72];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 }\
-static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
 }\
-static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[16*9];\
     UINT8 halfH[72];\
     UINT8 halfV[64];\
     UINT8 halfHV[64];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
-    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
-    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
+    copy_block9(full, src, 16, stride, 9);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
+    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
+    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
 }\
-static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
-{\
+static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
     UINT8 halfH[72];\
-    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
-    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
+    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
+    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
+}\
+static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
+    OPNAME ## pixels16(dst, src, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 half[256];\
+    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
+    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
+    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 half[256];\
+    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
+    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 half[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
+    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    copy_block17(full, src, 24, stride, 17);\
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 half[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
+    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 halfH[272];\
+    UINT8 halfV[256];\
+    UINT8 halfHV[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 halfH[272];\
+    UINT8 halfV[256];\
+    UINT8 halfHV[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 halfH[272];\
+    UINT8 halfV[256];\
+    UINT8 halfHV[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 halfH[272];\
+    UINT8 halfV[256];\
+    UINT8 halfHV[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 halfH[272];\
+    UINT8 halfHV[256];\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
 }\
-qpel_mc_func qpel_mc ## name ## _tab[16]={ \
-    qpel_mc00_c ## name,                                                                   \
-    qpel_mc10_c ## name,                                                                   \
-    qpel_mc20_c ## name,                                                                   \
-    qpel_mc30_c ## name,                                                                   \
-    qpel_mc01_c ## name,                                                                   \
-    qpel_mc11_c ## name,                                                                   \
-    qpel_mc21_c ## name,                                                                   \
-    qpel_mc31_c ## name,                                                                   \
-    qpel_mc02_c ## name,                                                                   \
-    qpel_mc12_c ## name,                                                                   \
-    qpel_mc22_c ## name,                                                                   \
-    qpel_mc32_c ## name,                                                                   \
-    qpel_mc03_c ## name,                                                                   \
-    qpel_mc13_c ## name,                                                                   \
-    qpel_mc23_c ## name,                                                                   \
-    qpel_mc33_c ## name,                                                                   \
+static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 halfH[272];\
+    UINT8 halfHV[256];\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 halfH[272];\
+    UINT8 halfV[256];\
+    UINT8 halfHV[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 full[24*17];\
+    UINT8 halfH[272];\
+    UINT8 halfV[256];\
+    UINT8 halfHV[256];\
+    copy_block17(full, src, 24, stride, 17);\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
+    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
+    UINT8 halfH[272];\
+    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
+}\
+qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
+  {\
+    OPNAME ## qpel16_mc00_c,                                                                   \
+    OPNAME ## qpel16_mc10_c,                                                                   \
+    OPNAME ## qpel16_mc20_c,                                                                   \
+    OPNAME ## qpel16_mc30_c,                                                                   \
+    OPNAME ## qpel16_mc01_c,                                                                   \
+    OPNAME ## qpel16_mc11_c,                                                                   \
+    OPNAME ## qpel16_mc21_c,                                                                   \
+    OPNAME ## qpel16_mc31_c,                                                                   \
+    OPNAME ## qpel16_mc02_c,                                                                   \
+    OPNAME ## qpel16_mc12_c,                                                                   \
+    OPNAME ## qpel16_mc22_c,                                                                   \
+    OPNAME ## qpel16_mc32_c,                                                                   \
+    OPNAME ## qpel16_mc03_c,                                                                   \
+    OPNAME ## qpel16_mc13_c,                                                                   \
+    OPNAME ## qpel16_mc23_c,                                                                   \
+    OPNAME ## qpel16_mc33_c,                                                                   \
+  },{\
+    OPNAME ## qpel8_mc00_c,                                                                   \
+    OPNAME ## qpel8_mc10_c,                                                                   \
+    OPNAME ## qpel8_mc20_c,                                                                   \
+    OPNAME ## qpel8_mc30_c,                                                                   \
+    OPNAME ## qpel8_mc01_c,                                                                   \
+    OPNAME ## qpel8_mc11_c,                                                                   \
+    OPNAME ## qpel8_mc21_c,                                                                   \
+    OPNAME ## qpel8_mc31_c,                                                                   \
+    OPNAME ## qpel8_mc02_c,                                                                   \
+    OPNAME ## qpel8_mc12_c,                                                                   \
+    OPNAME ## qpel8_mc22_c,                                                                   \
+    OPNAME ## qpel8_mc32_c,                                                                   \
+    OPNAME ## qpel8_mc03_c,                                                                   \
+    OPNAME ## qpel8_mc13_c,                                                                   \
+    OPNAME ## qpel8_mc23_c,                                                                   \
+    OPNAME ## qpel8_mc33_c,                                                                   \
+  }\
 };
 
-QPEL_MC(0, _rnd)
-QPEL_MC(1, _no_rnd)
+#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) /* average the value already in a with the op_put result, halves rounded up */
+#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1) /* same average, but with both rounding biases lowered (15 instead of 16, no +1) */
+#define op_put(a, b) a = cm[((b) + 16)>>5] /* b is a lowpass sum whose taps (20,-6,3,-1 per sample pair) add up to 32: divide by 32 rounding half up; cm comes from the expansion context, presumably a clamping table -- TODO confirm */
+#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5] /* as op_put with the rounding bias lowered from 16 to 15 */
+
+QPEL_MC(0, put_       , _       , op_put)
+QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
+QPEL_MC(0, avg_       , _       , op_avg)
+//QPEL_MC(1, avg_no_rnd , _       , op_avg) /* disabled: no avg_no_rnd_ variant is instantiated here */
+#undef op_avg
+#undef op_avg_no_rnd
+#undef op_put
+#undef op_put_no_rnd
 
 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
@@ -1261,6 +1621,8 @@ void dsputil_init(void)
     add_pixels_clamped = add_pixels_clamped_c;
     gmc1= gmc1_c;
     clear_blocks= clear_blocks_c;
+    pix_sum= pix_sum_c;
+    pix_norm1= pix_norm1_c;
 
     pix_abs16x16     = pix_abs16x16_c;
     pix_abs16x16_x2  = pix_abs16x16_x2_c;
@@ -1270,7 +1632,6 @@ void dsputil_init(void)
     pix_abs8x8_x2  = pix_abs8x8_x2_c;
     pix_abs8x8_y2  = pix_abs8x8_y2_c;
     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
-    av_fdct = fdct_ifast;
 
     use_permuted_idct = 1;
 
@@ -1288,6 +1649,9 @@ void dsputil_init(void)
     dsputil_init_alpha();
     use_permuted_idct = 0;
 #endif
+#ifdef ARCH_POWERPC
+    dsputil_init_ppc();
+#endif
 
 #ifdef SIMPLE_IDCT
     if (ff_idct == NULL) {
@@ -1335,6 +1699,7 @@ void dsputil_init(void)
 /* remove any non bit exact operation (testing purpose) */
 void avcodec_set_bit_exact(void)
 {
+    ff_bit_exact=1;
 #ifdef HAVE_MMX
     dsputil_set_bit_exact_mmx();
 #endif
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index 8c3fdb716..3a26cddcf 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -28,13 +28,12 @@
 typedef short DCTELEM;
 
 void fdct_ifast (DCTELEM *data);
+void ff_jpeg_fdct_islow (DCTELEM *data);
 
 void j_rev_dct (DCTELEM *data);
 
 void fdct_mmx(DCTELEM *block);
 
-extern void (*av_fdct)(DCTELEM *block);
-
 /* encoding scans */
 extern UINT8 ff_alternate_horizontal_scan[64];
 extern UINT8 ff_alternate_vertical_scan[64];
@@ -52,17 +51,29 @@ extern UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
 
 void dsputil_init(void);
 
+/* minimum alignment rules ;)
+If you notice errors in the alignment annotations, need more alignment for some asm code on some CPU,
+or need to call a function with less-aligned data, then send a mail to the ffmpeg-dev list, ...
+
+!warning: these alignments might not match reality (an attribute((align)) may be missing somewhere);
+I (michael) didn't check them, these are just the alignments which I think could be reached easily ...
+
+!future video codecs might need functions with less strict alignment
+*/
+
 /* pixel ops : interface with DCT */
+extern void (*ff_idct)(DCTELEM *block/*align 16*/);
+extern void (*ff_idct_put)(UINT8 *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+extern void (*ff_idct_add)(UINT8 *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+extern void (*get_pixels)(DCTELEM *block/*align 16*/, const UINT8 *pixels/*align 8*/, int line_size);
+extern void (*diff_pixels)(DCTELEM *block/*align 16*/, const UINT8 *s1/*align 8*/, const UINT8 *s2/*align 8*/, int stride);
+extern void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
+extern void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
+extern void (*gmc1)(UINT8 *dst/*align 8*/, UINT8 *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
+extern void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
+extern int (*pix_sum)(UINT8 * pix, int line_size);
+extern int (*pix_norm1)(UINT8 * pix, int line_size);
 
-extern void (*ff_idct)(DCTELEM *block);
-extern void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
-extern void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
-extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
-extern void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
-extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
-extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
-extern void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
-extern void (*clear_blocks)(DCTELEM *blocks);
 
 
 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size);
@@ -72,19 +83,28 @@ void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
 void clear_blocks_c(DCTELEM *blocks);
 
 /* add and put pixel (decoding) */
-typedef void (*op_pixels_func)(UINT8 *block, const UINT8 *pixels, int line_size, int h);
-typedef void (*qpel_mc_func)(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my);
-
-extern op_pixels_func put_pixels_tab[4];
-extern op_pixels_func avg_pixels_tab[4];
-extern op_pixels_func put_no_rnd_pixels_tab[4];
-extern op_pixels_func avg_no_rnd_pixels_tab[4];
-extern qpel_mc_func qpel_mc_rnd_tab[16];
-extern qpel_mc_func qpel_mc_no_rnd_tab[16];
+// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
+typedef void (*op_pixels_func)(UINT8 *block/*align width (8 or 16)*/, const UINT8 *pixels/*align 1*/, int line_size, int h);
+typedef void (*qpel_mc_func)(UINT8 *dst/*align width (8 or 16)*/, UINT8 *src/*align 1*/, int stride);
+
+extern op_pixels_func put_pixels_tab[2][4];
+extern op_pixels_func avg_pixels_tab[2][4];
+extern op_pixels_func put_no_rnd_pixels_tab[2][4];
+extern op_pixels_func avg_no_rnd_pixels_tab[2][4];
+extern qpel_mc_func put_qpel_pixels_tab[2][16];
+extern qpel_mc_func avg_qpel_pixels_tab[2][16];
+extern qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
+extern qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
+
+#define CALL_2X_PIXELS(a, b, n) /* defines a static function a() that calls b() twice: at the given pointers and at offset n from them */\
+static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
+    b(block  , pixels  , line_size, h);\
+    b(block+n, pixels+n, line_size, h); /* same line_size and h, both pointers advanced by n */\
+}
 
 /* motion estimation */
 
-typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size);
+typedef int (*op_pixels_abs_func)(UINT8 *blk1/*align width (8 or 16)*/, UINT8 *blk2/*align 1*/, int line_size);
 
 extern op_pixels_abs_func pix_abs16x16;
 extern op_pixels_abs_func pix_abs16x16_x2;
@@ -170,6 +190,13 @@ void dsputil_init_mlib(void);
 
 void dsputil_init_alpha(void);
 
+#elif defined(ARCH_POWERPC)
+
+#define emms_c()
+#define __align8 __attribute__ ((aligned (16)))
+
+void dsputil_init_ppc(void);
+
 #else
 
 #define emms_c()
diff --git a/src/libffmpeg/libavcodec/eval.c b/src/libffmpeg/libavcodec/eval.c
new file mode 100644
index 000000000..bcaf4f59b
--- /dev/null
+++ b/src/libffmpeg/libavcodec/eval.c
@@ -0,0 +1,249 @@
+/*
+ * simple arithmetic expression evaluator
+ *
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+ /*
+ * see http://joe.hotchkiss.com/programming/eval/eval.html
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#ifndef NAN
+  #define NAN 0 /* fallback when <math.h> provides no NAN: the value pop() returns on underflow is then plain 0 */
+#endif
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#define STACK_SIZE 100
+
+typedef struct Parser{ /* state of one ff_eval() run */
+    double stack[STACK_SIZE];   // value stack filled by push() and drained by pop()
+    int stack_index;            // number of values currently on the stack
+    char *s;                    // current read position in the expression string
+    double *const_value;        // const_value[i] is the value of const_name[i]
+    char **const_name;          // NULL terminated
+    double (**func1)(void *, double a); // NULL terminated
+    char **func1_name;          // NULL terminated; func1_name[i] names func1[i]
+    double (**func2)(void *, double a, double b); // NULL terminated
+    char **func2_name;          // NULL terminated; func2_name[i] names func2[i]
+    void *opaque;               // passed as first argument to every func1/func2 callback
+} Parser;
+
+static void evalExpression(Parser *p);
+
+static void push(Parser *p, double d){ /* push d onto the value stack; when the stack is full this is reported and d is dropped */
+    if(p->stack_index >= STACK_SIZE){ /* was stack_index+1 >= STACK_SIZE, which left the last slot unusable */
+        fprintf(stderr, "stack overflow in the parser\n");
+        return;
+    }
+    p->stack[ p->stack_index++ ]= d;
+//printf("push %f\n", d); fflush(stdout);
+}
+
+static double pop(Parser *p){ /* remove and return the value on top of the stack */
+    if(p->stack_index<=0){
+        fprintf(stderr, "stack underflow in the parser\n");
+        return NAN; /* empty stack: report it and return NAN, stack_index is left unchanged */
+    }
+//printf("pop\n"); fflush(stdout);
+    return p->stack[ --p->stack_index ];
+}
+
+static int strmatch(char *s, char *prefix){ /* returns 1 if s begins with prefix (an empty prefix always matches), otherwise 0 */
+    int i;
+    for(i=0; prefix[i]; i++){
+        if(prefix[i] != s[i]) return 0; /* also ends the scan at the end of s: its NUL differs from every prefix character */
+    }
+    return 1;
+}
+
+static void evalPrimary(Parser *p){ /* parse a number, named constant or function call at p->s and push its value; on a syntax error a message is printed and nothing is pushed */
+    double d, d2=NAN;
+    char *next= p->s;
+    int i, sep;
+
+    /* number */
+    d= strtod(p->s, &next);
+    if(next != p->s){
+        push(p, d);
+        p->s= next;
+        return;
+    }
+    
+    /* named constants (prefix match, first hit wins); the table is optional, like the function tables */
+    for(i=0; p->const_name && p->const_name[i]; i++){
+        if(strmatch(p->s, p->const_name[i])){
+            push(p, p->const_value[i]);
+            p->s+= strlen(p->const_name[i]);
+            return;
+        }
+    }
+    p->s= strchr(p->s, '('); /* function call: name(arg) or name(arg1,arg2); next still points at the name */
+    if(p->s==NULL){
+        fprintf(stderr, "Parser: missing ( in \"%s\"\n", next);
+        p->s= next + strlen(next); // park on the NUL: the callers dereference p->s after we return
+        return;
+    }
+    p->s++; // "("
+    evalExpression(p);
+    d= pop(p);
+    if((sep= p->s[0]) != 0) p->s++; // ")" or ","; stay on the NUL if the input ends here
+    if(sep == ','){
+        evalExpression(p);
+        d2= pop(p);
+        if((sep= p->s[0]) != 0) p->s++; // ")"
+    }
+    
+         if( strmatch(next, "sinh"  ) ) d= sinh(d);
+    else if( strmatch(next, "cosh"  ) ) d= cosh(d);
+    else if( strmatch(next, "tanh"  ) ) d= tanh(d);
+    else if( strmatch(next, "sin"   ) ) d= sin(d);
+    else if( strmatch(next, "cos"   ) ) d= cos(d);
+    else if( strmatch(next, "tan"   ) ) d= tan(d);
+    else if( strmatch(next, "exp"   ) ) d= exp(d);
+    else if( strmatch(next, "log"   ) ) d= log(d);
+    else if( strmatch(next, "squish") ) d= 1/(1+exp(4*d));
+    else if( strmatch(next, "gauss" ) ) d= exp(-d*d/2)/sqrt(2*M_PI);
+    else if( strmatch(next, "abs"   ) ) d= fabs(d); // fabs, not abs: the integer abs() truncated the fraction
+    else if( strmatch(next, "max"   ) ) d= d > d2 ? d : d2;
+    else if( strmatch(next, "min"   ) ) d= d < d2 ? d : d2;
+    else if( strmatch(next, "gt"    ) ) d= d > d2 ? 1.0 : 0.0;
+    else if( strmatch(next, "lt"    ) ) d= d < d2 ? 1.0 : 0.0; // strictly less: equal operands now give 0
+    else if( strmatch(next, "eq"    ) ) d= d == d2 ? 1.0 : 0.0;
+//    else if( strmatch(next, "l1"    ) ) d= 1 + d2*(d - 1);
+//    else if( strmatch(next, "sq01"  ) ) d= (d >= 0.0 && d <=1.0) ? 1.0 : 0.0;
+    else{ // caller supplied functions; only reached when no built-in name is a prefix of the input
+        int error=1;
+        for(i=0; p->func1_name && p->func1_name[i]; i++){
+            if(strmatch(next, p->func1_name[i])){
+                d= p->func1[i](p->opaque, d);
+                error=0;
+                break;
+            }
+        }
+
+        for(i=0; error && p->func2_name && p->func2_name[i]; i++){ // skipped once a func1 matched, so only one function is applied
+            if(strmatch(next, p->func2_name[i])){
+                d= p->func2[i](p->opaque, d, d2);
+                error=0;
+                break;
+            }
+        }
+
+        if(error){
+            fprintf(stderr, "Parser: unknown function in \"%s\"\n", next);
+            return;
+        }
+    }
+    
+    if(sep != ')'){ // sep is the last delimiter consumed, or 0 when the input ended early
+        fprintf(stderr, "Parser: missing ) in \"%s\"\n", next);
+        return;
+    }
+    push(p, d);
+}      
+       
+static void evalPow(Parser *p){ /* parse one optionally signed operand, either "(" expression ")" or a primary, and push its value */
+    int neg= 0;
+    if(p->s[0]=='+') p->s++;
+       
+    if(p->s[0]=='-'){ 
+        neg= 1;
+        p->s++;
+    }
+    
+    if(p->s[0]=='('){
+        p->s++;
+        evalExpression(p);
+        if(p->s[0]==')')
+            p->s++;
+        else // do not advance: p->s may sit on the terminating NUL and must never step past it
+            fprintf(stderr, "Parser: missing )\n");
+    }else{
+        evalPrimary(p);
+    }
+    
+    if(neg) push(p, -pop(p));
+}
+
+static void evalFactor(Parser *p){ /* parse operand { "^" operand } and leave the result on the stack */
+    evalPow(p);
+    while(p->s[0]=='^'){
+        double d;
+
+        p->s++;
+        evalPow(p);
+        d= pop(p); /* exponent: parsed last, so it is on top */
+        push(p, pow(pop(p), d)); /* the base is the result so far, hence a^b^c evaluates as (a^b)^c */
+    }
+}
+
+static void evalTerm(Parser *p){ /* parse factor { ("*" | "/") factor } and leave the result on the stack */
+    evalFactor(p);
+    while(p->s[0]=='*' || p->s[0]=='/'){
+        int inv= p->s[0]=='/';
+        double d;
+
+        p->s++;
+        evalFactor(p);
+        d= pop(p); /* right-hand operand */
+        if(inv) d= 1.0/d; /* division is done as multiplication by the reciprocal */
+        push(p, d * pop(p));
+    }
+}
+
+static void evalExpression(Parser *p){ /* parse term { ("+" | "-") term } and leave the result on the stack */
+    evalTerm(p);
+    while(p->s[0]=='+' || p->s[0]=='-'){ /* the operator must be the very next character: nothing is skipped here */
+        int sign= p->s[0]=='-';
+        double d;
+
+        p->s++;
+        evalTerm(p);
+        d= pop(p); /* right-hand operand */
+        if(sign) d= -d; /* subtraction is done as addition of the negated operand */
+        push(p, d + pop(p));
+    }
+}
+
+double ff_eval(char *s, double *const_value, char **const_name, /* s: expression text; const_name[i] (NULL terminated list) has the value const_value[i] */
+               double (**func1)(void *, double), char **func1_name, /* extra one-argument functions, looked up through func1_name; func1_name may be NULL */
+               double (**func2)(void *, double, double), char **func2_name, /* extra two-argument functions, looked up through func2_name; func2_name may be NULL */
+               void *opaque){ /* handed unchanged to the func1/func2 callbacks */
+    Parser p;
+    
+    p.stack_index=0;
+    p.s= s;
+    p.const_value= const_value;
+    p.const_name = const_name;
+    p.func1      = func1;
+    p.func1_name = func1_name;
+    p.func2      = func2;
+    p.func2_name = func2_name;
+    p.opaque     = opaque;
+    
+    evalExpression(&p); /* input left over after the expression is not diagnosed here */
+    return pop(&p); /* value of the expression; pop() returns NAN if nothing was pushed */
+}
diff --git a/src/libffmpeg/libavcodec/fdctref.c b/src/libffmpeg/libavcodec/fdctref.c
index 245492496..ae376f794 100644
--- a/src/libffmpeg/libavcodec/fdctref.c
+++ b/src/libffmpeg/libavcodec/fdctref.c
@@ -103,6 +103,7 @@ short *block;
         	s += c[i][5] * tmp[8 * 5 + j];
         	s += c[i][6] * tmp[8 * 6 + j];
         	s += c[i][7] * tmp[8 * 7 + j];
+		s*=8.0;
 
     		block[8 * i + j] = (short)floor(s + 0.499999);
 /*
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index 2fd90e9ec..c9a0a9d30 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -18,7 +18,9 @@
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
- * ac prediction encoding & b-frame support by Michael Niedermayer <michaelni@gmx.at>
+ * ac prediction encoding, b-frame support, error resilience, optimizations,
+ * qpel decoding, gmc decoding, interlaced decoding, 
+ * by Michael Niedermayer <michaelni@gmx.at>
  */
  
 //#define DEBUG
@@ -48,7 +50,7 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block,
 			      int n);
 static void h263_encode_motion(MpegEncContext * s, int val, int fcode);
 static void h263p_encode_umotion(MpegEncContext * s, int val);
-static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block,
+static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block,
 			       int n, int dc, UINT8 *scan_table, 
                                PutBitContext *dc_pb, PutBitContext *ac_pb);
 static int h263_decode_motion(MpegEncContext * s, int pred, int fcode);
@@ -71,6 +73,22 @@ static UINT8 umv_fcode_tab[MAX_MV*2+1];
 
 static UINT16 uni_DCtab_lum  [512][2];
 static UINT16 uni_DCtab_chrom[512][2];
+static UINT32 uni_mpeg4_intra_rl_bits[64*64*2*2];
+static UINT8  uni_mpeg4_intra_rl_len [64*64*2*2];
+static UINT32 uni_mpeg4_inter_rl_bits[64*64*2*2];
+static UINT8  uni_mpeg4_inter_rl_len [64*64*2*2];
+#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128 + (run)*256 + (level))
+//#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128*64 + (run) + (level)*64)
+
+/* mpeg4
+inter
+max level: 24/6
+max run: 53/63
+
+intra
+max level: 53/16
+max run: 29/41
+*/
 
 int h263_get_picture_format(int width, int height)
 {
@@ -169,12 +187,17 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
             /* Custom Picture Format (CPFMT) */
 		
 	    if (s->aspect_ratio_info)
-            put_bits(&s->pb,4,s->aspect_ratio_info);
+        	put_bits(&s->pb,4,s->aspect_ratio_info);
 	    else
-            put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */
+        	put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */
             put_bits(&s->pb,9,(s->width >> 2) - 1);
             put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */
             put_bits(&s->pb,9,(s->height >> 2));
+	    if (s->aspect_ratio_info == FF_ASPECT_EXTENDED)
+	    {
+		put_bits(&s->pb, 8, s->aspected_width);
+		put_bits(&s->pb, 8, s->aspected_height);
+	    }
         }
         
         /* Unlimited Unrestricted Motion Vectors Indicator (UUI) */
@@ -347,6 +370,8 @@ void mpeg4_encode_mb(MpegEncContext * s,
             case 0: /* direct */
                 h263_encode_motion(s, motion_x, 1);
                 h263_encode_motion(s, motion_y, 1);                
+                s->b_count++;
+                s->f_count++;
                 break;
             case 1: /* bidir */
                 h263_encode_motion(s, s->mv[0][0][0] - s->last_mv[0][0][0], s->f_code);
@@ -357,18 +382,22 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 s->last_mv[0][0][1]= s->mv[0][0][1];
                 s->last_mv[1][0][0]= s->mv[1][0][0];
                 s->last_mv[1][0][1]= s->mv[1][0][1];
+                s->b_count++;
+                s->f_count++;
                 break;
             case 2: /* backward */
                 h263_encode_motion(s, motion_x - s->last_mv[1][0][0], s->b_code);
                 h263_encode_motion(s, motion_y - s->last_mv[1][0][1], s->b_code);
                 s->last_mv[1][0][0]= motion_x;
                 s->last_mv[1][0][1]= motion_y;
+                s->b_count++;
                 break;
             case 3: /* forward */
                 h263_encode_motion(s, motion_x - s->last_mv[0][0][0], s->f_code);
                 h263_encode_motion(s, motion_y - s->last_mv[0][0][1], s->f_code);
                 s->last_mv[0][0][0]= motion_x;
                 s->last_mv[0][0][1]= motion_y;
+                s->f_count++;
                 break;
             default:
                 printf("unknown mb type\n");
@@ -499,7 +528,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 s->p_tex_bits+= bits - s->last_bits;
                 s->last_bits=bits;
             }
-            s->p_count++;
+            s->f_count++;
         }
     } else {
         int cbp;
@@ -1085,6 +1114,91 @@ static void init_uni_dc_tab(void)
     }
 }
 
+static void init_uni_mpeg4_rl_tab(RLTable *rl, UINT32 *bits_tab, UINT8 *len_tab){
+    int slevel, run, last;
+    /* Fills bits_tab/len_tab with the shortest code found for every (last, run, signed level). */
+    assert(MAX_LEVEL >= 64);
+    assert(MAX_RUN   >= 63);
+    /* signed levels -64..63 except 0, runs 0..63, last 0/1; the slot comes from UNI_MPEG4_ENC_INDEX */
+    for(slevel=-64; slevel<64; slevel++){
+        if(slevel==0) continue;
+        for(run=0; run<64; run++){
+            for(last=0; last<=1; last++){
+                const int index= UNI_MPEG4_ENC_INDEX(last, run, slevel+64);
+                int level= slevel < 0 ? -slevel : slevel;
+                int sign= slevel < 0 ? 1 : 0;
+                int bits, len, code;
+                int level1, run1;
+                /* sentinel length: each candidate below replaces the entry only if it is shorter */
+                len_tab[index]= 100;
+                     
+                /* ESC0: the table code for (last, run, level) followed by the sign bit */
+                code= get_rl_index(rl, last, run, level);
+                bits= rl->table_vlc[code][0];
+                len=  rl->table_vlc[code][1];
+                bits=bits*2+sign; len++;
+                /* rejected when code==rl->n (presumably "no direct code"; rl->n is the escape entry used below - confirm) */
+                if(code!=rl->n && len < len_tab[index]){
+                    bits_tab[index]= bits;
+                    len_tab [index]= len;
+                }
+#if 1
+                /* ESC1: escape prefix, a 0 bit, the code for level - max_level[last][run], sign */
+                bits= rl->table_vlc[rl->n][0];
+                len=  rl->table_vlc[rl->n][1];
+                bits=bits*2;    len++; //esc1
+                level1= level - rl->max_level[last][run];
+                if(level1>0){
+                    code= get_rl_index(rl, last, run, level1);
+                    bits<<= rl->table_vlc[code][1];
+                    len  += rl->table_vlc[code][1];
+                    bits += rl->table_vlc[code][0];
+                    bits=bits*2+sign; len++;
+                
+                    if(code!=rl->n && len < len_tab[index]){
+                        bits_tab[index]= bits;
+                        len_tab [index]= len;
+                    }
+                }
+#endif 
+#if 1
+                /* ESC2: escape prefix, bits 10, the code for run - max_run[last][level] - 1, sign */
+                bits= rl->table_vlc[rl->n][0];
+                len=  rl->table_vlc[rl->n][1];
+                bits=bits*4+2;    len+=2; //esc2
+                run1 = run - rl->max_run[last][level] - 1;
+                if(run1>=0){
+                    code= get_rl_index(rl, last, run1, level);
+                    bits<<= rl->table_vlc[code][1];
+                    len  += rl->table_vlc[code][1];
+                    bits += rl->table_vlc[code][0];
+                    bits=bits*2+sign; len++;
+                
+                    if(code!=rl->n && len < len_tab[index]){
+                        bits_tab[index]= bits;
+                        len_tab [index]= len;
+                    }
+                }
+#endif           
+                /* ESC3: escape prefix, bits 11, last (1 bit), run (6 bits), marker, 12-bit signed level, marker */
+                bits= rl->table_vlc[rl->n][0];
+                len = rl->table_vlc[rl->n][1];
+                bits=bits*4+3;    len+=2; //esc3
+                bits=bits*2+last; len++;
+                bits=bits*64+run; len+=6;
+                bits=bits*2+1;    len++;  //marker
+                bits=bits*4096+(slevel&0xfff); len+=12;
+                bits=bits*2+1;    len++;  //marker
+                /* always constructible, so it is stored whenever nothing shorter was accepted above */
+                if(len < len_tab[index]){
+                    bits_tab[index]= bits;
+                    len_tab [index]= len;
+                }
+            }
+        }
+    }
+}
+
 void h263_encode_init(MpegEncContext *s)
 {
     static int done = 0;
@@ -1097,6 +1211,9 @@ void h263_encode_init(MpegEncContext *s)
         init_rl(&rl_inter);
         init_rl(&rl_intra);
         init_rl(&rl_intra_aic);
+        
+        init_uni_mpeg4_rl_tab(&rl_intra, uni_mpeg4_intra_rl_bits, uni_mpeg4_intra_rl_len);
+        init_uni_mpeg4_rl_tab(&rl_inter, uni_mpeg4_inter_rl_bits, uni_mpeg4_inter_rl_len);
 
         init_mv_penalty_and_fcode(s);
     }
@@ -1210,13 +1327,16 @@ void ff_set_mpeg4_time(MpegEncContext * s, int picture_number){
 
         s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
     }
-
-    s->time= picture_number*(INT64)FRAME_RATE_BASE*s->time_increment_resolution/s->frame_rate;
+    
+    if(s->avctx->pts)
+        s->time= (s->avctx->pts*s->time_increment_resolution + 500*1000)/(1000*1000);
+    else
+        s->time= picture_number*(INT64)FRAME_RATE_BASE*s->time_increment_resolution/s->frame_rate;
     time_div= s->time/s->time_increment_resolution;
     time_mod= s->time%s->time_increment_resolution;
 
     if(s->pict_type==B_TYPE){
-        s->bp_time= s->last_non_b_time - s->time;
+        s->pb_time= s->pp_time - (s->last_non_b_time - s->time);
     }else{
         s->last_time_base= s->time_base;
         s->time_base= time_div;
@@ -1246,6 +1366,11 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
         put_bits(&s->pb, 4, s->aspect_ratio_info);/* aspect ratio info */
     else
         put_bits(&s->pb, 4, 1);		/* aspect ratio info= sqare pixel */
+    if (s->aspect_ratio_info == FF_ASPECT_EXTENDED)
+    {
+	put_bits(&s->pb, 8, s->aspected_width);
+	put_bits(&s->pb, 8, s->aspected_height);
+    }
 
     if(s->low_delay){
         put_bits(&s->pb, 1, 1);		/* vol control parameters= yes */
@@ -1295,14 +1420,17 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
         put_bits(&s->pb, 1, 0);		/* reduced res vop */
     }
     put_bits(&s->pb, 1, 0);		/* scalability */
-
+    
     ff_mpeg4_stuffing(&s->pb);
-    put_bits(&s->pb, 16, 0);
-    put_bits(&s->pb, 16, 0x1B2);	/* user_data */
-    sprintf(buf, "FFmpeg%sb%s", FFMPEG_VERSION, LIBAVCODEC_BUILD_STR);
-    put_string(&s->pb, buf);
 
-    ff_mpeg4_stuffing(&s->pb);
+    /* user data */
+    if(!ff_bit_exact){
+        put_bits(&s->pb, 16, 0);
+        put_bits(&s->pb, 16, 0x1B2);	/* user_data */
+        sprintf(buf, "FFmpeg%sb%s", FFMPEG_VERSION, LIBAVCODEC_BUILD_STR);
+        put_string(&s->pb, buf);
+        ff_mpeg4_stuffing(&s->pb);
+    }
 }
 
 /* write mpeg4 VOP header */
@@ -1529,34 +1657,69 @@ static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n)
 #endif
 }
 
-static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
+static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
                                UINT8 *scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb)
 {
-    int level, run, last, i, j, last_index, last_non_zero, sign, slevel;
+    int last, i, last_non_zero, sign;
     int code;
     const RLTable *rl;
+    UINT32 *bits_tab;
+    UINT8 *len_tab;
+    const int last_index = s->block_last_index[n];
 
-    if (s->mb_intra) {
+    if (s->mb_intra) { //Note gcc (3.2.1 at least) will optimize this away
 	/* mpeg4 based DC predictor */
 	mpeg4_encode_dc(dc_pb, intra_dc, n);
+        if(last_index<1) return;
 	i = 1;
         rl = &rl_intra;
+        bits_tab= uni_mpeg4_intra_rl_bits;
+        len_tab = uni_mpeg4_intra_rl_len;
     } else {
+        if(last_index<0) return;
 	i = 0;
         rl = &rl_inter;
+        bits_tab= uni_mpeg4_inter_rl_bits;
+        len_tab = uni_mpeg4_inter_rl_len;
     }
 
     /* AC coefs */
-    last_index = s->block_last_index[n];
     last_non_zero = i - 1;
-    for (; i <= last_index; i++) {
-	j = scan_table[i];
-	level = block[j];
+#if 1
+    for (; i < last_index; i++) {
+	int level = block[ scan_table[i] ];
 	if (level) {
-	    run = i - last_non_zero - 1;
+	    int run = i - last_non_zero - 1;
+            level+=64;
+            if((level&(~127)) == 0){
+                const int index= UNI_MPEG4_ENC_INDEX(0, run, level);
+                put_bits(ac_pb, len_tab[index], bits_tab[index]);
+            }else{ //ESC3
+                put_bits(ac_pb, 7+2+1+6+1+12+1, (3<<23)+(3<<21)+(0<<20)+(run<<14)+(1<<13)+(((level-64)&0xfff)<<1)+1);
+            }
+	    last_non_zero = i;
+	}
+    }
+    /*if(i<=last_index)*/{
+	int level = block[ scan_table[i] ];
+        int run = i - last_non_zero - 1;
+        level+=64;
+        if((level&(~127)) == 0){
+            const int index= UNI_MPEG4_ENC_INDEX(1, run, level);
+            put_bits(ac_pb, len_tab[index], bits_tab[index]);
+        }else{ //ESC3
+            put_bits(ac_pb, 7+2+1+6+1+12+1, (3<<23)+(3<<21)+(1<<20)+(run<<14)+(1<<13)+(((level-64)&0xfff)<<1)+1);
+        }
+    }
+#else
+    for (; i <= last_index; i++) {
+	const int slevel = block[ scan_table[i] ];
+	if (slevel) {
+            int level;
+	    int run = i - last_non_zero - 1;
 	    last = (i == last_index);
 	    sign = 0;
-	    slevel = level;
+	    level = slevel;
 	    if (level < 0) {
 		sign = 1;
 		level = -level;
@@ -1605,6 +1768,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i
 	    last_non_zero = i;
 	}
     }
+#endif
 }
 
 
@@ -1746,6 +1910,8 @@ void h263_decode_init_vlc(MpegEncContext *s)
                  &mb_type_b_tab[0][1], 2, 1,
                  &mb_type_b_tab[0][0], 2, 1);
     }
+
+    s->progressive_sequence=1; // set to most likely for the case of incomplete headers
 }
 
 int h263_decode_gob_header(MpegEncContext *s)
@@ -1895,24 +2061,12 @@ static int decode_video_packet_header(MpegEncContext *s, GetBitContext *gb)
     if(header_extension){
         int time_increment;
         int time_incr=0;
-        printf("header extension not supported\n");
-        return -1;
 
         while (get_bits1(gb) != 0) 
             time_incr++;
 
         check_marker(gb, "before time_increment in video packed header");
         time_increment= get_bits(gb, s->time_increment_bits);
-        if(s->pict_type!=B_TYPE){
-            s->last_time_base= s->time_base;
-            s->time_base+= time_incr;
-            s->time= s->time_base*s->time_increment_resolution + time_increment;
-            s->pp_time= s->time - s->last_non_b_time;
-            s->last_non_b_time= s->time;
-        }else{
-            s->time= (s->last_time_base + time_incr)*s->time_increment_resolution + time_increment;
-            s->bp_time= s->last_non_b_time - s->time;
-        }
         check_marker(gb, "before vop_coding_type in video packed header");
         
         skip_bits(gb, 2); /* vop coding type */
@@ -1923,19 +2077,22 @@ static int decode_video_packet_header(MpegEncContext *s, GetBitContext *gb)
 
             if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE && s->num_sprite_warping_points){
                 mpeg4_decode_sprite_trajectory(s);
+                fprintf(stderr, "untested\n");
             }
 
             //FIXME reduced res stuff here
             
             if (s->pict_type != I_TYPE) {
-                s->f_code = get_bits(gb, 3);	/* fcode_for */
-                if(s->f_code==0){
-                    printf("Error, video packet header damaged or not MPEG4 header (f_code=0)\n");
-                    return -1; // makes no sense to continue, as the MV decoding will break very quickly
+                int f_code = get_bits(gb, 3);	/* fcode_for */
+                if(f_code==0){
+                    printf("Error, video packet header damaged (f_code=0)\n");
                 }
             }
             if (s->pict_type == B_TYPE) {
-                s->b_code = get_bits(gb, 3);
+                int b_code = get_bits(gb, 3);
+                if(b_code==0){
+                    printf("Error, video packet header damaged (b_code=0)\n");
+                }
             }       
         }
     }
@@ -2426,7 +2583,37 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s,
 
     return 0;
 }
+#if 0
+static inline void decode_interlaced_info(MpegEncContext *s, int cbp, int mb_type){ /* reads per-macroblock interlacing flags from s->gb; currently compiled out by the enclosing #if 0 */
+    s->mv_type= 0;            
+    if(!s->progressive_sequence){
+        if(cbp || s->mb_intra) /* the DCT-type bit is read only for intra macroblocks or when cbp is non-zero */
+            s->interlaced_dct= get_bits1(&s->gb);
+        /* the field-motion flag is read only for the non-intra cases listed below */
+        if(!s->mb_intra){
+            if(   s->pict_type==P_TYPE //FIXME check that 4MV is forbidden
+               || (s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE && !s->mcsel)
+               || (s->pict_type==B_TYPE && mb_type!=0) ){
+                /* NOTE(review): mb_type 0/2/3 presumably mean direct/backward/forward as in the encoder's switch - confirm */
+                if(get_bits1(&s->gb)){
+                    s->mv_type= MV_TYPE_FIELD;
 
+                    if(   s->pict_type==P_TYPE
+                       || (s->pict_type==B_TYPE && mb_type!=2)){
+                        s->field_select[0][0]= get_bits1(&s->gb);
+                        s->field_select[0][1]= get_bits1(&s->gb);
+                    }
+                    if(s->pict_type==B_TYPE && mb_type!=3){
+                        s->field_select[1][0]= get_bits1(&s->gb);
+                        s->field_select[1][1]= get_bits1(&s->gb);
+                    }
+                }else
+                    s->mv_type= 0;            
+            }
+        }   
+    }
+}
+#endif
 
 int h263_decode_mb(MpegEncContext *s,
                    DCTELEM block[6][64])
@@ -2507,55 +2694,75 @@ int h263_decode_mb(MpegEncContext *s,
                 s->qscale = 31;
             h263_dc_scale(s);
         }
+        if((!s->progressive_sequence) && (cbp || s->workaround_bugs==2))
+            s->interlaced_dct= get_bits1(&s->gb);
+        
         s->mv_dir = MV_DIR_FORWARD;
         if ((cbpc & 16) == 0) {
-            PRINT_MB_TYPE("P");
-            /* 16x16 motion prediction */
-            s->mv_type = MV_TYPE_16X16;
-            h263_pred_motion(s, 0, &pred_x, &pred_y);
-            if (s->umvplus_dec)
-               mx = h263p_decode_umotion(s, pred_x);
-            else if(!s->mcsel)
-               mx = h263_decode_motion(s, pred_x, s->f_code);
-            else {
-               const int a= s->sprite_warping_accuracy;
+            if(s->mcsel){
+                const int a= s->sprite_warping_accuracy;
+                PRINT_MB_TYPE("G");
+                /* 16x16 global motion prediction */
+                s->mv_type = MV_TYPE_16X16;
 //        int l = (1 << (s->f_code - 1)) * 32;
                 if(s->divx_version==500 && s->divx_build==413){
                     mx = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample));
+                    my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample));
                 }else{
                     mx = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
+                    my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
                 }
-//        if (mx < -l) mx= -l, printf("C");
-//        else if (mx >= l) mx= l-1, printf("C");
-            }
-            if (mx >= 0xffff)
-                return -1;
-            
-            if (s->umvplus_dec)
-               my = h263p_decode_umotion(s, pred_y);
-            else if(!s->mcsel)
-               my = h263_decode_motion(s, pred_y, s->f_code);
-            else{
-               const int a= s->sprite_warping_accuracy;
 //       int l = (1 << (s->f_code - 1)) * 32;
-                if(s->divx_version==500 && s->divx_build==413){
-                    my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample));
-                }else{
-                    my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
+                s->mv[0][0][0] = mx;
+                s->mv[0][0][1] = my;
+            }else if((!s->progressive_sequence) && get_bits1(&s->gb)){
+                PRINT_MB_TYPE("f");
+                /* 16x8 field motion prediction */
+                s->mv_type= MV_TYPE_FIELD;
+
+                s->field_select[0][0]= get_bits1(&s->gb);
+                s->field_select[0][1]= get_bits1(&s->gb);
+
+                h263_pred_motion(s, 0, &pred_x, &pred_y);
+                
+                for(i=0; i<2; i++){
+                    mx = h263_decode_motion(s, pred_x, s->f_code);
+                    if (mx >= 0xffff)
+                        return -1;
+            
+                    my = h263_decode_motion(s, pred_y/2, s->f_code);
+                    if (my >= 0xffff)
+                        return -1;
+
+                    s->mv[0][i][0] = mx;
+                    s->mv[0][i][1] = my;
                 }
-//       if (my < -l) my= -l, printf("C");
-//       else if (my >= l) my= l-1, printf("C");
+            }else{
+                PRINT_MB_TYPE("P");
+                /* 16x16 motion prediction */
+                s->mv_type = MV_TYPE_16X16;
+                h263_pred_motion(s, 0, &pred_x, &pred_y);
+                if (s->umvplus_dec)
+                   mx = h263p_decode_umotion(s, pred_x);
+                else
+                   mx = h263_decode_motion(s, pred_x, s->f_code);
+            
+                if (mx >= 0xffff)
+                    return -1;
+            
+                if (s->umvplus_dec)
+                   my = h263p_decode_umotion(s, pred_y);
+                else
+                   my = h263_decode_motion(s, pred_y, s->f_code);
+            
+                if (my >= 0xffff)
+                    return -1;
+                s->mv[0][0][0] = mx;
+                s->mv[0][0][1] = my;
+
+                if (s->umvplus_dec && (mx - pred_x) == 1 && (my - pred_y) == 1)
+                   skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */                   
             }
-            if (my >= 0xffff)
-                return -1;
-            s->mv[0][0][0] = mx;
-            s->mv[0][0][1] = my;
-            /*fprintf(stderr, "\n MB %d", (s->mb_y * s->mb_width) + s->mb_x);
-            fprintf(stderr, "\n\tmvx: %d\t\tpredx: %d", mx, pred_x);
-            fprintf(stderr, "\n\tmvy: %d\t\tpredy: %d", my, pred_y);*/
-            if (s->umvplus_dec && (mx - pred_x) == 1 && (my - pred_y) == 1)
-               skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
-                           
         } else {
             PRINT_MB_TYPE("4");
             s->mv_type = MV_TYPE_8X8;
@@ -2594,10 +2801,12 @@ int h263_decode_mb(MpegEncContext *s,
         s->mcsel=0;      //     ...               true gmc blocks
 
         if(s->mb_x==0){
-            s->last_mv[0][0][0]= 
-            s->last_mv[0][0][1]= 
-            s->last_mv[1][0][0]= 
-            s->last_mv[1][0][1]= 0;
+            for(i=0; i<2; i++){
+                s->last_mv[i][0][0]= 
+                s->last_mv[i][0][1]= 
+                s->last_mv[i][1][0]= 
+                s->last_mv[i][1][1]= 0;
+            }
         }
 
         /* if we skipped it in the future P Frame than skip it now too */
@@ -2614,20 +2823,23 @@ int h263_decode_mb(MpegEncContext *s,
             s->mv[0][0][1] = 0;
             s->mv[1][0][0] = 0;
             s->mv[1][0][1] = 0;
-//FIXME is this correct?
-/*            s->last_mv[0][0][0]=
-            s->last_mv[0][0][1]=0;*/
             PRINT_MB_TYPE("s");
             return 0;
         }
 
-        modb1= get_bits1(&s->gb);
-        if(modb1==0){
+        modb1= get_bits1(&s->gb); 
+        if(modb1){
+            mb_type=4; //like MB_TYPE_B_DIRECT but no vectors coded
+            cbp=0;
+        }else{
+            int field_mv;
+        
             modb2= get_bits1(&s->gb);
             mb_type= get_vlc2(&s->gb, mb_type_b_vlc.table, MB_TYPE_B_VLC_BITS, 1);
-            if(modb2==0) cbp= get_bits(&s->gb, 6);
-            else cbp=0;
-            if (mb_type && cbp) {
+            if(modb2) cbp= 0;
+            else      cbp= get_bits(&s->gb, 6);
+
+            if (mb_type!=MB_TYPE_B_DIRECT && cbp) {
                 if(get_bits1(&s->gb)){
                     s->qscale +=get_bits1(&s->gb)*4 - 2;
                     if (s->qscale < 1)
@@ -2637,81 +2849,141 @@ int h263_decode_mb(MpegEncContext *s,
                     h263_dc_scale(s);
                 }
             }
-        }else{
-            mb_type=4; //like 0 but no vectors coded
-            cbp=0;
+            field_mv=0;
+
+            if(!s->progressive_sequence){
+                if(cbp)
+                    s->interlaced_dct= get_bits1(&s->gb);
+
+                if(mb_type!=MB_TYPE_B_DIRECT && get_bits1(&s->gb)){
+                    field_mv=1;
+
+                    if(mb_type!=MB_TYPE_B_BACKW){
+                        s->field_select[0][0]= get_bits1(&s->gb);
+                        s->field_select[0][1]= get_bits1(&s->gb);
+                    }
+                    if(mb_type!=MB_TYPE_B_FORW){
+                        s->field_select[1][0]= get_bits1(&s->gb);
+                        s->field_select[1][1]= get_bits1(&s->gb);
+                    }
+                }
+            }
+
+            s->mv_dir = 0;
+            if(mb_type!=MB_TYPE_B_DIRECT && !field_mv){
+                s->mv_type= MV_TYPE_16X16;
+                if(mb_type!=MB_TYPE_B_BACKW){
+                    s->mv_dir = MV_DIR_FORWARD;
+
+                    mx = h263_decode_motion(s, s->last_mv[0][0][0], s->f_code);
+                    my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
+                    s->last_mv[0][1][0]= s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
+                    s->last_mv[0][1][1]= s->last_mv[0][0][1]= s->mv[0][0][1] = my;
+                }
+    
+                if(mb_type!=MB_TYPE_B_FORW){
+                    s->mv_dir |= MV_DIR_BACKWARD;
+
+                    mx = h263_decode_motion(s, s->last_mv[1][0][0], s->b_code);
+                    my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
+                    s->last_mv[1][1][0]= s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
+                    s->last_mv[1][1][1]= s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+                }
+                if(mb_type!=MB_TYPE_B_DIRECT)
+                    PRINT_MB_TYPE(mb_type==MB_TYPE_B_FORW ? "F" : (mb_type==MB_TYPE_B_BACKW ? "B" : "T"));
+            }else if(mb_type!=MB_TYPE_B_DIRECT){
+                s->mv_type= MV_TYPE_FIELD;
+
+                if(mb_type!=MB_TYPE_B_BACKW){
+                    s->mv_dir = MV_DIR_FORWARD;
+                
+                    for(i=0; i<2; i++){
+                        mx = h263_decode_motion(s, s->last_mv[0][i][0]  , s->f_code);
+                        my = h263_decode_motion(s, s->last_mv[0][i][1]/2, s->f_code);
+                        s->last_mv[0][i][0]=  s->mv[0][i][0] = mx;
+                        s->last_mv[0][i][1]= (s->mv[0][i][1] = my)*2;
+                    }
+                }
+    
+                if(mb_type!=MB_TYPE_B_FORW){
+                    s->mv_dir |= MV_DIR_BACKWARD;
+
+                    for(i=0; i<2; i++){
+                        mx = h263_decode_motion(s, s->last_mv[1][i][0]  , s->b_code);
+                        my = h263_decode_motion(s, s->last_mv[1][i][1]/2, s->b_code);
+                        s->last_mv[1][i][0]=  s->mv[1][i][0] = mx;
+                        s->last_mv[1][i][1]= (s->mv[1][i][1] = my)*2;
+                    }
+                }
+                if(mb_type!=MB_TYPE_B_DIRECT)
+                    PRINT_MB_TYPE(mb_type==MB_TYPE_B_FORW ? "f" : (mb_type==MB_TYPE_B_BACKW ? "b" : "t"));
+            }
         }
-        s->mv_type = MV_TYPE_16X16; // we'll switch to 8x8 only if the last P frame had 8x8 for this MB and mb_type=0 here
-        mx=my=0; //for case 4, we could put this to the mb_type=4 but than gcc compains about uninitalized mx/my
-        switch(mb_type)
-        {
-        case 0: /* direct */
-            mx = h263_decode_motion(s, 0, 1);
-            my = h263_decode_motion(s, 0, 1);
-        case 4: /* direct with mx=my=0 */
+          
+        if(mb_type==4 || mb_type==MB_TYPE_B_DIRECT){
+            int mb_index= s->mb_x + s->mb_y*s->mb_width;
+            int i;
+            
+            if(mb_type==4)
+                mx=my=0;
+            else{
+                mx = h263_decode_motion(s, 0, 1);
+                my = h263_decode_motion(s, 0, 1);
+            }
+ 
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
             xy= s->block_index[0];
             time_pp= s->pp_time;
-            time_pb= time_pp - s->bp_time;
-//if(time_pp>3000 )printf("%d %d  ", time_pp, time_pb);
+            time_pb= s->pb_time;
+            
             //FIXME avoid divides
-            s->mv[0][0][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
-            s->mv[0][0][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
-            s->mv[1][0][0] = mx ? s->mv[0][0][0] - s->motion_val[xy][0]
-                                : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp + mx;
-            s->mv[1][0][1] = my ? s->mv[0][0][1] - s->motion_val[xy][1] 
-                                : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp + my;
-            if(s->non_b_mv4_table[xy]){
-                int i;
+            switch(s->co_located_type_table[mb_index]){
+            case 0:
+                s->mv_type= MV_TYPE_16X16;
+                s->mv[0][0][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
+                s->mv[0][0][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
+                s->mv[1][0][0] = mx ? s->mv[0][0][0] - s->motion_val[xy][0]
+                                    : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp;
+                s->mv[1][0][1] = my ? s->mv[0][0][1] - s->motion_val[xy][1] 
+                                    : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp;
+                PRINT_MB_TYPE(mb_type==4 ? "D" : "S");
+                break;
+            case CO_LOCATED_TYPE_4MV:
                 s->mv_type = MV_TYPE_8X8;
-                for(i=1; i<4; i++){
+                for(i=0; i<4; i++){
                     xy= s->block_index[i];
                     s->mv[0][i][0] = s->motion_val[xy][0]*time_pb/time_pp + mx;
                     s->mv[0][i][1] = s->motion_val[xy][1]*time_pb/time_pp + my;
                     s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->motion_val[xy][0]
-                                        : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp + mx;
+                                        : s->motion_val[xy][0]*(time_pb - time_pp)/time_pp;
                     s->mv[1][i][1] = my ? s->mv[0][i][1] - s->motion_val[xy][1] 
-                                        : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp + my;
+                                        : s->motion_val[xy][1]*(time_pb - time_pp)/time_pp;
                 }
                 PRINT_MB_TYPE("4");
-            }else{
-                PRINT_MB_TYPE(mb_type==4 ? "D" : "S");
+                break;
+            case CO_LOCATED_TYPE_FIELDMV:
+                s->mv_type = MV_TYPE_FIELD;
+                for(i=0; i<2; i++){
+                    if(s->top_field_first){
+                        time_pp= s->pp_field_time - s->field_select_table[mb_index][i] + i;
+                        time_pb= s->pb_field_time - s->field_select_table[mb_index][i] + i;
+                    }else{
+                        time_pp= s->pp_field_time + s->field_select_table[mb_index][i] - i;
+                        time_pb= s->pb_field_time + s->field_select_table[mb_index][i] - i;
+                    }
+                    s->mv[0][i][0] = s->field_mv_table[mb_index][i][0]*time_pb/time_pp + mx;
+                    s->mv[0][i][1] = s->field_mv_table[mb_index][i][1]*time_pb/time_pp + my;
+                    s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->field_mv_table[mb_index][i][0]
+                                        : s->field_mv_table[mb_index][i][0]*(time_pb - time_pp)/time_pp;
+                    s->mv[1][i][1] = my ? s->mv[0][i][1] - s->field_mv_table[mb_index][i][1] 
+                                        : s->field_mv_table[mb_index][i][1]*(time_pb - time_pp)/time_pp;
+                }
+                PRINT_MB_TYPE("=");
+                break;
             }
-/*            s->mv[0][0][0] = 
-            s->mv[0][0][1] = 
-            s->mv[1][0][0] = 
-            s->mv[1][0][1] = 1000;*/
-            break;
-        case 1: 
-            s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
-            mx = h263_decode_motion(s, s->last_mv[0][0][0], s->f_code);
-            my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
-            s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
-            s->last_mv[0][0][1]= s->mv[0][0][1] = my;
-
-            mx = h263_decode_motion(s, s->last_mv[1][0][0], s->b_code);
-            my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
-            s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
-            s->last_mv[1][0][1]= s->mv[1][0][1] = my;
-            PRINT_MB_TYPE("i");
-            break;
-        case 2: 
-            s->mv_dir = MV_DIR_BACKWARD;
-            mx = h263_decode_motion(s, s->last_mv[1][0][0], s->b_code);
-            my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
-            s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
-            s->last_mv[1][0][1]= s->mv[1][0][1] = my;
-            PRINT_MB_TYPE("B");
-            break;
-        case 3:
-            s->mv_dir = MV_DIR_FORWARD;
-            mx = h263_decode_motion(s, s->last_mv[0][0][0], s->f_code);
-            my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
-            s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
-            s->last_mv[0][0][1]= s->mv[0][0][1] = my;
-            PRINT_MB_TYPE("F");
-            break;
-        default: 
+        }
+        
+        if(mb_type<0 || mb_type>4){
             printf("illegal MB_type\n");
             return -1;
         }
@@ -2741,6 +3013,8 @@ intra:
                 s->qscale = 31;
             h263_dc_scale(s);
         }
+        if(!s->progressive_sequence)
+            s->interlaced_dct= get_bits1(&s->gb);
 
         /* decode each block */
         if (s->h263_pred) {
@@ -2774,7 +3048,7 @@ intra:
 
 static int h263_decode_motion(MpegEncContext * s, int pred, int f_code)
 {
-    int code, val, sign, shift, l, m;
+    int code, val, sign, shift, l;
 
     code = get_vlc2(&s->gb, mv_vlc.table, MV_VLC_BITS, 2);
     if (code < 0)
@@ -2795,11 +3069,10 @@ static int h263_decode_motion(MpegEncContext * s, int pred, int f_code)
     /* modulo decoding */
     if (!s->h263_long_vectors) {
         l = (1 << (f_code - 1)) * 32;
-        m = 2 * l;
         if (val < -l) {
-            val += m;
+            val += l<<1;
         } else if (val >= l) {
-            val -= m;
+            val -= l<<1;
         }
     } else {
         /* horrible h263 long vector mode */
@@ -2900,7 +3173,7 @@ static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
             if (s->h263_rv10 && level == -128) {
                 /* XXX: should patch encoder too */
                 level = get_bits(&s->gb, 12);
-                level = (level << 20) >> 20;
+		level= (level + ((-1)<<11)) ^ ((-1)<<11); //sign extension
             }
         } else {
             run = rl->table_run[code];
@@ -3000,7 +3273,9 @@ static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
             goto not_coded;
         rl = &rl_intra;
         rl_vlc = rl_intra.rl_vlc[0];
-        if (s->ac_pred) {
+        if(s->alternate_scan)
+            scan_table = ff_alternate_vertical_scan; /* left */
+        else if (s->ac_pred) {
             if (dc_pred_dir == 0) 
                 scan_table = ff_alternate_vertical_scan; /* left */
             else
@@ -3017,7 +3292,12 @@ static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
             return 0;
         }
         rl = &rl_inter;
-        scan_table = zigzag_direct;
+   
+        if(s->alternate_scan)
+            scan_table = ff_alternate_vertical_scan; /* left */
+        else
+            scan_table = zigzag_direct;
+
         if(s->mpeg_quant){
             qmul=1;
             qadd=0;
@@ -3282,10 +3562,10 @@ int h263_decode_picture_header(MpegEncContext *s)
                 skip_bits1(&s->gb);
                 height = get_bits(&s->gb, 9) * 4;
                 dprintf("\nH.263+ Custom picture: %dx%d\n",width,height);
-                if (s->aspect_ratio_info == EXTENDED_PAR) {
+                if (s->aspect_ratio_info == FF_ASPECT_EXTENDED) {
                     /* aspected dimensions */
-                    skip_bits(&s->gb, 8); /* width */
-                    skip_bits(&s->gb, 8); /* height */
+		    s->aspected_width = get_bits(&s->gb, 8);
+		    s->aspected_height = get_bits(&s->gb, 8);
                 }
             } else {
                 width = h263_format[format][0];
@@ -3552,9 +3832,9 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         }
 //printf("vo type:%d\n",s->vo_type);
         s->aspect_ratio_info= get_bits(&s->gb, 4);
-	if(s->aspect_ratio_info == EXTENDED_PAR){
-            skip_bits(&s->gb, 8); //par_width
-            skip_bits(&s->gb, 8); // par_height
+	if(s->aspect_ratio_info == FF_ASPECT_EXTENDED){	    
+	    s->aspected_width = get_bits(&s->gb, 8); // par_width
+	    s->aspected_height = get_bits(&s->gb, 8); // par_height
         }
 
         if ((s->vol_control_parameters=get_bits1(&s->gb))) { /* vol control parameter */
@@ -3564,8 +3844,17 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             }
             s->low_delay= get_bits1(&s->gb);
             if(get_bits1(&s->gb)){ /* vbv parameters */
-                printf("vbv parameters not supported\n");
-                return -1;
+                get_bits(&s->gb, 15);	/* first_half_bitrate */
+                skip_bits1(&s->gb);	/* marker */
+                get_bits(&s->gb, 15);	/* latter_half_bitrate */
+                skip_bits1(&s->gb);	/* marker */
+                get_bits(&s->gb, 15);	/* first_half_vbv_buffer_size */
+                skip_bits1(&s->gb);	/* marker */
+                get_bits(&s->gb, 3);	/* latter_half_vbv_buffer_size */
+                get_bits(&s->gb, 11);	/* first_half_vbv_occupancy */
+                skip_bits1(&s->gb);	/* marker */
+                get_bits(&s->gb, 15);	/* latter_half_vbv_occupancy */
+                skip_bits1(&s->gb);	/* marker */               
             }
         }else{
             // set low delay flag only once so the smart? low delay detection wont be overriden
@@ -3583,6 +3872,7 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         skip_bits1(&s->gb);   /* marker */
         
         s->time_increment_resolution = get_bits(&s->gb, 16);
+        
         s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
         if (s->time_increment_bits < 1)
             s->time_increment_bits = 1;
@@ -3606,7 +3896,7 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
                 }
             }
             
-            if(get_bits1(&s->gb)) printf("interlaced not supported\n");   /* interlaced */
+            s->progressive_sequence= get_bits1(&s->gb)^1;
             if(!get_bits1(&s->gb)) printf("OBMC not supported (very likely buggy encoder)\n");   /* OBMC Disable */
             if (vo_ver_id == 1) {
                 s->vol_sprite_usage = get_bits1(&s->gb); /* vol_sprite_usage */
@@ -3763,11 +4053,6 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
                 printf("This file was encoded with DivX%d Build%d\n", ver, build);
                 if(ver==500 && build==413){
                     printf("WARNING: this version of DivX is not MPEG4 compatible, trying to workaround these bugs...\n");
-#if 0
-                }else{
-                    printf("hmm, i havnt seen that version of divx yet, lets assume they fixed these bugs ...\n"
-                           "using mpeg4 decoder, if it fails contact the developers (of ffmpeg)\n");
-#endif
                 }
             }
         }
@@ -3783,7 +4068,12 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         printf("low_delay flag set, but shouldnt, clearing it\n");
         s->low_delay=0;
     }
-// printf("pic: %d, qpel:%d\n", s->pict_type, s->quarter_sample); 
+// printf("pic: %d, qpel:%d part:%d resync:%d\n", s->pict_type, s->quarter_sample, s->data_partitioning, s->resync_marker); 
+    
+    if(s->time_increment_resolution==0){
+        s->time_increment_resolution=1;
+//        fprintf(stderr, "time_increment_resolution is illegal\n");
+    }
     time_incr=0;
     while (get_bits1(&s->gb) != 0) 
         time_incr++;
@@ -3795,17 +4085,32 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         s->last_time_base= s->time_base;
         s->time_base+= time_incr;
         s->time= s->time_base*s->time_increment_resolution + time_increment;
+        if(s->time < s->last_non_b_time && s->workaround_bugs==3){
+            fprintf(stderr, "header is not mpeg4 compatible, broken encoder, trying to workaround\n");
+            s->time_base++;
+            s->time+= s->time_increment_resolution;
+        }
         s->pp_time= s->time - s->last_non_b_time;
         s->last_non_b_time= s->time;
     }else{
         s->time= (s->last_time_base + time_incr)*s->time_increment_resolution + time_increment;
-        s->bp_time= s->last_non_b_time - s->time;
-        if(s->pp_time <=s->bp_time){
+        s->pb_time= s->pp_time - (s->last_non_b_time - s->time);
+        if(s->pp_time <=s->pb_time || s->pp_time <= s->pp_time - s->pb_time || s->pp_time<=0){
 //            printf("messed up order, seeking?, skiping current b frame\n");
             return FRAME_SKIPED;
         }
+        
+        if(s->t_frame==0) s->t_frame= s->time - s->last_time_base;
+        if(s->t_frame==0) s->t_frame=1; // 1/0 protection
+//printf("%Ld %Ld %d %d\n", s->last_non_b_time, s->time, s->pp_time, s->t_frame); fflush(stdout);
+        s->pp_field_time= (  ROUNDED_DIV(s->last_non_b_time, s->t_frame) 
+                           - ROUNDED_DIV(s->last_non_b_time - s->pp_time, s->t_frame))*2;
+        s->pb_field_time= (  ROUNDED_DIV(s->time, s->t_frame) 
+                           - ROUNDED_DIV(s->last_non_b_time - s->pp_time, s->t_frame))*2;
     }
-
+    
+    s->avctx->pts= s->time*1000LL*1000LL / s->time_increment_resolution;
+    
     if(check_marker(&s->gb, "before vop_coded")==0 && s->picture_number==0){
         printf("hmm, seems the headers arnt complete, trying to guess time_increment_bits\n");
         for(s->time_increment_bits++ ;s->time_increment_bits<16; s->time_increment_bits++){
@@ -3816,8 +4121,8 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
     /* vop coded */
     if (get_bits1(&s->gb) != 1)
         goto redo;
-//printf("time %d %d %d || %d %d %d\n", s->time_increment_bits, s->time_increment, s->time_base,
-//s->time, s->last_non_b_time[0], s->last_non_b_time[1]);  
+//printf("time %d %d %d || %Ld %Ld %Ld\n", s->time_increment_bits, s->time_increment_resolution, s->time_base,
+//s->time, s->last_non_b_time, s->last_non_b_time - s->pp_time);  
     if (s->shape != BIN_ONLY_SHAPE && ( s->pict_type == P_TYPE
                           || (s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE))) {
         /* rounding type for motion estimation */
@@ -3851,7 +4156,11 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
          int t;
          t=get_bits(&s->gb, 3); /* intra dc VLC threshold */
 //printf("threshold %d\n", t);
-         //FIXME interlaced specific bits
+         if(!s->progressive_sequence){
+             s->top_field_first= get_bits1(&s->gb);
+             s->alternate_scan= get_bits1(&s->gb);
+         }else
+             s->alternate_scan= 0;
      }
 
      if(s->pict_type == S_TYPE && (s->vol_sprite_usage==STATIC_SPRITE || s->vol_sprite_usage==GMC_SPRITE)){
@@ -3883,11 +4192,15 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
      
          if (s->pict_type == B_TYPE) {
              s->b_code = get_bits(&s->gb, 3);
-//printf("b-code %d\n", s->b_code);
          }else
              s->b_code=1;
-
-//printf("quant:%d fcode:%d bcode:%d type:%d\n", s->qscale, s->f_code, s->b_code, s->pict_type);
+#if 0
+printf("qp:%d fc:%d bc:%d type:%s size:%d pro:%d alt:%d top:%d qpel:%d part:%d resync:%d\n", 
+    s->qscale, s->f_code, s->b_code, 
+    s->pict_type == I_TYPE ? "I" : (s->pict_type == P_TYPE ? "P" : (s->pict_type == B_TYPE ? "B" : "S")), 
+    s->gb.size,s->progressive_sequence, s->alternate_scan, s->top_field_first, 
+    s->quarter_sample, s->data_partitioning, s->resync_marker); 
+#endif
          if(!s->scalability){
              if (s->shape!=RECT_SHAPE && s->pict_type!=I_TYPE) {
                  skip_bits1(&s->gb); // vop shape coding type
@@ -3910,7 +4223,6 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
      }
 
      s->picture_number++; // better than pic number==0 allways ;)
-//printf("done\n");
 
      s->y_dc_scale_table= ff_mpeg4_y_dc_scale_table; //FIXME add short header support 
      s->c_dc_scale_table= ff_mpeg4_c_dc_scale_table;
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 0d83a5633..ffecbc932 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -110,6 +110,23 @@ static int h263_decode_end(AVCodecContext *avctx)
     return 0;
 }
 
+/**
+ * returns the number of bytes consumed for building the current frame (read position rounded up to whole bytes, or buf_size in the cases below)
+ */
+static int get_consumed_bytes(MpegEncContext *s, int buf_size){
+    int pos= (get_bits_count(&s->gb)+7)>>3; //value of get_bits_count() rounded up to a multiple of 8, divided by 8
+
+    if(s->divx_version>=500){
+        //we would have to scan through the whole buf to handle the weird reordering, so the whole buffer is reported as consumed
+        return buf_size; 
+    }else{
+        if(pos==0) pos=1; //bump 0 to 1 to avoid infinite loops (i doubt thats needed but ...)
+        if(pos+10>buf_size) pos=buf_size; //fewer than 10 bytes would remain after pos (or pos is past the end): report the whole buffer
+
+        return pos;
+    }
+}
+
 static int h263_decode_frame(AVCodecContext *avctx, 
                              void *data, int *data_size,
                              UINT8 *buf, int buf_size)
@@ -130,9 +147,10 @@ uint64_t time= rdtsc();
     s->workaround_bugs= avctx->workaround_bugs;
     s->flags= avctx->flags;
 
-    /* no supplementary picture */
+    *data_size = 0;
+   
+   /* no supplementary picture */
     if (buf_size == 0) {
-        *data_size = 0;
         return 0;
     }
 
@@ -175,24 +193,29 @@ uint64_t time= rdtsc();
         avctx->width = s->width;
         avctx->height = s->height;
         avctx->aspect_ratio_info= s->aspect_ratio_info;
+	if (s->aspect_ratio_info == FF_ASPECT_EXTENDED)
+	{
+	    avctx->aspected_width = s->aspected_width;
+	    avctx->aspected_height = s->aspected_height;
+	}
         if (MPV_common_init(s) < 0)
             return -1;
     }
 
-    if(ret==FRAME_SKIPED) return buf_size;
+    if(ret==FRAME_SKIPED) return get_consumed_bytes(s, buf_size);
     /* skip if the header was thrashed */
     if (ret < 0){
         fprintf(stderr, "header damaged\n");
         return -1;
     }
     /* skip b frames if we dont have reference frames */
-    if(s->num_available_buffers<2 && s->pict_type==B_TYPE) return buf_size;
+    if(s->num_available_buffers<2 && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
     /* skip b frames if we are in a hurry */
-    if(s->hurry_up && s->pict_type==B_TYPE) return buf_size;
+    if(s->hurry_up && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
     
     if(s->next_p_frame_damaged){
         if(s->pict_type==B_TYPE)
-            return buf_size;
+            return get_consumed_bytes(s, buf_size);
         else
             s->next_p_frame_damaged=0;
     }
@@ -354,14 +377,14 @@ uint64_t time= rdtsc();
         if(msmpeg4_decode_ext_header(s, buf_size) < 0) return -1;
     
     /* divx 5.01+ bistream reorder stuff */
-    if(s->codec_id==CODEC_ID_MPEG4 && s->bitstream_buffer_size==0){
+    if(s->codec_id==CODEC_ID_MPEG4 && s->bitstream_buffer_size==0 && s->divx_version>=500){
         int current_pos= get_bits_count(&s->gb)>>3;
 
         if(   buf_size - current_pos > 5 
            && buf_size - current_pos < BITSTREAM_BUFFER_SIZE){
             int i;
             int startcode_found=0;
-            for(i=current_pos; i<buf_size; i++){
+            for(i=current_pos; i<buf_size-3; i++){
                 if(buf[i]==0 && buf[i+1]==0 && buf[i+2]==1 && buf[i+3]==0xB6){
                     startcode_found=1;
                     break;
@@ -454,7 +477,7 @@ uint64_t time= rdtsc();
 #ifdef PRINT_FRAME_TIME
 printf("%Ld\n", rdtsc()-time);
 #endif
-    return buf_size;
+    return get_consumed_bytes(s, buf_size);
 }
 
 AVCodec mpeg4_decoder = {
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 02558604b..4336e4bde 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -343,7 +343,7 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
     } while (--i);
 }
 
-static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	 "lea (%3, %3), %%eax		\n\t"
@@ -369,6 +369,40 @@ static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int
 	);
 }
 
+static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{ /* copies rows of 16 bytes (two movq per row) from pixels to block, rows being line_size bytes apart; 4 rows per loop iteration */
+    __asm __volatile(
+	 "lea (%3, %3), %%eax		\n\t" /* eax = 2*line_size, the step applied after every pair of rows */
+	 ".balign 8			\n\t"
+	 "1:				\n\t"
+	 "movq (%1), %%mm0		\n\t"
+	 "movq 8(%1), %%mm4		\n\t"
+	 "movq (%1, %3), %%mm1		\n\t"
+	 "movq 8(%1, %3), %%mm5		\n\t"
+     	 "movq %%mm0, (%2)		\n\t"
+     	 "movq %%mm4, 8(%2)		\n\t"
+	 "movq %%mm1, (%2, %3)		\n\t"
+	 "movq %%mm5, 8(%2, %3)		\n\t"
+	 "addl %%eax, %1		\n\t"
+         "addl %%eax, %2       		\n\t"
+	 "movq (%1), %%mm0		\n\t"
+	 "movq 8(%1), %%mm4		\n\t"
+	 "movq (%1, %3), %%mm1		\n\t"
+	 "movq 8(%1, %3), %%mm5		\n\t"
+	 "movq %%mm0, (%2)		\n\t"
+	 "movq %%mm4, 8(%2)		\n\t"
+	 "movq %%mm1, (%2, %3)		\n\t"
+	 "movq %%mm5, 8(%2, %3)		\n\t"
+	 "addl %%eax, %1		\n\t"
+	 "addl %%eax, %2       		\n\t"
+	 "subl $4, %0			\n\t" /* h is only compared against 0 after subtracting 4, so the loop ends only when h reaches exactly 0 */
+	 "jnz 1b			\n\t"
+	 : "+g"(h), "+r" (pixels),  "+r" (block)
+	 : "r"(line_size)
+	 : "%eax", "memory"
+	);
+}
+
 static void clear_blocks_mmx(DCTELEM *blocks)
 {
     __asm __volatile(
@@ -393,19 +427,19 @@ static void just_return() { return; }
 void dsputil_init_mmx(void)
 {
     mm_flags = mm_support();
-#if 1
-    printf("libavcodec: CPU flags:");
+#if 0
+    fprintf(stderr, "libavcodec: CPU flags:");
     if (mm_flags & MM_MMX)
-        printf(" mmx");
+        fprintf(stderr, " mmx");
     if (mm_flags & MM_MMXEXT)
-        printf(" mmxext");
+        fprintf(stderr, " mmxext");
     if (mm_flags & MM_3DNOW)
-        printf(" 3dnow");
+        fprintf(stderr, " 3dnow");
     if (mm_flags & MM_SSE)
-        printf(" sse");
+        fprintf(stderr, " sse");
     if (mm_flags & MM_SSE2)
-        printf(" sse2");
-    printf("\n");
+        fprintf(stderr, " sse2");
+    fprintf(stderr, "\n");
 #endif
 
     if (mm_flags & MM_MMX) {
@@ -424,27 +458,45 @@ void dsputil_init_mmx(void)
         pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
         pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
 
-        av_fdct = fdct_mmx;
-
-        put_pixels_tab[0] = put_pixels_mmx;
-        put_pixels_tab[1] = put_pixels_x2_mmx;
-        put_pixels_tab[2] = put_pixels_y2_mmx;
-        put_pixels_tab[3] = put_pixels_xy2_mmx;
-
-        put_no_rnd_pixels_tab[0] = put_pixels_mmx;
-        put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-        put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-        put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
-
-        avg_pixels_tab[0] = avg_pixels_mmx;
-        avg_pixels_tab[1] = avg_pixels_x2_mmx;
-        avg_pixels_tab[2] = avg_pixels_y2_mmx;
-        avg_pixels_tab[3] = avg_pixels_xy2_mmx;
-
-        avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
-        avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
-        avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
-        avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
+        put_pixels_tab[0][0] = put_pixels16_mmx;
+        put_pixels_tab[0][1] = put_pixels16_x2_mmx;
+        put_pixels_tab[0][2] = put_pixels16_y2_mmx;
+        put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
+
+        put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
+        put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
+        put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
+        put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
+
+        avg_pixels_tab[0][0] = avg_pixels16_mmx;
+        avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
+        avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
+        avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
+
+        avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
+        avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
+        avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
+        avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
+        
+        put_pixels_tab[1][0] = put_pixels8_mmx;
+        put_pixels_tab[1][1] = put_pixels8_x2_mmx;
+        put_pixels_tab[1][2] = put_pixels8_y2_mmx;
+        put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
+
+        put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
+        put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
+        put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
+        put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
+
+        avg_pixels_tab[1][0] = avg_pixels8_mmx;
+        avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
+        avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+        avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
+
+        avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
+        avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
+        avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
+        avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
 
         if (mm_flags & MM_MMXEXT) {
             pix_abs16x16    = pix_abs16x16_mmx2;
@@ -457,25 +509,45 @@ void dsputil_init_mmx(void)
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
 
-            put_pixels_tab[1] = put_pixels_x2_mmx2;
-            put_pixels_tab[2] = put_pixels_y2_mmx2;
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
-
-            avg_pixels_tab[0] = avg_pixels_mmx2;
-            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
-            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
-            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
+            put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
+            put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
+            put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
+            put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
+
+            avg_pixels_tab[0][0] = avg_pixels16_mmx2;
+            avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
+            avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
+            avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
+
+            put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
+            put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
+            put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
+            put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
+
+            avg_pixels_tab[1][0] = avg_pixels8_mmx2;
+            avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
+            avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
+            avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
         } else if (mm_flags & MM_3DNOW) {
-            put_pixels_tab[1] = put_pixels_x2_3dnow;
-            put_pixels_tab[2] = put_pixels_y2_3dnow;
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
-
-            avg_pixels_tab[0] = avg_pixels_3dnow;
-            avg_pixels_tab[1] = avg_pixels_x2_3dnow;
-            avg_pixels_tab[2] = avg_pixels_y2_3dnow;
-            avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
+            put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
+            put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+            put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
+            put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
+
+            avg_pixels_tab[0][0] = avg_pixels16_3dnow;
+            avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
+            avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+            avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
+            
+            put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
+            put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
+            put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
+            put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+
+            avg_pixels_tab[1][0] = avg_pixels8_3dnow;
+            avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
+            avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
+            avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
         }
 
         /* idct */
@@ -526,27 +598,54 @@ void dsputil_init_mmx(void)
 #endif
 }
 
+void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
+
+/**
+ * sends coefficient matrices for which the 16383 type MMX IDCT and the C IDCT would give different results to the C IDCT (simple_idct_put), everything else goes to gen_idct_put
+ */ 
+void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){
+    if(   block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0
+       && block[8]==0   && block[9]==0 && block[12]==0 && block[13]==0){
+        int16_t tmp[64]; /* copy of the coefficients so that block can be reordered in place */
+        int i;
+
+        for(i=0; i<64; i++)
+            tmp[i]= block[i];
+        for(i=0; i<64; i++)
+            block[i]= tmp[block_permute_op(i)]; /* NOTE(review): presumably converts the permuted coefficient order into the one simple_idct_put expects - confirm */
+        
+        simple_idct_put(dest, line_size, block);
+    }
+    else
+        gen_idct_put(dest, line_size, block);
+}
+
 /* remove any non bit exact operation (testing purpose). NOTE that
    this function should be kept as small as possible because it is
    always difficult to test automatically non bit exact cases. */
 void dsputil_set_bit_exact_mmx(void)
 {
     if (mm_flags & MM_MMX) {
-        if (mm_flags & MM_MMXEXT) {
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
+    
+        /* restore the plain MMX versions of the entries that the MMX2 & 3DNOW init replaces */
+        put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
+        put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
+        avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
+        put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
+        put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
+        avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
 
+        if (mm_flags & MM_MMXEXT) {
             pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
             pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
             pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
             pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
-        } else if (mm_flags & MM_3DNOW) {
-            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
         }
+#ifdef SIMPLE_IDCT
+        if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)
+            ff_idct_put= bit_exact_idct_put;
+#endif
     }
 }
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
index a16ccc88b..6873432ce 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
@@ -25,7 +25,7 @@
 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
    clobber bug - now it will work with 2.95.2 and also with -fPIC
  */
-static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -52,9 +52,49 @@ static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size,
 	:"r" (line_size)
 	:"%eax", "memory");
 }
+
+static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{ /* 16 pixel wide x2 put: every 8 byte group of a row is combined with the group one byte to its right (offsets 0/1 and 8/9) through PAVGB and stored to block; 4 rows per loop iteration. NOTE(review): PAVGB is a macro defined elsewhere, presumably a packed byte average - confirm */
+    __asm __volatile(
+	"lea (%3, %3), %%eax		\n\t" /* eax = 2*line_size, the step applied after every pair of rows */
+	"1:				\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq 8(%1), %%mm2		\n\t"
+	"movq 8(%1, %3), %%mm3		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 9(%1), %%mm2		\n\t"
+	PAVGB" 9(%1, %3), %%mm3		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"movq %%mm2, 8(%2)		\n\t"
+	"movq %%mm3, 8(%2, %3)		\n\t"
+	"addl %%eax, %1			\n\t"
+	"addl %%eax, %2			\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq 8(%1), %%mm2		\n\t"
+	"movq 8(%1, %3), %%mm3		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 9(%1), %%mm2		\n\t"
+	PAVGB" 9(%1, %3), %%mm3		\n\t"
+	"addl %%eax, %1			\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"movq %%mm2, 8(%2)		\n\t"
+	"movq %%mm3, 8(%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t" /* h is only compared against 0 after subtracting 4, so the loop ends only when h reaches exactly 0 */
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
+}
  
 /* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -91,7 +131,7 @@ static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int lin
 	:"%eax", "memory");
 }
 
-static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -122,7 +162,7 @@ static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size,
 }
 
 /* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -155,7 +195,7 @@ static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int lin
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -183,7 +223,7 @@ static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, in
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -215,7 +255,7 @@ static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size,
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -254,7 +294,7 @@ static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size,
 }
 
 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter 
-static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -294,3 +334,34 @@ static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size
 	:"r" (line_size)
 	:"%eax",  "memory");
 }
+
+//FIXME the following could be optimized too ... (each 16 pixel wide function below just runs its 8 pixel wide counterpart twice: once at block/pixels and once at block+8/pixels+8)
+static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
+    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
+    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
index dc70c9c8e..3605e03f9 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
@@ -22,7 +22,7 @@
  */
 
 // put_pixels
-static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -54,7 +54,53 @@ static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size
 	:"eax", "memory");
 }
 
-static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{ // 16-byte-wide variant: per line, combines pixels[0..15] with pixels[1..16] via PAVGBP and stores 16 bytes to block
+    MOVQ_BFE(mm6); // presumably loads the constant PAVGBP needs into mm6 -- macro defined elsewhere, confirm
+    __asm __volatile(
+	"lea	(%3, %3), %%eax		\n\t" // eax = 2 * line_size
+	".balign 8			\n\t"
+	"1:				\n\t" // loop head: one pass handles 4 lines (h drops by 4 below)
+	"movq	(%1), %%mm0		\n\t" // line 0, pixels[0..7]
+	"movq	1(%1), %%mm1		\n\t" // line 0, pixels[1..8]
+	"movq	(%1, %3), %%mm2		\n\t" // line 1, pixels[0..7]
+	"movq	1(%1, %3), %%mm3	\n\t" // line 1, pixels[1..8]
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5) // presumably mm4 = avg(mm0,mm1), mm5 = avg(mm2,mm3) -- confirm against the macro
+	"movq	%%mm4, (%2)		\n\t" // store line 0, block[0..7]
+	"movq	%%mm5, (%2, %3)		\n\t" // store line 1, block[0..7]
+	"movq	8(%1), %%mm0		\n\t" // same again for bytes 8..15 of lines 0 and 1
+	"movq	9(%1), %%mm1		\n\t"
+	"movq	8(%1, %3), %%mm2	\n\t"
+	"movq	9(%1, %3), %%mm3	\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, 8(%2)		\n\t"
+	"movq	%%mm5, 8(%2, %3)	\n\t"
+	"addl	%%eax, %1		\n\t" // pixels += 2 lines
+	"addl	%%eax, %2		\n\t" // block  += 2 lines
+	"movq	(%1), %%mm0		\n\t" // lines 2 and 3: identical sequence to the one above
+	"movq	1(%1), %%mm1		\n\t"
+	"movq	(%1, %3), %%mm2		\n\t"
+	"movq	1(%1, %3), %%mm3	\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%2)		\n\t"
+	"movq	%%mm5, (%2, %3)		\n\t"
+	"movq	8(%1), %%mm0		\n\t"
+	"movq	9(%1), %%mm1		\n\t"
+	"movq	8(%1, %3), %%mm2	\n\t"
+	"movq	9(%1, %3), %%mm3	\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, 8(%2)		\n\t"
+	"movq	%%mm5, 8(%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t" // pixels += 2 lines
+	"addl	%%eax, %2		\n\t" // block  += 2 lines
+	"subl	$4, %0			\n\t" // h -= 4; the loop only ends when h hits exactly 0,
+	"jnz	1b			\n\t" // so h must be a positive multiple of 4
+	:"+g"(h), "+S"(pixels), "+D"(block) // %0 = h, %1 = pixels (esi), %2 = block (edi); all three are modified
+	:"r"(line_size) // %3 = line_size
+	:"eax", "memory"); // eax is scratch; "memory" because block is written through a pointer
+}
+
+static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -83,7 +129,7 @@ static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size
 	:"eax", "memory");
 }
 
-static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_ZERO(mm7);
     SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
@@ -151,7 +197,7 @@ static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz
 
 // avg_pixels
 // in case more speed is needed - unroling would certainly help
-static void DEF(avg, pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     JUMPALIGN();
@@ -170,7 +216,50 @@ static void DEF(avg, pixels)(UINT8 *block, const UINT8 *pixels, int line_size, i
     while (--h);
 }
 
-static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{ // per line: block[0..15] = PAVGB(block[0..15], pixels[0..15]); runs h times, h must be >= 1 (do/while)
+    MOVQ_BFE(mm6); // presumably loads the constant PAVGB expects in mm6 -- macro defined elsewhere, confirm
+    JUMPALIGN();
+    do {
+	__asm __volatile(
+	     "movq  (%0), %%mm0		\n\t" // block[0..7]
+	     "movq  (%1), %%mm1		\n\t" // pixels[0..7]
+	     PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	     "movq  %%mm2, (%0)		\n\t"
+	     "movq  8(%0), %%mm0	\n\t" // block[8..15]; explicit 8(reg) form instead of pasting "8" in front
+	     "movq  8(%1), %%mm1	\n\t" // of an "m" operand, which breaks if gcc picks e.g. 16(%esi) -> 816(%esi)
+	     PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	     "movq  %%mm2, 8(%0)	\n\t"
+	     : // no outputs: the pointers are only read, the stores are covered by "memory"
+	     :"r"(block), "r"(pixels)
+	     :"memory");
+	pixels += line_size;
+	block += line_size;
+    }
+    while (--h);
+}
+
+static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{ // per line: block[0..7] = PAVGB(block[0..7], PAVGB(pixels[0..7], pixels[1..8])); h must be >= 1 (do/while)
+    MOVQ_BFE(mm6); // presumably loads the constant PAVGB expects in mm6 -- macro defined elsewhere, confirm
+    JUMPALIGN();
+    do {
+	__asm __volatile(
+	    "movq  (%1), %%mm0		\n\t" // pixels[0..7]
+	    "movq  1(%1), %%mm1		\n\t" // pixels[1..8]; explicit 1(reg) form, not "1" pasted before an "m" operand
+	    "movq  (%0), %%mm3		\n\t" // current block[0..7]
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+	    "movq  %%mm0, (%0)		\n\t"
+	    : // no outputs: the pointers are only read, the store is covered by "memory"
+	    :"r"(block), "r"(pixels)
+	    :"memory");
+	pixels += line_size;
+	block += line_size;
+    } while (--h);
+}
+
+static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     JUMPALIGN();
@@ -182,6 +271,12 @@ static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size
 	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
 	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
 	    "movq  %%mm0, %0		\n\t"
+	    "movq  8%1, %%mm0		\n\t"
+	    "movq  9%1, %%mm1		\n\t"
+	    "movq  8%0, %%mm3		\n\t"
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+	    "movq  %%mm0, 8%0		\n\t"
 	    :"+m"(*block)
 	    :"m"(*pixels)
 	    :"memory");
@@ -190,7 +285,7 @@ static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size
     } while (--h);
 }
 
-static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -214,10 +309,10 @@ static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size
 	"movq	(%1, %%eax), %%mm0	\n\t"
 	PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
 	"movq	(%2), %%mm3		\n\t"
-	PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
+	PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
 	"movq	(%2, %3), %%mm3		\n\t"
 	PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
-	"movq	%%mm0, (%2)		\n\t"
+	"movq	%%mm2, (%2)		\n\t"
 	"movq	%%mm1, (%2, %3)		\n\t"
 	"addl	%%eax, %1		\n\t"
 	"addl	%%eax, %2		\n\t"
@@ -230,7 +325,7 @@ static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size
 }
 
 // this routine is 'slightly' suboptimal but mostly unused
-static void DEF(avg, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
     MOVQ_ZERO(mm7);
     SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
@@ -303,3 +398,26 @@ static void DEF(avg, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz
 	:"D"(block), "r"(line_size)
 	:"eax", "memory");
 }
+
+//FIXME optimize (each pixels16 routine below only forwards to its pixels8 counterpart twice)
+static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put, pixels8_y2)(block  , pixels  , line_size, h); // first call: pointers as given
+    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); // second call: both pointers advanced by 8 elements
+}
+
+static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h); // first call: pointers as given
+    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); // second call: both pointers advanced by 8 elements
+}
+
+static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h); // first call: pointers as given
+    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); // second call: both pointers advanced by 8 elements
+}
+
+static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h); // first call: pointers as given
+    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); // second call: both pointers advanced by 8 elements
+}
+
+
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index 7135beb21..73b63ac63 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -25,7 +25,7 @@
 
 #define BITS_FRW_ACC	3 //; 2 or 3 for accuracy
 #define SHIFT_FRW_COL	BITS_FRW_ACC
-#define SHIFT_FRW_ROW	(BITS_FRW_ACC + 17)
+#define SHIFT_FRW_ROW	(BITS_FRW_ACC + 17 - 3)
 //#define RND_FRW_ROW		(262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
 #define RND_FRW_ROW		(1 << (SHIFT_FRW_ROW-1))
 //#define RND_FRW_COL		(2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index 390aa554c..10efc173f 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -195,7 +195,7 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
     const UINT16 *quant_matrix;
     
     if(s->alternate_scan) nCoeffs= 64;
-    else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ];
+    else nCoeffs= zigzag_end[ s->block_last_index[n] ];
 
     if (s->mb_intra) {
         int block0;
@@ -321,7 +321,7 @@ static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
     const UINT16 *quant_matrix;
     
     if(s->alternate_scan) nCoeffs= 64;
-    else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ];
+    else nCoeffs= zigzag_end[ s->block_last_index[n] ];
 
     if (s->mb_intra) {
         int block0;
@@ -552,16 +552,21 @@ void unused_var_warning_killer(){
 void MPV_common_init_mmx(MpegEncContext *s)
 {
     if (mm_flags & MM_MMX) {
+        const int dct_algo= s->avctx->dct_algo; // DCT selection read from the codec context
         s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
         s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
         s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
 
         draw_edges = draw_edges_mmx;
 
-        if(mm_flags & MM_MMXEXT){
-            dct_quantize= dct_quantize_MMX2;
-        } else {
-            dct_quantize= dct_quantize_MMX;
+        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ // any other setting leaves s->fdct and s->dct_quantize untouched
+            s->fdct = fdct_mmx;
+
+            if(mm_flags & MM_MMXEXT){ // MM_MMXEXT flag set: take the MMX2 quantizer, otherwise the plain MMX one
+                s->dct_quantize= dct_quantize_MMX2;
+            } else {
+                s->dct_quantize= dct_quantize_MMX;
+            }
         }
     }
 }
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index 1119313cc..94a6711db 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -40,7 +40,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     const UINT16 *qmat, *bias;
     static __align8 INT16 temp_block[64];
 
-    av_fdct (block);
+    //s->fdct (block);
+    fdct_mmx (block); //cant be anything else ...
 
     if (s->mb_intra) {
         int dummy;
@@ -55,7 +56,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         	"xorl %%edx, %%edx	\n\t"
         	"mul %%ecx		\n\t"
         	: "=d" (level), "=a"(dummy)
-        	: "a" (block[0] + (q >> 1)), "c" (inverse[q])
+        	: "a" ((block[0]>>2) + q), "c" (inverse[q<<1])
         );
 #else
         asm volatile (
@@ -63,13 +64,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         	"divw %%cx		\n\t"
         	"movzwl %%ax, %%eax	\n\t"
         	: "=a" (level)
-        	: "a" (block[0] + (q >> 1)), "c" (q)
+        	: "a" ((block[0]>>2) + q), "c" (q<<1)
         	: "%edx"
         );
 #endif
         } else
             /* For AIC we skip quant/dequant of INTRADC */
-            level = block[0];
+            level = (block[0] + 4)>>3;
             
         block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
@@ -83,7 +84,11 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     }
 
     if(s->out_format == FMT_H263 && s->mpeg_quant==0){
-    
+
+        /* The following code is patched with avifile's modifications
+           to enable -fpic compilation. This patch has not been accepted into
+           the main ffmpeg CVS. */
+
         asm volatile(
             "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
             SPREADW(%%mm3)
@@ -112,7 +117,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
             "movq %%mm0, (%3, %%eax)		\n\t"
             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
-            "movq (%2, %%eax), %%mm1		\n\t" 
+            "movq (%4, %%eax), %%mm1		\n\t" 
             "movq %%mm7, (%1, %%eax)		\n\t" // 0
             "pandn %%mm1, %%mm0			\n\t"
 	    PMAXW(%%mm0, %%mm3)
@@ -201,10 +206,12 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     }
 
     if(s->mb_intra) temp_block[0]= level; //FIXME move afer permute
+        
 // last_non_zero_p1=64;       
     /* permute for IDCT */
     asm volatile(
-    "pushl %%ebp			\n\t"
+        "movl %0, %%eax			\n\t"
+	"pushl %%ebp			\n\t"
 	"movl %%esp, " MANGLE(esp_temp) "\n\t"
 	"1:				\n\t"
 	"movzbl (%1, %%eax), %%ebx	\n\t"
@@ -219,10 +226,10 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 	" js 1b				\n\t"
 	"movl " MANGLE(esp_temp) ", %%esp\n\t"
 	"popl %%ebp			\n\t"
-	:
-	: "a" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block)
-	: "%ebx", "%ecx"
-    );
+	: 
+	: "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block)
+	: "%eax", "%ebx", "%ecx"
+	);
 /*
     for(i=0; i<last_non_zero_p1; i++)
     {
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index 26519bd38..1197f858b 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -30,6 +30,7 @@
 #define NB_PHASES  (1 << PHASE_BITS)
 #define NB_TAPS    4
 #define FCENTER    1  /* index of the center of the filter */
+//#define TEST    1  /* Test it */
 
 #define POS_FRAC_BITS 16
 #define POS_FRAC      (1 << POS_FRAC_BITS)
@@ -39,7 +40,7 @@
 #define LINE_BUF_HEIGHT (NB_TAPS * 4)
 
 struct ImgReSampleContext {
-    int iwidth, iheight, owidth, oheight;
+    int iwidth, iheight, owidth, oheight, topBand, bottomBand, leftBand, rightBand;
     int h_incr, v_incr;
     INT16 h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
     INT16 v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
@@ -65,7 +66,7 @@ static void h_resample_fast(UINT8 *dst, int dst_width, UINT8 *src, int src_width
         /* test */
         if ((src_pos >> POS_FRAC_BITS) < 0 ||
             (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
-            abort();
+            av_abort();
 #endif
         s = src + (src_pos >> POS_FRAC_BITS);
         phase = get_phase(src_pos);
@@ -353,8 +354,8 @@ static void component_resample(ImgReSampleContext *s,
             if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
                 ring_y = NB_TAPS;
             last_src_y++;
-            /* handle limit conditions : replicate line (slighly
-               inefficient because we filter multiple times */
+            /* handle limit conditions : replicate line (slightly
+               inefficient because we filter multiple times) */
             y1 = last_src_y;
             if (y1 < 0) {
                 y1 = 0;
@@ -428,6 +429,14 @@ static void build_filter(INT16 *filter, float factor)
 ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                       int iwidth, int iheight)
 {
+	return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0); /* top/bottom/left/right bands all 0: the whole input picture is resampled */
+}
+
+ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
+                                      int iwidth, int iheight,
+                                      int topBand, int bottomBand,
+                                      int leftBand, int rightBand)
+{
     ImgReSampleContext *s;
 
     s = av_mallocz(sizeof(ImgReSampleContext));
@@ -441,12 +450,16 @@ ImgReSampleContext *img_resample_init(int owidth, int oheight,
     s->oheight = oheight;
     s->iwidth = iwidth;
     s->iheight = iheight;
+    s->topBand = topBand;
+    s->bottomBand = bottomBand;
+    s->leftBand = leftBand;
+    s->rightBand = rightBand;
     
-    s->h_incr = (iwidth * POS_FRAC) / owidth;
-    s->v_incr = (iheight * POS_FRAC) / oheight;
+    s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth;
+    s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight;
     
-    build_filter(&s->h_filters[0][0], (float)owidth / (float)iwidth);
-    build_filter(&s->v_filters[0][0], (float)oheight / (float)iheight);
+    build_filter(&s->h_filters[0][0], (float) owidth  / (float) (iwidth - leftBand - rightBand));
+    build_filter(&s->v_filters[0][0], (float) oheight / (float) (iheight - topBand - bottomBand));
 
     return s;
  fail:
@@ -463,8 +476,9 @@ void img_resample(ImgReSampleContext *s,
         shift = (i == 0) ? 0 : 1;
         component_resample(s, output->data[i], output->linesize[i], 
                            s->owidth >> shift, s->oheight >> shift,
-                           input->data[i], input->linesize[i], 
-                           s->iwidth >> shift, s->iheight >> shift);
+                           input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift),
+                           input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift),
+                           (s->iheight - s->topBand - s->bottomBand) >> shift);
     }
 }
 
@@ -484,6 +498,13 @@ void *av_mallocz(int size)
     return ptr;
 }
 
+void av_free(void *ptr)
+{
+    /* Release a buffer with free(); ptr may be NULL.
+       free(NULL) is a no-op in ISO C, so no NULL test is needed. */
+    free(ptr);
+}
+
 /* input */
 #define XSIZE 256
 #define YSIZE 256
@@ -518,7 +539,7 @@ static void dump_filter(INT16 *filter)
 }
 
 #ifdef HAVE_MMX
-extern int mm_flags;
+int mm_flags;
 #endif
 
 int main(int argc, char **argv)
@@ -569,19 +590,19 @@ int main(int argc, char **argv)
             } else {
                 v = ((x + y - XSIZE) * 255) / XSIZE;
             }
-            img[y * XSIZE + x] = v;
+            img[(YSIZE - 1 - y) * XSIZE + (XSIZE - 1 - x)] = v; /* mirrored in x and y; the -1 keeps the index inside the XSIZE x YSIZE image */
         }
     }
     save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
     for(i=0;i<sizeof(factors)/sizeof(float);i++) {
         fact = factors[i];
         xsize = (int)(XSIZE * fact);
-        ysize = (int)(YSIZE * fact);
-        s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
+        ysize = (int)((YSIZE - 100) * fact);
+        s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50 ,50, 0, 0);
         printf("Factor=%0.2f\n", fact);
         dump_filter(&s->h_filters[0][0]);
         component_resample(s, img1, xsize, xsize, ysize,
-                           img, XSIZE, XSIZE, YSIZE);
+                           img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
         img_resample_close(s);
 
         sprintf(buf, "/tmp/out%d.pgm", i);
diff --git a/src/libffmpeg/libavcodec/jfdctint.c b/src/libffmpeg/libavcodec/jfdctint.c
new file mode 100644
index 000000000..6b0d4cadb
--- /dev/null
+++ b/src/libffmpeg/libavcodec/jfdctint.c
@@ -0,0 +1,292 @@
+/*
+ * jfdctint.c
+ *
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains a slow-but-accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform).
+ *
+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+ * on each column.  Direct algorithms are also available, but they are
+ * much more complex and seem not to be any faster when reduced to code.
+ *
+ * This implementation is based on an algorithm described in
+ *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+ *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+ *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+ * The primary algorithm described there uses 11 multiplies and 29 adds.
+ * We use their alternate method with 12 multiplies and 32 adds.
+ * The advantage of this method is that no data path contains more than one
+ * multiplication; this allows a very simple and accurate implementation in
+ * scaled fixed-point arithmetic, with a minimal number of shifts.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+#include "dsputil.h"
+
+#define SHIFT_TEMPS
+#define DCTSIZE 8
+#define BITS_IN_JSAMPLE 8
+#define GLOBAL(x) x
+#define RIGHT_SHIFT(x, n) ((x) >> (n))
+#define MULTIPLY16C16(var,const) ((var)*(const))
+
+#if 1 //def USE_ACCURATE_ROUNDING
+#define DESCALE(x,n)  RIGHT_SHIFT((x) + (1 << ((n) - 1)), n)
+#else
+#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
+#endif
+
+
+/*
+ * This module is specialized to the case DCTSIZE = 8.
+ */
+
+#if DCTSIZE != 8
+  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+#endif
+
+
+/*
+ * The poop on this scaling stuff is as follows:
+ *
+ * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+ * larger than the true DCT outputs.  The final outputs are therefore
+ * a factor of N larger than desired; since N=8 this can be cured by
+ * a simple right shift at the end of the algorithm.  The advantage of
+ * this arrangement is that we save two multiplications per 1-D DCT,
+ * because the y0 and y4 outputs need not be divided by sqrt(N).
+ * In the IJG code, this factor of 8 is removed by the quantization step
+ * (in jcdctmgr.c), NOT in this module.
+ *
+ * We have to do addition and subtraction of the integer inputs, which
+ * is no problem, and multiplication by fractional constants, which is
+ * a problem to do in integer arithmetic.  We multiply all the constants
+ * by CONST_SCALE and convert them to integer constants (thus retaining
+ * CONST_BITS bits of precision in the constants).  After doing a
+ * multiplication we have to divide the product by CONST_SCALE, with proper
+ * rounding, to produce the correct output.  This division can be done
+ * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
+ * as long as possible so that partial sums can be added together with
+ * full fractional precision.
+ *
+ * The outputs of the first pass are scaled up by PASS1_BITS bits so that
+ * they are represented to better-than-integral precision.  These outputs
+ * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
+ * with the recommended scaling.  (For 12-bit sample data, the intermediate
+ * array is INT32 anyway.)
+ *
+ * To avoid overflow of the 32-bit intermediate results in pass 2, we must
+ * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
+ * shows that the values given below are the most effective.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define CONST_BITS  13
+#define PASS1_BITS  4		/* set this to 2 if 16x16 multiplies are faster */
+#else
+#define CONST_BITS  13
+#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
+#endif
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+ * causing a lot of useless floating-point operations at run time.
+ * To get around this we use the following pre-calculated constants.
+ * If you change CONST_BITS you may want to add appropriate values.
+ * (With a reasonable C compiler, you can just rely on the FIX() macro...)
+ */
+
+#if CONST_BITS == 13
+#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
+#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
+#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
+#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
+#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
+#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
+#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
+#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
+#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
+#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
+#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
+#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
+#else
+#define FIX_0_298631336  FIX(0.298631336)
+#define FIX_0_390180644  FIX(0.390180644)
+#define FIX_0_541196100  FIX(0.541196100)
+#define FIX_0_765366865  FIX(0.765366865)
+#define FIX_0_899976223  FIX(0.899976223)
+#define FIX_1_175875602  FIX(1.175875602)
+#define FIX_1_501321110  FIX(1.501321110)
+#define FIX_1_847759065  FIX(1.847759065)
+#define FIX_1_961570560  FIX(1.961570560)
+#define FIX_2_053119869  FIX(2.053119869)
+#define FIX_2_562915447  FIX(2.562915447)
+#define FIX_3_072711026  FIX(3.072711026)
+#endif
+
+
+/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+ * For 8-bit samples with the recommended scaling, all the variable
+ * and constant values involved are no more than 16 bits wide, so a
+ * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+ * For 12-bit samples, a full 32-bit multiplication will be needed.
+ */
+
+#if BITS_IN_JSAMPLE == 8 && CONST_BITS<=13 && PASS1_BITS<=2
+#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#else
+#define MULTIPLY(var,const)  ((var) * (const))
+#endif
+
+
+/* Perform the forward DCT on one block of samples.
+ * data: DCTSIZE*DCTSIZE (8x8) elements, rows stored contiguously, transformed in place.
+ * The output is left scaled up by a factor of 8 (see the scaling notes above). */
+
+GLOBAL(void)
+ff_jpeg_fdct_islow (DCTELEM * data)
+{
+  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  INT32 tmp10, tmp11, tmp12, tmp13;
+  INT32 z1, z2, z3, z4, z5;
+  DCTELEM *dataptr;
+  int ctr;
+  SHIFT_TEMPS /* expands to nothing: SHIFT_TEMPS is defined empty in this file */
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { /* one iteration per row */
+    tmp0 = dataptr[0] + dataptr[7];
+    tmp7 = dataptr[0] - dataptr[7];
+    tmp1 = dataptr[1] + dataptr[6];
+    tmp6 = dataptr[1] - dataptr[6];
+    tmp2 = dataptr[2] + dataptr[5];
+    tmp5 = dataptr[2] - dataptr[5];
+    tmp3 = dataptr[3] + dataptr[4];
+    tmp4 = dataptr[3] - dataptr[4];
+    
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    
+    tmp10 = tmp0 + tmp3;
+    tmp13 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp1 - tmp2;
+    
+    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
+    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
+    
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+				   CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
+				   CONST_BITS-PASS1_BITS);
+    
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    
+    z1 = tmp4 + tmp7;
+    z2 = tmp5 + tmp6;
+    z3 = tmp4 + tmp6;
+    z4 = tmp5 + tmp7;
+    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+    
+    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    
+    z3 += z5;
+    z4 += z5;
+    
+    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
+    
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns.
+   * We remove the PASS1_BITS scaling, but leave the results scaled up
+   * by an overall factor of 8.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { /* one iteration per column; elements are DCTSIZE apart */
+    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+    
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    
+    tmp10 = tmp0 + tmp3;
+    tmp13 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp1 - tmp2;
+    
+    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+    
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+					   CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
+					   CONST_BITS+PASS1_BITS);
+    
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    
+    z1 = tmp4 + tmp7;
+    z2 = tmp5 + tmp6;
+    z3 = tmp4 + tmp6;
+    z4 = tmp5 + tmp7;
+    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+    
+    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    
+    z3 += z5;
+    z4 += z5;
+    
+    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
+					   CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
+					   CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
+					   CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
+					   CONST_BITS+PASS1_BITS);
+    
+    dataptr++;			/* advance pointer to next column */
+  }
+}
diff --git a/src/libffmpeg/libavcodec/mem.c b/src/libffmpeg/libavcodec/mem.c
index 113e285e7..5799c0774 100644
--- a/src/libffmpeg/libavcodec/mem.c
+++ b/src/libffmpeg/libavcodec/mem.c
@@ -29,8 +29,8 @@
 void *av_malloc(int size)
 {
     void *ptr;
-#if defined ( ARCH_X86 ) && defined ( HAVE_MEMALIGN ) && 0
-    ptr = memalign(64,size);
+#if defined (HAVE_MEMALIGN)
+    ptr = memalign(16,size);
     /* Why 64? 
        Indeed, we should align it:
          on 4 for 386
@@ -40,11 +40,29 @@ void *av_malloc(int size)
        Because L1 and L2 caches are aligned on those values.
        But I don't want to code such logic here!
      */
+     /* Why 16?
+        Because some CPUs need alignment (for example SSE2 on P4, and most RISC CPUs):
+        an unaligned access either triggers an exception, with the load then done in
+        the exception handler, or it simply segfaults (SSE2 on P4).
+        Why not larger? Because I didn't see a difference in benchmarks ...
+     */
+     /* benchmarks with p3
+        memalign(64)+1		3071,3051,3032
+        memalign(64)+2		3051,3032,3041
+        memalign(64)+4		2911,2896,2915
+        memalign(64)+8		2545,2554,2550
+        memalign(64)+16		2543,2572,2563
+        memalign(64)+32		2546,2545,2571
+        memalign(64)+64		2570,2533,2558
+        
+        btw, malloc seems to do 8 byte alignment by default here
+     */
 #else
     ptr = malloc(size);
 #endif
     if (!ptr)
         return NULL;
+//fprintf(stderr, "%X %d\n", (int)ptr, size);
     /* NOTE: this memset should not be present */
     memset(ptr, 0, size);
     return ptr;
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index c4ebee634..6cfd83160 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -16,8 +16,9 @@
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
- * Support for external huffman table and various fixes (AVID workaround) by
- *                                    Alex Beregszaszi <alex@naxine.org>
+ * Support for external huffman table, various fixes (AVID workaround),
+ * aspecting and various markers support
+ *                                  by Alex Beregszaszi <alex@naxine.org>
  */
 //#define DEBUG
 #include "avcodec.h"
@@ -354,30 +355,53 @@ static void jpeg_put_comments(MpegEncContext *s)
     int size;
     UINT8 *ptr;
 
-#if 0
+    if (s->aspect_ratio_info)
+    {
     /* JFIF header */
     put_marker(p, APP0);
     put_bits(p, 16, 16);
     put_string(p, "JFIF"); /* this puts the trailing zero-byte too */
-    put_bits(p, 16, 0x101);
+    put_bits(p, 16, 0x0102); /* JFIF v1.02: major version byte is written first, then the minor byte */
     put_bits(p, 8, 0); /* units type: 0 - aspect ratio */
-    put_bits(p, 16, 1); /* aspect: 1:1 */
-    put_bits(p, 16, 1);
+    switch(s->aspect_ratio_info)
+    {
+	case FF_ASPECT_4_3_625:
+	case FF_ASPECT_4_3_525:
+	    put_bits(p, 16, 4); 
+	    put_bits(p, 16, 3);
+	    break;
+	case FF_ASPECT_16_9_625:
+	case FF_ASPECT_16_9_525:
+	    put_bits(p, 16, 16); 
+	    put_bits(p, 16, 9);
+	    break;
+	case FF_ASPECT_EXTENDED:
+	    put_bits(p, 16, s->aspected_width);
+	    put_bits(p, 16, s->aspected_height);
+	    break;
+	case FF_ASPECT_SQUARE:
+	default:
+	    put_bits(p, 16, 1); /* aspect: 1:1 */
+	    put_bits(p, 16, 1);
+	    break;
+    }
     put_bits(p, 8, 0); /* thumbnail width */
     put_bits(p, 8, 0); /* thumbnail height */
-#endif
+    }
 
     /* comment */
-    put_marker(p, COM);
-    flush_put_bits(p);
-    ptr = pbBufPtr(p);
-    put_bits(p, 16, 0); /* patched later */
+    if(!ff_bit_exact){
+        put_marker(p, COM);
+        flush_put_bits(p);
+        ptr = pbBufPtr(p);
+        put_bits(p, 16, 0); /* patched later */
 #define MJPEG_VERSION "FFmpeg" LIBAVCODEC_VERSION "b" LIBAVCODEC_BUILD_STR
-    put_string(p, MJPEG_VERSION);
-    size = strlen(MJPEG_VERSION)+3;
+        put_string(p, MJPEG_VERSION);
+        size = strlen(MJPEG_VERSION)+3;
 #undef MJPEG_VERSION
-    ptr[0] = size >> 8;
-    ptr[1] = size;
+        ptr[0] = size >> 8;
+        ptr[1] = size;
+    }
 }
 
 void mjpeg_picture_header(MpegEncContext *s)
@@ -1084,6 +1108,19 @@ static int mjpeg_decode_app(MJpegDecodeContext *s,
 	skip_bits(&s->gb, 8); /* the trailing zero-byte */
 	printf("mjpeg: JFIF header found (version: %x.%x)\n",
 	    get_bits(&s->gb, 8), get_bits(&s->gb, 8));
+	if (get_bits(&s->gb, 8) == 0)
+	{
+	    s->avctx->aspect_ratio_info = FF_ASPECT_EXTENDED;
+	    s->avctx->aspected_width = get_bits(&s->gb, 16);
+	    s->avctx->aspected_height = get_bits(&s->gb, 16);
+	}
+	else
+	{
+	    skip_bits(&s->gb, 16);
+	    skip_bits(&s->gb, 16);
+	}
+	skip_bits(&s->gb, 8);
+	skip_bits(&s->gb, 8);
 	goto out;
     }
     
diff --git a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
index 4539b6464..c380eb45f 100644
--- a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
+++ b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
@@ -18,6 +18,7 @@
  */
 
 #include "../dsputil.h"
+#include "../mpegvideo.h"
 
 #include <mlib_types.h>
 #include <mlib_status.h>
@@ -125,7 +126,6 @@ void ff_fdct_mlib(DCTELEM *data)
 
 void dsputil_init_mlib(void)
 {
-    av_fdct = ff_fdct_mlib;
     ff_idct = ff_idct_mlib;
 
     put_pixels_tab[0] = put_pixels_mlib;
@@ -142,3 +142,10 @@ void dsputil_init_mlib(void)
     
     add_pixels_clamped = add_pixels_clamped_mlib;
 }
+
+void MPV_common_init_mlib(MpegEncContext *s) /* points s->fdct at ff_fdct_mlib when dct_algo allows it */
+{
+    const int algo = s->avctx->dct_algo;
+    if (algo == FF_DCT_AUTO || algo == FF_DCT_MLIB) /* any other dct_algo leaves s->fdct untouched */
+        s->fdct = ff_fdct_mlib;
+}
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index ffc251da7..032556a6d 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -41,28 +41,6 @@
 #define P_MV1 P[9]
 
 
-static int pix_sum(UINT8 * pix, int line_size)
-{
-    int s, i, j;
-
-    s = 0;
-    for (i = 0; i < 16; i++) {
-	for (j = 0; j < 16; j += 8) {
-	    s += pix[0];
-	    s += pix[1];
-	    s += pix[2];
-	    s += pix[3];
-	    s += pix[4];
-	    s += pix[5];
-	    s += pix[6];
-	    s += pix[7];
-	    pix += 8;
-	}
-	pix += line_size - 16;
-    }
-    return s;
-}
-
 static int pix_dev(UINT8 * pix, int line_size, int mean)
 {
     int s, i, j;
@@ -85,29 +63,6 @@ static int pix_dev(UINT8 * pix, int line_size, int mean)
     return s;
 }
 
-static int pix_norm1(UINT8 * pix, int line_size)
-{
-    int s, i, j;
-    UINT32 *sq = squareTbl + 256;
-
-    s = 0;
-    for (i = 0; i < 16; i++) {
-	for (j = 0; j < 16; j += 8) {
-	    s += sq[pix[0]];
-	    s += sq[pix[1]];
-	    s += sq[pix[2]];
-	    s += sq[pix[3]];
-	    s += sq[pix[4]];
-	    s += sq[pix[5]];
-	    s += sq[pix[6]];
-	    s += sq[pix[7]];
-	    pix += 8;
-	}
-	pix += line_size - 16;
-    }
-    return s;
-}
-
 static int pix_norm(UINT8 * pix1, UINT8 * pix2, int line_size)
 {
     int s, i, j;
@@ -1183,6 +1138,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     sum= (sum+8)>>4;
     varc = (pix_norm1(pix, s->linesize) - sum*sum + 500 + 128)>>8;
     vard = (pix_norm(pix, ppix, s->linesize)+128)>>8;
+
 //printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
     s->mb_var   [s->mb_width * mb_y + mb_x] = varc;
     s->mc_mb_var[s->mb_width * mb_y + mb_x] = vard;
@@ -1195,6 +1151,11 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 	   varc, s->avg_mb_var, sum, vard, mx - xx, my - yy);
 #endif
     if(s->flags&CODEC_FLAG_HQ){
+        if (vard <= 64 || vard < varc)
+            s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+        else
+            s->scene_change_score+= 20;
+
         if (vard*2 + 200 > varc)
             mb_type|= MB_TYPE_INTRA;
         if (varc*2 + 200 > vard){
@@ -1221,6 +1182,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             set_p_mv_tables(s, mx, my, 1);
     }else{
         if (vard <= 64 || vard < varc) {
+            s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
             mb_type|= MB_TYPE_INTER;
             if (s->me_method != ME_ZERO) {
                 if(s->me_method >= ME_EPZS)
@@ -1251,6 +1213,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             }
 #endif
         }else{
+            s->scene_change_score+= 20;
             mb_type|= MB_TYPE_INTRA;
             mx = 0;
             my = 0;
@@ -1374,8 +1337,7 @@ static inline int check_bidir_mv(MpegEncContext * s,
     src_y = mb_y * 16 + (motion_fy >> 1);
             
     ptr = s->last_picture[0] + (src_y * s->linesize) + src_x;
-    put_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
-    put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+    put_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
     
     fbmin += (mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->qscale;
 
@@ -1384,8 +1346,7 @@ static inline int check_bidir_mv(MpegEncContext * s,
     src_y = mb_y * 16 + (motion_by >> 1);
             
     ptr = s->next_picture[0] + (src_y * s->linesize) + src_x;
-    avg_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
-    avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+    avg_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
     
     fbmin += pix_abs16x16(s->new_picture[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize);
     return fbmin;
@@ -1430,13 +1391,13 @@ static inline int direct_search(MpegEncContext * s,
     const int motion_px= s->p_mv_table[mot_xy][0];
     const int motion_py= s->p_mv_table[mot_xy][1];
     const int time_pp= s->pp_time;
-    const int time_bp= s->bp_time;
-    const int time_pb= time_pp - time_bp;
+    const int time_pb= s->pb_time;
+    const int time_bp= time_pp - time_pb;
     int bx, by;
     int mx, my, mx2, my2;
     uint8_t *ref_picture= s->me_scratchpad - (mb_x - 1 + (mb_y - 1)*s->linesize)*16;
     int16_t (*mv_table)[2]= s->b_direct_mv_table;
-    uint16_t *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+/*    uint16_t *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; */ // f_code of the prev frame
 
     /* thanks to iso-mpeg the rounding is different for the zero vector, so we need to handle that ... */
     motion_fx= (motion_px*time_pb)/time_pp;
@@ -1470,8 +1431,7 @@ static inline int direct_search(MpegEncContext * s,
             if (src_y == height) dxy &= ~2;
 
             ptr = s->last_picture[0] + (src_y * s->linesize) + src_x;
-            put_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
-            put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+            put_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
 
             dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
             src_x = (mb_x + bx) * 16 + (motion_bx >> 1);
@@ -1481,8 +1441,7 @@ static inline int direct_search(MpegEncContext * s,
             src_y = clip(src_y, -16, height);
             if (src_y == height) dxy &= ~2;
 
-            avg_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
-            avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+            avg_pixels_tab[0][dxy](dest_y    , ptr    , s->linesize, 16);
         }
     }
 
@@ -1570,9 +1529,7 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
 
     fbmin= bidir_refine(s, mb_x, mb_y);
 
-    if(s->flags&CODEC_FLAG_HQ){
-        type= MB_TYPE_FORWARD | MB_TYPE_BACKWARD | MB_TYPE_BIDIR | MB_TYPE_DIRECT;
-    }else{
+    {
         int score= dmin;
         type=MB_TYPE_DIRECT;
         
@@ -1588,9 +1545,15 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
             score=fbmin;
             type= MB_TYPE_BIDIR;
         }
+        score= (score*score)>>8;
         s->mc_mb_var_sum += score;
-        s->mc_mb_var[mb_y*s->mb_width + mb_x] = score;
+        s->mc_mb_var[mb_y*s->mb_width + mb_x] = score; //FIXME use SSD
     }
+
+    if(s->flags&CODEC_FLAG_HQ){
+        type= MB_TYPE_FORWARD | MB_TYPE_BACKWARD | MB_TYPE_BIDIR | MB_TYPE_DIRECT; //FIXME something smarter
+    }
+
 /*
 {
 static int count=0;
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index 8939bb6c2..ef6bec7ac 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -62,7 +62,7 @@ static UINT8 fcode_tab[MAX_MV*2+1];
 
 static void init_2d_vlc_rl(RLTable *rl)
 {
-    int i, q;
+    int i;
     
     init_vlc(&rl->vlc, TEX_VLC_BITS, rl->n + 2, 
              &rl->table_vlc[0][1], 4, 2,
@@ -142,9 +142,12 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
                 v = 0x3ffff;
             put_bits(&s->pb, 18, v);
             put_bits(&s->pb, 1, 1); /* marker */
-            /* vbv buffer size: slightly greater than an I frame. We add
-               some margin just in case */
-            vbv_buffer_size = (3 * s->I_frame_bits) / (2 * 8);
+
+            if(s->avctx->rc_buffer_size)
+                vbv_buffer_size = s->avctx->rc_buffer_size;
+            else
+                /* VBV calculation: Scaled so that a VCD has the proper VBV size of 40 kilobytes */
+                vbv_buffer_size = (( 20 * s->bit_rate) / (1151929 / 2)) * 8 * 1024;	 
             put_bits(&s->pb, 10, (vbv_buffer_size + 16383) / 16384); 
             put_bits(&s->pb, 1, 1); /* constrained parameter flag */
             put_bits(&s->pb, 1, 0); /* no custom intra matrix */
@@ -581,7 +584,7 @@ static VLC mb_ptype_vlc;
 static VLC mb_btype_vlc;
 static VLC mb_pat_vlc;
 
-void mpeg1_init_vlc(MpegEncContext *s)
+static void init_vlcs(MpegEncContext *s)
 {
     static int done = 0;
 
@@ -1049,7 +1052,7 @@ static int mpeg1_decode_block(MpegEncContext *s,
             /* escape */
             run = get_bits(&s->gb, 6);
             level = get_bits(&s->gb, 8);
-            level = (level << 24) >> 24;
+            level= (level ^ 0x80) - 0x80; //sign extension of the 8 bit value, without left-shifting a negative constant (undefined in ISO C)
             if (level == -128) {
                 level = get_bits(&s->gb, 8) - 256;
             } else if (level == 0) {
@@ -1128,7 +1131,7 @@ static int mpeg2_decode_block_non_intra(MpegEncContext *s,
             /* escape */
             run = get_bits(&s->gb, 6);
             level = get_bits(&s->gb, 12);
-            level = (level << 20) >> 20;
+            level= (level ^ 0x800) - 0x800; //sign extension of the 12 bit value, without left-shifting a negative constant (undefined in ISO C)
         } else {
             run = rl->table_run[code];
             level = rl->table_level[code];
@@ -1211,7 +1214,7 @@ static int mpeg2_decode_block_intra(MpegEncContext *s,
             /* escape */
             run = get_bits(&s->gb, 6);
             level = get_bits(&s->gb, 12);
-            level = (level << 20) >> 20;
+            level= (level ^ 0x800) - 0x800; //sign extension of the 12 bit value, without left-shifting a negative constant (undefined in ISO C)
         } else {
             run = rl->table_run[code];
             level = rl->table_level[code];
@@ -1257,6 +1260,7 @@ static int mpeg_decode_init(AVCodecContext *avctx)
     
     s->mpeg_enc_ctx.flags= avctx->flags;
     common_init(&s->mpeg_enc_ctx);
+    init_vlcs(&s->mpeg_enc_ctx);
 
     s->header_state = 0xff;
     s->mpeg_enc_ctx_allocated = 0;
@@ -1465,7 +1469,7 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
 
     start_code = (start_code - 1) & 0xff;
     if (start_code >= s->mb_height){
-        fprintf(stderr, "slice below image\n");
+        fprintf(stderr, "slice below image (%d >= %d)\n", start_code, s->mb_height);
         return -1;
     }
     s->last_dc[0] = 1 << (7 + s->intra_dc_precision);
@@ -1587,7 +1591,6 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         
         if (MPV_common_init(s) < 0)
             return -1;
-        mpeg1_init_vlc(s);
         s1->mpeg_enc_ctx_allocated = 1;
     }
 
@@ -1708,7 +1711,17 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
         } else {
             memcpy(s->buf_ptr, buf_start, len);
             s->buf_ptr += len;
-            
+            if(   (s2->flags&CODEC_FLAG_NOT_TRUNCATED) && (!start_code_found) 
+               && s->buf_ptr+4<s->buffer+s->buffer_size){
+                start_code_found= 1;
+                code= 0x1FF;
+                s->header_state=0xFF;
+                s->buf_ptr[0]=0;
+                s->buf_ptr[1]=0;
+                s->buf_ptr[2]=1;
+                s->buf_ptr[3]=0xFF;
+                s->buf_ptr+=4;
+            }
             if (start_code_found) {
                 /* prepare data for next start code */
                 input_size = s->buf_ptr - s->buffer;
diff --git a/src/libffmpeg/libavcodec/mpeg4data.h b/src/libffmpeg/libavcodec/mpeg4data.h
index 3d8de7ca6..88101a544 100644
--- a/src/libffmpeg/libavcodec/mpeg4data.h
+++ b/src/libffmpeg/libavcodec/mpeg4data.h
@@ -17,6 +17,10 @@
 #define MOTION_MARKER 0x1F001
 #define DC_MARKER     0x6B001
 
+#define MB_TYPE_B_DIRECT 0
+#define MB_TYPE_B_BIDIR  1
+#define MB_TYPE_B_BACKW  2
+#define MB_TYPE_B_FORW   3
 
 /* dc encoding for mpeg4 */
 const UINT8 DCtab_lum[13][2] =
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 320e2e4a8..ff1c3a383 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -36,7 +36,6 @@ static void dct_unquantize_h263_c(MpegEncContext *s,
 static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w);
 static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
 
-int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow)= dct_quantize_c;
 void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edges_c;
 static void emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, 
                                     int src_x, int src_y, int w, int h);
@@ -76,14 +75,14 @@ extern UINT8 zigzag_end[64];
 /* default motion estimation */
 int motion_estimation_method = ME_EPZS;
 
-static void convert_matrix(int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64],
+static void convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64],
                            const UINT16 *quant_matrix, int bias)
 {
     int qscale;
 
     for(qscale=1; qscale<32; qscale++){
         int i;
-        if (av_fdct == fdct_ifast) {
+        if (s->fdct == ff_jpeg_fdct_islow) {
             for(i=0;i<64;i++) {
                 const int j= block_permute_op(i);
                 /* 16 <= qscale * quant_matrix[i] <= 7905 */
@@ -91,7 +90,18 @@ static void convert_matrix(int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*q
                 /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
                 /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
                 
-                qmat[qscale][j] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / 
+                qmat[qscale][j] = (int)((UINT64_C(1) << QMAT_SHIFT) / 
+                                (qscale * quant_matrix[j]));
+            }
+        } else if (s->fdct == fdct_ifast) {
+            for(i=0;i<64;i++) {
+                const int j= block_permute_op(i);
+                /* 16 <= qscale * quant_matrix[i] <= 7905 */
+                /* 19952         <= aanscales[i] * qscale * quant_matrix[i]           <= 249205026 */
+                /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
+                /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
+                
+                qmat[qscale][j] = (int)((UINT64_C(1) << (QMAT_SHIFT + 14)) / 
                                 (aanscales[i] * qscale * quant_matrix[j]));
             }
         } else {
@@ -105,7 +115,6 @@ static void convert_matrix(int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*q
                 qmat16[qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]);
 
                 if(qmat16[qscale][i]==0 || qmat16[qscale][i]==128*256) qmat16[qscale][i]=128*256-1;
-
                 qmat16_bias[qscale][i]= ROUNDED_DIV(bias<<(16-QUANT_BIAS_SHIFT), qmat16[qscale][i]);
             }
         }
@@ -130,6 +139,12 @@ int MPV_common_init(MpegEncContext *s)
     s->dct_unquantize_h263 = dct_unquantize_h263_c;
     s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_c;
     s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_c;
+    s->dct_quantize= dct_quantize_c;
+
+    if(s->avctx->dct_algo==FF_DCT_FASTINT)
+        s->fdct = fdct_ifast;
+    else
+        s->fdct = ff_jpeg_fdct_islow;
         
 #ifdef HAVE_MMX
     MPV_common_init_mmx(s);
@@ -137,6 +152,9 @@ int MPV_common_init(MpegEncContext *s)
 #ifdef ARCH_ALPHA
     MPV_common_init_axp(s);
 #endif
+#ifdef HAVE_MLIB
+    MPV_common_init_mlib(s);
+#endif
 
     s->mb_width = (s->width + 15) / 16;
     s->mb_height = (s->height + 15) / 16;
@@ -226,6 +244,11 @@ int MPV_common_init(MpegEncContext *s)
             CHECKED_ALLOCZ(s->tex_pb_buffer, PB_BUFFER_SIZE);
             CHECKED_ALLOCZ(   s->pb2_buffer, PB_BUFFER_SIZE);
         }
+        
+        if(s->msmpeg4_version){
+            CHECKED_ALLOCZ(s->ac_stats, 2*2*(MAX_LEVEL+1)*(MAX_RUN+1)*2*sizeof(int));
+        }
+        CHECKED_ALLOCZ(s->avctx->stats_out, 256);
     }
     
     if (s->out_format == FMT_H263 || s->encoding) {
@@ -236,9 +259,13 @@ int MPV_common_init(MpegEncContext *s)
         /* MV prediction */
         size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
         CHECKED_ALLOCZ(s->motion_val, size * 2 * sizeof(INT16));
-        
-        /* 4mv direct mode decoding table */
-        CHECKED_ALLOCZ(s->non_b_mv4_table, size * sizeof(UINT8))
+    }
+
+    if(s->codec_id==CODEC_ID_MPEG4){
+        /* 4mv and interlaced direct mode decoding tables */
+        CHECKED_ALLOCZ(s->co_located_type_table, s->mb_num * sizeof(UINT8))
+        CHECKED_ALLOCZ(s->field_mv_table, s->mb_num*2*2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->field_select_table, s->mb_num*2* sizeof(INT8))
     }
 
     if (s->h263_pred || s->h263_plus) {
@@ -262,10 +289,6 @@ int MPV_common_init(MpegEncContext *s)
         
         /* cbp values */
         CHECKED_ALLOCZ(s->coded_block, y_size);
-
-        /* which mb is a intra block */
-        CHECKED_ALLOCZ(s->mbintra_table, s->mb_num);
-        memset(s->mbintra_table, 1, s->mb_num);
         
         /* divx501 bitstream reorder buffer */
         CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE);
@@ -276,6 +299,10 @@ int MPV_common_init(MpegEncContext *s)
     }
     CHECKED_ALLOCZ(s->qscale_table  , s->mb_num * sizeof(UINT8))
     
+    /* which mb is a intra block */
+    CHECKED_ALLOCZ(s->mbintra_table, s->mb_num);
+    memset(s->mbintra_table, 1, s->mb_num);
+    
     /* default structure is frame */
     s->picture_structure = PICT_FRAME;
     
@@ -327,8 +354,12 @@ void MPV_common_end(MpegEncContext *s)
     av_freep(&s->tex_pb_buffer);
     av_freep(&s->pb2_buffer);
     av_freep(&s->edge_emu_buffer);
-    av_freep(&s->non_b_mv4_table);
-
+    av_freep(&s->co_located_type_table);
+    av_freep(&s->field_mv_table);
+    av_freep(&s->field_select_table);
+    av_freep(&s->avctx->stats_out);
+    av_freep(&s->ac_stats);
+    
     for(i=0;i<3;i++) {
         int j;
         if(!(s->flags&CODEC_FLAG_DR1)){
@@ -377,13 +408,15 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->max_qdiff= avctx->max_qdiff;
     s->qcompress= avctx->qcompress;
     s->qblur= avctx->qblur;
-    s->b_quant_factor= avctx->b_quant_factor;
-    s->b_quant_offset= avctx->b_quant_offset;
     s->avctx = avctx;
     s->aspect_ratio_info= avctx->aspect_ratio_info;
+    if (avctx->aspect_ratio_info == FF_ASPECT_EXTENDED)
+    {
+	s->aspected_width = avctx->aspected_width;
+	s->aspected_height = avctx->aspected_height;
+    }
     s->flags= avctx->flags;
     s->max_b_frames= avctx->max_b_frames;
-    s->rc_strategy= avctx->rc_strategy;
     s->b_frame_strategy= avctx->b_frame_strategy;
     s->codec_id= avctx->codec->id;
     s->luma_elim_threshold  = avctx->luma_elim_threshold;
@@ -558,9 +591,9 @@ int MPV_encode_init(AVCodecContext *avctx)
     /* precompute matrix */
     /* for mjpeg, we do include qscale in the matrix */
     if (s->out_format != FMT_MJPEG) {
-        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->q_intra_matrix16_bias, 
+        convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16, s->q_intra_matrix16_bias, 
                        s->intra_matrix, s->intra_quant_bias);
-        convert_matrix(s->q_inter_matrix, s->q_inter_matrix16, s->q_inter_matrix16_bias, 
+        convert_matrix(s, s->q_inter_matrix, s->q_inter_matrix16, s->q_inter_matrix16_bias, 
                        s->inter_matrix, s->inter_quant_bias);
     }
 
@@ -595,6 +628,7 @@ int MPV_encode_end(AVCodecContext *avctx)
 }
 
 /* draw the edges of width 'w' of an image of size width, height */
+//FIXME check that this is ok for mpeg4 interlaced
 static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w)
 {
     UINT8 *ptr, *last_line;
@@ -678,7 +712,6 @@ void MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
                 avctx->dr_opaque_frame= s->next_dr_opaque;
         }
     }
-
     /* set dequantizer, we cant do it during init as it might change for mpeg4
        and we cant do it in the header decode as init isnt called for mpeg4 there yet */
     if(s->out_format == FMT_H263){
@@ -693,7 +726,8 @@ void MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
 /* generic function for encode/decode called after a frame has been coded/decoded */
 void MPV_frame_end(MpegEncContext *s)
 {
-//    if((s->picture_number%100)==0 && s->encoding) printf("sads:%d //\n", sads);
+    s->avctx->key_frame   = (s->pict_type == I_TYPE);
+    s->avctx->pict_type   = s->pict_type;
 
     /* draw edge for correct motion prediction if outside */
     if (s->pict_type != B_TYPE && !s->intra_only && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
@@ -703,10 +737,9 @@ void MPV_frame_end(MpegEncContext *s)
     }
     emms_c();
     
+    s->last_pict_type    = s->pict_type;
     if(s->pict_type!=B_TYPE){
         s->last_non_b_pict_type= s->pict_type;
-        s->last_non_b_qscale= s->qscale;
-        s->last_non_b_mc_mb_var= s->mc_mb_var_sum;
         s->num_available_buffers++;
         if(s->num_available_buffers>2) s->num_available_buffers= 2;
     }
@@ -844,8 +877,7 @@ int MPV_encode_picture(AVCodecContext *avctx,
         MPV_frame_start(s, avctx);
 
         encode_picture(s, s->picture_number);
-        avctx->key_frame   = (s->pict_type == I_TYPE);
-        avctx->pict_type   = s->pict_type;
+        
         avctx->real_pict_num  = s->picture_number;
         avctx->header_bits = s->header_bits;
         avctx->mv_bits     = s->mv_bits;
@@ -853,7 +885,7 @@ int MPV_encode_picture(AVCodecContext *avctx,
         avctx->i_tex_bits  = s->i_tex_bits;
         avctx->p_tex_bits  = s->p_tex_bits;
         avctx->i_count     = s->i_count;
-        avctx->p_count     = s->p_count;
+        avctx->p_count     = s->mb_num - s->i_count - s->skip_count; //FIXME f/b_count in avctx
         avctx->skip_count  = s->skip_count;
 
         MPV_frame_end(s);
@@ -873,13 +905,21 @@ int MPV_encode_picture(AVCodecContext *avctx,
 
     flush_put_bits(&s->pb);
     s->frame_bits  = (pbBufPtr(&s->pb) - s->pb.buf) * 8;
-    if(s->pict_type==B_TYPE) s->pb_frame_bits+= s->frame_bits;
-    else                     s->pb_frame_bits= s->frame_bits;
-
+    
     s->total_bits += s->frame_bits;
     avctx->frame_bits  = s->frame_bits;
 //printf("fcode: %d, type: %d, head: %d, mv: %d, misc: %d, frame: %d, itex: %d, ptex: %d\n", 
 //s->f_code, avctx->key_frame, s->header_bits, s->mv_bits, s->misc_bits, s->frame_bits, s->i_tex_bits, s->p_tex_bits);
+#if 0 //dump some stats to stats.txt for testing/debuging
+if(s->max_b_frames==0)
+{
+    static FILE *f=NULL;
+    if(!f) f= fopen("stats.txt", "wb");
+    get_psnr(pict->data, s->current_picture,
+             pict->linesize, s->linesize, avctx);
+    fprintf(f, "%7d, %7d, %2.4f\n", pbBufPtr(&s->pb) - s->pb.buf, s->qscale, avctx->psnr_y);
+}
+#endif
 
     if (avctx->get_psnr) {
         /* At this point pict->data should have the original frame   */
@@ -1029,18 +1069,19 @@ static inline void mpeg_motion(MpegEncContext *s,
                                UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
                                int dest_offset,
                                UINT8 **ref_picture, int src_offset,
-                               int field_based, op_pixels_func *pix_op,
+                               int field_based, op_pixels_func (*pix_op)[4],
                                int motion_x, int motion_y, int h)
 {
     UINT8 *ptr;
     int dxy, offset, mx, my, src_x, src_y, height, v_edge_pos, linesize, uvlinesize;
     int emu=0;
-    
+#if 0    
 if(s->quarter_sample)
 {
     motion_x>>=1;
     motion_y>>=1;
 }
+#endif
     dxy = ((motion_y & 1) << 1) | (motion_x & 1);
     src_x = s->mb_x * 16 + (motion_x >> 1);
     src_y = s->mb_y * (16 >> field_based) + (motion_y >> 1);
@@ -1067,8 +1108,7 @@ if(s->quarter_sample)
             emu=1;
         }
     }
-    pix_op[dxy](dest_y, ptr, linesize, h);
-    pix_op[dxy](dest_y + 8, ptr + 8, linesize, h);
+    pix_op[0][dxy](dest_y, ptr, linesize, h);
 
     if(s->flags&CODEC_FLAG_GRAY) return;
 
@@ -1102,26 +1142,26 @@ if(s->quarter_sample)
         emulated_edge_mc(s, ptr, uvlinesize, 9, (h>>1)+1, src_x, src_y, s->h_edge_pos>>1, v_edge_pos>>1);
         ptr= s->edge_emu_buffer;
     }
-    pix_op[dxy](dest_cb + (dest_offset >> 1), ptr, uvlinesize, h >> 1);
+    pix_op[1][dxy](dest_cb + (dest_offset >> 1), ptr, uvlinesize, h >> 1);
 
     ptr = ref_picture[2] + offset;
     if(emu){
         emulated_edge_mc(s, ptr, uvlinesize, 9, (h>>1)+1, src_x, src_y, s->h_edge_pos>>1, v_edge_pos>>1);
         ptr= s->edge_emu_buffer;
     }
-    pix_op[dxy](dest_cr + (dest_offset >> 1), ptr, uvlinesize, h >> 1);
+    pix_op[1][dxy](dest_cr + (dest_offset >> 1), ptr, uvlinesize, h >> 1);
 }
 
 static inline void qpel_motion(MpegEncContext *s,
                                UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
                                int dest_offset,
                                UINT8 **ref_picture, int src_offset,
-                               int field_based, op_pixels_func *pix_op,
-                               qpel_mc_func *qpix_op,
+                               int field_based, op_pixels_func (*pix_op)[4],
+                               qpel_mc_func (*qpix_op)[16],
                                int motion_x, int motion_y, int h)
 {
     UINT8 *ptr;
-    int dxy, offset, mx, my, src_x, src_y, height, v_edge_pos, linesize;
+    int dxy, offset, mx, my, src_x, src_y, height, v_edge_pos, linesize, uvlinesize;
     int emu=0;
 
     dxy = ((motion_y & 3) << 2) | (motion_x & 3);
@@ -1137,6 +1177,7 @@ static inline void qpel_motion(MpegEncContext *s,
     if (src_y == height)
         dxy &= ~12;
     linesize = s->linesize << field_based;
+    uvlinesize = s->uvlinesize << field_based;
     ptr = ref_picture[0] + (src_y * linesize) + src_x + src_offset;
     dest_y += dest_offset;
 //printf("%d %d %d\n", src_x, src_y, dxy);
@@ -1149,24 +1190,33 @@ static inline void qpel_motion(MpegEncContext *s,
             emu=1;
         }
     }
-    qpix_op[dxy](dest_y                 , ptr                 , linesize, linesize, motion_x&3, motion_y&3);
-    qpix_op[dxy](dest_y              + 8, ptr              + 8, linesize, linesize, motion_x&3, motion_y&3);
-    qpix_op[dxy](dest_y + linesize*8    , ptr + linesize*8    , linesize, linesize, motion_x&3, motion_y&3);
-    qpix_op[dxy](dest_y + linesize*8 + 8, ptr + linesize*8 + 8, linesize, linesize, motion_x&3, motion_y&3);
-    
+    if(!field_based)
+        qpix_op[0][dxy](dest_y, ptr, linesize);
+    else{
+        //damn interlaced mode
+        //FIXME boundary mirroring is not exactly correct here
+        qpix_op[1][dxy](dest_y  , ptr  , linesize);
+        qpix_op[1][dxy](dest_y+8, ptr+8, linesize);
+    }
+
     if(s->flags&CODEC_FLAG_GRAY) return;
 
-    mx= (motion_x>>1) | (motion_x&1);
-    my= (motion_y>>1) | (motion_y&1);
+    if(field_based){
+        mx= motion_x/2;
+        my= motion_y>>1;
+    }else if(s->divx_version){
+        mx= (motion_x>>1)|(motion_x&1);
+        my= (motion_y>>1)|(motion_y&1);
+    }else{
+        mx= motion_x/2;
+        my= motion_y/2;
+    }
+    mx= (mx>>1)|(mx&1);
+    my= (my>>1)|(my&1);
+    dxy= (mx&1) | ((my&1)<<1);
+    mx>>=1;
+    my>>=1;
 
-    dxy = 0;
-    if ((mx & 3) != 0)
-        dxy |= 1;
-    if ((my & 3) != 0)
-        dxy |= 2;
-    mx = mx >> 2;
-    my = my >> 2;
-    
     src_x = s->mb_x * 8 + mx;
     src_y = s->mb_y * (8 >> field_based) + my;
     src_x = clip(src_x, -8, s->width >> 1);
@@ -1176,27 +1226,27 @@ static inline void qpel_motion(MpegEncContext *s,
     if (src_y == (height >> 1))
         dxy &= ~2;
 
-    offset = (src_y * s->uvlinesize) + src_x + (src_offset >> 1);
+    offset = (src_y * uvlinesize) + src_x + (src_offset >> 1);
     ptr = ref_picture[1] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr,  s->uvlinesize, 9, (h>>1)+1, src_x, src_y, s->h_edge_pos>>1, v_edge_pos>>1);
+        emulated_edge_mc(s, ptr,  uvlinesize, 9, (h>>1)+1, src_x, src_y, s->h_edge_pos>>1, v_edge_pos>>1);
         ptr= s->edge_emu_buffer;
     }
-    pix_op[dxy](dest_cb + (dest_offset >> 1), ptr,  s->uvlinesize, h >> 1);
+    pix_op[1][dxy](dest_cb + (dest_offset >> 1), ptr,  uvlinesize, h >> 1);
     
     ptr = ref_picture[2] + offset;
     if(emu){
-        emulated_edge_mc(s, ptr,  s->uvlinesize, 9, (h>>1)+1, src_x, src_y, s->h_edge_pos>>1, v_edge_pos>>1);
+        emulated_edge_mc(s, ptr,  uvlinesize, 9, (h>>1)+1, src_x, src_y, s->h_edge_pos>>1, v_edge_pos>>1);
         ptr= s->edge_emu_buffer;
     }
-    pix_op[dxy](dest_cr + (dest_offset >> 1), ptr,  s->uvlinesize, h >> 1);
+    pix_op[1][dxy](dest_cr + (dest_offset >> 1), ptr,  uvlinesize, h >> 1);
 }
 
 
 static inline void MPV_motion(MpegEncContext *s, 
                               UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
                               int dir, UINT8 **ref_picture, 
-                              op_pixels_func *pix_op, qpel_mc_func *qpix_op)
+                              op_pixels_func (*pix_op)[4], qpel_mc_func (*qpix_op)[16])
 {
     int dxy, offset, mx, my, src_x, src_y, motion_x, motion_y;
     int mb_x, mb_y, i;
@@ -1209,19 +1259,10 @@ static inline void MPV_motion(MpegEncContext *s,
     switch(s->mv_type) {
     case MV_TYPE_16X16:
         if(s->mcsel){
-#if 0
-            mpeg_motion(s, dest_y, dest_cb, dest_cr, 0,
-                        ref_picture, 0,
-                        0, pix_op,
-                        s->sprite_offset[0][0]>>3,
-                        s->sprite_offset[0][1]>>3,
-                        16);
-#else
             gmc1_motion(s, dest_y, dest_cb, dest_cr, 0,
                         ref_picture, 0,
                         16);
-#endif
-        }else if(s->quarter_sample && dir==0){ //FIXME
+        }else if(s->quarter_sample){
             qpel_motion(s, dest_y, dest_cb, dest_cr, 0,
                         ref_picture, 0,
                         0, pix_op, qpix_op,
@@ -1234,42 +1275,76 @@ static inline void MPV_motion(MpegEncContext *s,
         }           
         break;
     case MV_TYPE_8X8:
-        for(i=0;i<4;i++) {
-            motion_x = s->mv[dir][i][0];
-            motion_y = s->mv[dir][i][1];
+        mx = 0;
+        my = 0;
+        if(s->quarter_sample){
+            for(i=0;i<4;i++) {
+                motion_x = s->mv[dir][i][0];
+                motion_y = s->mv[dir][i][1];
+
+                dxy = ((motion_y & 3) << 2) | (motion_x & 3);
+                src_x = mb_x * 16 + (motion_x >> 2) + (i & 1) * 8;
+                src_y = mb_y * 16 + (motion_y >> 2) + (i >>1) * 8;
+                    
+                /* WARNING: do not forget half pels */
+                src_x = clip(src_x, -16, s->width);
+                if (src_x == s->width)
+                    dxy &= ~3;
+                src_y = clip(src_y, -16, s->height);
+                if (src_y == s->height)
+                    dxy &= ~12;
+                    
+                ptr = ref_picture[0] + (src_y * s->linesize) + (src_x);
+                if(s->flags&CODEC_FLAG_EMU_EDGE){
+                    if(src_x<0 || src_y<0 || src_x + (motion_x&3) + 8 > s->h_edge_pos
+                                          || src_y + (motion_y&3) + 8 > s->v_edge_pos){
+                        emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
+                        ptr= s->edge_emu_buffer;
+                    }
+                }
+                dest = dest_y + ((i & 1) * 8) + (i >> 1) * 8 * s->linesize;
+                qpix_op[1][dxy](dest, ptr, s->linesize);
 
-            dxy = ((motion_y & 1) << 1) | (motion_x & 1);
-            src_x = mb_x * 16 + (motion_x >> 1) + (i & 1) * 8;
-            src_y = mb_y * 16 + (motion_y >> 1) + (i >>1) * 8;
+                mx += s->mv[dir][i][0]/2;
+                my += s->mv[dir][i][1]/2;
+            }
+        }else{
+            for(i=0;i<4;i++) {
+                motion_x = s->mv[dir][i][0];
+                motion_y = s->mv[dir][i][1];
+
+                dxy = ((motion_y & 1) << 1) | (motion_x & 1);
+                src_x = mb_x * 16 + (motion_x >> 1) + (i & 1) * 8;
+                src_y = mb_y * 16 + (motion_y >> 1) + (i >>1) * 8;
                     
-            /* WARNING: do no forget half pels */
-            src_x = clip(src_x, -16, s->width);
-            if (src_x == s->width)
-                dxy &= ~1;
-            src_y = clip(src_y, -16, s->height);
-            if (src_y == s->height)
-                dxy &= ~2;
+                /* WARNING: do not forget half pels */
+                src_x = clip(src_x, -16, s->width);
+                if (src_x == s->width)
+                    dxy &= ~1;
+                src_y = clip(src_y, -16, s->height);
+                if (src_y == s->height)
+                    dxy &= ~2;
                     
-            ptr = ref_picture[0] + (src_y * s->linesize) + (src_x);
-            if(s->flags&CODEC_FLAG_EMU_EDGE){
-                if(src_x<0 || src_y<0 || src_x + (motion_x&1) + 8 > s->h_edge_pos
-                                      || src_y + (motion_y&1) + 8 > s->v_edge_pos){
-                    emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
-                    ptr= s->edge_emu_buffer;
+                ptr = ref_picture[0] + (src_y * s->linesize) + (src_x);
+                if(s->flags&CODEC_FLAG_EMU_EDGE){
+                    if(src_x<0 || src_y<0 || src_x + (motion_x&1) + 8 > s->h_edge_pos
+                                          || src_y + (motion_y&1) + 8 > s->v_edge_pos){
+                        emulated_edge_mc(s, ptr, s->linesize, 9, 9, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
+                        ptr= s->edge_emu_buffer;
+                    }
                 }
+                dest = dest_y + ((i & 1) * 8) + (i >> 1) * 8 * s->linesize;
+                pix_op[1][dxy](dest, ptr, s->linesize, 8);
+
+                mx += s->mv[dir][i][0];
+                my += s->mv[dir][i][1];
             }
-            dest = dest_y + ((i & 1) * 8) + (i >> 1) * 8 * s->linesize;
-            pix_op[dxy](dest, ptr, s->linesize, 8);
         }
-    
+
         if(s->flags&CODEC_FLAG_GRAY) break;
         /* In case of 8X8, we construct a single chroma motion vector
            with a special rounding */
-        mx = 0;
-        my = 0;
         for(i=0;i<4;i++) {
-            mx += s->mv[dir][i][0];
-            my += s->mv[dir][i][1];
         }
         if (mx >= 0)
             mx = (h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
@@ -1306,27 +1381,40 @@ static inline void MPV_motion(MpegEncContext *s,
                     emu=1;
                 }
             }
-        pix_op[dxy](dest_cb, ptr, s->uvlinesize, 8);
+        pix_op[1][dxy](dest_cb, ptr, s->uvlinesize, 8);
 
         ptr = ref_picture[2] + offset;
         if(emu){
             emulated_edge_mc(s, ptr, s->uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
             ptr= s->edge_emu_buffer;
         }
-        pix_op[dxy](dest_cr, ptr, s->uvlinesize, 8);
+        pix_op[1][dxy](dest_cr, ptr, s->uvlinesize, 8);
         break;
     case MV_TYPE_FIELD:
         if (s->picture_structure == PICT_FRAME) {
-            /* top field */
-            mpeg_motion(s, dest_y, dest_cb, dest_cr, 0,
-                        ref_picture, s->field_select[dir][0] ? s->linesize : 0,
-                        1, pix_op,
-                        s->mv[dir][0][0], s->mv[dir][0][1], 8);
-            /* bottom field */
-            mpeg_motion(s, dest_y, dest_cb, dest_cr, s->linesize,
-                        ref_picture, s->field_select[dir][1] ? s->linesize : 0,
-                        1, pix_op,
-                        s->mv[dir][1][0], s->mv[dir][1][1], 8);
+            if(s->quarter_sample){
+                /* top field */
+                qpel_motion(s, dest_y, dest_cb, dest_cr, 0,
+                            ref_picture, s->field_select[dir][0] ? s->linesize : 0,
+                            1, pix_op, qpix_op,
+                            s->mv[dir][0][0], s->mv[dir][0][1], 8);
+                /* bottom field */
+                qpel_motion(s, dest_y, dest_cb, dest_cr, s->linesize,
+                            ref_picture, s->field_select[dir][1] ? s->linesize : 0,
+                            1, pix_op, qpix_op,
+                            s->mv[dir][1][0], s->mv[dir][1][1], 8);
+            }else{
+                /* top field */       
+                mpeg_motion(s, dest_y, dest_cb, dest_cr, 0,
+                            ref_picture, s->field_select[dir][0] ? s->linesize : 0,
+                            1, pix_op,
+                            s->mv[dir][0][0], s->mv[dir][0][1], 8);
+                /* bottom field */
+                mpeg_motion(s, dest_y, dest_cb, dest_cr, s->linesize,
+                            ref_picture, s->field_select[dir][1] ? s->linesize : 0,
+                            1, pix_op,
+                            s->mv[dir][1][0], s->mv[dir][1][1], 8);
+            }
         } else {
             
 
@@ -1441,18 +1529,36 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
 
     /* update motion predictor, not for B-frames as they need the motion_val from the last P/S-Frame */
     if (s->out_format == FMT_H263 && s->pict_type!=B_TYPE) { //FIXME move into h263.c if possible, format specific stuff shouldnt be here
-        int motion_x, motion_y;
         
         const int wrap = s->block_wrap[0];
         const int xy = s->block_index[0];
-        if (s->mb_intra) {
-            motion_x = 0;
-            motion_y = 0;
-            goto motion_init;
-        } else if (s->mv_type == MV_TYPE_16X16) {
-            motion_x = s->mv[0][0][0];
-            motion_y = s->mv[0][0][1];
-        motion_init:
+        const int mb_index= s->mb_x + s->mb_y*s->mb_width;
+        if(s->mv_type == MV_TYPE_8X8){
+            s->co_located_type_table[mb_index]= CO_LOCATED_TYPE_4MV;
+        } else {
+            int motion_x, motion_y;
+            if (s->mb_intra) {
+                motion_x = 0;
+                motion_y = 0;
+                if(s->co_located_type_table)
+                    s->co_located_type_table[mb_index]= 0;
+            } else if (s->mv_type == MV_TYPE_16X16) {
+                motion_x = s->mv[0][0][0];
+                motion_y = s->mv[0][0][1];
+                if(s->co_located_type_table)
+                    s->co_located_type_table[mb_index]= 0;
+            } else /*if (s->mv_type == MV_TYPE_FIELD)*/ {
+                int i;
+                motion_x = s->mv[0][0][0] + s->mv[0][1][0];
+                motion_y = s->mv[0][0][1] + s->mv[0][1][1];
+                motion_x = (motion_x>>1) | (motion_x&1);
+                for(i=0; i<2; i++){
+                    s->field_mv_table[mb_index][i][0]= s->mv[0][i][0];
+                    s->field_mv_table[mb_index][i][1]= s->mv[0][i][1];
+                    s->field_select_table[mb_index][i]= s->field_select[0][i];
+                }
+                s->co_located_type_table[mb_index]= CO_LOCATED_TYPE_FIELDMV;
+            }
             /* no update if 8X8 because it has been done during parsing */
             s->motion_val[xy][0] = motion_x;
             s->motion_val[xy][1] = motion_y;
@@ -1462,17 +1568,14 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             s->motion_val[xy + wrap][1] = motion_y;
             s->motion_val[xy + 1 + wrap][0] = motion_x;
             s->motion_val[xy + 1 + wrap][1] = motion_y;
-            s->non_b_mv4_table[xy]=0;
-        } else { /* 8X8 */
-            s->non_b_mv4_table[xy]=1;
         }
     }
     
     if (!(s->encoding && (s->intra_only || s->pict_type==B_TYPE))) {
         UINT8 *dest_y, *dest_cb, *dest_cr;
         int dct_linesize, dct_offset;
-        op_pixels_func *op_pix;
-        qpel_mc_func *op_qpix;
+        op_pixels_func (*op_pix)[4];
+        qpel_mc_func (*op_qpix)[16];
 
         /* avoid copy if macroblock skipped in last frame too 
            dont touch it for B-frames as they need the skip info from the next p-frame */
@@ -1511,18 +1614,16 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             if((!s->encoding) || (s->mb_type[mb_xy]&(s->mb_type[mb_xy]-1))){
                 if ((!s->no_rounding) || s->pict_type==B_TYPE){                
                     op_pix = put_pixels_tab;
-                    op_qpix= qpel_mc_rnd_tab;
+                    op_qpix= put_qpel_pixels_tab;
                 }else{
                     op_pix = put_no_rnd_pixels_tab;
-                    op_qpix= qpel_mc_no_rnd_tab;
+                    op_qpix= put_no_rnd_qpel_pixels_tab;
                 }
 
                 if (s->mv_dir & MV_DIR_FORWARD) {
                     MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
-                    if ((!s->no_rounding) || s->pict_type==B_TYPE)
-                        op_pix = avg_pixels_tab;
-                    else
-                        op_pix = avg_no_rnd_pixels_tab;
+                    op_pix = avg_pixels_tab;
+                    op_qpix= avg_qpel_pixels_tab;
                 }
                 if (s->mv_dir & MV_DIR_BACKWARD) {
                     MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix);
@@ -1571,7 +1672,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
     emms_c(); //FIXME remove
 }
 
-static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int threshold, int skip_dc)
+static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int threshold)
 {
     static const char tab[64]=
         {3,2,2,1,1,1,1,1,
@@ -1587,9 +1688,14 @@ static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int th
     int i;
     DCTELEM *block= s->block[n];
     const int last_index= s->block_last_index[n];
+    int skip_dc;
+
+    if(threshold<0){
+        skip_dc=0;
+        threshold= -threshold;
+    }else
+        skip_dc=1;
 
-    if(skip_dc) skip_dc=1;
-    
     /* are all which we could set to zero are allready zero? */
     if(last_index<=skip_dc - 1) return;
 
@@ -1685,8 +1791,8 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             get_pixels(s->block[5], ptr, wrap);
         }
     }else{
-        op_pixels_func *op_pix;
-        qpel_mc_func *op_qpix;
+        op_pixels_func (*op_pix)[4];
+        qpel_mc_func (*op_qpix)[16];
         UINT8 *dest_y, *dest_cb, *dest_cr;
         UINT8 *ptr_y, *ptr_cb, *ptr_cr;
         int wrap_y, wrap_c;
@@ -1703,18 +1809,16 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
         if ((!s->no_rounding) || s->pict_type==B_TYPE){
             op_pix = put_pixels_tab;
-            op_qpix= qpel_mc_rnd_tab;
+            op_qpix= put_qpel_pixels_tab;
         }else{
             op_pix = put_no_rnd_pixels_tab;
-            op_qpix= qpel_mc_no_rnd_tab;
+            op_qpix= put_no_rnd_qpel_pixels_tab;
         }
 
         if (s->mv_dir & MV_DIR_FORWARD) {
             MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
-           if ((!s->no_rounding) || s->pict_type==B_TYPE)
-                op_pix = avg_pixels_tab;
-            else
-                op_pix = avg_no_rnd_pixels_tab;
+            op_pix = avg_pixels_tab;
+            op_qpix= avg_qpel_pixels_tab;
         }
         if (s->mv_dir & MV_DIR_BACKWARD) {
             MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix);
@@ -1790,14 +1894,14 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     if(s->out_format==FMT_MJPEG){
         for(i=0;i<6;i++) {
             int overflow;
-            s->block_last_index[i] = dct_quantize(s, s->block[i], i, 8, &overflow);
+            s->block_last_index[i] = s->dct_quantize(s, s->block[i], i, 8, &overflow);
             if (overflow) clip_coeffs(s, s->block[i], s->block_last_index[i]);
         }
     }else{
         for(i=0;i<6;i++) {
             if(!skip_dct[i]){
                 int overflow;
-                s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale, &overflow);
+                s->block_last_index[i] = s->dct_quantize(s, s->block[i], i, s->qscale, &overflow);
             // FIXME we could decide to change to quantizer instead of clipping
             // JS: I don't think that would be a good idea it could lower quality instead
             //     of improve it. Just INTRADC clipping deserves changes in quantizer
@@ -1807,10 +1911,10 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         }
         if(s->luma_elim_threshold && !s->mb_intra)
             for(i=0; i<4; i++)
-                dct_single_coeff_elimination(s, i, s->luma_elim_threshold, 0);
+                dct_single_coeff_elimination(s, i, s->luma_elim_threshold);
         if(s->chroma_elim_threshold && !s->mb_intra)
             for(i=4; i<6; i++)
-                dct_single_coeff_elimination(s, i, s->chroma_elim_threshold, 1);
+                dct_single_coeff_elimination(s, i, s->chroma_elim_threshold);
     }
 
     if((s->flags&CODEC_FLAG_GRAY) && s->mb_intra){
@@ -1866,7 +1970,8 @@ static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext
     d->i_tex_bits= s->i_tex_bits;
     d->p_tex_bits= s->p_tex_bits;
     d->i_count= s->i_count;
-    d->p_count= s->p_count;
+    d->f_count= s->f_count;
+    d->b_count= s->b_count;
     d->skip_count= s->skip_count;
     d->misc_bits= s->misc_bits;
     d->last_bits= 0;
@@ -1890,7 +1995,8 @@ static inline void copy_context_after_encode(MpegEncContext *d, MpegEncContext *
     d->i_tex_bits= s->i_tex_bits;
     d->p_tex_bits= s->p_tex_bits;
     d->i_count= s->i_count;
-    d->p_count= s->p_count;
+    d->f_count= s->f_count;
+    d->b_count= s->b_count;
     d->skip_count= s->skip_count;
     d->misc_bits= s->misc_bits;
 
@@ -1973,6 +2079,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     if (s->h263_pred && !s->h263_msmpeg4)
         ff_set_mpeg4_time(s, s->picture_number); 
 
+    s->scene_change_score=0;
+
     /* Estimate motion for every MB */
     if(s->pict_type != I_TYPE){
         for(mb_y=0; mb_y < s->mb_height; mb_y++) {
@@ -2003,16 +2111,34 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         memset(s->motion_val[0], 0, sizeof(INT16)*(s->mb_width*2 + 2)*(s->mb_height*2 + 2)*2);
         memset(s->p_mv_table   , 0, sizeof(INT16)*(s->mb_width+2)*(s->mb_height+2)*2);
         memset(s->mb_type      , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
-    }
+        
+        if(!s->fixed_qscale){
+            /* finding spatial complexity for I-frame rate control */
+            for(mb_y=0; mb_y < s->mb_height; mb_y++) {
+                for(mb_x=0; mb_x < s->mb_width; mb_x++) {
+                    int xx = mb_x * 16;
+                    int yy = mb_y * 16;
+                    uint8_t *pix = s->new_picture[0] + (yy * s->linesize) + xx;
+                    int varc;
+                    int sum = pix_sum(pix, s->linesize);
+    
+                    sum= (sum+8)>>4;
+                    varc = (pix_norm1(pix, s->linesize) - sum*sum + 500 + 128)>>8;
 
-    if(s->mb_var_sum < s->mc_mb_var_sum && s->pict_type == P_TYPE){ //FIXME subtract MV bits
+                    s->mb_var[s->mb_width * mb_y + mb_x] = varc;
+                    s->mb_var_sum    += varc;
+                }
+            }
+        }
+    }
+    if(s->scene_change_score > 0 && s->pict_type == P_TYPE){
         s->pict_type= I_TYPE;
         memset(s->mb_type   , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
         if(s->max_b_frames==0){
             s->input_pict_type= I_TYPE;
             s->input_picture_in_gop_number=0;
         }
-//printf("Scene change detected, encoding as I Frame\n");
+//printf("Scene change detected, encoding as I Frame %d %d\n", s->mb_var_sum, s->mc_mb_var_sum);
     }
     
     if(s->pict_type==P_TYPE || s->pict_type==S_TYPE) 
@@ -2031,9 +2157,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
 //printf("f_code %d ///\n", s->f_code);
 
 //    printf("%d %d\n", s->avg_mb_var, s->mc_mb_var);
-    if(s->flags&CODEC_FLAG_PASS2)
-        s->qscale = ff_rate_estimate_qscale_pass2(s);
-    else if (!s->fixed_qscale) 
+    if (!s->fixed_qscale) 
         s->qscale = ff_rate_estimate_qscale(s);
 
     if (s->out_format == FMT_MJPEG) {
@@ -2041,7 +2165,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         s->intra_matrix[0] = ff_mpeg1_default_intra_matrix[0];
         for(i=1;i<64;i++)
             s->intra_matrix[i] = CLAMP_TO_8BIT((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
-        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, 
+        convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16, 
                        s->q_intra_matrix16_bias, s->intra_matrix, s->intra_quant_bias);
     }
 
@@ -2072,7 +2196,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     s->i_tex_bits=0;
     s->p_tex_bits=0;
     s->i_count=0;
-    s->p_count=0;
+    s->f_count=0;
+    s->b_count=0;
     s->skip_count=0;
 
     /* init last dc values */
@@ -2406,7 +2531,7 @@ static int dct_quantize_c(MpegEncContext *s,
     int max=0;
     unsigned int threshold1, threshold2;
     
-    av_fdct (block);
+    s->fdct (block);
 
     /* we need this permutation so that we correct the IDCT
        permutation. will be moved into DCT code */
@@ -2428,15 +2553,15 @@ static int dct_quantize_c(MpegEncContext *s,
         i = 1;
         last_non_zero = 0;
         qmat = s->q_intra_matrix[qscale];
-        bias= s->intra_quant_bias<<(QMAT_SHIFT - 3 - QUANT_BIAS_SHIFT);
+        bias= s->intra_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
     } else {
         i = 0;
         last_non_zero = -1;
         qmat = s->q_inter_matrix[qscale];
-        bias= s->inter_quant_bias<<(QMAT_SHIFT - 3 - QUANT_BIAS_SHIFT);
+        bias= s->inter_quant_bias<<(QMAT_SHIFT - QUANT_BIAS_SHIFT);
     }
-    threshold1= (1<<(QMAT_SHIFT - 3)) - bias - 1;
-    threshold2= threshold1<<1;
+    threshold1= (1<<QMAT_SHIFT) - bias - 1;
+    threshold2= (threshold1<<1);
 
     for(;i<64;i++) {
         j = zigzag_direct[i];
@@ -2447,10 +2572,10 @@ static int dct_quantize_c(MpegEncContext *s,
 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
         if(((unsigned)(level+threshold1))>threshold2){
             if(level>0){
-                level= (bias + level)>>(QMAT_SHIFT - 3);
+                level= (bias + level)>>QMAT_SHIFT;
                 block[j]= level;
             }else{
-                level= (bias - level)>>(QMAT_SHIFT - 3);
+                level= (bias - level)>>QMAT_SHIFT;
                 block[j]= -level;
             }
             max |=level;
@@ -2669,7 +2794,7 @@ void ff_conceal_past_errors(MpegEncContext *s, int unknown_pos)
     int i, intra_count=0, inter_count=0;
     int intra_conceal= s->msmpeg4_version ? 50 : 50; //FIXME finetune
     int inter_conceal= s->msmpeg4_version ? 50 : 50;
-    
+
     // for last block
     if(mb_x>=s->mb_width)  mb_x= s->mb_width -1;
     if(mb_y>=s->mb_height) mb_y= s->mb_height-1;
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 0f4983f49..2caccce8a 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -30,8 +30,8 @@ enum OutputFormat {
 
 #define MPEG_BUF_SIZE (16 * 1024)
 
-#define QMAT_SHIFT_MMX 19
-#define QMAT_SHIFT 25
+#define QMAT_SHIFT_MMX 16
+#define QMAT_SHIFT 22
 
 #define MAX_FCODE 7
 #define MAX_MV 2048
@@ -61,12 +61,34 @@ typedef struct RateControlEntry{
     UINT64 expected_bits;
     int new_pict_type;
     float new_qscale;
+    int mc_mb_var_sum;
+    int mb_var_sum;
+    int i_count;
+    int f_code;
+    int b_code;
 }RateControlEntry;
 
 typedef struct RateControlContext{
     FILE *stats_file;
-    int num_entries;
+    int num_entries;          /* number of RateControlEntries */
     RateControlEntry *entry;
+    int buffer_index;         /* amount of bits in the video/audio buffer */
+    Predictor pred[5];
+    double short_term_qsum;   /* sum of recent qscales */
+    double short_term_qcount; /* count of recent qscales */
+    double pass1_bits;        /* bits outputted by the pass1 code (including complexity init) */
+    double pass1_wanted_bits; /* bits which should have been output by the pass1 code (including complexity init) */
+    double last_qscale;
+    double last_qscale_for[5]; /* last qscale for a specific pict type */
+    double next_non_b_qscale;
+    double next_p_qscale;
+    int last_mc_mb_var_sum;
+    int last_mb_var_sum;
+    UINT64 i_cplx_sum[5];
+    UINT64 p_cplx_sum[5];
+    UINT64 mv_bits_sum[5];
+    UINT64 qscale_sum[5];
+    int frame_count[5];
 }RateControlContext;
 
 typedef struct ReorderBuffer{
@@ -107,9 +129,6 @@ typedef struct MpegEncContext {
     int flags;        /* AVCodecContext.flags (HQ, MV4, ...) */
     int force_input_type;/* 0= no force, otherwise I_TYPE, P_TYPE, ... */
     int max_b_frames; /* max number of b-frames for encoding */
-    float b_quant_factor;/* qscale factor between ips and b frames */
-    float b_quant_offset;/* qscale offset between ips and b frames */
-    int rc_strategy;
     int b_frame_strategy;
     int luma_elim_threshold;
     int chroma_elim_threshold;
@@ -170,8 +189,8 @@ typedef struct MpegEncContext {
     int input_pict_type;        /* pict_type prior to reordering of frames */
     int force_type;             /* 0= no force, otherwise I_TYPE, P_TYPE, ... */
     int qscale;                 /* QP */
-    int last_non_b_qscale;	/* QP of last non b frame used for b frame qscale*/
     int pict_type;              /* I_TYPE, P_TYPE, B_TYPE, ... */
+    int last_pict_type;
     int last_non_b_pict_type;   /* used for mpeg4 gmc b-frames & ratecontrol */
     int frame_rate_index;
     /* motion compensation */
@@ -195,6 +214,7 @@ typedef struct MpegEncContext {
     uint16_t *me_score_map;            /* map to store the SADs */
     int me_map_generation;
     int skip_me;                       /* set if ME is skiped for the current MB */
+    int scene_change_score;
     int mv_dir;
 #define MV_DIR_BACKWARD  1
 #define MV_DIR_FORWARD   2
@@ -270,18 +290,10 @@ typedef struct MpegEncContext {
     int I_frame_bits; //FIXME used in mpeg12 ...
     int mb_var_sum;          /* sum of MB variance for current frame */
     int mc_mb_var_sum;       /* motion compensated MB variance for current frame */
-    int last_non_b_mc_mb_var;/* motion compensated MB variance for last non b frame */
     INT64 wanted_bits;
     INT64 total_bits;
     int frame_bits;        /* bits used for the current frame */
-    int pb_frame_bits;     /* bits of the last b...bp group */
-    Predictor i_pred;
-    Predictor p_pred;
-    double qsum;         /* sum of qscales */
-    double qcount;       /* count of qscales */
-    double short_term_qsum;   /* sum of recent qscales */
-    double short_term_qcount; /* count of recent qscales */
-    RateControlContext rc_context;
+    RateControlContext rc_context; // contains stuff only accessed in ratecontrol.c
 
     /* statistics, used for 2-pass encoding */
     int mv_bits;
@@ -289,7 +301,8 @@ typedef struct MpegEncContext {
     int i_tex_bits;
     int p_tex_bits;
     int i_count;
-    int p_count;
+    int f_count;
+    int b_count;
     int skip_count;
     int misc_bits; // cbp, mb_type
     int last_bits; //temp var used for calculating the above vars
@@ -326,7 +339,9 @@ typedef struct MpegEncContext {
     INT64 time;                   /* time of current frame */ 
     INT64 last_non_b_time;
     UINT16 pp_time;               /* time distance between the last 2 p,s,i frames */
-    UINT16 bp_time;               /* time distance between the last b and p,s,i frame */
+    UINT16 pb_time;               /* time distance between the last b and p,s,i frame */
+    UINT16 pp_field_time;
+    UINT16 pb_field_time;         /* like above, just for interlaced */
     int shape;
     int vol_sprite_usage;
     int sprite_width;
@@ -348,6 +363,8 @@ typedef struct MpegEncContext {
     int new_pred;
     int reduced_res_vop;
     int aspect_ratio_info;
+    int aspected_width;
+    int aspected_height;
     int sprite_warping_accuracy;
     int low_latency_sprite;
     int data_partitioning;
@@ -362,7 +379,12 @@ typedef struct MpegEncContext {
     uint8_t *tex_pb_buffer;          
     uint8_t *pb2_buffer;
     int mpeg_quant;
-    INT8 *non_b_mv4_table;
+#define CO_LOCATED_TYPE_4MV     1
+#define CO_LOCATED_TYPE_FIELDMV 2
+    INT8 *co_located_type_table;     /* 4mv & field_mv info for next b frame */
+    INT16 (*field_mv_table)[2][2];   /* used for interlaced b frame decoding */
+    INT8 (*field_select_table)[2];   /* per-MB copy of field_select[0][], kept for interlaced b frames */
+    int t_frame;                     /* time distance of first I -> B, used for interlaced b frames */
 
     /* divx specific, used to workaround (many) bugs in divx5 */
     int divx_version;
@@ -401,9 +423,8 @@ typedef struct MpegEncContext {
     UINT8 *intra_v_scantable;
     UINT8 *intra_h_scantable;
     /* [mb_intra][isChroma][level][run][last] */
-    int ac_stats[2][2][MAX_LEVEL+1][MAX_RUN+1][2];
+    int (*ac_stats)[2][MAX_LEVEL+1][MAX_RUN+1][2];
     int inter_intra_pred;
-    
 
     /* decompression specific */
     GetBitContext gb;
@@ -452,6 +473,8 @@ typedef struct MpegEncContext {
                            DCTELEM *block, int n, int qscale);
     void (*dct_unquantize)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both)
                            DCTELEM *block, int n, int qscale);
+    int (*dct_quantize)(struct MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
+    void (*fdct)(DCTELEM *block);
 } MpegEncContext;
 
 int MPV_common_init(MpegEncContext *s);
@@ -465,12 +488,16 @@ void MPV_common_init_mmx(MpegEncContext *s);
 #ifdef ARCH_ALPHA
 void MPV_common_init_axp(MpegEncContext *s);
 #endif
-extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
+#ifdef HAVE_MLIB
+void MPV_common_init_mlib(MpegEncContext *s);
+#endif
 extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
 void ff_conceal_past_errors(MpegEncContext *s, int conceal_all);
 void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length);
 void ff_clean_intra_table_entries(MpegEncContext *s);
 
+extern int ff_bit_exact;
+
 /* motion_est.c */
 void ff_estimate_p_frame_motion(MpegEncContext * s,
                              int mb_x, int mb_y);
@@ -589,5 +616,10 @@ int ff_rate_estimate_qscale(MpegEncContext *s);
 int ff_rate_estimate_qscale_pass2(MpegEncContext *s);
 void ff_write_pass1_stats(MpegEncContext *s);
 void ff_rate_control_uninit(MpegEncContext *s);
+double ff_eval(char *s, double *const_value, char **const_name,
+               double (**func1)(void *, double), char **func1_name, 
+               double (**func2)(void *, double, double), char **func2_name,
+               void *opaque);
+
 
 #endif /* AVCODEC_MPEGVIDEO_H */
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 1c53b8d0d..6972ae806 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -60,11 +60,12 @@ static int msmpeg4_decode_motion(MpegEncContext * s,
 static void msmpeg4v2_encode_motion(MpegEncContext * s, int val);
 static void init_h263_dc_for_msmpeg4(void);
 static inline void msmpeg4_memsetw(short *tab, int val, int n);
-
+static int get_size_of_code(MpegEncContext * s, RLTable *rl, int last, int run, int level, int intra);
 
 
 extern UINT32 inverse[256];
 
+
 #ifdef DEBUG
 int intra_count = 0;
 int frame_count = 0;
@@ -72,6 +73,8 @@ int frame_count = 0;
 
 #include "msmpeg4data.h"
 
+static int rl_length[2][NB_RL_TABLES][MAX_LEVEL+1][MAX_RUN+1][2];
+
 #ifdef STATS
 
 const char *st_names[ST_NB] = {
@@ -187,7 +190,6 @@ static void common_init(MpegEncContext * s)
                 wmv1_scantable[i][k]= block_permute_op(j);
             }
         }
-
     }
 }
 
@@ -236,6 +238,20 @@ void ff_msmpeg4_encode_init(MpegEncContext *s)
         init_mv_table(&mv_tables[1]);
         for(i=0;i<NB_RL_TABLES;i++)
             init_rl(&rl_table[i]);
+
+        for(i=0; i<NB_RL_TABLES; i++){
+            int level;
+            for(level=0; level<=MAX_LEVEL; level++){
+                int run;
+                for(run=0; run<=MAX_RUN; run++){
+                    int last;
+                    for(last=0; last<2; last++){
+                        rl_length[0][i][level][run][last]= get_size_of_code(s, &rl_table[  i], last, run, level,0);
+                        rl_length[1][i][level][run][last]= get_size_of_code(s, &rl_table[  i], last, run, level,1);
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -285,8 +301,7 @@ static void find_best_tables(MpegEncContext * s)
     int i;
     int best       =-1, best_size       =9999999;
     int chroma_best=-1, best_chroma_size=9999999;
-    int last_size=0;
-    
+
     for(i=0; i<3; i++){
         int level;
         int chroma_size=0;
@@ -300,20 +315,22 @@ static void find_best_tables(MpegEncContext * s)
             int run;
             for(run=0; run<=MAX_RUN; run++){
                 int last;
+                const int last_size= size + chroma_size;
                 for(last=0; last<2; last++){
                     int inter_count       = s->ac_stats[0][0][level][run][last] + s->ac_stats[0][1][level][run][last];
                     int intra_luma_count  = s->ac_stats[1][0][level][run][last];
                     int intra_chroma_count= s->ac_stats[1][1][level][run][last];
-
+                    
                     if(s->pict_type==I_TYPE){
-                        size       += intra_luma_count  *get_size_of_code(s, &rl_table[  i], last, run, level,1);
-                        chroma_size+= intra_chroma_count*get_size_of_code(s, &rl_table[3+i], last, run, level,1);
+                        size       += intra_luma_count  *rl_length[1][i  ][level][run][last];
+                        chroma_size+= intra_chroma_count*rl_length[1][i+3][level][run][last];
                     }else{
-                        size+=        intra_luma_count  *get_size_of_code(s, &rl_table[  i], last, run, level,1)
-                                     +intra_chroma_count*get_size_of_code(s, &rl_table[3+i], last, run, level,1)
-                                     +inter_count       *get_size_of_code(s, &rl_table[3+i], last, run, level,0);
+                        size+=        intra_luma_count  *rl_length[1][i  ][level][run][last]
+                                     +intra_chroma_count*rl_length[1][i+3][level][run][last]
+                                     +inter_count       *rl_length[0][i+3][level][run][last];
                     }                   
                 }
+                if(last_size == size+chroma_size) break;
             }
         }
         if(size<best_size){
@@ -325,6 +342,7 @@ static void find_best_tables(MpegEncContext * s)
             chroma_best= i;
         }
     }
+
 //    printf("type:%d, best:%d, qp:%d, var:%d, mcvar:%d, size:%d //\n", 
 //           s->pict_type, best, s->qscale, s->mb_var_sum, s->mc_mb_var_sum, best_size);
            
@@ -952,6 +970,7 @@ static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int
 		sign = 1;
 		level = -level;
 	    }
+
             if(level<=MAX_LEVEL && run<=MAX_RUN){
                 s->ac_stats[s->mb_intra][n>3][level][run][last]++;
             }
diff --git a/src/libffmpeg/libavcodec/ppc/Makefile.am b/src/libffmpeg/libavcodec/ppc/Makefile.am
new file mode 100644
index 000000000..34d885c6d
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/Makefile.am
@@ -0,0 +1,46 @@
+##
+## Process this file with automake to produce Makefile.in
+##
+
+#CFLAGS = @CFLAGS@ @LIBFFMPEG_CFLAGS@
+
+# we must not use CFLAGS here, gcc optimizations produce
+# bad code if we do so.
+CFLAGS = -O2 $(LIBFFMPEG_CFLAGS)
+
+ASFLAGS =
+
+LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic
+
+noinst_LTLIBRARIES = libavcodec_ppc.la
+
+libavcodec_ppc_src =  dsputil_altivec.c dsputil_ppc.c
+libavcodec_ppc_dummy = libavcodec_ppc_dummy.c
+
+EXTRA_DIST =  $(libavcodec_ppc_src) $(libavcodec_ppc_dummy)
+
+if ARCH_POWERPC
+ppc_modules = $(libavcodec_ppc_src)
+endif
+
+
+libavcodec_ppc_la_SOURCES = $(ppc_modules) $(libavcodec_ppc_dummy)
+
+noinst_HEADERS = dsputil_altivec.h
+
+.s.lo:
+	$(ASCOMPILE) -o $@ `test -f $< || echo '$(srcdir)/'`$<
+
+debug:
+	@$(MAKE) CFLAGS="$(DEBUG_CFLAGS) $(LIBFFMPEG_CFLAGS)"
+
+install-debug: debug
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+mostlyclean-generic:
+	-rm -f *~ \#* .*~ .\#*
+
+maintainer-clean-generic:
+	-@echo "This command is intended for maintainers to use;"
+	-@echo "it deletes files that may require special tools to rebuild."
+	-rm -f Makefile.in
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
new file mode 100644
index 000000000..18d9d27a4
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
@@ -0,0 +1,143 @@
+#include "../dsputil.h"
+
+#if CONFIG_DARWIN
+#include <sys/sysctl.h>
+#endif
+
+int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
+int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
+int pix_sum_altivec(UINT8 * pix, int line_size);
+
+int has_altivec(void);
+
+int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int i, s;
+    vector unsigned char perm1, perm2, *pix1v, *pix2v;
+    vector unsigned char t1, t2, t3,t4, t5;
+    vector unsigned int sad, zero;
+    vector signed int sumdiffs;
+    
+    zero = (vector unsigned int) (0);
+    sad = (vector unsigned int) (0);
+
+
+    for(i=0;i<16;i++) {
+	/* Read potentially unaligned pixels into t1 and t2 */
+        perm1 = vec_lvsl(0, pix1);
+        pix1v = (vector unsigned char *) pix1;
+        perm2 = vec_lvsl(0, pix2);
+        pix2v = (vector unsigned char *) pix2;
+        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
+        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
+       
+	/* Calculate a sum of abs differences vector */ 
+        t3 = vec_max(t1, t2);
+        t4 = vec_min(t1, t2);
+        t5 = vec_sub(t3, t4);
+	
+	/* Add each 4 pixel group together and put 4 results into sad */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+    
+    return s;
+}
+
+int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int i, s;
+    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
+    vector unsigned char t1, t2, t3,t4, t5;
+    vector unsigned int sad, zero;
+    vector signed int sumdiffs;
+
+    zero = (vector unsigned int) (0);
+    sad = (vector unsigned int) (0);
+    permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
+
+    for(i=0;i<8;i++) {
+	/* Read potentially unaligned pixels into t1 and t2
+	   Since we're reading 16 pixels, and actually only want 8,
+	   mask out the last 8 pixels. The 0s don't change the sum. */
+        perm1 = vec_lvsl(0, pix1);
+        pix1v = (vector unsigned char *) pix1;
+        perm2 = vec_lvsl(0, pix2);
+        pix2v = (vector unsigned char *) pix2;
+        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
+        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
+
+	/* Calculate a sum of abs differences vector */ 
+        t3 = vec_max(t1, t2);
+        t4 = vec_min(t1, t2);
+        t5 = vec_sub(t3, t4);
+
+	/* Add each 4 pixel group together and put 4 results into sad */
+        sad = vec_sum4s(t5, sad);
+
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+
+    /* Sum up the four partial sums, and put the result into s */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+
+    return s;
+}
+
+int pix_sum_altivec(UINT8 * pix, int line_size)
+{
+
+    vector unsigned char perm, *pixv;
+    vector unsigned char t1;
+    vector unsigned int sad, zero;
+    vector signed int sumdiffs;
+
+    int s, i;
+
+    zero = (vector unsigned int) (0);
+    sad = (vector unsigned int) (0);
+    
+    for (i = 0; i < 16; i++) {
+	/* Read the potentially unaligned 16 pixels into t1 */
+        perm = vec_lvsl(0, pix);
+        pixv = (vector unsigned char *) pix;
+        t1 = vec_perm(pixv[0], pixv[1], perm);
+
+	/* Add each 4 pixel group together and put 4 results into sad */
+        sad = vec_sum4s(t1, sad);
+        
+        pix += line_size;
+    }
+    
+    /* Sum up the four partial sums, and put the result into s */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3);
+    vec_ste(sumdiffs, 0, &s);
+    
+    return s;
+}
+
+int has_altivec(void)
+{
+#if CONFIG_DARWIN
+    int sels[2] = {CTL_HW, HW_VECTORUNIT};
+    int has_vu = 0;
+    size_t len = sizeof(has_vu);
+    int err;
+
+    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+    if (err == 0) return (has_vu != 0);
+#endif
+    return 0;
+}
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
new file mode 100644
index 000000000..42c373e76
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h
@@ -0,0 +1,5 @@
+extern int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
+extern int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size);
+extern int pix_sum_altivec(UINT8 * pix, int line_size);
+
+extern int has_altivec(void);
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
new file mode 100644
index 000000000..1311cc61b
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
@@ -0,0 +1,20 @@
+#include "../../config.h"
+#include "../dsputil.h"
+
+#ifdef HAVE_ALTIVEC
+#include "dsputil_altivec.h"
+#endif
+
+void dsputil_init_ppc(void)
+{
+#if HAVE_ALTIVEC
+    if (has_altivec()) {
+        pix_abs16x16 = pix_abs16x16_altivec;
+        pix_abs8x8 = pix_abs8x8_altivec;
+        pix_sum = pix_sum_altivec;
+    } else
+#endif
+    {
+        /* Non-AltiVec PPC optimisations here */
+    }
+}
diff --git a/src/libffmpeg/libavcodec/ppc/libavcodec_ppc_dummy.c b/src/libffmpeg/libavcodec/ppc/libavcodec_ppc_dummy.c
new file mode 100644
index 000000000..506a55beb
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ppc/libavcodec_ppc_dummy.c
@@ -0,0 +1,2 @@
+
+char libavcodec_ppc_dummy;
diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c
index 8395eefad..77af3c93c 100644
--- a/src/libffmpeg/libavcodec/ratecontrol.c
+++ b/src/libffmpeg/libavcodec/ratecontrol.c
@@ -17,84 +17,162 @@
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#include <math.h>
+#include "common.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
 
-#define STATS_FILE "lavc_stats.txt"
+#undef NDEBUG // always check asserts, the speed effect is far too small to disable them
+#include <assert.h>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#ifndef M_E
+#define M_E 2.718281828
+#endif
 
 static int init_pass2(MpegEncContext *s);
+static double get_qscale(MpegEncContext *s, RateControlEntry *rce, double rate_factor, int frame_num);
 
 void ff_write_pass1_stats(MpegEncContext *s){
-    RateControlContext *rcc= &s->rc_context;
-//    fprintf(c->stats_file, "type:%d q:%d icount:%d pcount:%d scount:%d itex:%d ptex%d mv:%d misc:%d fcode:%d bcode:%d\")
-    fprintf(rcc->stats_file, "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d\n",
+    sprintf(s->avctx->stats_out, "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d;\n",
             s->picture_number, s->input_picture_number - s->max_b_frames, s->pict_type, 
-            s->qscale, s->i_tex_bits, s->p_tex_bits, s->mv_bits, s->misc_bits, s->f_code, s->b_code);
+            s->qscale, s->i_tex_bits, s->p_tex_bits, s->mv_bits, s->misc_bits, 
+            s->f_code, s->b_code, s->mc_mb_var_sum, s->mb_var_sum, s->i_count);
 }
 
 int ff_rate_control_init(MpegEncContext *s)
 {
     RateControlContext *rcc= &s->rc_context;
+    int i;
     emms_c();
 
-    if(s->flags&CODEC_FLAG_PASS1){
-        rcc->stats_file= fopen(STATS_FILE, "w");
-        if(!rcc->stats_file){
-            fprintf(stderr, "failed to open " STATS_FILE "\n");
-            return -1;
-        }
-    } else if(s->flags&CODEC_FLAG_PASS2){
-        int size;
+    for(i=0; i<5; i++){
+        rcc->pred[i].coeff= 7.0;
+        rcc->pred[i].count= 1.0;
+    
+        rcc->pred[i].decay= 0.4;
+        rcc->i_cplx_sum [i]=
+        rcc->p_cplx_sum [i]=
+        rcc->mv_bits_sum[i]=
+        rcc->qscale_sum [i]=
+        rcc->frame_count[i]= 1; // 1 is better cuz of 1/0 and such
+        rcc->last_qscale_for[i]=5;
+    }
+    rcc->buffer_index= s->avctx->rc_buffer_size/2;
+
+    rcc->next_non_b_qscale=10;
+    rcc->next_p_qscale=10;
+    
+    if(s->flags&CODEC_FLAG_PASS2){
         int i;
+        char *p;
 
-        rcc->stats_file= fopen(STATS_FILE, "r");
-        if(!rcc->stats_file){
-            fprintf(stderr, "failed to open " STATS_FILE "\n");
-            return -1;
+        /* find number of pics */
+        p= s->avctx->stats_in;
+        for(i=-1; p; i++){
+            p= strchr(p+1, ';');
         }
-
-        /* find number of pics without reading the file twice :) */
-        fseek(rcc->stats_file, 0, SEEK_END);
-        size= ftell(rcc->stats_file);
-        fseek(rcc->stats_file, 0, SEEK_SET);
-
-        size/= 64; // we need at least 64 byte to store a line ...
-        rcc->entry = (RateControlEntry*)av_mallocz(size*sizeof(RateControlEntry));
-
-        for(i=0; !feof(rcc->stats_file); i++){
+        i+= s->max_b_frames;
+        rcc->entry = (RateControlEntry*)av_mallocz(i*sizeof(RateControlEntry));
+        rcc->num_entries= i;
+        
+        /* init all to skipped p frames (with b frames we might have a not encoded frame at the end FIXME) */
+        for(i=0; i<rcc->num_entries; i++){
+            RateControlEntry *rce= &rcc->entry[i];
+            rce->pict_type= rce->new_pict_type=P_TYPE;
+            rce->qscale= rce->new_qscale=2;
+            rce->misc_bits= s->mb_num + 10;
+            rce->mb_var_sum= s->mb_num*100;
+        }        
+        
+        /* read stats */
+        p= s->avctx->stats_in;
+        for(i=0; i<rcc->num_entries - s->max_b_frames; i++){
             RateControlEntry *rce;
             int picture_number;
             int e;
-            
-            e= fscanf(rcc->stats_file, "in:%d ", &picture_number);
+            char *next;
+
+            next= strchr(p, ';');
+            if(next){
+                (*next)=0; //sscanf is unbelievably slow on looong strings //FIXME copy / dont write
+                next++;
+            }
+            if((e= sscanf(p, " in:%d ", &picture_number))!=1 || picture_number<0 || picture_number>=rcc->num_entries){
+                fprintf(stderr, "statistics are damaged at line %d, parser out=%d\n", i, e); /* stats_in is external input: fail like the check below instead of asserting on an unparsed value */
+                return -1;
+            }
             rce= &rcc->entry[picture_number];
-            e+=fscanf(rcc->stats_file, "out:%*d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%*d bcode:%*d\n",
-                   &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits, &rce->mv_bits, &rce->misc_bits);
-            if(e!=7){
-                fprintf(stderr, STATS_FILE " is damaged\n");
+
+            e+=sscanf(p, " in:%*d out:%*d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d",
+                   &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits, &rce->mv_bits, &rce->misc_bits, 
+                   &rce->f_code, &rce->b_code, &rce->mc_mb_var_sum, &rce->mb_var_sum, &rce->i_count);
+            if(e!=12){
+                fprintf(stderr, "statistics are damaged at line %d, parser out=%d\n", i, e);
                 return -1;
             }
+            p= next;
         }
-        rcc->num_entries= i;
         
         if(init_pass2(s) < 0) return -1;
     }
      
-    /* no 2pass stuff, just normal 1-pass */
-    //initial values, they dont really matter as they will be totally different within a few frames
-    s->i_pred.coeff= s->p_pred.coeff= 7.0;
-    s->i_pred.count= s->p_pred.count= 1.0;
-    
-    s->i_pred.decay= s->p_pred.decay= 0.4;
+    if(!(s->flags&CODEC_FLAG_PASS2)){
+
+        rcc->short_term_qsum=0.001;
+        rcc->short_term_qcount=0.001;
     
-    // use more bits at the beginning, otherwise high motion at the begin will look like shit
-    s->qsum=100 * s->qmin;
-    s->qcount=100;
+        rcc->pass1_bits       =0.001;
+        rcc->pass1_wanted_bits=0.001;
+        
+        /* init stuff with the user specified complexity */
+        if(s->avctx->rc_initial_cplx){
+            for(i=0; i<60*30; i++){
+                double bits= s->avctx->rc_initial_cplx * (i/10000.0 + 1.0)*s->mb_num;
+                RateControlEntry rce;
+                double q;
+                
+                if     (i%((s->gop_size+3)/4)==0) rce.pict_type= I_TYPE;
+                else if(i%(s->max_b_frames+1))    rce.pict_type= B_TYPE;
+                else                              rce.pict_type= P_TYPE;
+
+                rce.new_pict_type= rce.pict_type;
+                rce.mc_mb_var_sum= bits*s->mb_num/100000;
+                rce.mb_var_sum   = s->mb_num;
+                rce.qscale   = 2;
+                rce.f_code   = 2;
+                rce.b_code   = 1;
+                rce.misc_bits= 1;
+
+                if(rce.pict_type== I_TYPE){ /* use the simulated entry's type chosen above; this loop never sets s->pict_type */
+                    rce.i_count   = s->mb_num;
+                    rce.i_tex_bits= bits; /* intra entry: all texture bits are intra, no mv bits */
+                    rce.p_tex_bits= 0;
+                    rce.mv_bits= 0;
+                }else{
+                    rce.i_count   = 0; //FIXME we do know this approx
+                    rce.i_tex_bits= 0;
+                    rce.p_tex_bits= bits*0.9; /* inter entry: 90% texture, 10% motion vectors */
+                    rce.mv_bits= bits*0.1;
+                }
+                rcc->i_cplx_sum [rce.pict_type] += rce.i_tex_bits*rce.qscale;
+                rcc->p_cplx_sum [rce.pict_type] += rce.p_tex_bits*rce.qscale;
+                rcc->mv_bits_sum[rce.pict_type] += rce.mv_bits;
+                rcc->frame_count[rce.pict_type] ++;
 
-    s->short_term_qsum=0.001;
-    s->short_term_qcount=0.001;
+                bits= rce.i_tex_bits + rce.p_tex_bits;
 
+                q= get_qscale(s, &rce, rcc->pass1_wanted_bits/rcc->pass1_bits, i);
+                rcc->pass1_wanted_bits+= s->bit_rate/(s->frame_rate / (double)FRAME_RATE_BASE);
+            }
+        }
+
+    }
+    
     return 0;
 }
 
@@ -103,24 +181,257 @@ void ff_rate_control_uninit(MpegEncContext *s)
     RateControlContext *rcc= &s->rc_context;
     emms_c();
 
-    if(rcc->stats_file) 
-        fclose(rcc->stats_file);
-    rcc->stats_file = NULL;
     av_freep(&rcc->entry);
 }
 
+static inline double qp2bits(RateControlEntry *rce, double qp){
+    if(qp<=0.0){
+        fprintf(stderr, "qp<=0.0\n");
+    }
+    return rce->qscale * (double)(rce->i_tex_bits + rce->p_tex_bits+1)/ qp;
+}
+
+static inline double bits2qp(RateControlEntry *rce, double bits){
+    if(bits<0.9){
+        fprintf(stderr, "bits<0.9\n");
+    }
+    return rce->qscale * (double)(rce->i_tex_bits + rce->p_tex_bits+1)/ bits;
+}
+    
+static void update_rc_buffer(MpegEncContext *s, int frame_size){
+    RateControlContext *rcc= &s->rc_context;
+    const double fps= (double)s->frame_rate / FRAME_RATE_BASE;
+    const double buffer_size= s->avctx->rc_buffer_size;
+    const double min_rate= s->avctx->rc_min_rate/fps;
+    const double max_rate= s->avctx->rc_max_rate/fps;
+
+    if(buffer_size){
+        rcc->buffer_index-= frame_size;
+        if(rcc->buffer_index < buffer_size/2 /*FIXME /2 */ || min_rate==0){
+            rcc->buffer_index+= max_rate;
+            if(rcc->buffer_index >= buffer_size)
+                rcc->buffer_index= buffer_size-1;
+        }else{
+            rcc->buffer_index+= min_rate;
+        }
+        
+        if(rcc->buffer_index < 0)
+            fprintf(stderr, "rc buffer underflow\n");
+        if(rcc->buffer_index >= s->avctx->rc_buffer_size)
+            fprintf(stderr, "rc buffer overflow\n");
+    }
+}
+
+/**
+ * modifies the bitrate curve from pass1 for one frame
+ */
+static double get_qscale(MpegEncContext *s, RateControlEntry *rce, double rate_factor, int frame_num){
+    RateControlContext *rcc= &s->rc_context;
+    double q, bits;
+    const int pict_type= rce->new_pict_type;
+    const double mb_num= s->mb_num;  
+    int i;
+    const double last_q= rcc->last_qscale_for[pict_type];
+
+    double const_values[]={
+        M_PI,
+        M_E,
+        rce->i_tex_bits*rce->qscale,
+        rce->p_tex_bits*rce->qscale,
+        (rce->i_tex_bits + rce->p_tex_bits)*(double)rce->qscale,
+        rce->mv_bits/mb_num,
+        rce->pict_type == B_TYPE ? (rce->f_code + rce->b_code)*0.5 : rce->f_code,
+        rce->i_count/mb_num,
+        rce->mc_mb_var_sum/mb_num,
+        rce->mb_var_sum/mb_num,
+        rce->pict_type == I_TYPE,
+        rce->pict_type == P_TYPE,
+        rce->pict_type == B_TYPE,
+        rcc->qscale_sum[pict_type] / (double)rcc->frame_count[pict_type],
+        s->qcompress,
+/*        rcc->last_qscale_for[I_TYPE],
+        rcc->last_qscale_for[P_TYPE],
+        rcc->last_qscale_for[B_TYPE],
+        rcc->next_non_b_qscale,*/
+        rcc->i_cplx_sum[I_TYPE] / (double)rcc->frame_count[I_TYPE],
+        rcc->i_cplx_sum[P_TYPE] / (double)rcc->frame_count[P_TYPE],
+        rcc->p_cplx_sum[P_TYPE] / (double)rcc->frame_count[P_TYPE],
+        rcc->p_cplx_sum[B_TYPE] / (double)rcc->frame_count[B_TYPE],
+        (rcc->i_cplx_sum[pict_type] + rcc->p_cplx_sum[pict_type]) / (double)rcc->frame_count[pict_type],
+        0
+    };
+    char *const_names[]={
+        "PI",
+        "E",
+        "iTex",
+        "pTex",
+        "tex",
+        "mv",
+        "fCode",
+        "iCount",
+        "mcVar",
+        "var",
+        "isI",
+        "isP",
+        "isB",
+        "avgQP",
+        "qComp",
+/*        "lastIQP",
+        "lastPQP",
+        "lastBQP",
+        "nextNonBQP",*/
+        "avgIITex",
+        "avgPITex",
+        "avgPPTex",
+        "avgBPTex",
+        "avgTex",
+        NULL
+    };
+    static double (*func1[])(void *, double)={ /* NULL-terminated; ff_eval is handed rce as the opaque argument */
+        (double (*)(void *, double))bits2qp, /* helpers are declared with RateControlEntry*, so the conversion must be explicit */
+        (double (*)(void *, double))qp2bits,
+        NULL
+    };
+    char *func1_names[]={
+        "bits2qp",
+        "qp2bits",
+        NULL
+    };
+
+    bits= ff_eval(s->avctx->rc_eq, const_values, const_names, func1, func1_names, NULL, NULL, rce);
+    
+    rcc->pass1_bits+= bits;
+    bits*=rate_factor;
+    if(bits<0.0) bits=0.0;
+    bits+= 1.0; //avoid 1/0 issues
+    
+    /* user override */
+    for(i=0; i<s->avctx->rc_override_count; i++){
+        RcOverride *rco= s->avctx->rc_override;
+        if(rco[i].start_frame > frame_num) continue;
+        if(rco[i].end_frame   < frame_num) continue;
+    
+        if(rco[i].qscale) 
+            bits= qp2bits(rce, rco[i].qscale); //FIXME move at end to really force it?
+        else
+            bits*= rco[i].quality_factor;
+    }
+
+    q= bits2qp(rce, bits);
+    
+    /* I/B difference */
+    if     (pict_type==I_TYPE && s->avctx->i_quant_factor<0.0)
+        q= -q*s->avctx->i_quant_factor + s->avctx->i_quant_offset;
+    else if(pict_type==B_TYPE && s->avctx->b_quant_factor<0.0)
+        q= -q*s->avctx->b_quant_factor + s->avctx->b_quant_offset;
+    
+    /* last qscale / qdiff stuff */
+    if     (q > last_q + s->max_qdiff) q= last_q + s->max_qdiff;
+    else if(q < last_q - s->max_qdiff) q= last_q - s->max_qdiff;
+
+    rcc->last_qscale_for[pict_type]= q; //Note we cant do that after blurring
+    
+    return q;
+}
+
+/**
+ * gets the qmin & qmax for pict_type
+ */
+static void get_qminmax(int *qmin_ret, int *qmax_ret, MpegEncContext *s, int pict_type){
+    int qmin= s->qmin;                                                       
+    int qmax= s->qmax;
+
+    if(pict_type==B_TYPE){
+        qmin= (int)(qmin*ABS(s->avctx->b_quant_factor)+s->avctx->b_quant_offset + 0.5);
+        qmax= (int)(qmax*ABS(s->avctx->b_quant_factor)+s->avctx->b_quant_offset + 0.5);
+    }else if(pict_type==I_TYPE){
+        qmin= (int)(qmin*ABS(s->avctx->i_quant_factor)+s->avctx->i_quant_offset + 0.5);
+        qmax= (int)(qmax*ABS(s->avctx->i_quant_factor)+s->avctx->i_quant_offset + 0.5);
+    }
+
+    if(qmin<1) qmin=1;
+    if(qmin==1 && s->qmin>1) qmin=2; //avoid qmin=1 unless the user wants qmin=1
+
+    if(qmin<3 && s->max_qcoeff<=128 && pict_type==I_TYPE) qmin=3; //reduce clipping problems
+
+    if(qmax>31) qmax=31;
+    if(qmax<=qmin) qmax= qmin= (qmax+qmin+1)>>1;
+    
+    *qmin_ret= qmin;
+    *qmax_ret= qmax;
+}
+
+/** Adjusts one frame's quantizer q: optional qmod modulation, rc-buffer protection, then limiting to the qmin..qmax range of the frame's type; returns the adjusted q. */
+static double modify_qscale(MpegEncContext *s, RateControlEntry *rce, double q, int frame_num){
+    RateControlContext *rcc= &s->rc_context;
+    int qmin, qmax;
+    double bits;
+    const int pict_type= rce->new_pict_type;
+    const double buffer_size= s->avctx->rc_buffer_size;
+    const double min_rate= s->avctx->rc_min_rate, max_rate= s->avctx->rc_max_rate;
+    
+    get_qminmax(&qmin, &qmax, s, pict_type);
+
+    /* modulation: scale q on every rc_qmod_freq-th frame, P frames only */
+    if(s->avctx->rc_qmod_freq && frame_num%s->avctx->rc_qmod_freq==0 && pict_type==P_TYPE)
+        q*= s->avctx->rc_qmod_amp;
+
+    bits= qp2bits(rce, q); /* estimated frame size at this q */
+
+    /* buffer overflow/underflow protection; NOTE(review): confirm q is pushed in the intended direction in each branch */
+    if(buffer_size){
+        double expected_size= rcc->buffer_index - bits;
+
+        if(min_rate){
+            double d= 2*(buffer_size - (expected_size + min_rate))/buffer_size;
+            if(d>1.0) d=1.0; else if(d<0.0001) d=0.0001; /* keep d positive: pow() of a negative base gives NaN, of 0 gives a zero divisor */
+            q/= pow(d, 1.0/s->avctx->rc_buffer_aggressivity);
+        }
+
+        if(max_rate){
+            double d= 2*expected_size/buffer_size;
+            if(d>1.0) d=1.0; else if(d<0.0001) d=0.0001; /* expected_size can be negative when bits exceeds buffer_index */
+            q*= pow(d, 1.0/s->avctx->rc_buffer_aggressivity);
+        }
+    }
+
+    if(s->avctx->rc_qsquish==0.0 || qmin==qmax){ /* hard clip */
+        if     (q<qmin) q=qmin;
+        else if(q>qmax) q=qmax;
+    }else{ /* soft limit: logistic curve applied to log(q), result stays between qmin and qmax */
+        double min2= log(qmin);
+        double max2= log(qmax);
+        
+        q= log(q);
+        q= (q - min2)/(max2-min2) - 0.5;
+        q*= -4.0;
+        q= 1.0/(1.0 + exp(q));
+        q= q*(max2-min2) + min2;
+        
+        q= exp(q);
+    }
+
+    return q;
+}
+
 //----------------------------------
 // 1 Pass Code
 
-static double predict(Predictor *p, double q, double var)
+static double predict_size(Predictor *p, double q, double var)
 {
      return p->coeff*var / (q*p->count);
 }
 
+static double predict_qp(Predictor *p, double size, double var)
+{
+//printf("coeff:%f, count:%f, var:%f, size:%f//\n", p->coeff, p->count, var, size);
+     return p->coeff*var / (size*p->count);
+}
+
 static void update_predictor(Predictor *p, double q, double var, double size)
 {
     double new_coeff= size*q / (var + 1);
-    if(var<1000) return;
+    if(var<10) return;
 
     p->count*= p->decay;
     p->coeff*= p->decay;
@@ -130,90 +441,138 @@ static void update_predictor(Predictor *p, double q, double var, double size)
 
 int ff_rate_estimate_qscale(MpegEncContext *s)
 {
-    int qmin= s->qmin;
-    int qmax= s->qmax;
-    int rate_q=5;
     float q;
-    int qscale;
+    int qscale, qmin, qmax;
     float br_compensation;
     double diff;
     double short_term_q;
-    double long_term_q;
     double fps;
-    int picture_number= s->input_picture_number - s->max_b_frames;
+    int picture_number= s->picture_number;
     int64_t wanted_bits;
+    RateControlContext *rcc= &s->rc_context;
+    RateControlEntry local_rce, *rce;
+    double bits;
+    double rate_factor;
+    int var;
+    const int pict_type= s->pict_type;
     emms_c();
 
-    fps= (double)s->frame_rate / FRAME_RATE_BASE;
-    wanted_bits= (uint64_t)(s->bit_rate*(double)picture_number/fps);
-//    printf("%d %d %d\n", picture_number, (int)wanted_bits, (int)s->total_bits);
-    
-    if(s->pict_type==B_TYPE){
-        qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5);
-        qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5);
-    }
-    if(qmin<1) qmin=1;
-    if(qmax>31) qmax=31;
-    if(qmax<=qmin) qmax= qmin;
+    get_qminmax(&qmin, &qmax, s, pict_type);
 
+    fps= (double)s->frame_rate / FRAME_RATE_BASE;
+//printf("input_picture_number:%d picture_number:%d\n", s->input_picture_number, s->picture_number);
         /* update predictors */
     if(picture_number>2){
-        if(s->pict_type!=B_TYPE && s->last_non_b_pict_type == P_TYPE){
-//printf("%d %d %d %f\n", s->qscale, s->last_mc_mb_var, s->frame_bits, s->p_pred.coeff);
-            update_predictor(&s->p_pred, s->last_non_b_qscale, s->last_non_b_mc_mb_var, s->pb_frame_bits);
-        }
+        const int last_var= s->last_pict_type == I_TYPE ? rcc->last_mb_var_sum : rcc->last_mc_mb_var_sum;
+        update_predictor(&rcc->pred[s->last_pict_type], rcc->last_qscale, sqrt(last_var), s->frame_bits);
     }
 
-    if(s->pict_type == I_TYPE){
-        short_term_q= s->short_term_qsum/s->short_term_qcount;
-    
-        long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0
+    if(s->flags&CODEC_FLAG_PASS2){
+        assert(picture_number>=0);
+        assert(picture_number<rcc->num_entries);
+        rce= &rcc->entry[picture_number];
+        wanted_bits= rce->expected_bits;
+    }else{
+        rce= &local_rce;
+        wanted_bits= (uint64_t)(s->bit_rate*(double)picture_number/fps);
+    }
 
-        q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q);
-    }else if(s->pict_type==B_TYPE){
-        q= (int)(s->last_non_b_qscale*s->b_quant_factor+s->b_quant_offset + 0.5);
-    }else{ //P Frame
-        int i;
-        int diff, best_diff=1000000000;
-        for(i=1; i<=31; i++){
-            diff= predict(&s->p_pred, i, s->mc_mb_var_sum) - (double)s->bit_rate/fps;
-            if(diff<0) diff= -diff;
-            if(diff<best_diff){
-                best_diff= diff;
-                rate_q= i;
-            }
+    diff= s->total_bits - wanted_bits;
+    br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance;
+    if(br_compensation<=0.0) br_compensation=0.001;
+
+    var= pict_type == I_TYPE ? s->mb_var_sum : s->mc_mb_var_sum;
+    
+    if(s->flags&CODEC_FLAG_PASS2){
+        if(pict_type!=I_TYPE)
+            assert(pict_type == rce->new_pict_type);
+
+        q= rce->new_qscale / br_compensation;
+//printf("%f %f %f last:%d var:%d type:%d//\n", q, rce->new_qscale, br_compensation, s->frame_bits, var, pict_type);
+    }else{
+        rce->pict_type= 
+        rce->new_pict_type= pict_type;
+        rce->mc_mb_var_sum= s->mc_mb_var_sum;
+        rce->mb_var_sum   = s->   mb_var_sum;
+        rce->qscale   = 2;
+        rce->f_code   = s->f_code;
+        rce->b_code   = s->b_code;
+        rce->misc_bits= 1;
+
+        if(picture_number>0)
+            update_rc_buffer(s, s->frame_bits);
+
+        bits= predict_size(&rcc->pred[pict_type], rce->qscale, sqrt(var));
+        if(pict_type== I_TYPE){
+            rce->i_count   = s->mb_num;
+            rce->i_tex_bits= bits;
+            rce->p_tex_bits= 0;
+            rce->mv_bits= 0;
+        }else{
+            rce->i_count   = 0; //FIXME we do know this approx
+            rce->i_tex_bits= 0;
+            rce->p_tex_bits= bits*0.9;
+            
+            rce->mv_bits= bits*0.1;
         }
-        s->short_term_qsum*=s->qblur;
-        s->short_term_qcount*=s->qblur;
+        rcc->i_cplx_sum [pict_type] += rce->i_tex_bits*rce->qscale;
+        rcc->p_cplx_sum [pict_type] += rce->p_tex_bits*rce->qscale;
+        rcc->mv_bits_sum[pict_type] += rce->mv_bits;
+        rcc->frame_count[pict_type] ++;
 
-        s->short_term_qsum+= rate_q;
-        s->short_term_qcount++;
-        short_term_q= s->short_term_qsum/s->short_term_qcount;
+        bits= rce->i_tex_bits + rce->p_tex_bits;
+        rate_factor= rcc->pass1_wanted_bits/rcc->pass1_bits * br_compensation;
     
-        long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0
+        q= get_qscale(s, rce, rate_factor, picture_number);
+
+        assert(q>0.0);
+//printf("%f ", q);
+        if     (pict_type==I_TYPE && s->avctx->i_quant_factor>0.0)
+            q= rcc->next_p_qscale*s->avctx->i_quant_factor + s->avctx->i_quant_offset;
+        else if(pict_type==B_TYPE && s->avctx->b_quant_factor>0.0)
+            q= rcc->next_non_b_qscale*s->avctx->b_quant_factor + s->avctx->b_quant_offset;
+//printf("%f ", q);
+        assert(q>0.0);
+
+        if(pict_type==P_TYPE || s->intra_only){ //FIXME type dependant blur like in 2-pass
+            rcc->short_term_qsum*=s->qblur;
+            rcc->short_term_qcount*=s->qblur;
+
+            rcc->short_term_qsum+= q;
+            rcc->short_term_qcount++;
+//printf("%f ", q);
+            q= short_term_q= rcc->short_term_qsum/rcc->short_term_qcount;
+//printf("%f ", q);
+        }
+        q= modify_qscale(s, rce, q, picture_number);
+
+        rcc->pass1_wanted_bits+= s->bit_rate/fps;
+
+        assert(q>0.0);
 
-//    q= (long_term_q - short_term_q)*s->qcompress + short_term_q;
-        q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q);
+        if(pict_type != B_TYPE) rcc->next_non_b_qscale= q;
+        if(pict_type == P_TYPE) rcc->next_p_qscale= q;
     }
+//printf("qmin:%d, qmax:%d, q:%f\n", qmin, qmax, q);
+    
+
+    if     (q<qmin) q=qmin; 
+    else if(q>qmax) q=qmax;
+        
+//    printf("%f %d %d %d\n", q, picture_number, (int)wanted_bits, (int)s->total_bits);
+    
 
-    diff= s->total_bits - wanted_bits;
-    br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance;
-    if(br_compensation<=0.0) br_compensation=0.001;
-    q/=br_compensation;
 //printf("%f %f %f\n", q, br_compensation, short_term_q);
     qscale= (int)(q + 0.5);
-    if     (qscale<qmin) qscale=qmin;
-    else if(qscale>qmax) qscale=qmax;
+//printf("%d ", qscale);
     
-    if(s->pict_type!=B_TYPE){
-        s->qsum+= qscale;
-        s->qcount++;
-        if     (qscale<s->last_non_b_qscale-s->max_qdiff) qscale=s->last_non_b_qscale-s->max_qdiff;
-        else if(qscale>s->last_non_b_qscale+s->max_qdiff) qscale=s->last_non_b_qscale+s->max_qdiff;
-    }
 //printf("q:%d diff:%d comp:%f rate_q:%d st_q:%f fvar:%d last_size:%d\n", qscale, (int)diff, br_compensation, 
 //       rate_q, short_term_q, s->mc_mb_var, s->frame_bits);
 //printf("%d %d\n", s->bit_rate, (int)fps);
+
+    rcc->last_qscale= qscale;
+    rcc->last_mc_mb_var_sum= s->mc_mb_var_sum;
+    rcc->last_mb_var_sum= s->mb_var_sum;
     return qscale;
 }
 
@@ -231,10 +590,12 @@ static int init_pass2(MpegEncContext *s)
     uint64_t available_bits[5];
     uint64_t all_const_bits;
     uint64_t all_available_bits= (uint64_t)(s->bit_rate*(double)rcc->num_entries/fps);
-    int num_frames[5]={0,0,0,0,0};
     double rate_factor=0;
     double step;
     int last_i_frame=-10000000;
+    const int filter_size= (int)(s->qblur*4) | 1;  
+    double expected_bits;
+    double *qscale, *blured_qscale;
 
     /* find complexity & const_bits & decide the pict_types */
     for(i=0; i<rcc->num_entries; i++){
@@ -272,10 +633,13 @@ static int init_pass2(MpegEncContext *s)
                 break;
             }
         }
+        rcc->i_cplx_sum [rce->pict_type] += rce->i_tex_bits*rce->qscale;
+        rcc->p_cplx_sum [rce->pict_type] += rce->p_tex_bits*rce->qscale;
+        rcc->mv_bits_sum[rce->pict_type] += rce->mv_bits;
+        rcc->frame_count[rce->pict_type] ++;
 
         complexity[rce->new_pict_type]+= (rce->i_tex_bits+ rce->p_tex_bits)*(double)rce->qscale;
         const_bits[rce->new_pict_type]+= rce->mv_bits + rce->misc_bits;
-        num_frames[rce->new_pict_type]++;
     }
     all_const_bits= const_bits[I_TYPE] + const_bits[P_TYPE] + const_bits[B_TYPE];
     
@@ -283,120 +647,108 @@ static int init_pass2(MpegEncContext *s)
         fprintf(stderr, "requested bitrate is to low\n");
         return -1;
     }
-
-//    avg_complexity= complexity/rcc->num_entries;
-    avg_quantizer[P_TYPE]= 
-    avg_quantizer[I_TYPE]=   (complexity[I_TYPE]+complexity[P_TYPE] + complexity[B_TYPE]/s->b_quant_factor) 
-                           / (all_available_bits - all_const_bits);
-    avg_quantizer[B_TYPE]= avg_quantizer[P_TYPE]*s->b_quant_factor + s->b_quant_offset;
-//printf("avg quantizer: %f %f\n", avg_quantizer[P_TYPE], avg_quantizer[B_TYPE]);
+    
+    /* find average quantizers */
+    avg_quantizer[P_TYPE]=0;
+    for(step=256*256; step>0.0000001; step*=0.5){
+        double expected_bits=0;
+        avg_quantizer[P_TYPE]+= step;
+        
+        avg_quantizer[I_TYPE]= avg_quantizer[P_TYPE]*ABS(s->avctx->i_quant_factor) + s->avctx->i_quant_offset;
+        avg_quantizer[B_TYPE]= avg_quantizer[P_TYPE]*ABS(s->avctx->b_quant_factor) + s->avctx->b_quant_offset;
+        
+        expected_bits= 
+            + all_const_bits 
+            + complexity[I_TYPE]/avg_quantizer[I_TYPE]
+            + complexity[P_TYPE]/avg_quantizer[P_TYPE]
+            + complexity[B_TYPE]/avg_quantizer[B_TYPE];
+            
+        if(expected_bits < all_available_bits) avg_quantizer[P_TYPE]-= step;
+//printf("%f %lld %f\n", expected_bits, all_available_bits, avg_quantizer[P_TYPE]);
+    }
+//printf("qp_i:%f, qp_p:%f, qp_b:%f\n", avg_quantizer[I_TYPE],avg_quantizer[P_TYPE],avg_quantizer[B_TYPE]);
 
     for(i=0; i<5; i++){
         available_bits[i]= const_bits[i] + complexity[i]/avg_quantizer[i];
     }
 //printf("%lld %lld %lld %lld\n", available_bits[I_TYPE], available_bits[P_TYPE], available_bits[B_TYPE], all_available_bits);
-    
+        
+    qscale= malloc(sizeof(double)*rcc->num_entries);
+    blured_qscale= malloc(sizeof(double)*rcc->num_entries);
+
     for(step=256*256; step>0.0000001; step*=0.5){
-        uint64_t expected_bits=0;
+        expected_bits=0;
         rate_factor+= step;
+        
+        rcc->buffer_index= s->avctx->rc_buffer_size/2;
+
         /* find qscale */
         for(i=0; i<rcc->num_entries; i++){
+            qscale[i]= get_qscale(s, &rcc->entry[i], rate_factor, i);
+        }
+        assert(filter_size%2==1);
+
+        /* fixed I/B QP relative to P mode */
+        rcc->next_non_b_qscale= 10;
+        rcc->next_p_qscale= 10;
+        for(i=rcc->num_entries-1; i>=0; i--){
             RateControlEntry *rce= &rcc->entry[i];
-            double short_term_q, q, bits_left;
             const int pict_type= rce->new_pict_type;
-            int qmin= s->qmin;
-            int qmax= s->qmax;
+        
+            if     (pict_type==I_TYPE && s->avctx->i_quant_factor>0.0)
+                qscale[i]= rcc->next_p_qscale*s->avctx->i_quant_factor + s->avctx->i_quant_offset;
+            else if(pict_type==B_TYPE && s->avctx->b_quant_factor>0.0)
+                qscale[i]= rcc->next_non_b_qscale*s->avctx->b_quant_factor + s->avctx->b_quant_offset;
+
+            if(pict_type!=B_TYPE) 
+                rcc->next_non_b_qscale= qscale[i];
+            if(pict_type==P_TYPE) 
+                rcc->next_p_qscale= qscale[i];
+        }
 
-            if(pict_type==B_TYPE){
-                qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5);
-                qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5);
-            }
-            if(qmin<1) qmin=1;
-            if(qmax>31) qmax=31;
-            if(qmax<=qmin) qmax= qmin;
+        /* smooth curve */
+        for(i=0; i<rcc->num_entries; i++){
+            RateControlEntry *rce= &rcc->entry[i];
+            const int pict_type= rce->new_pict_type;
+            int j;
+            double q=0.0, sum=0.0;
+        
+            for(j=0; j<filter_size; j++){
+                int index= i+j-filter_size/2;
+                double d= index-i;
+                double coeff= s->qblur==0 ? 1.0 : exp(-d*d/(s->qblur * s->qblur));
             
-            switch(s->rc_strategy){
-            case 0:
-                bits_left= available_bits[pict_type]/num_frames[pict_type]*rate_factor - rce->misc_bits - rce->mv_bits;
-                if(bits_left<1.0) bits_left=1.0;
-                short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits)/bits_left;
-                break;
-            case 1:
-                bits_left= (available_bits[pict_type] - const_bits[pict_type])/num_frames[pict_type]*rate_factor;
-                if(bits_left<1.0) bits_left=1.0;
-                short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits)/bits_left;
-                break;
-            case 2:
-                bits_left= available_bits[pict_type]/num_frames[pict_type]*rate_factor;
-                if(bits_left<1.0) bits_left=1.0;
-                short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits + rce->misc_bits + rce->mv_bits)/bits_left;
-                break;
-            default:
-                fprintf(stderr, "unknown strategy\n");
-                short_term_q=3; //gcc warning fix
+                if(index < 0 || index >= rcc->num_entries) continue;
+                if(pict_type != rcc->entry[index].new_pict_type) continue;
+                q+= qscale[index] * coeff;
+                sum+= coeff;
             }
-
-            if(short_term_q>31.0) short_term_q=31.0;
-            else if (short_term_q<1.0) short_term_q=1.0;
-
-            q= 1/((1/avg_quantizer[pict_type] - 1/short_term_q)*s->qcompress + 1/short_term_q);
-            if     (q<qmin) q=qmin;
-            else if(q>qmax) q=qmax;
-//printf("lq:%f, sq:%f t:%f q:%f\n", avg_quantizer[rce->pict_type], short_term_q, bits_left, q);
-            rce->new_qscale= q;
+            blured_qscale[i]= q/sum;
         }
-
-        /* smooth curve */
     
         /* find expected bits */
         for(i=0; i<rcc->num_entries; i++){
             RateControlEntry *rce= &rcc->entry[i];
-            double factor= rce->qscale / rce->new_qscale;
-            
+            double bits;
+            rce->new_qscale= modify_qscale(s, rce, blured_qscale[i], i);
+            bits= qp2bits(rce, rce->new_qscale) + rce->mv_bits + rce->misc_bits;
+//printf("%d %f\n", rce->new_bits, blured_qscale[i]);
+            update_rc_buffer(s, bits);
+
             rce->expected_bits= expected_bits;
-            expected_bits += (int)(rce->misc_bits + rce->mv_bits + (rce->i_tex_bits + rce->p_tex_bits)*factor + 0.5);
+            expected_bits += bits;
         }
 
-//        printf("%d %d %f\n", (int)expected_bits, (int)all_available_bits, rate_factor);
+//        printf("%f %d %f\n", expected_bits, (int)all_available_bits, rate_factor);
         if(expected_bits > all_available_bits) rate_factor-= step;
     }
+    free(qscale);
+    free(blured_qscale);
 
-    return 0;
-}
-
-int ff_rate_estimate_qscale_pass2(MpegEncContext *s)
-{
-    int qmin= s->qmin;
-    int qmax= s->qmax;
-    float q;
-    int qscale;
-    float br_compensation;
-    double diff;
-    int picture_number= s->picture_number;
-    RateControlEntry *rce= &s->rc_context.entry[picture_number];
-    int64_t wanted_bits= rce->expected_bits;
-    emms_c();
-
-//    printf("%d %d %d\n", picture_number, (int)wanted_bits, (int)s->total_bits);
-    
-    if(s->pict_type==B_TYPE){
-        qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5);
-        qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5);
+    if(fabs(expected_bits/all_available_bits - 1.0) > 0.01 ){ /* fabs, not abs: abs() takes an int, so the fractional error was truncated away */
+        fprintf(stderr, "Error: 2pass curve failed to converge\n");
+        return -1;
     }
-    if(qmin<1) qmin=1;
-    if(qmax>31) qmax=31;
-    if(qmax<=qmin) qmax= qmin;
 
-    q= rce->new_qscale;
-
-    diff= s->total_bits - wanted_bits;
-    br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance;
-    if(br_compensation<=0.0) br_compensation=0.001;
-    q/=br_compensation;
-
-    qscale= (int)(q + 0.5);
-    if     (qscale<qmin) qscale=qmin;
-    else if(qscale>qmax) qscale=qmax;
-//    printf("%d %d %d %d type:%d\n", qmin, qscale, qmax, picture_number, s->pict_type); fflush(stdout);
-    return qscale;
+    return 0;
 }
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 248305929..72a412eb5 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -22,7 +22,7 @@
 
 //#define DEBUG
 
-#define DC_VLC_BITS 9
+#define DC_VLC_BITS 14 //FIXME find a better solution
 
 static const UINT16 rv_lum_code[256] =
 {
@@ -210,6 +210,7 @@ int rv_decode_dc(MpegEncContext *s, int n)
                 get_bits(&s->gb, 9);
                 code = 1;
             } else {
+                fprintf(stderr, "chroma dc error\n");
                 return 0xffff;
             }
         } else {
@@ -222,8 +223,18 @@ int rv_decode_dc(MpegEncContext *s, int n)
 /* write RV 1.0 compatible frame header */
 void rv10_encode_picture_header(MpegEncContext *s, int picture_number)
 {
+    int full_frame= 1;
+
     align_put_bits(&s->pb);
+    
+    if(full_frame){
+        put_bits(&s->pb, 8, 0xc0);	/* packet header */
+        put_bits(&s->pb, 16, 0x4000);	/* len */
+        put_bits(&s->pb, 16, 0x4000);	/* pos */
+    }
 
+    put_bits(&s->pb, 8, picture_number&0xFF);
+    
     put_bits(&s->pb, 1, 1);	/* marker */
 
     put_bits(&s->pb, 1, (s->pict_type == P_TYPE));
@@ -237,9 +248,11 @@ void rv10_encode_picture_header(MpegEncContext *s, int picture_number)
     }
     /* if multiple packets per frame are sent, the position at which
        to display the macro blocks is coded here */
-    put_bits(&s->pb, 6, 0);	/* mb_x */
-    put_bits(&s->pb, 6, 0);	/* mb_y */
-    put_bits(&s->pb, 12, s->mb_width * s->mb_height);
+    if(!full_frame){
+        put_bits(&s->pb, 6, 0);	/* mb_x */
+        put_bits(&s->pb, 6, 0);	/* mb_y */
+        put_bits(&s->pb, 12, s->mb_width * s->mb_height);
+    }
 
     put_bits(&s->pb, 3, 0);	/* ignored */
 }
@@ -261,6 +274,7 @@ static int get_num(GetBitContext *gb)
 static int rv10_decode_picture_header(MpegEncContext *s)
 {
     int mb_count, pb_frame, marker, h, full_frame;
+    int pic_num, unk;
     
     /* skip packet header */
     h = get_bits(&s->gb, 8);
@@ -269,15 +283,17 @@ static int rv10_decode_picture_header(MpegEncContext *s)
         full_frame = 1;
         len = get_num(&s->gb);
         pos = get_num(&s->gb);
+//printf("pos:%d\n",len);
     } else {
         int seq, frame_size, pos;
         full_frame = 0;
         seq = get_bits(&s->gb, 8);
         frame_size = get_num(&s->gb);
         pos = get_num(&s->gb);
+//printf("seq:%d, size:%d, pos:%d\n",seq,frame_size,pos);
     }
     /* picture number */
-    get_bits(&s->gb, 8);
+    pic_num= get_bits(&s->gb, 8);
 
     marker = get_bits(&s->gb, 1);
 
@@ -285,17 +301,24 @@ static int rv10_decode_picture_header(MpegEncContext *s)
         s->pict_type = P_TYPE;
     else
         s->pict_type = I_TYPE;
-
+//printf("h:%d ver:%d\n",h,s->rv10_version);
+    if(!marker) fprintf(stderr, "marker missing\n"); /* diagnostic only; decoding continues */
     pb_frame = get_bits(&s->gb, 1);
 
 #ifdef DEBUG
     printf("pict_type=%d pb_frame=%d\n", s->pict_type, pb_frame);
 #endif
     
-    if (pb_frame)
+    if (pb_frame){
+        fprintf(stderr, "pb frame not supported\n");
         return -1;
+    }
 
     s->qscale = get_bits(&s->gb, 5);
+    if(s->qscale==0){
+        fprintf(stderr, "error, qscale:0\n");
+        return -1;
+    }
 
     if (s->pict_type == I_TYPE) {
         if (s->rv10_version == 3) {
@@ -322,13 +345,11 @@ static int rv10_decode_picture_header(MpegEncContext *s)
         s->mb_y = 0;
         mb_count = s->mb_width * s->mb_height;
     }
-
-    get_bits(&s->gb, 3);	/* ignored */
+    unk= get_bits(&s->gb, 3);	/* ignored */
+//printf("%d\n", unk);
     s->f_code = 1;
     s->unrestricted_mv = 1;
-#if 0
-    s->h263_long_vectors = 1;
-#endif
+
     return mb_count;
 }
 
@@ -337,14 +358,34 @@ static int rv10_decode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     static int done;
 
-//    s->avctx= avctx;
+    s->avctx= avctx;
     s->out_format = FMT_H263;
 
     s->width = avctx->width;
     s->height = avctx->height;
 
     s->h263_rv10 = 1;
-    s->rv10_version = avctx->sub_id;
+    if(avctx->extradata_size >= 8){
+        switch(((uint32_t*)avctx->extradata)[1]){
+        case 0x10000000:
+            s->rv10_version= 0;
+            s->h263_long_vectors=0;
+            break;
+        case 0x10003000:
+            s->rv10_version= 3;
+            s->h263_long_vectors=1;
+            break;
+        case 0x10003001:
+            s->rv10_version= 3;
+            s->h263_long_vectors=0;
+            break;
+        default:
+            fprintf(stderr, "unknown header %X\n", ((uint32_t*)avctx->extradata)[1]);
+        }
+    }else{
+    //  for backward compatibility 
+        s->rv10_version= avctx->sub_id;
+    }
     
     s->flags= avctx->flags;
 
@@ -353,6 +394,9 @@ static int rv10_decode_init(AVCodecContext *avctx)
 
     h263_decode_init_vlc(s);
 
+    s->y_dc_scale_table=
+    s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
+
     /* init rv vlc */
     if (!done) {
         init_vlc(&rv_dc_lum, DC_VLC_BITS, 256, 
@@ -398,25 +442,19 @@ static int rv10_decode_frame(AVCodecContext *avctx,
 
     mb_count = rv10_decode_picture_header(s);
     if (mb_count < 0) {
-#ifdef DEBUG
-        printf("HEADER ERROR\n");
-#endif
+        fprintf(stderr, "HEADER ERROR\n");
         return -1;
     }
     
     if (s->mb_x >= s->mb_width ||
         s->mb_y >= s->mb_height) {
-#ifdef DEBUG
-        printf("POS ERROR %d %d\n", s->mb_x, s->mb_y);
-#endif
+        fprintf(stderr, "POS ERROR %d %d\n", s->mb_x, s->mb_y);
         return -1;
     }
     mb_pos = s->mb_y * s->mb_width + s->mb_x;
     left = s->mb_width * s->mb_height - mb_pos;
     if (mb_count > left) {
-#ifdef DEBUG
-        printf("COUNT ERROR\n");
-#endif
+        fprintf(stderr, "COUNT ERROR\n");
         return -1;
     }
 
@@ -463,9 +501,7 @@ static int rv10_decode_frame(AVCodecContext *avctx,
         s->mv_dir = MV_DIR_FORWARD;
         s->mv_type = MV_TYPE_16X16; 
         if (h263_decode_mb(s, block) < 0) {
-#ifdef DEBUG
-            printf("ERROR\n");
-#endif
+            fprintf(stderr, "ERROR at MB %d %d\n", s->mb_x, s->mb_y);
             return -1;
         }
         MPV_decode_mb(s, block);
diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c
index 9edb7262a..ccebd67a9 100644
--- a/src/libffmpeg/libavcodec/simple_idct.c
+++ b/src/libffmpeg/libavcodec/simple_idct.c
@@ -25,6 +25,8 @@
 #include "dsputil.h"
 #include "simple_idct.h"
 
+//#define ARCH_ALPHA
+
 #if 0
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -83,10 +85,13 @@ static inline int idctRowCondDC(int16_t *row)
 			return 0;
 		if ((lrow[0] & ~0xffffULL) == 0) {
 			uint64_t v;
-
+#if 1			//is ok if |a0| < 1024 than theres an +-1 error (for the *W4 case for W4=16383 !!!)
+			a0 = row[0]<<3;
+#else
 			a0 = W4 * row[0];
 			a0 += 1 << (ROW_SHIFT - 1);
 			a0 >>= ROW_SHIFT;
+#endif
 			v = (uint16_t) a0;
 			v += v << 16;
 			v += v << 32;
@@ -168,7 +173,7 @@ static inline int idctRowCondDC(int16_t *row)
 	return 2;
 }
 
-inline static void idctSparseCol(int16_t *col)
+inline static void idctSparseCol2(int16_t *col)
 {
         int a0, a1, a2, a3, b0, b1, b2, b3;
 
@@ -478,6 +483,70 @@ static inline void idctSparseColAdd (UINT8 *dest, int line_size,
         dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
 }
 
+static inline void idctSparseCol (int16_t * col)
+{
+	int a0, a1, a2, a3, b0, b1, b2, b3;
+        /* In-place column pass: reads col[8*0]..col[8*7] and overwrites the same eight entries. */
+        /* XXX: I did that only to give same values as previous code */
+	a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
+	a1 = a0;
+	a2 = a0;
+	a3 = a0;
+        /* even inputs (2 here, 4 and 6 further down) accumulate into a0..a3 */
+        a0 +=  + W2*col[8*2];
+        a1 +=  + W6*col[8*2];
+        a2 +=  - W6*col[8*2];
+        a3 +=  - W2*col[8*2];
+        /* odd inputs (1 and 3 here, 5 and 7 further down) accumulate into b0..b3 */
+        MUL16(b0, W1, col[8*1]);
+        MUL16(b1, W3, col[8*1]);
+        MUL16(b2, W5, col[8*1]);
+        MUL16(b3, W7, col[8*1]);
+        /* MUL16/MAC16 are defined outside this function; presumably multiply / multiply-accumulate -- confirm */
+        MAC16(b0, + W3, col[8*3]);
+        MAC16(b1, - W7, col[8*3]);
+        MAC16(b2, - W1, col[8*3]);
+        MAC16(b3, - W5, col[8*3]);
+        /* inputs 4..7 are only folded in when they are non-zero */
+	if(col[8*4]){
+            a0 += + W4*col[8*4];
+            a1 += - W4*col[8*4];
+            a2 += - W4*col[8*4];
+            a3 += + W4*col[8*4];
+	}
+
+	if (col[8*5]) {
+            MAC16(b0, + W5, col[8*5]);
+            MAC16(b1, - W1, col[8*5]);
+            MAC16(b2, + W7, col[8*5]);
+            MAC16(b3, + W3, col[8*5]);
+	}
+
+	if(col[8*6]){
+            a0 += + W6*col[8*6];
+            a1 += - W2*col[8*6];
+            a2 += + W2*col[8*6];
+            a3 += - W6*col[8*6];
+	}
+
+	if (col[8*7]) {
+            MAC16(b0, + W7, col[8*7]);
+            MAC16(b1, - W5, col[8*7]);
+            MAC16(b2, + W3, col[8*7]);
+            MAC16(b3, - W1, col[8*7]);
+	}
+        /* outputs k and 7-k are the sum and the difference of the same a/b pair */
+        col[0 ] = ((a0 + b0) >> COL_SHIFT);
+        col[8 ] = ((a1 + b1) >> COL_SHIFT);
+        col[16] = ((a2 + b2) >> COL_SHIFT);
+        col[24] = ((a3 + b3) >> COL_SHIFT);
+        col[32] = ((a3 - b3) >> COL_SHIFT);
+        col[40] = ((a2 - b2) >> COL_SHIFT);
+        col[48] = ((a1 - b1) >> COL_SHIFT);
+        col[56] = ((a0 - b0) >> COL_SHIFT);
+}
+
+
 #ifdef ARCH_ALPHA
 /* If all rows but the first one are zero after row transformation,
    all rows will be identical after column transformation.  */
@@ -527,7 +596,7 @@ void simple_idct (short *block)
         } else if (rowsConstant) {
 		uint64_t *lblock = (uint64_t *) block;
 
-		idctSparseCol(block);
+		idctSparseCol2(block);
 		for (i = 0; i < 8; i++) {
 			uint64_t v = (uint16_t) block[i * 8];
 
@@ -539,7 +608,7 @@ void simple_idct (short *block)
 		}
 	} else {
 		for (i = 0; i < 8; i++)
-			idctSparseCol(block + i);
+			idctSparseCol2(block + i);
 	}
 }
 
@@ -578,6 +647,16 @@ void simple_idct_add(UINT8 *dest, int line_size, INT16 *block)
         idctSparseColAdd(dest + i, line_size, block + i);
 }
 
+void simple_idct(INT16 *block)
+{   /* Runs idctRowCondDC on each of the 8 rows of block, then idctSparseCol on each of the 8 columns. */
+    int i;
+    for(i=0; i<8; i++)
+        idctRowCondDC(block + i*8); /* row i starts at block[i*8]; the int return value is ignored here */
+    /* column i starts at block[i]; idctSparseCol writes its results back into block */
+    for(i=0; i<8; i++)
+        idctSparseCol(block + i);
+}
+
 #endif
 
 #undef COL_SHIFT
diff --git a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h
index 233a7b841..b26754225 100644
--- a/src/libffmpeg/libavcodec/simple_idct.h
+++ b/src/libffmpeg/libavcodec/simple_idct.h
@@ -21,3 +21,4 @@
 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block);
 void simple_idct_add(UINT8 *dest, int line_size, INT16 *block);
 void simple_idct_mmx(short *block);
+void simple_idct(short *block);
diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c
index f6de669b4..741bef217 100644
--- a/src/libffmpeg/libavcodec/svq1.c
+++ b/src/libffmpeg/libavcodec/svq1.c
@@ -839,8 +839,7 @@ static int svq1_motion_inter_block (bit_buffer_t *bitbuf,
   src = &previous[(x + (mv.x >> 1)) + (y + (mv.y >> 1))*pitch];
   dst = current;
 
-  put_pixels_tab[((mv.y & 1) << 1) | (mv.x & 1)](dst,src,pitch,16);
-  put_pixels_tab[((mv.y & 1) << 1) | (mv.x & 1)](dst+8,src+8,pitch,16);
+  put_pixels_tab[0][((mv.y & 1) << 1) | (mv.x & 1)](dst,src,pitch,16);
 
   return 0;
 }
@@ -907,7 +906,7 @@ static int svq1_motion_inter_4v_block (bit_buffer_t *bitbuf,
     src = &previous[(x + (pmv[i]->x >> 1)) + (y + (pmv[i]->y >> 1))*pitch];
     dst = current;
 
-    put_pixels_tab[((pmv[i]->y & 1) << 1) | (pmv[i]->x & 1)](dst,src,pitch,8);
+    put_pixels_tab[1][((pmv[i]->y & 1) << 1) | (pmv[i]->x & 1)](dst,src,pitch,8);
 
     /* select next block */
     if (i & 1) {
diff --git a/src/video_out/video_out_xv.c b/src/video_out/video_out_xv.c
index c4a96064d..49809bb29 100644
--- a/src/video_out/video_out_xv.c
+++ b/src/video_out/video_out_xv.c
@@ -17,7 +17,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  *
- * $Id: video_out_xv.c,v 1.134 2002/09/15 20:56:24 guenter Exp $
+ * $Id: video_out_xv.c,v 1.135 2002/09/16 21:49:35 miguelfreitas Exp $
  * 
  * video_out_xv.c, X11 video extension interface for xine
  *
@@ -1311,7 +1311,7 @@ static void *init_video_out_plugin (xine_t *xine, void *visual_gen) {
 }
 
 static vo_info_t vo_info_xv = {
-  5,                    /* priority    */
+  9,                    /* priority    */
   "xine video output plugin using the MIT X video extension", /* description */
   XINE_VISUAL_TYPE_X11  /* visual type */
 };
diff --git a/src/xine-engine/load_plugins.c b/src/xine-engine/load_plugins.c
index ed6eb4f85..639b900f9 100644
--- a/src/xine-engine/load_plugins.c
+++ b/src/xine-engine/load_plugins.c
@@ -17,7 +17,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  *
- * $Id: load_plugins.c,v 1.92 2002/09/15 12:28:16 jcdutton Exp $
+ * $Id: load_plugins.c,v 1.93 2002/09/16 21:49:35 miguelfreitas Exp $
  *
  *
  * Load input/demux/audio_out/video_out/codec plugins
@@ -108,6 +108,7 @@ static void _insert_plugin (xine_list_t *list,
   ao_info_t         *ao_new, *ao_old;
   decoder_info_t    *decoder_new, *decoder_old;
   uint32_t          *types;
+  int                priority = 0;
   int                i;
 
   entry = xine_xmalloc(sizeof(plugin_node_t));
@@ -122,7 +123,7 @@ static void _insert_plugin (xine_list_t *list,
   case PLUGIN_VIDEO_OUT:
     vo_old = info->special_info;
     vo_new = xine_xmalloc(sizeof(vo_info_t));
-    vo_new->priority = vo_old->priority;
+    priority = vo_new->priority = vo_old->priority;
     vo_new->description = _strclone(vo_old->description);
     vo_new->visual_type = vo_old->visual_type;
     entry->info->special_info = vo_new;
@@ -131,7 +132,7 @@ static void _insert_plugin (xine_list_t *list,
   case PLUGIN_AUDIO_OUT:
     ao_old = info->special_info;
     ao_new = xine_xmalloc(sizeof(ao_info_t));
-    ao_new->priority = ao_old->priority;
+    priority = ao_new->priority = ao_old->priority;
     ao_new->description = _strclone(ao_old->description);
     entry->info->special_info = ao_new;
     break;
@@ -152,12 +153,12 @@ static void _insert_plugin (xine_list_t *list,
       types[i] = decoder_old->supported_types[i];
     }
     decoder_new->supported_types = types;
-    decoder_new->priority = decoder_old->priority;
+    priority = decoder_new->priority = decoder_old->priority;
     entry->info->special_info = decoder_new;
     break;
   }
 
-  xine_list_append_content (list, entry);
+  xine_list_append_priority_content (list, entry, priority);
 }
 
 
@@ -206,7 +207,7 @@ static void collect_plugins(xine_t *this, char *path){
 	xine_log (this, XINE_LOG_PLUGIN,
 		  _("load_plugins: unable to stat %s\n"), str); 
       }
-      else {
+      else if( strstr(str, ".so") ) {
 		
 	switch (statbuffer.st_mode & S_IFMT){
 		  
@@ -217,14 +218,14 @@ static void collect_plugins(xine_t *this, char *path){
 		  
 	  if(!(lib = dlopen (str, RTLD_LAZY | RTLD_GLOBAL))) {
 			
-#ifdef LOG
+/*#ifdef LOG*/
 	    {
 	      char *dl_error_msg = dlerror();
-	      /* too noisy */
+	      /* too noisy -- but good to catch unresolved references */
 	      printf ("load_plugins: cannot open plugin lib %s:\n%s\n",
 		      str, dl_error_msg); 
 	    }
-#endif
+/*#endif*/
 	  }
 	  else {
 			
@@ -320,6 +321,7 @@ static void *_load_plugin(xine_t *this,
 	  
 	  return info->init(this, data);
 	}
+	info++;
       }
 
     } else {
diff --git a/src/xine-utils/list.c b/src/xine-utils/list.c
index 1ef36fb05..56ba961ef 100644
--- a/src/xine-utils/list.c
+++ b/src/xine-utils/list.c
@@ -17,7 +17,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  *
- * $Id: list.c,v 1.2 2002/09/04 23:31:13 guenter Exp $
+ * $Id: list.c,v 1.3 2002/09/16 21:49:35 miguelfreitas Exp $
  *
  */
 #ifdef HAVE_CONFIG_H
@@ -139,6 +139,53 @@ void *xine_list_prev_content (xine_list_t *l) {
   }    
 }
 
+void xine_list_append_priority_content (xine_list_t *l, void *content, int priority) {
+  xine_node_t *node;
+  /* Inserts content in front of the first node whose priority is <= priority. */
+  node = (xine_node_t *) xine_xmalloc(sizeof(xine_node_t));
+  node->content = content;
+  node->priority = priority;
+  /* Every path below also moves the list cursor (l->cur) to the new node. */
+  if (l->first) {
+    xine_node_t *cur;
+
+    cur = l->first;
+
+    while(1) {
+      if( priority >= cur->priority ) { /* >= : on a tie the new node goes before the existing one */
+        node->next = cur;
+        node->prev = cur->prev;
+
+        if( node->prev )
+          node->prev->next = node;
+        else
+          l->first = node; /* inserted before the old head */
+        cur->prev = node;
+
+        l->cur = node;
+        break;
+      }
+
+      if( !cur->next ) { /* reached the tail without finding a lower-or-equal priority: append */
+        node->next = NULL;
+        node->prev = cur;
+        cur->next = node;
+
+        l->cur = node;
+        l->last = node;
+        break;
+      }
+     
+      cur = cur->next;
+    }
+  } 
+  else { /* empty list: the node becomes first, last and cursor */
+    l->first = l->last = l->cur = node;
+    node->prev = node->next = NULL;
+  }
+}
+
+
 void xine_list_append_content (xine_list_t *l, void *content) {
   xine_node_t *node;
 
diff --git a/src/xine-utils/xineutils.h b/src/xine-utils/xineutils.h
index 9875da713..1da508c61 100644
--- a/src/xine-utils/xineutils.h
+++ b/src/xine-utils/xineutils.h
@@ -17,7 +17,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  *
- * $Id: xineutils.h,v 1.22 2002/09/04 23:31:14 guenter Exp $
+ * $Id: xineutils.h,v 1.23 2002/09/16 21:49:35 miguelfreitas Exp $
  *
  */
 #ifndef XINEUTILS_H
@@ -786,13 +786,15 @@ extern int v_g_table[256];
 extern int v_b_table[256];
 
 
-/******** double cained lists with builtin iterator *******/
+/******** double chained lists with builtin iterator *******/
 
 typedef struct xine_node_s {
 
   struct xine_node_s    *next, *prev;
   
   void                  *content;
+
+  int                    priority;
   
 } xine_node_t;
 
@@ -841,6 +843,11 @@ void *xine_list_last_content (xine_list_t *l);
 void *xine_list_prev_content (xine_list_t *l);
 
 /**
+ * Append content to list, sorted by decreasing priority.
+ */
+void xine_list_append_priority_content (xine_list_t *l, void *content, int priority);
+
+/**
  * Append content to list.
  */
 void xine_list_append_content (xine_list_t *l, void *content);