41 files changed, 9784 insertions, 5276 deletions
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index f05501807..0fcae49fb 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -16,7 +16,8 @@ noinst_LTLIBRARIES = libavcodec.la
 libavcodec_la_SOURCES = common.c utils.c mpegvideo.c h263.c jrevdct.c jfdctfst.c \
 			mjpeg.c dsputil.c \
 			motion_est.c imgconvert.c msmpeg4.c \
-			mpeg12.c h263dec.c rv10.c simple_idct.c
+			mpeg12.c h263dec.c rv10.c simple_idct.c \
+			ratecontrol.c
 #imgresample.c
 
 libavcodec_la_LDFLAGS = \
diff --git a/src/libffmpeg/libavcodec/alpha/asm.h b/src/libffmpeg/libavcodec/alpha/asm.h
new file mode 100644
index 000000000..0f4685f11
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/asm.h
@@ -0,0 +1,141 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef LIBAVCODEC_ALPHA_ASM_H
+#define LIBAVCODEC_ALPHA_ASM_H
+
+#include <stdint.h>
+
+#define AMASK_BWX (1 << 0)
+#define AMASK_FIX (1 << 1)
+#define AMASK_MVI (1 << 8)
+
+static inline uint64_t BYTE_VEC(uint64_t x)	/* for x < 0x100: returns x copied into all 8 bytes */
+{
+    x |= x <<  8;	/* value now in bytes 0-1 */
+    x |= x << 16;	/* value now in bytes 0-3 */
+    x |= x << 32;	/* value now in bytes 0-7 */
+    return x;
+}
+static inline uint64_t WORD_VEC(uint64_t x)	/* for x < 0x10000: returns x copied into all four 16-bit words */
+{
+    x |= x << 16;	/* value now in words 0-1 */
+    x |= x << 32;	/* value now in words 0-3 */
+    return x;
+}
+
+static inline int32_t ldl(const void* p)	/* load a signed 32-bit value from p */
+{
+    return *(const int32_t*) p;	/* plain dereference; no alignment fix-up is done here */
+}
+static inline uint64_t ldq(const void* p)	/* load an unsigned 64-bit value from p */
+{
+    return *(const uint64_t*) p;	/* plain dereference; see uldq() for unaligned addresses */
+}
+/* Loads the 64-bit value at p rounded down to an 8-byte boundary. FIXME ccc doesn't seem to get it? Use inline asm?  */
+static inline uint64_t ldq_u(const void* p)
+{
+    return *(const uint64_t*) ((uintptr_t) p & ~7ul);	/* & ~7ul clears the low three address bits */
+}
+static inline void stl(uint32_t l, void* p)	/* store the 32-bit value l at p */
+{
+    *(uint32_t*) p = l;	/* plain store; no alignment fix-up is done here */
+}
+static inline void stq(uint64_t l, void* p)	/* store the 64-bit value l at p */
+{
+    *(uint64_t*) p = l;	/* plain store; no alignment fix-up is done here */
+}
+
+#ifdef __GNUC__
+#define OPCODE1(name)	/* gcc: defines name(l), a wrapper around the one-operand instruction of that name */	\
+static inline uint64_t name(uint64_t l)				\
+{								\
+    uint64_t r;							\
+    asm (#name " %1, %0" : "=r" (r) : "r" (l));			\
+    return r;							\
+}
+
+#define OPCODE2(name)	/* gcc: defines name(l1, l2); the rI constraint lets l2 be a register or a small literal */	\
+static inline uint64_t name(uint64_t l1, uint64_t l2)		\
+{								\
+    uint64_t r;							\
+    asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2));	\
+    return r;							\
+}
+
+/* Returns the value produced by the rpcc instruction. We don't want gcc to move
+   this around or combine it with another rpcc, so the asm is marked volatile.  */
+static inline uint64_t rpcc(void)
+{
+    uint64_t r;
+    asm volatile ("rpcc %0" : "=r" (r));	/* no inputs; result lands in r */
+    return r;
+}
+
+static inline uint64_t uldq(const void* v)	/* load from a possibly unaligned address */
+{
+    struct foo {
+	unsigned long l;	/* NOTE(review): assumes unsigned long is 64 bits on this target -- confirm */
+    } __attribute__((packed));	/* packed: gcc may not assume natural alignment for l */
+
+    return ((const struct foo*) v)->l;
+}
+
+#elif defined(__DECC)		/* Compaq "ccc" compiler */
+
+#include <c_asm.h>
+#define OPCODE1(name)	/* ccc: same one-operand wrapper, built on the asm() from c_asm.h */	\
+static inline uint64_t name(uint64_t l)					\
+{									\
+    return asm (#name " %a0, %v0", l);					\
+}
+
+#define OPCODE2(name)	/* ccc: same two-operand wrapper, built on the asm() from c_asm.h */	\
+static inline uint64_t name(uint64_t l1, uint64_t l2)			\
+{									\
+    return asm (#name " %a0, %a1, %v0", l1, l2);			\
+}
+
+static inline uint64_t rpcc(void)	/* ccc variant: returns the value produced by the rpcc instruction */
+{
+    return asm  ("rpcc %v0");
+}
+
+static inline uint64_t uldq(const void* v)	/* ccc variant: load from a possibly unaligned address */
+{
+    return *(const __unaligned uint64_t *) v;	/* __unaligned qualifier replaces gcc's packed-struct trick */
+}
+
+#endif
+
+OPCODE1(amask);	/* each line expands to a static inline wrapper named after the instruction */
+OPCODE1(unpkbw);
+OPCODE1(pkwb);
+OPCODE2(extql);	/* NOTE: OPCODE1/OPCODE2 are only defined under __GNUC__ or __DECC */
+OPCODE2(extqh);
+OPCODE2(zap);
+OPCODE2(cmpbge);
+OPCODE2(minsw4);
+OPCODE2(minuw4);
+OPCODE2(minub8);
+OPCODE2(maxsw4);
+OPCODE2(maxuw4);
+OPCODE2(perr);
+
+#endif /* LIBAVCODEC_ALPHA_ASM_H */
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
new file mode 100644
index 000000000..3a54904f4
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -0,0 +1,223 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+
+void simple_idct_axp(DCTELEM *block);
+
+static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
+				   int line_size)	/* write 8 rows of 8 coefficients as bytes clamped to 0..255 */
+{
+    int i = 8;	/* rows left */
+    do {
+	UINT64 shorts;
+
+	shorts = ldq(block);	/* coefficients 0-3 of this row */
+	shorts = maxsw4(shorts, 0);	/* lower bound 0 */
+	shorts = minsw4(shorts, WORD_VEC(0x00ff));	/* upper bound 255 in every word */
+	stl(pkwb(shorts), pixels);	/* pack the four words to bytes, store 4 pixels */
+
+	shorts = ldq(block + 4);	/* coefficients 4-7, same treatment */
+	shorts = maxsw4(shorts, 0);
+	shorts = minsw4(shorts, WORD_VEC(0x00ff));
+	stl(pkwb(shorts), pixels + 4);
+
+	pixels += line_size;	/* next output row */
+	block += 8;	/* next coefficient row */
+    } while (--i);
+}
+
+static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
+				   int line_size)	/* add 8 rows of 8 coefficients to the pixels in place, clamped to 0..255 */
+{
+    int i = 8;	/* rows left */
+    do {
+	UINT64 shorts; 
+
+	shorts = ldq(block);	/* coefficients 0-3 of this row */
+	shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
+	shorts += unpkbw(ldl(pixels));	/* add the 4 existing pixels, widened to words */
+	shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
+	shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
+	shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
+	shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
+	stl(pkwb(shorts), pixels);	/* pack to bytes and write back over the source pixels */
+
+	/* next 4 */
+	shorts = ldq(block + 4);
+	shorts &= ~WORD_VEC(0x8000);
+	shorts += unpkbw(ldl(pixels + 4));
+	shorts &= ~WORD_VEC(0x8000);
+	shorts = minuw4(shorts, WORD_VEC(0x4000));
+	shorts &= ~WORD_VEC(0x4000);
+	shorts = minsw4(shorts, WORD_VEC(0x00ff));
+	stl(pkwb(shorts), pixels + 4);
+
+	pixels += line_size;	/* next pixel row */
+	block += 8;	/* next coefficient row */
+    } while (--i);
+}
+
+/* Average 8 unsigned bytes in parallel, rounding down: (b1 + b2) >> 1
+   Since the intermediate sum could be greater than 255, we do the
+   shift first. The result is too low by one if the bytes were both
+   odd, so we need to add (l1 & l2) & BYTE_VEC(0x01).  */
+static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
+{
+    UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);	/* 1 in every byte where both inputs are odd */
+    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;	/* clear bit 0 first so nothing shifts into the byte below */
+    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
+    return l1 + l2 + correction;
+}
+
+/* Average 8 bytes in parallel with rounding: (b1 + b2 + 1) >> 1
+   The '1' only has an effect when one byte is even and the other odd,
+   i. e. on top of (l1 & l2) we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
+   Taken together, the two corrections equal (l1 | l2) & BYTE_VEC(0x01).  */
+static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+{
+    UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);	/* 1 in every byte where at least one input is odd */
+    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;	/* clear bit 0 first so nothing shifts into the byte below */
+    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
+    return l1 + l2 + correction;
+}
+
+static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)	/* per byte: (b1 + b2 + b3 + b4 + 2) >> 2 */
+{
+    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)	/* r1: sum of the upper six bits of each byte, pre-shifted */
+	      + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+	      + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+	      + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    UINT64 r2 = ((  (l1 & BYTE_VEC(0x03))	/* r2: sum of the low two bits (at most 12 per byte)... */
+		  + (l2 & BYTE_VEC(0x03))
+		  + (l3 & BYTE_VEC(0x03))
+		  + (l4 & BYTE_VEC(0x03))
+		  + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);	/* ...plus rounding term 2; mask drops bits shifted in from the byte above */
+    return r1 + r2;
+}
+
+static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)	/* per byte: (b1 + b2 + b3 + b4 + 1) >> 2 */
+{
+    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)	/* r1: sum of the upper six bits of each byte, pre-shifted */
+	      + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+	      + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+	      + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    UINT64 r2 = (( (l1 & BYTE_VEC(0x03))	/* r2: sum of the low two bits (at most 12 per byte)... */
+		 + (l2 & BYTE_VEC(0x03))
+		 + (l3 & BYTE_VEC(0x03))
+		 + (l4 & BYTE_VEC(0x03))
+		 + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);	/* ...plus term 1 (avg4 uses 2); mask drops bits shifted in from the byte above */
+    return r1 + r2;
+}
+
+#define PIXOPNAME(suffix) put ## suffix	/* first instantiation: names become put_pixels*_axp */
+#define BTYPE UINT8
+#define AVG2 avg2
+#define AVG4 avg4
+#define STORE(l, b) stq(l, b)	/* result quadword is written directly to the destination */
+#include "pixops.h"	/* defines put_pixels_axp, put_pixels_x2_axp, put_pixels_y2_axp, put_pixels_xy2_axp */
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#define PIXOPNAME(suffix) put_no_rnd ## suffix	/* second instantiation: put_no_rnd_pixels*_axp */
+#define BTYPE UINT8
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define STORE(l, b) stq(l, b)
+#include "pixops.h"	/* same four functions, built on the *_no_rnd averaging helpers */
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+/* The following functions (avg, avg_no_rnd and sub variants) are untested and therefore compiled out.  */
+#if 0
+
+#define PIXOPNAME(suffix) avg ## suffix
+#define BTYPE UINT8
+#define AVG2 avg2
+#define AVG4 avg4
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#define PIXOPNAME(suffix) avg_no_rnd ## suffix
+#define BTYPE UINT8
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#define PIXOPNAME(suffix) sub ## suffix
+#define BTYPE DCTELEM
+#define AVG2 avg2
+#define AVG4 avg4
+#define STORE(l, block) do {		\
+    UINT64 xxx = l;			\
+    (block)[0] -= (xxx >>  0) & 0xff;	\
+    (block)[1] -= (xxx >>  8) & 0xff;	\
+    (block)[2] -= (xxx >> 16) & 0xff;	\
+    (block)[3] -= (xxx >> 24) & 0xff;	\
+    (block)[4] -= (xxx >> 32) & 0xff;	\
+    (block)[5] -= (xxx >> 40) & 0xff;	\
+    (block)[6] -= (xxx >> 48) & 0xff;	\
+    (block)[7] -= (xxx >> 56) & 0xff;	\
+} while (0)
+#include "pixops.h"
+#undef PIXOPNAME
+#undef BTYPE
+#undef AVG2
+#undef AVG4
+#undef STORE
+
+#endif
+
+void dsputil_init_alpha(void)	/* point the dsputil function pointers at the Alpha routines of this file */
+{
+    put_pixels_tab[0] = put_pixels_axp;	/* [0]: straight copy */
+    put_pixels_tab[1] = put_pixels_x2_axp;	/* [1]: average with the next pixel in the row */
+    put_pixels_tab[2] = put_pixels_y2_axp;	/* [2]: average with the pixel one row down */
+    put_pixels_tab[3] = put_pixels_xy2_axp;	/* [3]: average over the 2x2 neighbourhood */
+
+    put_no_rnd_pixels_tab[0] = put_pixels_axp;	/* the copy does no averaging, so the rounding build is reused */
+    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
+    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
+    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
+
+    /* amask clears all bits that correspond to present features.  */
+    if (amask(AMASK_MVI) == 0) {
+	fprintf(stderr, "MVI extension detected\n");
+	put_pixels_clamped = put_pixels_clamped_axp;	/* installed only behind this MVI check */
+	add_pixels_clamped = add_pixels_clamped_axp;
+    }
+}
diff --git a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
new file mode 100644
index 000000000..d0af5e1d3
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
@@ -0,0 +1,88 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+
+extern UINT8 zigzag_end[64];
+
+static void dct_unquantize_h263_axp(MpegEncContext *s, 
+				    DCTELEM *block, int n, int qscale)	/* H.263-style dequantization of block, four coefficients per step */
+{
+    int i, level;
+    UINT64 qmul, qadd;
+    if (s->mb_intra) {
+        if (n < 4) 
+            block[0] = block[0] * s->y_dc_scale;	/* n < 4 takes the y (luma) DC scale */
+        else
+            block[0] = block[0] * s->c_dc_scale;	/* otherwise the c (chroma) DC scale */
+	/* Catch up to aligned point: coefficients 1-3 are handled one by one.  */
+	qmul = s->qscale << 1;
+	qadd = (s->qscale - 1) | 1;	/* scalar qadd here; replaced by the vector form below */
+	for (i = 1; i < 4; ++i) {
+	    level = block[i];
+	    if (level) {	/* zero coefficients stay zero */
+		if (level < 0) {
+		    level = level * qmul - qadd;
+		} else {
+		    level = level * qmul + qadd;
+		}
+		block[i] = level;
+	    }
+	}
+	block += 4;
+	i = 60 / 4;	/* remaining coefficients 4..63 as 15 groups of four */
+    } else {
+        i = zigzag_end[s->block_last_index[n]] / 4;	/* NOTE(review): rounds down; a result of 0 would not stop the do/while below -- confirm */
+    }
+    qmul = s->qscale << 1;	/* NOTE(review): reads s->qscale while qadd uses the qscale argument -- confirm they always match */
+    qadd = WORD_VEC((qscale - 1) | 1);	/* same addend in each of the four words */
+    do {
+	UINT64 levels, negmask, zeromask, corr;
+	levels = ldq(block);	/* four coefficients at once */
+	if (levels == 0)
+	    continue;	/* jumps to the loop condition, so block still advances */
+	zeromask = cmpbge(0, levels);	/* one bit per byte of levels that is zero */
+	zeromask &= zeromask >> 1;	/* bit k now means bytes k and k+1 are both zero */
+	/* Negate all negative words.  */
+	negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */
+	negmask = minsw4(negmask, 0);		    /* positive -> 0000 (0) */
+	corr    = negmask & WORD_VEC(0x0001); /* twos-complement correction */
+	levels ^= negmask;
+	levels += corr;
+
+	levels = levels * qmul;	/* one 64-bit multiply; presumably relies on no word product exceeding 16 bits -- confirm */
+	levels += zap(qadd, zeromask);	/* qadd with the bytes selected by zeromask cleared */
+
+	/* Re-negate negative words.  */
+	levels -= corr;
+	levels ^= negmask;
+
+	stq(levels, block);
+    } while (block += 4, --i);	/* runs at least once; ends when the decremented i reaches 0 */
+}
+
+void MPV_common_init_axp(MpegEncContext *s)	/* install the Alpha-specific dequantizer in s where applicable */
+{
+    if (amask(AMASK_MVI) == 0) {	/* same MVI test as in dsputil_init_alpha */
+        if (s->out_format == FMT_H263)
+	    s->dct_unquantize = dct_unquantize_h263_axp;	/* nothing is changed for other output formats */
+    }
+}
diff --git a/src/libffmpeg/libavcodec/alpha/pixops.h b/src/libffmpeg/libavcodec/alpha/pixops.h
new file mode 100644
index 000000000..118d7ae23
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/pixops.h
@@ -0,0 +1,135 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* This file is intended to be #included with proper definitions of
+ * PIXOPNAME, BTYPE, AVG2, AVG4 and STORE.  */
+
+static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels,
+				   int line_size, int h)	/* STORE h rows of 8 source pixels, unfiltered */
+{
+    if ((size_t) pixels & 0x7) {	/* source not 8-byte aligned: use the unaligned load */
+	do {
+	    STORE(uldq(pixels), block);
+	    pixels += line_size;
+	    block  += line_size;	/* both pointers advance by line_size elements per row */
+	} while (--h);	/* tested after the body, so h must be at least 1 */
+    } else {	/* NOTE(review): alignment is checked on the first row only; presumably line_size is a multiple of 8 -- confirm */
+	do {
+	    STORE(ldq(pixels), block);
+	    pixels += line_size;
+	    block  += line_size;
+	} while (--h);
+    }
+}
+
+static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels,
+				      int line_size, int h)	/* per row: AVG2 of pixels 0-7 with pixels 1-8 */
+{
+    if ((size_t) pixels & 0x7) {	/* source not 8-byte aligned: use the unaligned load */
+	do {
+	    UINT64 pix1, pix2;
+
+	    pix1 = uldq(pixels);	/* pixels 0-7 */
+	    pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);	/* pixels 1-8: drop pixel 0, bring a ninth byte into the top */
+	    STORE(AVG2(pix1, pix2), block);
+	    pixels += line_size;
+	    block += line_size;
+	} while (--h);	/* tested after the body, so h must be at least 1 */
+    } else {	/* NOTE(review): alignment is checked on the first row only; presumably line_size is a multiple of 8 -- confirm */
+	do {
+	    UINT64 pix1, pix2;
+
+	    pix1 = ldq(pixels);
+	    pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
+	    STORE(AVG2(pix1, pix2), block);
+	    pixels += line_size;
+	    block += line_size;
+	} while (--h);
+    }
+}
+
+static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels,
+				      int line_size, int h)	/* per row: AVG2 of the row with the row below it */
+{
+    if ((size_t) pixels & 0x7) {	/* source not 8-byte aligned: use the unaligned load */
+	UINT64 pix = uldq(pixels);	/* first row is loaded before the loop */
+	do {
+	    UINT64 next_pix;
+
+	    pixels += line_size;
+	    next_pix = uldq(pixels);	/* row below; h + 1 source rows are read in total */
+	    STORE(AVG2(pix, next_pix), block);
+	    block += line_size;
+	    pix = next_pix;	/* reuse the load as the upper row of the next pass */
+	} while (--h);	/* tested after the body, so h must be at least 1 */
+    } else {	/* NOTE(review): alignment is checked on the first row only; presumably line_size is a multiple of 8 -- confirm */
+	UINT64 pix = ldq(pixels);
+	do {
+	    UINT64 next_pix;
+
+	    pixels += line_size;
+	    next_pix = ldq(pixels);
+	    STORE(AVG2(pix, next_pix), block);
+	    block += line_size;
+	    pix = next_pix;
+	} while (--h);
+    }
+}
+
+/* STOREs, per row, the AVG4 of each 2x2 pixel neighbourhood. This could be
+  further sped up by recycling AVG4 intermediate results from the previous loop pass.  */
+static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels,
+				       int line_size, int h)
+{
+    if ((size_t) pixels & 0x7) {	/* source not 8-byte aligned: use the unaligned load */
+	UINT64 pix1 = uldq(pixels);	/* pixels 0-7 of the first row */
+	UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);	/* pixels 1-8 of the first row */
+
+	do {
+	    UINT64 next_pix1, next_pix2;
+
+	    pixels += line_size;
+	    next_pix1 = uldq(pixels);	/* same two vectors for the row below */
+	    next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56);
+
+	    STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
+
+	    block += line_size;
+	    pix1 = next_pix1;	/* lower row becomes the upper row of the next pass */
+	    pix2 = next_pix2;
+	} while (--h);	/* tested after the body, so h must be at least 1 */
+    } else {	/* NOTE(review): alignment is checked on the first row only; presumably line_size is a multiple of 8 -- confirm */
+	UINT64 pix1 = ldq(pixels);
+	UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
+
+	do {
+	    UINT64 next_pix1, next_pix2;
+
+	    pixels += line_size;
+	    next_pix1 = ldq(pixels);
+	    next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56);
+
+	    STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
+
+	    block += line_size;
+	    pix1 = next_pix1;
+	    pix2 = next_pix2;
+	} while (--h);
+    }
+}
diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c
index 1cf7b4fba..cd362ca48 100644
--- a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c
+++ b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c
@@ -2,19 +2,19 @@
  * ARMv4L optimized DSP utils
  * Copyright (c) 2001 Lionel Ulmer.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "../dsputil.h"
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 05b27d8c2..68b67154d 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -3,6 +3,11 @@
 
 #include "common.h"
 
+#define LIBAVCODEC_VERSION_INT 0x000406
+#define LIBAVCODEC_VERSION     "0.4.6"
+#define LIBAVCODEC_BUILD       4614
+#define LIBAVCODEC_BUILD_STR   "4614"
+
 enum CodecID {
     CODEC_ID_NONE, 
     CODEC_ID_MPEG1VIDEO,
@@ -17,18 +22,31 @@ enum CodecID {
     CODEC_ID_MSMPEG4V1,
     CODEC_ID_MSMPEG4V2,
     CODEC_ID_MSMPEG4V3,
+    CODEC_ID_WMV1,
+    CODEC_ID_WMV2,
     CODEC_ID_H263P,
     CODEC_ID_H263I,
 
+    /* various pcm "codecs" */
+    CODEC_ID_PCM_S16LE,
+    CODEC_ID_PCM_S16BE,
+    CODEC_ID_PCM_U16LE,
+    CODEC_ID_PCM_U16BE,
+    CODEC_ID_PCM_S8,
+    CODEC_ID_PCM_U8,
+    CODEC_ID_PCM_MULAW,
+    CODEC_ID_PCM_ALAW,
 };
 #define CODEC_ID_MSMPEG4 CODEC_ID_MSMPEG4V3
 
 enum CodecType {
+    CODEC_TYPE_UNKNOWN = -1,
     CODEC_TYPE_VIDEO,
     CODEC_TYPE_AUDIO,
 };
 
 enum PixelFormat {
+    PIX_FMT_ANY = -1,
     PIX_FMT_YUV420P,
     PIX_FMT_YUV422,
     PIX_FMT_RGB24,
@@ -45,14 +63,24 @@ enum SampleFormat {
 /* in bytes */
 #define AVCODEC_MAX_AUDIO_FRAME_SIZE 18432
 
-/* motion estimation type */
+/* motion estimation type, EPZS by default */
+enum Motion_Est_ID {
+    ME_ZERO = 1,	/* numbering starts at 1; the #defines this enum replaces started at 0 */
+    ME_FULL,
+    ME_LOG,
+    ME_PHODS,
+    ME_EPZS,
+    ME_X1
+};
+
+/* only for ME compatibility with old apps */
 extern int motion_estimation_method;
-#define ME_ZERO   0
-#define ME_FULL   1
-#define ME_LOG    2
-#define ME_PHODS  3
-#define ME_EPZS   4
-#define ME_X1     5
+
+/* ME algorithms sorted by quality (presumably lowest first -- confirm) */
+static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG, 
+                                       ME_X1, ME_EPZS, ME_FULL };	/* static in a header: each including file gets its own copy */
+
+#define FF_MAX_B_FRAMES 4
 
 /* encoding support */
 /* note not everything is supported yet */
@@ -60,10 +88,17 @@ extern int motion_estimation_method;
 #define CODEC_FLAG_HQ     0x0001 /* high quality (non real time) encoding */
 #define CODEC_FLAG_QSCALE 0x0002 /* use fixed qscale */
 #define CODEC_FLAG_4MV    0x0004 /* 4 MV per MB allowed */
-#define CODEC_FLAG_B      0x0008 /* use B frames */
 #define CODEC_FLAG_QPEL   0x0010 /* use qpel MC */
 #define CODEC_FLAG_GMC    0x0020 /* use GMC */
 #define CODEC_FLAG_TYPE   0x0040 /* fixed I/P frame type, from avctx->key_frame */
+#define CODEC_FLAG_PART   0x0080 /* use data partitioning */
+/* parent program guarantees that the input for b-frame containing streams is not written to
+   for at least s->max_b_frames+1 frames; if this is not set then the input will be copied */
+#define CODEC_FLAG_INPUT_PRESERVED 0x0100
+#define CODEC_FLAG_PASS1 0x0200  /* use internal 2pass ratecontrol in first  pass mode */
+#define CODEC_FLAG_PASS2 0x0400  /* use internal 2pass ratecontrol in second pass mode */
+#define CODEC_FLAG_EXTERN_HUFF 0x1000 /* use external huffman table (for mjpeg) */
+#define CODEC_FLAG_GRAY  0x2000 /* only decode/encode grayscale */
 
 /* codec capabilities */
 
@@ -78,6 +113,15 @@ typedef struct AVCodecContext {
     int flags;
     int sub_id;    /* some codecs needs additionnal format info. It is
                       stored there */
+    
+    int me_method; /* ME algorithm used for video coding */
+    
+    /* extra data from parent application to codec, e.g. huffman table
+       for mjpeg */
+    /* the parent should allocate and free this buffer */
+    void *extradata;
+    int extradata_size;
+    
     /* video only */
     int frame_rate; /* frames per sec multiplied by FRAME_RATE_BASE */
     int width, height;
@@ -88,8 +132,10 @@ typedef struct AVCodecContext {
 #define FF_ASPECT_16_9_625 4
 #define FF_ASPECT_16_9_525 5
     int gop_size; /* 0 = intra only */
-    int pix_fmt;  /* pixel format, see PIX_FMT_xxx */
-
+    enum PixelFormat pix_fmt;  /* pixel format, see PIX_FMT_xxx */
+    int repeat_pict; /* when decoding, this signals how much the picture */
+                     /* must be delayed.                                */
+                     /* extra_delay = (repeat_pict / 2) * (1/fps)       */
     /* if non NULL, 'draw_horiz_band' is called by the libavcodec
        decoder to draw an horizontal band. It improve cache usage. Not
        all codecs can do that. You must check the codec capabilities
@@ -104,23 +150,48 @@ typedef struct AVCodecContext {
     int sample_fmt;  /* sample format, currenly unused */
 
     /* the following data should not be initialized */
-    int frame_size; /* in samples, initialized when calling 'init' */
-    int frame_number; /* audio or video frame number */
-    int key_frame;    /* true if the previous compressed frame was 
-                         a key frame (intra, or seekable) */
+    int frame_size;     /* in samples, initialized when calling 'init' */
+    int frame_number;   /* audio or video frame number */
+    int real_pict_num;  /* returns the real picture number of
+                           previous encoded frame */
+    int key_frame;      /* true if the previous compressed frame was 
+                           a key frame (intra, or seekable) */
+    int pict_type;      /* picture type of the previous 
+                           encoded frame */
+/* FIXME: these should have FF_ */
+#define I_TYPE 1 // Intra
+#define P_TYPE 2 // Predicted
+#define B_TYPE 3 // Bi-dir predicted
+#define S_TYPE 4 // S(GMC)-VOP MPEG4
+
+    int delay;          /* number of frames the decoded output 
+                           will be delayed relative to the encoded input */
+    uint8_t *mbskip_table; /* =1 if MB didn't change, is only valid for I/P frames 
+                              stride= mb_width = (width+15)>>4 */
+    
+    /* encoding parameters */
     int quality;      /* quality of the previous encoded frame 
-                         (between 1 (good) and 31 (bad)) */
+                         (between 1 (good) and 31 (bad)) 
+                         this is also used to set the quality in vbr mode
+                         and the per frame quality in CODEC_FLAG_TYPE (second pass mode) */
     float qcompress;  /* amount of qscale change between easy & hard scenes (0.0-1.0)*/
     float qblur;      /* amount of qscale smoothing over time (0.0-1.0) */
     int qmin;         /* min qscale */
     int qmax;         /* max qscale */
     int max_qdiff;    /* max qscale difference between frames */
+    int max_b_frames; /* maximum b frames, the output will be delayed by max_b_frames+1 relative to the input */
+    float b_quant_factor;/* qscale factor between ips and b frames */
+    int rc_strategy;
+    int b_frame_strategy;
+
+    int hurry_up;     /* when set to 1 during decoding, b frames will be skipped
+                         when set to 2 idct/dequant will be skipped too */
     
     struct AVCodec *codec;
     void *priv_data;
 
     /* The following data is for RTP friendly coding */
-    /* By now only H.263/H.263+ coder honours this   */
+    /* By now only H.263/H.263+/MPEG4 coder honours this   */
     int rtp_mode;   /* 1 for activate RTP friendly-mode           */
                     /* highers numbers represent more error-prone */
                     /* enviroments, by now just "1" exist         */
@@ -145,7 +216,7 @@ typedef struct AVCodecContext {
     float psnr_y;
     float psnr_cb;
     float psnr_cr;
-                 
+    
     /* statistics, used for 2-pass encoding */
     int mv_bits;
     int header_bits;
@@ -156,13 +227,57 @@ typedef struct AVCodecContext {
     int skip_count;
     int misc_bits; // cbp, mb_type
     int frame_bits;
-
+                 
     /* the following fields are ignored */
     void *opaque;   /* can be used to carry app specific stuff */
     char codec_name[32];
-    int codec_type; /* see CODEC_TYPE_xxx */
-    int codec_id; /* see CODEC_ID_xxx */
+    enum CodecType codec_type; /* see CODEC_TYPE_xxx */
+    enum CodecID codec_id; /* see CODEC_ID_xxx */
     unsigned int codec_tag;  /* codec tag, only used if unknown codec */
+    
+    int workaround_bugs;       /* workaround bugs in encoders which cannot be detected automatically */
+    int luma_elim_threshold;
+    int chroma_elim_threshold;
+    int strict_std_compliance; /* strictly follow the std (MPEG4, ...) */
+    float b_quant_offset;/* qscale offset between ips and b frames, not implemented yet */
+    int error_resilience;
+    
+#ifndef MBC
+#define MBC 128
+#define MBR 96
+#endif
+    int *quant_store; /* field for communicating with external postprocessing */
+    unsigned qstride;
+    //FIXME this should be reordered after kabis API is finished ...
+    /*
+	Note: the fields below are reserved for future use.
+	They are required to keep the ABI stable !!!
+	If you make changes, borrow the space you need from these fields
+	(void * can safely be replaced with struct * ;)
+	P L E A S E ! ! !
+	IMPORTANT: Never change order of already declared fields!!!
+    */
+    unsigned long long int
+	    ull_res0,ull_res1,ull_res2,ull_res3,ull_res4,ull_res5,
+	    ull_res6,ull_res7,ull_res8,ull_res9,ull_res10,ull_res11,ull_res12;
+    float
+	    flt_res0,flt_res1,flt_res2,flt_res3,flt_res4,flt_res5,
+	    flt_res6,flt_res7,flt_res8,flt_res9,flt_res10,flt_res11;
+    void
+	    *ptr_res0,*ptr_res1,*ptr_res2,*ptr_res3,*ptr_res4,*ptr_res5,
+	    *ptr_res6,*ptr_res7,*ptr_res8,*ptr_res9,*ptr_res10,*ptr_res11;
+    unsigned long int
+	    ul_res0,ul_res1,ul_res2,ul_res3,ul_res4,ul_res5,
+	    ul_res6,ul_res7,ul_res8,ul_res9,ul_res10,ul_res11,ul_res12;
+    unsigned int
+	    ui_res0,ui_res1,ui_res2,ui_res3,ui_res4,ui_res5,
+	    ui_res6;
+    unsigned short int
+	    us_res0,us_res1,us_res2,us_res3,us_res4,us_res5,
+	    us_res6,us_res7,us_res8,us_res9,us_res10,us_res11,us_res12;
+    unsigned char
+	    uc_res0,uc_res1,uc_res2,uc_res3,uc_res4,uc_res5,
+	    uc_res6,uc_res7,uc_res8,uc_res9,uc_res10,uc_res11,uc_res12;    
 } AVCodecContext;
 
 typedef struct AVCodec {
@@ -177,6 +292,23 @@ typedef struct AVCodec {
                   UINT8 *buf, int buf_size);
     int capabilities;
     struct AVCodec *next;
+    /*
+	Note: the fields below are reserved for future use.
+	They are required to keep the ABI stable !!!
+	If you make changes, borrow the space you need from these fields
+	(void * can safely be replaced with struct * ;)
+	P L E A S E ! ! !
+	IMPORTANT: Never change order of already declared fields!!!
+    */
+    unsigned long long int
+	    ull_res0,ull_res1,ull_res2,ull_res3,ull_res4,ull_res5,
+	    ull_res6,ull_res7,ull_res8,ull_res9,ull_res10,ull_res11,ull_res12;
+    float
+	    flt_res0,flt_res1,flt_res2,flt_res3,flt_res4,flt_res5,
+	    flt_res6,flt_res7,flt_res8,flt_res9,flt_res10,flt_res11,flt_res12;
+    void
+	    *ptr_res0,*ptr_res1,*ptr_res2,*ptr_res3,*ptr_res4,*ptr_res5,
+	    *ptr_res6,*ptr_res7,*ptr_res8,*ptr_res9,*ptr_res10,*ptr_res11,*ptr_res12;
 } AVCodec;
 
 /* three components are given, that's all */
@@ -185,15 +317,47 @@ typedef struct AVPicture {
     int linesize[3];
 } AVPicture;
 
+extern AVCodec ac3_encoder;
+extern AVCodec mp2_encoder;
+extern AVCodec mp3lame_encoder;
+extern AVCodec mpeg1video_encoder;
+extern AVCodec h263_encoder;
+extern AVCodec h263p_encoder;
+extern AVCodec rv10_encoder;
+extern AVCodec mjpeg_encoder;
+extern AVCodec mpeg4_encoder;
+extern AVCodec msmpeg4v1_encoder;
+extern AVCodec msmpeg4v2_encoder;
+extern AVCodec msmpeg4v3_encoder;
+
 extern AVCodec h263_decoder;
 extern AVCodec mpeg4_decoder;
 extern AVCodec msmpeg4v1_decoder;
 extern AVCodec msmpeg4v2_decoder;
 extern AVCodec msmpeg4v3_decoder;
+extern AVCodec wmv1_decoder;
 extern AVCodec mpeg_decoder;
 extern AVCodec h263i_decoder;
 extern AVCodec rv10_decoder;
 extern AVCodec mjpeg_decoder;
+extern AVCodec mp2_decoder;
+extern AVCodec mp3_decoder;
+
+/* pcm codecs */
+#define PCM_CODEC(id, name) \
+extern AVCodec name ## _decoder; \
+extern AVCodec name ## _encoder;
+
+PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le);
+PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be);
+PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le);
+PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be);
+PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8);
+PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8);
+PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw);
+PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw);
+
+#undef PCM_CODEC
 
 /* dummy raw video codec */
 extern AVCodec rawvideo_codec;
@@ -242,8 +406,14 @@ int avpicture_deinterlace(AVPicture *dst, AVPicture *src,
 
 extern AVCodec *first_avcodec;
 
+/* returns LIBAVCODEC_VERSION_INT constant */
+unsigned avcodec_version(void);
+/* returns LIBAVCODEC_BUILD constant */
+unsigned avcodec_build(void);
 void avcodec_init(void);
 
+void avcodec_set_bit_exact(void);
+
 void register_avcodec(AVCodec *format);
 AVCodec *avcodec_find_encoder(enum CodecID id);
 AVCodec *avcodec_find_encoder_by_name(const char *name);
@@ -267,12 +437,87 @@ int avcodec_close(AVCodecContext *avctx);
 
 void avcodec_register_all(void);
 
+void avcodec_flush_buffers(AVCodecContext *avctx);
+
 #ifdef FF_POSTPROCESS
-#ifndef MBC
-#define MBC 48
-#define MBR 36
-#endif
 extern int quant_store[MBR+1][MBC+1]; // [Review]
 #endif
 
+
+/**
+ * Interface for 0.5.0 version
+ *
+ * do not even think about its usage for the moment
+ */
+
+typedef struct {
+    // compressed size used from given memory buffer
+    int size;
+    /// I/P/B frame type
+    int frame_type;
+} avc_enc_result_t;
+
+/**
+ * Commands
+ * order can't be changed once it has been defined
+ */
+typedef enum {
+    // general commands
+    AVC_OPEN_BY_NAME = 0xACA000,
+    AVC_OPEN_BY_CODEC_ID,
+    AVC_OPEN_BY_FOURCC,
+    AVC_CLOSE,
+
+    AVC_FLUSH,
+    // pin - struct { uint8_t* src, uint_t src_size }
+    // pout - struct { AVPicture* img, consumed_bytes, ... }
+    AVC_DECODE,
+    // pin - struct { AVPicture* img, uint8_t* dest, uint_t dest_size }
+    // pout - uint_t used_from_dest_size
+    AVC_ENCODE, 
+
+    // query/get video commands
+    AVC_GET_VERSION = 0xACB000,
+    AVC_GET_WIDTH,
+    AVC_GET_HEIGHT,
+    AVC_GET_DELAY,
+    AVC_GET_QUANT_TABLE,
+    // ...
+
+    // query/get audio commands - NOTE(review): 0xABC000 breaks the 0xACA000..0xACE000 numbering; was 0xACC000 intended? confirm
+    AVC_GET_FRAME_SIZE = 0xABC000,
+
+    // maybe define some simple structures which
+    // might be passed to the user - but they can't
+    // contain any codec specific parts and these
+    // calls are usually necessary only a few times
+
+    // set video commands
+    AVC_SET_WIDTH = 0xACD000,
+    AVC_SET_HEIGHT,
+
+    // set video encoding commands
+    AVC_SET_FRAME_RATE = 0xACD800,
+    AVC_SET_QUALITY,
+    AVC_SET_HURRY_UP,
+
+    // set audio commands
+    AVC_SET_SAMPLE_RATE = 0xACE000,
+    AVC_SET_CHANNELS,
+
+} avc_cmd_t;
+
+/**
+ * \param handle  private structure allocated by libavcodec;
+ *                for initialization pass NULL - it will be returned in pout;
+ *                the user is supposed to know nothing about its structure
+ * \param cmd     type of operation to be performed
+ * \param pin     input parameter
+ * \param pout    output parameter
+ *
+ * \returns  command status - for query commands it may instead return
+ * the resulting integer value
+ */
+int avcodec(void* handle, avc_cmd_t cmd, void* pin, void* pout);
+
 #endif /* AVCODEC_H */
diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c
index f7fe2e1d1..571de1afc 100644
--- a/src/libffmpeg/libavcodec/common.c
+++ b/src/libffmpeg/libavcodec/common.c
@@ -1,25 +1,24 @@
 /*
  * Common bit i/o utils
- * Copyright (c) 2000, 2001 Gerard Lantau.
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "common.h"
-#include <math.h>
 
 void init_put_bits(PutBitContext *s, 
                    UINT8 *buffer, int buffer_size,
@@ -108,6 +107,15 @@ void jflush_put_bits(PutBitContext *s)
 }
 #endif
 
+void put_string(PutBitContext * pbc, char *s) /* writes s and its terminating NUL, 8 bits per char */
+{
+    while(*s){
+        put_bits(pbc, 8, (unsigned char)*s); /* keep value < 256 even where plain char is signed */
+        s++;
+    }
+    put_bits(pbc, 8, 0);
+}
+
 /* bit input functions */
 
 void init_get_bits(GetBitContext *s, 
@@ -166,6 +174,9 @@ unsigned int get_bits_long(GetBitContext *s, int n)
                 (buf_ptr[-2] << 8) |
                 (buf_ptr[-1]);	    
 #endif
+            val |= bit_buf >> (32 + bit_cnt);
+            bit_buf <<= - bit_cnt;
+            bit_cnt += 32;
         } else {
             buf_ptr -= 4;
             bit_buf = 0;
@@ -177,11 +188,13 @@ unsigned int get_bits_long(GetBitContext *s, int n)
                 bit_buf |= *buf_ptr++ << 8;
             if (buf_ptr < s->buf_end)
                 bit_buf |= *buf_ptr++;
+
+            val |= bit_buf >> (32 + bit_cnt);
+            bit_buf <<= - bit_cnt;
+            bit_cnt += 8*(buf_ptr - s->buf_ptr);
+            if(bit_cnt<0) bit_cnt=0;
         }
         s->buf_ptr = buf_ptr;
-        val |= bit_buf >> (32 + bit_cnt);
-        bit_buf <<= - bit_cnt;
-        bit_cnt += 32;
     }
     s->bit_buf = bit_buf;
     s->bit_cnt = bit_cnt;
@@ -349,7 +362,7 @@ static int build_table(VLC *vlc, int table_nb_bits,
 #endif
                     if (table_bits[j] != 0) {
                         fprintf(stderr, "incorrect codes\n");
-                        abort();
+                        exit(1);
                     }
                     table_bits[j] = n;
                     table_codes[j] = i;
@@ -435,10 +448,8 @@ int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
                     bits, bits_wrap, bits_size,
                     codes, codes_wrap, codes_size,
                     0, 0) < 0) {
-        if (vlc->table_bits)
-            free(vlc->table_bits);
-        if (vlc->table_codes)
-            free(vlc->table_codes);
+        av_free(vlc->table_bits);
+        av_free(vlc->table_codes);
         return -1;
     }
     return 0;
@@ -447,7 +458,11 @@ int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
 
 void free_vlc(VLC *vlc)
 {
-    free(vlc->table_bits);
-    free(vlc->table_codes);
+    av_free(vlc->table_bits);
+    av_free(vlc->table_codes);
 }
 
+int ff_gcd(int a, int b){ /* Euclid's algorithm, iterative form; returns a when b is 0 */
+    while(b){ int rem= a%b; a= b; b= rem; }
+    return a;
+}
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 9c7b086d8..24bd367d6 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -1,8 +1,6 @@
 #ifndef COMMON_H
 #define COMMON_H
 
-#undef DEBUG
-
 #define FFMPEG_VERSION_INT 0x000406
 #define FFMPEG_VERSION     "0.4.6"
 
@@ -19,18 +17,19 @@
 
 #ifdef HAVE_AV_CONFIG_H
 /* only include the following when compiling package */
-#include "../config.h"
+#include "config.h"
 
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <errno.h>
+#include <math.h>
 
 #ifndef ENODATA
 #define ENODATA  61
 #endif
 
-#endif
+#endif /* HAVE_AV_CONFIG_H */
 
 #ifdef CONFIG_WIN32
 
@@ -51,6 +50,8 @@ typedef UINT16 uint16_t;
 typedef INT16 int16_t;
 typedef UINT32 uint32_t;
 typedef INT32 int32_t;
+typedef UINT64 uint64_t;
+typedef INT64 int64_t;
 
 #ifndef __MINGW32__
 #define INT64_C(c)     (c ## i64)
@@ -58,14 +59,6 @@ typedef INT32 int32_t;
 
 #define inline __inline
 
-/*
-  Disable warning messages:
-    warning C4244: '=' : conversion from 'double' to 'float', possible loss of data
-    warning C4305: 'argument' : truncation from 'const double' to 'float'
-*/
-#pragma warning( disable : 4244 )
-#pragma warning( disable : 4305 )
-
 #else
 #define INT64_C(c)     (c ## LL)
 #define UINT64_C(c)    (c ## ULL)
@@ -78,22 +71,9 @@ typedef INT32 int32_t;
 #define DEBUG
 #endif
 
-// code from bits/byteswap.h (C) 1997, 1998 Free Software Foundation, Inc.
-#define bswap_32(x) \
-     ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) | \
-      (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
-#define be2me_32(x) bswap_32(x)
-
 #define snprintf _snprintf
 
-#ifndef __MINGW32__
-/* no config.h with VC */
-#define CONFIG_ENCODERS 1
-#define CONFIG_DECODERS 1
-#define CONFIG_AC3      1
-#endif
-
-#else
+#else /* CONFIG_WIN32 */
 
 /* unix */
 
@@ -112,8 +92,6 @@ typedef signed char INT8;
 typedef signed int INT32;
 typedef signed long long INT64;
 
-#include "xine-engine/bswap.h"
-
 #ifdef HAVE_AV_CONFIG_H
 
 #ifdef __FreeBSD__
@@ -133,10 +111,19 @@ typedef signed long long INT64;
 
 #endif /* !CONFIG_WIN32 */
 
+#include "bswap.h"
 
-/* debug stuff */
 #ifdef HAVE_AV_CONFIG_H
 
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+    defined(__OS2__) || defined (__OpenBSD__)
+#define MANGLE(a) "_" #a
+#else
+#define MANGLE(a) #a
+#endif
+
+/* debug stuff */
+
 #ifndef DEBUG
 #define NDEBUG
 #endif
@@ -150,11 +137,7 @@ inline void dprintf(const char* fmt,...) {}
 #else
 
 #ifdef DEBUG
-#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95) || !defined(__GNUC__)
-#define dprintf(...)	     printf(__VA_ARGS__)
-#else
 #define dprintf(fmt,args...) printf(fmt, ## args)
-#endif
 #else
 #define dprintf(fmt,args...)
 #endif
@@ -163,6 +146,14 @@ inline void dprintf(const char* fmt,...) {}
 
 #endif /* HAVE_AV_CONFIG_H */
 
+#define av_abort()      do { fprintf(stderr, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)
+
+/* assume b>0 */
+#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
+#define ABS(a) ((a) >= 0 ? (a) : (-(a)))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MIN(a,b) ((a) > (b) ? (b) : (a))
+
 /* bit output */
 
 struct PutBitContext;
@@ -189,6 +180,7 @@ void init_put_bits(PutBitContext *s,
 INT64 get_bit_count(PutBitContext *s); /* XXX: change function name */
 void align_put_bits(PutBitContext *s);
 void flush_put_bits(PutBitContext *s);
+void put_string(PutBitContext * pbc, char *s);
 
 /* jpeg specific put_bits */
 void jflush_put_bits(PutBitContext *s);
@@ -250,7 +242,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 #endif
     //    printf("put_bits=%d %x\n", n, value);
     assert(n == 32 || value < (1U << n));
-
+    
     bit_buf = s->bit_buf;
     bit_left = s->bit_left;
 
@@ -430,7 +422,6 @@ static inline void jput_bits(PutBitContext *s, int n, int value)
  }
 #endif
 
-
 static inline uint8_t* pbBufPtr(PutBitContext *s)
 {
 #ifdef ALT_BITSTREAM_WRITER
@@ -483,7 +474,6 @@ static inline unsigned int get_bits(GetBitContext *s, int n){
     }
     printf(" ");
 #endif
-    
     return result;
 #endif //!ALIGNED_BITSTREAM
 #else //ALT_BITSTREAM_READER
@@ -509,10 +499,10 @@ static inline unsigned int get_bits1(GetBitContext *s){
     result>>= 8 - 1;
     index++;
     s->index= index;
+    
 #ifdef DUMP_STREAM
     printf("%d ", result);
 #endif
-    
     return result;
 #else
     if(s->bit_cnt>0){
@@ -888,7 +878,62 @@ static inline int mid_pred(int a, int b, int c)
     return a + b + c - vmin - vmax;
 }
 
+static inline int clip(int a, int amin, int amax)
+{
+    /* limit a to the range [amin, amax]; the lower bound is tested
+     * first, so it takes precedence when a is below amin */
+    int bounded = a;
+    if (a < amin)       bounded = amin;
+    else if (a > amax)  bounded = amax;
+    return bounded;
+}
+
 /* memory */
+void *av_malloc(int size);
 void *av_mallocz(int size);
+void av_free(void *ptr);
+void __av_freep(void **ptr);
+#define av_freep(p) __av_freep((void **)(p))
+
+/* math */
+int ff_gcd(int a, int b);
+
+static inline int ff_sqrt(int a)
+{
+    int ret=0;       /* root built so far, one bit per iteration */
+    int s;
+    int ret_sq=0;    /* square of ret */
+    /* integer square root, rounded down; returns 0 for a <= 0 */
+    for(s=15; s>=0; s--){
+        unsigned b= ret_sq + (1u<<(s*2)) + ((unsigned)ret<<s)*2; /* (ret + 2^s)^2, may exceed INT_MAX so kept unsigned */
+        if(a>=0 && b<=(unsigned)a){
+            ret_sq=b;
+            ret+= 1<<s;
+        }
+    }
+    return ret;
+}
+#define RUNTIME_CPUDETECT
+
+#if __CPU__ >= 686 && !defined(RUNTIME_CPUDETECT)
+#define COPY3_IF_LT(x,y,a,b,c,d)\
+asm volatile (\
+    "cmpl %0, %3	\n\t"\
+    "cmovl %3, %0	\n\t"\
+    "cmovl %4, %1	\n\t"\
+    "cmovl %5, %2	\n\t"\
+    : "+r" (x), "+r" (a), "+r" (c)\
+    : "r" (y), "r" (b), "r" (d)\
+);
+#else
+#define COPY3_IF_LT(x,y,a,b,c,d)\
+if((y)<(x)){\
+     (x)=(y);\
+     (a)=(b);\
+     (c)=(d);\
+}
+#endif
+
+#define CLAMP_TO_8BIT(d) (((d) > 0xff) ? 0xff : ((d) < 0) ? 0 : (d)) /* clamp to 0..255; d is evaluated more than once */
 
 #endif
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index dcfad05a5..945b7cc9d 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -1,32 +1,33 @@
 /*
  * DSP utils
- * Copyright (c) 2000, 2001 Gerard Lantau.
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
- * gmc & q-pel support by Michael Niedermayer <michaelni@gmx.at>
+ * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
 #include "avcodec.h"
 #include "dsputil.h"
 #include "simple_idct.h"
 
 void (*ff_idct)(DCTELEM *block);
+void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
+void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
+void (*av_fdct)(DCTELEM *block);
 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
+void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
@@ -45,8 +46,10 @@ op_pixels_abs_func pix_abs8x8_xy2;
 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
 UINT32 squareTbl[512];
 
-extern UINT16 default_intra_matrix[64];
-extern UINT16 default_non_intra_matrix[64];
+extern INT16 default_intra_matrix[64];
+extern INT16 default_non_intra_matrix[64];
+extern INT16 ff_mpeg4_default_intra_matrix[64];
+extern INT16 ff_mpeg4_default_non_intra_matrix[64];
 
 UINT8 zigzag_direct[64] = {
     0, 1, 8, 16, 9, 2, 3, 10,
@@ -87,6 +90,8 @@ UINT8 ff_alternate_vertical_scan[64] = {
     38, 46, 54, 62, 39, 47, 55, 63,
 };
 
+#ifdef SIMPLE_IDCT
+
 /* Input permutation for the simple_idct_mmx */
 static UINT8 simple_mmx_permutation[64]={
 	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
@@ -98,6 +103,7 @@ static UINT8 simple_mmx_permutation[64]={
 	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
 	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 };
+#endif
 
 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
 UINT32 inverse[256]={
@@ -141,7 +147,7 @@ UINT8 zigzag_end[64];
 UINT8 permutation[64];
 //UINT8 invPermutation[64];
 
-static void build_zigzag_end()
+static void build_zigzag_end(void)
 {
     int lastIndex;
     int lastIndexAfterPerm=0;
@@ -176,6 +182,28 @@ void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
     }
 }
 
+void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
+    /*
+     * Store the 8x8 block of per-pixel differences s1 - s2 into block.
+     * Both sources advance by stride elements per row; the destination
+     * is written as 8 consecutive rows of 8 DCTELEMs.
+     */
+    int row, col;
+
+    for (row = 0; row < 8; row++) {
+        const UINT8 *cur = s1 + row * stride;
+        const UINT8 *ref = s2 + row * stride;
+        DCTELEM *out = block + row * 8;
+
+        for (col = 0; col < 8; col++) {
+            /* plain element-wise subtraction, no clamping */
+            out[col] = cur[col] - ref[col];
+        }
+    }
+    /* block now holds 64 differences in row-major order */
+}
+
+
 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 {
     const DCTELEM *p;
@@ -224,6 +252,358 @@ void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
     }
 }
 
+#ifdef __GNUC__
+
+struct unaligned_64 { uint64_t l; } __attribute__((packed));
+struct unaligned_32 { uint32_t l; } __attribute__((packed));
+
+#define LD32(a) (((const struct unaligned_32 *) (a))->l)
+#define LD64(a) (((const struct unaligned_64 *) (a))->l)
+
+#else /* __GNUC__ */
+
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+
+#endif /* !__GNUC__ */
+
+#if 0
+
+#define PIXOP2(OPNAME, OP) \
+void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint64_t*)block), LD64(pixels));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+\
+void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        const uint64_t a= LD64(pixels  );\
+        const uint64_t b= LD64(pixels+1);\
+        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+\
+void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        const uint64_t a= LD64(pixels  );\
+        const uint64_t b= LD64(pixels+1);\
+        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+\
+void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        const uint64_t a= LD64(pixels          );\
+        const uint64_t b= LD64(pixels+line_size);\
+        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+\
+void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        const uint64_t a= LD64(pixels          );\
+        const uint64_t b= LD64(pixels+line_size);\
+        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+\
+void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+        int i;\
+        const uint64_t a= LD64(pixels  );\
+        const uint64_t b= LD64(pixels+1);\
+        uint64_t l0=  (a&0x0303030303030303ULL)\
+                    + (b&0x0303030303030303ULL)\
+                    + 0x0202020202020202ULL;\
+        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
+        uint64_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint64_t a= LD64(pixels  );\
+            uint64_t b= LD64(pixels+1);\
+            l1=  (a&0x0303030303030303ULL)\
+               + (b&0x0303030303030303ULL);\
+            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
+            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= LD64(pixels  );\
+            b= LD64(pixels+1);\
+            l0=  (a&0x0303030303030303ULL)\
+               + (b&0x0303030303030303ULL)\
+               + 0x0202020202020202ULL;\
+            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
+            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+}\
+\
+void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+        int i;\
+        const uint64_t a= LD64(pixels  );\
+        const uint64_t b= LD64(pixels+1);\
+        uint64_t l0=  (a&0x0303030303030303ULL)\
+                    + (b&0x0303030303030303ULL)\
+                    + 0x0101010101010101ULL;\
+        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
+        uint64_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint64_t a= LD64(pixels  );\
+            uint64_t b= LD64(pixels+1);\
+            l1=  (a&0x0303030303030303ULL)\
+               + (b&0x0303030303030303ULL);\
+            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
+            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= LD64(pixels  );\
+            b= LD64(pixels+1);\
+            l0=  (a&0x0303030303030303ULL)\
+               + (b&0x0303030303030303ULL)\
+               + 0x0101010101010101ULL;\
+            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
+            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+}\
+\
+void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    OPNAME ## _pixels,\
+    OPNAME ## _pixels_x2,\
+    OPNAME ## _pixels_y2,\
+    OPNAME ## _pixels_xy2,\
+};\
+\
+void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    OPNAME ## _pixels,\
+    OPNAME ## _no_rnd_pixels_x2,\
+    OPNAME ## _no_rnd_pixels_y2,\
+    OPNAME ## _no_rnd_pixels_xy2,\
+};
+
+#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
+#else /* 32 bit variant (the disabled branch above is the 64 bit one) */
+
+#define PIXOP2(OPNAME, OP) \
+void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
+        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
+        pixels+=line_size;\
+        block +=line_size;\
+    }\
+}\
+\
+void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        int j;\
+        for(j=0; j<2; j++){\
+            const uint32_t a= LD32(pixels  );\
+            const uint32_t b= LD32(pixels+1);\
+            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
+            pixels+=4;\
+            block +=4;\
+        }\
+        pixels+=line_size-8;\
+        block +=line_size-8;\
+    }\
+}\
+\
+void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        int j;\
+        for(j=0; j<2; j++){\
+            const uint32_t a= LD32(pixels  );\
+            const uint32_t b= LD32(pixels+1);\
+            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
+            pixels+=4;\
+            block +=4;\
+        }\
+        pixels+=line_size-8;\
+        block +=line_size-8;\
+    }\
+}\
+\
+void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        int j;\
+        for(j=0; j<2; j++){\
+            const uint32_t a= LD32(pixels          );\
+            const uint32_t b= LD32(pixels+line_size);\
+            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
+            pixels+=4;\
+            block +=4;\
+        }\
+        pixels+=line_size-8;\
+        block +=line_size-8;\
+    }\
+}\
+\
+void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int i;\
+    for(i=0; i<h; i++){\
+        int j;\
+        for(j=0; j<2; j++){\
+            const uint32_t a= LD32(pixels          );\
+            const uint32_t b= LD32(pixels+line_size);\
+            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
+            pixels+=4;\
+            block +=4;\
+        }\
+        pixels+=line_size-8;\
+        block +=line_size-8;\
+    }\
+}\
+\
+void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int j;\
+    for(j=0; j<2; j++){\
+        int i;\
+        const uint32_t a= LD32(pixels  );\
+        const uint32_t b= LD32(pixels+1);\
+        uint32_t l0=  (a&0x03030303UL)\
+                    + (b&0x03030303UL)\
+                    + 0x02020202UL;\
+        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+                   + ((b&0xFCFCFCFCUL)>>2);\
+        uint32_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint32_t a= LD32(pixels  );\
+            uint32_t b= LD32(pixels+1);\
+            l1=  (a&0x03030303UL)\
+               + (b&0x03030303UL);\
+            h1= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= LD32(pixels  );\
+            b= LD32(pixels+1);\
+            l0=  (a&0x03030303UL)\
+               + (b&0x03030303UL)\
+               + 0x02020202UL;\
+            h0= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+        pixels+=4-line_size*(h+1);\
+        block +=4-line_size*h;\
+    }\
+}\
+\
+void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
+{\
+    int j;\
+    for(j=0; j<2; j++){\
+        int i;\
+        const uint32_t a= LD32(pixels  );\
+        const uint32_t b= LD32(pixels+1);\
+        uint32_t l0=  (a&0x03030303UL)\
+                    + (b&0x03030303UL)\
+                    + 0x01010101UL;\
+        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+                   + ((b&0xFCFCFCFCUL)>>2);\
+        uint32_t l1,h1;\
+\
+        pixels+=line_size;\
+        for(i=0; i<h; i+=2){\
+            uint32_t a= LD32(pixels  );\
+            uint32_t b= LD32(pixels+1);\
+            l1=  (a&0x03030303UL)\
+               + (b&0x03030303UL);\
+            h1= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+            a= LD32(pixels  );\
+            b= LD32(pixels+1);\
+            l0=  (a&0x03030303UL)\
+               + (b&0x03030303UL)\
+               + 0x01010101UL;\
+            h0= ((a&0xFCFCFCFCUL)>>2)\
+              + ((b&0xFCFCFCFCUL)>>2);\
+            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
+            pixels+=line_size;\
+            block +=line_size;\
+        }\
+        pixels+=4-line_size*(h+1);\
+        block +=4-line_size*h;\
+    }\
+}\
+\
+void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    OPNAME ## _pixels,\
+    OPNAME ## _pixels_x2,\
+    OPNAME ## _pixels_y2,\
+    OPNAME ## _pixels_xy2,\
+};\
+\
+void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
+    OPNAME ## _pixels,\
+    OPNAME ## _no_rnd_pixels_x2,\
+    OPNAME ## _no_rnd_pixels_y2,\
+    OPNAME ## _no_rnd_pixels_xy2,\
+};
+#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
+#endif
+
+#define op_put(a, b) a = b
+
+PIXOP2(avg, op_avg)
+PIXOP2(put, op_put)
+#undef op_avg
+#undef op_put
+
+#if 0
+/* FIXME this stuff could be removed as it's not really used anymore */
 #define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
                                                                                          \
 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
@@ -323,18 +703,13 @@ void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_siz
     OPNAME ## _pixels_xy2,                                                               \
 };
 
-
 /* rounding primitives */
 #define avg2(a,b) ((a+b+1)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 
-#define op_put(a, b) a = b
 #define op_avg(a, b) a = avg2(a, b)
 #define op_sub(a, b) a -= b
 
-PIXOP(UINT8, put, op_put, line_size)
-PIXOP(UINT8, avg, op_avg, line_size)
-
 PIXOP(DCTELEM, sub, op_sub, 8)
 
 /* not rounding primitives */
@@ -343,13 +718,12 @@ PIXOP(DCTELEM, sub, op_sub, 8)
 #define avg2(a,b) ((a+b)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
 
-PIXOP(UINT8, put_no_rnd, op_put, line_size)
-PIXOP(UINT8, avg_no_rnd, op_avg, line_size)
-
 /* motion estimation */
 
 #undef avg2
 #undef avg4
+#endif
+
 #define avg2(a,b) ((a+b+1)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 
@@ -872,6 +1246,20 @@ void clear_blocks_c(DCTELEM *blocks)
     memset(blocks, 0, sizeof(DCTELEM)*6*64);
 }
 
+/* XXX: these functions should be removed as soon as all IDCTs are
+   converted */
+void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    ff_idct (block);
+    put_pixels_clamped(block, dest, line_size);
+}
+
+void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    ff_idct (block);
+    add_pixels_clamped(block, dest, line_size);
+}
+
 void dsputil_init(void)
 {
     int i, j;
@@ -888,11 +1276,12 @@ void dsputil_init(void)
     }
 
 #ifdef SIMPLE_IDCT
-    ff_idct = simple_idct;
+    ff_idct = NULL;
 #else
     ff_idct = j_rev_dct;
 #endif
     get_pixels = get_pixels_c;
+    diff_pixels = diff_pixels_c;
     put_pixels_clamped = put_pixels_clamped_c;
     add_pixels_clamped = add_pixels_clamped_c;
     gmc1= gmc1_c;
@@ -906,7 +1295,7 @@ void dsputil_init(void)
     pix_abs8x8_x2  = pix_abs8x8_x2_c;
     pix_abs8x8_y2  = pix_abs8x8_y2_c;
     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
-    av_fdct = jpeg_fdct_ifast;
+    av_fdct = fdct_ifast;
 
     use_permuted_idct = 1;
 
@@ -925,9 +1314,16 @@ void dsputil_init(void)
     use_permuted_idct = 0;
 #endif
 
-#ifdef SIMPLE_IDCT
-    if(ff_idct == simple_idct) use_permuted_idct=0;
-#endif
+//#ifdef SIMPLE_IDCT
+    if (ff_idct == NULL) {
+        ff_idct_put = simple_idct_put;
+        ff_idct_add = simple_idct_add;
+        use_permuted_idct=0;
+    } else {
+        ff_idct_put = gen_idct_put;
+        ff_idct_add = gen_idct_add;
+    }
+//#endif
 
     if(use_permuted_idct)
 #ifdef SIMPLE_IDCT
@@ -953,11 +1349,21 @@ void dsputil_init(void)
         }
         block_permute(default_intra_matrix);
         block_permute(default_non_intra_matrix);
+        block_permute(ff_mpeg4_default_intra_matrix);
+        block_permute(ff_mpeg4_default_non_intra_matrix);
     }
     
     build_zigzag_end();
 }
 
+/* disable any non-bit-exact operations (for testing purposes) */
+void avcodec_set_bit_exact(void)
+{
+#ifdef HAVE_MMX
+    dsputil_set_bit_exact_mmx();
+#endif
+}
+
 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
               int orig_linesize[3], int coded_linesize,
               AVCodecContext *avctx)
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index dc63f06f1..b7b7e999c 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -1,21 +1,39 @@
+/*
+ * DSP utils
+ * Copyright (c) 2000, 2001, 2002 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
 #ifndef DSPUTIL_H
 #define DSPUTIL_H
 
 #include "common.h"
 #include "avcodec.h"
+#include "xineutils.h"
 
 #undef DEBUG
-//#define DEBUG
 /* dct code */
 typedef short DCTELEM;
 
-void jpeg_fdct_ifast (DCTELEM *data);
+void fdct_ifast (DCTELEM *data);
 
 void j_rev_dct (DCTELEM *data);
 
 void fdct_mmx(DCTELEM *block);
 
-void (*av_fdct)(DCTELEM *block);
+extern void (*av_fdct)(DCTELEM *block);
 
 /* encoding scans */
 extern UINT8 ff_alternate_horizontal_scan[64];
@@ -37,7 +55,10 @@ void dsputil_init(void);
 /* pixel ops : interface with DCT */
 
 extern void (*ff_idct)(DCTELEM *block);
+extern void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
+extern void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
 extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
+extern void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
 extern void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
@@ -45,6 +66,7 @@ extern void (*clear_blocks)(DCTELEM *blocks);
 
 
 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size);
+void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
 void clear_blocks_c(DCTELEM *blocks);
@@ -60,13 +82,6 @@ extern op_pixels_func avg_no_rnd_pixels_tab[4];
 extern qpel_mc_func qpel_mc_rnd_tab[16];
 extern qpel_mc_func qpel_mc_no_rnd_tab[16];
 
-
-/* sub pixel (encoding) */
-extern void (*sub_pixels_tab[4])(DCTELEM *block, const UINT8 *pixels, int line_size, int h);
-
-#define sub_pixels_2(block, pixels, line_size, dxy) \
-   sub_pixels_tab[dxy](block, pixels, line_size, 8)
-
 /* motion estimation */
 
 typedef int (*op_pixels_abs_func)(UINT8 *blk1, UINT8 *blk2, int line_size);
@@ -91,9 +106,9 @@ static inline int block_permute_op(int j)
 }
 
 void block_permute(INT16 *block);
-
+          
 #if defined(ARCH_X86)
-#define HAVE_MMX
+#define HAVE_MMX 1 
 #endif
 
 #if defined(HAVE_MMX)
@@ -108,10 +123,10 @@ void block_permute(INT16 *block);
 
 extern int mm_flags;
 
-/* int mm_support(void); */
+/*int mm_support(void);*/
 #define mm_support() xine_mm_accel()
 
-#if 0
+#if 0 
 static inline void emms(void)
 {
     __asm __volatile ("emms;":::"memory");
@@ -127,6 +142,7 @@ static inline void emms(void)
 #define __align8 __attribute__ ((aligned (8)))
 
 void dsputil_init_mmx(void);
+void dsputil_set_bit_exact_mmx(void);
 
 #elif defined(ARCH_ARMV4L)
 
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index a8d04d58a..decddd344 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -1,25 +1,27 @@
 /*
  * H263/MPEG4 backend for ffmpeg encoder and decoder
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000,2001 Fabrice Bellard.
  * H263+ support.
  * Copyright (c) 2001 Juan J. Sierralta P.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
- * ac prediction encoding by Michael Niedermayer <michaelni@gmx.at>
+ * ac prediction encoding & b-frame support by Michael Niedermayer <michaelni@gmx.at>
  */
+ 
+//#define DEBUG
 #include "common.h"
 #include "dsputil.h"
 #include "avcodec.h"
@@ -28,24 +30,26 @@
 #include "mpeg4data.h"
 
 //rounded divison & shift
-#define RDIV(a,b) ((a) > 0 ? ((a)+((b)>>1))/(b) : ((a)-((b)>>1))/(b))
 #define RSHIFT(a,b) ((a) > 0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b))
-#define ABS(a) (((a)>=0)?(a):(-(a)))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+
+#define PRINT_MB_TYPE(a) ;
+//#define PRINT_MB_TYPE(a) printf(a);
 
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block,
 			      int n);
-static void h263_encode_motion(MpegEncContext * s, int val);
+static void h263_encode_motion(MpegEncContext * s, int val, int fcode);
 static void h263p_encode_umotion(MpegEncContext * s, int val);
 static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block,
-			       int n, int dc, UINT8 *scan_table);
+			       int n, int dc, UINT8 *scan_table, 
+                               PutBitContext *dc_pb, PutBitContext *ac_pb);
 static int h263_decode_motion(MpegEncContext * s, int pred, int fcode);
 static int h263p_decode_umotion(MpegEncContext * s, int pred);
 static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
                              int n, int coded);
-static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
+static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr);
+static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                               int n, int coded);
+static int h263_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr);
 static inline int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr);
 static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n,
                               int dir);
@@ -65,15 +69,15 @@ int h263_get_picture_format(int width, int height)
     int format;
 
     if (width == 128 && height == 96)
-	format = 1;
+        format = 1;
     else if (width == 176 && height == 144)
-	format = 2;
+        format = 2;
     else if (width == 352 && height == 288)
-	format = 3;
+        format = 3;
     else if (width == 704 && height == 576)
-	format = 4;
+        format = 4;
     else if (width == 1408 && height == 1152)
-	format = 5;
+        format = 5;
     else
         format = 7;
     return format;
@@ -128,7 +132,7 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
         put_bits(&s->pb, 1, s->umvplus); /* Unrestricted Motion Vector */
         put_bits(&s->pb,1,0); /* SAC: off */
         put_bits(&s->pb,1,0); /* Advanced Prediction Mode: off */
-        put_bits(&s->pb,1,0); /* Advanced Intra Coding: off */
+        put_bits(&s->pb,1,s->h263_aic); /* Advanced Intra Coding */
         put_bits(&s->pb,1,0); /* Deblocking Filter: off */
         put_bits(&s->pb,1,0); /* Slice Structured: off */
         put_bits(&s->pb,1,0); /* Reference Picture Selection: off */
@@ -142,7 +146,11 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
 		
         put_bits(&s->pb,1,0); /* Reference Picture Resampling: off */
         put_bits(&s->pb,1,0); /* Reduced-Resolution Update: off */
-        put_bits(&s->pb,1,0); /* Rounding Type */
+        if (s->pict_type == I_TYPE)
+            s->no_rounding = 0;
+        else
+            s->no_rounding ^= 1;
+        put_bits(&s->pb,1,s->no_rounding); /* Rounding Type */
         put_bits(&s->pb,2,0); /* Reserved */
         put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */
 		
@@ -152,6 +160,9 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
 		if (format == 7) {
             /* Custom Picture Format (CPFMT) */
 		
+	    if (s->aspect_ratio_info)
+            put_bits(&s->pb,4,s->aspect_ratio_info);
+	    else
             put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */
             put_bits(&s->pb,9,(s->width >> 2) - 1);
             put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */
@@ -252,78 +263,230 @@ void mpeg4_encode_mb(MpegEncContext * s,
 		    DCTELEM block[6][64],
 		    int motion_x, int motion_y)
 {
-    int cbpc, cbpy, i, cbp, pred_x, pred_y;
+    int cbpc, cbpy, i, pred_x, pred_y;
     int bits;
+    PutBitContext * const pb2    = s->data_partitioning                         ? &s->pb2    : &s->pb;
+    PutBitContext * const tex_pb = s->data_partitioning && s->pict_type!=B_TYPE ? &s->tex_pb : &s->pb;
+    PutBitContext * const dc_pb  = s->data_partitioning && s->pict_type!=I_TYPE ? &s->pb2    : &s->pb;
+    const int interleaved_stats= (s->flags&CODEC_FLAG_PASS1) && !s->data_partitioning ? 1 : 0;
     
     //    printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
     if (!s->mb_intra) {
         /* compute cbp */
-        cbp = 0;
+        int cbp = 0;
         for (i = 0; i < 6; i++) {
-        if (s->block_last_index[i] >= 0)
-            cbp |= 1 << (5 - i);
-        }
-        if ((cbp | motion_x | motion_y) == 0 && s->mv_type==MV_TYPE_16X16) {
-            /* skip macroblock */
-            put_bits(&s->pb, 1, 1);
-            s->misc_bits++;
-            s->last_bits++;
-            s->skip_count++;
-            return;
+            if (s->block_last_index[i] >= 0)
+                cbp |= 1 << (5 - i);
         }
-        put_bits(&s->pb, 1, 0);	/* mb coded */
-        if(s->mv_type==MV_TYPE_16X16){
-            cbpc = cbp & 3;
-            put_bits(&s->pb,
-                    inter_MCBPC_bits[cbpc],
-                    inter_MCBPC_code[cbpc]);
-            cbpy = cbp >> 2;
-            cbpy ^= 0xf;
-            put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
-                
-            bits= get_bit_count(&s->pb);
-            s->misc_bits+= bits - s->last_bits;
-            s->last_bits=bits;
 
-            /* motion vectors: 16x16 mode */
-            h263_pred_motion(s, 0, &pred_x, &pred_y);
-        
-            h263_encode_motion(s, motion_x - pred_x);
-            h263_encode_motion(s, motion_y - pred_y);
-        }else{
-            cbpc = (cbp & 3)+16;
-            put_bits(&s->pb,
-                    inter_MCBPC_bits[cbpc],
-                    inter_MCBPC_code[cbpc]);
-            cbpy = cbp >> 2;
-            cbpy ^= 0xf;
-            put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+        if(s->pict_type==B_TYPE){
+            static const int mb_type_table[8]= {-1, 2, 3, 1,-1,-1,-1, 0}; /* convert from mv_dir to type */
+            int mb_type=  mb_type_table[s->mv_dir];
+            
+            if(s->mb_x==0){
+                s->last_mv[0][0][0]= 
+                s->last_mv[0][0][1]= 
+                s->last_mv[1][0][0]= 
+                s->last_mv[1][0][1]= 0;
+            }
 
-            bits= get_bit_count(&s->pb);
-            s->misc_bits+= bits - s->last_bits;
-            s->last_bits=bits;
+            /* nothing to do if this MB was skipped in the next P Frame */
+            if(s->mbskip_table[s->mb_y * s->mb_width + s->mb_x]){
+                s->skip_count++;
+                s->mv[0][0][0]= 
+                s->mv[0][0][1]= 
+                s->mv[1][0][0]= 
+                s->mv[1][0][1]= 0;
+                s->mv_dir= MV_DIR_FORWARD; //doesnt matter
+                return;
+            }
 
-            for(i=0; i<4; i++){
-                /* motion vectors: 8x8 mode*/
-                h263_pred_motion(s, i, &pred_x, &pred_y);
+            if ((cbp | motion_x | motion_y | mb_type) ==0) {
+                /* direct MB with MV={0,0} */
+                put_bits(&s->pb, 1, 1); /* mb not coded modb1=1 */
 
-                h263_encode_motion(s, s->motion_val[ s->block_index[i] ][0] - pred_x);
-                h263_encode_motion(s, s->motion_val[ s->block_index[i] ][1] - pred_y);
+                if(interleaved_stats){
+                    s->misc_bits++;
+                    s->last_bits++;
+                }
+                s->skip_count++;
+                return;
             }
-        }
-        bits= get_bit_count(&s->pb);
-        s->mv_bits+= bits - s->last_bits;
-        s->last_bits=bits;
+            put_bits(&s->pb, 1, 0);	/* mb coded modb1=0 */
+            put_bits(&s->pb, 1, cbp ? 0 : 1); /* modb2 */ //FIXME merge
+            put_bits(&s->pb, mb_type+1, 1); // this table is so simple that we dont need it :)
+            if(cbp) put_bits(&s->pb, 6, cbp);
+            
+            if(cbp && mb_type)
+                put_bits(&s->pb, 1, 0); /* no q-scale change */
 
-        /* encode each block */
-        for (i = 0; i < 6; i++) {
-            mpeg4_encode_block(s, block[i], i, 0, zigzag_direct);
+            if(interleaved_stats){
+                bits= get_bit_count(&s->pb);
+                s->misc_bits+= bits - s->last_bits;
+                s->last_bits=bits;
+            }
+
+            switch(mb_type)
+            {
+            case 0: /* direct */
+                h263_encode_motion(s, motion_x, 1);
+                h263_encode_motion(s, motion_y, 1);                
+                break;
+            case 1: /* bidir */
+                h263_encode_motion(s, s->mv[0][0][0] - s->last_mv[0][0][0], s->f_code);
+                h263_encode_motion(s, s->mv[0][0][1] - s->last_mv[0][0][1], s->f_code);
+                h263_encode_motion(s, s->mv[1][0][0] - s->last_mv[1][0][0], s->b_code);
+                h263_encode_motion(s, s->mv[1][0][1] - s->last_mv[1][0][1], s->b_code);
+                s->last_mv[0][0][0]= s->mv[0][0][0];
+                s->last_mv[0][0][1]= s->mv[0][0][1];
+                s->last_mv[1][0][0]= s->mv[1][0][0];
+                s->last_mv[1][0][1]= s->mv[1][0][1];
+                break;
+            case 2: /* backward */
+                h263_encode_motion(s, motion_x - s->last_mv[1][0][0], s->b_code);
+                h263_encode_motion(s, motion_y - s->last_mv[1][0][1], s->b_code);
+                s->last_mv[1][0][0]= motion_x;
+                s->last_mv[1][0][1]= motion_y;
+                break;
+            case 3: /* forward */
+                h263_encode_motion(s, motion_x - s->last_mv[0][0][0], s->f_code);
+                h263_encode_motion(s, motion_y - s->last_mv[0][0][1], s->f_code);
+                s->last_mv[0][0][0]= motion_x;
+                s->last_mv[0][0][1]= motion_y;
+                break;
+            default:
+                printf("unknown mb type\n");
+                return;
+            }
+
+            if(interleaved_stats){
+                bits= get_bit_count(&s->pb);
+                s->mv_bits+= bits - s->last_bits;
+                s->last_bits=bits;
+            }
+
+            /* encode each block */
+            for (i = 0; i < 6; i++) {
+                mpeg4_encode_block(s, block[i], i, 0, zigzag_direct, NULL, &s->pb);
+            }
+
+            if(interleaved_stats){
+                bits= get_bit_count(&s->pb);
+                s->p_tex_bits+= bits - s->last_bits;
+                s->last_bits=bits;
+            }
+        }else{ /* s->pict_type!=B_TYPE */
+            if ((cbp | motion_x | motion_y) == 0 && s->mv_type==MV_TYPE_16X16) {
+                /* check if the B frames can skip it too, as we must skip it if we skip here;
+                   why didn't they just compress the skip-mb bits instead of reusing them?! */
+                if(s->max_b_frames>0){
+                    int i;
+                    int x,y, offset;
+                    uint8_t *p_pic;
+
+                    x= s->mb_x*16;
+                    y= s->mb_y*16;
+                    if(x+16 > s->width)  x= s->width-16;
+                    if(y+16 > s->height) y= s->height-16;
+
+                    offset= x + y*s->linesize;
+                    p_pic= s->new_picture[0] + offset;
+                    
+                    s->mb_skiped=1;
+                    for(i=0; i<s->max_b_frames; i++){
+                        uint8_t *b_pic;
+                        int diff;
+
+                        if(s->coded_order[i+1].pict_type!=B_TYPE) break;
+
+                        b_pic= s->coded_order[i+1].picture[0] + offset;
+                        diff= pix_abs16x16(p_pic, b_pic, s->linesize);
+                        if(diff>s->qscale*70){ //FIXME check that 70 is optimal
+                            s->mb_skiped=0;
+                            break;
+                        }
+                    }
+                }else
+                    s->mb_skiped=1; 
+
+                if(s->mb_skiped==1){
+                    /* skip macroblock */
+                    put_bits(&s->pb, 1, 1);
+
+                    if(interleaved_stats){
+                        s->misc_bits++;
+                        s->last_bits++;
+                    }
+                    s->skip_count++;
+                    return;
+                }
+            }
+
+            put_bits(&s->pb, 1, 0);	/* mb coded */
+            if(s->mv_type==MV_TYPE_16X16){
+                cbpc = cbp & 3;
+                put_bits(&s->pb,
+                        inter_MCBPC_bits[cbpc],
+                        inter_MCBPC_code[cbpc]);
+                cbpy = cbp >> 2;
+                cbpy ^= 0xf;
+                put_bits(pb2, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+                    
+                if(interleaved_stats){
+                    bits= get_bit_count(&s->pb);
+                    s->misc_bits+= bits - s->last_bits;
+                    s->last_bits=bits;
+                }
+
+                /* motion vectors: 16x16 mode */
+                h263_pred_motion(s, 0, &pred_x, &pred_y);
+            
+                h263_encode_motion(s, motion_x - pred_x, s->f_code);
+                h263_encode_motion(s, motion_y - pred_y, s->f_code);
+            }else{
+                cbpc = (cbp & 3)+16;
+                put_bits(&s->pb,
+                        inter_MCBPC_bits[cbpc],
+                        inter_MCBPC_code[cbpc]);
+                cbpy = cbp >> 2;
+                cbpy ^= 0xf;
+                put_bits(pb2, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+
+                if(interleaved_stats){
+                    bits= get_bit_count(&s->pb);
+                    s->misc_bits+= bits - s->last_bits;
+                    s->last_bits=bits;
+                }
+
+                for(i=0; i<4; i++){
+                    /* motion vectors: 8x8 mode*/
+                    h263_pred_motion(s, i, &pred_x, &pred_y);
+
+                    h263_encode_motion(s, s->motion_val[ s->block_index[i] ][0] - pred_x, s->f_code);
+                    h263_encode_motion(s, s->motion_val[ s->block_index[i] ][1] - pred_y, s->f_code);
+                }
+            }
+
+            if(interleaved_stats){ 
+                bits= get_bit_count(&s->pb);
+                s->mv_bits+= bits - s->last_bits;
+                s->last_bits=bits;
+            }
+
+            /* encode each block */
+            for (i = 0; i < 6; i++) {
+                mpeg4_encode_block(s, block[i], i, 0, zigzag_direct, NULL, tex_pb);
+            }
+
+            if(interleaved_stats){
+                bits= get_bit_count(&s->pb);
+                s->p_tex_bits+= bits - s->last_bits;
+                s->last_bits=bits;
+            }
+            s->p_count++;
         }
-        bits= get_bit_count(&s->pb);
-        s->p_tex_bits+= bits - s->last_bits;
-        s->last_bits=bits;
-        s->p_count++;
     } else {
+        int cbp;
         int dc_diff[6];   //dc values with the dc prediction subtracted 
         int dir[6];  //prediction direction
         int zigzag_last_index[6];
@@ -381,22 +544,26 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 inter_MCBPC_bits[cbpc + 4],
                 inter_MCBPC_code[cbpc + 4]);
         }
-        put_bits(&s->pb, 1, s->ac_pred);
+        put_bits(pb2, 1, s->ac_pred);
         cbpy = cbp >> 2;
-        put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+        put_bits(pb2, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
 
-        bits= get_bit_count(&s->pb);
-        s->misc_bits+= bits - s->last_bits;
-        s->last_bits=bits;
+        if(interleaved_stats){
+            bits= get_bit_count(&s->pb);
+            s->misc_bits+= bits - s->last_bits;
+            s->last_bits=bits;
+        }
 
         /* encode each block */
         for (i = 0; i < 6; i++) {
-            mpeg4_encode_block(s, block[i], i, dc_diff[i], scan_table[i]);
+            mpeg4_encode_block(s, block[i], i, dc_diff[i], scan_table[i], dc_pb, tex_pb);
         }
 
-        bits= get_bit_count(&s->pb);
-        s->i_tex_bits+= bits - s->last_bits;
-        s->last_bits=bits;
+        if(interleaved_stats){
+            bits= get_bit_count(&s->pb);
+            s->i_tex_bits+= bits - s->last_bits;
+            s->last_bits=bits;
+        }
         s->i_count++;
 
         /* restore ac coeffs & last_index stuff if we messed them up with the prediction */
@@ -425,76 +592,169 @@ void h263_encode_mb(MpegEncContext * s,
 		    int motion_x, int motion_y)
 {
     int cbpc, cbpy, i, cbp, pred_x, pred_y;
-   
-    //    printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
-   if (!s->mb_intra) {
-	   /* compute cbp */
-	   cbp = 0;
-	   for (i = 0; i < 6; i++) {
-	      if (s->block_last_index[i] >= 0)
-		   cbp |= 1 << (5 - i);
-	   }
-	   if ((cbp | motion_x | motion_y) == 0) {
-	      /* skip macroblock */
-	      put_bits(&s->pb, 1, 1);
-	      return;
-	   }
-	   put_bits(&s->pb, 1, 0);	/* mb coded */
-	   cbpc = cbp & 3;
-	   put_bits(&s->pb,
-		inter_MCBPC_bits[cbpc],
-		inter_MCBPC_code[cbpc]);
-	   cbpy = cbp >> 2;
-	   cbpy ^= 0xf;
-	   put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
-
-	   /* motion vectors: 16x16 mode only now */
-      h263_pred_motion(s, 0, &pred_x, &pred_y);
+    INT16 pred_dc;
+    INT16 rec_intradc[6];
+    UINT16 *dc_ptr[6];
+           
+    //printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
+    if (!s->mb_intra) {
+        /* compute cbp */
+        cbp = 0;
+        for (i = 0; i < 6; i++) {
+            if (s->block_last_index[i] >= 0)
+                cbp |= 1 << (5 - i);
+        }
+        if ((cbp | motion_x | motion_y) == 0) {
+            /* skip macroblock */
+            put_bits(&s->pb, 1, 1);
+            return;
+        }
+        put_bits(&s->pb, 1, 0);	/* mb coded */
+        cbpc = cbp & 3;
+        put_bits(&s->pb,
+		    inter_MCBPC_bits[cbpc],
+		    inter_MCBPC_code[cbpc]);
+        cbpy = cbp >> 2;
+        cbpy ^= 0xf;
+        put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+
+        /* motion vectors: 16x16 mode only now */
+        h263_pred_motion(s, 0, &pred_x, &pred_y);
       
-      if (!s->umvplus) {  
-         h263_encode_motion(s, motion_x - pred_x);
-         h263_encode_motion(s, motion_y - pred_y);
-      }
-      else {
-         h263p_encode_umotion(s, motion_x - pred_x);
-         h263p_encode_umotion(s, motion_y - pred_y);
-         if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
-            /* To prevent Start Code emulation */
-            put_bits(&s->pb,1,1);
-      }
-   } else {
-	/* compute cbp */
-	cbp = 0;
-	for (i = 0; i < 6; i++) {
-	    if (s->block_last_index[i] >= 1)
-		cbp |= 1 << (5 - i);
-	}
+        if (!s->umvplus) {  
+            h263_encode_motion(s, motion_x - pred_x, s->f_code);
+            h263_encode_motion(s, motion_y - pred_y, s->f_code);
+        }
+        else {
+            h263p_encode_umotion(s, motion_x - pred_x);
+            h263p_encode_umotion(s, motion_y - pred_y);
+            if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
+                /* To prevent Start Code emulation */
+                put_bits(&s->pb,1,1);
+        }
+    } else {
+        int li = s->h263_aic ? 0 : 1;
+        
+        cbp = 0;
+        for(i=0; i<6; i++) {
+            /* Predict DC */
+            if (s->h263_aic && s->mb_intra) {
+                INT16 level = block[i][0];
+            
+                pred_dc = h263_pred_dc(s, i, &dc_ptr[i]);
+                level -= pred_dc;
+                /* Quant */
+                if (level < 0)
+                    level = (level + (s->qscale >> 1))/(s->y_dc_scale);
+                else
+                    level = (level - (s->qscale >> 1))/(s->y_dc_scale);
+                    
+                /* AIC can change CBP */
+                if (level == 0 && s->block_last_index[i] == 0)
+                    s->block_last_index[i] = -1;
+                else if (level < -127)
+                    level = -127;
+                else if (level > 127)
+                    level = 127;
+                
+                block[i][0] = level;
+                /* Reconstruction */ 
+                rec_intradc[i] = (s->y_dc_scale*level) + pred_dc;
+                /* Oddify */
+                rec_intradc[i] |= 1;
+                //if ((rec_intradc[i] % 2) == 0)
+                //    rec_intradc[i]++;
+                /* Clipping */
+                if (rec_intradc[i] < 0)
+                    rec_intradc[i] = 0;
+                else if (rec_intradc[i] > 2047)
+                    rec_intradc[i] = 2047;
+                                
+                /* Update AC/DC tables */
+                *dc_ptr[i] = rec_intradc[i];
+            }
+            /* compute cbp */
+            if (s->block_last_index[i] >= li)
+                cbp |= 1 << (5 - i);
+        }
 
-	cbpc = cbp & 3;
-	if (s->pict_type == I_TYPE) {
-	    put_bits(&s->pb,
-		     intra_MCBPC_bits[cbpc],
-		     intra_MCBPC_code[cbpc]);
-	} else {
-	    put_bits(&s->pb, 1, 0);	/* mb coded */
-	    put_bits(&s->pb,
-		     inter_MCBPC_bits[cbpc + 4],
-		     inter_MCBPC_code[cbpc + 4]);
-	}
-	if (s->h263_pred) {
-	    /* XXX: currently, we do not try to use ac prediction */
-	    put_bits(&s->pb, 1, 0);	/* no ac prediction */
-	}
-	cbpy = cbp >> 2;
-	put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+        cbpc = cbp & 3;
+        if (s->pict_type == I_TYPE) {
+            put_bits(&s->pb,
+                intra_MCBPC_bits[cbpc],
+                intra_MCBPC_code[cbpc]);
+        } else {
+            put_bits(&s->pb, 1, 0);	/* mb coded */
+            put_bits(&s->pb,
+                inter_MCBPC_bits[cbpc + 4],
+                inter_MCBPC_code[cbpc + 4]);
+        }
+        if (s->h263_aic) {
+            /* XXX: currently, we do not try to use ac prediction */
+            put_bits(&s->pb, 1, 0);	/* no AC prediction */
+        }
+        cbpy = cbp >> 2;
+        put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
     }
 
-    /* encode each block */
-    for (i = 0; i < 6; i++) {
+    for(i=0; i<6; i++) {
+        /* encode each block */
         h263_encode_block(s, block[i], i);
+    
+        /* Update INTRADC for decoding */
+        if (s->h263_aic && s->mb_intra) {
+            block[i][0] = rec_intradc[i];
+            
+        }
     }
 }
 
+static int h263_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr)
+{
+    int x, y, wrap, a, c, pred_dc, scale;
+    INT16 *dc_val, *ac_val;
+
+    /* find prediction */
+    if (n < 4) {
+        x = 2 * s->mb_x + 1 + (n & 1);
+        y = 2 * s->mb_y + 1 + ((n & 2) >> 1);
+        wrap = s->mb_width * 2 + 2;
+        dc_val = s->dc_val[0];
+        ac_val = s->ac_val[0][0];
+        scale = s->y_dc_scale;
+    } else {
+        x = s->mb_x + 1;
+        y = s->mb_y + 1;
+        wrap = s->mb_width + 2;
+        dc_val = s->dc_val[n - 4 + 1];
+        ac_val = s->ac_val[n - 4 + 1][0];
+        scale = s->c_dc_scale;
+    }
+    /* B C
+     * A X 
+     */
+    a = dc_val[(x - 1) + (y) * wrap];
+    c = dc_val[(x) + (y - 1) * wrap];
+    
+    /* No prediction outside GOB boundary */
+    if (s->first_slice_line && ((n < 2) || (n > 3)))
+        c = 1024;
+    pred_dc = 1024;
+    /* just DC prediction */
+    if (a != 1024 && c != 1024)
+        pred_dc = (a + c) >> 1;
+    else if (a != 1024)
+        pred_dc = a;
+    else
+        pred_dc = c;
+    
+    /* we assume pred is positive */
+    //pred_dc = (pred_dc + (scale >> 1)) / scale;
+    *dc_val_ptr = &dc_val[x + y * wrap];
+    return pred_dc;
+}
+
+
 void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n)
 {
     int x, y, wrap, a, c, pred_dc, scale, i;
@@ -526,6 +786,9 @@ void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n)
     a = dc_val[(x - 1) + (y) * wrap];
     c = dc_val[(x) + (y - 1) * wrap];
     
+    /* No prediction outside GOB boundary */
+    if (s->first_slice_line && ((n < 2) || (n > 3)))
+        c = 1024;
     pred_dc = 1024;
     if (s->ac_pred) {
         if (s->h263_aic_dir) {
@@ -588,13 +851,46 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block,
 
     mot_val = s->motion_val[xy];
 
-    /* special case for first line */
-    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
-        A = s->motion_val[xy - 1];
-        *px = A[0];
-        *py = A[1];
+    A = s->motion_val[xy - 1];
+    /* special case for first (slice) line */
+    if ((s->mb_y == 0 || s->first_slice_line) && block<3) {
+        // we can't just change some MVs to simulate that, as we need them for the B frames (and ME),
+        // and if we ever support non-rectangular objects then we need to do a few ifs here anyway :(
+        if(block==0){ //most common case
+            if(s->mb_x  == s->resync_mb_x){ //rare
+                *px= *py = 0;
+            }else if(s->mb_x + 1 == s->resync_mb_x){ //rare
+                C = s->motion_val[xy + off[block] - wrap];
+                if(s->mb_x==0){
+                    *px = C[0];
+                    *py = C[1];
+                }else{
+                    *px = mid_pred(A[0], 0, C[0]);
+                    *py = mid_pred(A[1], 0, C[1]);
+                }
+            }else{
+                *px = A[0];
+                *py = A[1];
+            }
+        }else if(block==1){
+            if(s->mb_x + 1 == s->resync_mb_x){ //rare
+                C = s->motion_val[xy + off[block] - wrap];
+                *px = mid_pred(A[0], 0, C[0]);
+                *py = mid_pred(A[1], 0, C[1]);
+            }else{
+                *px = A[0];
+                *py = A[1];
+            }
+        }else{ /* block==2*/
+            B = s->motion_val[xy - wrap];
+            C = s->motion_val[xy + off[block] - wrap];
+            if(s->mb_x == s->resync_mb_x) //rare
+                A[0]=A[1]=0;
+    
+            *px = mid_pred(A[0], B[0], C[0]);
+            *py = mid_pred(A[1], B[1], C[1]);
+        }
     } else {
-        A = s->motion_val[xy - 1];
         B = s->motion_val[xy - wrap];
         C = s->motion_val[xy + off[block] - wrap];
         *px = mid_pred(A[0], B[0], C[0]);
@@ -603,7 +899,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block,
     return mot_val;
 }
 
-static void h263_encode_motion(MpegEncContext * s, int val)
+static void h263_encode_motion(MpegEncContext * s, int val, int f_code)
 {
     int range, l, m, bit_size, sign, code, bits;
 
@@ -612,7 +908,7 @@ static void h263_encode_motion(MpegEncContext * s, int val)
         code = 0;
         put_bits(&s->pb, mvtab[code][1], mvtab[code][0]);
     } else {
-        bit_size = s->f_code - 1;
+        bit_size = f_code - 1;
         range = 1 << bit_size;
         /* modulo encoding */
         l = range * 32;
@@ -624,17 +920,14 @@ static void h263_encode_motion(MpegEncContext * s, int val)
         }
 
         if (val >= 0) {
-            val--;
-            code = (val >> bit_size) + 1;
-            bits = val & (range - 1);
             sign = 0;
         } else {
             val = -val;
-            val--;
-            code = (val >> bit_size) + 1;
-            bits = val & (range - 1);
             sign = 1;
         }
+        val--;
+        code = (val >> bit_size) + 1;
+        bits = val & (range - 1);
 
         put_bits(&s->pb, mvtab[code][1] + 1, (mvtab[code][0] << 1) | sign); 
         if (bit_size > 0) {
@@ -724,11 +1017,11 @@ static void init_mv_penalty_and_fcode(MpegEncContext *s)
     }
 }
 
-static void init_uni_dc_tab()
+static void init_uni_dc_tab(void)
 {
     int level, uni_code, uni_len;
 
-    for(level=-255; level<256; level++){
+    for(level=-256; level<256; level++){
         int size, v, l;
         /* find number of bits */
         size = 0;
@@ -787,23 +1080,42 @@ void h263_encode_init(MpegEncContext *s)
 
         init_rl(&rl_inter);
         init_rl(&rl_intra);
+        init_rl(&rl_intra_aic);
 
         init_mv_penalty_and_fcode(s);
     }
     s->mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p
     
     // use fcodes >1 only for mpeg4 & h263 & h263p FIXME
-    if(s->h263_plus) s->fcode_tab= umv_fcode_tab;
-    else if(s->h263_pred && !s->h263_msmpeg4) s->fcode_tab= fcode_tab;
+    switch(s->codec_id){
+    case CODEC_ID_MPEG4:
+        s->fcode_tab= fcode_tab;
+        s->min_qcoeff= -2048;
+        s->max_qcoeff=  2047;
+        break;
+    case CODEC_ID_H263P:
+        s->fcode_tab= umv_fcode_tab;
+        s->min_qcoeff= -128;
+        s->max_qcoeff=  127;
+        break;
+    default: //nothing needed, default table already set in mpegvideo.c
+        s->min_qcoeff= -128;
+        s->max_qcoeff=  127;
+    }
+
+    /* h263 type bias */
+    //FIXME mpeg4 mpeg quantizer    
+    s->intra_quant_bias=0;
+    s->inter_quant_bias=-(1<<(QUANT_BIAS_SHIFT-2)); //(a - x/4)/x
 }
 
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n)
 {
-    int level, run, last, i, j, last_index, last_non_zero, sign, slevel;
-    int code;
-    RLTable *rl = &rl_inter;
+    int level, run, last, i, j, last_index, last_non_zero, sign, slevel, code;
+    RLTable *rl;
 
-    if (s->mb_intra) {
+    rl = &rl_inter;
+    if (s->mb_intra && !s->h263_aic) {
         /* DC coef */
 	    level = block[0];
         /* 255 cannot be represented, so we clamp */
@@ -823,23 +1135,25 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n)
 	    i = 1;
     } else {
 	    i = 0;
+	    if (s->h263_aic && s->mb_intra)
+	        rl = &rl_intra_aic;
     }
-
+   
     /* AC coefs */
     last_index = s->block_last_index[n];
     last_non_zero = i - 1;
     for (; i <= last_index; i++) {
-	j = zigzag_direct[i];
-	level = block[j];
-	if (level) {
-	    run = i - last_non_zero - 1;
-	    last = (i == last_index);
-	    sign = 0;
-	    slevel = level;
-	    if (level < 0) {
-		sign = 1;
-		level = -level;
-	    }
+        j = zigzag_direct[i];
+        level = block[j];
+        if (level) {
+            run = i - last_non_zero - 1;
+            last = (i == last_index);
+            sign = 0;
+            slevel = level;
+            if (level < 0) {
+                sign = 1;
+                level = -level;
+            }
             code = get_rl_index(rl, last, run, level);
             put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
             if (code == rl->n) {
@@ -849,42 +1163,60 @@ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n)
             } else {
                 put_bits(&s->pb, 1, sign);
             }
-	    last_non_zero = i;
-	}
+	        last_non_zero = i;
+	    }
     }
 }
 
 /***************************************************/
 
-static void mpeg4_stuffing(PutBitContext * pbc)
+void ff_mpeg4_stuffing(PutBitContext * pbc)
 {
     int length;
     put_bits(pbc, 1, 0);
     length= (-get_bit_count(pbc))&7;
-    put_bits(pbc, length, (1<<length)-1);
+    if(length) put_bits(pbc, length, (1<<length)-1);
 }
 
-static void put_string(PutBitContext * pbc, char *s)
-{
-    while(*s){
-        put_bits(pbc, 8, *s);
-        s++;
+/* must be called before writing the header */
+void ff_set_mpeg4_time(MpegEncContext * s, int picture_number){
+    int time_div, time_mod;
+
+    if(s->pict_type==I_TYPE){ //we will encode a vol header
+        s->time_increment_resolution= s->frame_rate/ff_gcd(s->frame_rate, FRAME_RATE_BASE);
+        if(s->time_increment_resolution>=256*256) s->time_increment_resolution= 256*128;
+
+        s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
+    }
+
+    s->time= picture_number*(INT64)FRAME_RATE_BASE*s->time_increment_resolution/s->frame_rate;
+    time_div= s->time/s->time_increment_resolution;
+    time_mod= s->time%s->time_increment_resolution;
+
+    if(s->pict_type==B_TYPE){
+        s->bp_time= s->last_non_b_time - s->time;
+    }else{
+        s->last_time_base= s->time_base;
+        s->time_base= time_div;
+        s->pp_time= s->time - s->last_non_b_time;
+        s->last_non_b_time= s->time;
     }
-    put_bits(pbc, 8, 0);
 }
 
 static void mpeg4_encode_vol_header(MpegEncContext * s)
 {
     int vo_ver_id=1; //must be 2 if we want GMC or q-pel
+    char buf[255];
+
+    s->vo_type= s->has_b_frames ? CORE_VO_TYPE : SIMPLE_VO_TYPE;
 
-    if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb);
     put_bits(&s->pb, 16, 0);
     put_bits(&s->pb, 16, 0x100);        /* video obj */
     put_bits(&s->pb, 16, 0);
     put_bits(&s->pb, 16, 0x120);        /* video obj layer */
 
     put_bits(&s->pb, 1, 0);		/* random access vol */
-    put_bits(&s->pb, 8, 1);		/* video obj type indication= simple obj */
+    put_bits(&s->pb, 8, s->vo_type);	/* video obj type indication */
     put_bits(&s->pb, 1, 1);		/* is obj layer id= yes */
       put_bits(&s->pb, 4, vo_ver_id);	/* is obj layer ver id */
       put_bits(&s->pb, 3, 1);		/* is obj layer priority */
@@ -892,11 +1224,20 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
         put_bits(&s->pb, 4, s->aspect_ratio_info);/* aspect ratio info */
     else
         put_bits(&s->pb, 4, 1);		/* aspect ratio info= sqare pixel */
-    put_bits(&s->pb, 1, 0);		/* vol control parameters= no */
+
+    if(s->low_delay){
+        put_bits(&s->pb, 1, 1);		/* vol control parameters= yes */
+        put_bits(&s->pb, 2, 1);		/* chroma format YUV 420/YV12 */
+        put_bits(&s->pb, 1, s->low_delay);
+        put_bits(&s->pb, 1, 0);		/* vbv parameters= no */
+    }else{
+        put_bits(&s->pb, 1, 0);		/* vol control parameters= no */
+    }
+
     put_bits(&s->pb, 2, RECT_SHAPE);	/* vol shape= rectangle */
     put_bits(&s->pb, 1, 1);		/* marker bit */
-    put_bits(&s->pb, 16, s->time_increment_resolution=30000);
-    s->time_increment_bits = av_log2(s->time_increment_resolution - 1) + 1;
+    
+    put_bits(&s->pb, 16, s->time_increment_resolution);
     if (s->time_increment_bits < 1)
         s->time_increment_bits = 1;
     put_bits(&s->pb, 1, 1);		/* marker bit */
@@ -918,37 +1259,56 @@ static void mpeg4_encode_vol_header(MpegEncContext * s)
     if (vo_ver_id != 1)
         put_bits(&s->pb, 1, s->quarter_sample=0);
     put_bits(&s->pb, 1, 1);		/* complexity estimation disable */
-    put_bits(&s->pb, 1, 1);		/* resync marker disable */
-    put_bits(&s->pb, 1, 0);		/* data partitioned */
+    s->resync_marker= s->rtp_mode;
+    put_bits(&s->pb, 1, s->resync_marker ? 0 : 1);/* resync marker disable */
+    put_bits(&s->pb, 1, s->data_partitioning ? 1 : 0);
+    if(s->data_partitioning){
+        put_bits(&s->pb, 1, 0);		/* no rvlc */
+    }
+
     if (vo_ver_id != 1){
         put_bits(&s->pb, 1, 0);		/* newpred */
         put_bits(&s->pb, 1, 0);		/* reduced res vop */
     }
     put_bits(&s->pb, 1, 0);		/* scalability */
 
-    mpeg4_stuffing(&s->pb);
+    ff_mpeg4_stuffing(&s->pb);
     put_bits(&s->pb, 16, 0);
     put_bits(&s->pb, 16, 0x1B2);	/* user_data */
-    put_string(&s->pb, "ffmpeg"); //FIXME append some version ...
+    sprintf(buf, "FFmpeg%sb%s", FFMPEG_VERSION, LIBAVCODEC_BUILD_STR);
+    put_string(&s->pb, buf);
 
-    s->no_rounding = 0;
+    ff_mpeg4_stuffing(&s->pb);
 }
 
 /* write mpeg4 VOP header */
 void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 {
-    if(s->pict_type==I_TYPE) mpeg4_encode_vol_header(s);
-
-    if(get_bit_count(&s->pb)!=0) mpeg4_stuffing(&s->pb);
+    int time_incr;
+    int time_div, time_mod;
+    
+    if(s->pict_type==I_TYPE){
+        s->no_rounding=0;
+        if(picture_number==0 || !s->strict_std_compliance)
+            mpeg4_encode_vol_header(s);
+    }
+    
+//printf("num:%d rate:%d base:%d\n", s->picture_number, s->frame_rate, FRAME_RATE_BASE);
+    
     put_bits(&s->pb, 16, 0);	        /* vop header */
     put_bits(&s->pb, 16, 0x1B6);	/* vop header */
     put_bits(&s->pb, 2, s->pict_type - 1);	/* pict type: I = 0 , P = 1 */
-    /* XXX: time base + 1 not always correct */
-    put_bits(&s->pb, 1, 1);
+
+    time_div= s->time/s->time_increment_resolution;
+    time_mod= s->time%s->time_increment_resolution;
+    time_incr= time_div - s->last_time_base;
+    while(time_incr--)
+        put_bits(&s->pb, 1, 1);
+        
     put_bits(&s->pb, 1, 0);
 
     put_bits(&s->pb, 1, 1);	/* marker */
-    put_bits(&s->pb, s->time_increment_bits, 1);	/* XXX: correct time increment */
+    put_bits(&s->pb, s->time_increment_bits, time_mod);	/* time increment */
     put_bits(&s->pb, 1, 1);	/* marker */
     put_bits(&s->pb, 1, 1);	/* vop coded */
     if (    s->pict_type == P_TYPE 
@@ -1106,16 +1466,17 @@ static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n,
     }
 }
 
-static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n)
+static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n)
 {
 #if 1
+//    if(level<-255 || level>255) printf("dc overflow\n");
     level+=256;
     if (n < 4) {
 	/* luminance */
-	put_bits(&s->pb, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]);
+	put_bits(s, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]);
     } else {
 	/* chrominance */
-	put_bits(&s->pb, uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]);
+	put_bits(s, uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]);
     }
 #else
     int size, v;
@@ -1146,7 +1507,8 @@ static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n)
 #endif
 }
 
-static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, UINT8 *scan_table)
+static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
+                               UINT8 *scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb)
 {
     int level, run, last, i, j, last_index, last_non_zero, sign, slevel;
     int code;
@@ -1154,7 +1516,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i
 
     if (s->mb_intra) {
 	/* mpeg4 based DC predictor */
-	mpeg4_encode_dc(s, intra_dc, n);
+	mpeg4_encode_dc(dc_pb, intra_dc, n);
 	i = 1;
         rl = &rl_intra;
     } else {
@@ -1178,7 +1540,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i
 		level = -level;
 	    }
             code = get_rl_index(rl, last, run, level);
-            put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+            put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
             if (code == rl->n) {
                 int level1, run1;
                 level1 = level - rl->max_level[last][run];
@@ -1187,7 +1549,7 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i
                 code = get_rl_index(rl, last, run, level1);
                 if (code == rl->n) {
                 esc2:
-                    put_bits(&s->pb, 1, 1);
+                    put_bits(ac_pb, 1, 1);
                     if (level > MAX_LEVEL)
                         goto esc3;
                     run1 = run - rl->max_run[last][level] - 1;
@@ -1197,26 +1559,26 @@ static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int i
                     if (code == rl->n) {
                     esc3:
                         /* third escape */
-                        put_bits(&s->pb, 1, 1);
-                        put_bits(&s->pb, 1, last);
-                        put_bits(&s->pb, 6, run);
-                        put_bits(&s->pb, 1, 1);
-                        put_bits(&s->pb, 12, slevel & 0xfff);
-                        put_bits(&s->pb, 1, 1);
+                        put_bits(ac_pb, 1, 1);
+                        put_bits(ac_pb, 1, last);
+                        put_bits(ac_pb, 6, run);
+                        put_bits(ac_pb, 1, 1);
+                        put_bits(ac_pb, 12, slevel & 0xfff);
+                        put_bits(ac_pb, 1, 1);
                     } else {
                         /* second escape */
-                        put_bits(&s->pb, 1, 0);
-                        put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-                        put_bits(&s->pb, 1, sign);
+                        put_bits(ac_pb, 1, 0);
+                        put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+                        put_bits(ac_pb, 1, sign);
                     }
                 } else {
                     /* first escape */
-                    put_bits(&s->pb, 1, 0);
-                    put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-                    put_bits(&s->pb, 1, sign);
+                    put_bits(ac_pb, 1, 0);
+                    put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+                    put_bits(ac_pb, 1, sign);
                 }
             } else {
-                put_bits(&s->pb, 1, sign);
+                put_bits(ac_pb, 1, sign);
             }
 	    last_non_zero = i;
 	}
@@ -1265,11 +1627,11 @@ void init_rl(RLTable *rl)
             if (run > max_run[level])
                 max_run[level] = run;
         }
-        rl->max_level[last] = malloc(MAX_RUN + 1);
+        rl->max_level[last] = av_malloc(MAX_RUN + 1);
         memcpy(rl->max_level[last], max_level, MAX_RUN + 1);
-        rl->max_run[last] = malloc(MAX_LEVEL + 1);
+        rl->max_run[last] = av_malloc(MAX_LEVEL + 1);
         memcpy(rl->max_run[last], max_run, MAX_LEVEL + 1);
-        rl->index_run[last] = malloc(MAX_RUN + 1);
+        rl->index_run[last] = av_malloc(MAX_RUN + 1);
         memcpy(rl->index_run[last], index_run, MAX_RUN + 1);
     }
 }
@@ -1356,82 +1718,146 @@ static inline void memsetw(short *tab, int val, int n)
         tab[i] = val;
 }
 
-static int mpeg4_resync(MpegEncContext *s)
+void ff_mpeg4_init_partitions(MpegEncContext *s)
+{
+    init_put_bits(&s->tex_pb, s->tex_pb_buffer, PB_BUFFER_SIZE, NULL, NULL);
+    init_put_bits(&s->pb2   , s->pb2_buffer   , PB_BUFFER_SIZE, NULL, NULL);
+}
+
+void ff_mpeg4_merge_partitions(MpegEncContext *s)
+{
+    const int pb2_len   = get_bit_count(&s->pb2   );
+    const int tex_pb_len= get_bit_count(&s->tex_pb);
+    const int bits= get_bit_count(&s->pb);
+
+    if(s->pict_type==I_TYPE){
+        put_bits(&s->pb, 19, DC_MARKER);
+        s->misc_bits+=19 + pb2_len + bits - s->last_bits;
+        s->i_tex_bits+= tex_pb_len;
+    }else{
+        put_bits(&s->pb, 17, MOTION_MARKER);
+        s->misc_bits+=17 + pb2_len;;
+        s->mv_bits+= bits - s->last_bits;
+        s->p_tex_bits+= tex_pb_len;
+    }
+
+    flush_put_bits(&s->pb2);
+    flush_put_bits(&s->tex_pb);
+
+    ff_copy_bits(&s->pb, s->pb2_buffer   , pb2_len);
+    ff_copy_bits(&s->pb, s->tex_pb_buffer, tex_pb_len);
+    s->last_bits= get_bit_count(&s->pb);
+}
+
+void ff_mpeg4_encode_video_packet_header(MpegEncContext *s)
 {
-    int state, v, bits;
     int mb_num_bits= av_log2(s->mb_num - 1) + 1;
-    int header_extension=0, mb_num;
-    int c_wrap, c_xy, l_wrap, l_xy;
-//printf("resync at %d %d\n", s->mb_x, s->mb_y);
-//printf("%X\n", show_bits(&s->gb, 24));
 
-    if( get_bits_count(&s->gb) > s->gb.size*8-32)
-        return 0;
+    ff_mpeg4_stuffing(&s->pb);
+    if(s->pict_type==I_TYPE)
+        put_bits(&s->pb, 16, 0);
+    else if(s->pict_type==B_TYPE)
+        put_bits(&s->pb, MAX(MAX(s->f_code, s->b_code)+15, 17), 0);
+    else /* S/P_TYPE */
+        put_bits(&s->pb, s->f_code+15, 0);
+    put_bits(&s->pb, 1, 1);
+    
+    put_bits(&s->pb, mb_num_bits, s->mb_x + s->mb_y*s->mb_width);
+    put_bits(&s->pb, 5, s->qscale);
+    put_bits(&s->pb, 1, 0); /* no HEC */
+}
 
-    align_get_bits(&s->gb);
-    state = 0xff;
-    for(;;) {
-        v = get_bits(&s->gb, 8);
-//printf("%X ", v);
-        state = ((state << 8) | v) & 0xffff;
-        if (state == 0) break;
-        if( get_bits_count(&s->gb) > s->gb.size*8-32){
-            printf("resync failed\n");
-            return -1;
-        }
+/**
+ * decodes the next video packet header and sets s->next_resync_qscale
+ * returns mb_num of the next packet or <0 if something went wrong
+ */
+static int decode_video_packet_header(MpegEncContext *s, GetBitContext *gb)
+{
+    int bits;
+    int mb_num_bits= av_log2(s->mb_num - 1) + 1;
+    int header_extension=0, mb_num;
+//printf("%X\n", show_bits(&gb, 24));
+//printf("parse_video_packet_header\n");
+//    if(show_aligned_bits(gb, 1, 16) != 0) return -1;
+    
+    /* is there enough space left for a video packet + header */
+    if( get_bits_count(gb) > gb->size*8-20) return -1;
+
+//printf("resync at %d %d\n", s->mb_x, s->mb_y);
+//    skip_bits(gb, 1);
+//    align_get_bits(gb);
+    if(get_bits(gb, 16)!=0){
+        printf("internal error while decoding video packet header\n");
     }
-//printf("%X\n", show_bits(&s->gb, 24));
+
+//printf("%X\n", show_bits(gb, 24));
     bits=0;
-    while(!get_bits1(&s->gb) && bits<30) bits++;
-    if(s->pict_type == P_TYPE && bits != s->f_code-1)
-        printf("marker does not match f_code\n");
-    //FIXME check bits for B-framess
-//printf("%X\n", show_bits(&s->gb, 24));
+    while(!get_bits1(gb) && bits<30) bits++;
+    if((s->pict_type == P_TYPE || s->pict_type == S_TYPE) && bits != s->f_code-1){
+        printf("marker does not match f_code (is: %d should be: %d pos: %d end %d x: %d y: %d)\n", 
+               bits+1, s->f_code, get_bits_count(gb), gb->size*8, s->mb_x, s->mb_y);
+        return -1;
+    }else if(s->pict_type == I_TYPE && bits != 0){
+        printf("marker too long\n");
+        return -1;
+    }else if(s->pict_type == B_TYPE && bits != MAX(MAX(s->f_code, s->b_code)-1, 1)){
+        printf("marker does not match f/b_code\n");
+        return -1;
+    }
+//printf("%X\n", show_bits(gb, 24));
 
     if(s->shape != RECT_SHAPE){
-        header_extension= get_bits1(&s->gb);
+        header_extension= get_bits1(gb);
         //FIXME more stuff here
     }
 
-    mb_num= get_bits(&s->gb, mb_num_bits);
-    if(mb_num != s->mb_x + s->mb_y*s->mb_width){
-        printf("MB-num change not supported %d %d\n", mb_num, s->mb_x + s->mb_y*s->mb_width);
-//        s->mb_x= mb_num % s->mb_width;
-//        s->mb_y= mb_num / s->mb_width;
-        //FIXME many vars are wrong now
-    } 
+    mb_num= get_bits(gb, mb_num_bits);
+    if(mb_num < s->mb_x + s->mb_y*s->mb_width || mb_num>=s->mb_num){
+        fprintf(stderr, "illegal mb_num in video packet (%d %d) \n", mb_num, s->mb_x + s->mb_y*s->mb_width);
+        return -1;
+    }
 
     if(s->shape != BIN_ONLY_SHAPE){
-        s->qscale= get_bits(&s->gb, 5);
-        h263_dc_scale(s);
+        s->next_resync_qscale= get_bits(gb, 5);
+        if(s->next_resync_qscale==0)
+            s->next_resync_qscale= s->qscale;
+        if(s->next_resync_qscale==0){
+            fprintf(stderr, "qscale==0\n");
+            return -1;
+        }
     }
 
     if(s->shape == RECT_SHAPE){
-        header_extension= get_bits1(&s->gb);
+        header_extension= get_bits1(gb);
     }
     if(header_extension){
+        int time_increment;
         int time_incr=0;
-        printf("header extension not really supported\n");
-        while (get_bits1(&s->gb) != 0) 
+        printf("header extension not supported\n");
+        return -1;
+
+        while (get_bits1(gb) != 0) 
             time_incr++;
 
-        check_marker(&s->gb, "before time_increment in video packed header");
-        s->time_increment= get_bits(&s->gb, s->time_increment_bits);
+        check_marker(gb, "before time_increment in video packed header");
+        time_increment= get_bits(gb, s->time_increment_bits);
         if(s->pict_type!=B_TYPE){
+            s->last_time_base= s->time_base;
             s->time_base+= time_incr;
-            s->last_non_b_time[1]= s->last_non_b_time[0];
-            s->last_non_b_time[0]= s->time_base*s->time_increment_resolution + s->time_increment;
+            s->time= s->time_base*s->time_increment_resolution + time_increment;
+            s->pp_time= s->time - s->last_non_b_time;
+            s->last_non_b_time= s->time;
         }else{
-            s->time= (s->last_non_b_time[1]/s->time_increment_resolution + time_incr)*s->time_increment_resolution;
-            s->time+= s->time_increment;
+            s->time= (s->last_time_base + time_incr)*s->time_increment_resolution + time_increment;
+            s->bp_time= s->last_non_b_time - s->time;
         }
-        check_marker(&s->gb, "before vop_coding_type in video packed header");
+        check_marker(gb, "before vop_coding_type in video packed header");
         
-        skip_bits(&s->gb, 2); /* vop coding type */
+        skip_bits(gb, 2); /* vop coding type */
         //FIXME not rect stuff here
 
         if(s->shape != BIN_ONLY_SHAPE){
-            skip_bits(&s->gb, 3); /* intra dc vlc threshold */
+            skip_bits(gb, 3); /* intra dc vlc threshold */
 
             if(s->pict_type == S_TYPE && s->vol_sprite_usage==GMC_SPRITE && s->num_sprite_warping_points){
                 mpeg4_decode_sprite_trajectory(s);
@@ -1440,44 +1866,506 @@ static int mpeg4_resync(MpegEncContext *s)
             //FIXME reduced res stuff here
             
             if (s->pict_type != I_TYPE) {
-                s->f_code = get_bits(&s->gb, 3);	/* fcode_for */
+                s->f_code = get_bits(gb, 3);	/* fcode_for */
                 if(s->f_code==0){
                     printf("Error, video packet header damaged or not MPEG4 header (f_code=0)\n");
                     return -1; // makes no sense to continue, as the MV decoding will break very quickly
                 }
             }
             if (s->pict_type == B_TYPE) {
-                s->b_code = get_bits(&s->gb, 3);
+                s->b_code = get_bits(gb, 3);
             }       
         }
-
     }
     //FIXME new-pred stuff
+    
+//printf("parse ok %d %d %d %d\n", mb_num, s->mb_x + s->mb_y*s->mb_width, get_bits_count(gb), get_bits_count(&s->gb));
+
+    return mb_num;
+}
+
+void ff_mpeg4_clean_buffers(MpegEncContext *s)
+{
+    int c_wrap, c_xy, l_wrap, l_xy;
 
     l_wrap= s->block_wrap[0];
-    l_xy= s->mb_y*l_wrap*2;
+    l_xy= s->mb_y*l_wrap*2 + s->mb_x*2;
     c_wrap= s->block_wrap[4];
-    c_xy= s->mb_y*c_wrap;
+    c_xy= s->mb_y*c_wrap + s->mb_x;
 
     /* clean DC */
-    memsetw(s->dc_val[0] + l_xy, 1024, l_wrap*3);
-    memsetw(s->dc_val[1] + c_xy, 1024, c_wrap*2);
-    memsetw(s->dc_val[2] + c_xy, 1024, c_wrap*2);
+    memsetw(s->dc_val[0] + l_xy, 1024, l_wrap*2+1);
+    memsetw(s->dc_val[1] + c_xy, 1024, c_wrap+1);
+    memsetw(s->dc_val[2] + c_xy, 1024, c_wrap+1);
 
     /* clean AC */
-    memset(s->ac_val[0] + l_xy, 0, l_wrap*3*16*sizeof(INT16));
-    memset(s->ac_val[1] + c_xy, 0, c_wrap*2*16*sizeof(INT16));
-    memset(s->ac_val[2] + c_xy, 0, c_wrap*2*16*sizeof(INT16));
+    memset(s->ac_val[0] + l_xy, 0, (l_wrap*2+1)*16*sizeof(INT16));
+    memset(s->ac_val[1] + c_xy, 0, (c_wrap  +1)*16*sizeof(INT16));
+    memset(s->ac_val[2] + c_xy, 0, (c_wrap  +1)*16*sizeof(INT16));
 
     /* clean MV */
-    memset(s->motion_val + l_xy, 0, l_wrap*3*2*sizeof(INT16));
+    // we can't clear the MVs as they might be needed by a B frame
+//    memset(s->motion_val + l_xy, 0, (l_wrap*2+1)*2*sizeof(INT16));
 //    memset(s->motion_val, 0, 2*sizeof(INT16)*(2 + s->mb_width*2)*(2 + s->mb_height*2));
-    s->resync_x_pos= s->mb_x;
+    s->last_mv[0][0][0]=
+    s->last_mv[0][0][1]=
+    s->last_mv[1][0][0]=
+    s->last_mv[1][0][1]= 0;
+}
+
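+/* searches for the next resync marker, parses its video packet header and sets s->next_resync_gb, s->next_resync_pos and s->mb_num_left; returns 0 on success, -1 if the end of the buffer or a start code is reached first */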
+int ff_mpeg4_resync(MpegEncContext *s)
+{
+    GetBitContext gb;
+    
+    /* search & parse next resync marker */
+    
+    gb= s->next_resync_gb;
+    align_get_bits(&gb);
+//printf("mpeg4_resync %d next:%d \n", get_bits_count(&gb), get_bits_count(&s->next_resync_gb));
+    for(;;) {
+        int v= show_bits(&gb, 24);
+        if( get_bits_count(&gb) >= gb.size*8-24 || v == 1 /* start-code */){
+            s->mb_num_left= s->mb_num - s->mb_x - s->mb_y*s->mb_width;
+//printf("mpeg4_resync end\n");
+            s->gb= s->next_resync_gb; //continue at the next resync marker
+            return -1;
+        }else if(v>>8 == 0){
+            int next;
+            s->next_resync_pos= get_bits_count(&gb);
+            
+            next= decode_video_packet_header(s, &gb);
+            if(next >= 0){
+                s->mb_num_left= next - s->mb_x - s->mb_y*s->mb_width;
+                break;
+            }
+
+            align_get_bits(&gb);
+        }
+        skip_bits(&gb, 8);
+    }
+    s->next_resync_gb=gb;
+    
+    return 0;
+}
+
+static inline void init_block_index(MpegEncContext *s)
+{
+    s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
+    s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1)     + s->mb_x*2;
+    s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2;
+    s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2)     + s->mb_x*2;
+    s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
+    s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
+}
+
+static inline void update_block_index(MpegEncContext *s)
+{
+    s->block_index[0]+=2;
+    s->block_index[1]+=2;
+    s->block_index[2]+=2;
+    s->block_index[3]+=2;
+    s->block_index[4]++;
+    s->block_index[5]++;
+}
+
+/**
+ * decodes the first & second partition
+ * returns error type or 0 if no error
+ */
+int ff_mpeg4_decode_partitions(MpegEncContext *s)
+{
+    static const INT8 quant_tab[4] = { -1, -2, 1, 2 };
+    int mb_num;
+    
+    /* decode first partition */
+    mb_num=0;
     s->first_slice_line=1;
+    s->mb_x= s->resync_mb_x;
+    for(s->mb_y= s->resync_mb_y; mb_num < s->mb_num_left; s->mb_y++){
+        init_block_index(s);
+        for(; mb_num < s->mb_num_left && s->mb_x<s->mb_width; s->mb_x++){
+            const int xy= s->mb_x + s->mb_y*s->mb_width;
+            int cbpc;
+            int dir=0;
+            
+            mb_num++;
+            update_block_index(s);
+            if(s->mb_x == s->resync_mb_x && s->mb_y == s->resync_mb_y+1)
+                s->first_slice_line=0;
+            
+            if(s->mb_x==0) PRINT_MB_TYPE("\n");
+
+            if(s->pict_type==I_TYPE){
+                int i;
+
+                PRINT_MB_TYPE("I");
+                cbpc = get_vlc(&s->gb, &intra_MCBPC_vlc);
+                if (cbpc < 0){
+                    fprintf(stderr, "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                    return DECODING_DESYNC;
+                }
+                s->cbp_table[xy]= cbpc & 3;
+                s->mb_type[xy]= MB_TYPE_INTRA;
+                s->mb_intra = 1;
+
+                if(cbpc & 4) {
+                    s->qscale += quant_tab[get_bits(&s->gb, 2)];
+                    if (s->qscale < 1)
+                        s->qscale = 1;
+                    else if (s->qscale > 31)
+                        s->qscale = 31;
+                    h263_dc_scale(s);
+                }
+                s->qscale_table[xy]= s->qscale;
+
+                s->mbintra_table[xy]= 1;
+                for(i=0; i<6; i++){
+                    int dc_pred_dir;
+                    int dc= mpeg4_decode_dc(s, i, &dc_pred_dir); 
+                    if(dc < 0){
+                        fprintf(stderr, "DC corrupted at %d %d\n", s->mb_x, s->mb_y);
+                        return DECODING_DESYNC;
+                    }
+                    dir<<=1;
+                    if(dc_pred_dir) dir|=1;
+                }
+                s->pred_dir_table[xy]= dir;
+            }else{ /* P/S_TYPE */
+                int mx, my, pred_x, pred_y;
+                INT16 * const mot_val= s->motion_val[s->block_index[0]];
+                const int stride= s->block_wrap[0]*2;
+
+                if(get_bits1(&s->gb)){
+                    /* skip mb */
+                    s->mb_type[xy]= MB_TYPE_SKIPED;
+                    if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE){
+                        const int a= s->sprite_warping_accuracy;
+                        PRINT_MB_TYPE("G");
+                        if(s->divx_version==500 && s->divx_build==413){
+                            mx = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample));
+                            my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample));
+                        }else{
+                            mx = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
+                            my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
+                            s->mb_type[xy]= MB_TYPE_GMC | MB_TYPE_SKIPED;
+                        }
+                    }else{
+                        PRINT_MB_TYPE("S");
+                        mx = 0;
+                        my = 0;
+                    }
+                    mot_val[0       ]= mot_val[2       ]=
+                    mot_val[0+stride]= mot_val[2+stride]= mx;
+                    mot_val[1       ]= mot_val[3       ]=
+                    mot_val[1+stride]= mot_val[3+stride]= my;
+
+                    if(s->mbintra_table[xy])
+                        ff_clean_intra_table_entries(s);
+
+                    continue;
+                }
+                cbpc = get_vlc(&s->gb, &inter_MCBPC_vlc);
+                if (cbpc < 0){
+                    fprintf(stderr, "cbpc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                    return DECODING_DESYNC;
+                }
+                if (cbpc > 20)
+                    cbpc+=3;
+                else if (cbpc == 20)
+                    fprintf(stderr, "Stuffing !");
+                s->cbp_table[xy]= cbpc&(8+3); //8 is dquant
+    
+                s->mb_intra = ((cbpc & 4) != 0);
+        
+                if(s->mb_intra){
+                    PRINT_MB_TYPE("I");
+                    s->mbintra_table[xy]= 1;
+                    s->mb_type[xy]= MB_TYPE_INTRA;
+                    mot_val[0       ]= mot_val[2       ]= 
+                    mot_val[0+stride]= mot_val[2+stride]= 0;
+                    mot_val[1       ]= mot_val[3       ]=
+                    mot_val[1+stride]= mot_val[3+stride]= 0;
+                }else{
+                    if(s->mbintra_table[xy])
+                        ff_clean_intra_table_entries(s);
+
+                    if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE && (cbpc & 16) == 0)
+                        s->mcsel= get_bits1(&s->gb);
+                    else s->mcsel= 0;
+        
+                    if ((cbpc & 16) == 0) {
+                        PRINT_MB_TYPE("P");
+                        /* 16x16 motion prediction */
+                        s->mb_type[xy]= MB_TYPE_INTER;
+
+                        h263_pred_motion(s, 0, &pred_x, &pred_y);
+                        if(!s->mcsel)
+                           mx = h263_decode_motion(s, pred_x, s->f_code);
+                        else {
+                            const int a= s->sprite_warping_accuracy;
+                            if(s->divx_version==500 && s->divx_build==413){
+                                mx = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample));
+                            }else{
+                                mx = RSHIFT(s->sprite_offset[0][0], a-s->quarter_sample);
+                            }
+                        }
+                        if (mx >= 0xffff)
+                            return DECODING_DESYNC;
+            
+                        if(!s->mcsel)
+                           my = h263_decode_motion(s, pred_y, s->f_code);
+                        else{
+                           const int a= s->sprite_warping_accuracy;
+                            if(s->divx_version==500 && s->divx_build==413){
+                                my = s->sprite_offset[0][1] / (1<<(a-s->quarter_sample));
+                            }else{
+                                my = RSHIFT(s->sprite_offset[0][1], a-s->quarter_sample);
+                            }
+                        }
+                        if (my >= 0xffff)
+                            return DECODING_DESYNC;
+                        mot_val[0       ]= mot_val[2       ] =
+                        mot_val[0+stride]= mot_val[2+stride]= mx;
+                        mot_val[1       ]= mot_val[3       ]=
+                        mot_val[1+stride]= mot_val[3+stride]= my;
+                    } else {
+                        int i;
+                        PRINT_MB_TYPE("4");
+                        s->mb_type[xy]= MB_TYPE_INTER4V;
+                        for(i=0;i<4;i++) {
+                            INT16 *mot_val= h263_pred_motion(s, i, &pred_x, &pred_y);
+                            mx = h263_decode_motion(s, pred_x, s->f_code);
+                            if (mx >= 0xffff)
+                                return DECODING_DESYNC;
+                
+                            my = h263_decode_motion(s, pred_y, s->f_code);
+                            if (my >= 0xffff)
+                                return DECODING_DESYNC;
+                            mot_val[0] = mx;
+                            mot_val[1] = my;
+                        }
+                    }
+                }
+            }
+        }
+        s->mb_x= 0;
+    }
+
+    if     (s->pict_type==I_TYPE && get_bits(&s->gb, 19)!=DC_MARKER    ) s->decoding_error= DECODING_DESYNC;
+    else if(s->pict_type!=I_TYPE && get_bits(&s->gb, 17)!=MOTION_MARKER) s->decoding_error= DECODING_DESYNC;
+    if(s->decoding_error== DECODING_DESYNC){
+        fprintf(stderr, "marker missing after first partition at %d %d\n", s->mb_x, s->mb_y);
+        return DECODING_DESYNC;
+    }
+
+    /* decode second partition */
+    mb_num=0;
+    s->mb_x= s->resync_mb_x;
+    for(s->mb_y= s->resync_mb_y; mb_num < s->mb_num_left; s->mb_y++){
+        init_block_index(s);
+        for(; mb_num < s->mb_num_left && s->mb_x<s->mb_width; s->mb_x++){
+            const int xy= s->mb_x + s->mb_y*s->mb_width;
+
+            mb_num++;
+            update_block_index(s);
+            
+            if(s->pict_type==I_TYPE){
+                int ac_pred= get_bits1(&s->gb);
+                int cbpy = get_vlc(&s->gb, &cbpy_vlc);
+                if(cbpy<0){
+                    fprintf(stderr, "cbpy corrupted at %d %d\n", s->mb_x, s->mb_y);
+                    return DECODING_AC_LOST;
+                }
+                
+                s->cbp_table[xy]|= cbpy<<2;
+                s->pred_dir_table[xy]|= ac_pred<<7;
+            }else{ /* P || S_TYPE */
+                if(s->mb_type[xy]&MB_TYPE_INTRA){          
+                    int dir=0,i;
+                    int ac_pred = get_bits1(&s->gb);
+                    int cbpy = get_vlc(&s->gb, &cbpy_vlc);
+
+                    if(cbpy<0){
+                        fprintf(stderr, "I cbpy corrupted at %d %d\n", s->mb_x, s->mb_y);
+                        return DECODING_ACDC_LOST;
+                    }
+                    
+                    if(s->cbp_table[xy] & 8) {
+                        s->qscale += quant_tab[get_bits(&s->gb, 2)];
+                        if (s->qscale < 1)
+                            s->qscale = 1;
+                        else if (s->qscale > 31)
+                            s->qscale = 31;
+                        h263_dc_scale(s);
+                    }
+                    s->qscale_table[xy]= s->qscale;
+
+                    for(i=0; i<6; i++){
+                        int dc_pred_dir;
+                        int dc= mpeg4_decode_dc(s, i, &dc_pred_dir); 
+                        if(dc < 0){
+                            fprintf(stderr, "DC corrupted at %d %d\n", s->mb_x, s->mb_y);
+                            return DECODING_ACDC_LOST;
+                        }
+                        dir<<=1;
+                        if(dc_pred_dir) dir|=1;
+                    }
+                    s->cbp_table[xy]&= 3; //remove dquant
+                    s->cbp_table[xy]|= cbpy<<2;
+                    s->pred_dir_table[xy]= dir | (ac_pred<<7);
+                }else if(s->mb_type[xy]&MB_TYPE_SKIPED){
+                    s->qscale_table[xy]= s->qscale;
+                    s->cbp_table[xy]= 0;
+                }else{
+                    int cbpy = get_vlc(&s->gb, &cbpy_vlc);
+
+                    if(cbpy<0){
+                        fprintf(stderr, "P cbpy corrupted at %d %d\n", s->mb_x, s->mb_y);
+                        return DECODING_ACDC_LOST;
+                    }
+                    
+                    if(s->cbp_table[xy] & 8) {
+//fprintf(stderr, "dquant\n");
+                        s->qscale += quant_tab[get_bits(&s->gb, 2)];
+                        if (s->qscale < 1)
+                            s->qscale = 1;
+                        else if (s->qscale > 31)
+                            s->qscale = 31;
+                        h263_dc_scale(s);
+                    }
+                    s->qscale_table[xy]= s->qscale;
+
+                    s->cbp_table[xy]&= 3; //remove dquant
+                    s->cbp_table[xy]|= (cbpy^0xf)<<2;
+                }
+            }
+        }
+        s->mb_x= 0;
+    }
+    
+
+    return 0;        
+}
+
+static int mpeg4_decode_partitioned_mb(MpegEncContext *s,
+                   DCTELEM block[6][64])
+{ /* decode one macroblock of a data partitioned picture using the per-MB tables; returns 0, or -1 on desync / lost DC in an I-frame */
+    int cbp, mb_type;
+    const int xy= s->mb_x + s->mb_y*s->mb_width; /* index of this macroblock in the per-MB tables */
+    /* at the resync position ff_mpeg4_decode_partitions() is run once and its status is kept in s->decoding_error */
+    if(s->mb_x==s->resync_mb_x && s->mb_y==s->resync_mb_y){ //Note resync_mb_{x,y}==0 at the start
+        int i;
+        int block_index_backup[6];
+        int qscale= s->qscale;
+        /* save the state that the partition pass below modifies */
+        for(i=0; i<6; i++) block_index_backup[i]= s->block_index[i];
+        
+        s->decoding_error= ff_mpeg4_decode_partitions(s);
+        /* restore block indices, macroblock position and qscale to their values from before the partition pass */
+        for(i=0; i<6; i++) s->block_index[i]= block_index_backup[i];
+        s->first_slice_line=1;
+        s->mb_x= s->resync_mb_x;
+        s->mb_y= s->resync_mb_y;
+        s->qscale= qscale;
+        h263_dc_scale(s);
+
+        if(s->decoding_error==DECODING_DESYNC) return -1; /* a desync is the only partition error that is fatal here */
+    }
+    
+    mb_type= s->mb_type[xy];
+    if(s->decoding_error)
+        cbp=0; /* after any error all blocks are treated as not coded */
+    else 
+        cbp = s->cbp_table[xy];
+
+    if(s->decoding_error!=DECODING_ACDC_LOST && s->qscale_table[xy] != s->qscale){ /* switch to the qscale stored for this macroblock */
+        s->qscale= s->qscale_table[xy];
+        h263_dc_scale(s);
+    }
+
+    if (s->pict_type == P_TYPE || s->pict_type==S_TYPE) {
+        int i;
+        for(i=0; i<4; i++){ /* load the stored motion vectors of blocks 0..3 */
+            s->mv[0][i][0] = s->motion_val[ s->block_index[i] ][0];
+            s->mv[0][i][1] = s->motion_val[ s->block_index[i] ][1];
+        }
+        s->mb_intra = mb_type&MB_TYPE_INTRA;
+
+        if (mb_type&MB_TYPE_SKIPED) {
+            /* skip mb */
+            for(i=0;i<6;i++)
+                s->block_last_index[i] = -1; /* no coefficients in any block */
+            s->mv_dir = MV_DIR_FORWARD;
+            s->mv_type = MV_TYPE_16X16;
+            if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE){ /* a skipped MB in a GMC sprite picture sets mcsel and is not flagged as skipped */
+                s->mcsel=1;
+                s->mb_skiped = 0;
+            }else{
+                s->mcsel=0;
+                s->mb_skiped = 1;
+            }
+            return 0;
+        }else if(s->mb_intra && s->decoding_error!=DECODING_ACDC_LOST){ /* intra MB; with DC lost neither branch runs and 0 is returned */
+            s->ac_pred = s->pred_dir_table[xy]>>7; /* bit 7 of pred_dir_table holds the ac_pred flag */
+
+            /* decode each block */
+            for (i = 0; i < 6; i++) {
+                int ret= mpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1);
+                if(ret==DECODING_AC_LOST){
+                    fprintf(stderr, "texture corrupted at %d %d (trying to continue with mc/dc only)\n", s->mb_x, s->mb_y);
+                    s->decoding_error=DECODING_AC_LOST;
+                    cbp=0; /* the remaining blocks are still processed, but as not coded */
+                }else if(ret==DECODING_ACDC_LOST){
+                    fprintf(stderr, "dc corrupted at %d %d (trying to continue with mc only)\n", s->mb_x, s->mb_y);
+                    s->decoding_error=DECODING_ACDC_LOST;
+                    break; /* stop decoding the blocks of this MB; the function still returns 0 */
+                }
+            }
+        }else if(!s->mb_intra){
+//            s->mcsel= 0; //FIXME do we need to init that
+            
+            s->mv_dir = MV_DIR_FORWARD;
+            if (mb_type&MB_TYPE_INTER4V) {
+                s->mv_type = MV_TYPE_8X8;
+            } else {
+                s->mv_type = MV_TYPE_16X16;
+            }
+            if(s->decoding_error==0 && cbp){ /* blocks are decoded only if no error has been recorded so far */
+                /* decode each block */
+                for (i = 0; i < 6; i++) {
+                    int ret= mpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1);
+                    if(ret==DECODING_AC_LOST){
+                        fprintf(stderr, "texture corrupted at %d %d (trying to continue with mc/dc only)\n", s->mb_x, s->mb_y);
+                        s->decoding_error=DECODING_AC_LOST;
+                        break;
+                    }
+                }
+            }
+        }
+    } else { /* I-Frame */
+        int i;
+        s->mb_intra = 1;
+        s->ac_pred = s->pred_dir_table[xy]>>7; /* bit 7 of pred_dir_table holds the ac_pred flag */
+        
+        /* decode each block */
+        for (i = 0; i < 6; i++) {
+            int ret= mpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1);
+            if(ret==DECODING_AC_LOST){
+                fprintf(stderr, "texture corrupted at %d %d (trying to continue with dc only)\n", s->mb_x, s->mb_y);
+                s->decoding_error=DECODING_AC_LOST;
+                cbp=0; /* the remaining blocks are still processed, but as not coded */
+            }else if(ret==DECODING_ACDC_LOST){
+                fprintf(stderr, "dc corrupted at %d %d\n", s->mb_x, s->mb_y);
+                return -1; /* unlike in P/S pictures, a lost DC is reported as failure */
+            }
+        }
+    }
 
     return 0;
 }
 
+
 int h263_decode_mb(MpegEncContext *s,
                    DCTELEM block[6][64])
 {
@@ -1485,27 +2373,17 @@ int h263_decode_mb(MpegEncContext *s,
     INT16 *mot_val;
     static INT8 quant_tab[4] = { -1, -2, 1, 2 };
 
-    if(s->resync_marker){
-        if(   s->resync_x_pos == s->mb_x+1
-           || s->resync_x_pos == s->mb_x){
-            /* f*ck mpeg4
-               this is here so we dont need to slowdown h263_pred_motion with it */
-            if(s->resync_x_pos == s->mb_x+1 && s->mb_x==0){
-                int xy= s->block_index[0] - s->block_wrap[0];
-                s->motion_val[xy][0]= s->motion_val[xy+2][0];
-                s->motion_val[xy][1]= s->motion_val[xy+2][1];
-            }
+    if(s->mb_x==0) PRINT_MB_TYPE("\n")
 
+    if(s->resync_marker){
+        if(s->resync_mb_x == s->mb_x && s->resync_mb_y+1 == s->mb_y){
             s->first_slice_line=0; 
-            s->resync_x_pos=0; // isnt needed but for cleanness sake ;)
-        }
-
-        if(show_aligned_bits(&s->gb, 1, 16) == 0){
-            if( mpeg4_resync(s) < 0 ) return -1;
-            
         }
     }
 
+    if(s->data_partitioning && s->pict_type!=B_TYPE)
+        return mpeg4_decode_partitioned_mb(s, block);
+
     if (s->pict_type == P_TYPE || s->pict_type==S_TYPE) {
         if (get_bits1(&s->gb)) {
             /* skip mb */
@@ -1517,7 +2395,7 @@ int h263_decode_mb(MpegEncContext *s,
             if(s->pict_type==S_TYPE && s->vol_sprite_usage==GMC_SPRITE){
                 const int a= s->sprite_warping_accuracy;
 //                int l = (1 << (s->f_code - 1)) * 32;
-
+                PRINT_MB_TYPE("G");
                 s->mcsel=1;
                 if(s->divx_version==500 && s->divx_build==413){
                     s->mv[0][0][0] = s->sprite_offset[0][0] / (1<<(a-s->quarter_sample));
@@ -1533,6 +2411,7 @@ int h263_decode_mb(MpegEncContext *s,
 
                 s->mb_skiped = 0;
             }else{
+                PRINT_MB_TYPE("S");
                 s->mcsel=0;
                 s->mv[0][0][0] = 0;
                 s->mv[0][0][1] = 0;
@@ -1568,6 +2447,7 @@ int h263_decode_mb(MpegEncContext *s,
         }
         s->mv_dir = MV_DIR_FORWARD;
         if ((cbpc & 16) == 0) {
+            PRINT_MB_TYPE("P");
             /* 16x16 motion prediction */
             s->mv_type = MV_TYPE_16X16;
             h263_pred_motion(s, 0, &pred_x, &pred_y);
@@ -1615,6 +2495,7 @@ int h263_decode_mb(MpegEncContext *s,
                skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
                            
         } else {
+            PRINT_MB_TYPE("4");
             s->mv_type = MV_TYPE_8X8;
             for(i=0;i<4;i++) {
                 mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
@@ -1643,8 +2524,8 @@ int h263_decode_mb(MpegEncContext *s,
         int modb1; // first bit of modb
         int modb2; // second bit of modb
         int mb_type;
-        int time_pp;
-        int time_pb;
+        uint16_t time_pp;
+        uint16_t time_pb;
         int xy;
 
         s->mb_intra = 0; //B-frames never contain intra blocks
@@ -1674,7 +2555,7 @@ int h263_decode_mb(MpegEncContext *s,
 //FIXME is this correct?
 /*            s->last_mv[0][0][0]=
             s->last_mv[0][0][1]=0;*/
-            s->mb_skiped = 1;
+            PRINT_MB_TYPE("s")
             return 0;
         }
 
@@ -1702,14 +2583,14 @@ int h263_decode_mb(MpegEncContext *s,
         mx=my=0; //for case 4, we could put this to the mb_type=4 but than gcc compains about uninitalized mx/my
         switch(mb_type)
         {
-        case 0: 
+        case 0: /* direct */
             mx = h263_decode_motion(s, 0, 1);
             my = h263_decode_motion(s, 0, 1);
-        case 4: 
+        case 4: /* direct with mx=my=0 */
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
             xy= s->block_index[0];
-            time_pp= s->last_non_b_time[0] - s->last_non_b_time[1];
-            time_pb= s->time - s->last_non_b_time[1];
+            time_pp= s->pp_time;
+            time_pb= time_pp - s->bp_time;
 //if(time_pp>3000 )printf("%d %d  ", time_pp, time_pb);
             //FIXME 4MV
             //FIXME avoid divides
@@ -1723,6 +2604,7 @@ int h263_decode_mb(MpegEncContext *s,
             s->mv[0][0][1] = 
             s->mv[1][0][0] = 
             s->mv[1][0][1] = 1000;*/
+            PRINT_MB_TYPE("D");
             break;
         case 1: 
             s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
@@ -1735,6 +2617,7 @@ int h263_decode_mb(MpegEncContext *s,
             my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
             s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
             s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+            PRINT_MB_TYPE("i");
             break;
         case 2: 
             s->mv_dir = MV_DIR_BACKWARD;
@@ -1742,6 +2625,7 @@ int h263_decode_mb(MpegEncContext *s,
             my = h263_decode_motion(s, s->last_mv[1][0][1], s->b_code);
             s->last_mv[1][0][0]= s->mv[1][0][0] = mx;
             s->last_mv[1][0][1]= s->mv[1][0][1] = my;
+            PRINT_MB_TYPE("B");
             break;
         case 3:
             s->mv_dir = MV_DIR_FORWARD;
@@ -1749,8 +2633,11 @@ int h263_decode_mb(MpegEncContext *s,
             my = h263_decode_motion(s, s->last_mv[0][0][1], s->f_code);
             s->last_mv[0][0][0]= s->mv[0][0][0] = mx;
             s->last_mv[0][0][1]= s->mv[0][0][1] = my;
+            PRINT_MB_TYPE("F");
             break;
-        default: return -1;
+        default: 
+            printf("illegal MB_type\n");
+            return -1;
         }
     } else { /* I-Frame */
         cbpc = get_vlc(&s->gb, &intra_MCBPC_vlc);
@@ -1759,6 +2646,7 @@ int h263_decode_mb(MpegEncContext *s,
         dquant = cbpc & 4;
         s->mb_intra = 1;
 intra:
+        PRINT_MB_TYPE("I");
         s->ac_pred = 0;
         if (s->h263_pred || s->h263_aic) {
             s->ac_pred = get_bits1(&s->gb);
@@ -1770,6 +2658,7 @@ intra:
             s->c_dc_scale = 2 * s->qscale;
         }
         cbpy = get_vlc(&s->gb, &cbpy_vlc);
+        if(cbpy<0) return -1;
         cbp = (cbpc & 3) | (cbpy << 2);
         if (dquant) {
             s->qscale += quant_tab[get_bits(&s->gb, 2)];
@@ -1815,7 +2704,7 @@ static int h263_decode_motion(MpegEncContext * s, int pred, int f_code)
     if (sign)
         val = -val;
     val += pred;
-    
+
     /* modulo decoding */
     if (!s->h263_long_vectors) {
         l = (1 << (f_code - 1)) * 32;
@@ -1951,7 +2840,7 @@ not_coded:
     return 0;
 }
 
-static int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
+static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
 {
     int level, pred, code;
     UINT16 *dc_val;
@@ -1960,16 +2849,22 @@ static int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         code = get_vlc(&s->gb, &dc_lum);
     else 
         code = get_vlc(&s->gb, &dc_chrom);
-    if (code < 0)
+    if (code < 0 || code > 9 /* && s->nbit<9 */){
+        fprintf(stderr, "illegal dc vlc\n");
         return -1;
+    }
     if (code == 0) {
         level = 0;
     } else {
         level = get_bits(&s->gb, code);
         if ((level >> (code - 1)) == 0) /* if MSB not set it is negative*/
             level = - (level ^ ((1 << code) - 1));
-        if (code > 8)
-            skip_bits1(&s->gb); /* marker */
+        if (code > 8){
+            if(get_bits1(&s->gb)==0){ /* marker */
+                fprintf(stderr, "dc marker bit missing\n");
+                return -1;
+            }
+        }
     }
 
     pred = mpeg4_pred_dc(s, n, &dc_val, dir_ptr);
@@ -1984,7 +2879,13 @@ static int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
     return level;
 }
 
-static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
+/**
+ * decode a block
+ * returns 0 if everything went ok
+ * returns DECODING_AC_LOST   if an error was detected during AC decoding
+ * returns DECODING_ACDC_LOST if an error was detected during DC decoding
+ */
+static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                               int n, int coded)
 {
     int code, level, i, j, last, run;
@@ -1994,11 +2895,18 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
 
     if (s->mb_intra) {
 	/* DC coef */
-        level = mpeg4_decode_dc(s, n, &dc_pred_dir);
-        if (level < 0)
-            return -1;
+        if(s->data_partitioning && s->pict_type!=B_TYPE){
+            level = s->dc_val[0][ s->block_index[n] ];
+            if(n<4) level= (level + (s->y_dc_scale>>1))/s->y_dc_scale; //FIXME optimizs
+            else    level= (level + (s->c_dc_scale>>1))/s->c_dc_scale;
+            dc_pred_dir= (s->pred_dir_table[s->mb_x + s->mb_y*s->mb_width]<<n)&32;
+        }else{
+            level = mpeg4_decode_dc(s, n, &dc_pred_dir);
+            if (level < 0)
+                return DECODING_ACDC_LOST;
+        }
         block[0] = level;
-	i = 1;
+        i = 1;
         if (!coded) 
             goto not_coded;
         rl = &rl_intra;
@@ -2023,7 +2931,7 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
     for(;;) {
         code = get_vlc(&s->gb, &rl->vlc);
         if (code < 0)
-            return -1;
+            return DECODING_AC_LOST;
         if (code == rl->n) {
             /* escape */
             if (get_bits1(&s->gb) != 0) {
@@ -2031,15 +2939,46 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                     /* third escape */
                     last = get_bits1(&s->gb);
                     run = get_bits(&s->gb, 6);
-                    get_bits1(&s->gb); /* marker */
+                    if(get_bits1(&s->gb)==0){
+                        fprintf(stderr, "1. marker bit missing in 3. esc\n");
+                        return DECODING_AC_LOST;
+                    }
                     level = get_bits(&s->gb, 12);
                     level = (level << 20) >> 20; /* sign extend */
-                    skip_bits1(&s->gb); /* marker */
+                    if(get_bits1(&s->gb)==0){
+                        fprintf(stderr, "2. marker bit missing in 3. esc\n");
+                        return DECODING_AC_LOST;
+                    }
+                    if(level>512 || level<-512){ //FIXME check that QP=1 is ok with this too
+                        fprintf(stderr, "|level| overflow in 3. esc\n");
+                        return DECODING_AC_LOST;
+                    }
+#if 1 
+                    {
+                        const int abs_level= ABS(level);
+                        int run1;
+                        if(abs_level<=MAX_LEVEL && run<=MAX_RUN && s->error_resilience>=0){
+                            if(abs_level <= rl->max_level[last][run]){
+                                fprintf(stderr, "illegal 3. esc, vlc encoding possible\n");
+                                return DECODING_AC_LOST;
+                            }
+                            if(abs_level <= rl->max_level[last][run]*2){
+                                fprintf(stderr, "illegal 3. esc, esc 1 encoding possible\n");
+                                return DECODING_AC_LOST;
+                            }
+                            run1 = run - rl->max_run[last][abs_level] - 1;
+                            if(run1 >= 0 && abs_level <= rl->max_level[last][run1]){
+                                fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n");
+                                return DECODING_AC_LOST;
+                            }
+                        }
+                    }
+#endif
                 } else {
                     /* second escape */
                     code = get_vlc(&s->gb, &rl->vlc);
                     if (code < 0 || code >= rl->n)
-                        return -1;
+                        return DECODING_AC_LOST;
                     run = rl->table_run[code];
                     level = rl->table_level[code];
                     last = code >= rl->last;
@@ -2051,7 +2990,7 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                 /* first escape */
                 code = get_vlc(&s->gb, &rl->vlc);
                 if (code < 0 || code >= rl->n)
-                    return -1;
+                    return DECODING_AC_LOST;
                 run = rl->table_run[code];
                 level = rl->table_level[code];
                 last = code >= rl->last;
@@ -2068,7 +3007,7 @@ static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         }
         i += run;
         if (i >= 64)
-            return -1;
+            return DECODING_AC_LOST;
 	j = scan_table[i];
         block[j] = level;
         i++;
@@ -2091,15 +3030,24 @@ int h263_decode_picture_header(MpegEncContext *s)
 {
     int format, width, height;
 
-    /* picture header */
-    if (get_bits(&s->gb, 22) != 0x20)
+    /* picture start code */
+    if (get_bits(&s->gb, 22) != 0x20) {
+        fprintf(stderr, "Bad picture start code\n");
         return -1;
+    }
+    /* temporal reference */
     s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
-    
-    if (get_bits1(&s->gb) != 1)
-        return -1;	/* marker */
-    if (get_bits1(&s->gb) != 0)
+
+    /* PTYPE starts here */    
+    if (get_bits1(&s->gb) != 1) {
+        /* marker */
+        fprintf(stderr, "Bad marker\n");
+        return -1;
+    }
+    if (get_bits1(&s->gb) != 0) {
+        fprintf(stderr, "Bad H263 id\n");
         return -1;	/* h263 id */
+    }
     skip_bits1(&s->gb);	/* split screen off */
     skip_bits1(&s->gb);	/* camera  off */
     skip_bits1(&s->gb);	/* freeze picture release off */
@@ -2108,6 +3056,12 @@ int h263_decode_picture_header(MpegEncContext *s)
     s->gob_number = 0;
         
     format = get_bits(&s->gb, 3);
+    /*
+        0    forbidden
+        1    sub-QCIF
+        10   QCIF
+        7	extended PTYPE (PLUSPTYPE)
+    */
 
     if (format != 7 && format != 6) {
         s->h263_plus = 0;
@@ -2124,15 +3078,18 @@ int h263_decode_picture_header(MpegEncContext *s)
         s->unrestricted_mv = get_bits1(&s->gb); 
         s->h263_long_vectors = s->unrestricted_mv;
 
-        if (get_bits1(&s->gb) != 0)
+        if (get_bits1(&s->gb) != 0) {
+            fprintf(stderr, "H263 SAC not supported\n");
             return -1;	/* SAC: off */
+        }
         if (get_bits1(&s->gb) != 0) {
             s->mv_type = MV_TYPE_8X8; /* Advanced prediction mode */
         }   
         
-        if (get_bits1(&s->gb) != 0)
+        if (get_bits1(&s->gb) != 0) {
+            fprintf(stderr, "H263 PB frame not supported\n");
             return -1;	/* not PB frame */
-
+        }
         s->qscale = get_bits(&s->gb, 5);
         skip_bits1(&s->gb);	/* Continuous Presence Multipoint mode: off */
     } else {
@@ -2141,10 +3098,12 @@ int h263_decode_picture_header(MpegEncContext *s)
         /* H.263v2 */
         s->h263_plus = 1;
         ufep = get_bits(&s->gb, 3); /* Update Full Extended PTYPE */
-        
+
+        /* ufep other than 0 and 1 are reserved */        
         if (ufep == 1) {
             /* OPPTYPE */       
             format = get_bits(&s->gb, 3);
+            dprintf("ufep=1, format: %d\n", format);
             skip_bits(&s->gb,1); /* Custom PCF */
             s->umvplus_dec = get_bits(&s->gb, 1); /* Unrestricted Motion Vector */
             skip_bits1(&s->gb); /* Syntax-based Arithmetic Coding (SAC) */
@@ -2154,34 +3113,59 @@ int h263_decode_picture_header(MpegEncContext *s)
             if (get_bits1(&s->gb) != 0) { /* Advanced Intra Coding (AIC) */
                 s->h263_aic = 1;
             }
+	    
             skip_bits(&s->gb, 7);
+            /* these are the 7 bits (in order of appearance): */
+            /* Deblocking Filter */
+            /* Slice Structured */
+            /* Reference Picture Selection */
+            /* Independent Segment Decoding */
+            /* Alternative Inter VLC */
+            /* Modified Quantization */
+            /* Prevent start code emulation */
+
             skip_bits(&s->gb, 3); /* Reserved */
-        } else if (ufep != 0)
+        } else if (ufep != 0) {
+            fprintf(stderr, "Bad UFEP type (%d)\n", ufep);
             return -1;
+        }
             
         /* MPPTYPE */
-        s->pict_type = get_bits(&s->gb, 3) + 1;
+        s->pict_type = get_bits(&s->gb, 3) + I_TYPE;
+        dprintf("pict_type: %d\n", s->pict_type);
         if (s->pict_type != I_TYPE &&
             s->pict_type != P_TYPE)
             return -1;
         skip_bits(&s->gb, 2);
         s->no_rounding = get_bits1(&s->gb);
-        //fprintf(stderr, "\nRTYPE: %d", s->no_rounding);
+        dprintf("RTYPE: %d\n", s->no_rounding);
         skip_bits(&s->gb, 4);
         
         /* Get the picture dimensions */
         if (ufep) {
             if (format == 6) {
                 /* Custom Picture Format (CPFMT) */
-                skip_bits(&s->gb, 4); /* aspect ratio */
+                s->aspect_ratio_info = get_bits(&s->gb, 4);
+                dprintf("aspect: %d\n", s->aspect_ratio_info);
+                /* aspect ratios:
+                0 - forbidden
+                1 - 1:1
+                2 - 12:11 (CIF 4:3)
+                3 - 10:11 (525-type 4:3)
+                4 - 16:11 (CIF 16:9)
+                5 - 40:33 (525-type 16:9)
+                6-14 - reserved
+                */
                 width = (get_bits(&s->gb, 9) + 1) * 4;
                 skip_bits1(&s->gb);
                 height = get_bits(&s->gb, 9) * 4;
-#ifdef DEBUG 
-                fprintf(stderr,"\nH.263+ Custom picture: %dx%d\n",width,height);
-#endif            
-            }
-            else {
+                dprintf("\nH.263+ Custom picture: %dx%d\n",width,height);
+                if (s->aspect_ratio_info == EXTENDED_PAR) {
+                    /* extended PAR: skip the two 8-bit width/height fields that follow */
+                    skip_bits(&s->gb, 8); /* width */
+                    skip_bits(&s->gb, 8); /* height */
+                }
+            } else {
                 width = h263_format[format][0];
                 height = h263_format[format][1];
             }
@@ -2210,7 +3194,7 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
     int a= 2<<s->sprite_warping_accuracy;
     int rho= 3-s->sprite_warping_accuracy;
     int r=16/a;
-    int vop_ref[4][2]= {{0,0}, {s->width,0}, {0, s->height}, {s->width, s->height}}; // only true for rectangle shapes
+    const int vop_ref[4][2]= {{0,0}, {s->width,0}, {0, s->height}, {s->width, s->height}}; // only true for rectangle shapes
     int d[4][2]={{0,0}, {0,0}, {0,0}, {0,0}};
     int sprite_ref[4][2];
     int virtual_ref[2][2];
@@ -2276,13 +3260,13 @@ static void mpeg4_decode_sprite_trajectory(MpegEncContext * s)
 // the idea behind this virtual_ref mess is to be able to use shifts later per pixel instead of divides
 // so the distance between points is converted from w&h based to w2&h2 based which are of the 2^x form
     virtual_ref[0][0]= 16*(vop_ref[0][0] + w2) 
-        + RDIV(((w - w2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + w2*(r*sprite_ref[1][0] - 16*vop_ref[1][0])),w);
+        + ROUNDED_DIV(((w - w2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + w2*(r*sprite_ref[1][0] - 16*vop_ref[1][0])),w);
     virtual_ref[0][1]= 16*vop_ref[0][1] 
-        + RDIV(((w - w2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + w2*(r*sprite_ref[1][1] - 16*vop_ref[1][1])),w);
+        + ROUNDED_DIV(((w - w2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + w2*(r*sprite_ref[1][1] - 16*vop_ref[1][1])),w);
     virtual_ref[1][0]= 16*vop_ref[0][0] 
-        + RDIV(((h - h2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + h2*(r*sprite_ref[2][0] - 16*vop_ref[2][0])),h);
+        + ROUNDED_DIV(((h - h2)*(r*sprite_ref[0][0] - 16*vop_ref[0][0]) + h2*(r*sprite_ref[2][0] - 16*vop_ref[2][0])),h);
     virtual_ref[1][1]= 16*(vop_ref[0][1] + h2) 
-        + RDIV(((h - h2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + h2*(r*sprite_ref[2][1] - 16*vop_ref[2][1])),h);
+        + ROUNDED_DIV(((h - h2)*(r*sprite_ref[0][1] - 16*vop_ref[0][1]) + h2*(r*sprite_ref[2][1] - 16*vop_ref[2][1])),h);
 
     switch(s->num_sprite_warping_points)
     {
@@ -2398,6 +3382,7 @@ printf("%d %d\n", s->sprite_delta[1][1][1], a<<s->sprite_shift[1][1]);*/
 int mpeg4_decode_picture_header(MpegEncContext * s)
 {
     int time_incr, startcode, state, v;
+    int time_increment;
 
  redo:
     /* search next start code */
@@ -2412,8 +3397,13 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         }
         state = ((state << 8) | v) & 0xffffff;
         if( get_bits_count(&s->gb) > s->gb.size*8-32){
-            printf("no VOP startcode found\n");
-            return -1;
+            if(s->gb.size>50){
+                printf("no VOP startcode found, frame size was=%d\n", s->gb.size);
+                return -1;
+            }else{
+                printf("frame skip\n");
+                return FRAME_SKIPED;
+            }
         }
     }
 //printf("startcode %X %d\n", startcode, get_bits_count(&s->gb));
@@ -2422,24 +3412,34 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
 
         /* vol header */
         skip_bits(&s->gb, 1); /* random access */
-        skip_bits(&s->gb, 8); /* vo_type */
+        s->vo_type= get_bits(&s->gb, 8);
         if (get_bits1(&s->gb) != 0) { /* is_ol_id */
             vo_ver_id = get_bits(&s->gb, 4); /* vo_ver_id */
             skip_bits(&s->gb, 3); /* vo_priority */
         } else {
             vo_ver_id = 1;
         }
-        
+//printf("vo type:%d\n",s->vo_type);
         s->aspect_ratio_info= get_bits(&s->gb, 4);
-	if(s->aspect_ratio_info == EXTENDET_PAR){
+	if(s->aspect_ratio_info == EXTENDED_PAR){
             skip_bits(&s->gb, 8); //par_width
             skip_bits(&s->gb, 8); // par_height
         }
 
-        if(get_bits1(&s->gb)){ /* vol control parameter */
-            printf("vol control parameter not supported\n");
-            return -1;   
+        if ((s->vol_control_parameters=get_bits1(&s->gb))) { /* vol control parameter */
+            int chroma_format= get_bits(&s->gb, 2);
+            if(chroma_format!=1){
+                printf("illegal chroma format\n");
+            }
+            s->low_delay= get_bits1(&s->gb);
+            if(get_bits1(&s->gb)){ /* vbv parameters */
+                printf("vbv parameters not supported\n");
+                return -1;
+            }
+        }else{
+            s->low_delay=0;
         }
+
         s->shape = get_bits(&s->gb, 2); /* vol shape */
         if(s->shape != RECT_SHAPE) printf("only rectangular vol supported\n");
         if(s->shape == GRAY_SHAPE && vo_ver_id != 1){
@@ -2469,12 +3469,12 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
                 if(width && height){ /* they should be non zero but who knows ... */
                     s->width = width;
                     s->height = height;
-//                    printf("%d %d\n", width, height);
+//                    printf("width/height: %d %d\n", width, height);
                 }
             }
             
             if(get_bits1(&s->gb)) printf("interlaced not supported\n");   /* interlaced */
-            if(!get_bits1(&s->gb)) printf("OBMC not supported\n");   /* OBMC Disable */
+            if(!get_bits1(&s->gb)) printf("OBMC not supported (very likely buggy encoder)\n");   /* OBMC Disable */
             if (vo_ver_id == 1) {
                 s->vol_sprite_usage = get_bits1(&s->gb); /* vol_sprite_usage */
             } else {
@@ -2509,7 +3509,57 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
             }
             
             // FIXME a bunch of grayscale shape things
-            if(get_bits1(&s->gb)) printf("Quant-Type not supported\n");  /* vol_quant_type */ //FIXME
+
+            if(get_bits1(&s->gb)){ /* vol_quant_type */
+                int i, j, v;
+                /* load default matrixes */
+                for(i=0; i<64; i++){
+                    v= ff_mpeg4_default_intra_matrix[i];
+                    s->intra_matrix[i]= v;
+                    s->chroma_intra_matrix[i]= v;
+                    
+                    v= ff_mpeg4_default_non_intra_matrix[i];
+                    s->inter_matrix[i]= v;
+                    s->chroma_inter_matrix[i]= v;
+                }
+
+                /* load custom intra matrix */
+                if(get_bits1(&s->gb)){
+                    for(i=0; i<64; i++){
+                        v= get_bits(&s->gb, 8);
+                        if(v==0) break;
+
+                        j= zigzag_direct[i];
+                        s->intra_matrix[j]= v;
+                        s->chroma_intra_matrix[j]= v;
+                    }
+                }
+
+                /* load custom non intra matrix */
+                if(get_bits1(&s->gb)){
+                    for(i=0; i<64; i++){
+                        v= get_bits(&s->gb, 8);
+                        if(v==0) break;
+
+                        j= zigzag_direct[i];
+                        s->inter_matrix[j]= v;
+                        s->chroma_inter_matrix[j]= v;
+                    }
+
+                    /* replicate last value */
+                    for(; i<64; i++){
+                        j= zigzag_direct[i];
+                        s->inter_matrix[j]= v;
+                        s->chroma_inter_matrix[j]= v;
+                    }
+                }
+
+                s->dct_unquantize= s->dct_unquantize_mpeg2;
+
+                // FIXME a bunch of grayscale shape things
+            }else
+                s->dct_unquantize= s->dct_unquantize_h263;
+
             if(vo_ver_id != 1)
                  s->quarter_sample= get_bits1(&s->gb);
             else s->quarter_sample=0;
@@ -2518,10 +3568,12 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
 
             s->resync_marker= !get_bits1(&s->gb); /* resync_marker_disabled */
 
-            s->data_partioning= get_bits1(&s->gb);
-            if(s->data_partioning){
-                printf("data partitioning not supported\n");
-                skip_bits1(&s->gb); // reversible vlc
+            s->data_partitioning= get_bits1(&s->gb);
+            if(s->data_partitioning){
+                s->rvlc= get_bits1(&s->gb);
+                if(s->rvlc){
+                    printf("reversible vlc not supported\n");
+                }
             }
             
             if(vo_ver_id != 1) {
@@ -2561,16 +3613,20 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         }
         buf[255]=0;
         e=sscanf(buf, "DivX%dBuild%d", &ver, &build);
+        if(e!=2)
+            e=sscanf(buf, "DivX%db%d", &ver, &build);
         if(e==2){
             s->divx_version= ver;
             s->divx_build= build;
             if(s->picture_number==0){
                 printf("This file was encoded with DivX%d Build%d\n", ver, build);
-                if(ver==500 && build==413){ //most likely all version are indeed totally buggy but i dunno for sure ...
+                if(ver==500 && build==413){
                     printf("WARNING: this version of DivX is not MPEG4 compatible, trying to workaround these bugs...\n");
+#if 0
                 }else{
                     printf("hmm, i havnt seen that version of divx yet, lets assume they fixed these bugs ...\n"
                            "using mpeg4 decoder, if it fails contact the developers (of ffmpeg)\n");
+#endif 
                 }
             }
         }
@@ -2580,21 +3636,34 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
         goto redo;
     }
 
-    s->pict_type = get_bits(&s->gb, 2) + 1;	/* pict type: I = 0 , P = 1 */
-//printf("pic: %d, qpel:%d\n", s->pict_type, s->quarter_sample); 
+    s->pict_type = get_bits(&s->gb, 2) + I_TYPE;	/* pict type: I = 0 , P = 1 */
+//if(s->pict_type!=I_TYPE) return FRAME_SKIPED;
+    if(s->pict_type==B_TYPE && s->low_delay && s->vol_control_parameters==0){
+        printf("low_delay flag set, but shouldnt, clearing it\n");
+        s->low_delay=0;
+    }
+// printf("pic: %d, qpel:%d\n", s->pict_type, s->quarter_sample); 
+//printf("%d", s->pict_type);
     time_incr=0;
     while (get_bits1(&s->gb) != 0) 
         time_incr++;
 
     check_marker(&s->gb, "before time_increment");
-    s->time_increment= get_bits(&s->gb, s->time_increment_bits);
+    time_increment= get_bits(&s->gb, s->time_increment_bits);
+//printf(" type:%d incr:%d increment:%d\n", s->pict_type, time_incr, time_increment);
     if(s->pict_type!=B_TYPE){
+        s->last_time_base= s->time_base;
         s->time_base+= time_incr;
-        s->last_non_b_time[1]= s->last_non_b_time[0];
-        s->last_non_b_time[0]= s->time_base*s->time_increment_resolution + s->time_increment;
+        s->time= s->time_base*s->time_increment_resolution + time_increment;
+        s->pp_time= s->time - s->last_non_b_time;
+        s->last_non_b_time= s->time;
     }else{
-        s->time= (s->last_non_b_time[1]/s->time_increment_resolution + time_incr)*s->time_increment_resolution;
-        s->time+= s->time_increment;
+        s->time= (s->last_time_base + time_incr)*s->time_increment_resolution + time_increment;
+        s->bp_time= s->last_non_b_time - s->time;
+        if(s->pp_time <=s->bp_time){
+//            printf("messed up order, seeking?, skiping current b frame\n");
+            return FRAME_SKIPED;
+        }
     }
 
     if(check_marker(&s->gb, "before vop_coded")==0 && s->picture_number==0){
@@ -2674,14 +3743,23 @@ int mpeg4_decode_picture_header(MpegEncContext * s)
              s->b_code = get_bits(&s->gb, 3);
 //printf("b-code %d\n", s->b_code);
          }
-//printf("quant:%d fcode:%d\n", s->qscale, s->f_code);
+//printf("quant:%d fcode:%d bcode:%d type:%d\n", s->qscale, s->f_code, s->b_code, s->pict_type);
          if(!s->scalability){
              if (s->shape!=RECT_SHAPE && s->pict_type!=I_TYPE) {
                  skip_bits1(&s->gb); // vop shape coding type
              }
          }
      }
+     /* detect buggy encoders which don't set the low_delay flag (divx4/xvid/opendivx) */
+     // note: we cannot easily detect divx5 without b-frames (although it is buggy too)
+     if(s->vo_type==0 && s->vol_control_parameters==0 && s->divx_version==0 && s->picture_number==0){
+         printf("looks like this file was encoded with (divx4/(old)xvid/opendivx) -> forcing low_delay flag\n");
+         s->low_delay=1;
+     }
+
      s->picture_number++; // better than pic number==0 allways ;)
+//printf("done\n");
+
      return 0;
 }
 
@@ -2691,22 +3769,29 @@ int intel_h263_decode_picture_header(MpegEncContext *s)
     int format;
 
     /* picture header */
-    if (get_bits(&s->gb, 22) != 0x20)
+    if (get_bits(&s->gb, 22) != 0x20) {
+        fprintf(stderr, "Bad picture start code\n");
         return -1;
-    skip_bits(&s->gb, 8); /* picture timestamp */
+    }
+    s->picture_number = get_bits(&s->gb, 8); /* picture timestamp */
 
-    if (get_bits1(&s->gb) != 1)
+    if (get_bits1(&s->gb) != 1) {
+        fprintf(stderr, "Bad marker\n");
         return -1;	/* marker */
-    if (get_bits1(&s->gb) != 0)
+    }
+    if (get_bits1(&s->gb) != 0) {
+        fprintf(stderr, "Bad H263 id\n");
         return -1;	/* h263 id */
+    }
     skip_bits1(&s->gb);	/* split screen off */
     skip_bits1(&s->gb);	/* camera  off */
     skip_bits1(&s->gb);	/* freeze picture release off */
 
     format = get_bits(&s->gb, 3);
-    if (format != 7)
+    if (format != 7) {
+        fprintf(stderr, "Intel H263 free format not supported\n");
         return -1;
-
+    }
     s->h263_plus = 0;
 
     s->pict_type = I_TYPE + get_bits1(&s->gb);
@@ -2714,12 +3799,18 @@ int intel_h263_decode_picture_header(MpegEncContext *s)
     s->unrestricted_mv = get_bits1(&s->gb); 
     s->h263_long_vectors = s->unrestricted_mv;
 
-    if (get_bits1(&s->gb) != 0)
+    if (get_bits1(&s->gb) != 0) {
+        fprintf(stderr, "SAC not supported\n");
         return -1;	/* SAC: off */
-    if (get_bits1(&s->gb) != 0)
+    }
+    if (get_bits1(&s->gb) != 0) {
+        fprintf(stderr, "Advanced Prediction Mode not supported\n");
         return -1;	/* advanced prediction mode: off */
-    if (get_bits1(&s->gb) != 0)
-        return -1;	/* not PB frame */
+    }
+    if (get_bits1(&s->gb) != 0) {
+        fprintf(stderr, "PB frame mode no supported\n");
+        return -1;	/* PB frame mode */
+    }
 
     /* skip unknown header garbage */
     skip_bits(&s->gb, 41);
diff --git a/src/libffmpeg/libavcodec/h263data.h b/src/libffmpeg/libavcodec/h263data.h
index a129fd6bf..5a7b943ea 100644
--- a/src/libffmpeg/libavcodec/h263data.h
+++ b/src/libffmpeg/libavcodec/h263data.h
@@ -1,11 +1,11 @@
 
 /* intra MCBPC, mb_type = (intra), then (intraq) */
-static const UINT8 intra_MCBPC_code[8] = { 1, 1, 2, 3, 1, 1, 2, 3 };
-static const UINT8 intra_MCBPC_bits[8] = { 1, 3, 3, 3, 4, 6, 6, 6 };
+const UINT8 intra_MCBPC_code[8] = { 1, 1, 2, 3, 1, 1, 2, 3 }; /* NOTE(review): `static` dropped, so this header now defines an external-linkage object; presumably it is included by a single .c file -- confirm */
+const UINT8 intra_MCBPC_bits[8] = { 1, 3, 3, 3, 4, 6, 6, 6 }; /* same linkage change as intra_MCBPC_code */
 
 /* inter MCBPC, mb_type = (inter), (intra), (interq), (intraq), (inter4v) */
 /* Changed the tables for interq and inter4v+q, following the standard ** Juanjo ** */
-static const UINT8 inter_MCBPC_code[25] = { 
+const UINT8 inter_MCBPC_code[25] = { 
     1, 3, 2, 5, 
     3, 4, 3, 3, 
     3, 7, 6, 5,
@@ -14,7 +14,7 @@ static const UINT8 inter_MCBPC_code[25] = {
     1, /* Stuffing */
     2, 12, 14, 15,
 };
-static const UINT8 inter_MCBPC_bits[25] = { 
+const UINT8 inter_MCBPC_bits[25] = { 
     1, 4, 4, 6, 
     5, 8, 8, 7,
     3, 7, 7, 9,
@@ -125,45 +125,73 @@ static RLTable rl_inter = {
     inter_level,
 };
 
-/* table used for Advanced INTRA Coding, just RUN and LEVEL change */
-const INT8 inter_level_aic[102] = {
-  1,  1,  1,  1,  1,  1,  1,  1,
-  1,  3,  2,  1,  2,  2,  4,  5,
-  6,  7,  3,  2,  3,  4,  5,  2,
-  3,  4,  2,  3,  1,  2, 25,  1,
-  2, 24,  8,  2,  7,  4,  6,  1,
-  9, 23,  2,  3,  1, 10, 12, 11,
- 18, 17, 16, 15, 14, 13, 20, 19,
- 22, 21,  1,  1,  1,  1,  1,  1,
-  1,  2,  1,  1,  1,  3,  1,  1,
-  1,  1,  1,  1,  1,  4,  1,  1,
-  1,  1,  2,  2,  6,  5,  2,  2,
-  3,  7,  3,  4,  9,  8,  1,  1,
-  1,  2,  2,  2,  3, 10,
+const UINT16 intra_vlc_aic[103][2] = { /* 103 rows, one more than the 102-entry run/level tables; presumably {code, bit length} pairs like the *_code/*_bits tables above -- TODO confirm */
+{  0x2,  2 }, {  0x6,  3 }, {  0xe,  4 }, {  0xc,  5 }, 
+{  0xd,  5 }, { 0x10,  6 }, { 0x11,  6 }, { 0x12,  6 }, 
+{ 0x16,  7 }, { 0x1b,  8 }, { 0x20,  9 }, { 0x21,  9 }, 
+{ 0x1a,  9 }, { 0x1b,  9 }, { 0x1c,  9 }, { 0x1d,  9 }, 
+{ 0x1e,  9 }, { 0x1f,  9 }, { 0x23, 11 }, { 0x22, 11 }, 
+{ 0x57, 12 }, { 0x56, 12 }, { 0x55, 12 }, { 0x54, 12 }, 
+{ 0x53, 12 }, {  0xf,  4 }, { 0x14,  6 }, { 0x14,  7 }, 
+{ 0x1e,  8 }, {  0xf, 10 }, { 0x21, 11 }, { 0x50, 12 }, 
+{  0xb,  5 }, { 0x15,  7 }, {  0xe, 10 }, {  0x9, 10 }, 
+{ 0x15,  6 }, { 0x1d,  8 }, {  0xd, 10 }, { 0x51, 12 }, 
+{ 0x13,  6 }, { 0x23,  9 }, {  0x7, 11 }, { 0x17,  7 }, 
+{ 0x22,  9 }, { 0x52, 12 }, { 0x1c,  8 }, {  0xc, 10 }, 
+{ 0x1f,  8 }, {  0xb, 10 }, { 0x25,  9 }, {  0xa, 10 }, 
+{ 0x24,  9 }, {  0x6, 11 }, { 0x21, 10 }, { 0x20, 10 }, 
+{  0x8, 10 }, { 0x20, 11 }, {  0x7,  4 }, {  0xc,  6 }, 
+{ 0x10,  7 }, { 0x13,  8 }, { 0x11,  9 }, { 0x12,  9 }, 
+{  0x4, 10 }, { 0x27, 11 }, { 0x26, 11 }, { 0x5f, 12 }, 
+{  0xf,  6 }, { 0x13,  9 }, {  0x5, 10 }, { 0x25, 11 }, 
+{  0xe,  6 }, { 0x14,  9 }, { 0x24, 11 }, {  0xd,  6 }, 
+{  0x6, 10 }, { 0x5e, 12 }, { 0x11,  7 }, {  0x7, 10 }, 
+{ 0x13,  7 }, { 0x5d, 12 }, { 0x12,  7 }, { 0x5c, 12 }, 
+{ 0x14,  8 }, { 0x5b, 12 }, { 0x15,  8 }, { 0x1a,  8 }, 
+{ 0x19,  8 }, { 0x18,  8 }, { 0x17,  8 }, { 0x16,  8 }, 
+{ 0x19,  9 }, { 0x15,  9 }, { 0x16,  9 }, { 0x18,  9 }, 
+{ 0x17,  9 }, {  0x4, 11 }, {  0x5, 11 }, { 0x58, 12 }, 
+{ 0x59, 12 }, { 0x5a, 12 }, {  0x3,  7 },
 };
 
-const INT8 inter_run_aic[102] = {
-  0,  1,  3,  5,  7,  8,  9, 10,
- 11,  4,  9, 13,  0,  1,  1,  1,
-  1,  1,  0,  3,  2,  3,  0,  4,
-  3,  0,  5,  5,  2,  6,  0,  4,
-  7,  0,  0,  8,  0,  2,  0, 12,
-  0,  0,  2,  1,  6,  0,  0,  0,
-  0,  0,  0,  0,  0,  0,  0,  0,
-  0,  0,  0, 14, 20,  1, 19,  2,
-  3,  0,  5,  6,  4,  0,  9, 10,
- 11, 12, 13,  8,  7,  0, 17, 18,
- 16, 15,  2,  1,  0,  0,  4,  3,
-  1,  0,  2,  1,  0,  0, 21, 22,
- 23,  7,  6,  5,  3,  0,
+const INT8 intra_run_aic[102] = { /* run column of rl_intra_aic: values climb over indices 0-57, then restart from 0 at index 58 */
+ 0,  0,  0,  0,  0,  0,  0,  0, 
+ 0,  0,  0,  0,  0,  0,  0,  0, 
+ 0,  0,  0,  0,  0,  0,  0,  0, 
+ 0,  1,  1,  1,  1,  1,  1,  1, 
+ 2,  2,  2,  2,  3,  3,  3,  3, 
+ 4,  4,  4,  5,  5,  5,  6,  6, 
+ 7,  7,  8,  8,  9,  9, 10, 11, 
+12, 13,  0,  0,  0,  0,  0,  0, 
+ 0,  0,  0,  0,  1,  1,  1,  1, 
+ 2,  2,  2,  3,  3,  3,  4,  4, 
+ 5,  5,  6,  6,  7,  7,  8,  9, 
+10, 11, 12, 13, 14, 15, 16, 17, 
+18, 19, 20, 21, 22, 23, 
+};
+
+const INT8 intra_level_aic[102] = { /* level column of rl_intra_aic; presumably paired index-by-index with intra_run_aic -- TODO confirm */
+ 1,  2,  3,  4,  5,  6,  7,  8, 
+ 9, 10, 11, 12, 13, 14, 15, 16, 
+17, 18, 19, 20, 21, 22, 23, 24, 
+25,  1,  2,  3,  4,  5,  6,  7, 
+ 1,  2,  3,  4,  1,  2,  3,  4, 
+ 1,  2,  3,  1,  2,  3,  1,  2, 
+ 1,  2,  1,  2,  1,  2,  1,  1, 
+ 1,  1,  1,  2,  3,  4,  5,  6, 
+ 7,  8,  9, 10,  1,  2,  3,  4, 
+ 1,  2,  3,  1,  2,  3,  1,  2, 
+ 1,  2,  1,  2,  1,  2,  1,  1, 
+ 1,  1,  1,  1,  1,  1,  1,  1, 
+ 1,  1,  1,  1,  1,  1,
 };
 
 static RLTable rl_intra_aic = {
     102,
     58,
-    inter_vlc,
-    inter_run_aic,
-    inter_level_aic,
+    intra_vlc_aic, /* dedicated table replacing the shared inter_vlc */
+    intra_run_aic, /* 102 entries; its values restart from 0 at index 58, matching the two numbers above */
+    intra_level_aic, /* 102 entries, matching the first number above */
 };
 
 static const UINT16 h263_format[8][2] = {
@@ -174,4 +202,3 @@ static const UINT16 h263_format[8][2] = {
     { 704, 576 },
     { 1408, 1152 },
 };
-
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index e909ac56e..3c90a1e47 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -1,53 +1,60 @@
 /*
  * H263 decoder
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-#include "config.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "dsputil.h"
 #include "avcodec.h"
+#include "dsputil.h"
 #include "mpegvideo.h"
-#include "xine-utils/xineutils.h"
 
 //#define DEBUG
+//#define PRINT_FRAME_TIME
+#ifdef PRINT_FRAME_TIME
+static inline long long rdtsc() /* returns the x86 time-stamp counter; only compiled when PRINT_FRAME_TIME is defined */
+{
+	unsigned int lo, hi;
+	asm volatile(	"rdtsc\n\t"
+		: "=a" (lo), "=d" (hi)
+	);
+	/* join the halves by hand: the "=A" constraint is the edx:eax pair only on 32-bit x86 */
+	return (long long)(((unsigned long long)hi << 32) | lo);
+}
+#endif
 
 static int h263_decode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
-    int i;
 
     s->avctx = avctx;
     s->out_format = FMT_H263;
 
     s->width = avctx->width;
     s->height = avctx->height;
+    s->workaround_bugs= avctx->workaround_bugs;
 
     /* select sub codec */
     switch(avctx->codec->id) {
     case CODEC_ID_H263:
         s->gob_number = 0;
-        s->first_gob_line = 0;
+        s->first_slice_line = 0;
         break;
     case CODEC_ID_MPEG4:
         s->time_increment_bits = 4; /* default value for broken headers */
         s->h263_pred = 1;
-        s->has_b_frames = 1;
+        s->has_b_frames = 1; //default, might be overriden in the vol header during header parsing
         break;
     case CODEC_ID_MSMPEG4V1:
         s->h263_msmpeg4 = 1;
@@ -64,23 +71,25 @@ static int h263_decode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->msmpeg4_version=3;
         break;
+    case CODEC_ID_WMV1:
+        s->h263_msmpeg4 = 1;
+        s->h263_pred = 1;
+        s->msmpeg4_version=4;
+        break;
     case CODEC_ID_H263I:
         s->h263_intel = 1;
         break;
     default:
         return -1;
     }
-
+    s->codec_id= avctx->codec->id;
+    avctx->mbskip_table= s->mbskip_table;
+    
     /* for h263, we allocate the images after having read the header */
     if (avctx->codec->id != CODEC_ID_H263 && avctx->codec->id != CODEC_ID_MPEG4)
         if (MPV_common_init(s) < 0)
             return -1;
 
-    /* XXX: suppress this matrix init, only needed because using mpeg1
-       dequantize in mmx case */
-    for(i=0;i<64;i++)
-        s->non_intra_matrix[i] = default_non_intra_matrix[i];
-
     if (s->h263_msmpeg4)
         msmpeg4_decode_init_vlc(s);
     else
@@ -104,25 +113,37 @@ static int h263_decode_frame(AVCodecContext *avctx,
     MpegEncContext *s = avctx->priv_data;
     int ret;
     AVPicture *pict = data; 
-
+#ifdef PRINT_FRAME_TIME
+uint64_t time= rdtsc();
+#endif
 #ifdef DEBUG
     printf("*****frame %d size=%d\n", avctx->frame_number, buf_size);
     printf("bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]);
 #endif
-    
+
+    s->hurry_up= avctx->hurry_up;
+    s->error_resilience= avctx->error_resilience;
+    s->workaround_bugs= avctx->workaround_bugs;
+    s->flags= avctx->flags;
+
     /* no supplementary picture */
     if (buf_size == 0) {
         *data_size = 0;
         return 0;
     }
 
-    init_get_bits(&s->gb, buf, buf_size);
+    if(s->bitstream_buffer_size && buf_size<20){ //divx 5.01+ frame reorder
+        init_get_bits(&s->gb, s->bitstream_buffer, s->bitstream_buffer_size);
+    }else
+        init_get_bits(&s->gb, buf, buf_size);
+    s->bitstream_buffer_size=0;
 
     /* let's go :-) */
     if (s->h263_msmpeg4) {
         ret = msmpeg4_decode_picture_header(s);
     } else if (s->h263_pred) {
         ret = mpeg4_decode_picture_header(s);
+        s->has_b_frames= !s->low_delay;
     } else if (s->h263_intel) {
         ret = intel_h263_decode_picture_header(s);
     } else {
@@ -146,8 +167,21 @@ static int h263_decode_frame(AVCodecContext *avctx,
             return -1;
     }
 
+    if(ret==FRAME_SKIPED) return 0;
+    /* skip if the header was thrashed */
     if (ret < 0)
         return -1;
+    /* skip b frames if we dont have reference frames */
+    if(s->num_available_buffers<2 && s->pict_type==B_TYPE) return 0;
+    /* skip b frames if we are in a hurry */
+    if(s->hurry_up && s->pict_type==B_TYPE) return 0;
+    
+    if(s->next_p_frame_damaged){
+        if(s->pict_type==B_TYPE)
+            return 0;
+        else
+            s->next_p_frame_damaged=0;
+    }
 
     MPV_frame_start(s);
 
@@ -155,6 +189,12 @@ static int h263_decode_frame(AVCodecContext *avctx,
     printf("qscale=%d\n", s->qscale);
 #endif
 
+    /* init resync/ error resilience specific variables */
+    s->next_resync_qscale= s->qscale;
+    s->next_resync_gb= s->gb;
+    if(s->resync_marker) s->mb_num_left= 0;
+    else                 s->mb_num_left= s->mb_num;
+
     /* decode each macroblock */
     s->block_wrap[0]=
     s->block_wrap[1]=
@@ -167,7 +207,13 @@ static int h263_decode_frame(AVCodecContext *avctx,
         /* FIXME: In the future H.263+ will have intra prediction */
         /* and we are gonna need another way to detect MPEG4      */
         if (s->mb_y && !s->h263_pred) {
-            s->first_gob_line = h263_decode_gob_header(s);
+            s->first_slice_line = h263_decode_gob_header(s);
+        }
+        
+        if(s->msmpeg4_version==1){
+            s->last_dc[0]=
+            s->last_dc[1]=
+            s->last_dc[2]= 128;
         }
 
         s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1;
@@ -186,35 +232,95 @@ static int h263_decode_frame(AVCodecContext *avctx,
 #ifdef DEBUG
             printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 #endif
+
+            if(s->resync_marker){
+                if(s->mb_num_left<=0){
+                    /* except the first block */
+                    if(s->mb_x!=0 || s->mb_y!=0){
+                        /* did we miss the next resync marker without noticing an error yet */
+                        if(((get_bits_count(&s->gb)+8)&(~7)) != s->next_resync_pos && s->decoding_error==0){
+                            fprintf(stderr, "slice end missmatch x:%d y:%d %d %d\n", 
+                                    s->mb_x, s->mb_y, get_bits_count(&s->gb), s->next_resync_pos);
+                            ff_conceal_past_errors(s, 1);
+                        }
+                    }
+                    s->qscale= s->next_resync_qscale;
+                    s->gb= s->next_resync_gb;
+                    s->resync_mb_x= s->mb_x; //we know that the marker is here cuz mb_num_left was the distance to it
+                    s->resync_mb_y= s->mb_y;
+                    s->first_slice_line=1;
+
+                    if(s->codec_id==CODEC_ID_MPEG4){
+                        ff_mpeg4_clean_buffers(s);
+                        ff_mpeg4_resync(s);
+                    }
+                }
+
+                if(   s->resync_mb_x==s->mb_x 
+                   && s->resync_mb_y==s->mb_y && s->decoding_error!=0){
+                    fprintf(stderr, "resynced at %d %d\n", s->mb_x, s->mb_y);
+                    s->decoding_error= 0;
+                }
+            }
+
             //fprintf(stderr,"\nFrame: %d\tMB: %d",avctx->frame_number, (s->mb_y * s->mb_width) + s->mb_x);
             /* DCT & quantize */
-            if (s->h263_msmpeg4) {
-                msmpeg4_dc_scale(s);
-            } else if (s->h263_pred) {
-                h263_dc_scale(s);
+            if (s->h263_pred && !(s->msmpeg4_version==1 || s->msmpeg4_version==2)) {
+                /* old ffmpeg encoded msmpeg4v3 workaround */
+                if(s->workaround_bugs==1 && s->msmpeg4_version==3) 
+                    ff_old_msmpeg4_dc_scale(s);
+                else
+                    h263_dc_scale(s);                
             } else {
                 /* default quantization values */
                 s->y_dc_scale = 8;
                 s->c_dc_scale = 8;
             }
-            clear_blocks(s->block[0]);
+
+            if(s->decoding_error!=DECODING_DESYNC){
+                int last_error= s->decoding_error;
+                clear_blocks(s->block[0]);
             
-            s->mv_dir = MV_DIR_FORWARD;
-            s->mv_type = MV_TYPE_16X16; 
-            if (s->h263_msmpeg4) {
-		if (msmpeg4_decode_mb(s, s->block) < 0) {
-		    fprintf(stderr,"\nError at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x);
-                    return -1;
-		}
-            } else {
-                if (h263_decode_mb(s, s->block) < 0) {
-                    fprintf(stderr,"\nError at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x);
-                    return -1;
+                s->mv_dir = MV_DIR_FORWARD;
+                s->mv_type = MV_TYPE_16X16;
+                if (s->h263_msmpeg4) {
+                    if (msmpeg4_decode_mb(s, s->block) < 0) {
+                        fprintf(stderr,"Error at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x);
+                        s->decoding_error=DECODING_DESYNC;
+                    }
+                } else {
+                    if (h263_decode_mb(s, s->block) < 0) {
+                        fprintf(stderr,"Error at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x);
+                        s->decoding_error=DECODING_DESYNC;
+                    }
+                }
+
+                if(s->decoding_error!=last_error){
+                    ff_conceal_past_errors(s, 0);
                 }
             }
+
+            /* conceal errors */
+            if(    s->decoding_error==DECODING_DESYNC
+               || (s->decoding_error==DECODING_ACDC_LOST && s->mb_intra)){
+                s->mv_dir = MV_DIR_FORWARD;
+                s->mv_type = MV_TYPE_16X16;
+                s->mb_skiped=0;
+                s->mb_intra=0;
+                s->mv[0][0][0]=0; //FIXME this is not optimal 
+                s->mv[0][0][1]=0;
+                clear_blocks(s->block[0]);
+            }else if(s->decoding_error && !s->mb_intra){
+                clear_blocks(s->block[0]);
+            }
+            //FIXME remove AC for intra
+                        
             MPV_decode_mb(s, s->block);
+
+            s->mb_num_left--;            
         }
-        if (avctx->draw_horiz_band) {
+        if (    avctx->draw_horiz_band 
+            && (s->num_available_buffers>=1 || (!s->has_b_frames)) ) {
             UINT8 *src_ptr[3];
             int y, h, offset;
             y = s->mb_y * 16;
@@ -236,11 +342,84 @@ static int h263_decode_frame(AVCodecContext *avctx,
         }
     }
     
-    if (s->h263_msmpeg4 && s->pict_type==I_TYPE)
+    if (s->h263_msmpeg4 && s->msmpeg4_version<4 && s->pict_type==I_TYPE)
         if(msmpeg4_decode_ext_header(s, buf_size) < 0) return -1;
+    
+    /* divx 5.01+ bistream reorder stuff */
+    if(s->codec_id==CODEC_ID_MPEG4 && s->bitstream_buffer_size==0){
+        int current_pos= get_bits_count(&s->gb)>>3;
 
+        if(   buf_size - current_pos > 5 
+           && buf_size - current_pos < BITSTREAM_BUFFER_SIZE){ /* leftover is bounded before the memcpy below */
+            int i;
+            int startcode_found=0;
+            for(i=current_pos; i<buf_size-3; i++){ /* stop 3 bytes early: the test below reads buf[i+3] */
+                if(buf[i]==0 && buf[i+1]==0 && buf[i+2]==1 && buf[i+3]==0xB6){ /* byte pattern 00 00 01 B6 */
+                    startcode_found=1;
+                    break;
+                }
+            }
+            if(startcode_found){ /* stash the unread tail; a later call with a tiny buf parses it instead */
+                memcpy(s->bitstream_buffer, buf + current_pos, buf_size - current_pos);
+                s->bitstream_buffer_size= buf_size - current_pos;
+            }
+        }
+    }
+
+    if(s->bitstream_buffer_size==0 && s->error_resilience>0){
+        int left= s->gb.size*8 - get_bits_count(&s->gb);
+        int max_extra=8;
+        
+        if(s->codec_id==CODEC_ID_MPEG4) max_extra+=32;
+
+        if(left>max_extra){
+            fprintf(stderr, "discarding %d junk bits at end, next would be %X\n", left, show_bits(&s->gb, 24));
+            if(s->decoding_error==0)
+                ff_conceal_past_errors(s, 1);
+        }
+        if(left<0){
+            fprintf(stderr, "overreading %d bits\n", -left);
+            if(s->decoding_error==0)
+                ff_conceal_past_errors(s, 1);
+        }
+    }
+  
     MPV_frame_end(s);
-    
+#if 0 //dirty show MVs, we should export the MV tables and write a filter to show them
+{
+  int mb_y;
+  s->has_b_frames=1;
+  for(mb_y=0; mb_y<s->mb_height; mb_y++){
+    int mb_x;
+    int y= mb_y*16 + 8;
+    for(mb_x=0; mb_x<s->mb_width; mb_x++){
+      int x= mb_x*16 + 8;
+      uint8_t *ptr= s->last_picture[0];
+      int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2);
+      int mx= (s->motion_val[xy][0]>>1) + x;
+      int my= (s->motion_val[xy][1]>>1) + y;
+      int i;
+      int max;
+
+      if(mx<0) mx=0;
+      if(my<0) my=0;
+      if(mx>=s->width)  mx= s->width -1;
+      if(my>=s->height) my= s->height-1;
+      max= ABS(mx-x);
+      if(ABS(my-y) > max) max= ABS(my-y);
+      /* the ugliest linedrawing routine ... */
+      for(i=0; i<max; i++){
+        int x1= x + (mx-x)*i/max;
+        int y1= y + (my-y)*i/max;
+        ptr[y1*s->linesize + x1]+=100;
+      }
+      ptr[y*s->linesize + x]+=100;
+      s->mbskip_table[mb_x + mb_y*s->mb_width]=0;
+    }
+  }
+
+}
+#endif    
     if(s->pict_type==B_TYPE || (!s->has_b_frames)){
         pict->data[0] = s->current_picture[0];
         pict->data[1] = s->current_picture[1];
@@ -260,7 +439,13 @@ static int h263_decode_frame(AVCodecContext *avctx,
     /* we substract 1 because it is added on utils.c    */
     avctx->frame_number = s->picture_number - 1;
 
-    *data_size = sizeof(AVPicture);
+    /* don't output the last picture after seeking;
+       note we already added +1 for the current picture in MPV_frame_end(s) */
+    if(s->num_available_buffers>=2 || (!s->has_b_frames))
+        *data_size = sizeof(AVPicture);
+#ifdef PRINT_FRAME_TIME
+printf("%Ld\n", rdtsc()-time);
+#endif
     return buf_size;
 }
 
@@ -324,6 +509,18 @@ AVCodec msmpeg4v3_decoder = {
     CODEC_CAP_DRAW_HORIZ_BAND,
 };
 
+AVCodec wmv1_decoder = { /* codec table entry named "wmv1" (CODEC_ID_WMV1); it reuses the h263_decode_* entry points */
+    "wmv1",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_WMV1,
+    sizeof(MpegEncContext), /* presumably the per-codec private context size -- confirm against the AVCodec declaration */
+    h263_decode_init,
+    NULL, /* NOTE(review): presumably the encode slot, unused by a decoder -- confirm against the AVCodec declaration */
+    h263_decode_end,
+    h263_decode_frame,
+    CODEC_CAP_DRAW_HORIZ_BAND,
+};
+
 AVCodec h263i_decoder = {
     "h263i",
     CODEC_TYPE_VIDEO,
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 2c71850ee..b8eaa5fbd 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -1,25 +1,24 @@
 /*
  * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Gerard Lantau.
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  */
 
-#include "xine-utils/xineutils.h"
 #include "../dsputil.h"
 #include "../simple_idct.h"
 
@@ -45,38 +44,124 @@ int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 
-
 /* external functions, from idct_mmx.c */
 void ff_mmx_idct(DCTELEM *block);
 void ff_mmxext_idct(DCTELEM *block);
 
 /* pixel operations */
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
-static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
-//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
-//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
+static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 0x01 in each of the 8 bytes */
+static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 0x0001 in each of the 4 16-bit words */
+static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 0x0002 in each of the 4 16-bit words */
 
 #define JUMPALIGN() __asm __volatile (".balign 8"::)
 #define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
 
+#define MOVQ_WONE(regd) /* set every 16-bit word of MMX register regd to 0x0001 without loading from memory */ \
+    __asm __volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" /* regd compared with itself -> all bits set */ \
+    "psrlw $15, %%" #regd ::) /* logical shift keeps only the top bit of each word -> 0x0001 */
+
+#define MOVQ_BFE(regd) /* set every byte of MMX register regd to 0xfe without loading from memory */ \
+    __asm __volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" /* regd compared with itself -> 0xff in every byte */ \
+    "paddb %%" #regd ", %%" #regd " \n\t" ::) /* 0xff + 0xff wraps to 0xfe in every byte */
+
 #ifndef PIC
-#define MOVQ_WONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
+#define MOVQ_BONE(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) /* non-PIC: load 0x01-per-byte from the mm_bone constant */
 #define MOVQ_WTWO(regd)  __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
 #else
 // for shared library it's better to use this way for accessing constants
 // pcmpeqd -> -1
-#define MOVQ_WONE(regd) \
+#define MOVQ_BONE(regd) /* PIC: build 0x01 in every byte of regd using register-only operations */ \
     __asm __volatile ( \
-       "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-       "psrlw $15, %%" #regd ::)
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" /* all bits set */ \
+    "psrlw $15, %%" #regd " \n\t" /* 0x0001 in every word */ \
+    "packuswb %%" #regd ", %%" #regd " \n\t" ::) /* pack the words to bytes -> 0x01 in every byte */
 
 #define MOVQ_WTWO(regd) \
     __asm __volatile ( \
-       "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-       "psrlw $15, %%" #regd " \n\t" \
-       "psllw $1, %%" #regd ::)
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" /* all bits set */ \
+    "psrlw $15, %%" #regd " \n\t" /* 0x0001 in every word */ \
+    "psllw $1, %%" #regd " \n\t"::) /* shift left by one -> 0x0002 in every word */
+
 #endif
 
+// regr = per-byte (rega + regb) >> 1 rounding down, computed as (a & b) + (((a ^ b) & regfe) >> 1); regr is also the temporary
+// first argument (rega) is unmodified and second (regb) is trashed
+// regfe is supposed to contain 0xfefefefefefefefe (clears each byte's low bit so psrlq cannot shift it into the next byte)
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
+    "movq " #rega ", " #regr "	\n\t"\
+    "pand " #regb ", " #regr "	\n\t"\
+    "pxor " #rega ", " #regb "	\n\t"\
+    "pand " #regfe "," #regb "	\n\t"\
+    "psrlq $1, " #regb " 	\n\t"\
+    "paddb " #regb ", " #regr "	\n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe) /* regr = per-byte (rega + regb + 1) >> 1, computed as (a | b) - (((a ^ b) & regfe) >> 1); rega is kept, regb is trashed, regfe must hold 0xfe in every byte */ \
+    "movq " #rega ", " #regr "	\n\t"\
+    "por  " #regb ", " #regr "	\n\t"\
+    "pxor " #rega ", " #regb "	\n\t"\
+    "pand " #regfe "," #regb "	\n\t"\
+    "psrlq $1, " #regb "	\n\t"\
+    "psubb " #regb ", " #regr "	\n\t"
+
+// two round-down byte averages interleaved: regr = (rega + regb) >> 1 and regp = (regc + regd) >> 1; regb and regd are trashed; mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
+    "movq " #rega ", " #regr "	\n\t"\
+    "movq " #regc ", " #regp "	\n\t"\
+    "pand " #regb ", " #regr "	\n\t"\
+    "pand " #regd ", " #regp "	\n\t"\
+    "pxor " #rega ", " #regb "	\n\t"\
+    "pxor " #regc ", " #regd "	\n\t"\
+    "pand %%mm6, " #regb "	\n\t"\
+    "pand %%mm6, " #regd "	\n\t"\
+    "psrlq $1, " #regb " 	\n\t"\
+    "psrlq $1, " #regd " 	\n\t"\
+    "paddb " #regb ", " #regr "	\n\t"\
+    "paddb " #regd ", " #regp "	\n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) /* two round-up byte averages interleaved: regr = (rega + regb + 1) >> 1 and regp = (regc + regd + 1) >> 1; regb and regd are trashed; the mask is read from mm6, which must hold 0xfe in every byte */ \
+    "movq " #rega ", " #regr "	\n\t"\
+    "movq " #regc ", " #regp "	\n\t"\
+    "por  " #regb ", " #regr "	\n\t"\
+    "por  " #regd ", " #regp "	\n\t"\
+    "pxor " #rega ", " #regb "	\n\t"\
+    "pxor " #regc ", " #regd "	\n\t"\
+    "pand %%mm6, " #regb "     	\n\t"\
+    "pand %%mm6, " #regd "     	\n\t"\
+    "psrlq $1, " #regd "	\n\t"\
+    "psrlq $1, " #regb "	\n\t"\
+    "psubb " #regb ", " #regr "	\n\t"\
+    "psubb " #regd ", " #regp "	\n\t"
+
+/***********************************/
+/* MMX no rounding: include dsputil_mmx_rnd.h with the round-down averaging macros selected */
+#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx /* token pasting, e.g. DEF(put, pixels_x2) -> put_no_rnd_pixels_x2_mmx */
+#define SET_RND  MOVQ_WONE /* 0x0001-per-word loader; how it is used is decided inside dsputil_mmx_rnd.h (not visible here) */
+#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)		PAVGB_MMX_NO_RND(a, b, c, e)
+
+#include "dsputil_mmx_rnd.h"
+
+#undef DEF /* drop the four template parameters so they can be redefined */
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+/***********************************/
+/* MMX rounding: include dsputil_mmx_rnd.h again, this time with the round-up averaging macros selected */
+
+#define DEF(x, y) x ## _ ## y ##_mmx /* token pasting, e.g. DEF(put, pixels_x2) -> put_pixels_x2_mmx */
+#define SET_RND  MOVQ_WTWO /* 0x0002-per-word loader; how it is used is decided inside dsputil_mmx_rnd.h (not visible here) */
+#define PAVGBP(a, b, c, d, e, f)	PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)		PAVGB_MMX(a, b, c, e)
+
+#include "dsputil_mmx_rnd.h"
+
+#undef DEF /* drop the four template parameters so later sections can redefine them */
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+
 /***********************************/
 /* 3Dnow specific */
 
@@ -92,7 +177,7 @@ static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x000
 /***********************************/
 /* MMX2 specific */
 
-#define DEF(x) x ## _sse
+#define DEF(x) x ## _mmx2
 
 /* Introduced only in MMX2 set */
 #define PAVGB "pavgb"
@@ -107,34 +192,59 @@ static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x000
 
 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
 {
-    DCTELEM *p;
-    const UINT8 *pix;
-    int i;
+    asm volatile( /* zero-extend 8 rows of 8 bytes from pixels (row pitch line_size) into 128 bytes of 16-bit words at block */
+        "movl $-128, %%eax	\n\t" /* eax = byte offset relative to block+64, counts up from -128 to 0 */
+        "pxor %%mm7, %%mm7	\n\t" /* mm7 = 0, the high halves for the unpacks below */
+        ".balign 16		\n\t"
+        "1:			\n\t"
+        "movq (%0), %%mm0	\n\t" /* current row */
+        "movq (%0, %2), %%mm2	\n\t" /* next row */
+        "movq %%mm0, %%mm1	\n\t"
+        "movq %%mm2, %%mm3	\n\t"
+        "punpcklbw %%mm7, %%mm0	\n\t" /* low 4 bytes -> 4 words */
+        "punpckhbw %%mm7, %%mm1	\n\t" /* high 4 bytes -> 4 words */
+        "punpcklbw %%mm7, %%mm2	\n\t"
+        "punpckhbw %%mm7, %%mm3	\n\t"
+        "movq %%mm0, (%1, %%eax)\n\t"
+        "movq %%mm1, 8(%1, %%eax)\n\t"
+        "movq %%mm2, 16(%1, %%eax)\n\t"
+        "movq %%mm3, 24(%1, %%eax)\n\t"
+        "addl %3, %0		\n\t" /* advance the source by two rows */
+        "addl $32, %%eax	\n\t" /* 32 output bytes were written this iteration */
+        "js 1b			\n\t" /* repeat while the offset is still negative (4 iterations) */
+        : "+r" (pixels)
+        : "r" (block+64), "r" (line_size), "r" (line_size*2)
+        : "%eax", "memory" /* block is written through a plain "r" input, so the compiler must be told memory changes */
+    );
+}
 
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    for(i=0;i<4;i++) {
-	__asm __volatile(
-		"movq	%1, %%mm0\n\t"
-		"movq	%2, %%mm1\n\t"
-		"movq	%%mm0, %%mm2\n\t"
-		"movq	%%mm1, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm2\n\t"
-		"punpcklbw %%mm7, %%mm1\n\t"
-		"punpckhbw %%mm7, %%mm3\n\t"
-		"movq	%%mm0, %0\n\t"
-		"movq	%%mm2, 8%0\n\t"
-		"movq	%%mm1, 16%0\n\t"
-		"movq	%%mm3, 24%0\n\t"
-		:"=m"(*p)
-		:"m"(*pix), "m"(*(pix+line_size))
-		:"memory");
-        pix += line_size*2;
-        p += 16;
-    }
+static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
+{ /* block = s1 - s2 as 16-bit words, for 8 rows of 8 bytes; both sources advance by stride bytes per row */
+    asm volatile(
+        "pxor %%mm7, %%mm7	\n\t" /* mm7 = 0, the high halves for the unpacks below */
+        "movl $-128, %%eax	\n\t" /* eax = byte offset relative to block+64, counts up from -128 to 0 */
+        ".balign 16		\n\t"
+        "1:			\n\t"
+        "movq (%0), %%mm0	\n\t" /* 8 bytes of s1 */
+        "movq (%1), %%mm2	\n\t" /* 8 bytes of s2 */
+        "movq %%mm0, %%mm1	\n\t"
+        "movq %%mm2, %%mm3	\n\t"
+        "punpcklbw %%mm7, %%mm0	\n\t" /* zero-extend bytes to words */
+        "punpckhbw %%mm7, %%mm1	\n\t"
+        "punpcklbw %%mm7, %%mm2	\n\t"
+        "punpckhbw %%mm7, %%mm3	\n\t"
+        "psubw %%mm2, %%mm0	\n\t" /* s1 - s2, low four words */
+        "psubw %%mm3, %%mm1	\n\t" /* s1 - s2, high four words */
+        "movq %%mm0, (%2, %%eax)\n\t"
+        "movq %%mm1, 8(%2, %%eax)\n\t"
+        "addl %3, %0		\n\t"
+        "addl %3, %1		\n\t"
+        "addl $16, %%eax	\n\t" /* 16 output bytes per row */
+        "jnz 1b			\n\t" /* 8 iterations until the offset reaches 0 */
+        : "+r" (s1), "+r" (s2)
+        : "r" (block+64), "r" (stride)
+        : "%eax", "memory" /* block is written through a plain "r" input, so the compiler must be told memory changes */
+    );
 }
 
 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
@@ -203,12 +313,12 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
     pix = pixels;
     MOVQ_ZERO(mm7);
     i = 4;
-    while (i) {
+    do {
 	__asm __volatile(
-		"movq	%2, %%mm0\n\t"
-		"movq	8%2, %%mm1\n\t"
-		"movq	16%2, %%mm2\n\t"
-		"movq	24%2, %%mm3\n\t"
+		"movq	(%2), %%mm0\n\t"
+		"movq	8(%2), %%mm1\n\t"
+		"movq	16(%2), %%mm2\n\t"
+		"movq	24(%2), %%mm3\n\t"
 		"movq	%0, %%mm4\n\t"
 		"movq	%1, %%mm6\n\t"
 		"movq	%%mm4, %%mm5\n\t"
@@ -226,809 +336,42 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line
 		"movq	%%mm0, %0\n\t"
 		"movq	%%mm2, %1\n\t"
 		:"+m"(*pix), "+m"(*(pix+line_size))
-		:"m"(*p)
+		:"r"(p)
 		:"memory");
         pix += line_size*2;
         p += 16;
-        i--;
-    };
+    } while (--i);
 }
 
 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-    int hh;
-    UINT8 *p;
-    const UINT8 *pix;
-
-    p   = block;
-    pix = pixels; // 2s
-#if 0
-    do {
-      __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-	pix += line_size;
-	p += line_size;
-    } while (--h);
-#else
-    // this optimized code is not very usefull
-    // the above loop is definitely faster
-    // at least on Celeron 500MHz
-    hh = h & 3;
-    while (hh) {
-      __asm __volatile(
-	  "movq	%1, %%mm0\n\t"
-	  "movq	%%mm0, %0\n\t"
-	  :"=m"(*p)
-	  :"m"(*pix)
-	  :"memory");
-	pix += line_size;
-	p += line_size;
-	hh--;
-    }
-    hh=h>>2;
-    while (hh) {
-    __asm __volatile(
-	"movq	(%1), %%mm0		\n\t"
-	"movq	(%1, %2), %%mm1		\n\t"
-	"movq	(%1, %2, 2), %%mm2	\n\t"
-	"movq	(%1, %3), %%mm3		\n\t"
-	"movq	%%mm0, (%0)		\n\t"
-	"movq	%%mm1, (%0, %2)		\n\t"
-	"movq	%%mm2, (%0, %2, 2)	\n\t"
-	"movq	%%mm3, (%0, %3)		\n\t"
-	::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
-	:"memory");
-        pix += line_size*4;
-	p += line_size*4;
-        hh--;
-    }
-#endif
-}
-
-static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm4);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size; p += line_size;
-  } while (--h);
-}
-
-static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm4);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p += line_size;
-  } while (--h);
-}
-
-static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels; // 1s
-  MOVQ_ZERO(mm7);
-  MOVQ_WTWO(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p += line_size;
-  } while(--h);
-}
-
-static void   put_no_rnd_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while (--h);
-}
-
-static void put_no_rnd_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while(--h);
-}
-
-static void   put_no_rnd_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  JUMPALIGN();
-  do {
     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while(--h);
-}
-
-static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm6, %%mm0\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  }
-  while (--h);
-}
-
-static void   avg_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"paddusw %%mm4, %%mm1\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm1\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"paddusw %%mm6, %%mm0\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while (--h);
-}
-
-static void   avg_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"paddusw %%mm4, %%mm1\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm1\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"paddusw %%mm6, %%mm0\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void   avg_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  // this doesn't seem to be used offten - so
-  // the inside usage of mm_wone is not optimized
-  MOVQ_WTWO(mm6);
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"movq	%3, %%mm5\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"movq	%0, %%mm1\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm5, %%mm0\n\t"
-	"paddusw %%mm5, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size)), "m"(mm_wone)
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void avg_no_rnd_pixels_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%0, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while (--h);
-}
-
-static void   avg_no_rnd_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	"movq	%0, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm5, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
- } while (--h);
-}
-
-static void   avg_no_rnd_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	%0, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm5, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void   avg_no_rnd_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"movq	%0, %%mm1\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p += line_size;
-  } while(--h);
-}
-
-static void sub_pixels_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   8;
-  } while (--h);
-}
-
-static void sub_pixels_x2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
-}
-
-static void sub_pixels_y2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WONE(mm6);
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"+m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
-}
-
-static void   sub_pixels_xy2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  MOVQ_ZERO(mm7);
-  MOVQ_WTWO(mm6);
-  JUMPALIGN();
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"movq	%0, %%mm1\n\t"
-	"movq	8%0, %%mm3\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"psubsw %%mm0, %%mm1\n\t"
-	"psubsw %%mm2, %%mm3\n\t"
-	"movq	%%mm1, %0\n\t"
-	"movq	%%mm3, 8%0\n\t"
-	:"+m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   8 ;
-  } while(--h);
+	 "lea (%3, %3), %%eax		\n\t" /* eax = 2 * line_size; this asm copies 8 bytes per row from pixels to block */
+	 ".balign 8			\n\t"
+	 "1:				\n\t"
+	 "movq (%1), %%mm0		\n\t" /* rows 0 and 1 of this group of four */
+	 "movq (%1, %3), %%mm1		\n\t"
+     	 "movq %%mm0, (%2)		\n\t"
+	 "movq %%mm1, (%2, %3)		\n\t"
+	 "addl %%eax, %1		\n\t" /* advance source and destination by two rows */
+         "addl %%eax, %2       		\n\t"
+	 "movq (%1), %%mm0		\n\t" /* rows 2 and 3 of this group of four */
+	 "movq (%1, %3), %%mm1		\n\t"
+	 "movq %%mm0, (%2)		\n\t"
+	 "movq %%mm1, (%2, %3)		\n\t"
+	 "addl %%eax, %1		\n\t"
+	 "addl %%eax, %2       		\n\t"
+	 "subl $4, %0			\n\t" /* four rows done */
+	 "jnz 1b			\n\t" /* NOTE(review): the loop ends only when h reaches exactly 0, so h must be a positive multiple of 4 */
+	 : "+g"(h), "+r" (pixels),  "+r" (block)
+	 : "r"(line_size)
+	 : "%eax", "memory"
+	);
 }
 
 static void clear_blocks_mmx(DCTELEM *blocks)
 {
-        asm volatile(
+    __asm __volatile(
                 "pxor %%mm7, %%mm7		\n\t"
                 "movl $-128*6, %%eax		\n\t"
                 "1:				\n\t"
@@ -1043,7 +386,9 @@ static void clear_blocks_mmx(DCTELEM *blocks)
         );
 }
 
+#if 0
 static void just_return() { return; }
+#endif
 
 void dsputil_init_mmx(void)
 {
@@ -1065,10 +410,11 @@ void dsputil_init_mmx(void)
 
     if (mm_flags & MM_MMX) {
         get_pixels = get_pixels_mmx;
+        diff_pixels = diff_pixels_mmx;
         put_pixels_clamped = put_pixels_clamped_mmx;
         add_pixels_clamped = add_pixels_clamped_mmx;
         clear_blocks= clear_blocks_mmx;
-       
+
         pix_abs16x16     = pix_abs16x16_mmx;
         pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
         pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
@@ -1088,7 +434,7 @@ void dsputil_init_mmx(void)
         put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
         put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
         put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
-        
+
         avg_pixels_tab[0] = avg_pixels_mmx;
         avg_pixels_tab[1] = avg_pixels_x2_mmx;
         avg_pixels_tab[2] = avg_pixels_y2_mmx;
@@ -1098,44 +444,37 @@ void dsputil_init_mmx(void)
         avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
         avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
         avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
-        
-        sub_pixels_tab[0] = sub_pixels_mmx;
-        sub_pixels_tab[1] = sub_pixels_x2_mmx;
-        sub_pixels_tab[2] = sub_pixels_y2_mmx;
-        sub_pixels_tab[3] = sub_pixels_xy2_mmx;
 
         if (mm_flags & MM_MMXEXT) {
             pix_abs16x16    = pix_abs16x16_mmx2;
             pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
             pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
             pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
-            
+
             pix_abs8x8    = pix_abs8x8_mmx2;
             pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
-            
-            put_pixels_tab[1] = put_pixels_x2_sse;
-            put_pixels_tab[2] = put_pixels_y2_sse;
-            
-            avg_pixels_tab[0] = avg_pixels_sse;
-            avg_pixels_tab[1] = avg_pixels_x2_sse;
-            avg_pixels_tab[2] = avg_pixels_y2_sse;
-            avg_pixels_tab[3] = avg_pixels_xy2_sse;
-
-            sub_pixels_tab[1] = sub_pixels_x2_sse;
-            sub_pixels_tab[2] = sub_pixels_y2_sse;
+
+            put_pixels_tab[1] = put_pixels_x2_mmx2;
+            put_pixels_tab[2] = put_pixels_y2_mmx2;
+            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
+            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
+
+            avg_pixels_tab[0] = avg_pixels_mmx2;
+            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
+            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
+            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
         } else if (mm_flags & MM_3DNOW) {
             put_pixels_tab[1] = put_pixels_x2_3dnow;
             put_pixels_tab[2] = put_pixels_y2_3dnow;
-            
+            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
+            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
+
             avg_pixels_tab[0] = avg_pixels_3dnow;
             avg_pixels_tab[1] = avg_pixels_x2_3dnow;
             avg_pixels_tab[2] = avg_pixels_y2_3dnow;
             avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
-
-            sub_pixels_tab[1] = sub_pixels_x2_3dnow;
-            sub_pixels_tab[2] = sub_pixels_y2_3dnow;
         }
 
         /* idct */
@@ -1181,12 +520,25 @@ void dsputil_init_mmx(void)
     avg_no_rnd_pixels_tab[2] = just_return;
     avg_no_rnd_pixels_tab[3] = just_return;
 
-    sub_pixels_tab[0] = just_return;
-    sub_pixels_tab[1] = just_return;
-    sub_pixels_tab[2] = just_return;
-    sub_pixels_tab[3] = just_return;
-
     //av_fdct = just_return;
     //ff_idct = just_return;
 #endif
 }
+
+/* Remove any non-bit-exact operations (for testing purposes). NOTE that
+   this function should be kept as small as possible because it is
+   always difficult to automatically test non-bit-exact cases. */
+void dsputil_set_bit_exact_mmx(void)
+{
+    if (mm_flags & MM_MMX) {
+        if (mm_flags & MM_MMXEXT) {
+            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
+            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
+            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
+        } else if (mm_flags & MM_3DNOW) {
+            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
+            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
+            avg_pixels_tab[3] = avg_pixels_xy2_mmx;
+        }
+    }
+}
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
index 830fe9f3b..a16ccc88b 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
@@ -1,342 +1,296 @@
 /*
  * DSP utils : average functions are compiled twice for 3dnow/mmx2
- * Copyright (c) 2000, 2001 Gerard Lantau.
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002 Michael Niedermayer
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ */
+ 
+/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
+   clobber bug - now it will work with 2.95.2 and also with -fPIC
  */
-
 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>2;
-  dh=h&3;
-  while(hh--) {
     __asm __volatile(
-	"movq	(%1), %%mm0\n\t"
-	"movq	1(%1), %%mm1\n\t"
-	"movq	(%1, %2), %%mm2\n\t"
-	"movq	1(%1, %2), %%mm3\n\t"
-	"movq	(%1, %2, 2), %%mm4\n\t"
-	"movq	1(%1, %2, 2), %%mm5\n\t"
-	"movq	(%1, %3), %%mm6\n\t"
-	"movq	1(%1, %3), %%mm7\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm3, %%mm2\n\t"
-	PAVGB"  %%mm5, %%mm4\n\t"
-	PAVGB"  %%mm7, %%mm6\n\t"
-	"movq	%%mm0, (%0)\n\t"
-	"movq	%%mm2, (%0, %2)\n\t"
-	"movq	%%mm4, (%0, %2, 2)\n\t"
-	"movq	%%mm6, (%0, %3)\n\t"
-	::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3)
-	:"memory");
-     pix += line_size*4; p += line_size*4;
-  }
-  while(dh--) {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-     pix += line_size; p += line_size;
-  }
+	"lea (%3, %3), %%eax		\n\t"
+	"1:				\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"addl %%eax, %1			\n\t"
+	"addl %%eax, %2			\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	"addl %%eax, %1			\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
-
-static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+ 
+/* GL: this function rounds incorrectly if an overflow occurs */
+static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%2, %%mm0\n\t"
-	"movq	%3, %%mm1\n\t"
-	"movq	%4, %%mm2\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm2, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"=m"(*p), "=m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size)),
-	 "m"(*(pix+line_size*2))
-	:"memory");
-     pix += line_size*2;
-     p += line_size*2;
-  }
-  if(dh) {
+    MOVQ_BONE(mm6);
     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-  }
+	"lea (%3, %3), %%eax		\n\t"
+	"1:				\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm2		\n\t"
+	"movq 1(%1), %%mm1		\n\t"
+	"movq 1(%1, %3), %%mm3		\n\t"
+	"addl %%eax, %1			\n\t"
+	"psubusb %%mm6, %%mm0		\n\t"
+	"psubusb %%mm6, %%mm2		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq 1(%1), %%mm1		\n\t"
+	"movq (%1, %3), %%mm2		\n\t"
+	"movq 1(%1, %3), %%mm3		\n\t"
+	"addl %%eax, %2			\n\t"
+	"addl %%eax, %1			\n\t"
+	"psubusb %%mm6, %%mm0		\n\t"
+	"psubusb %%mm6, %%mm2		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>2;
-  dh=h&3;
-  while(hh--) {
     __asm __volatile(
-	"movq	(%0), %%mm0\n\t"
-	"movq	(%1), %%mm1\n\t"
-	"movq	(%0, %2), %%mm2\n\t"
-	"movq	(%1, %2), %%mm3\n\t"
-	"movq	(%0, %2, 2), %%mm4\n\t"
-	"movq	(%1, %2, 2), %%mm5\n\t"
-	"movq	(%0, %3), %%mm6\n\t"
-	"movq	(%1, %3), %%mm7\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm3, %%mm2\n\t"
-	PAVGB"  %%mm5, %%mm4\n\t"
-	PAVGB"  %%mm7, %%mm6\n\t"
-	"movq	%%mm0, (%0)\n\t"
-	"movq	%%mm2, (%0, %2)\n\t"
-	"movq	%%mm4, (%0, %2, 2)\n\t"
-	"movq	%%mm6, (%0, %3)\n\t"
-	::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3)
-	:"memory");
-     pix += line_size*4; p += line_size*4;
-  }
-  while(dh--) {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-     pix += line_size; p += line_size;
-  }
+	"lea (%3, %3), %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"subl %3, %2			\n\t"
+	"1:				\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm2	\n\t"
+	"addl %%eax, %1			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq %%mm0, (%2, %3)		\n\t"
+	"movq %%mm1, (%2, %%eax)	\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"addl %%eax, %2			\n\t"
+	"addl %%eax, %1			\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"movq %%mm1, (%2, %%eax)	\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D" (block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_x2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
+/* GL: this function rounds incorrectly if an overflow occurs */
+static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	1%2, %%mm3\n\t"
-	"movq	%3, %%mm4\n\t"
-	"movq	1%3, %%mm5\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"	%%mm3, %%mm2\n\t"
-	PAVGB"	%%mm2, %%mm0\n\t"
-	PAVGB"	%%mm5, %%mm4\n\t"
-	PAVGB"	%%mm4, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"+m"(*p), "+m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size*2;
-   p +=   line_size*2;
-  }
-  if(dh) {
+    MOVQ_BONE(mm6);
     __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	1%1, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%%mm2, %%mm1\n\t"
-	PAVGB"	%%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-  }
+	"lea (%3, %3), %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"subl %3, %2			\n\t"
+	"1:				\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm2	\n\t"
+	"addl %%eax, %1			\n\t"
+	"psubusb %%mm6, %%mm1		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq %%mm0, (%2, %3)		\n\t"
+	"movq %%mm1, (%2, %%eax)	\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	"addl %%eax, %2			\n\t"
+	"addl %%eax, %1			\n\t"
+	"psubusb %%mm6, %%mm1		\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"movq %%mm1, (%2, %%eax)	\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D" (block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
 
-static void  DEF(avg_pixels_y2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  int dh, hh;
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	%3, %%mm3\n\t"
-	"movq	%3, %%mm4\n\t"
-	"movq	%4, %%mm5\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"	%%mm3, %%mm2\n\t"
-	PAVGB"	%%mm2, %%mm0\n\t"
-	PAVGB"	%%mm5, %%mm4\n\t"
-	PAVGB"	%%mm4, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"+m"(*p), "+m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2))
-	:"memory");
-   pix += line_size*2;
-   p +=   line_size*2;
-  }
-  if(dh) {
     __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%2, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%%mm2, %%mm1\n\t"
-	PAVGB"	%%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-  }
+	"lea (%3, %3), %%eax		\n\t"
+	"1:				\n\t"
+	"movq (%2), %%mm0		\n\t"
+	"movq (%2, %3), %%mm1		\n\t"
+	PAVGB" (%1), %%mm0		\n\t"
+	PAVGB" (%1, %3), %%mm1		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"addl %%eax, %1			\n\t"
+	"addl %%eax, %2			\n\t"
+	"movq (%2), %%mm0		\n\t"
+	"movq (%2, %3), %%mm1		\n\t"
+	PAVGB" (%1), %%mm0		\n\t"
+	PAVGB" (%1, %3), %%mm1		\n\t"
+	"addl %%eax, %1			\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels_xy2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo));
-  do {
     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	PAVGB"	%0, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"+m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
+	"lea (%3, %3), %%eax		\n\t"
+	"1:				\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm2		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm2		\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" (%2, %3), %%mm2		\n\t"
+	"addl %%eax, %1			\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"movq (%1, %3), %%mm2		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	PAVGB" 1(%1, %3), %%mm2		\n\t"
+	"addl %%eax, %2			\n\t"
+	"addl %%eax, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" (%2, %3), %%mm2		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
 
-static void DEF(sub_pixels_x2)( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7":);
-  do {
     __asm __volatile(
-	"movq	1%1, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"+m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
+	"lea (%3, %3), %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	"subl %3, %2			\n\t"
+	"1:				\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm2	\n\t"
+	"addl %%eax, %1			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	"movq (%2, %3), %%mm3		\n\t"
+	"movq (%2, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm0		\n\t"
+	PAVGB" %%mm4, %%mm1		\n\t"
+	"movq %%mm0, (%2, %3)		\n\t"
+	"movq %%mm1, (%2, %%eax)	\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	"addl %%eax, %2			\n\t"
+	"addl %%eax, %1			\n\t"
+	"movq (%2, %3), %%mm3		\n\t"
+	"movq (%2, %%eax), %%mm4	\n\t"
+	PAVGB" %%mm3, %%mm2		\n\t"
+	PAVGB" %%mm4, %%mm1		\n\t"
+	"movq %%mm2, (%2, %3)		\n\t"
+	"movq %%mm1, (%2, %%eax)	\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax", "memory");
 }
 
-static void DEF(sub_pixels_y2)( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
+// Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter.
+static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
 {
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7":);
-  do {
+    MOVQ_BONE(mm6);
     __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"+m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
+	"lea (%3, %3), %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	PAVGB" 1(%1), %%mm0		\n\t"
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq (%1, %%eax), %%mm2	\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"psubusb %%mm6, %%mm2		\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 1(%1, %%eax), %%mm2	\n\t"
+	"addl %%eax, %1			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" %%mm2, %%mm1		\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" (%2, %3), %%mm1		\n\t"
+	"movq %%mm0, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"movq (%1, %3), %%mm1		\n\t"
+	"movq (%1, %%eax), %%mm0	\n\t"
+	PAVGB" 1(%1, %3), %%mm1		\n\t"
+	PAVGB" 1(%1, %%eax), %%mm0	\n\t"
+	"addl %%eax, %2			\n\t"
+	"addl %%eax, %1			\n\t"
+	PAVGB" %%mm1, %%mm2		\n\t"
+	PAVGB" %%mm0, %%mm1		\n\t"
+	PAVGB" (%2), %%mm2		\n\t"
+	PAVGB" (%2, %3), %%mm1		\n\t"
+	"movq %%mm2, (%2)		\n\t"
+	"movq %%mm1, (%2, %3)		\n\t"
+	"addl %%eax, %2			\n\t"
+	"subl $4, %0			\n\t"
+	"jnz 1b				\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r" (line_size)
+	:"%eax",  "memory");
 }
-
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
new file mode 100644
index 000000000..dc70c9c8e
--- /dev/null
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
@@ -0,0 +1,305 @@
+/*
+ * DSP utils mmx functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ */
+
+// put_pixels
+static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm __volatile(
+	"lea	(%3, %3), %%eax		\n\t"
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	1(%1), %%mm1		\n\t"
+	"movq	(%1, %3), %%mm2		\n\t"
+	"movq	1(%1, %3), %%mm3	\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%2)		\n\t"
+	"movq	%%mm5, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	1(%1), %%mm1		\n\t"
+	"movq	(%1, %3), %%mm2		\n\t"
+	"movq	1(%1, %3), %%mm3	\n\t"
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+	"movq	%%mm4, (%2)		\n\t"
+	"movq	%%mm5, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r"(line_size)
+	:"eax", "memory");
+}
+
+static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm __volatile(
+	"lea (%3, %3), %%eax		\n\t"
+	"movq (%1), %%mm0		\n\t"
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1, %3), %%mm1		\n\t"
+	"movq	(%1, %%eax),%%mm2	\n\t"
+	PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
+	"movq	%%mm4, (%2)		\n\t"
+	"movq	%%mm5, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+	"movq	(%1, %3), %%mm1		\n\t"
+	"movq	(%1, %%eax),%%mm0	\n\t"
+	PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
+	"movq	%%mm4, (%2)		\n\t"
+	"movq	%%mm5, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r"(line_size)
+	:"eax", "memory");
+}
+
+static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm __volatile(
+	"movq	(%1), %%mm0		\n\t"
+	"movq	1(%1), %%mm4		\n\t"
+	"movq	%%mm0, %%mm1		\n\t"
+	"movq	%%mm4, %%mm5		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm5		\n\t"
+	"paddusw %%mm0, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm5		\n\t"
+	"xorl	%%eax, %%eax		\n\t"
+	"addl	%3, %1			\n\t"
+	".balign 8      		\n\t"
+	"1:				\n\t"
+	"movq	(%1, %%eax), %%mm0	\n\t"
+	"movq	1(%1, %%eax), %%mm2	\n\t"
+	"movq	%%mm0, %%mm1		\n\t"
+	"movq	%%mm2, %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm2, %%mm0	 	\n\t"
+	"paddusw %%mm3, %%mm1		\n\t"
+	"paddusw %%mm6, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm5		\n\t"
+	"paddusw %%mm0, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm5		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"psrlw	$2, %%mm5		\n\t"
+	"packuswb  %%mm5, %%mm4		\n\t"
+	"movq	%%mm4, (%2, %%eax)	\n\t"
+	"addl	%3, %%eax		\n\t"
+
+	"movq	(%1, %%eax), %%mm2	\n\t" // 0 <-> 2   1 <-> 3
+	"movq	1(%1, %%eax), %%mm4	\n\t"
+	"movq	%%mm2, %%mm3		\n\t"
+	"movq	%%mm4, %%mm5		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm3		\n\t"
+	"punpckhbw %%mm7, %%mm5		\n\t"
+	"paddusw %%mm2, %%mm4	 	\n\t"
+	"paddusw %%mm3, %%mm5		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm6, %%mm1		\n\t"
+	"paddusw %%mm4, %%mm0		\n\t"
+	"paddusw %%mm5, %%mm1		\n\t"
+	"psrlw	$2, %%mm0		\n\t"
+	"psrlw	$2, %%mm1		\n\t"
+	"packuswb  %%mm1, %%mm0		\n\t"
+	"movq	%%mm0, (%2, %%eax)	\n\t"
+	"addl	%3, %%eax		\n\t"
+
+	"subl	$2, %0			\n\t"
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels)
+	:"D"(block), "r"(line_size)
+	:"eax", "memory");
+}
+
+// avg_pixels
+// in case more speed is needed - unrolling would certainly help
+static void DEF(avg, pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    JUMPALIGN();
+    do {
+	__asm __volatile(
+	     "movq  %0, %%mm0		\n\t"
+	     "movq  %1, %%mm1		\n\t"
+	     PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	     "movq  %%mm2, %0		\n\t"
+	     :"+m"(*block)
+	     :"m"(*pixels)
+	     :"memory");
+	pixels += line_size;
+	block += line_size;
+    }
+    while (--h);
+}
+
+static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    JUMPALIGN();
+    do {
+	__asm __volatile(
+	    "movq  %1, %%mm0		\n\t"
+	    "movq  1%1, %%mm1		\n\t"
+	    "movq  %0, %%mm3		\n\t"
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+	    "movq  %%mm0, %0		\n\t"
+	    :"+m"(*block)
+	    :"m"(*pixels)
+	    :"memory");
+	pixels += line_size;
+	block += line_size;
+    } while (--h);
+}
+
+static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm __volatile(
+	"lea	(%3, %3), %%eax		\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1, %3), %%mm1		\n\t"
+	"movq	(%1, %%eax), %%mm2	\n\t"
+	PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
+	"movq	(%2), %%mm3		\n\t"
+	PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
+	"movq	(%2, %3), %%mm3		\n\t"
+	PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
+	"movq	%%mm0, (%2)		\n\t"
+	"movq	%%mm1, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+
+	"movq	(%1, %3), %%mm1		\n\t"
+	"movq	(%1, %%eax), %%mm0	\n\t"
+	PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
+	"movq	(%2), %%mm3		\n\t"
+	PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
+	"movq	(%2, %3), %%mm3		\n\t"
+	PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
+	"movq	%%mm0, (%2)		\n\t"
+	"movq	%%mm1, (%2, %3)		\n\t"
+	"addl	%%eax, %1		\n\t"
+	"addl	%%eax, %2		\n\t"
+
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels), "+D"(block)
+	:"r"(line_size)
+	:"eax", "memory");
+}
+
+// this routine is 'slightly' suboptimal but mostly unused
+static void DEF(avg, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+{
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm __volatile(
+	"movq	(%1), %%mm0		\n\t"
+	"movq	1(%1), %%mm4		\n\t"
+	"movq	%%mm0, %%mm1		\n\t"
+	"movq	%%mm4, %%mm5		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm5		\n\t"
+	"paddusw %%mm0, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm5		\n\t"
+	"xorl	%%eax, %%eax		\n\t"
+	"addl	%3, %1			\n\t"
+	".balign 8			\n\t"
+	"1:				\n\t"
+	"movq	(%1, %%eax), %%mm0	\n\t"
+	"movq	1(%1, %%eax), %%mm2	\n\t"
+	"movq	%%mm0, %%mm1		\n\t"
+	"movq	%%mm2, %%mm3		\n\t"
+	"punpcklbw %%mm7, %%mm0		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpckhbw %%mm7, %%mm1		\n\t"
+	"punpckhbw %%mm7, %%mm3		\n\t"
+	"paddusw %%mm2, %%mm0	 	\n\t"
+	"paddusw %%mm3, %%mm1		\n\t"
+	"paddusw %%mm6, %%mm4		\n\t"
+	"paddusw %%mm6, %%mm5		\n\t"
+	"paddusw %%mm0, %%mm4		\n\t"
+	"paddusw %%mm1, %%mm5		\n\t"
+	"psrlw	$2, %%mm4		\n\t"
+	"psrlw	$2, %%mm5		\n\t"
+		"movq	(%2, %%eax), %%mm3	\n\t"
+	"packuswb  %%mm5, %%mm4		\n\t"
+		"pcmpeqd %%mm2, %%mm2	\n\t"
+		"paddb %%mm2, %%mm2	\n\t"
+		PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
+		"movq	%%mm5, (%2, %%eax)	\n\t"
+	"addl	%3, %%eax		\n\t"
+
+	"movq	(%1, %%eax), %%mm2	\n\t" // 0 <-> 2   1 <-> 3
+	"movq	1(%1, %%eax), %%mm4	\n\t"
+	"movq	%%mm2, %%mm3		\n\t"
+	"movq	%%mm4, %%mm5		\n\t"
+	"punpcklbw %%mm7, %%mm2		\n\t"
+	"punpcklbw %%mm7, %%mm4		\n\t"
+	"punpckhbw %%mm7, %%mm3		\n\t"
+	"punpckhbw %%mm7, %%mm5		\n\t"
+	"paddusw %%mm2, %%mm4	 	\n\t"
+	"paddusw %%mm3, %%mm5		\n\t"
+	"paddusw %%mm6, %%mm0		\n\t"
+	"paddusw %%mm6, %%mm1		\n\t"
+	"paddusw %%mm4, %%mm0		\n\t"
+	"paddusw %%mm5, %%mm1		\n\t"
+	"psrlw	$2, %%mm0		\n\t"
+	"psrlw	$2, %%mm1		\n\t"
+		"movq	(%2, %%eax), %%mm3	\n\t"
+	"packuswb  %%mm1, %%mm0		\n\t"
+		"pcmpeqd %%mm2, %%mm2	\n\t"
+		"paddb %%mm2, %%mm2	\n\t"
+		PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
+		"movq	%%mm1, (%2, %%eax)	\n\t"
+	"addl	%3, %%eax		\n\t"
+
+	"subl	$2, %0			\n\t"
+	"jnz	1b			\n\t"
+	:"+g"(h), "+S"(pixels)
+	:"D"(block), "r"(line_size)
+	:"eax", "memory");
+}
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index e9d48383d..7135beb21 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -1,6 +1,6 @@
 /*
  * MMX optimized forward DCT
- * The gcc porting is Copyright (c) 2001 Gerard Lantau.
+ * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
  *
  * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
  * 
@@ -10,7 +10,7 @@
 #include "../common.h"
 #include "mmx.h"
 
-//#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
 
 //////////////////////////////////////////////////////////////////////
 //
diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c
index 618c1cfde..298c8a8b0 100644
--- a/src/libffmpeg/libavcodec/i386/idct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c
@@ -528,8 +528,12 @@ static inline void idct_col (int16_t * col, int offset)
     movq_r2m (mm3, *(col+offset+4*8));	// save y4
 
     movq_r2m (mm4, *(col+offset+3*8));	// save y3
-}
 
+#undef T1
+#undef T2
+#undef T3
+#undef C4
+}
 
 static int32_t rounder0[] ATTR_ALIGN(8) =
     rounder ((1 << (COL_SHIFT - 1)) - 0.5);
@@ -547,6 +551,8 @@ static int32_t rounder3[] ATTR_ALIGN(8) =
 static int32_t rounder5[] ATTR_ALIGN(8) =
     rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
 
+#undef COL_SHIFT
+#undef ROW_SHIFT
 
 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
 void idct (int16_t * block)					\
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index e704c4219..9b76cdb07 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -1,20 +1,20 @@
 /*
  * MMX optimized motion estimation
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * mostly by Michael Niedermayer <michaelni@gmx.at>
  */
@@ -26,6 +26,8 @@ static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={
 0x0002000200020002,
 };
 
+static __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
+
 static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
 {
     int len= -(stride<<h);
@@ -115,6 +117,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
     int len= -(stride<<h);
     asm volatile(
         ".balign 16			\n\t"
+        "movq "MANGLE(bone)", %%mm5	\n\t"
         "1:				\n\t" 
         "movq (%1, %%eax), %%mm0	\n\t"
         "movq (%2, %%eax), %%mm2	\n\t"
@@ -122,6 +125,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
         "movq 1(%2, %%eax), %%mm3	\n\t"
         "pavgb %%mm2, %%mm0		\n\t"
         "pavgb %%mm1, %%mm3		\n\t"
+        "psubusb %%mm5, %%mm3		\n\t"
         "pavgb %%mm3, %%mm0		\n\t"
         "movq (%3, %%eax), %%mm2	\n\t"
         "psadbw %%mm2, %%mm0		\n\t"
@@ -132,6 +136,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
         "movq 1(%2, %%eax), %%mm4	\n\t"
         "pavgb %%mm3, %%mm1		\n\t"
         "pavgb %%mm4, %%mm2		\n\t"
+        "psubusb %%mm5, %%mm2		\n\t"
         "pavgb %%mm1, %%mm2		\n\t"
         "movq (%3, %%eax), %%mm1	\n\t"
         "psadbw %%mm1, %%mm2		\n\t"
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index b7a782f56..390aa554c 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -1,34 +1,30 @@
 /*
  * The simplest mpeg encoder (well, it was the simplest!)
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000,2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
- * h263 dequantizer by Michael Niedermayer <michaelni@gmx.at>
+ * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  */
 
-#include "xine-utils/xineutils.h"
 #include "../dsputil.h"
 #include "../mpegvideo.h"
 #include "../avcodec.h"
-#include "../mangle.h"
 
 extern UINT8 zigzag_end[64];
-extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
-extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale);
 
 extern UINT8 zigzag_direct_noperm[64];
 extern UINT16 inv_zigzag_direct16[64];
@@ -195,103 +191,86 @@ asm volatile(
 static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
-    int i, level, nCoeffs;
+    int nCoeffs;
     const UINT16 *quant_matrix;
     
     if(s->alternate_scan) nCoeffs= 64;
     else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ];
 
     if (s->mb_intra) {
+        int block0;
         if (n < 4) 
-            block[0] = block[0] * s->y_dc_scale;
+            block0 = block[0] * s->y_dc_scale;
         else
-            block[0] = block[0] * s->c_dc_scale;
-        /* isnt used anymore (we have a h263 unquantizer since some time)
-	if (s->out_format == FMT_H263) {
-            i = 1;
-            goto unquant_even;
-        }*/
+            block0 = block[0] * s->c_dc_scale;
         /* XXX: only mpeg1 */
         quant_matrix = s->intra_matrix;
-	i=1;
-	/* Align on 4 elements boundary */
-	while(i&3)
-	{
-            level = block[i];
-            if (level) {
-                if (level < 0) level = -level;
-                    level = (int)(level * qscale * quant_matrix[i]) >> 3;
-                    level = (level - 1) | 1;
-                if (block[i] < 0) level = -level;
-                block[i] = level;
-            }
-	    i++;
-	}
-	__asm __volatile(
-	"movd	%0, %%mm6\n\t"       /* mm6 = qscale | 0  */
-	"punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm6, %%mm7\n\t"
-	"movq	%1, %%mm5\n\t"
-	"packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */
-	"pxor	%%mm6, %%mm6\n\t"
-	::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory");
-        for(;i<nCoeffs;i+=4) {
-		__asm __volatile(
-			"movq	%1, %%mm0\n\t"
-			"movq	%%mm7, %%mm1\n\t"
-			"movq	%%mm0, %%mm2\n\t"
-			"movq	%%mm0, %%mm3\n\t"
-			"pcmpgtw %%mm6, %%mm2\n\t"
-			"pmullw	%2, %%mm1\n\t"
-			"pandn	%%mm4, %%mm2\n\t"
-			"por	%%mm5, %%mm2\n\t"
-			"pmullw	%%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */
-
-			"pcmpeqw %%mm6, %%mm3\n\t"
-			"pmullw	%%mm0, %%mm1\n\t"
-			"psraw	$3, %%mm1\n\t"
-			"psubw	%%mm5, %%mm1\n\t"   /* block[i] --; */
-			"pandn	%%mm4, %%mm3\n\t"  /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */
-			"por	%%mm5, %%mm1\n\t"   /* block[i] |= 1 */
-			"pmullw %%mm2, %%mm1\n\t"   /* change signs again */
-
-			"pand	%%mm3, %%mm1\n\t" /* nullify if was zero */
-			"movq	%%mm1, %0"
-			:"=m"(block[i])
-			:"m"(block[i]), "m"(quant_matrix[i])
-			:"memory");
-        }
-    } else {
-        i = 0;
-//    unquant_even:
-        quant_matrix = s->non_intra_matrix;
-	/* Align on 4 elements boundary */
-	while(i&7)
-	{
-	    level = block[i];
-            if (level) {
-                if (level < 0) level = -level;
-                    level = (((level << 1) + 1) * qscale *
-                             ((int) quant_matrix[i])) >> 4;
-                    level = (level - 1) | 1;
-                if(block[i] < 0) level = -level;
-                block[i] = level;
-	    }
-	    i++;
-	}
 asm volatile(
 		"pcmpeqw %%mm7, %%mm7		\n\t"
 		"psrlw $15, %%mm7		\n\t"
 		"movd %2, %%mm6			\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
+                "movl %3, %%eax			\n\t"
 		".balign 16\n\t"
 		"1:				\n\t"
-		"movq (%0, %3), %%mm0		\n\t"
-		"movq 8(%0, %3), %%mm1		\n\t"
-		"movq (%1, %3), %%mm4		\n\t"
-		"movq 8(%1, %3), %%mm5		\n\t"
+		"movq (%0, %%eax), %%mm0	\n\t"
+		"movq 8(%0, %%eax), %%mm1	\n\t"
+		"movq (%1, %%eax), %%mm4	\n\t"
+		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
+		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
+		"pxor %%mm2, %%mm2		\n\t"
+		"pxor %%mm3, %%mm3		\n\t"
+		"pcmpgtw %%mm0, %%mm2		\n\t" // block[i] < 0 ? -1 : 0
+		"pcmpgtw %%mm1, %%mm3		\n\t" // block[i] < 0 ? -1 : 0
+		"pxor %%mm2, %%mm0		\n\t"
+		"pxor %%mm3, %%mm1		\n\t"
+		"psubw %%mm2, %%mm0		\n\t" // abs(block[i])
+		"psubw %%mm3, %%mm1		\n\t" // abs(block[i])
+		"pmullw %%mm4, %%mm0		\n\t" // abs(block[i])*q
+		"pmullw %%mm5, %%mm1		\n\t" // abs(block[i])*q
+		"pxor %%mm4, %%mm4		\n\t"
+		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
+		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"psraw $3, %%mm0		\n\t"
+		"psraw $3, %%mm1		\n\t"
+		"psubw %%mm7, %%mm0		\n\t"
+		"psubw %%mm7, %%mm1		\n\t"
+		"por %%mm7, %%mm0		\n\t"
+		"por %%mm7, %%mm1		\n\t"
+		"pxor %%mm2, %%mm0		\n\t"
+		"pxor %%mm3, %%mm1		\n\t"
+		"psubw %%mm2, %%mm0		\n\t"
+		"psubw %%mm3, %%mm1		\n\t"
+		"pandn %%mm0, %%mm4		\n\t"
+		"pandn %%mm1, %%mm5		\n\t"
+		"movq %%mm4, (%0, %%eax)	\n\t"
+		"movq %%mm5, 8(%0, %%eax)	\n\t"
+
+		"addl $16, %%eax		\n\t"
+		"js 1b				\n\t"
+		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
+		: "%eax", "memory"
+	);    
+        block[0]= block0;
+
+        } else {
+        quant_matrix = s->inter_matrix;
+asm volatile(
+		"pcmpeqw %%mm7, %%mm7		\n\t"
+		"psrlw $15, %%mm7		\n\t"
+		"movd %2, %%mm6			\n\t"
+		"packssdw %%mm6, %%mm6		\n\t"
+		"packssdw %%mm6, %%mm6		\n\t"
+                "movl %3, %%eax			\n\t"
+		".balign 16\n\t"
+		"1:				\n\t"
+		"movq (%0, %%eax), %%mm0	\n\t"
+		"movq 8(%0, %%eax), %%mm1	\n\t"
+		"movq (%1, %%eax), %%mm4	\n\t"
+		"movq 8(%1, %%eax), %%mm5	\n\t"
 		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
 		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
 		"pxor %%mm2, %%mm2		\n\t"
@@ -310,8 +289,8 @@ asm volatile(
 		"pmullw %%mm5, %%mm1		\n\t" // (abs(block[i])*2 + 1)*q
 		"pxor %%mm4, %%mm4		\n\t"
 		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
-		"pcmpeqw (%0, %3), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
-		"pcmpeqw 8(%0, %3), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
 		"psraw $4, %%mm0		\n\t"
 		"psraw $4, %%mm1		\n\t"
 		"psubw %%mm7, %%mm0		\n\t"
@@ -324,13 +303,145 @@ asm volatile(
 		"psubw %%mm3, %%mm1		\n\t"
 		"pandn %%mm0, %%mm4		\n\t"
 		"pandn %%mm1, %%mm5		\n\t"
-		"movq %%mm4, (%0, %3)		\n\t"
-		"movq %%mm5, 8(%0, %3)		\n\t"
+		"movq %%mm4, (%0, %%eax)	\n\t"
+		"movq %%mm5, 8(%0, %%eax)	\n\t"
 
-		"addl $16, %3			\n\t"
+		"addl $16, %%eax		\n\t"
 		"js 1b				\n\t"
-		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (2*(i-nCoeffs))
-		: "memory"
+		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
+		: "%eax", "memory"
+	);
+    }
+}
+
+static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
+                                     DCTELEM *block, int n, int qscale)
+{
+    int nCoeffs;
+    const UINT16 *quant_matrix;
+    
+    if(s->alternate_scan) nCoeffs= 64;
+    else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ];
+
+    if (s->mb_intra) {
+        int block0;
+        if (n < 4) 
+            block0 = block[0] * s->y_dc_scale;
+        else
+            block0 = block[0] * s->c_dc_scale;
+        quant_matrix = s->intra_matrix;
+asm volatile(
+		"pcmpeqw %%mm7, %%mm7		\n\t"
+		"psrlw $15, %%mm7		\n\t"
+		"movd %2, %%mm6			\n\t"
+		"packssdw %%mm6, %%mm6		\n\t"
+		"packssdw %%mm6, %%mm6		\n\t"
+                "movl %3, %%eax			\n\t"
+		".balign 16\n\t"
+		"1:				\n\t"
+		"movq (%0, %%eax), %%mm0	\n\t"
+		"movq 8(%0, %%eax), %%mm1	\n\t"
+		"movq (%1, %%eax), %%mm4	\n\t"
+		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
+		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
+		"pxor %%mm2, %%mm2		\n\t"
+		"pxor %%mm3, %%mm3		\n\t"
+		"pcmpgtw %%mm0, %%mm2		\n\t" // block[i] < 0 ? -1 : 0
+		"pcmpgtw %%mm1, %%mm3		\n\t" // block[i] < 0 ? -1 : 0
+		"pxor %%mm2, %%mm0		\n\t"
+		"pxor %%mm3, %%mm1		\n\t"
+		"psubw %%mm2, %%mm0		\n\t" // abs(block[i])
+		"psubw %%mm3, %%mm1		\n\t" // abs(block[i])
+		"pmullw %%mm4, %%mm0		\n\t" // abs(block[i])*q
+		"pmullw %%mm5, %%mm1		\n\t" // abs(block[i])*q
+		"pxor %%mm4, %%mm4		\n\t"
+		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
+		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"psraw $3, %%mm0		\n\t"
+		"psraw $3, %%mm1		\n\t"
+		"pxor %%mm2, %%mm0		\n\t"
+		"pxor %%mm3, %%mm1		\n\t"
+		"psubw %%mm2, %%mm0		\n\t"
+		"psubw %%mm3, %%mm1		\n\t"
+		"pandn %%mm0, %%mm4		\n\t"
+		"pandn %%mm1, %%mm5		\n\t"
+		"movq %%mm4, (%0, %%eax)	\n\t"
+		"movq %%mm5, 8(%0, %%eax)	\n\t"
+
+		"addl $16, %%eax		\n\t"
+		"js 1b				\n\t"
+		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
+		: "%eax", "memory"
+	);    
+        block[0]= block0;
+        // Note: no mismatch control is done for intra blocks, as errors cannot accumulate
+
+    } else {
+        quant_matrix = s->inter_matrix;
+asm volatile(
+		"pcmpeqw %%mm7, %%mm7		\n\t"
+                "psrlq $48, %%mm7		\n\t"
+		"movd %2, %%mm6			\n\t"
+		"packssdw %%mm6, %%mm6		\n\t"
+		"packssdw %%mm6, %%mm6		\n\t"
+                "movl %3, %%eax			\n\t"
+		".balign 16\n\t"
+		"1:				\n\t"
+		"movq (%0, %%eax), %%mm0	\n\t"
+		"movq 8(%0, %%eax), %%mm1	\n\t"
+		"movq (%1, %%eax), %%mm4	\n\t"
+		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
+		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
+		"pxor %%mm2, %%mm2		\n\t"
+		"pxor %%mm3, %%mm3		\n\t"
+		"pcmpgtw %%mm0, %%mm2		\n\t" // block[i] < 0 ? -1 : 0
+		"pcmpgtw %%mm1, %%mm3		\n\t" // block[i] < 0 ? -1 : 0
+		"pxor %%mm2, %%mm0		\n\t"
+		"pxor %%mm3, %%mm1		\n\t"
+		"psubw %%mm2, %%mm0		\n\t" // abs(block[i])
+		"psubw %%mm3, %%mm1		\n\t" // abs(block[i])
+		"paddw %%mm0, %%mm0		\n\t" // abs(block[i])*2
+		"paddw %%mm1, %%mm1		\n\t" // abs(block[i])*2
+		"pmullw %%mm4, %%mm0		\n\t" // abs(block[i])*2*q
+		"pmullw %%mm5, %%mm1		\n\t" // abs(block[i])*2*q
+		"paddw %%mm4, %%mm0		\n\t" // (abs(block[i])*2 + 1)*q
+		"paddw %%mm5, %%mm1		\n\t" // (abs(block[i])*2 + 1)*q
+		"pxor %%mm4, %%mm4		\n\t"
+		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
+		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"psrlw $4, %%mm0		\n\t"
+		"psrlw $4, %%mm1		\n\t"
+		"pxor %%mm2, %%mm0		\n\t"
+		"pxor %%mm3, %%mm1		\n\t"
+		"psubw %%mm2, %%mm0		\n\t"
+		"psubw %%mm3, %%mm1		\n\t"
+		"pandn %%mm0, %%mm4		\n\t"
+		"pandn %%mm1, %%mm5		\n\t"
+                "pxor %%mm4, %%mm7		\n\t"
+                "pxor %%mm5, %%mm7		\n\t"
+		"movq %%mm4, (%0, %%eax)	\n\t"
+		"movq %%mm5, 8(%0, %%eax)	\n\t"
+
+		"addl $16, %%eax		\n\t"
+		"js 1b				\n\t"
+                "movd 124(%0, %3), %%mm0	\n\t"
+                "movq %%mm7, %%mm6		\n\t"
+                "psrlq $32, %%mm7		\n\t"
+                "pxor %%mm6, %%mm7		\n\t"
+                "movq %%mm7, %%mm6		\n\t"
+                "psrlq $16, %%mm7		\n\t"
+                "pxor %%mm6, %%mm7		\n\t"
+                "pslld $31, %%mm7		\n\t"
+                "psrlq $15, %%mm7		\n\t"
+                "pxor %%mm7, %%mm0		\n\t"
+                "movd %%mm0, 124(%0, %3)	\n\t"
+                
+		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
+		: "%eax", "memory"
 	);
     }
 }
@@ -441,18 +552,16 @@ void unused_var_warning_killer(){
 void MPV_common_init_mmx(MpegEncContext *s)
 {
     if (mm_flags & MM_MMX) {
-        if (s->out_format == FMT_H263)
-        	s->dct_unquantize = dct_unquantize_h263_mmx;
-	else
-        	s->dct_unquantize = dct_unquantize_mpeg1_mmx;
-	
-	draw_edges = draw_edges_mmx;
-
-	if(mm_flags & MM_MMXEXT){
-	        dct_quantize= dct_quantize_MMX2;
-	}else{
-		dct_quantize= dct_quantize_MMX;
-	}
+        s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
+        s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
+        s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
+
+        draw_edges = draw_edges_mmx;
+
+        if(mm_flags & MM_MMXEXT){
+            dct_quantize= dct_quantize_MMX2;
+        } else {
+            dct_quantize= dct_quantize_MMX;
+        }
     }
 }
-
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index 2b3322915..aed537a23 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -1,21 +1,22 @@
 /*
-    Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
+ * MPEG video MMX templates
+ *
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
 #undef SPREADW
 #undef PMAXW
 #ifdef HAVE_MMX2
@@ -33,149 +34,165 @@
 
 static int RENAME(dct_quantize)(MpegEncContext *s,
                             DCTELEM *block, int n,
-                            int qscale)
+                            int qscale, int *overflow)
 {
-    int i, level, last_non_zero_p1, q;
-    const UINT16 *qmat;
+    int level=0, last_non_zero_p1, q; // level=0 only silences gcc's "uninitialized" warning
+    const UINT16 *qmat, *bias;
     static __align8 INT16 temp_block[64];
-    int minLevel, maxLevel;
-    
-    if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){
-	/* mpeg4 */
-        minLevel= -2048;
-	maxLevel= 2047;
-    }else if(s->out_format==FMT_MPEG1){
-	/* mpeg1 */
-        minLevel= -255;
-	maxLevel= 255;
-    }else if(s->out_format==FMT_MJPEG){
-	/* (m)jpeg */
-        minLevel= -1023;
-	maxLevel= 1023;
-    }else{
-	/* h263 / msmpeg4 */
-        minLevel= -128;
-	maxLevel= 127;
-    }
 
     av_fdct (block);
-    
+
     if (s->mb_intra) {
         int dummy;
         if (n < 4)
             q = s->y_dc_scale;
         else
             q = s->c_dc_scale;
-        
         /* note: block[0] is assumed to be positive */
+        if (!s->h263_aic) {
 #if 1
-	asm volatile (
-		"xorl %%edx, %%edx	\n\t"
-		"mul %%ecx		\n\t"
-		: "=d" (temp_block[0]), "=a"(dummy)
-		: "a" (block[0] + (q >> 1)), "c" (inverse[q])
-	);
+        asm volatile (
+        	"xorl %%edx, %%edx	\n\t"
+        	"mul %%ecx		\n\t"
+        	: "=d" (level), "=a"(dummy)
+        	: "a" (block[0] + (q >> 1)), "c" (inverse[q])
+        );
 #else
-	asm volatile (
-		"xorl %%edx, %%edx	\n\t"
-		"divw %%cx		\n\t"
-		"movzwl %%ax, %%eax	\n\t"
-		: "=a" (temp_block[0])
-		: "a" (block[0] + (q >> 1)), "c" (q)
-		: "%edx"
-	);
+        asm volatile (
+        	"xorl %%edx, %%edx	\n\t"
+        	"divw %%cx		\n\t"
+        	"movzwl %%ax, %%eax	\n\t"
+        	: "=a" (level)
+        	: "a" (block[0] + (q >> 1)), "c" (q)
+        	: "%edx"
+        );
 #endif
+        } else
+            /* For AIC we skip quant/dequant of INTRADC */
+            level = block[0];
+            
+        block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
-        i = 1;
         last_non_zero_p1 = 1;
-        if (s->out_format == FMT_H263) {
-            qmat = s->q_non_intra_matrix16;
-        } else {
-            qmat = s->q_intra_matrix16;
-        }
-        for(i=1;i<4;i++) {
-            level = block[i] * qmat[i];
-            level = level / (1 << (QMAT_SHIFT_MMX - 3));
-            /* XXX: currently, this code is not optimal. the range should be:
-               mpeg1: -255..255
-               mpeg2: -2048..2047
-               h263:  -128..127
-               mpeg4: -2048..2047
-            */
-            if (level > maxLevel)
-                level = maxLevel;
-            else if (level < minLevel)
-                level = minLevel;
-            temp_block[i] = level;
-
-	    if(level) 
-	        if(last_non_zero_p1 < inv_zigzag_direct16[i]) last_non_zero_p1= inv_zigzag_direct16[i];
-	    block[i]=0;
-        }
+        bias = s->q_intra_matrix16_bias[qscale];
+        qmat = s->q_intra_matrix16[qscale];
     } else {
-        i = 0;
         last_non_zero_p1 = 0;
-        qmat = s->q_non_intra_matrix16;
+        bias = s->q_inter_matrix16_bias[qscale];
+        qmat = s->q_inter_matrix16[qscale];
     }
 
-    asm volatile( /* XXX: small rounding bug, but it shouldnt matter */
-	"movd %3, %%mm3			\n\t"
-	SPREADW(%%mm3)
-	"movd %4, %%mm4			\n\t"
-	SPREADW(%%mm4)
-#ifndef HAVE_MMX2	
-	"movd %5, %%mm5			\n\t"
-	SPREADW(%%mm5)
-#endif
-	"pxor %%mm7, %%mm7		\n\t"
-	"movd %%eax, %%mm2		\n\t"
-	SPREADW(%%mm2)
-	"movl %6, %%eax			\n\t"
-	".balign 16			\n\t"
-	"1:				\n\t"
-	"movq (%1, %%eax), %%mm0	\n\t"
-	"movq (%2, %%eax), %%mm1	\n\t"
-	"movq %%mm0, %%mm6		\n\t"
-	"psraw $15, %%mm6		\n\t"
-	"pmulhw %%mm0, %%mm1		\n\t"
-	"psubsw %%mm6, %%mm1		\n\t"
-#ifdef HAVE_MMX2
-	"pminsw %%mm3, %%mm1		\n\t"
-	"pmaxsw %%mm4, %%mm1		\n\t"
-#else
-	"paddsw %%mm3, %%mm1		\n\t"
-	"psubusw %%mm4, %%mm1		\n\t"
-	"paddsw %%mm5, %%mm1		\n\t"
-#endif
-	"movq %%mm1, (%8, %%eax)	\n\t"
-	"pcmpeqw %%mm7, %%mm1		\n\t"
-	"movq (%7, %%eax), %%mm0	\n\t"
-	"movq %%mm7, (%1, %%eax)	\n\t"
-	"pandn %%mm0, %%mm1		\n\t"
-	PMAXW(%%mm1, %%mm2)
-	"addl $8, %%eax			\n\t"
-	" js 1b				\n\t"
-	"movq %%mm2, %%mm0		\n\t"
-	"psrlq $32, %%mm2		\n\t"
-	PMAXW(%%mm0, %%mm2)
-	"movq %%mm2, %%mm0		\n\t"
-	"psrlq $16, %%mm2		\n\t"
-	PMAXW(%%mm0, %%mm2)
-	"movd %%mm2, %%eax		\n\t"
-	"movzbl %%al, %%eax		\n\t"
-	: "+a" (last_non_zero_p1)
-	: "r" (block+64), "r" (qmat+64), 
-#ifdef HAVE_MMX2
-	  "m" (maxLevel),          "m" (minLevel),                    "m" (minLevel /* dummy */), "g" (2*i - 128),
-#else
-	  "m" (0x7FFF - maxLevel), "m" (0x7FFF -maxLevel + minLevel), "m" (minLevel),             "g" (2*i - 128),
-#endif
-	  "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
-    );
+    if(s->out_format == FMT_H263){
+    
+        asm volatile(
+            "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
+            SPREADW(%%mm3)
+            "pxor %%mm7, %%mm7			\n\t" // 0
+            "pxor %%mm4, %%mm4			\n\t" // 0
+            "movq (%2), %%mm5			\n\t" // qmat[0]
+            "pxor %%mm6, %%mm6			\n\t"
+            "psubw (%3), %%mm6			\n\t" // -bias[0]
+            "movl $-128, %%eax			\n\t"
+            ".balign 16				\n\t"
+            "1:					\n\t"
+            "pxor %%mm1, %%mm1			\n\t" // 0
+            "movq (%1, %%eax), %%mm0		\n\t" // block[i]
+            "pcmpgtw %%mm0, %%mm1		\n\t" // block[i] < 0 ? 0xFFFF : 0x0000
+            "pxor %%mm1, %%mm0			\n\t" 
+            "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
+            "psubusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[0]
+            "pmulhw %%mm5, %%mm0		\n\t" // ((ABS(block[i]) + bias[0])*qmat[0])>>16
+            "por %%mm0, %%mm4			\n\t" 
+            "pxor %%mm1, %%mm0			\n\t" 
+            "psubw %%mm1, %%mm0			\n\t" // out=(((ABS(block[i]) + bias[0])*qmat[0])>>16)*sign(block[i])
+            "movq %%mm0, (%5, %%eax)		\n\t"
+            "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
+            "movq (%4, %%eax), %%mm1		\n\t" 
+            "movq %%mm7, (%1, %%eax)		\n\t" // 0
+            "pandn %%mm1, %%mm0			\n\t"
+	    PMAXW(%%mm0, %%mm3)
+            "addl $8, %%eax			\n\t"
+            " js 1b				\n\t"
+            "movq %%mm3, %%mm0			\n\t"
+            "psrlq $32, %%mm3			\n\t"
+	    PMAXW(%%mm0, %%mm3)
+            "movq %%mm3, %%mm0			\n\t"
+            "psrlq $16, %%mm3			\n\t"
+	    PMAXW(%%mm0, %%mm3)
+            "movd %%mm3, %%eax			\n\t"
+            "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
+	    : "+a" (last_non_zero_p1)
+            : "r" (block+64), "r" (qmat), "r" (bias),
+              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+        );
+        // note: the asm is split because gcc does not accept that many operands
+        asm volatile(
+            "movd %1, %%mm1			\n\t" // max_qcoeff
+	    SPREADW(%%mm1)
+            "psubusw %%mm1, %%mm4		\n\t" 
+            "packuswb %%mm4, %%mm4		\n\t"
+            "movd %%mm4, %0			\n\t" // *overflow
+        : "=g" (*overflow)
+        : "g" (s->max_qcoeff)
+        );
+    }else{ // out_format != FMT_H263: per-coefficient qmat[i] / bias[i]
+        asm volatile(
+            "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
+            SPREADW(%%mm3)
+            "pxor %%mm7, %%mm7			\n\t" // 0
+            "pxor %%mm4, %%mm4			\n\t" // 0
+            "movl $-128, %%eax			\n\t"
+            ".balign 16				\n\t"
+            "1:					\n\t"
+            "pxor %%mm1, %%mm1			\n\t" // 0
+            "movq (%1, %%eax), %%mm0		\n\t" // block[i]
+            "pcmpgtw %%mm0, %%mm1		\n\t" // block[i] < 0 ? 0xFFFF : 0x0000
+            "pxor %%mm1, %%mm0			\n\t" 
+            "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
+            "movq (%3, %%eax), %%mm6		\n\t" // bias[i]
+            "paddusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[i]
+            "movq (%2, %%eax), %%mm5		\n\t" // qmat[i]
+            "pmulhw %%mm5, %%mm0		\n\t" // ((ABS(block[i]) + bias[i])*qmat[i])>>16
+            "por %%mm0, %%mm4			\n\t" 
+            "pxor %%mm1, %%mm0			\n\t" 
+            "psubw %%mm1, %%mm0			\n\t" // out=(((ABS(block[i]) + bias[i])*qmat[i])>>16)*sign(block[i])
+            "movq %%mm0, (%5, %%eax)		\n\t"
+            "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
+            "movq (%4, %%eax), %%mm1		\n\t" 
+            "movq %%mm7, (%1, %%eax)		\n\t" // 0
+            "pandn %%mm1, %%mm0			\n\t"
+	    PMAXW(%%mm0, %%mm3)
+            "addl $8, %%eax			\n\t"
+            " js 1b				\n\t"
+            "movq %%mm3, %%mm0			\n\t"
+            "psrlq $32, %%mm3			\n\t"
+	    PMAXW(%%mm0, %%mm3)
+            "movq %%mm3, %%mm0			\n\t"
+            "psrlq $16, %%mm3			\n\t"
+	    PMAXW(%%mm0, %%mm3)
+            "movd %%mm3, %%eax			\n\t"
+            "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
+	    : "+a" (last_non_zero_p1)
+            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+        );
+        // note: the asm is split because gcc does not accept that many operands
+        asm volatile(
+            "movd %1, %%mm1			\n\t" // max_qcoeff
+	    SPREADW(%%mm1)
+            "psubusw %%mm1, %%mm4		\n\t" 
+            "packuswb %%mm4, %%mm4		\n\t"
+            "movd %%mm4, %0			\n\t" // *overflow
+        : "=g" (*overflow)
+        : "g" (s->max_qcoeff)
+        );
+    }
+
+    if(s->mb_intra) temp_block[0]= level; //FIXME move after permute
 // last_non_zero_p1=64;       
     /* permute for IDCT */
     asm volatile(
-	"movl %0, %%eax			\n\t"
+        "movl %0, %%eax			\n\t"
 	"pushl %%ebp			\n\t"
 	"movl %%esp, " MANGLE(esp_temp) "\n\t"
 	"1:				\n\t"
@@ -203,5 +220,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     }
 */
 //block_permute(block);
+
     return last_non_zero_p1 - 1;
 }
diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
index 297f23724..4f19cc20a 100644
--- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
@@ -1,29 +1,43 @@
 /*
-    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
-#include <inttypes.h>
+ * Simple IDCT MMX
+ *
+ * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
 #include "../dsputil.h"
 
+/*
+23170.475006
+22725.260826
+21406.727617
+19265.545870
+16384.000000
+12872.826198
+8866.956905
+4520.335430
+*/
 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#if 0
 #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#else
+#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+#endif
 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
@@ -31,8 +45,8 @@
 #define ROW_SHIFT 11
 #define COL_SHIFT 20 // 6
 
-static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
-static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
+static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
+static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
 static int16_t __attribute__((aligned(8))) temp[64];
 static int16_t __attribute__((aligned(8))) coeffs[]= {
 	1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
@@ -43,27 +57,31 @@ static int16_t __attribute__((aligned(8))) coeffs[]= {
 //	0, 0, 0, 0,
 //	0, 0, 0, 0,
 
-	 C4,  C2,  C4,  C2,
-	 C4,  C6,  C4,  C6,
-	 C1,  C3,  C1,  C3,
-	 C5,  C7,  C5,  C7,
+ C4,  C4,  C4,  C4,
+ C4, -C4,  C4, -C4,
+ 
+ C2,  C6,  C2,  C6,
+ C6, -C2,  C6, -C2,
+ 
+ C1,  C3,  C1,  C3,
+ C5,  C7,  C5,  C7,
+ 
+ C3, -C7,  C3, -C7,
+-C1, -C5, -C1, -C5,
+ 
+ C5, -C1,  C5, -C1,
+ C7,  C3,  C7,  C3,
+ 
+ C7, -C5,  C7, -C5,
+ C3, -C1,  C3, -C1
+};
 
-	 C4,  C6,  C4,  C6,
-	-C4, -C2, -C4, -C2,
-	 C3, -C7,  C3, -C7,
-	-C1, -C5, -C1, -C5,
-
-	 C4, -C6,  C4, -C6,
-	-C4,  C2, -C4,  C2,
-	 C5, -C1,  C5, -C1,
-	 C7,  C3,  C7,  C3,
-
-	 C4, -C2,  C4, -C2,
-	 C4, -C6,  C4, -C6,
-	 C7, -C5,  C7, -C5,
-	 C3, -C1,  C3, -C1
-	};
 #if 0
+static void unused_var_killer(){
+	int a= wm1010 + d40000;
+	temp[0]=a;
+}
+
 static void inline idctCol (int16_t * col, int16_t *input)
 {
 #undef C0
@@ -79,7 +97,7 @@ static void inline idctCol (int16_t * col, int16_t *input)
 	const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-	const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+	const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
 	const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
@@ -128,7 +146,7 @@ static void inline idctRow (int16_t * output, int16_t * input)
 	const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-	const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+	const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
 	const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 	const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
@@ -188,110 +206,160 @@ row[7] = input[13];
 
 static inline void idct(int16_t *block)
 {
-	int i;
-//for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ];
-//for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ];
-//for(i=0; i<64; i++) block[i]= temp[i];
-//block_permute(block);
-/*
-idctRow(temp, block);
-idctRow(temp+16, block+16);
-idctRow(temp+1, block+2);
-idctRow(temp+17, block+18);
-idctRow(temp+32, block+32);
-idctRow(temp+48, block+48);
-idctRow(temp+33, block+34);
-idctRow(temp+49, block+50);
-*/
-
 	asm volatile(
-//		"lea 64(%0), %%eax		\n\t"
-//r0,r2,R0,R2	r4,r6,R4,R6	r1,r3,R1,R3	r5,r7,R5,R7
-//src0		src4		src1		src5
-//r0,R0,r7,R7	r1,R1,r6,R6	r2,R2,r5,R5	r3,R3,r4,R4
-//dst0		dst1		dst2		dst3
 #if 0 //Alternative, simpler variant
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
+
+#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
+	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm1, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm1			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm1			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq " #src1 ", %%mm0			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm0			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm0, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"\
+
+	
+#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq wm1010, %%mm4			\n\t"\
+	"movq "MANGLE(wm1010)", %%mm4		\n\t"\
 	"pand %%mm0, %%mm4			\n\t"\
 	"por %%mm1, %%mm4			\n\t"\
 	"por %%mm2, %%mm4			\n\t"\
@@ -300,234 +368,106 @@ idctRow(temp+49, block+50);
 	"movd %%mm4, %%eax			\n\t"\
 	"orl %%eax, %%eax			\n\t"\
 	"jz 1f					\n\t"\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* A2-B2		a2-b2 */\
 	"psrad $" #shift ", %%mm2		\n\t"\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
 	"jmp 2f					\n\t"\
 	"1:					\n\t"\
-	WRITE3(%%mm0, dst)\
-	"2:					\n\t"\
-
-
-#define WRITE0(s0, s7, dst)\
-	"movq " #s0 ", " #dst "			\n\t" /* R0		r0 */\
-	"movq " #s7 ", 24+" #dst "		\n\t" /* R7		r7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-	"movq " #dst ", " #tmp "		\n\t" /* R0		r0 */\
-	"packssdw " #s1 ", " #tmp "		\n\t" /* R1	r1	R0	r0*/\
-	"movq " #tmp ", " #dst "		\n\t"\
-	"movq 24+" #dst ", " #tmp "		\n\t" /* R7		r7 */\
-	"packssdw " #tmp ", " #s6 "		\n\t" /* R7	r7	R6	r6*/\
-	"movq " #s6 ", 24+" #dst "		\n\t"
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-	"packssdw " #s3 ", " #s2 "		\n\t" /* R3	r3	R2	r2*/\
-	"packssdw " #s5 ", " #s4 "		\n\t" /* R5	r5	R4	r4*/\
-	"movq " #s2 ", 8+" #dst "		\n\t"\
-	"movq " #s4 ", 16+" #dst "		\n\t"
-
-#define WRITE3(a, dst)\
-	"pslld $16, " #a "			\n\t"\
-	"psrad $13, " #a "			\n\t"\
-	"packssdw " #a ", " #a "		\n\t"\
-	"movq " #a ", " #dst "			\n\t"\
-	"movq " #a ", 8+" #dst "		\n\t"\
-	"movq " #a ", 16+" #dst "		\n\t"\
-	"movq " #a ", 24+" #dst "		\n\t"\
-
-//IDCT_CORE(          src0,   src4,   src1,   src5,    dst,   rounder, shift)
-IDCT_CORE(            (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-/*
-DC_COND_IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-DC_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-DC_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-*/
-IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
+	"pslld $16, %%mm0			\n\t"\
+	"#paddd "MANGLE(d40000)", %%mm0		\n\t"\
+	"psrad $13, %%mm0			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t"\
+	"movq %%mm0, " #dst "			\n\t"\
+	"movq %%mm0, 8+" #dst "			\n\t"\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 24+" #dst "		\n\t"\
+	"2:					\n\t"
 
-#undef WRITE0
-#undef WRITE1
-#undef WRITE2
 
-#define WRITE0(s0, s7, dst)\
-	"packssdw " #s0 ", " #s0 "		\n\t" /* C0, c0, C0, c0 */\
-	"packssdw " #s7 ", " #s7 "		\n\t" /* C7, c7, C7, c7 */\
-	"movd " #s0 ", " #dst "			\n\t" /* C0, c0 */\
-	"movd " #s7 ", 112+" #dst "		\n\t" /* C7, c7 */
+//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
+ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
+/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
 
-#define WRITE1(s1, s6, dst, tmp)\
-	"packssdw " #s1 ", " #s1 "		\n\t" /* C1, c1, C1, c1 */\
-	"packssdw " #s6 ", " #s6 "		\n\t" /* C6, c6, C6, c6 */\
-	"movd " #s1 ", 16+" #dst "		\n\t" /* C1, c1 */\
-	"movd " #s6 ", 96+" #dst "		\n\t" /* C6, c6 */
+DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
 
-#define WRITE2(s2, s5, s3, s4, dst)\
-	"packssdw " #s2 ", " #s2 "		\n\t" /* C2, c2, C2, c2 */\
-	"packssdw " #s3 ", " #s3 "		\n\t" /* C3, c3, C3, c3 */\
-	"movd " #s2 ", 32+" #dst "		\n\t" /* C2, c2 */\
-	"movd " #s3 ", 48+" #dst "		\n\t" /* C3, c3 */\
-	"packssdw " #s4 ", " #s4 "		\n\t" /* C4, c4, C4, c4 */\
-	"packssdw " #s5 ", " #s5 "		\n\t" /* C5, c5, C5, c5 */\
-	"movd " #s4 ", 64+" #dst "		\n\t" /* C4, c4 */\
-	"movd " #s5 ", 80+" #dst "		\n\t" /* C5, c5 */\
 
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+//IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
+COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
 #else
 
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
-	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq wm1010, %%mm4			\n\t"\
+	"movq "MANGLE(wm1010)", %%mm4		\n\t"\
 	"pand %%mm0, %%mm4			\n\t"\
 	"por %%mm1, %%mm4			\n\t"\
 	"por %%mm2, %%mm4			\n\t"\
@@ -536,920 +476,822 @@ IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"movd %%mm4, %%eax			\n\t"\
 	"orl %%eax, %%eax			\n\t"\
 	"jz 1f					\n\t"\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* A2-B2		a2-b2 */\
 	"psrad $" #shift ", %%mm2		\n\t"\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
 	"jmp 2f					\n\t"\
-	"#.balign 16				\n\t"\
 	"1:					\n\t"\
-	WRITE3(%%mm0, dst)\
-	"2:					\n\t"\
+	"pslld $16, %%mm0			\n\t"\
+	"paddd "MANGLE(d40000)", %%mm0		\n\t"\
+	"psrad $13, %%mm0			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t"\
+	"movq %%mm0, " #dst "			\n\t"\
+	"movq %%mm0, 8+" #dst "			\n\t"\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 24+" #dst "		\n\t"\
+	"2:					\n\t"
 
-#define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
 	"movq %%mm0, %%mm4			\n\t"\
 	"por %%mm1, %%mm4			\n\t"\
 	"por %%mm2, %%mm4			\n\t"\
 	"por %%mm3, %%mm4			\n\t"\
-	"packssdw %%mm4, %%mm4			\n\t"\
+	"packssdw %%mm4,%%mm4			\n\t"\
 	"movd %%mm4, %%eax			\n\t"\
 	"orl %%eax, %%eax			\n\t"\
 	"jz " #bt "				\n\t"\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	#rounder ", %%mm4			\n\t"\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-	#rounder ", %%mm4			\n\t"\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	#rounder ", %%mm0			\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* A2-B2		a2-b2 */\
 	"psrad $" #shift ", %%mm2		\n\t"\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
-
-
-#define WRITE0(s0, s7, dst)\
-	"movq " #s0 ", " #dst "			\n\t" /* R0		r0 */\
-	"movq " #s7 ", 24+" #dst "		\n\t" /* R7		r7 */
-
-#define WRITE1(s1, s6, dst, tmp)\
-	"movq " #dst ", " #tmp "		\n\t" /* R0		r0 */\
-	"packssdw " #s1 ", " #tmp "		\n\t" /* R1	r1	R0	r0*/\
-	"movq " #tmp ", " #dst "		\n\t"\
-	"movq 24+" #dst ", " #tmp "		\n\t" /* R7		r7 */\
-	"packssdw " #tmp ", " #s6 "		\n\t" /* R7	r7	R6	r6*/\
-	"movq " #s6 ", 24+" #dst "		\n\t"
-
-#define WRITE2(s2, s5, s3, s4, dst)\
-	"packssdw " #s3 ", " #s2 "		\n\t" /* R3	r3	R2	r2*/\
-	"packssdw " #s5 ", " #s4 "		\n\t" /* R5	r5	R4	r4*/\
-	"movq " #s2 ", 8+" #dst "		\n\t"\
-	"movq " #s4 ", 16+" #dst "		\n\t"
-
-#define WRITE3(a, dst)\
-	"pslld $16, " #a "			\n\t"\
-	"paddd d40000, " #a "			\n\t"\
-	"psrad $13, " #a "			\n\t"\
-	"packssdw " #a ", " #a "		\n\t"\
-	"movq " #a ", " #dst "			\n\t"\
-	"movq " #a ", 8+" #dst "		\n\t"\
-	"movq " #a ", 16+" #dst "		\n\t"\
-	"movq " #a ", 24+" #dst "		\n\t"\
-
-#define WRITE0b(s0, s7, dst)\
-	"packssdw " #s0 ", " #s0 "		\n\t" /* C0, c0, C0, c0 */\
-	"packssdw " #s7 ", " #s7 "		\n\t" /* C7, c7, C7, c7 */\
-	"movd " #s0 ", " #dst "			\n\t" /* C0, c0 */\
-	"movd " #s7 ", 112+" #dst "		\n\t" /* C7, c7 */
-
-#define WRITE1b(s1, s6, dst, tmp)\
-	"packssdw " #s1 ", " #s1 "		\n\t" /* C1, c1, C1, c1 */\
-	"packssdw " #s6 ", " #s6 "		\n\t" /* C6, c6, C6, c6 */\
-	"movd " #s1 ", 16+" #dst "		\n\t" /* C1, c1 */\
-	"movd " #s6 ", 96+" #dst "		\n\t" /* C6, c6 */
-
-#define WRITE2b(s2, s5, s3, s4, dst)\
-	"packssdw " #s2 ", " #s2 "		\n\t" /* C2, c2, C2, c2 */\
-	"packssdw " #s3 ", " #s3 "		\n\t" /* C3, c3, C3, c3 */\
-	"movd " #s2 ", 32+" #dst "		\n\t" /* C2, c2 */\
-	"movd " #s3 ", 48+" #dst "		\n\t" /* C3, c3 */\
-	"packssdw " #s4 ", " #s4 "		\n\t" /* C4, c4, C4, c4 */\
-	"packssdw " #s5 ", " #s5 "		\n\t" /* C5, c5, C5, c5 */\
-	"movd " #s4 ", 64+" #dst "		\n\t" /* C4, c4 */\
-	"movd " #s5 ", 80+" #dst "		\n\t" /* C5, c5 */\
-
-
-//IDCT_CORE(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
-DC_COND_IDCT_CORE(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
-Z_COND_IDCT_CORE(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
-Z_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq 56(%2), %%mm5			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	#rounder ", %%mm0			\n\t"\
+	"paddd %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm0, %%mm0			\n\t" \
+	"psubd %%mm1, %%mm0			\n\t" /* A2		a2 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm5, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm5			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm5			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm5			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
+	"movq %%mm1, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm5, %%mm1			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm5, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm7			\n\t" /* A1+B1	a1+b1	A0+B0	a0+b0 */\
+	"packssdw %%mm4, %%mm2			\n\t" /* A0-B0	a0-b0	A1-B1	a1-b1 */\
+	"movq %%mm7, " #dst "			\n\t"\
+	"movq " #src1 ", %%mm1			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"movq %%mm2, 24+" #dst "		\n\t"\
+	"pmaddwd %%mm1, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm1			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm0, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm0			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm1, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm6, %%mm2			\n\t" /* A3+B3	a3+b3	A2+B2	a2+b2 */\
+	"movq %%mm2, 8+" #dst "			\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"packssdw %%mm0, %%mm4			\n\t" /* A2-B2	a2-b2	A3-B3	a3-b3 */\
+	"movq %%mm4, 16+" #dst "		\n\t"\
+
+//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
+DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
+Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
+	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm1, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm1			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm1			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
+	"psrad $" #shift ", %%mm4		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq " #src1 ", %%mm0			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm0			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm0, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"4:					\n\t"
-Z_COND_IDCT_CORE(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
+Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	#rounder ", %%mm0			\n\t"\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm7, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
 	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1b(%%mm7, %%mm4, dst, %%mm6) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
+	"paddd %%mm4, %%mm1			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm1, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	"paddd %%mm0, %%mm3			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm3, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm3		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm7, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm7, %%mm2			\n\t" /* A1-B1		a1-b1 */\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm1, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 88(%2), %%mm1			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm1, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm1			\n\t" /* A3		a3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm1			\n\t" /* A3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm1, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"	
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"6:					\n\t"
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm7, %%mm4, dst) \
-\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	#rounder ", %%mm0			\n\t"\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
 	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-\
-	"movq 80(%2), %%mm4			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm5, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm5, %%mm5			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm5			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	WRITE1b(%%mm7, %%mm5, dst, %%mm6) \
-\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm7			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm7, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
+	"paddd %%mm4, %%mm1			\n\t" /* A0+B0		a0+b0 */\
+	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
+	"psubd %%mm1, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm0, %%mm3			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm3, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm3		\n\t"\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm7, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm7, %%mm2			\n\t" /* A1-B1		a1-b1 */\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm1, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 88(%2), %%mm1			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm1, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm1			\n\t" /* A3		a3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm1			\n\t" /* A3-B3		a3-b3 */\
+	"psrad $" #shift ", %%mm6		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm1, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"	
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"2:					\n\t"
-Z_COND_IDCT_CORE(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
+Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
 
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
 	"movq " #src5 ", %%mm3			\n\t" /* R7	R5	r7	r5 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-	"movq 40(%2), %%mm7			\n\t" /* C7	C5	C7	C5 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B0		b0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 56(%2), %%mm1			\n\t" /* C7	C5	C7	C5 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* C7R7+C5R5	C7r7+C5r5 */\
+	"pmaddwd 64(%2), %%mm2			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm1, %%mm7			\n\t" /* B0		b0 */\
+	"movq 72(%2), %%mm1			\n\t" /* -C5	-C1	-C5	-C1 */\
+	"pmaddwd %%mm3, %%mm1			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"paddd %%mm2, %%mm1			\n\t" /* B1		b1 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"movq 72(%2), %%mm7			\n\t" /* -C5	-C1	-C5	-C1 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /* -C5R7-C1R5	-C5r7-C1r5 */\
-\
-	"movq 80(%2), %%mm4			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm5, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm5, %%mm5			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm5			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
-\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"movq 104(%2), %%mm7			\n\t" /* C3	C7	C3	C7 */\
+	"movq %%mm0, %%mm2			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm2			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm2, %%mm2			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm2, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq " #src1 ", %%mm0			\n\t" /* R3	R1	r3	r1 */\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"movq 88(%2), %%mm7			\n\t" /* C3	C7	C3 	C7 */\
+	"pmaddwd 96(%2), %%mm0			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
 	"pmaddwd %%mm3, %%mm7			\n\t" /* C3R7+C7R5	C3r7+C7r5 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm7, %%mm6			\n\t" /* B1		b1 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
-	"pmaddwd 136(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"movq %%mm5, %%mm2			\n\t" /* A2		a2 */\
+	"pmaddwd 104(%2), %%mm3			\n\t" /* -C1R7+C3R5	-C1r7+C3r5 */\
+	"paddd %%mm7, %%mm4			\n\t" /* B2		b2 */\
+	"paddd %%mm4, %%mm2			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm0, %%mm3			\n\t" /* B3		b3 */\
+	"paddd %%mm3, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm3, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm3, %%mm2			\n\t" /* B3		b3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"packssdw %%mm2, %%mm2			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm2, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"3:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 64(%2), %%mm3			\n\t" /* -C7	C3	-C7	C3 */\
+	"pmaddwd %%mm2, %%mm3			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-\
-	"movq 80(%2), %%mm4			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm5, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm5, %%mm5			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm5			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"movq %%mm0, %%mm1			\n\t" /* A1		a1 */\
+	"paddd %%mm3, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm3, %%mm1			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm1, %%mm1			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm1, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm2, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"pmaddwd 96(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"movq %%mm5, %%mm1			\n\t" /* A2		a2 */\
+	"paddd %%mm4, %%mm1			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm1		\n\t"\
 	"psrad $" #shift ", %%mm5		\n\t"\
-	WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
-\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm2, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm2, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"packssdw %%mm1, %%mm1			\n\t" /* A2+B2	a2+b2 */\
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm1, 32+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
+
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 	"#.balign 16				\n\t"\
 	"5:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"movq %%mm4, %%mm6\n\t"\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"movq %%mm5, %%mm7\n\t"\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 8+" #src0 ", %%mm2		\n\t" /*2R2	R0	r2	r0 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /*2C2R2+C4R0	C2r2+C4r0 */\
-	"movq 8+" #src4 ", %%mm3		\n\t" /*2R6	R4	r6	r4 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /*2C6R6+C4R4	C6r6+C4r4 */\
-\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
+	#rounder ", %%mm0			\n\t"\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 8+" #src0 ", %%mm2		\n\t" /* R4	R0	r4	r0 */\
+	"movq 8+" #src4 ", %%mm3		\n\t" /* R6	R2	r6	r2 */\
+	"movq 16(%2), %%mm1			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm2, %%mm1			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm7			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm7, %%mm2			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm7			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm3, %%mm7			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"pmaddwd 40(%2), %%mm3			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm1			\n\t"\
+	"paddd %%mm1, %%mm7			\n\t" /* A0		a0 */\
+	"paddd %%mm1, %%mm1			\n\t" /* 2C0		2c0 */\
+	#rounder ", %%mm2			\n\t"\
+	"psubd %%mm7, %%mm1			\n\t" /* A3		a3 */\
+	"paddd %%mm2, %%mm3			\n\t" /* A1		a1 */\
+	"paddd %%mm2, %%mm2			\n\t" /* 2C1		2c1 */\
+	"psubd %%mm3, %%mm2			\n\t" /* A2		a2 */\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-\
-	"paddd %%mm7, %%mm6			\n\t" /*2A0		a0 */\
-	"movq 56(%2), %%mm7			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"pmaddwd %%mm1, %%mm7			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-\
-	"packssdw %%mm6, %%mm4			\n\t" /* C0, c0, C0, c0 */\
-	"movq 48(%2), %%mm6			\n\t" /* C6	C4	C6	C4 */\
-	"movq %%mm4, " #dst "			\n\t" /* C0, c0 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /*2C6R2+C4R0	C6r2+C4r0 */\
-\
-	"movq %%mm4, 112+" #dst "		\n\t" /* C0, c0 */\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm3, %%mm4			\n\t" /*2-C2R6-C4R4	-C2r6-C4r4 */\
-\
-	"paddd %%mm5, %%mm7			\n\t" /* A1		a1 */\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
 	"psrad $" #shift ", %%mm7		\n\t"\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-\
-	"paddd %%mm4, %%mm6			\n\t" /*2A1		a1 */\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"packssdw %%mm6, %%mm7			\n\t" /* C1, c1, C1, c1 */\
-\
-	"movq 80(%2), %%mm6			\n\t" /* -C6	C4	-C6	C4 */\
-	"movq %%mm7, 16+" #dst "		\n\t" /* C1, c1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /*2-C6R2+C4R0	-C6r2+C4r0 */\
-\
-	"movq %%mm7, 96+" #dst "		\n\t" /* C1, c1 */\
-	"movq 88(%2), %%mm7			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm3, %%mm7			\n\t" /*2C2R6-C4R4	C2r6-C4r4 */\
-\
-	"pmaddwd 112(%2), %%mm2			\n\t" /*2-C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 120(%2), %%mm3			\n\t" /*2-C6R6+C4R4	-C6r6+C4r4 */\
-	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm7, %%mm6			\n\t" /*2A2		a2 */\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-\
-	"psrad $" #shift ", %%mm6		\n\t"\
-\
-	"packssdw %%mm6, %%mm4			\n\t" /* C2, c2, C2, c2 */\
-	"movq %%mm4, 32+" #dst "		\n\t" /* C2, c2 */\
+	"psrad $" #shift ", %%mm3		\n\t"\
+	"packssdw %%mm7, %%mm4			\n\t" /* A0	a0 */\
+	"movq %%mm4, " #dst "			\n\t"\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	"paddd %%mm3, %%mm2			\n\t" /*2A3		a3 */\
-\
-	"movq %%mm4, 80+" #dst "		\n\t" /* C2, c2 */\
+	"packssdw %%mm3, %%mm0			\n\t" /* A1	a1 */\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 96+" #dst "		\n\t"\
+	"movq %%mm4, 112+" #dst "		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"psrad $" #shift ", %%mm6		\n\t"\
 	"psrad $" #shift ", %%mm2		\n\t"\
-\
-	"packssdw %%mm2, %%mm0			\n\t" /* C3, c3, C3, c3 */\
-	"movq %%mm0, 48+" #dst "		\n\t" /* C3, c3 */\
-	"movq %%mm0, 64+" #dst "		\n\t" /* C3, c3 */\
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-//IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-//IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"packssdw %%mm2, %%mm5			\n\t" /* A2	a2 */\
+	"movq %%mm5, 32+" #dst "		\n\t"\
+	"psrad $" #shift ", %%mm1		\n\t"\
+	"packssdw %%mm1, %%mm6			\n\t" /* A3	a3 */\
+	"movq %%mm6, 48+" #dst "		\n\t"\
+	"movq %%mm6, 64+" #dst "		\n\t"\
+	"movq %%mm5, 80+" #dst "		\n\t"	
+	
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 
 	"#.balign 16				\n\t"\
 	"1:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq " #src4 ", %%mm1			\n\t" /* R6	R4	r6	r4 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq " #src4 ", %%mm1			\n\t" /* R6	R2	r6	r2 */\
 	"movq " #src1 ", %%mm2			\n\t" /* R3	R1	r3	r1 */\
-	"movq 16(%2), %%mm4			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 24(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C4R4	C6r6+C4r4 */\
-	"movq 32(%2), %%mm6			\n\t" /* C3	C1	C3	C1 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm5			\n\t" /* C6	C2	C6	C2 */\
+	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R6+C2R2	C6r6+C2r2 */\
+	"movq 40(%2), %%mm6			\n\t" /* -C2	C6	-C2	C6 */\
+	"pmaddwd %%mm6, %%mm1			\n\t" /* -C2R6+C6R2	-C2r6+C6r2 */\
+	#rounder ", %%mm4			\n\t"\
+	"movq %%mm4, %%mm6			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 48(%2), %%mm7			\n\t" /* C3	C1	C3	C1 */\
+	#rounder ", %%mm0			\n\t"\
+	"pmaddwd %%mm2, %%mm7			\n\t" /* C3R3+C1R1	C3r3+C1r1 */\
 	"paddd %%mm5, %%mm4			\n\t" /* A0		a0 */\
-\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A0+B0		a0+b0 */\
+	"psubd %%mm5, %%mm6			\n\t" /* A3		a3 */\
+	"movq %%mm0, %%mm5			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1		a1 */\
+	"psubd %%mm1, %%mm5			\n\t" /* A2		a2 */\
+	"movq 64(%2), %%mm1			\n\t" /* -C7	C3	-C7	C3 */\
+	"pmaddwd %%mm2, %%mm1			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
+	"paddd %%mm4, %%mm7			\n\t" /* A0+B0		a0+b0 */\
 	"paddd %%mm4, %%mm4			\n\t" /* 2A0		2a0 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A0-B0		a0-b0 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE0b(%%mm6, %%mm4, dst) \
-\
-	"movq 56(%2), %%mm4			\n\t" /* -C2	-C4	-C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* -C2R6-C4R4	-C2r6-C4r4 */\
-	"movq 64(%2), %%mm6			\n\t" /* -C7	C3	-C7	C3 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C7R3+C3R1	-C7r3+C3r1 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A1		a1 */\
-\
-	"movq 80(%2), %%mm5			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm5			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"psrad $" #shift ", %%mm6		\n\t"\
+	"psubd %%mm7, %%mm4			\n\t" /* A0-B0		a0-b0 */\
+	"psrad $" #shift ", %%mm7		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
-\
-	"movq 88(%2), %%mm4			\n\t" /* C2	-C4	C2	-C4 */\
-	"pmaddwd %%mm1, %%mm4			\n\t" /* C2R6-C4R4	C2r6-C4r4 */\
-	"movq 96(%2), %%mm6			\n\t" /* -C1	C5	-C1	C5 */\
-	"pmaddwd %%mm2, %%mm6			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
-	"paddd %%mm5, %%mm4			\n\t" /* A2		a2 */\
-\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"paddd %%mm4, %%mm6			\n\t" /* A1+B1		a1+b1 */\
-	"pmaddwd 120(%2), %%mm1			\n\t" /* -C6R6+C4R4	-C6r6+C4r4 */\
-	"paddd %%mm4, %%mm4			\n\t" /* 2A1		2a1 */\
-	"psubd %%mm6, %%mm4			\n\t" /* A1-B1		a1-b1 */\
-	"pmaddwd 128(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"movq %%mm0, %%mm3			\n\t" /* A1		a1 */\
+	"paddd %%mm1, %%mm0			\n\t" /* A1+B1		a1+b1 */\
+	"psubd %%mm1, %%mm3			\n\t" /* A1-B1		a1-b1 */\
+	"psrad $" #shift ", %%mm0		\n\t"\
+	"psrad $" #shift ", %%mm3		\n\t"\
+	"packssdw %%mm7, %%mm7			\n\t" /* A0+B0	a0+b0 */\
+	"movd %%mm7, " #dst "			\n\t"\
+	"packssdw %%mm0, %%mm0			\n\t" /* A1+B1	a1+b1 */\
+	"movd %%mm0, 16+" #dst "		\n\t"\
+	"packssdw %%mm3, %%mm3			\n\t" /* A1-B1	a1-b1 */\
+	"movd %%mm3, 96+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A0-B0	a0-b0 */\
+	"movd %%mm4, 112+" #dst "		\n\t"\
+	"movq 80(%2), %%mm4			\n\t" /* -C1	C5	-C1 	C5 */\
+	"pmaddwd %%mm2, %%mm4			\n\t" /* -C1R3+C5R1	-C1r3+C5r1 */\
+	"pmaddwd 96(%2), %%mm2			\n\t" /* -C5R3+C7R1	-C5r3+C7r1 */\
+	"movq %%mm5, %%mm3			\n\t" /* A2		a2 */\
+	"paddd %%mm4, %%mm3			\n\t" /* A2+B2		a2+b2 */\
+	"psubd %%mm4, %%mm5			\n\t" /* A2-B2		a2-b2 */\
+	"psrad $" #shift ", %%mm3		\n\t"\
+	"psrad $" #shift ", %%mm5		\n\t"\
+	"movq %%mm6, %%mm4			\n\t" /* A3		a3 */\
+	"paddd %%mm2, %%mm6			\n\t" /* A3+B3		a3+b3 */\
+	"psubd %%mm2, %%mm4			\n\t" /* A3-B3		a3-b3 */\
 	"psrad $" #shift ", %%mm6		\n\t"\
+	"packssdw %%mm3, %%mm3			\n\t" /* A2+B2	a2+b2 */\
+	"movd %%mm3, 32+" #dst "		\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-\
-	"paddd %%mm1, %%mm0			\n\t" /* A3		a3 */\
-	"paddd %%mm0, %%mm2			\n\t" /* A3+B3		a3+b3 */\
-	"paddd %%mm0, %%mm0			\n\t" /* 2A3		2a3 */\
-	"psubd %%mm2, %%mm0			\n\t" /* A3-B3		a3-b3 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm0		\n\t"\
-	WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"packssdw %%mm6, %%mm6			\n\t" /* A3+B3	a3+b3 */\
+	"movd %%mm6, 48+" #dst "		\n\t"\
+	"packssdw %%mm4, %%mm4			\n\t" /* A3-B3	a3-b3 */\
+	"packssdw %%mm5, %%mm5			\n\t" /* A2-B2	a2-b2 */\
+	"movd %%mm4, 64+" #dst "		\n\t"\
+	"movd %%mm5, 80+" #dst "		\n\t"
+	
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 	"jmp 9f					\n\t"
 
 
 	"#.balign 16				\n\t"
 	"7:					\n\t"
-#undef IDCT_CORE
-#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
-	"movq " #src0 ", %%mm0			\n\t" /* R2	R0	r2	r0 */\
-	"movq 16(%2), %%mm2			\n\t" /* C2	C4	C2	C4 */\
-	"movq 8+" #src0 ", %%mm1		\n\t" /* R2	R0	r2	r0 */\
-	"pmaddwd %%mm0, %%mm2			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-	"movq 16(%2), %%mm3			\n\t" /* C2	C4	C2	C4 */\
-	"pmaddwd %%mm1, %%mm3			\n\t" /* C2R2+C4R0	C2r2+C4r0 */\
-\
-	"movq 48(%2), %%mm4			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm0, %%mm4			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"movq 48(%2), %%mm5			\n\t" /* C6	C4	C6	C4 */\
-	"pmaddwd %%mm1, %%mm5			\n\t" /* C6R2+C4R0	C6r2+C4r0 */\
-	"movq 80(%2), %%mm6			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm0, %%mm6			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"movq 80(%2), %%mm7			\n\t" /* -C6	C4	-C6	C4 */\
-	"pmaddwd %%mm1, %%mm7			\n\t" /* -C6R2+C4R0	-C6r2+C4r0 */\
-	"pmaddwd 112(%2), %%mm0			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"psrad $" #shift ", %%mm2		\n\t"\
-	"psrad $" #shift ", %%mm3		\n\t"\
-	"pmaddwd 112(%2), %%mm1			\n\t" /* -C2R2+C4R0	-C2r2+C4r0 */\
-	"packssdw %%mm3, %%mm2			\n\t" /* C0, c0, C0, c0 */\
-	"movq %%mm2, " #dst "			\n\t" /* C0, c0 */\
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+	"movq " #src0 ", %%mm0			\n\t" /* R4	R0	r4	r0 */\
+	"movq 16(%2), %%mm4			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm0, %%mm4			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm5			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm5, %%mm0			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	#rounder ", %%mm4			\n\t"\
+	#rounder ", %%mm0			\n\t"\
 	"psrad $" #shift ", %%mm4		\n\t"\
-	"psrad $" #shift ", %%mm5		\n\t"\
-	"movq %%mm2, 112+" #dst "		\n\t" /* C0, c0 */\
-	"packssdw %%mm5, %%mm4			\n\t" /* C1, c1, C1, c1 */\
-	"movq %%mm4, 16+" #dst "		\n\t" /* C0, c0 */\
-	"psrad $" #shift ", %%mm7		\n\t"\
-	"psrad $" #shift ", %%mm6		\n\t"\
-	"movq %%mm4, 96+" #dst "		\n\t" /* C0, c0 */\
-	"packssdw %%mm7, %%mm6			\n\t" /* C2, c2, C2, c2 */\
-	"movq %%mm6, 32+" #dst "		\n\t" /* C0, c0 */\
 	"psrad $" #shift ", %%mm0		\n\t"\
-	"movq %%mm6, 80+" #dst "		\n\t" /* C0, c0 */\
+	"movq 8+" #src0 ", %%mm2		\n\t" /* R4	R0	r4	r0 */\
+	"movq 16(%2), %%mm1			\n\t" /* C4	C4	C4	C4 */\
+	"pmaddwd %%mm2, %%mm1			\n\t" /* C4R4+C4R0	C4r4+C4r0 */\
+	"movq 24(%2), %%mm7			\n\t" /* -C4	C4	-C4	C4 */\
+	"pmaddwd %%mm7, %%mm2			\n\t" /* -C4R4+C4R0	-C4r4+C4r0 */\
+	"movq 32(%2), %%mm7			\n\t" /* C6	C2	C6	C2 (dead load: mm7 is not read again in this variant) */\
+	#rounder ", %%mm1			\n\t"\
+	#rounder ", %%mm2			\n\t"\
 	"psrad $" #shift ", %%mm1		\n\t"\
-	"packssdw %%mm1, %%mm0			\n\t" /* C3, c3, C3, c3 */\
-	"movq %%mm0, 48+" #dst "		\n\t" /* C0, c0 */\
-	"movq %%mm0, 64+" #dst "		\n\t" /* C0, c0 */\
-
-//IDCT_CORE(  src0,   src4,   src1,    src5,    dst, rounder, shift)
-IDCT_CORE(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
-//IDCT_CORE(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
-IDCT_CORE(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
-//IDCT_CORE(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
+	"packssdw %%mm1, %%mm4			\n\t" /* A0	a0 */\
+	"movq %%mm4, " #dst "			\n\t"\
+	"psrad $" #shift ", %%mm2		\n\t"\
+	"packssdw %%mm2, %%mm0			\n\t" /* A1	a1 */\
+	"movq %%mm0, 16+" #dst "		\n\t"\
+	"movq %%mm0, 96+" #dst "		\n\t"\
+	"movq %%mm4, 112+" #dst "		\n\t"\
+	"movq %%mm0, 32+" #dst "		\n\t"\
+	"movq %%mm4, 48+" #dst "		\n\t"\
+	"movq %%mm4, 64+" #dst "		\n\t"\
+	"movq %%mm0, 80+" #dst "		\n\t"	
+
+//IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
+IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
+//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
+IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
+//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
 
 
 #endif
 
 /*
 Input
- 00 20 02 22 40 60 42 62
- 10 30 12 32 50 70 52 72
- 01 21 03 23 41 61 43 63
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
  11 31 13 33 51 71 53 73
- 04 24 06 26 44 64 46 66
- 14 34 16 36 54 74 56 76
-...
-*/
-/*
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+  
 Temp
- 00 02 10 12 20 22 30 32
- 40 42 50 52 60 62 70 72
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
  01 03 11 13 21 23 31 33
  41 43 51 53 61 63 71 73
- 04 06 14 16 24 26 34 36
- 44 46 54 56 64 66 74 76
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
  05 07 15 17 25 27 35 37
  45 47 55 57 65 67 75 77
 */
 
-/*
-Output
- 00 10 20 30 40 50 60 70
- 01 11 21 31 41 51 61 71
-...
-*/
-
 "9: \n\t"
 		:: "r" (block), "r" (temp), "r" (coeffs)
 		: "%eax"
 	);
-/*
-idctCol(block, temp);
-idctCol(block+1, temp+2);
-idctCol(block+2, temp+4);
-idctCol(block+3, temp+6);
-idctCol(block+4, temp+8);
-idctCol(block+5, temp+10);
-idctCol(block+6, temp+12);
-idctCol(block+7, temp+14);
-*/
 }
 
 void simple_idct_mmx(int16_t *block)
 {
-	static int imax=0, imin=0;
-	static int omax=0, omin=0;
-	int i, j;
-/*
-	for(i=0; i<64; i++)
-	{
-		if(block[i] > imax)
-		{
-			imax= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-		if(block[i] < imin)
-		{
-			imin= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-	}*/
-/*	static int stat[64];
-	for(j=0; j<4; j++)
-	{
-		static int line[8]={0,2,1,3,4,6,5,7};
-		for(i=0; i<16; i++)
-		{
-			if(block[j*16+i])
-			{
-				stat[j*16+1]++;
-				break;
-			}
-		}
-		for(i=0; i<16; i++)
-		{
-			if(block[j*16+i] && i!=0 && i!=2)
-			{
-				stat[j*16+2]++;
-				break;
-			}
-		}
-	}
-	stat[0]++;*/
-/*	for(i=1; i<8; i++)
-	{
-		if(block[i] != 0)
-		{
-			stat[1]++;
-			break;
-		}
-	}
-	for(i=32; i<64; i++)
-	{
-		if(block[i] != 0)
-		{
-			stat[2]++;
-			break;
-		}
-	}
-	stat[0]++;
-*/
-//	return;
 	idct(block);
-//	memset(block, 0, 128);
-/*
-	if(stat[0] > 100000)
-		for(i=0; i<64; i++)
-		{
-			if((i&7) == 0) printf("\n");
-			printf("%06d ", stat[i]);
-		}
-*/
-/*
-	for(i=0; i<4; i++) printf("%d", stat[1+i*16]);
-	printf("  ");
-	for(i=0; i<4; i++) printf("%d", stat[2+i*16]);
-	printf("\n");
-*/
-//	printf("%d", stat[2]);
-
-//	memset(stat, 0, 256);
-
-/*
-	for(i=0; i<64; i++)
-	{
-		if(block[i] > omax)
-		{
-			omax= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-		if(block[i] < omin)
-		{
-			omin= block[i];
-			printf("Input-Max: %d\n", imax);
-			printf("Input-Min: %d\n", imin);
-			printf("Output-Max: %d\n", omax);
-			printf("Output-Min: %d\n", omin);
-		}
-	}*/
 }
diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c
index d39b6c1e9..04300744f 100644
--- a/src/libffmpeg/libavcodec/imgconvert.c
+++ b/src/libffmpeg/libavcodec/imgconvert.c
@@ -1,20 +1,20 @@
 /*
  * Misc image convertion routines
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001, 2002 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include "avcodec.h"
 #include "dsputil.h"
@@ -361,6 +361,8 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
 {
     int i;
 
+    assert(pix_fmt != PIX_FMT_ANY && dst_pix_fmt != PIX_FMT_ANY);
+
     if (dst_pix_fmt == pix_fmt) {
         switch(pix_fmt) {
         case PIX_FMT_YUV420P:
@@ -479,7 +481,7 @@ static void deinterlace_bottom_field(UINT8 *dst, int dst_wrap,
     int y, y1, i;
     UINT8 *buf;
 
-    buf= (UINT8*) malloc(5 * width);
+    buf = (UINT8*)av_malloc(5 * width);
 
     src = src1;
     for(y=0;y<height;y+=2) {
@@ -509,7 +511,7 @@ static void deinterlace_bottom_field(UINT8 *dst, int dst_wrap,
         dst += dst_wrap;
         src += (2 + 1) * src_wrap;
     }
-    free(buf);
+    av_free(buf);
 }
 
 
@@ -546,3 +548,5 @@ int avpicture_deinterlace(AVPicture *dst, AVPicture *src,
     }
     return 0;
 }
+
+#undef FIX
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index fda5a31c4..26519bd38 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -1,27 +1,23 @@
 /*
  * High quality image resampling with polyphase filters 
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include "dsputil.h"
 #include "avcodec.h"
+#include "dsputil.h"
 
 #ifdef USE_FASTMEMCPY
 #include "fastmemcpy.h"
@@ -454,7 +450,7 @@ ImgReSampleContext *img_resample_init(int owidth, int oheight,
 
     return s;
  fail:
-    free(s);
+    av_free(s);
     return NULL;
 }
 
@@ -474,8 +470,8 @@ void img_resample(ImgReSampleContext *s,
 
 void img_resample_close(ImgReSampleContext *s)
 {
-    free(s->line_buf);
-    free(s);
+    av_free(s->line_buf);
+    av_free(s);
 }
 
 #ifdef TEST
@@ -522,7 +518,7 @@ static void dump_filter(INT16 *filter)
 }
 
 #ifdef HAVE_MMX
-int mm_flags;
+extern int mm_flags;
 #endif
 
 int main(int argc, char **argv)
@@ -609,7 +605,7 @@ int main(int argc, char **argv)
                        img, XSIZE, XSIZE, YSIZE);
     if (memcmp(img1, img2, xsize * ysize) != 0) {
         fprintf(stderr, "mmx error\n");
-        abort();
+        exit(1);
     }
     printf("MMX OK\n");
 #endif
diff --git a/src/libffmpeg/libavcodec/jfdctfst.c b/src/libffmpeg/libavcodec/jfdctfst.c
index cdc3b47f9..4e3b55bb5 100644
--- a/src/libffmpeg/libavcodec/jfdctfst.c
+++ b/src/libffmpeg/libavcodec/jfdctfst.c
@@ -113,7 +113,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+fdct_ifast (DCTELEM * data)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -222,3 +222,10 @@ jpeg_fdct_ifast (DCTELEM * data)
     dataptr++;			/* advance pointer to next column */
   }
 }
+
+
+#undef GLOBAL
+#undef CONST_BITS
+#undef DESCALE
+#undef FIX_0_541196100
+#undef FIX_1_306562965
diff --git a/src/libffmpeg/libavcodec/jrevdct.c b/src/libffmpeg/libavcodec/jrevdct.c
index 246f1b190..3ba91543d 100644
--- a/src/libffmpeg/libavcodec/jrevdct.c
+++ b/src/libffmpeg/libavcodec/jrevdct.c
@@ -1166,4 +1166,5 @@ void j_rev_dct(DCTBLOCK data)
   }
 }
 
-
+#undef FIX
+#undef CONST_BITS
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index 1eb35d2b8..577e9d884 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -1,28 +1,36 @@
 /*
  * MJPEG encoder and decoder
- * Copyright (c) 2000, 2001 Gerard Lantau.
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Support for external huffman table and various fixes (AVID workaround) by
+ *                                    Alex Beregszaszi <alex@naxine.org>
  */
 //#define DEBUG
-#include "config.h"
-
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
-#include "xine-utils/xineutils.h"
+
+#ifdef USE_FASTMEMCPY
+#include "fastmemcpy.h"
+#endif
+
+/* use two quantizer tables (one for luminance and one for chrominance) */
+/* not yet working */
+#undef TWOMATRIXES
 
 typedef struct MJpegContext {
     UINT8 huff_size_dc_luminance[12];
@@ -36,12 +44,87 @@ typedef struct MJpegContext {
     UINT16 huff_code_ac_chrominance[256];
 } MJpegContext;
 
-#define SOF0 0xc0
-#define SOI 0xd8
-#define EOI 0xd9
-#define DQT 0xdb
-#define DHT 0xc4
-#define SOS 0xda
+/* JPEG marker codes (the byte that follows the 0xff marker prefix) */
+typedef enum {
+    /* start of frame */
+    SOF0  = 0xc0,	/* baseline */
+    SOF1  = 0xc1,	/* extended sequential, huffman */
+    SOF2  = 0xc2,	/* progressive, huffman */
+    SOF3  = 0xc3,	/* lossless, huffman */
+
+    SOF5  = 0xc5,	/* differential sequential, huffman */
+    SOF6  = 0xc6,	/* differential progressive, huffman */
+    SOF7  = 0xc7,	/* differential lossless, huffman */
+    JPG   = 0xc8,	/* reserved for JPEG extension */
+    SOF9  = 0xc9,	/* extended sequential, arithmetic */
+    SOF10 = 0xca,	/* progressive, arithmetic */
+    SOF11 = 0xcb,	/* lossless, arithmetic */
+
+    SOF13 = 0xcd,	/* differential sequential, arithmetic */
+    SOF14 = 0xce,	/* differential progressive, arithmetic */
+    SOF15 = 0xcf,	/* differential lossless, arithmetic */
+
+    DHT   = 0xc4,	/* define huffman tables */
+
+    DAC   = 0xcc,	/* define arithmetic-coding conditioning */
+
+    /* restart with modulo 8 count "m" */
+    RST0  = 0xd0,
+    RST1  = 0xd1,
+    RST2  = 0xd2,
+    RST3  = 0xd3,
+    RST4  = 0xd4,
+    RST5  = 0xd5,
+    RST6  = 0xd6,
+    RST7  = 0xd7,
+
+    SOI   = 0xd8,	/* start of image */
+    EOI   = 0xd9,	/* end of image */
+    SOS   = 0xda,	/* start of scan */
+    DQT   = 0xdb,	/* define quantization tables */
+    DNL   = 0xdc,	/* define number of lines */
+    DRI   = 0xdd,	/* define restart interval */
+    DHP   = 0xde,	/* define hierarchical progression */
+    EXP   = 0xdf,	/* expand reference components */
+
+    APP0  = 0xe0,	/* APP0..APP15: application-specific segments */
+    APP1  = 0xe1,
+    APP2  = 0xe2,
+    APP3  = 0xe3,
+    APP4  = 0xe4,
+    APP5  = 0xe5,
+    APP6  = 0xe6,
+    APP7  = 0xe7,
+    APP8  = 0xe8,
+    APP9  = 0xe9,
+    APP10 = 0xea,
+    APP11 = 0xeb,
+    APP12 = 0xec,
+    APP13 = 0xed,
+    APP14 = 0xee,
+    APP15 = 0xef,
+
+    JPG0  = 0xf0,	/* JPG0..JPG13: reserved for JPEG extensions */
+    JPG1  = 0xf1,
+    JPG2  = 0xf2,
+    JPG3  = 0xf3,
+    JPG4  = 0xf4,
+    JPG5  = 0xf5,
+    JPG6  = 0xf6,
+    JPG7  = 0xf7,
+    JPG8  = 0xf8,
+    JPG9  = 0xf9,
+    JPG10 = 0xfa,
+    JPG11 = 0xfb,
+    JPG12 = 0xfc,
+    JPG13 = 0xfd,
+
+    COM   = 0xfe,	/* comment */
+
+    TEM   = 0x01,	/* temporary private use for arithmetic coding */
+
+    /* 0x02 -> 0xbf reserved */
+} JPEG_MARKER;
 
 #if 0
 /* These are the sample quantization tables given in JPEG spec section K.1.
@@ -135,7 +218,6 @@ static const UINT8 val_ac_chrominance[] =
   0xf9, 0xfa 
 };
 
-
 /* isn't this function nicer than the one in the libjpeg ? */
 static void build_huffman_codes(UINT8 *huff_size, UINT16 *huff_code,
                                 const UINT8 *bits_table, const UINT8 *val_table)
@@ -160,9 +242,13 @@ int mjpeg_init(MpegEncContext *s)
 {
     MJpegContext *m;
     
-    m = malloc(sizeof(MJpegContext));
+    m = av_malloc(sizeof(MJpegContext));
     if (!m)
         return -1;
+    
+    s->min_qcoeff=-1023;
+    s->max_qcoeff= 1023;
+    s->intra_quant_bias= 1<<(QUANT_BIAS_SHIFT-1); //(a + x/2)/x
 
     /* build all the huffman tables */
     build_huffman_codes(m->huff_size_dc_luminance,
@@ -188,7 +274,7 @@ int mjpeg_init(MpegEncContext *s)
 
 void mjpeg_close(MpegEncContext *s)
 {
-    free(s->mjpeg_ctx);
+    av_free(s->mjpeg_ctx);
 }
 
 static inline void put_marker(PutBitContext *p, int code)
@@ -227,14 +313,18 @@ static void jpeg_table_header(MpegEncContext *s)
 
     /* quant matrixes */
     put_marker(p, DQT);
+#ifdef TWOMATRIXES
+    put_bits(p, 16, 2 + 2 * (1 + 64));
+#else
     put_bits(p, 16, 2 + 1 * (1 + 64));
+#endif
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 0); /* table 0 */
     for(i=0;i<64;i++) {
         j = zigzag_direct[i];
         put_bits(p, 8, s->intra_matrix[j]);
     }
-#if 0
+#ifdef TWOMATRIXES
     put_bits(p, 4, 0); /* 8 bit precision */
     put_bits(p, 4, 1); /* table 1 */
     for(i=0;i<64;i++) {
@@ -258,10 +348,46 @@ static void jpeg_table_header(MpegEncContext *s)
     ptr[1] = size;
 }
 
+static void jpeg_put_comments(MpegEncContext *s)
+{
+    PutBitContext *p = &s->pb;
+    int size;
+    UINT8 *ptr;
+
+#if 0
+    /* JFIF header */
+    put_marker(p, APP0);
+    put_bits(p, 16, 16);
+    put_string(p, "JFIF"); /* this puts the trailing zero-byte too */
+    put_bits(p, 16, 0x101);
+    put_bits(p, 8, 0); /* units type: 0 - aspect ratio */
+    put_bits(p, 16, 1); /* aspect: 1:1 */
+    put_bits(p, 16, 1);
+    put_bits(p, 8, 0); /* thumbnail width */
+    put_bits(p, 8, 0); /* thumbnail height */
+#endif
+
+    /* COM segment: marker, 16-bit length (patched below), version string */
+    put_marker(p, COM);
+    flush_put_bits(p);
+    ptr = pbBufPtr(p); /* presumably the current write position, i.e. where the length goes */
+    put_bits(p, 16, 0); /* patched later */
+#define VERSION "FFmpeg" LIBAVCODEC_VERSION "b" LIBAVCODEC_BUILD_STR
+    put_string(p, VERSION);
+    size = strlen(VERSION)+3; /* string + its trailing zero-byte + 2 length bytes */
+#undef VERSION
+    ptr[0] = size >> 8; /* length is stored high byte first */
+    ptr[1] = size;
+}
+
 void mjpeg_picture_header(MpegEncContext *s)
 {
     put_marker(&s->pb, SOI);
 
+    if (!s->mjpeg_data_only_frames)
+    {
+    jpeg_put_comments(s);    
+
     if (s->mjpeg_write_tables) jpeg_table_header(s);
 
     put_marker(&s->pb, SOF0);
@@ -282,13 +408,22 @@ void mjpeg_picture_header(MpegEncContext *s)
     put_bits(&s->pb, 8, 2); /* component number */
     put_bits(&s->pb, 4, s->mjpeg_hsample[1]); /* H factor */
     put_bits(&s->pb, 4, s->mjpeg_vsample[1]); /* V factor */
+#ifdef TWOMATRIXES
+    put_bits(&s->pb, 8, 1); /* select matrix */
+#else
     put_bits(&s->pb, 8, 0); /* select matrix */
+#endif
 
     /* Cr component */
     put_bits(&s->pb, 8, 3); /* component number */
     put_bits(&s->pb, 4, s->mjpeg_hsample[2]); /* H factor */
     put_bits(&s->pb, 4, s->mjpeg_vsample[2]); /* V factor */
+#ifdef TWOMATRIXES
+    put_bits(&s->pb, 8, 1); /* select matrix */
+#else
     put_bits(&s->pb, 8, 0); /* select matrix */
+#endif
+    }
 
     /* scan header */
     put_marker(&s->pb, SOS);
@@ -312,7 +447,7 @@ void mjpeg_picture_header(MpegEncContext *s)
 
     put_bits(&s->pb, 8, 0); /* Ss (not used) */
     put_bits(&s->pb, 8, 63); /* Se (not used) */
-    put_bits(&s->pb, 8, 0); /* (not used) */
+    put_bits(&s->pb, 8, 0); /* Ah/Al (not used) */
 }
 
 void mjpeg_picture_trailer(MpegEncContext *s)
@@ -321,8 +456,8 @@ void mjpeg_picture_trailer(MpegEncContext *s)
     put_marker(&s->pb, EOI);
 }
 
-static inline void encode_dc(MpegEncContext *s, int val, 
-                             UINT8 *huff_size, UINT16 *huff_code)
+static inline void mjpeg_encode_dc(MpegEncContext *s, int val,
+				   UINT8 *huff_size, UINT16 *huff_code)
 {
     int mant, nbits;
 
@@ -361,11 +496,11 @@ static void encode_block(MpegEncContext *s, DCTELEM *block, int n)
     dc = block[0]; /* overflow is impossible */
     val = dc - s->last_dc[component];
     if (n < 4) {
-        encode_dc(s, val, m->huff_size_dc_luminance, m->huff_code_dc_luminance);
+        mjpeg_encode_dc(s, val, m->huff_size_dc_luminance, m->huff_code_dc_luminance);
         huff_size_ac = m->huff_size_ac_luminance;
         huff_code_ac = m->huff_code_ac_luminance;
     } else {
-        encode_dc(s, val, m->huff_size_dc_chrominance, m->huff_code_dc_chrominance);
+        mjpeg_encode_dc(s, val, m->huff_size_dc_chrominance, m->huff_code_dc_chrominance);
         huff_size_ac = m->huff_size_ac_chrominance;
         huff_code_ac = m->huff_code_ac_chrominance;
     }
@@ -429,6 +564,7 @@ void mjpeg_encode_mb(MpegEncContext *s,
 #define MAX_COMPONENTS 4
 
 typedef struct MJpegDecodeContext {
+    AVCodecContext *avctx;
     GetBitContext gb;
     UINT32 header_state;
     int start_code; /* current start code */
@@ -455,8 +591,21 @@ typedef struct MJpegDecodeContext {
     int linesize[MAX_COMPONENTS];
     DCTELEM block[64] __align8;
     UINT8 buffer[PICTURE_BUFFER_SIZE]; 
+
+    int buggy_avid;
+    int restart_interval;
+    int restart_count;
+    int interleaved_rows;
 } MJpegDecodeContext;
 
+#define SKIP_REMAINING(gb, len) do { /* skip the unparsed rest of a marker; clobbers len */ \
+    dprintf("reamining %d bytes in marker\n", len); \
+    while (--(len) > 0) /* '> 0': a negative len must not loop (almost) forever */ \
+	skip_bits(gb, 8); \
+} while (0)
+
+static int mjpeg_decode_dht(MJpegDecodeContext *s, UINT8 *buf, int buf_size);
+
 static void build_vlc(VLC *vlc, const UINT8 *bits_table, const UINT8 *val_table, 
                       int nb_codes)
 {
@@ -473,6 +622,8 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
 {
     MJpegDecodeContext *s = avctx->priv_data;
 
+    s->avctx = avctx;
+
     s->header_state = 0;
     s->mpeg_enc_ctx_allocated = 0;
     s->buffer_size = PICTURE_BUFFER_SIZE - 1; /* minus 1 to take into
@@ -487,6 +638,13 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
     build_vlc(&s->vlcs[0][1], bits_dc_chrominance, val_dc_chrominance, 12);
     build_vlc(&s->vlcs[1][0], bits_ac_luminance, val_ac_luminance, 251);
     build_vlc(&s->vlcs[1][1], bits_ac_chrominance, val_ac_chrominance, 251);
+    
+    if (avctx->flags & CODEC_FLAG_EXTERN_HUFF)
+    {
+	printf("mjpeg: using external huffman table\n");
+	mjpeg_decode_dht(s, avctx->extradata, avctx->extradata_size);
+	/* should check for error - but dunno */
+    }
     return 0;
 }
 
@@ -496,14 +654,16 @@ static int mjpeg_decode_dqt(MJpegDecodeContext *s,
 {
     int len, index, i, j;
     init_get_bits(&s->gb, buf, buf_size);
-
-    len = get_bits(&s->gb, 16);
-    len -= 2;
+    
+    len = get_bits(&s->gb, 16) - 2;
 
     while (len >= 65) {
         /* only 8 bit precision handled */
         if (get_bits(&s->gb, 4) != 0)
+	{
+	    dprintf("dqt: 16bit precision\n");
             return -1;
+	}
         index = get_bits(&s->gb, 4);
         if (index >= 4)
             return -1;
@@ -511,10 +671,13 @@ static int mjpeg_decode_dqt(MJpegDecodeContext *s,
         /* read quant table */
         for(i=0;i<64;i++) {
             j = zigzag_direct[i];
-            s->quant_matrixes[index][j] = get_bits(&s->gb, 8);
+	    s->quant_matrixes[index][j] = get_bits(&s->gb, 8);
         }
         len -= 65;
     }
+    
+    SKIP_REMAINING(&s->gb, len);
+
     return 0;
 }
 
@@ -581,6 +744,7 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s,
         return -1;
     height = get_bits(&s->gb, 16);
     width = get_bits(&s->gb, 16);
+    dprintf("sof0: picture: %dx%d\n", width, height);
 
     nb_components = get_bits(&s->gb, 8);
     if (nb_components <= 0 ||
@@ -602,16 +766,15 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s,
         s->quant_index[i] = get_bits(&s->gb, 8);
         if (s->quant_index[i] >= 4)
             return -1;
-        dprintf("component %d %d:%d\n", i, s->h_count[i], s->v_count[i]);
+        dprintf("component %d %d:%d id: %d quant:%d\n", i, s->h_count[i],
+	    s->v_count[i], s->component_id[i], s->quant_index[i]);
     }
 
     /* if different size, realloc/alloc picture */
     /* XXX: also check h_count and v_count */
     if (width != s->width || height != s->height) {
-        for(i=0;i<MAX_COMPONENTS;i++) {
-            free(s->current_picture[i]);
-            s->current_picture[i] = NULL;
-        }
+        for(i=0;i<MAX_COMPONENTS;i++)
+            av_freep(&s->current_picture[i]);
         s->width = width;
         s->height = height;
         /* test interlaced mode */
@@ -619,7 +782,7 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s,
             s->org_height != 0 &&
             s->height < ((s->org_height * 3) / 4)) {
             s->interlaced = 1;
-            s->bottom_field = 0;
+	    s->bottom_field = 0;
         }
 
         for(i=0;i<nb_components;i++) {
@@ -636,19 +799,26 @@ static int mjpeg_decode_sof0(MJpegDecodeContext *s,
         }
         s->first_picture = 0;
     }
+
+    if (len != (8+(3*nb_components)))
+    {
+	dprintf("decode_sof0: error, len(%d) mismatch\n", len);
+    }
     
     return 0;
 }
 
-static inline int decode_dc(MJpegDecodeContext *s, int dc_index)
+static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index)
 {
-    VLC *dc_vlc;
     int code, diff;
 
-    dc_vlc = &s->vlcs[0][dc_index];
-    code = get_vlc(&s->gb, dc_vlc);
+    code = get_vlc(&s->gb, &s->vlcs[0][dc_index]);
     if (code < 0)
+    {
+	dprintf("mjpeg_decode_dc: bad vlc: %d:%d (%p)\n", 0, dc_index,
+                &s->vlcs[0][dc_index]);
         return 0xffff;
+    }
     if (code == 0) {
         diff = 0;
     } else {
@@ -668,13 +838,13 @@ static int decode_block(MJpegDecodeContext *s, DCTELEM *block,
     VLC *ac_vlc;
     INT16 *quant_matrix;
 
-    quant_matrix = s->quant_matrixes[quant_index];
     /* DC coef */
-    val = decode_dc(s, dc_index);
+    val = mjpeg_decode_dc(s, dc_index);
     if (val == 0xffff) {
         dprintf("error dc\n");
         return -1;
     }
+    quant_matrix = s->quant_matrixes[quant_index];
     val = val * quant_matrix[0] + s->last_dc[component];
     s->last_dc[component] = val;
     block[0] = val;
@@ -731,17 +901,24 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s,
     nb_components = get_bits(&s->gb, 8);
     /* XXX: only interleaved scan accepted */
     if (nb_components != 3)
+    {
+	dprintf("decode_sos: components(%d) mismatch\n", nb_components);
         return -1;
+    }
     vmax = 0;
     hmax = 0;
     for(i=0;i<nb_components;i++) {
         id = get_bits(&s->gb, 8) - 1;
+	dprintf("component: %d\n", id);
         /* find component index */
         for(index=0;index<s->nb_components;index++)
             if (id == s->component_id[index])
                 break;
         if (index == s->nb_components)
+	{
+	    dprintf("decode_sos: index(%d) out of components\n", index);
             return -1;
+	}
 
         comp_index[i] = index;
         nb_blocks[i] = s->h_count[index] * s->v_count[index];
@@ -749,15 +926,31 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s,
         v_count[i] = s->v_count[index];
 
         dc_index[i] = get_bits(&s->gb, 4);
-        if (dc_index[i] >= 4)
-            return -1;
         ac_index[i] = get_bits(&s->gb, 4);
-        if (ac_index[i] >= 4)
-            return -1;
+
+	if (dc_index[i] < 0 || ac_index[i] < 0 ||
+	    dc_index[i] >= 4 || ac_index[i] >= 4)
+	    goto out_of_range;
+	switch(s->start_code)
+	{
+	    case SOF0:
+		if (dc_index[i] > 1 || ac_index[i] > 1)
+		    goto out_of_range;
+		break;
+	    case SOF1:
+	    case SOF2:
+		if (dc_index[i] > 3 || ac_index[i] > 3)
+		    goto out_of_range;
+		break;
+	    case SOF3:
+		if (dc_index[i] > 3 || ac_index[i] != 0)
+		    goto out_of_range;
+		break;	
+	}
     }
-    get_bits(&s->gb, 8); /* Ss */
-    get_bits(&s->gb, 8); /* Se */
-    get_bits(&s->gb, 8); /* not used */
+    skip_bits(&s->gb, 8); /* Ss */
+    skip_bits(&s->gb, 8); /* Se */
+    skip_bits(&s->gb, 8); /* Ah and Al (each are 4 bits) */
 
     for(i=0;i<nb_components;i++) 
         s->last_dc[i] = 1024;
@@ -787,22 +980,24 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s,
                 v = v_count[i];
                 x = 0;
                 y = 0;
+		if (s->restart_interval && !s->restart_count)
+		    s->restart_count = s->restart_interval;
                 for(j=0;j<n;j++) {
                     memset(s->block, 0, sizeof(s->block));
                     if (decode_block(s, s->block, i, 
                                      dc_index[i], ac_index[i], 
                                      s->quant_index[c]) < 0) {
-                        dprintf("error %d %d\n", mb_y, mb_x);
+                        dprintf("error y=%d x=%d\n", mb_y, mb_x);
                         ret = -1;
                         goto the_end;
                     }
-                    ff_idct (s->block);
+//		    dprintf("mb: %d %d processed\n", mb_y, mb_x);
                     ptr = s->current_picture[c] + 
                         (s->linesize[c] * (v * mb_y + y) * 8) + 
                         (h * mb_x + x) * 8;
                     if (s->interlaced && s->bottom_field)
                         ptr += s->linesize[c] >> 1;
-                    put_pixels_clamped(s->block, ptr, s->linesize[c]);
+                    ff_idct_put(ptr, s->linesize[c], s->block);
                     if (++x == h) {
                         x = 0;
                         y++;
@@ -815,6 +1010,139 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s,
  the_end:
     emms_c();
     return ret;
+ out_of_range:
+    dprintf("decode_sos: ac/dc index out of range\n");
+    return -1;
+}
+
+/* Parse a DRI segment: a 16-bit length that must be 4, then the interval. */
+static int mjpeg_decode_dri(MJpegDecodeContext *s,
+                             UINT8 *buf, int buf_size)
+{
+    init_get_bits(&s->gb, buf, buf_size);
+    if (get_bits(&s->gb, 16) == 4) {
+	s->restart_interval = get_bits(&s->gb, 16);
+	printf("restart interval: %d\n", s->restart_interval);
+	return 0;
+    }
+    return -1; /* unexpected segment length */
+}
+
+#define FOURCC(a,b,c,d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
+/* Parse an APPn segment: detect AVID ('AVI1'), JFIF and Apple MJPEG-A. */
+static int mjpeg_decode_app(MJpegDecodeContext *s,
+                             UINT8 *buf, int buf_size, int start_code)
+{
+    int len;
+    unsigned int id, hi, lo;
+
+    init_get_bits(&s->gb, buf, buf_size);
+    /* XXX: verify len field validity */
+    len = get_bits(&s->gb, 16);
+    if (len < 6) /* the length field and the 4 byte id are consumed below */
+	return -1;
+    /* two statements: the evaluation order of '|' operands is unspecified */
+    hi = get_bits(&s->gb, 16);
+    lo = get_bits(&s->gb, 16);
+    id = (hi << 16) | lo;
+    len -= 6;
+
+    /* buggy AVID, it puts EOI only at every 10th frame */
+    /* also this fourcc is used by non-avid files too, it means
+       interleaving, but it's always present in AVID files */
+    if (id == FOURCC('A','V','I','1')) {
+	/* structure:
+	    4bytes	AVI1
+	    1bytes	polarity
+	    1bytes	always zero
+	    4bytes	field_size
+	    4bytes	field_size_less_padding
+	*/
+	s->buggy_avid = 1;
+	if (s->first_picture)
+	    printf("mjpeg: workarounding buggy AVID\n");
+	s->interleaved_rows = get_bits(&s->gb, 8);
+#if 0
+	skip_bits(&s->gb, 8);
+	skip_bits(&s->gb, 32);
+	skip_bits(&s->gb, 32);
+	len -= 10;
+#endif
+	if (s->interleaved_rows)
+	    printf("mjpeg: interleaved rows: %d\n", s->interleaved_rows);
+	goto out;
+    }
+
+    len -= 2; /* NOTE(review): the reason for this adjustment is unclear - confirm */
+    if (id == FOURCC('J','F','I','F')) {
+	skip_bits(&s->gb, 8); /* the trailing zero-byte */
+	hi = get_bits(&s->gb, 8); /* first version byte, read first */
+	lo = get_bits(&s->gb, 8); /* second version byte */
+	printf("mjpeg: JFIF header found (version: %x.%x)\n", hi, lo);
+	goto out;
+    }
+
+    if ((start_code == APP1) && (len > (0x28 - 8))) {
+	hi = get_bits(&s->gb, 16);
+	lo = get_bits(&s->gb, 16);
+	id = (hi << 16) | lo;
+	len -= 4;
+        if (id == FOURCC('m','j','p','g')) { /* Apple MJPEG-A */
+#if 0
+	    skip_bits(&s->gb, 32); /* field size */
+	    skip_bits(&s->gb, 32); /* pad field size */
+	    skip_bits(&s->gb, 32); /* next off */
+	    skip_bits(&s->gb, 32); /* quant off */
+	    skip_bits(&s->gb, 32); /* huff off */
+	    skip_bits(&s->gb, 32); /* image off */
+	    skip_bits(&s->gb, 32); /* scan off */
+	    skip_bits(&s->gb, 32); /* data off */
+#endif
+	    if (s->first_picture)
+		printf("mjpeg: Apple MJPEG-A header found\n");
+	}
+    }
+
+out:
+    /* should check for further values.. */
+    if (len > 0) /* len may have gone negative above; never skip then */
+	SKIP_REMAINING(&s->gb, len);
+    return 0;
+}
+#undef FOURCC
+
+/* Parse a COM segment: print its text and detect the "AVID" signature. */
+static int mjpeg_decode_com(MJpegDecodeContext *s,
+                             UINT8 *buf, int buf_size)
+{
+    int len, i;
+    UINT8 *cbuf;
+
+    init_get_bits(&s->gb, buf, buf_size);
+
+    /* payload size; it can be <= 0 or exceed the data actually buffered */
+    len = get_bits(&s->gb, 16)-2;
+    if (len <= 0 || len > buf_size - 2)
+	return len ? -1 : 0; /* an empty comment is not an error */
+    cbuf = av_malloc(len+1);
+    if (!cbuf)
+	return -1;
+    for (i = 0; i < len; i++)
+	cbuf[i] = get_bits(&s->gb, 8);
+    if (cbuf[len-1] == '\n') /* drop one trailing newline */
+	cbuf[len-1] = 0;
+    else
+	cbuf[len] = 0;
+    printf("mjpeg comment: '%s'\n", cbuf);
+
+    /* buggy avid, it puts EOI only at every 10th frame */
+    if (!strcmp(cbuf, "AVID")) {
+	s->buggy_avid = 1;
+	if (s->first_picture)
+	    printf("mjpeg: workarounding buggy AVID\n");
+    }
+    av_free(cbuf);
+    return 0;
 }
 
 /* return the 8 bit start code value and update the search
@@ -858,8 +1186,9 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
 {
     MJpegDecodeContext *s = avctx->priv_data;
     UINT8 *buf_end, *buf_ptr, *buf_start;
-    int len, code, start_code, input_size, i;
+    int len, code, input_size, i;
     AVPicture *picture = data;
+    unsigned int start_code;
 
     *data_size = 0;
 
@@ -883,10 +1212,13 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
         } else {
             memcpy(s->buf_ptr, buf_start, len);
             s->buf_ptr += len;
-            /* if we got FF 00, we copy FF to the stream to unescape FF 00 */
-            if (code == 0) {
+            if (code < 0) {
+                /* nothing to do: wait next marker */
+            } else if (code == 0 || code == 0xff) {
+                /* if we got FF 00, we copy FF to the stream to unescape FF 00 */
+                /* valid marker code is between 00 and ff - alex */
                 s->buf_ptr--;
-            } else if (code > 0) {
+            } else {
                 /* prepare data for next start code */
                 input_size = s->buf_ptr - s->buffer;
                 start_code = s->start_code;
@@ -895,6 +1227,7 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
                 dprintf("marker=%x\n", start_code);
                 switch(start_code) {
                 case SOI:
+		    s->restart_interval = 0;
                     /* nothing to do on SOI */
                     break;
                 case DQT:
@@ -908,7 +1241,7 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
                     break;
                 case SOS:
                     mjpeg_decode_sos(s, s->buffer, input_size);
-                    if (s->start_code == EOI) {
+                    if (s->start_code == EOI || s->buggy_avid || s->restart_interval) {
                         int l;
                         if (s->interlaced) {
                             s->bottom_field ^= 1;
@@ -943,11 +1276,41 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
                         }
                         /* dummy quality */
                         /* XXX: infer it with matrix */
-                        avctx->quality = 3; 
+                    	avctx->quality = 3; 
                         goto the_end;
                     }
                     break;
+		case DRI:
+		    mjpeg_decode_dri(s, s->buffer, input_size);
+		    break;
+		case SOF1:
+		case SOF2:
+		case SOF3:
+		case SOF5:
+		case SOF6:
+		case SOF7:
+		case SOF9:
+		case SOF10:
+		case SOF11:
+		case SOF13:
+		case SOF14:
+		case SOF15:
+		case JPG:
+		    printf("mjpeg: unsupported coding type (%x)\n", start_code);
+		    return -1;
                 }
+#if 1
+		if (start_code >= 0xd0 && start_code <= 0xd7) {
+		    dprintf("restart marker: %d\n", start_code&0x0f);
+		} else if (s->first_picture) {
+		    /* APP fields */
+		    if (start_code >= 0xe0 && start_code <= 0xef)
+			mjpeg_decode_app(s, s->buffer, input_size, start_code);
+		    /* Comment */
+		    else if (start_code == COM)
+			mjpeg_decode_com(s, s->buffer, input_size);
+		}
+#endif
             }
         }
     }
@@ -961,7 +1324,7 @@ static int mjpeg_decode_end(AVCodecContext *avctx)
     int i, j;
 
     for(i=0;i<MAX_COMPONENTS;i++)
-        free(s->current_picture[i]);
+        av_free(s->current_picture[i]);
     for(i=0;i<2;i++) {
         for(j=0;j<4;j++)
             free_vlc(&s->vlcs[i][j]);
@@ -978,4 +1341,6 @@ AVCodec mjpeg_decoder = {
     NULL,
     mjpeg_decode_end,
     mjpeg_decode_frame,
+    0,
+    NULL
 };
diff --git a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
index 236c9206a..4539b6464 100644
--- a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
+++ b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
@@ -1,20 +1,20 @@
 /*
  * Sun mediaLib optimized DSP utils
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include "../dsputil.h"
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 92724ac87..8f2ffa42e 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -1,43 +1,45 @@
 /*
  * Motion estimation 
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000,2001 Fabrice Bellard.
+ * Copyright (c) 2002 Michael Niedermayer
  * 
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * new Motion Estimation (X1/EPZS) by Michael Niedermayer <michaelni@gmx.at>
  */
-#include "config.h"
-#include "xine-utils/xineutils.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
 
-#define ABS(a) ((a)>0 ? (a) : -(a))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define SQ(a) ((a)*(a))
 #define INTER_BIAS	257
 
-static void halfpel_motion_search(MpegEncContext * s,
-				  int *mx_ptr, int *my_ptr, int dmin,
-				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y);
+#define P_LAST P[0]
+#define P_LEFT P[1]
+#define P_TOP P[2]
+#define P_TOPRIGHT P[3]
+#define P_MEDIAN P[4]
+#define P_LAST_LEFT P[5]
+#define P_LAST_RIGHT P[6]
+#define P_LAST_TOP P[7]
+#define P_LAST_BOTTOM P[8]
+#define P_MV1 P[9]
 
-/* config it to test motion vector encoding (send random vectors) */
-//#define CONFIG_TEST_MV_ENCODE
 
 static int pix_sum(UINT8 * pix, int line_size)
 {
@@ -140,7 +142,7 @@ static void no_motion_search(MpegEncContext * s,
 
 static int full_motion_search(MpegEncContext * s,
                               int *mx_ptr, int *my_ptr, int range,
-                              int xmin, int ymin, int xmax, int ymax)
+                              int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
 {
     int x1, y1, x2, y2, xx, yy, x, y;
     int mx, my, dmin, d;
@@ -166,7 +168,7 @@ static int full_motion_search(MpegEncContext * s,
     my = 0;
     for (y = y1; y <= y2; y++) {
 	for (x = x1; x <= x2; x++) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x,
+	    d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x,
 			     s->linesize);
 	    if (d < dmin ||
 		(d == dmin &&
@@ -194,7 +196,7 @@ static int full_motion_search(MpegEncContext * s,
 
 static int log_motion_search(MpegEncContext * s,
                              int *mx_ptr, int *my_ptr, int range,
-                             int xmin, int ymin, int xmax, int ymax)
+                             int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
 {
     int x1, y1, x2, y2, xx, yy, x, y;
     int mx, my, dmin, d;
@@ -231,7 +233,7 @@ static int log_motion_search(MpegEncContext * s,
     do {
 	for (y = y1; y <= y2; y += range) {
 	    for (x = x1; x <= x2; x += range) {
-		d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
+		d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize);
 		if (d < dmin || (d == dmin && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		    dmin = d;
 		    mx = x;
@@ -270,7 +272,7 @@ static int log_motion_search(MpegEncContext * s,
 
 static int phods_motion_search(MpegEncContext * s,
                                int *mx_ptr, int *my_ptr, int range,
-                               int xmin, int ymin, int xmax, int ymax)
+                               int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
 {
     int x1, y1, x2, y2, xx, yy, x, y, lastx, d;
     int mx, my, dminx, dminy;
@@ -311,7 +313,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 	lastx = x;
 	for (x = x1; x <= x2; x += range) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
+	    d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize);
 	    if (d < dminx || (d == dminx && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		dminx = d;
 		mx = x;
@@ -320,7 +322,7 @@ static int phods_motion_search(MpegEncContext * s,
 
 	x = lastx;
 	for (y = y1; y <= y2; y += range) {
-	    d = pix_abs16x16(pix, s->last_picture[0] + (y * s->linesize) + x, s->linesize);
+	    d = pix_abs16x16(pix, ref_picture + (y * s->linesize) + x, s->linesize);
 	    if (d < dminy || (d == dminy && (abs(x - xx) + abs(y - yy)) < (abs(mx - xx) + abs(my - yy)))) {
 		dminy = d;
 		my = y;
@@ -364,62 +366,61 @@ static int phods_motion_search(MpegEncContext * s,
 
 #define CHECK_MV(x,y)\
 {\
-    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    if(map[index]!=key){\
+        d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
+        map[index]= key;\
+        score_map[index]= d;\
     }\
 }
 
 #define CHECK_MV_DIR(x,y,new_dir)\
 {\
-    d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
-        next_dir= new_dir;\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    if(map[index]!=key){\
+        d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+        if(d<dmin){\
+            best[0]=x;\
+            best[1]=y;\
+            dmin=d;\
+            next_dir= new_dir;\
+        }\
+        map[index]= key;\
+        score_map[index]= d;\
     }\
 }
 
 #define CHECK_MV4(x,y)\
 {\
-    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
+    const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
+    const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    if(map[index]!=key){\
+        d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
+        d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
+        COPY3_IF_LT(dmin, d, best[0], x, best[1], y)\
+        map[index]= key;\
+        score_map[index]= d;\
     }\
 }
 
-#define CHECK_MV4_DIR(x,y,new_dir)\
-{\
-    d = pix_abs8x8(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);\
-    d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;\
-    if(d<dmin){\
-        best[0]=x;\
-        best[1]=y;\
-        dmin=d;\
-        next_dir= new_dir;\
-    }\
-}
-
-
 #define check(x,y,S,v)\
-if( (x)<(xmin<<(S)) ) printf("%d %d %d %d xmin" #v, (x), (y), s->mb_x, s->mb_y);\
-if( (x)>(xmax<<(S)) ) printf("%d %d %d %d xmax" #v, (x), (y), s->mb_x, s->mb_y);\
-if( (y)<(ymin<<(S)) ) printf("%d %d %d %d ymin" #v, (x), (y), s->mb_x, s->mb_y);\
-if( (y)>(ymax<<(S)) ) printf("%d %d %d %d ymax" #v, (x), (y), s->mb_x, s->mb_y);\
+if( (x)<(xmin<<(S)) ) printf("%d %d %d %d %d xmin" #v, xmin, (x), (y), s->mb_x, s->mb_y);\
+if( (x)>(xmax<<(S)) ) printf("%d %d %d %d %d xmax" #v, xmax, (x), (y), s->mb_x, s->mb_y);\
+if( (y)<(ymin<<(S)) ) printf("%d %d %d %d %d ymin" #v, ymin, (x), (y), s->mb_x, s->mb_y);\
+if( (y)>(ymax<<(S)) ) printf("%d %d %d %d %d ymax" #v, ymax, (x), (y), s->mb_x, s->mb_y);\
 
 
 static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
                                        UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
                                        int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift)
+                                       int xmin, int ymin, int xmax, int ymax, int shift,
+                                       uint32_t *map, uint16_t *score_map, int map_generation,
+                                       op_pixels_abs_func pix_abs)
 {
     int next_dir=-1;
 
@@ -467,36 +468,19 @@ static inline int small_diamond_search(MpegEncContext * s, int *best, int dmin,
     */
 }
 
-static inline int small_diamond_search4MV(MpegEncContext * s, int *best, int dmin,
-                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
-                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift)
-{
-    int next_dir=-1;
-
-    for(;;){
-        int d;
-        const int dir= next_dir;
-        const int x= best[0];
-        const int y= best[1];
-        next_dir=-1;
-
-//printf("%d", dir);
-        if(dir!=2 && x>xmin) CHECK_MV4_DIR(x-1, y  , 0)
-        if(dir!=3 && y>ymin) CHECK_MV4_DIR(x  , y-1, 1)
-        if(dir!=0 && x<xmax) CHECK_MV4_DIR(x+1, y  , 2)
-        if(dir!=1 && y<ymax) CHECK_MV4_DIR(x  , y+1, 3)
-
-        if(next_dir==-1){
-            return dmin;
-        }
-    }
-}
-
+#if 1
+#define SNAKE_1 3
+#define SNAKE_2 2
+#else
+#define SNAKE_1 7
+#define SNAKE_2 3
+#endif
 static inline int snake_search(MpegEncContext * s, int *best, int dmin,
                                        UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
                                        int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
-                                       int xmin, int ymin, int xmax, int ymax, int shift)
+                                       int xmin, int ymin, int xmax, int ymax, int shift,
+                                       uint32_t *map, uint16_t *score_map,int map_generation,
+                                       op_pixels_abs_func pix_abs)
 {
     int dir=0;
     int c=1;
@@ -522,8 +506,15 @@ if(256*256*256*64%point==0)
         x+=x_dir[dir];
         y+=y_dir[dir];
         if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
-            d = pix_abs16x16(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
-            d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
+            const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;
+            const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);
+            if(map[index]!=key){
+                d = pix_abs(new_pic, old_pic + (x) + (y)*pic_stride, pic_stride);
+                d += (mv_penalty[((x)<<shift)-pred_x] + mv_penalty[((y)<<shift)-pred_y])*quant;
+                map[index]=key;
+                score_map[index]=d;
+            }else
+                d= dmin+1;
         }else{
             d = dmin + 10000; //FIXME smarter boundary handling
         }
@@ -542,21 +533,90 @@ if(256*256*256*64%point==0)
         }else{
 //bad++;
             if(fails){
-                if(fails>=3) return dmin;
+                if(fails>=SNAKE_1+1) return dmin;
             }else{
-                c= -c;
+                if(dir&1) dir-= c*3;
+                else      c= -c;
+//                c= -c;
             }
-            dir+=c*2;
+            dir+=c*SNAKE_2;
             fails++;
         }
         dir&=7;
     }
 }
 
+static inline int cross_search(MpegEncContext * s, int *best, int dmin, /* best[]: in/out position, dmin: its score */
+                                       UINT8 *new_pic, UINT8 *old_pic, int pic_stride,
+                                       int pred_x, int pred_y, UINT16 *mv_penalty, int quant,
+                                       int xmin, int ymin, int xmax, int ymax, int shift, /* inclusive bounds for candidates */
+                                       uint32_t *map, uint16_t *score_map,int map_generation, /* cache of already scored points */
+                                       op_pixels_abs_func pix_abs) /* block comparison function used for scoring */
+{ /* Moves best[] one unit step at a time while the score drops; returns the final dmin. s is not referenced here. */
+    static int x_dir[4]= {-1, 0, 1, 0}; /* x step for directions 0..3 */
+    static int y_dir[4]= { 0,-1, 0, 1}; /* y step for directions 0..3 */
+    int improvement[2]={100000, 100000}; /* per axis (dir&1: 0 = x, 1 = y); -1 stops the search on that axis */
+    int dirs[2]={2, 3}; /* direction to try next on each axis; starts as +x and +y */
+    int dir;
+    int last_dir= -1; /* -1: no direction recorded yet */
+    /* each iteration scores exactly one neighbour of best[] */
+    for(;;){
+        dir= dirs[ improvement[0] > improvement[1] ? 0 : 1 ]; /* ties select the y axis */
+        if(improvement[dir&1]==-1) return dmin; /* selected axis is marked finished */
+        /* candidate = best[] moved one step in direction dir */
+        {
+            const int x= best[0] + x_dir[dir];
+            const int y= best[1] + y_dir[dir];
+            const int key= (y<<ME_MAP_MV_BITS) + x + map_generation; /* same key layout as the CHECK_MV macros */
+            const int index= ((y<<ME_MAP_SHIFT) + x)&(ME_MAP_SIZE-1); /* slot in map[] / score_map[] */
+            int d;
+            if(x>=xmin && x<=xmax && y>=ymin && y<=ymax){
+                if(map[index]!=key){ /* not scored under this generation: compute and cache */
+                    d = pix_abs(new_pic, old_pic + x + y*pic_stride, pic_stride);
+                    d += (mv_penalty[(x<<shift)-pred_x] + mv_penalty[(y<<shift)-pred_y])*quant; /* add vector cost term */
+                    map[index]=key;
+                    score_map[index]=d; /* NOTE(review): score_map is uint16_t, so d is truncated on store */
+                    if(d<dmin){
+                        improvement[dir&1]= dmin-d; /* gain on this axis */
+                        improvement[(dir&1)^1]++; /* also turns a -1 on the other axis back into 0 */
+                        dmin=d;
+                        best[0]= x;
+                        best[1]= y;
+                        last_dir=dir;
+                        continue; /* accepted: restart from the new best[] */
+                    }
+                }else{
+                    d= score_map[index]; /* reuse the cached score */
+                }
+            }else{
+                d= dmin + 1000; //FIXME is this a good idea?
+            }
+            /* evaluated point was cached or checked and worse */
+
+            if(last_dir==dir){
+                improvement[dir&1]= -1; /* failed in the direction recorded in last_dir: mark axis finished */
+            }else{
+                improvement[dir&1]= d-dmin; /* how much worse the rejected point scored */
+                last_dir= dirs[dir&1]= dir^2; /* dir^2 is the opposite direction on the same axis */
+            }
+        }
+    }
+}
+
+static inline int update_map_generation(MpegEncContext * s)
+{ /* Advances s->me_map_generation by one step and returns the new value. */
+    s->me_map_generation+= 1<<(ME_MAP_MV_BITS*2); /* step is 1<<(ME_MAP_MV_BITS*2); search keys add (y<<ME_MAP_MV_BITS)+x to it */
+    if(s->me_map_generation==0){ /* counter reached 0, presumably by wrapping around - TODO confirm field type */
+        s->me_map_generation= 1<<(ME_MAP_MV_BITS*2); /* restart at the first non-zero generation */
+        memset(s->me_map, 0, sizeof(uint32_t)*ME_MAP_SIZE); /* zero all ME_MAP_SIZE keys in me_map */
+    }
+    return s->me_map_generation;
+}
+
 static int epzs_motion_search(MpegEncContext * s,
                              int *mx_ptr, int *my_ptr,
-                             int P[5][2], int pred_x, int pred_y,
-                             int xmin, int ymin, int xmax, int ymax)
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, uint8_t * ref_picture)
 {
     int best[2]={0, 0};
     int d, dmin; 
@@ -566,42 +626,74 @@ static int epzs_motion_search(MpegEncContext * s,
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     int quant= s->qscale; // qscale of the prev frame
     const int shift= 1+s->quarter_sample;
+    uint32_t *map= s->me_map;
+    uint16_t *score_map= s->me_score_map;
+    int map_generation;
 
     new_pic = s->new_picture[0] + pic_xy;
-    old_pic = s->last_picture[0] + pic_xy;
-   
+    old_pic = ref_picture + pic_xy;
+    
+    map_generation= update_map_generation(s);
+
     dmin = pix_abs16x16(new_pic, old_pic, pic_stride);
-    if(dmin<Z_THRESHOLD){
-        *mx_ptr= 0;
-        *my_ptr= 0;
-//printf("Z");
-        return dmin;
-    }
+    map[0]= map_generation;
+    score_map[0]= dmin;
 
     /* first line */
-    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
-        CHECK_MV(P[1][0]>>shift, P[1][1]>>shift)
+    if ((s->mb_y == 0 || s->first_slice_line)) {
+        CHECK_MV(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+        CHECK_MV(P_LAST[0]>>shift, P_LAST[1]>>shift)
     }else{
-        CHECK_MV(P[4][0]>>shift, P[4][1]>>shift)
-        if(dmin<Z_THRESHOLD){
-            *mx_ptr= P[4][0]>>shift;
-            *my_ptr= P[4][1]>>shift;
-//printf("M\n");
+        if(dmin<256 && ( P_LEFT[0]    |P_LEFT[1]
+                        |P_TOP[0]     |P_TOP[1]
+                        |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){
+            *mx_ptr= 0;
+            *my_ptr= 0;
+            s->skip_me=1;
             return dmin;
         }
-        CHECK_MV(P[1][0]>>shift, P[1][1]>>shift)
-        CHECK_MV(P[2][0]>>shift, P[2][1]>>shift)
-        CHECK_MV(P[3][0]>>shift, P[3][1]>>shift)
+        CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
+        if(dmin>256*2){
+            CHECK_MV(P_LAST[0]    >>shift, P_LAST[1]    >>shift)
+            CHECK_MV(P_LEFT[0]    >>shift, P_LEFT[1]    >>shift)
+            CHECK_MV(P_TOP[0]     >>shift, P_TOP[1]     >>shift)
+            CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
+        }
     }
-    CHECK_MV(P[0][0]>>shift, P[0][1]>>shift)
-
+    if(dmin>256*4){
+        CHECK_MV(P_LAST_RIGHT[0] >>shift, P_LAST_RIGHT[1] >>shift)
+        CHECK_MV(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift)
+    }
+#if 0 //this only slows things down
+    if(dmin>512*3){
+        int step;
+        dmin= score_map[0];
+        best[0]= best[1]=0;
+        for(step=128; step>0; step>>=1){
+            const int step2= step;
+            int y;
+            for(y=-step2+best[1]; y<=step2+best[1]; y+=step){
+                int x;
+                if(y<ymin || y>ymax) continue;
+
+                for(x=-step2+best[0]; x<=step2+best[0]; x+=step){
+                    if(x<xmin || x>xmax) continue;
+                    if(x==best[0] && y==best[1]) continue;
+                    CHECK_MV(x,y)
+                }
+            }
+        }
+    }
+#endif
 //check(best[0],best[1],0, b0)
-    if(s->full_search==ME_EPZS)
+    if(s->me_method==ME_EPZS)
         dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs16x16);
     else
-        dmin=         snake_search(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
+        dmin=         cross_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs16x16);
 //check(best[0],best[1],0, b1)
     *mx_ptr= best[0];
     *my_ptr= best[1];    
@@ -612,8 +704,8 @@ static int epzs_motion_search(MpegEncContext * s,
 
 static int epzs_motion_search4(MpegEncContext * s, int block,
                              int *mx_ptr, int *my_ptr,
-                             int P[6][2], int pred_x, int pred_y,
-                             int xmin, int ymin, int xmax, int ymax)
+                             int P[10][2], int pred_x, int pred_y,
+                             int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
 {
     int best[2]={0, 0};
     int d, dmin; 
@@ -623,34 +715,47 @@ static int epzs_motion_search4(MpegEncContext * s, int block,
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     int quant= s->qscale; // qscale of the prev frame
     const int shift= 1+s->quarter_sample;
+    uint32_t *map= s->me_map;
+    uint16_t *score_map= s->me_score_map;
+    int map_generation;
 
     new_pic = s->new_picture[0] + pic_xy;
-    old_pic = s->last_picture[0] + pic_xy;
-   
-    dmin = pix_abs8x8(new_pic, old_pic, pic_stride);
+    old_pic = ref_picture + pic_xy;
 
+    map_generation= update_map_generation(s);
+
+    dmin = 1000000;
+//printf("%d %d %d %d //",xmin, ymin, xmax, ymax); 
     /* first line */
-    if ((s->mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
-        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
+    if ((s->mb_y == 0 || s->first_slice_line) && block<2) {
+        CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+        CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift)
+        CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift)
     }else{
-        CHECK_MV4(P[4][0]>>shift, P[4][1]>>shift)
-        if(dmin<Z_THRESHOLD){
-            *mx_ptr= P[4][0]>>shift;
-            *my_ptr= P[4][1]>>shift;
-//printf("M\n");
-            return dmin;
+        CHECK_MV4(P_MV1[0]>>shift, P_MV1[1]>>shift)
+        //FIXME try some early stop
+        if(dmin>64*2){
+            CHECK_MV4(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
+            CHECK_MV4(P_LEFT[0]>>shift, P_LEFT[1]>>shift)
+            CHECK_MV4(P_TOP[0]>>shift, P_TOP[1]>>shift)
+            CHECK_MV4(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
+            CHECK_MV4(P_LAST[0]>>shift, P_LAST[1]>>shift)
         }
-        CHECK_MV4(P[1][0]>>shift, P[1][1]>>shift)
-        CHECK_MV4(P[2][0]>>shift, P[2][1]>>shift)
-        CHECK_MV4(P[3][0]>>shift, P[3][1]>>shift)
     }
-    CHECK_MV4(P[0][0]>>shift, P[0][1]>>shift)
-    CHECK_MV4(P[5][0]>>shift, P[5][1]>>shift)
+    if(dmin>64*4){
+        CHECK_MV4(P_LAST_RIGHT[0]>>shift, P_LAST_RIGHT[1]>>shift)
+        CHECK_MV4(P_LAST_BOTTOM[0]>>shift, P_LAST_BOTTOM[1]>>shift)
+    }
+
+    if(s->me_method==ME_EPZS)
+        dmin= small_diamond_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs8x8);
+    else
+        dmin=         cross_search(s, best, dmin, new_pic, old_pic, pic_stride, 
+                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, 
+                                   shift, map, score_map, map_generation, pix_abs8x8);
 
-//check(best[0],best[1],0, b0)
-    dmin= small_diamond_search4MV(s, best, dmin, new_pic, old_pic, pic_stride, 
-                                   pred_x, pred_y, mv_penalty, quant, xmin, ymin, xmax, ymax, shift);
-//check(best[0],best[1],0, b1)
     *mx_ptr= best[0];
     *my_ptr= best[1];    
 
@@ -659,56 +764,50 @@ static int epzs_motion_search4(MpegEncContext * s, int block,
 }
 
 #define CHECK_HALF_MV(suffix, x, y) \
-    d= pix_abs16x16_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
+{\
+    d= pix_abs_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
     d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
-    if(d<dminh){\
-        dminh= d;\
-        mx= mx1 + x;\
-        my= my1 + y;\
-    }
+    COPY3_IF_LT(dminh, d, dx, x, dy, y)\
+}
 
-#define CHECK_HALF_MV4(suffix, x, y) \
-    d= pix_abs8x8_ ## suffix(pix, ptr+((x)>>1), s->linesize);\
-    d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*quant;\
-    if(d<dminh){\
-        dminh= d;\
-        mx= mx1 + x;\
-        my= my1 + y;\
-    }
     
 /* The idea would be to make half pel ME after Inter/Intra decision to 
    save time. */
-static inline void halfpel_motion_search(MpegEncContext * s,
+static inline int halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y)
+                                  int pred_x, int pred_y, uint8_t *ref_picture,
+                                  op_pixels_abs_func pix_abs_x2, 
+                                  op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n)
 {
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
     const int quant= s->qscale;
-    int pen_x, pen_y;
-    int mx, my, mx1, my1, d, xx, yy, dminh;
+    int mx, my, xx, yy, dminh;
     UINT8 *pix, *ptr;
 
-    mx = *mx_ptr;
-    my = *my_ptr;
-    ptr = s->last_picture[0] + (my * s->linesize) + mx;
+    if(s->skip_me){
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }else
 
-    xx = 16 * s->mb_x;
-    yy = 16 * s->mb_y;
+    xx = 16 * s->mb_x + 8*(n&1);
+    yy = 16 * s->mb_y + 8*(n>>1);
     pix =  s->new_picture[0] + (yy * s->linesize) + xx;
+
+    mx = *mx_ptr;
+    my = *my_ptr;
+    ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx);
     
     dminh = dmin;
 
     if (mx > xmin && mx < xmax && 
         my > ymin && my < ymax) {
+        int dx=0, dy=0;
+        int d, pen_x, pen_y; 
 
-        mx= mx1= 2*(mx - xx);
-        my= my1= 2*(my - yy);
-        if(dmin < Z_THRESHOLD && mx==0 && my==0){
-            *mx_ptr = 0;
-            *my_ptr = 0;
-            return;
-        }
+        mx<<=1;
+        my<<=1;
         
         pen_x= pred_x + mx;
         pen_y= pred_y + my;
@@ -725,80 +824,135 @@ static inline void halfpel_motion_search(MpegEncContext * s,
         CHECK_HALF_MV(y2 ,  0, +1)
         CHECK_HALF_MV(xy2, +1, +1)
 
+        mx+=dx;
+        my+=dy;
     }else{
-        mx= 2*(mx - xx);
-        my= 2*(my - yy);
+        mx<<=1;
+        my<<=1;
     }
 
     *mx_ptr = mx;
     *my_ptr = my;
+    return dminh;
 }
 
-static inline void halfpel_motion_search4(MpegEncContext * s,
+static inline int fast_halfpel_motion_search(MpegEncContext * s,
 				  int *mx_ptr, int *my_ptr, int dmin,
 				  int xmin, int ymin, int xmax, int ymax,
-                                  int pred_x, int pred_y, int block_x, int block_y)
+                                  int pred_x, int pred_y, uint8_t *ref_picture,
+                                  op_pixels_abs_func pix_abs_x2, 
+                                  op_pixels_abs_func pix_abs_y2, op_pixels_abs_func pix_abs_xy2, int n)
 {
     UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    uint16_t *score_map= s->me_score_map;
     const int quant= s->qscale;
-    int pen_x, pen_y;
-    int mx, my, mx1, my1, d, xx, yy, dminh;
+    int mx, my, xx, yy, dminh;
     UINT8 *pix, *ptr;
 
-    xx = 8 * block_x;
-    yy = 8 * block_y;
+    if(s->skip_me){
+//    printf("S");
+        *mx_ptr = 0;
+        *my_ptr = 0;
+        return dmin;
+    }
+//    printf("N");
+        
+    xx = 16 * s->mb_x + 8*(n&1);
+    yy = 16 * s->mb_y + 8*(n>>1);
     pix =  s->new_picture[0] + (yy * s->linesize) + xx;
-    
+
     mx = *mx_ptr;
     my = *my_ptr;
-    ptr = s->last_picture[0] + ((yy+my) * s->linesize) + xx + mx;
-
+    ptr = ref_picture + ((yy + my) * s->linesize) + (xx + mx);
+    
     dminh = dmin;
 
     if (mx > xmin && mx < xmax && 
         my > ymin && my < ymax) {
+        int dx=0, dy=0;
+        int d, pen_x, pen_y; 
+        const int index= (my<<ME_MAP_SHIFT) + mx;
+        const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
+        const int l= score_map[(index- 1               )&(ME_MAP_SIZE-1)];
+        const int r= score_map[(index+ 1               )&(ME_MAP_SIZE-1)];
+        const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)];
+        mx<<=1;
+        my<<=1;
 
-        mx= mx1= 2*mx;
-        my= my1= 2*my;
-        if(dmin < Z_THRESHOLD && mx==0 && my==0){
-            *mx_ptr = 0;
-            *my_ptr = 0;
-            return;
-        }
         
         pen_x= pred_x + mx;
         pen_y= pred_y + my;
 
         ptr-= s->linesize;
-        CHECK_HALF_MV4(xy2, -1, -1)
-        CHECK_HALF_MV4(y2 ,  0, -1)
-        CHECK_HALF_MV4(xy2, +1, -1)
-        
-        ptr+= s->linesize;
-        CHECK_HALF_MV4(x2 , -1,  0)
-        CHECK_HALF_MV4(x2 , +1,  0)
-        CHECK_HALF_MV4(xy2, -1, +1)
-        CHECK_HALF_MV4(y2 ,  0, +1)
-        CHECK_HALF_MV4(xy2, +1, +1)
+        if(t<=b){
+            CHECK_HALF_MV(y2 ,  0, -1)
+            if(l<=r){
+                CHECK_HALF_MV(xy2, -1, -1)
+                if(t+r<=b+l){
+                    CHECK_HALF_MV(xy2, +1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, -1, +1)
+                }
+                CHECK_HALF_MV(x2 , -1,  0)
+            }else{
+                CHECK_HALF_MV(xy2, +1, -1)
+                if(t+l<=b+r){
+                    CHECK_HALF_MV(xy2, -1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, +1, +1)
+                }
+                CHECK_HALF_MV(x2 , +1,  0)
+            }
+        }else{
+            if(l<=r){
+                if(t+l<=b+r){
+                    CHECK_HALF_MV(xy2, -1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, +1, +1)
+                }
+                CHECK_HALF_MV(x2 , -1,  0)
+                CHECK_HALF_MV(xy2, -1, +1)
+            }else{
+                if(t+r<=b+l){
+                    CHECK_HALF_MV(xy2, +1, -1)
+                    ptr+= s->linesize;
+                }else{
+                    ptr+= s->linesize;
+                    CHECK_HALF_MV(xy2, -1, +1)
+                }
+                CHECK_HALF_MV(x2 , +1,  0)
+                CHECK_HALF_MV(xy2, +1, +1)
+            }
+            CHECK_HALF_MV(y2 ,  0, +1)
+        }
+        mx+=dx;
+        my+=dy;
 
     }else{
-        mx*=2;
-        my*=2;
+        mx<<=1;
+        my<<=1;
     }
 
     *mx_ptr = mx;
     *my_ptr = my;
+    return dminh;
 }
 
-static inline void set_mv_tables(MpegEncContext * s, int mx, int my)
+static inline void set_p_mv_tables(MpegEncContext * s, int mx, int my, int mv4)
 {
-    const int xy= s->mb_x + s->mb_y*s->mb_width;
+    const int xy= s->mb_x + 1 + (s->mb_y + 1)*(s->mb_width + 2);
     
-    s->mv_table[0][xy] = mx;
-    s->mv_table[1][xy] = my;
+    s->p_mv_table[xy][0] = mx;
+    s->p_mv_table[xy][1] = my;
 
     /* has allready been set to the 4 MV if 4MV is done */
-    if(!(s->flags&CODEC_FLAG_4MV)){
+    if(mv4){
         int mot_xy= s->block_index[0];
 
         s->motion_val[mot_xy  ][0]= mx;
@@ -814,59 +968,158 @@ static inline void set_mv_tables(MpegEncContext * s, int mx, int my)
     }
 }
 
-#ifndef CONFIG_TEST_MV_ENCODE
-
-void estimate_motion(MpegEncContext * s,
-		    int mb_x, int mb_y)
+static inline void get_limits(MpegEncContext *s, int *range, int *xmin, int *ymin, int *xmax, int *ymax, int f_code) /* fills *range and the *xmin..*ymax limits */
 {
-    UINT8 *pix, *ppix;
-    int sum, varc, vard, mx, my, range, dmin, xx, yy;
-    int xmin, ymin, xmax, ymax;
-    int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
-    int pred_x=0, pred_y=0;
-    int P[6][2];
-    const int shift= 1+s->quarter_sample;
-    int mb_type=0;
-    
-    range = 8 * (1 << (s->f_code - 1));
+    *range = 8 * (1 << (f_code - 1)); /* 8 for f_code 1, doubling with each f_code step */
     /* XXX: temporary kludge to avoid overflow for msmpeg4 */
     if (s->out_format == FMT_H263 && !s->h263_msmpeg4)
-	range = range * 2;
+	*range *= 2;
 
     if (s->unrestricted_mv) {
-        xmin = -16;
-        ymin = -16;
+        *xmin = -16; /* unrestricted_mv: the lower limits reach 16 below zero */
+        *ymin = -16;
         if (s->h263_plus)
-            range *= 2;
+            *range *= 2; /* doubled for h263_plus */
         if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4){
-            xmax = s->mb_width*16;
-            ymax = s->mb_height*16;
+            *xmax = s->mb_width*16;
+            *ymax = s->mb_height*16;
         }else {
             /* XXX: dunno if this is correct but ffmpeg4 decoder wont like it otherwise 
 	            (cuz the drawn edge isnt large enough))*/
-            xmax = s->width;
-            ymax = s->height;
+            *xmax = s->width;
+            *ymax = s->height;
         }
     } else {
-        xmin = 0;
-        ymin = 0;
-        xmax = s->mb_width*16 - 16;
-        ymax = s->mb_height*16 - 16;
+        *xmin = 0; /* otherwise the limits span 0 .. (macroblock count - 1)*16 */
+        *ymin = 0;
+        *xmax = s->mb_width*16 - 16;
+        *ymax = s->mb_height*16 - 16;
+    }
+}
+/* Searches one vector for each of the 4 blocks of the current macroblock (epzs_motion_search4, then fast_halfpel_motion_search), stores them in s->motion_val and returns the sum of the 4 scores. */
+static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift)
+{
+    int block;
+    int P[10][2];
+    uint8_t *ref_picture= s->last_picture[0];
+    int dmin_sum=0;
+
+    for(block=0; block<4; block++){
+        int mx4, my4;
+        int pred_x4, pred_y4;
+        int dmin4;
+        static const int off[4]= {2, 1, 1, -1};
+        const int mot_stride = s->block_wrap[0];
+        const int mot_xy = s->block_index[block];
+//        const int block_x= (block&1);
+//        const int block_y= (block>>1);
+#if 1 // this saves us a bit of clipping work and should not affect compression in a negative way
+        const int rel_xmin4= xmin;
+        const int rel_xmax4= xmax;
+        const int rel_ymin4= ymin;
+        const int rel_ymax4= ymax;
+#else
+        const int rel_xmin4= xmin - block_x*8;
+        const int rel_xmax4= xmax - block_x*8 + 8;
+        const int rel_ymin4= ymin - block_y*8;
+        const int rel_ymax4= ymax - block_y*8 + 8;
+#endif
+        P_LAST[0] = s->motion_val[mot_xy    ][0];
+        P_LAST[1] = s->motion_val[mot_xy    ][1];
+        P_LEFT[0] = s->motion_val[mot_xy - 1][0];
+        P_LEFT[1] = s->motion_val[mot_xy - 1][1];
+        P_LAST_RIGHT[0] = s->motion_val[mot_xy + 1][0];
+        P_LAST_RIGHT[1] = s->motion_val[mot_xy + 1][1];
+        P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 1*mot_stride][0];
+        P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 1*mot_stride][1];
+        /* limit the neighbouring vectors against the search window, scaled by <<shift */
+        if(P_LEFT[0]       > (rel_xmax4<<shift)) P_LEFT[0]       = (rel_xmax4<<shift);
+        if(P_LAST_RIGHT[0] < (rel_xmin4<<shift)) P_LAST_RIGHT[0] = (rel_xmin4<<shift);
+        if(P_LAST_BOTTOM[1]< (rel_ymin4<<shift)) P_LAST_BOTTOM[1]= (rel_ymin4<<shift);
+
+        /* special case for first line: only the upper two blocks (block<2) lack a row above */
+        if ((s->mb_y == 0 || s->first_slice_line) && block<2) {
+            pred_x4= P_LEFT[0];
+            pred_y4= P_LEFT[1];
+        } else {
+            P_TOP[0]      = s->motion_val[mot_xy - mot_stride             ][0];
+            P_TOP[1]      = s->motion_val[mot_xy - mot_stride             ][1];
+            P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + off[block]][0];
+            P_TOPRIGHT[1] = s->motion_val[mot_xy - mot_stride + off[block]][1];
+            if(P_TOP[1]      > (rel_ymax4<<shift)) P_TOP[1]     = (rel_ymax4<<shift);
+            if(P_TOPRIGHT[0] < (rel_xmin4<<shift)) P_TOPRIGHT[0]= (rel_xmin4<<shift);
+            if(P_TOPRIGHT[0] > (rel_xmax4<<shift)) P_TOPRIGHT[0]= (rel_xmax4<<shift);
+            if(P_TOPRIGHT[1] > (rel_ymax4<<shift)) P_TOPRIGHT[1]= (rel_ymax4<<shift);
+    
+            P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+            P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+            if(s->out_format == FMT_H263){
+                pred_x4 = P_MEDIAN[0];
+                pred_y4 = P_MEDIAN[1];
+            }else { /* mpeg1 at least */
+                pred_x4= P_LEFT[0];
+                pred_y4= P_LEFT[1];
+            }
+        }
+        P_MV1[0]= mx; /* hand the caller's (mx,my) to the block search through P */
+        P_MV1[1]= my;
+
+        dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, ref_picture);
+        /* follow the block search with fast_halfpel_motion_search using the 8x8 comparison functions */
+        dmin4= fast_halfpel_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
+                                   pred_x4, pred_y4, ref_picture, pix_abs8x8_x2, 
+                                   pix_abs8x8_y2, pix_abs8x8_xy2, block);
+        /* store this block's vector and accumulate its score */
+        s->motion_val[ s->block_index[block] ][0]= mx4;
+        s->motion_val[ s->block_index[block] ][1]= my4;
+        dmin_sum+= dmin4;
     }
-    switch(s->full_search) {
+    return dmin_sum;
+}
+
+void ff_estimate_p_frame_motion(MpegEncContext * s,
+                                int mb_x, int mb_y)
+{
+    UINT8 *pix, *ppix;
+    int sum, varc, vard, mx, my, range, dmin, xx, yy;
+    int xmin, ymin, xmax, ymax;
+    int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
+    int pred_x=0, pred_y=0;
+    int P[10][2];
+    const int shift= 1+s->quarter_sample;
+    int mb_type=0;
+    uint8_t *ref_picture= s->last_picture[0];
+
+    get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
+    rel_xmin= xmin - mb_x*16;
+    rel_xmax= xmax - mb_x*16;
+    rel_ymin= ymin - mb_y*16;
+    rel_ymax= ymax - mb_y*16;
+    s->skip_me=0;
+
+    switch(s->me_method) {
     case ME_ZERO:
     default:
 	no_motion_search(s, &mx, &my);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         dmin = 0;
         break;
     case ME_FULL:
-	dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax);
+	dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_LOG:
-	dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax);
+	dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_PHODS:
-	dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax);
+	dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
         break;
     case ME_X1:
     case ME_EPZS:
@@ -874,133 +1127,69 @@ void estimate_motion(MpegEncContext * s,
             const int mot_stride = s->block_wrap[0];
             const int mot_xy = s->block_index[0];
 
-            rel_xmin= xmin - mb_x*16;
-            rel_xmax= xmax - mb_x*16;
-            rel_ymin= ymin - mb_y*16;
-            rel_ymax= ymax - mb_y*16;
+            P_LAST[0]       = s->motion_val[mot_xy    ][0];
+            P_LAST[1]       = s->motion_val[mot_xy    ][1];
+            P_LEFT[0]       = s->motion_val[mot_xy - 1][0];
+            P_LEFT[1]       = s->motion_val[mot_xy - 1][1];
+            P_LAST_RIGHT[0] = s->motion_val[mot_xy + 2][0];
+            P_LAST_RIGHT[1] = s->motion_val[mot_xy + 2][1];
+            P_LAST_BOTTOM[0]= s->motion_val[mot_xy + 2*mot_stride][0];
+            P_LAST_BOTTOM[1]= s->motion_val[mot_xy + 2*mot_stride][1];
 
-            P[0][0] = s->motion_val[mot_xy    ][0];
-            P[0][1] = s->motion_val[mot_xy    ][1];
-            P[1][0] = s->motion_val[mot_xy - 1][0];
-            P[1][1] = s->motion_val[mot_xy - 1][1];
-            if(P[1][0] > (rel_xmax<<shift)) P[1][0]= (rel_xmax<<shift);
+            if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
+            if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
+            if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
 
             /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line)) {
-                P[4][0] = P[1][0];
-                P[4][1] = P[1][1];
+            if ((mb_y == 0 || s->first_slice_line)) {
+                pred_x= P_LEFT[0];
+                pred_y= P_LEFT[1];
             } else {
-                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
-                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
-                P[3][0] = s->motion_val[mot_xy - mot_stride + 2         ][0];
-                P[3][1] = s->motion_val[mot_xy - mot_stride + 2         ][1];
-                if(P[2][1] > (rel_ymax<<shift)) P[2][1]= (rel_ymax<<shift);
-                if(P[3][0] < (rel_xmin<<shift)) P[3][0]= (rel_xmin<<shift);
-                if(P[3][1] > (rel_ymax<<shift)) P[3][1]= (rel_ymax<<shift);
+                P_TOP[0]      = s->motion_val[mot_xy - mot_stride    ][0];
+                P_TOP[1]      = s->motion_val[mot_xy - mot_stride    ][1];
+                P_TOPRIGHT[0] = s->motion_val[mot_xy - mot_stride + 2][0];
+                P_TOPRIGHT[1] = s->motion_val[mot_xy - mot_stride + 2][1];
+                if(P_TOP[1]      > (rel_ymax<<shift)) P_TOP[1]     = (rel_ymax<<shift);
+                if(P_TOPRIGHT[0] < (rel_xmin<<shift)) P_TOPRIGHT[0]= (rel_xmin<<shift);
+                if(P_TOPRIGHT[1] > (rel_ymax<<shift)) P_TOPRIGHT[1]= (rel_ymax<<shift);
         
-                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
-                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
-            }
-            if(s->out_format == FMT_H263){
-                pred_x = P[4][0];
-                pred_y = P[4][1];
-            }else { /* mpeg1 at least */
-                pred_x= P[1][0];
-                pred_y= P[1][1];
+                P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+                P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+                if(s->out_format == FMT_H263){
+                    pred_x = P_MEDIAN[0];
+                    pred_y = P_MEDIAN[1];
+                }else { /* mpeg1 at least */
+                    pred_x= P_LEFT[0];
+                    pred_y= P_LEFT[1];
+                }
             }
         }
-        dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax);
+        dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture);
  
-        mx+= mb_x*16;
-        my+= mb_y*16;
         break;
     }
-    
-    if(s->flags&CODEC_FLAG_4MV){
-        int block;
-
-        mb_type|= MB_TYPE_INTER4V;
-
-        for(block=0; block<4; block++){
-            int mx4, my4;
-            int pred_x4, pred_y4;
-            int dmin4;
-            static const int off[4]= {2, 1, 1, -1};
-            const int mot_stride = s->block_wrap[0];
-            const int mot_xy = s->block_index[block];
-            const int block_x= mb_x*2 + (block&1);
-            const int block_y= mb_y*2 + (block>>1);
-
-            const int rel_xmin4= xmin - block_x*8;
-            const int rel_xmax4= xmax - block_x*8 + 8;
-            const int rel_ymin4= ymin - block_y*8;
-            const int rel_ymax4= ymax - block_y*8 + 8;
-
-            P[0][0] = s->motion_val[mot_xy    ][0];
-            P[0][1] = s->motion_val[mot_xy    ][1];
-            P[1][0] = s->motion_val[mot_xy - 1][0];
-            P[1][1] = s->motion_val[mot_xy - 1][1];
-            if(P[1][0] > (rel_xmax4<<shift)) P[1][0]= (rel_xmax4<<shift);
-
-            /* special case for first line */
-            if ((mb_y == 0 || s->first_slice_line || s->first_gob_line) && block<2) {
-                P[4][0] = P[1][0];
-                P[4][1] = P[1][1];
-            } else {
-                P[2][0] = s->motion_val[mot_xy - mot_stride             ][0];
-                P[2][1] = s->motion_val[mot_xy - mot_stride             ][1];
-                P[3][0] = s->motion_val[mot_xy - mot_stride + off[block]][0];
-                P[3][1] = s->motion_val[mot_xy - mot_stride + off[block]][1];
-                if(P[2][1] > (rel_ymax4<<shift)) P[2][1]= (rel_ymax4<<shift);
-                if(P[3][0] < (rel_xmin4<<shift)) P[3][0]= (rel_xmin4<<shift);
-                if(P[3][0] > (rel_xmax4<<shift)) P[3][0]= (rel_xmax4<<shift);
-                if(P[3][1] > (rel_ymax4<<shift)) P[3][1]= (rel_ymax4<<shift);
-        
-                P[4][0]= mid_pred(P[1][0], P[2][0], P[3][0]);
-                P[4][1]= mid_pred(P[1][1], P[2][1], P[3][1]);
-            }
-            if(s->out_format == FMT_H263){
-                pred_x4 = P[4][0];
-                pred_y4 = P[4][1];
-            }else { /* mpeg1 at least */
-                pred_x4= P[1][0];
-                pred_y4= P[1][1];
-            }
-            P[5][0]= mx - mb_x*16;
-            P[5][1]= my - mb_y*16;
-
-            dmin4 = epzs_motion_search4(s, block, &mx4, &my4, P, pred_x4, pred_y4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4);
-
-            halfpel_motion_search4(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, 
-                                   pred_x4, pred_y4, block_x, block_y);
-     
-            s->motion_val[ s->block_index[block] ][0]= mx4;
-            s->motion_val[ s->block_index[block] ][1]= my4;
-        }
-    }
 
     /* intra / predictive decision */
     xx = mb_x * 16;
     yy = mb_y * 16;
 
     pix = s->new_picture[0] + (yy * s->linesize) + xx;
-    /* At this point (mx,my) are full-pell and the absolute displacement */
-    ppix = s->last_picture[0] + (my * s->linesize) + mx;
+    /* At this point (mx,my) are full-pel and relative to the macroblock position */
+    ppix = ref_picture + ((yy+my) * s->linesize) + (xx+mx);
     
     sum = pix_sum(pix, s->linesize);
-#if 0
-    varc = pix_dev(pix, s->linesize, (sum+128)>>8) + INTER_BIAS;
-    vard = pix_abs16x16(pix, ppix, s->linesize);
-#else
+    
     sum= (sum+8)>>4;
-    varc = ((pix_norm1(pix, s->linesize) - sum*sum + 128 + 500)>>8);
+    varc = (pix_norm1(pix, s->linesize) - sum*sum + 500 + 128)>>8;
     vard = (pix_norm(pix, ppix, s->linesize)+128)>>8;
-#endif
-
-    s->mb_var[s->mb_width * mb_y + mb_x] = varc;
-    s->avg_mb_var+= varc;
-    s->mc_mb_var += vard;
-
+//printf("%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
+    s->mb_var   [s->mb_width * mb_y + mb_x] = varc;
+    s->mc_mb_var[s->mb_width * mb_y + mb_x] = vard;
+    s->mb_var_sum    += varc;
+    s->mc_mb_var_sum += vard;
+//printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout);
+    
 #if 0
     printf("varc=%4d avg_var=%4d (sum=%4d) vard=%4d mx=%2d my=%2d\n",
 	   varc, s->avg_mb_var, sum, vard, mx - xx, my - yy);
@@ -1010,68 +1199,556 @@ void estimate_motion(MpegEncContext * s,
             mb_type|= MB_TYPE_INTRA;
         if (varc*2 + 200 > vard){
             mb_type|= MB_TYPE_INTER;
-            halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y);
+            if(s->me_method >= ME_EPZS)
+                fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);
+            else
+                halfpel_motion_search(     s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);                                           
         }else{
-            mx = mx*2 - mb_x*32;
-            my = my*2 - mb_y*32;
+            mx <<=1;
+            my <<=1;
         }
+        if((s->flags&CODEC_FLAG_4MV)
+           && !s->skip_me && varc>50 && vard>10){
+            mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
+            mb_type|=MB_TYPE_INTER4V;
+
+            set_p_mv_tables(s, mx, my, 0);
+        }else
+            set_p_mv_tables(s, mx, my, 1);
     }else{
         if (vard <= 64 || vard < varc) {
             mb_type|= MB_TYPE_INTER;
-            if (s->full_search != ME_ZERO) {
-                halfpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, pred_x, pred_y);
+            if (s->me_method != ME_ZERO) {
+                if(s->me_method >= ME_EPZS)
+                    dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);
+                else
+                    dmin= halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                           pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                           pix_abs16x16_xy2, 0);
+                if((s->flags&CODEC_FLAG_4MV)
+                   && !s->skip_me && varc>50 && vard>10){
+                    int dmin4= mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift);
+                    if(dmin4 + 128 <dmin)
+                        mb_type= MB_TYPE_INTER4V;
+                }
+                set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V);
+
             } else {
-                mx -= 16 * mb_x;
-                my -= 16 * mb_y;
+                mx <<=1;
+                my <<=1;
+            }
+#if 0
+            if (vard < 10) {
+                skip++;
+                fprintf(stderr,"\nEarly skip: %d vard: %2d varc: %5d dmin: %d", 
+                                skip, vard, varc, dmin);
             }
+#endif
         }else{
             mb_type|= MB_TYPE_INTRA;
-            mx = 0;//mx*2 - 32 * mb_x;
-            my = 0;//my*2 - 32 * mb_y;
+            mx = 0;
+            my = 0;
         }
     }
 
     s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
-    set_mv_tables(s, mx, my);
 }
 
-#else
+int ff_estimate_motion_b(MpegEncContext * s,
+                       int mb_x, int mb_y, int16_t (*mv_table)[2], uint8_t *ref_picture, int f_code)
+{
+    int mx, my, range, dmin;
+    int xmin, ymin, xmax, ymax;
+    int rel_xmin, rel_ymin, rel_xmax, rel_ymax;
+    int pred_x=0, pred_y=0;
+    int P[10][2];
+    const int shift= 1+s->quarter_sample;
+    const int mot_stride = s->mb_width + 2; /* mv_table is indexed with rows of mb_width + 2 entries */
+    const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
+    /* Searches ref_picture for macroblock (mb_x, mb_y); the vector left by fast_halfpel_motion_search is stored in mv_table[mot_xy] and its score is returned. */
+    get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, f_code);
+    rel_xmin= xmin - mb_x*16;
+    rel_xmax= xmax - mb_x*16;
+    rel_ymin= ymin - mb_y*16;
+    rel_ymax= ymax - mb_y*16;
+    /* the non-EPZS searches get the unshifted limits; the macroblock origin is subtracted from their result */
+    switch(s->me_method) {
+    case ME_ZERO:
+    default:
+	no_motion_search(s, &mx, &my);
+        dmin = 0;
+        mx-= mb_x*16;
+        my-= mb_y*16;
+        break;
+    case ME_FULL:
+	dmin = full_motion_search(s, &mx, &my, range, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
+        break;
+    case ME_LOG:
+	dmin = log_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
+        break;
+    case ME_PHODS:
+	dmin = phods_motion_search(s, &mx, &my, range / 2, xmin, ymin, xmax, ymax, ref_picture);
+        mx-= mb_x*16;
+        my-= mb_y*16;
+        break;
+    case ME_X1:
+    case ME_EPZS:
+       {
+            /* candidate vectors from mv_table: same position, left, right and the row below */
+            P_LAST[0]        = mv_table[mot_xy    ][0];
+            P_LAST[1]        = mv_table[mot_xy    ][1];
+            P_LEFT[0]        = mv_table[mot_xy - 1][0];
+            P_LEFT[1]        = mv_table[mot_xy - 1][1];
+            P_LAST_RIGHT[0]  = mv_table[mot_xy + 1][0];
+            P_LAST_RIGHT[1]  = mv_table[mot_xy + 1][1];
+            P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0];
+            P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1];
+
+            if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
+            if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
+            if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
+
+            /* special case for first line */
+            if ((mb_y == 0 || s->first_slice_line)) { /* P_TOP, P_TOPRIGHT and P_MEDIAN are left unset here */
+            } else {
+                P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
+                P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
+                P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
+                P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1         ][1];
+                if(P_TOP[1] > (rel_ymax<<shift)) P_TOP[1]= (rel_ymax<<shift);
+                if(P_TOPRIGHT[0] < (rel_xmin<<shift)) P_TOPRIGHT[0]= (rel_xmin<<shift);
+                if(P_TOPRIGHT[1] > (rel_ymax<<shift)) P_TOPRIGHT[1]= (rel_ymax<<shift);
+        
+                P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+                P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+            }
+            pred_x= P_LEFT[0]; /* the left vector is always the predictor here, P_MEDIAN is only a candidate in P */
+            pred_y= P_LEFT[1];
+        }
+        dmin = epzs_motion_search(s, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, ref_picture);
+ 
+        break;
+    }
+    /* every search method is followed by fast_halfpel_motion_search with the 16x16 comparison functions */
+    dmin= fast_halfpel_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
+                                pred_x, pred_y, ref_picture, pix_abs16x16_x2, pix_abs16x16_y2, 
+                                pix_abs16x16_xy2, 0);
+//printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
+//    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
+    mv_table[mot_xy][0]= mx;
+    mv_table[mot_xy][1]= my;
+    return dmin;
+}
+
+/* Builds the bidirectional prediction of macroblock (mb_x, mb_y) in s->me_scratchpad and returns its pix_abs16x16 score against s->new_picture plus the mv_penalty cost of both vectors times s->qscale. */
+static inline int check_bidir_mv(MpegEncContext * s,
+                   int mb_x, int mb_y,
+                   int motion_fx, int motion_fy,
+                   int motion_bx, int motion_by,
+                   int pred_fx, int pred_fy,
+                   int pred_bx, int pred_by)
+{
+    //FIXME optimize?
+    //FIXME direct mode penalty
+    UINT16 *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    uint8_t *dest_y = s->me_scratchpad;
+    uint8_t *ptr;
+    int dxy;
+    int src_x, src_y;
+    int fbmin;
+    /* cost of the forward vector relative to its predictor */
+    fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->qscale;
+    /* the low bit of each component selects the put/avg table entry, the remaining bits are the pixel offset */
+    dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
+    src_x = mb_x * 16 + (motion_fx >> 1);
+    src_y = mb_y * 16 + (motion_fy >> 1);
+    /* forward prediction: written to dest_y from last_picture, in two calls 8 columns apart */
+    ptr = s->last_picture[0] + (src_y * s->linesize) + src_x;
+    put_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
+    put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+    /* add the cost of the backward vector */
+    fbmin += (mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->qscale;
+
+    dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
+    src_x = mb_x * 16 + (motion_bx >> 1);
+    src_y = mb_y * 16 + (motion_by >> 1);
+    /* backward prediction: read from next_picture and averaged onto dest_y */
+    ptr = s->next_picture[0] + (src_y * s->linesize) + src_x;
+    avg_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
+    avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+    /* add the pix_abs16x16 score between the source macroblock and the prediction */
+    fbmin += pix_abs16x16(s->new_picture[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize);
+    return fbmin;
+}
 
-/* test version which generates valid random vectors */
-int estimate_motion(MpegEncContext * s,
-		    int mb_x, int mb_y,
-		    int *mx_ptr, int *my_ptr)
+/* Copy the forward/backward vectors into the bidir tables and return their check_bidir_mv score, predicted from the left neighbour; no refinement is done yet (see FIXME below). */
+static inline int bidir_refine(MpegEncContext * s,
+                                  int mb_x, int mb_y)
 {
-    int xx, yy, x1, y1, x2, y2, range;
-
-    if ((random() % 10) >= 5) {
-	range = 8 * (1 << (s->f_code - 1));
-	if (s->out_format == FMT_H263 && !s->h263_msmpeg4)
-	    range = range * 2;
-
-	xx = 16 * s->mb_x;
-	yy = 16 * s->mb_y;
-	x1 = xx - range;
-	if (x1 < 0)
-	    x1 = 0;
-	x2 = xx + range - 1;
-	if (x2 > (s->width - 16))
-	    x2 = s->width - 16;
-	y1 = yy - range;
-	if (y1 < 0)
-	    y1 = 0;
-	y2 = yy + range - 1;
-	if (y2 > (s->height - 16))
-	    y2 = s->height - 16;
-
-	*mx_ptr = (random() % (2 * (x2 - x1 + 1))) + 2 * (x1 - xx);
-	*my_ptr = (random() % (2 * (y2 - y1 + 1))) + 2 * (y1 - yy);
-	return 0;
+    const int mot_stride = s->mb_width + 2;
+    const int xy = (mb_y + 1)*mot_stride + mb_x + 1;
+    int fbmin;
+    int pred_fx= s->b_bidir_forw_mv_table[xy-1][0]; /* predictors are the bidir vectors stored at xy-1 */
+    int pred_fy= s->b_bidir_forw_mv_table[xy-1][1];
+    int pred_bx= s->b_bidir_back_mv_table[xy-1][0];
+    int pred_by= s->b_bidir_back_mv_table[xy-1][1];
+    int motion_fx= s->b_bidir_forw_mv_table[xy][0]= s->b_forw_mv_table[xy][0]; /* the bidir tables start as copies of the forward/backward tables */
+    int motion_fy= s->b_bidir_forw_mv_table[xy][1]= s->b_forw_mv_table[xy][1];
+    int motion_bx= s->b_bidir_back_mv_table[xy][0]= s->b_back_mv_table[xy][0];
+    int motion_by= s->b_bidir_back_mv_table[xy][1]= s->b_back_mv_table[xy][1];
+
+    //FIXME do refinement and add flag
+    
+    fbmin= check_bidir_mv(s, mb_x, mb_y, 
+                          motion_fx, motion_fy,
+                          motion_bx, motion_by,
+                          pred_fx, pred_fy,
+                          pred_bx, pred_by);
+
+   return fbmin;
+}
+/* Direct-mode search for one macroblock: scales s->p_mv_table into forward/backward vectors, builds a 3x3-macroblock bidirectional prediction in s->me_scratchpad, searches a delta vector in it, stores the result in the b_direct* tables and returns the best score. */
+static inline int direct_search(MpegEncContext * s,
+                                int mb_x, int mb_y)
+{
+    int P[10][2];
+    const int mot_stride = s->mb_width + 2;
+    const int mot_xy = (mb_y + 1)*mot_stride + mb_x + 1;
+    int dmin, dmin2;
+    int motion_fx, motion_fy, motion_bx, motion_by, motion_bx0, motion_by0;
+    int motion_dx, motion_dy;
+    const int motion_px= s->p_mv_table[mot_xy][0];
+    const int motion_py= s->p_mv_table[mot_xy][1];
+    const int time_pp= s->pp_time;
+    const int time_bp= s->bp_time;
+    const int time_pb= time_pp - time_bp;
+    int bx, by;
+    int mx, my, mx2, my2;
+    uint8_t *ref_picture= s->me_scratchpad - (mb_x + 1 + (mb_y + 1)*s->linesize)*16;
+    int16_t (*mv_table)[2]= s->b_direct_mv_table;
+    uint16_t *mv_penalty= s->mv_penalty[s->f_code] + MAX_MV; // f_code of the prev frame
+    /* NOTE(review): the tile loop below puts the current macroblock at me_scratchpad + 16*linesize + 16; confirm the "+ 1" offsets in ref_picture match how epzs_motion_search indexes it */
+    /* thanks to iso-mpeg the rounding is different for the zero vector, so we need to handle that ... */
+    motion_fx= (motion_px*time_pb)/time_pp;
+    motion_fy= (motion_py*time_pb)/time_pp;
+    motion_bx0= (-motion_px*time_bp)/time_pp;
+    motion_by0= (-motion_py*time_bp)/time_pp;
+    motion_dx= motion_dy=0;
+    dmin2= check_bidir_mv(s, mb_x, mb_y, 
+                          motion_fx, motion_fy,
+                          motion_bx0, motion_by0,
+                          motion_fx, motion_fy,
+                          motion_bx0, motion_by0) - s->qscale;
+    /* backward vector used for non-zero deltas: derived from the forward one instead of scaled separately */
+    motion_bx= motion_fx - motion_px;
+    motion_by= motion_fy - motion_py;
+    for(by=-1; by<2; by++){
+        for(bx=-1; bx<2; bx++){
+            uint8_t *dest_y = s->me_scratchpad + (by+1)*s->linesize*16 + (bx+1)*16;
+            uint8_t *ptr;
+            int dxy;
+            int src_x, src_y;
+            const int width= s->width;
+            const int height= s->height;
+            /* forward prediction of this tile, read from last_picture */
+            dxy = ((motion_fy & 1) << 1) | (motion_fx & 1);
+            src_x = (mb_x + bx) * 16 + (motion_fx >> 1);
+            src_y = (mb_y + by) * 16 + (motion_fy >> 1);
+            src_x = clip(src_x, -16, width);
+            if (src_x == width) dxy &= ~1;
+            src_y = clip(src_y, -16, height);
+            if (src_y == height) dxy &= ~2;
+
+            ptr = s->last_picture[0] + (src_y * s->linesize) + src_x;
+            put_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
+            put_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+            /* backward prediction of this tile, averaged on top of the forward one */
+            dxy = ((motion_by & 1) << 1) | (motion_bx & 1);
+            src_x = (mb_x + bx) * 16 + (motion_bx >> 1);
+            src_y = (mb_y + by) * 16 + (motion_by >> 1);
+            src_x = clip(src_x, -16, width);
+            if (src_x == width) dxy &= ~1;
+            src_y = clip(src_y, -16, height);
+            if (src_y == height) dxy &= ~2;
+            ptr = s->next_picture[0] + (src_y * s->linesize) + src_x; /* fix: ptr used to keep pointing at the forward block in last_picture */
+            avg_pixels_tab[dxy](dest_y    , ptr    , s->linesize, 16);
+            avg_pixels_tab[dxy](dest_y + 8, ptr + 8, s->linesize, 16);
+        }
+    }
+    /* candidate vectors for the delta search, taken from b_direct_mv_table */
+    P_LAST[0]        = mv_table[mot_xy    ][0];
+    P_LAST[1]        = mv_table[mot_xy    ][1];
+    P_LEFT[0]        = mv_table[mot_xy - 1][0];
+    P_LEFT[1]        = mv_table[mot_xy - 1][1];
+    P_LAST_RIGHT[0]  = mv_table[mot_xy + 1][0];
+    P_LAST_RIGHT[1]  = mv_table[mot_xy + 1][1];
+    P_LAST_BOTTOM[0] = mv_table[mot_xy + mot_stride][0];
+    P_LAST_BOTTOM[1] = mv_table[mot_xy + mot_stride][1];
+/*
+    if(P_LEFT[0]       > (rel_xmax<<shift)) P_LEFT[0]       = (rel_xmax<<shift);
+    if(P_LAST_RIGHT[0] < (rel_xmin<<shift)) P_LAST_RIGHT[0] = (rel_xmin<<shift);
+    if(P_LAST_BOTTOM[1]< (rel_ymin<<shift)) P_LAST_BOTTOM[1]= (rel_ymin<<shift);
+*/
+    /* special case for first line */
+    if ((mb_y == 0 || s->first_slice_line)) {
     } else {
-	*mx_ptr = 0;
-	*my_ptr = 0;
-	return 1;
+        P_TOP[0] = mv_table[mot_xy - mot_stride             ][0];
+        P_TOP[1] = mv_table[mot_xy - mot_stride             ][1];
+        P_TOPRIGHT[0] = mv_table[mot_xy - mot_stride + 1         ][0];
+        P_TOPRIGHT[1] = mv_table[mot_xy - mot_stride + 1         ][1];
+    
+        P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+        P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
     }
+    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, -16, -16, 15, 15, ref_picture);
+    if(mx==0 && my==0) dmin=99999999; // not representable, due to rounding stuff
+    if(dmin2<dmin){ 
+        dmin= dmin2;
+        mx=0;
+        my=0;
+    }
+#if 1
+    mx2= mx= mx*2; 
+    my2= my= my*2;
+    for(by=-1; by<2; by++){
+        if(my2+by < -32) continue;
+        for(bx=-1; bx<2; bx++){
+            if(bx==0 && by==0) continue;
+            if(mx2+bx < -32) continue;
+            dmin2= check_bidir_mv(s, mb_x, mb_y, 
+                          mx2+bx+motion_fx, my2+by+motion_fy,
+                          mx2+bx+motion_bx, my2+by+motion_by,
+                          mx2+bx+motion_fx, my2+by+motion_fy,
+                          motion_bx, motion_by) - s->qscale;
+            
+            if(dmin2<dmin){
+                dmin=dmin2;
+                mx= mx2 + bx;
+                my= my2 + by;
+            }
+        }
+    }
+#else
+    mx*=2; my*=2;
+#endif
+    if(mx==0 && my==0){
+        motion_bx= motion_bx0;
+        motion_by= motion_by0;
+    }
+
+    s->b_direct_mv_table[mot_xy][0]= mx;
+    s->b_direct_mv_table[mot_xy][1]= my;
+    s->b_direct_forw_mv_table[mot_xy][0]= motion_fx + mx;
+    s->b_direct_forw_mv_table[mot_xy][1]= motion_fy + my;
+    s->b_direct_back_mv_table[mot_xy][0]= motion_bx + mx;
+    s->b_direct_back_mv_table[mot_xy][1]= motion_by + my;
+    return dmin;
 }
 
-#endif
+void ff_estimate_b_frame_motion(MpegEncContext * s,
+                             int mb_x, int mb_y)
+{
+    const int quant= s->qscale;
+    int fmin, bmin, dmin, fbmin;
+    int type=0;
+    /* Scores direct, forward, backward and bidirectional prediction for macroblock (mb_x, mb_y) and stores the resulting type mask in s->mb_type. */
+    dmin= direct_search(s, mb_x, mb_y);
+
+    fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, s->last_picture[0], s->f_code);
+    bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, s->next_picture[0], s->b_code) - quant;
+//printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]);
+    /* bidir_refine reads the b_forw/b_back tables written by the two calls above, so it has to run after them */
+    fbmin= bidir_refine(s, mb_x, mb_y);
+    /* with CODEC_FLAG_HQ all four types stay as candidates; otherwise the lowest score picks a single type */
+    if(s->flags&CODEC_FLAG_HQ){
+        type= MB_TYPE_FORWARD | MB_TYPE_BACKWARD | MB_TYPE_BIDIR | MB_TYPE_DIRECT;
+    }else{
+        int score= dmin;
+        type=MB_TYPE_DIRECT;
+        /* a later candidate replaces the current one only on a strictly lower score */
+        if(fmin<score){
+            score=fmin;
+            type= MB_TYPE_FORWARD; 
+        }
+        if(bmin<score){
+            score=bmin;
+            type= MB_TYPE_BACKWARD; 
+        }
+        if(fbmin<score){
+            score=fbmin;
+            type= MB_TYPE_BIDIR;
+        }
+        s->mc_mb_var_sum += score; /* only updated in this non-HQ branch */
+        s->mc_mb_var[mb_y*s->mb_width + mb_x] = score;
+    }
+/*
+{
+static int count=0;
+static int sum=0;
+if(type==MB_TYPE_DIRECT){
+  int diff= ABS(s->b_forw_mv_table)
+}
+}*/
+
+    s->mb_type[mb_y*s->mb_width + mb_x]= type;
+/*    if(mb_y==0 && mb_x==0) printf("\n");
+    if(mb_x==0) printf("\n");
+    printf("%d", av_log2(type));
+*/
+}
+
+/* choose an f_code for the vectors in mv_table (for ME methods which do unlimited searches); always 1 when me_method < ME_EPZS */
+int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
+{
+    if(s->me_method>=ME_EPZS){
+        int score[8];
+        int i, y;
+        UINT8 * fcode_tab= s->fcode_tab;
+        int best_fcode=-1;
+        int best_score=-10000000;
+
+        for(i=0; i<8; i++) score[i]= s->mb_num*(8-i); //FIXME *2 and all other too so its the same but nicer
+
+        for(y=0; y<s->mb_height; y++){
+            int x;
+            int xy= (y+1)* (s->mb_width+2) + 1;
+            i= y*s->mb_width;
+            for(x=0; x<s->mb_width; x++){
+                if(s->mb_type[i] & type){
+                    int fcode= MAX(fcode_tab[mv_table[xy][0] + MAX_MV],
+                                   fcode_tab[mv_table[xy][1] + MAX_MV]);
+                    int j;
+                    /* every candidate index below fcode loses 170 points: always in B pictures, else only if mc_mb_var < mb_var */
+                    for(j=0; j<fcode && j<8; j++){
+                        if(s->pict_type==B_TYPE || s->mc_mb_var[i] < s->mb_var[i])
+                            score[j]-= 170;
+                    }
+                }
+                i++;
+                xy++;
+            }
+        }
+        /* highest score wins; index 0 is never a candidate and a tie keeps the lower index */
+        for(i=1; i<8; i++){
+            if(score[i] > best_score){
+                best_score= score[i];
+                best_fcode= i;
+            }
+//            printf("%d %d\n", i, score[i]);
+        }
+
+//    printf("fcode: %d type: %d\n", i, s->pict_type);
+        return best_fcode;
+/*        for(i=0; i<=MAX_FCODE; i++){
+            printf("%d ", mv_num[i]);
+        }
+        printf("\n");*/
+    }else{
+        return 1;
+    }
+}
+
+void ff_fix_long_p_mvs(MpegEncContext * s)
+{
+    /* Convert P-frame macroblocks whose motion vectors cannot be coded with
+     * s->f_code to intra: a component is rejected when its fcode_tab entry
+     * exceeds f_code or is 0. Rejected 16x16 vectors are also zeroed. */
+    const int f_code= s->f_code;
+    int y;
+    UINT8 * fcode_tab= s->fcode_tab;
+    /* 16x16 MVs: p_mv_table rows hold mb_width+2 entries, MB (0,0) is at row 1, column 1 */
+    for(y=0; y<s->mb_height; y++){
+        int x;
+        int xy= (y+1)* (s->mb_width+2)+1;
+        int i= y*s->mb_width;
+        for(x=0; x<s->mb_width; x++){
+            if(s->mb_type[i]&MB_TYPE_INTER){
+                if(   fcode_tab[s->p_mv_table[xy][0] + MAX_MV] > f_code
+                   || fcode_tab[s->p_mv_table[xy][0] + MAX_MV] == 0
+                   || fcode_tab[s->p_mv_table[xy][1] + MAX_MV] > f_code
+                   || fcode_tab[s->p_mv_table[xy][1] + MAX_MV] == 0 ){
+                    s->mb_type[i] &= ~MB_TYPE_INTER;
+                    s->mb_type[i] |= MB_TYPE_INTRA;
+                    s->p_mv_table[xy][0] = 0;
+                    s->p_mv_table[xy][1] = 0;
+                }
+            }
+            xy++;
+            i++;
+        }
+    }
+
+    if(s->flags&CODEC_FLAG_4MV){
+        const int wrap= 2+ s->mb_width*2;
+
+        /* 8x8 MVs: motion_val holds 2x2 vectors per MB, "wrap" entries per row */
+        for(y=0; y<s->mb_height; y++){
+            int xy= (y*2 + 1)*wrap + 1;
+            int i= y*s->mb_width;
+            int x;
+
+            for(x=0; x<s->mb_width; x++){
+                if(s->mb_type[i]&MB_TYPE_INTER4V){
+                    int block;
+                    for(block=0; block<4; block++){
+                        int off= (block& 1) + (block>>1)*wrap;
+                        int mx= s->motion_val[ xy + off ][0];
+                        int my= s->motion_val[ xy + off ][1];
+
+                        if(   fcode_tab[mx + MAX_MV] > f_code
+                           || fcode_tab[mx + MAX_MV] == 0
+                           || fcode_tab[my + MAX_MV] > f_code
+                           || fcode_tab[my + MAX_MV] == 0 ){
+                            s->mb_type[i] &= ~MB_TYPE_INTER4V;
+                            s->mb_type[i] |= MB_TYPE_INTRA;
+                        }
+                    }
+                }
+                /* advance for every MB, not only 4MV ones, so that xy and i
+                 * keep tracking the macroblock that x refers to */
+                xy+=2;
+                i++;
+            }
+        }
+    }
+}
+
+void ff_fix_long_b_mvs(MpegEncContext * s, int16_t (*mv_table)[2], int f_code, int type)
+{
+    int y;
+    UINT8 * fcode_tab= s->fcode_tab;
+    /* handle MBs of the given type whose mv_table vector does not fit f_code (fcode_tab entry > f_code or 0): */
+    /* such an MB loses "type" if other type bits remain, otherwise its vector is zeroed */
+    for(y=0; y<s->mb_height; y++){
+        int x;
+        int xy= (y+1)* (s->mb_width+2)+1;
+        int i= y*s->mb_width;
+        for(x=0; x<s->mb_width; x++){
+            if(s->mb_type[i]&type){
+                if(   fcode_tab[mv_table[xy][0] + MAX_MV] > f_code
+                   || fcode_tab[mv_table[xy][0] + MAX_MV] == 0
+                   || fcode_tab[mv_table[xy][1] + MAX_MV] > f_code
+                   || fcode_tab[mv_table[xy][1] + MAX_MV] == 0 ){
+                    if(s->mb_type[i]&(~type)) s->mb_type[i] &= ~type;
+                    else{
+                        mv_table[xy][0] = 0;
+                        mv_table[xy][1] = 0;
+                        /* FIXME: zeroing the vector when no other type is left is certainly bad */
+                    }
+                }
+            }
+            xy++;
+            i++;
+        }
+    }
+}
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index ac614d5ce..37e9b70ac 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -1,26 +1,25 @@
 /*
  * MPEG1 encoder / MPEG2 decoder
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000,2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 //#define DEBUG
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
-#include "xineutils.h"
 
 #include "mpeg12data.h"
 
@@ -34,8 +33,6 @@
 #define EXT_START_CODE		0x000001b5
 #define USER_START_CODE		0x000001b2
 
-#define ABS(a) ((a)<0 ? -(a) : (a))
-
 static void mpeg1_encode_block(MpegEncContext *s, 
                          DCTELEM *block, 
                          int component);
@@ -400,8 +397,11 @@ void mpeg1_encode_init(MpegEncContext *s)
         }
     }
     s->mv_penalty= mv_penalty;
-    
     s->fcode_tab= fcode_tab;
+    s->min_qcoeff=-255;
+    s->max_qcoeff= 255;
+    s->intra_quant_bias= 3<<(QUANT_BIAS_SHIFT-3); //(a + x*3/8)/x
+    s->inter_quant_bias= 0;
 }
  
 static inline void encode_dc(MpegEncContext *s, int diff, int component)
@@ -853,6 +853,8 @@ static int mpeg_decode_mb(MpegEncContext *s,
                 if (cbp & (1 << (5 - i))) {
                     if (mpeg2_decode_block_intra(s, block[i], i) < 0)
                         return -1;
+                } else {
+                    s->block_last_index[i] = -1;
                 }
             }
         } else {
@@ -860,6 +862,8 @@ static int mpeg_decode_mb(MpegEncContext *s,
                 if (cbp & (1 << (5 - i))) {
                     if (mpeg2_decode_block_non_intra(s, block[i], i) < 0)
                         return -1;
+                } else {
+                    s->block_last_index[i] = -1;
                 }
             }
         }
@@ -868,6 +872,8 @@ static int mpeg_decode_mb(MpegEncContext *s,
             if (cbp & (1 << (5 - i))) {
                 if (mpeg1_decode_block(s, block[i], i) < 0)
                     return -1;
+            } else {
+                s->block_last_index[i] = -1;
             }
         }
     }
@@ -1028,9 +1034,9 @@ static int mpeg2_decode_block_non_intra(MpegEncContext *s,
         UINT8 *buf_ptr;
         i = 0;
         if (n < 4) 
-            matrix = s->non_intra_matrix;
+            matrix = s->inter_matrix;
         else
-            matrix = s->chroma_non_intra_matrix;
+            matrix = s->chroma_inter_matrix;
             
         /* special case for the first coef. no need to add a second vlc table */
         SAVE_BITS(&s->gb);
@@ -1184,6 +1190,9 @@ static int mpeg_decode_init(AVCodecContext *avctx)
     s->buf_ptr = s->buffer;
     s->mpeg_enc_ctx.picture_number = 0;
     s->repeat_field = 0;
+    s->mpeg_enc_ctx.codec_id= avctx->codec->id;
+    avctx->mbskip_table= s->mpeg_enc_ctx.mbskip_table;
+    s->mpeg_enc_ctx.flags= avctx->flags;
     return 0;
 }
 
@@ -1273,6 +1282,7 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
         s->frame_rate = (s->frame_rate * frame_rate_ext_n) / frame_rate_ext_d;
     dprintf("sequence extension\n");
     s->mpeg2 = 1;
+    s->avctx->sub_id = 2; /* indicates mpeg2 found */
 }
 
 static void mpeg_decode_quant_matrix_extension(MpegEncContext *s)
@@ -1293,8 +1303,8 @@ static void mpeg_decode_quant_matrix_extension(MpegEncContext *s)
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
             j = zigzag_direct[i];
-            s->non_intra_matrix[j] = v;
-            s->chroma_non_intra_matrix[j] = v;
+            s->inter_matrix[j] = v;
+            s->chroma_inter_matrix[j] = v;
         }
     }
     if (get_bits1(&s->gb)) {
@@ -1308,7 +1318,7 @@ static void mpeg_decode_quant_matrix_extension(MpegEncContext *s)
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
             j = zigzag_direct[i];
-            s->chroma_non_intra_matrix[j] = v;
+            s->chroma_inter_matrix[j] = v;
         }
     }
 }
@@ -1334,6 +1344,8 @@ static void mpeg_decode_picture_coding_extension(MpegEncContext *s)
     /* composite display not parsed */
     dprintf("intra_dc_precision=%d\n", s->intra_dc_precision);
     dprintf("picture_structure=%d\n", s->picture_structure);
+    dprintf("top field first=%d\n", s->top_field_first);
+    dprintf("repeat first field=%d\n", s->repeat_first_field);
     dprintf("conceal=%d\n", s->concealment_motion_vectors);
     dprintf("intra_vlc_format=%d\n", s->intra_vlc_format);
     dprintf("alternate_scan=%d\n", s->alternate_scan);
@@ -1387,7 +1399,6 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
     s->mb_x = -1;
     s->mb_y = start_code;
     s->mb_incr = 0;
-
     /* start frame decoding */
     if (s->first_slice) {
         s->first_slice = 0;
@@ -1404,6 +1415,7 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
 
     for(;;) {
         clear_blocks(s->block[0]);
+        emms_c();
         ret = mpeg_decode_mb(s, s->block);
         dprintf("ret=%d\n", ret);
         if (ret < 0)
@@ -1460,7 +1472,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     Mpeg1Context *s1 = avctx->priv_data;
     MpegEncContext *s = &s1->mpeg_enc_ctx;
     int width, height, i, v, j;
-    
+
     init_get_bits(&s->gb, buf, buf_size);
 
     width = get_bits(&s->gb, 12);
@@ -1488,7 +1500,12 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         s->avctx = avctx;
         avctx->width = width;
         avctx->height = height;
-        avctx->frame_rate = frame_rate_tab[s->frame_rate_index];
+        if (s->frame_rate_index >= 9) {
+            /* at least give a valid frame rate (some old mpeg1 have this) */
+            avctx->frame_rate = 25 * FRAME_RATE_BASE;
+        } else {
+            avctx->frame_rate = frame_rate_tab[s->frame_rate_index];
+        }
         s->frame_rate = avctx->frame_rate;
         avctx->bit_rate = s->bit_rate;
         
@@ -1526,20 +1543,20 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         for(i=0;i<64;i++) {
             v = get_bits(&s->gb, 8);
             j = zigzag_direct[i];
-            s->non_intra_matrix[j] = v;
-            s->chroma_non_intra_matrix[j] = v;
+            s->inter_matrix[j] = v;
+            s->chroma_inter_matrix[j] = v;
         }
 #ifdef DEBUG
         dprintf("non intra matrix present\n");
         for(i=0;i<64;i++)
-            dprintf(" %d", s->non_intra_matrix[zigzag_direct[i]]);
+            dprintf(" %d", s->inter_matrix[zigzag_direct[i]]);
         printf("\n");
 #endif
     } else {
         for(i=0;i<64;i++) {
             v = default_non_intra_matrix[i];
-            s->non_intra_matrix[i] = v;
-            s->chroma_non_intra_matrix[i] = v;
+            s->inter_matrix[i] = v;
+            s->chroma_inter_matrix[i] = v;
         }
     }
 
@@ -1549,6 +1566,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
     s->picture_structure = PICT_FRAME;
     s->frame_pred_frame_dct = 1;
     s->mpeg2 = 0;
+    avctx->sub_id = 1; /* indicates mpeg1 */
     return 0;
 }
 
@@ -1566,7 +1584,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
     dprintf("fill_buffer\n");
 
     *data_size = 0;
-    
+
     /* special case for last picture */
     if (buf_size == 0) {
         if (s2->picture_number > 0) {
@@ -1583,15 +1601,18 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
 
     buf_ptr = buf;
     buf_end = buf + buf_size;
-    
-    if (s->repeat_field % 2 == 1) {
+
+#if 0    
+    if (s->repeat_field % 2 == 1) { 
         s->repeat_field++;
         //fprintf(stderr,"\nRepeating last frame: %d -> %d! pict: %d %d", avctx->frame_number-1, avctx->frame_number,
-        //                                                         s2->picture_number, s->repeat_field);
-        *data_size = sizeof(AVPicture);
-        goto the_end;
+        //        s2->picture_number, s->repeat_field);
+        if (avctx->flags & CODEC_FLAG_REPEAT_FIELD) {
+            *data_size = sizeof(AVPicture);
+            goto the_end;
+        }
     }
-        
+#endif
     while (buf_ptr < buf_end) {
         buf_start = buf_ptr;
         /* find start next code */
@@ -1641,13 +1662,27 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
                         if (ret == 1) {
                             /* got a picture: exit */
                             /* first check if we must repeat the frame */
+                            avctx->repeat_pict = 0;
+#if 0
                             if (s2->progressive_frame && s2->repeat_first_field) {
                                 //fprintf(stderr,"\nRepeat this frame: %d! pict: %d",avctx->frame_number,s2->picture_number);
-                                s2->repeat_first_field = 0;
-                                s2->progressive_frame = 0;
+                                //s2->repeat_first_field = 0;
+                                //s2->progressive_frame = 0;
                                 if (++s->repeat_field > 2)
                                     s->repeat_field = 0;
+                                avctx->repeat_pict = 1;
                             }
+#endif                      
+                            if (s2->repeat_first_field) {
+                                if (s2->progressive_sequence) {
+                                    if (s2->top_field_first)
+                                        avctx->repeat_pict = 4;
+                                    else
+                                        avctx->repeat_pict = 2;
+                                } else if (s2->progressive_frame) {
+                                    avctx->repeat_pict = 1;
+                                }
+                            }         
                             *data_size = sizeof(AVPicture);
                             goto the_end;
                         }
diff --git a/src/libffmpeg/libavcodec/mpeg4data.h b/src/libffmpeg/libavcodec/mpeg4data.h
index 91b99625f..e972a7576 100644
--- a/src/libffmpeg/libavcodec/mpeg4data.h
+++ b/src/libffmpeg/libavcodec/mpeg4data.h
@@ -4,13 +4,20 @@
 #define BIN_ONLY_SHAPE   2
 #define GRAY_SHAPE       3
 
+#define SIMPLE_VO_TYPE 1
+#define CORE_VO_TYPE   3
+
 // aspect_ratio_info
-#define EXTENDET_PAR 15
+#define EXTENDED_PAR 15
 
 //vol_sprite_usage / sprite_enable
 #define STATIC_SPRITE 1
 #define GMC_SPRITE 2
 
+#define MOTION_MARKER 0x1F001
+#define DC_MARKER     0x6B001
+
+
 /* dc encoding for mpeg4 */
 const UINT8 DCtab_lum[13][2] =
 {
@@ -122,3 +129,27 @@ static const UINT16 pixel_aspect[16][2]={
  {0, 0},
  {0, 0},
 };
+
+/* these matrices will be permuted for the idct */
+INT16 ff_mpeg4_default_intra_matrix[64] = {
+  8, 17, 18, 19, 21, 23, 25, 27,
+ 17, 18, 19, 21, 23, 25, 27, 28,
+ 20, 21, 22, 23, 24, 26, 28, 30,
+ 21, 22, 23, 24, 26, 28, 30, 32,
+ 22, 23, 24, 26, 28, 30, 32, 35,
+ 23, 24, 26, 28, 30, 32, 35, 38,
+ 25, 26, 28, 30, 32, 35, 38, 41,
+ 27, 28, 30, 32, 35, 38, 41, 45, 
+};
+
+INT16 ff_mpeg4_default_non_intra_matrix[64] = {
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 18, 19, 20, 21, 22, 23, 24, 25,
+ 19, 20, 21, 22, 23, 24, 26, 27,
+ 20, 21, 22, 23, 25, 26, 27, 28,
+ 21, 22, 23, 24, 26, 27, 28, 30,
+ 22, 23, 24, 26, 27, 28, 30, 31,
+ 23, 24, 25, 27, 28, 30, 31, 33,
+};
+
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 9f572c3d9..63242c9de 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -1,49 +1,42 @@
 /*
  * The simplest mpeg encoder (well, it was the simplest!)
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000,2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
- * 4MV & hq encoding stuff by Michael Niedermayer <michaelni@gmx.at>
+ * 4MV & hq & b-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
 
-#include "config.h"
-#include "xine-utils/xineutils.h"
-
 #ifdef USE_FASTMEMCPY
 #include "fastmemcpy.h"
 #endif
 
 static void encode_picture(MpegEncContext *s, int picture_number);
-static void rate_control_init(MpegEncContext *s);
-static int rate_estimate_qscale(MpegEncContext *s);
 static void dct_unquantize_mpeg1_c(MpegEncContext *s, 
                                    DCTELEM *block, int n, int qscale);
+static void dct_unquantize_mpeg2_c(MpegEncContext *s,
+                                   DCTELEM *block, int n, int qscale);
 static void dct_unquantize_h263_c(MpegEncContext *s, 
                                   DCTELEM *block, int n, int qscale);
 static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w);
-static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale);
+static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
 
-int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale)= dct_quantize_c;
+int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow)= dct_quantize_c;
 void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edges_c;
 
 #define EDGE_WIDTH 16
@@ -53,6 +46,7 @@ void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edg
 
 //#define DEBUG
 
+
 /* for jpeg fast DCT */
 #define CONST_BITS 14
 
@@ -75,37 +69,55 @@ static UINT8 h263_chroma_roundtab[16] = {
 static UINT16 default_mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
 static UINT8 default_fcode_tab[MAX_MV*2+1];
 
-/* default motion estimation */
-int motion_estimation_method = ME_LOG;
-
 extern UINT8 zigzag_end[64];
 
-static void convert_matrix(int *qmat, UINT16 *qmat16, const UINT16 *quant_matrix, int qscale)
+/* default motion estimation */
+int motion_estimation_method = ME_EPZS;
+
+static void convert_matrix(int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64],
+                           const UINT16 *quant_matrix, int bias)
 {
-    int i;
+    int qscale;
 
-    if (av_fdct == jpeg_fdct_ifast) {
-        for(i=0;i<64;i++) {
-            /* 16 <= qscale * quant_matrix[i] <= 7905 */
-            /* 19952         <= aanscales[i] * qscale * quant_matrix[i]           <= 249205026 */
-            /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
-            /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
-            
-            qmat[block_permute_op(i)] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / 
-                            (aanscales[i] * qscale * quant_matrix[block_permute_op(i)]));
-        }
-    } else {
-        for(i=0;i<64;i++) {
-            /* We can safely suppose that 16 <= quant_matrix[i] <= 255
-               So 16           <= qscale * quant_matrix[i]             <= 7905
-               so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905
-               so 32768        >= (1<<19) / (qscale * quant_matrix[i]) >= 67
-            */
-            qmat[i]   = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
-            qmat16[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]);
+    for(qscale=1; qscale<32; qscale++){
+        int i;
+        if (av_fdct == fdct_ifast) {
+            for(i=0;i<64;i++) {
+                const int j= block_permute_op(i);
+                /* 16 <= qscale * quant_matrix[i] <= 7905 */
+                /* 19952         <= aanscales[i] * qscale * quant_matrix[i]           <= 249205026 */
+                /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
+                /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
+                
+                qmat[qscale][j] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / 
+                                (aanscales[i] * qscale * quant_matrix[j]));
+            }
+        } else {
+            for(i=0;i<64;i++) {
+                /* We can safely suppose that 16 <= quant_matrix[i] <= 255
+                   So 16           <= qscale * quant_matrix[i]             <= 7905
+                   so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905
+                   so 32768        >= (1<<19) / (qscale * quant_matrix[i]) >= 67
+                */
+                qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
+                qmat16[qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]);
+
+                if(qmat16[qscale][i]==0 || qmat16[qscale][i]==128*256) qmat16[qscale][i]=128*256-1;
+
+                qmat16_bias[qscale][i]= ROUNDED_DIV(bias<<(16-QUANT_BIAS_SHIFT), qmat16[qscale][i]);
+            }
         }
     }
 }
+// p= av_mallocz(size); on NULL calls perror("malloc") and jumps to the caller's "fail" label (move into common.c perhaps)
+#define CHECKED_ALLOCZ(p, size)\
+{\
+    p= av_mallocz(size);\
+    if(p==NULL){\
+        perror("malloc");\
+        goto fail;\
+    }\
+}
 
 /* init common structure for both encoder and decoder */
 int MPV_common_init(MpegEncContext *s)
@@ -113,14 +125,19 @@ int MPV_common_init(MpegEncContext *s)
     int c_size, i;
     UINT8 *pict;
 
-    if (s->out_format == FMT_H263) 
-        s->dct_unquantize = dct_unquantize_h263_c;
-    else
-        s->dct_unquantize = dct_unquantize_mpeg1_c;
+    s->dct_unquantize_h263 = dct_unquantize_h263_c;
+    s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_c;
+    s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_c;
         
 #ifdef HAVE_MMX
     MPV_common_init_mmx(s);
 #endif
+    //setup default unquantizers (mpeg4 might change it later)
+    if(s->out_format == FMT_H263)
+        s->dct_unquantize = s->dct_unquantize_h263;
+    else
+        s->dct_unquantize = s->dct_unquantize_mpeg1;
+    
     s->mb_width = (s->width + 15) / 16;
     s->mb_height = (s->height + 15) / 16;
     s->mb_num = s->mb_width * s->mb_height;
@@ -135,58 +152,79 @@ int MPV_common_init(MpegEncContext *s)
         c_size = (w >> shift) * (h >> shift);
         pict_start = (w >> shift) * (EDGE_WIDTH >> shift) + (EDGE_WIDTH >> shift);
 
-        pict = av_mallocz(c_size);
-        if (pict == NULL)
-            goto fail;
+        CHECKED_ALLOCZ(pict, c_size)
         s->last_picture_base[i] = pict;
         s->last_picture[i] = pict + pict_start;
+        if(i>0) memset(s->last_picture_base[i], 128, c_size);
     
-        pict = av_mallocz(c_size);
-        if (pict == NULL)
-            goto fail;
+        CHECKED_ALLOCZ(pict, c_size)
         s->next_picture_base[i] = pict;
         s->next_picture[i] = pict + pict_start;
-
-        if (s->has_b_frames) {
-            pict = av_mallocz(c_size);
-            if (pict == NULL) 
-                goto fail;
+        if(i>0) memset(s->next_picture_base[i], 128, c_size);
+        
+        if (s->has_b_frames || s->codec_id==CODEC_ID_MPEG4) {
+        /* Note: the MPEG4 check is here because of buggy encoders which do not set the low_delay flag but 
+           do low-delay encoding, so we cannot always distinguish b-frame containing streams from low_delay streams */
+            CHECKED_ALLOCZ(pict, c_size)
             s->aux_picture_base[i] = pict;
             s->aux_picture[i] = pict + pict_start;
+            if(i>0) memset(s->aux_picture_base[i], 128, c_size);
         }
     }
     
     if (s->encoding) {
-        /* Allocate MB type table */
-        s->mb_type = av_mallocz(s->mb_num * sizeof(char));
-        if (s->mb_type == NULL) {
-            perror("malloc");
-            goto fail;
-        }
+        int j;
+        int mv_table_size= (s->mb_width+2)*(s->mb_height+2);
+        
+        CHECKED_ALLOCZ(s->mb_var   , s->mb_num * sizeof(INT16))
+        CHECKED_ALLOCZ(s->mc_mb_var, s->mb_num * sizeof(INT16))
+
+        /* Allocate MV tables */
+        CHECKED_ALLOCZ(s->p_mv_table            , mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_forw_mv_table       , mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_back_mv_table       , mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_bidir_forw_mv_table , mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_bidir_back_mv_table , mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_direct_forw_mv_table, mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_direct_back_mv_table, mv_table_size * 2 * sizeof(INT16))
+        CHECKED_ALLOCZ(s->b_direct_mv_table     , mv_table_size * 2 * sizeof(INT16))
+
+        CHECKED_ALLOCZ(s->me_scratchpad,  s->linesize*16*3*sizeof(uint8_t))
         
-        s->mb_var = av_mallocz(s->mb_num * sizeof(INT16));
-        if (s->mb_var == NULL) {
-            perror("malloc");
-            goto fail;
+        CHECKED_ALLOCZ(s->me_map      , ME_MAP_SIZE*sizeof(uint32_t))
+        CHECKED_ALLOCZ(s->me_score_map, ME_MAP_SIZE*sizeof(uint16_t))
+
+        if(s->max_b_frames){
+            for(j=0; j<REORDER_BUFFER_SIZE; j++){
+                int i;
+                for(i=0;i<3;i++) {
+                    int w, h, shift;
+
+                    w = s->linesize;
+                    h = s->mb_height * 16;
+                    shift = (i == 0) ? 0 : 1;
+                    c_size = (w >> shift) * (h >> shift);
+
+                    CHECKED_ALLOCZ(pict, c_size);
+                    s->picture_buffer[j][i] = pict;
+                }
+            }
         }
-        /* Allocate MV table */
-        /* By now we just have one MV per MB */
-        s->mv_table[0] = av_mallocz(s->mb_num * sizeof(INT16));
-        s->mv_table[1] = av_mallocz(s->mb_num * sizeof(INT16));
-        if (s->mv_table[1] == NULL || s->mv_table[0] == NULL) {
-            perror("malloc");
-            goto fail;
+
+        if(s->codec_id==CODEC_ID_MPEG4){
+            CHECKED_ALLOCZ(s->tex_pb_buffer, PB_BUFFER_SIZE);
+            CHECKED_ALLOCZ(   s->pb2_buffer, PB_BUFFER_SIZE);
         }
     }
     
     if (s->out_format == FMT_H263 || s->encoding) {
         int size;
+        /* Allocate MB type table */
+        CHECKED_ALLOCZ(s->mb_type  , s->mb_num * sizeof(UINT8))
+
         /* MV prediction */
         size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
-        s->motion_val = malloc(size * 2 * sizeof(INT16));
-        if (s->motion_val == NULL)
-            goto fail;
-        memset(s->motion_val, 0, size * 2 * sizeof(INT16));
+        CHECKED_ALLOCZ(s->motion_val, size * 2 * sizeof(INT16));
     }
 
     if (s->h263_pred || s->h263_plus) {
@@ -197,43 +235,40 @@ int MPV_common_init(MpegEncContext *s)
         y_size = (2 * s->mb_width + 2) * (2 * s->mb_height + 2);
         c_size = (s->mb_width + 2) * (s->mb_height + 2);
         size = y_size + 2 * c_size;
-        s->dc_val[0] = malloc(size * sizeof(INT16));
-        if (s->dc_val[0] == NULL)
-            goto fail;
+        CHECKED_ALLOCZ(s->dc_val[0], size * sizeof(INT16));
         s->dc_val[1] = s->dc_val[0] + y_size;
         s->dc_val[2] = s->dc_val[1] + c_size;
         for(i=0;i<size;i++)
             s->dc_val[0][i] = 1024;
 
         /* ac values */
-        s->ac_val[0] = av_mallocz(size * sizeof(INT16) * 16);
-        if (s->ac_val[0] == NULL)
-            goto fail;
+        CHECKED_ALLOCZ(s->ac_val[0], size * sizeof(INT16) * 16);
         s->ac_val[1] = s->ac_val[0] + y_size;
         s->ac_val[2] = s->ac_val[1] + c_size;
         
         /* cbp values */
-        s->coded_block = av_mallocz(y_size);
-        if (!s->coded_block)
-            goto fail;
+        CHECKED_ALLOCZ(s->coded_block, y_size);
 
         /* which mb is a intra block */
-        s->mbintra_table = av_mallocz(s->mb_num);
-        if (!s->mbintra_table)
-            goto fail;
+        CHECKED_ALLOCZ(s->mbintra_table, s->mb_num);
         memset(s->mbintra_table, 1, s->mb_num);
+        
+        /* divx501 bitstream reorder buffer */
+        CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE);
+        
+        /* cbp, ac_pred, pred_dir */
+        CHECKED_ALLOCZ(s->cbp_table  , s->mb_num * sizeof(UINT8))
+        CHECKED_ALLOCZ(s->pred_dir_table, s->mb_num * sizeof(UINT8))
+        
+        CHECKED_ALLOCZ(s->qscale_table  , s->mb_num * sizeof(UINT8))
     }
     /* default structure is frame */
     s->picture_structure = PICT_FRAME;
 
     /* init macroblock skip table */
-    if (!s->encoding) {
-        s->mbskip_table = av_mallocz(s->mb_num);
-        if (!s->mbskip_table)
-            goto fail;
-    }
+    CHECKED_ALLOCZ(s->mbskip_table, s->mb_num);
     
-    s->block= s->intra_block;
+    s->block= s->blocks[0];
 
     s->context_initialized = 1;
     return 0;
@@ -242,39 +277,49 @@ int MPV_common_init(MpegEncContext *s)
     return -1;
 }
 
+
+//extern int sads;
+
 /* init common structure for both encoder and decoder */
 void MPV_common_end(MpegEncContext *s)
 {
     int i;
 
-    if (s->mb_type)
-        free(s->mb_type);
-    if (s->mb_var)
-        free(s->mb_var);
-    if (s->mv_table[0])
-        free(s->mv_table[0]);
-    if (s->mv_table[1])
-        free(s->mv_table[1]);
-    if (s->motion_val)
-        free(s->motion_val);
-    if (s->dc_val[0])
-        free(s->dc_val[0]);
-    if (s->ac_val[0])
-        free(s->ac_val[0]);
-    if (s->coded_block)
-        free(s->coded_block);
-    if (s->mbintra_table)
-        free(s->mbintra_table);
-
-    if (s->mbskip_table)
-        free(s->mbskip_table);
+    av_freep(&s->mb_type);
+    av_freep(&s->mb_var);
+    av_freep(&s->mc_mb_var);
+    av_freep(&s->p_mv_table);
+    av_freep(&s->b_forw_mv_table);
+    av_freep(&s->b_back_mv_table);
+    av_freep(&s->b_bidir_forw_mv_table);
+    av_freep(&s->b_bidir_back_mv_table);
+    av_freep(&s->b_direct_forw_mv_table);
+    av_freep(&s->b_direct_back_mv_table);
+    av_freep(&s->b_direct_mv_table);
+    av_freep(&s->motion_val);
+    av_freep(&s->dc_val[0]); /* dc_val[1] and dc_val[2] point into this allocation */
+    av_freep(&s->ac_val[0]); /* ac_val[1] and ac_val[2] point into this allocation */
+    av_freep(&s->coded_block);
+    av_freep(&s->mbintra_table);
+    av_freep(&s->cbp_table);
+    av_freep(&s->pred_dir_table);
+    av_freep(&s->qscale_table);
+    av_freep(&s->me_scratchpad);
+    av_freep(&s->me_map);
+    av_freep(&s->me_score_map);
+    
+    av_freep(&s->mbskip_table);
+    av_freep(&s->bitstream_buffer);
+    av_freep(&s->tex_pb_buffer);
+    av_freep(&s->pb2_buffer);
     for(i=0;i<3;i++) {
-        if (s->last_picture_base[i])
-	    free(s->last_picture_base[i]);
-	if (s->next_picture_base[i])
-	    free(s->next_picture_base[i]);
-        if (s->has_b_frames)
-            free(s->aux_picture_base[i]);
+        int j;
+        av_freep(&s->last_picture_base[i]);
+        av_freep(&s->next_picture_base[i]);
+        av_freep(&s->aux_picture_base[i]); /* now freed unconditionally (was guarded by has_b_frames) */
+        for(j=0; j<REORDER_BUFFER_SIZE; j++){ /* input reorder buffers, only allocated when max_b_frames is set */
+            av_freep(&s->picture_buffer[j][i]);
+        }
     }
     s->context_initialized = 0;
 }
@@ -285,11 +330,17 @@ int MPV_encode_init(AVCodecContext *avctx)
     MpegEncContext *s = avctx->priv_data;
     int i;
 
+    avctx->pix_fmt = PIX_FMT_YUV420P;
+
     s->bit_rate = avctx->bit_rate;
     s->bit_rate_tolerance = avctx->bit_rate_tolerance;
     s->frame_rate = avctx->frame_rate;
     s->width = avctx->width;
     s->height = avctx->height;
+    if(avctx->gop_size > 600){
+        fprintf(stderr, "Warning keyframe interval too large! reducing it ...\n");
+        avctx->gop_size=600;
+    }
     s->gop_size = avctx->gop_size;
     s->rtp_mode = avctx->rtp_mode;
     s->rtp_payload_size = avctx->rtp_payload_size;
@@ -300,36 +351,56 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->max_qdiff= avctx->max_qdiff;
     s->qcompress= avctx->qcompress;
     s->qblur= avctx->qblur;
+    s->b_quant_factor= avctx->b_quant_factor;
+    s->b_quant_offset= avctx->b_quant_offset;
     s->avctx = avctx;
     s->aspect_ratio_info= avctx->aspect_ratio_info;
     s->flags= avctx->flags;
-    
+    s->max_b_frames= avctx->max_b_frames;
+    s->rc_strategy= avctx->rc_strategy;
+    s->b_frame_strategy= avctx->b_frame_strategy;
+    s->codec_id= avctx->codec->id;
+    s->luma_elim_threshold  = avctx->luma_elim_threshold;
+    s->chroma_elim_threshold= avctx->chroma_elim_threshold;
+    s->strict_std_compliance= avctx->strict_std_compliance;
+    s->data_partitioning= avctx->flags & CODEC_FLAG_PART;
+
     if (s->gop_size <= 1) {
         s->intra_only = 1;
         s->gop_size = 12;
     } else {
         s->intra_only = 0;
     }
-    s->full_search = motion_estimation_method;
-
+    
+    /* ME algorithm */
+    if (avctx->me_method == 0)
+        /* For compatibility */
+        s->me_method = motion_estimation_method;
+    else
+        s->me_method = avctx->me_method;
+        
+    /* Fixed QSCALE */
     s->fixed_qscale = (avctx->flags & CODEC_FLAG_QSCALE);
     
     switch(avctx->codec->id) {
     case CODEC_ID_MPEG1VIDEO:
         s->out_format = FMT_MPEG1;
+        avctx->delay=0; //FIXME not sure, should check the spec
         break;
     case CODEC_ID_MJPEG:
         s->out_format = FMT_MJPEG;
         s->intra_only = 1; /* force intra only for jpeg */
         s->mjpeg_write_tables = 1; /* write all tables */
+	s->mjpeg_data_only_frames = 0; /* write all the needed headers */
         s->mjpeg_vsample[0] = 2; /* set up default sampling factors */
         s->mjpeg_vsample[1] = 1; /* the only currently supported values */
         s->mjpeg_vsample[2] = 1; 
-        s->mjpeg_hsample[0] = 2; 
+        s->mjpeg_hsample[0] = 2;
         s->mjpeg_hsample[1] = 1; 
         s->mjpeg_hsample[2] = 1; 
         if (mjpeg_init(s) < 0)
             return -1;
+        avctx->delay=0;
         break;
     case CODEC_ID_H263:
         if (h263_get_picture_format(s->width, s->height) == 7) {
@@ -337,6 +408,7 @@ int MPV_encode_init(AVCodecContext *avctx)
             return -1;
         }
         s->out_format = FMT_H263;
+        avctx->delay=0;
         break;
     case CODEC_ID_H263P:
         s->out_format = FMT_H263;
@@ -344,19 +416,25 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->rtp_payload_size = 1200; 
         s->h263_plus = 1;
         s->unrestricted_mv = 1;
+        s->h263_aic = 1;
         
         /* These are just to be sure */
         s->umvplus = 0;
         s->umvplus_dec = 0;
+        avctx->delay=0;
         break;
     case CODEC_ID_RV10:
         s->out_format = FMT_H263;
         s->h263_rv10 = 1;
+        avctx->delay=0;
         break;
     case CODEC_ID_MPEG4:
         s->out_format = FMT_H263;
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
+        s->has_b_frames= s->max_b_frames ? 1 : 0;
+        s->low_delay=0;
+        avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1); 
         break;
     case CODEC_ID_MSMPEG4V1:
         s->out_format = FMT_H263;
@@ -364,6 +442,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 1;
+        avctx->delay=0;
         break;
     case CODEC_ID_MSMPEG4V2:
         s->out_format = FMT_H263;
@@ -371,6 +450,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 2;
+        avctx->delay=0;
         break;
     case CODEC_ID_MSMPEG4V3:
         s->out_format = FMT_H263;
@@ -378,16 +458,12 @@ int MPV_encode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->unrestricted_mv = 1;
         s->msmpeg4_version= 3;
+        avctx->delay=0;
         break;
     default:
         return -1;
     }
     
-    if((s->flags&CODEC_FLAG_4MV) && !(s->flags&CODEC_FLAG_HQ)){
-        printf("4MV is currently only supported in HQ mode\n");
-        return -1;
-    }
-
     { /* set up some save defaults, some codecs might override them later */
         static int done=0;
         if(!done){
@@ -410,7 +486,7 @@ int MPV_encode_init(AVCodecContext *avctx)
         mpeg1_encode_init(s);
 
     /* dont use mv_penalty table for crap MV as it would be confused */
-    if(s->full_search<4) s->mv_penalty= default_mv_penalty;
+    if (s->me_method < ME_EPZS) s->mv_penalty = default_mv_penalty;
 
     s->encoding = 1;
 
@@ -420,18 +496,32 @@ int MPV_encode_init(AVCodecContext *avctx)
     
     /* init default q matrix */
     for(i=0;i<64;i++) {
-        s->intra_matrix[i] = default_intra_matrix[i];
-        s->non_intra_matrix[i] = default_non_intra_matrix[i];
+        if(s->out_format == FMT_H263)
+            s->intra_matrix[i] = default_non_intra_matrix[i];
+        else
+            s->intra_matrix[i] = default_intra_matrix[i];
+
+        s->inter_matrix[i] = default_non_intra_matrix[i];
     }
 
-    /* rate control init */
-    rate_control_init(s);
+    /* precompute matrix */
+    /* for mjpeg, we do include qscale in the matrix */
+    if (s->out_format != FMT_MJPEG) {
+        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->q_intra_matrix16_bias, 
+                       s->intra_matrix, s->intra_quant_bias);
+        convert_matrix(s->q_inter_matrix, s->q_inter_matrix16, s->q_inter_matrix16_bias, 
+                       s->inter_matrix, s->inter_quant_bias);
+    }
+
+    if(ff_rate_control_init(s) < 0)
+        return -1;
 
     s->picture_number = 0;
     s->picture_in_gop_number = 0;
     s->fake_picture_number = 0;
     /* motion detector init */
     s->f_code = 1;
+    s->b_code = 1;
 
     return 0;
 }
@@ -443,6 +533,9 @@ int MPV_encode_end(AVCodecContext *avctx)
 #ifdef STATS
     print_stats();
 #endif
+
+    ff_rate_control_uninit(s);
+
     MPV_common_end(s);
     if (s->out_format == FMT_MJPEG)
         mjpeg_close(s);
@@ -485,12 +578,13 @@ void MPV_frame_start(MpegEncContext *s)
     UINT8 *tmp;
 
     s->mb_skiped = 0;
+    s->decoding_error=0;
+
     if (s->pict_type == B_TYPE) {
         for(i=0;i<3;i++) {
             s->current_picture[i] = s->aux_picture[i];
         }
     } else {
-        s->last_non_b_pict_type= s->pict_type;
         for(i=0;i<3;i++) {
             /* swap next and last */
             tmp = s->last_picture[i];
@@ -504,9 +598,11 @@ void MPV_frame_start(MpegEncContext *s)
 /* generic function for encode/decode called after a frame has been coded/decoded */
 void MPV_frame_end(MpegEncContext *s)
 {
+//    if((s->picture_number%100)==0 && s->encoding) printf("sads:%d //\n", sads);
+
     /* draw edge for correct motion prediction if outside */
     if (s->pict_type != B_TYPE && !s->intra_only) {
-      if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4 || s->divx_version==500){
+      if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4 || s->divx_version>=500){
         draw_edges(s->current_picture[0], s->linesize, s->mb_width*16, s->mb_height*16, EDGE_WIDTH);
         draw_edges(s->current_picture[1], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2);
         draw_edges(s->current_picture[2], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2);
@@ -518,107 +614,195 @@ void MPV_frame_end(MpegEncContext *s)
       }
     }
     emms_c();
+    
+    if(s->pict_type!=B_TYPE){
+        s->last_non_b_pict_type= s->pict_type;
+        s->last_non_b_qscale= s->qscale;
+        s->last_non_b_mc_mb_var= s->mc_mb_var_sum;
+        s->num_available_buffers++;
+        if(s->num_available_buffers>2) s->num_available_buffers= 2;
+    }
 }
 
-int MPV_encode_picture(AVCodecContext *avctx,
-                       unsigned char *buf, int buf_size, void *data)
+/* reorder input for encoding: queues pict in s->coded_order[] and points s->new_picture[] at the head of that queue */
+void reorder_input(MpegEncContext *s, AVPicture *pict)
 {
-    MpegEncContext *s = avctx->priv_data;
-    AVPicture *pict = data;
-    int i, j;
+    int i, j, index;
+            
+    if(s->max_b_frames > FF_MAX_B_FRAMES) s->max_b_frames= FF_MAX_B_FRAMES;
 
-    if (s->fixed_qscale) 
-        s->qscale = avctx->quality;
+//        delay= s->max_b_frames+1; (or 0 if no b frames cuz decoder diff)
 
-    init_put_bits(&s->pb, buf, buf_size, NULL, NULL);
-
-    s->force_type= (avctx->flags&CODEC_FLAG_TYPE) ?
-	(avctx->key_frame ? I_TYPE : P_TYPE) : 0;
-    if (!s->intra_only) {
-        /* first picture of GOP is intra */
-        if (s->picture_in_gop_number % s->gop_size==0 || s->force_type==I_TYPE){
-            s->picture_in_gop_number=0;
-            s->pict_type = I_TYPE;
-        }else
-            s->pict_type = P_TYPE;
-    } else {
-        s->pict_type = I_TYPE;
+    for(j=0; j<REORDER_BUFFER_SIZE-1; j++){ /* shift the queue towards index 0 by one entry */
+        s->coded_order[j]= s->coded_order[j+1];
     }
-    
-    MPV_frame_start(s);
-    
-    for(i=0;i<3;i++) {
-        UINT8 *src = pict->data[i];
-        UINT8 *dest = s->current_picture[i];
-        int src_wrap = pict->linesize[i];
-        int dest_wrap = s->linesize;
-        int w = s->width;
-        int h = s->height;
-
-        if (i >= 1) {
-            dest_wrap >>= 1;
-            w >>= 1;
-            h >>= 1;
+    s->coded_order[j].picture[0]= s->coded_order[j].picture[1]= s->coded_order[j].picture[2]= NULL; //catch uninitialized buffers
+    s->coded_order[j].pict_type=0;
+
+    switch(s->input_pict_type){
+    default: 
+    case I_TYPE:
+    case S_TYPE:
+    case P_TYPE:
+        index= s->max_b_frames - s->b_frames_since_non_b; /* lands ahead of the B pictures queued since the last non-B picture */
+        s->b_frames_since_non_b=0;
+        break;            
+    case B_TYPE:
+        index= s->max_b_frames + 1;
+        s->b_frames_since_non_b++;
+        break;          
+    }
+//printf("index:%d type:%d strides: %d %d\n", index, s->input_pict_type, pict->linesize[0], s->linesize);
+    if(   (index==0 || (s->flags&CODEC_FLAG_INPUT_PRESERVED))
+       && pict->linesize[0] == s->linesize
+       && pict->linesize[1] == s->linesize>>1
+       && pict->linesize[2] == s->linesize>>1){
+//printf("ptr\n");
+        for(i=0; i<3; i++){
+            s->coded_order[index].picture[i]= pict->data[i];
         }
+    }else{
+//printf("copy\n");
+        for(i=0; i<3; i++){
+            uint8_t *src = pict->data[i];
+            uint8_t *dest;
+            int src_wrap = pict->linesize[i];
+            int dest_wrap = s->linesize;
+            int w = s->width;
+            int h = s->height;
+
+            if(index==0) dest= s->last_picture[i]+16; //is current_picture indeed but the switch happens after reordering
+            else         dest= s->picture_buffer[s->picture_buffer_index][i];
+
+            if (i >= 1) {
+                dest_wrap >>= 1;
+                w >>= 1;
+                h >>= 1;
+            }
 
-        if(dest_wrap==src_wrap){
-            s->new_picture[i] = pict->data[i];
-        } else {
+            s->coded_order[index].picture[i]= dest;
             for(j=0;j<h;j++) {
                 memcpy(dest, src, w);
                 dest += dest_wrap;
                 src += src_wrap;
             }
-            s->new_picture[i] = s->current_picture[i];
-	    }
+        }
+        if(index!=0){
+            s->picture_buffer_index++;
+            if(s->picture_buffer_index >= REORDER_BUFFER_SIZE-1) s->picture_buffer_index=0;
+        }
+    }
+    s->coded_order[index].pict_type = s->input_pict_type;
+    s->coded_order[index].qscale    = s->input_qscale;
+    s->coded_order[index].force_type= s->force_input_type;
+    s->coded_order[index].picture_in_gop_number= s->input_picture_in_gop_number;
+    s->coded_order[index].picture_number= s->input_picture_number;
+
+    for(i=0; i<3; i++){
+        s->new_picture[i]= s->coded_order[0].picture[i];
     }
+}
+
+int MPV_encode_picture(AVCodecContext *avctx,
+                       unsigned char *buf, int buf_size, void *data)
+{
+    MpegEncContext *s = avctx->priv_data;
+    AVPicture *pict = data;
 
-    encode_picture(s, s->picture_number);
-    avctx->key_frame = (s->pict_type == I_TYPE);
-    avctx->header_bits = s->header_bits;
-    avctx->mv_bits     = s->mv_bits;
-    avctx->misc_bits   = s->misc_bits;
-    avctx->i_tex_bits  = s->i_tex_bits;
-    avctx->p_tex_bits  = s->p_tex_bits;
-    avctx->i_count     = s->i_count;
-    avctx->p_count     = s->p_count;
-    avctx->skip_count  = s->skip_count;
+    s->input_qscale = avctx->quality;
 
-    MPV_frame_end(s);
-    s->picture_number++;
-    s->picture_in_gop_number++;
+    init_put_bits(&s->pb, buf, buf_size, NULL, NULL);
 
-    if (s->out_format == FMT_MJPEG)
-        mjpeg_picture_trailer(s);
+    if(avctx->flags&CODEC_FLAG_TYPE){ /* caller forces the type: I for key frames, P otherwise */
+        s->input_pict_type=
+        s->force_input_type= avctx->key_frame ? I_TYPE : P_TYPE;
+    }else if(s->flags&CODEC_FLAG_PASS2){ /* type comes from the rate control entry of this input picture */
+        s->input_pict_type=
+        s->force_input_type= s->rc_context.entry[s->input_picture_number].new_pict_type;
+    }else{
+        s->force_input_type=0;
+        if (!s->intra_only) {
+            /* first picture of GOP is intra */
+            if (s->input_picture_in_gop_number % s->gop_size==0){
+                s->input_pict_type = I_TYPE;
+            }else if(s->max_b_frames==0){
+                s->input_pict_type = P_TYPE;
+            }else{
+                if(s->b_frames_since_non_b < s->max_b_frames) //FIXME more IQ
+                    s->input_pict_type = B_TYPE;
+                else
+                    s->input_pict_type = P_TYPE;
+            }
+        } else {
+            s->input_pict_type = I_TYPE;
+        }
+    }
+
+    if(s->input_pict_type==I_TYPE)
+        s->input_picture_in_gop_number=0;
+    
+    reorder_input(s, pict);
+    
+    /* output? (only when the head of the reorder queue holds a picture) */
+    if(s->coded_order[0].picture[0]){
+
+        s->pict_type= s->coded_order[0].pict_type;
+        if (s->fixed_qscale) /* the ratecontrol needs the last qscale so we don't touch it for CBR */
+            s->qscale= s->coded_order[0].qscale;
+        s->force_type= s->coded_order[0].force_type;
+        s->picture_in_gop_number= s->coded_order[0].picture_in_gop_number;
+        s->picture_number= s->coded_order[0].picture_number;
+
+        MPV_frame_start(s);
+
+        encode_picture(s, s->picture_number);
+        avctx->key_frame   = (s->pict_type == I_TYPE);
+        avctx->pict_type   = s->pict_type;
+        avctx->real_pict_num  = s->picture_number;
+        avctx->header_bits = s->header_bits;
+        avctx->mv_bits     = s->mv_bits;
+        avctx->misc_bits   = s->misc_bits;
+        avctx->i_tex_bits  = s->i_tex_bits;
+        avctx->p_tex_bits  = s->p_tex_bits;
+        avctx->i_count     = s->i_count;
+        avctx->p_count     = s->p_count;
+        avctx->skip_count  = s->skip_count;
+
+        MPV_frame_end(s);
+
+        if (s->out_format == FMT_MJPEG)
+            mjpeg_picture_trailer(s);
+
+        avctx->quality = s->qscale;
+        
+        if(s->flags&CODEC_FLAG_PASS1)
+            ff_write_pass1_stats(s);
+    
+    }
+
+    s->input_picture_number++;
+    s->input_picture_in_gop_number++;
 
     flush_put_bits(&s->pb);
-    s->last_frame_bits= s->frame_bits;
     s->frame_bits  = (pbBufPtr(&s->pb) - s->pb.buf) * 8;
+    if(s->pict_type==B_TYPE) s->pb_frame_bits+= s->frame_bits;
+    else                     s->pb_frame_bits= s->frame_bits;
+
     s->total_bits += s->frame_bits;
     avctx->frame_bits  = s->frame_bits;
 //printf("fcode: %d, type: %d, head: %d, mv: %d, misc: %d, frame: %d, itex: %d, ptex: %d\n", 
 //s->f_code, avctx->key_frame, s->header_bits, s->mv_bits, s->misc_bits, s->frame_bits, s->i_tex_bits, s->p_tex_bits);
 
-    avctx->quality = s->qscale;
     if (avctx->get_psnr) {
         /* At this point pict->data should have the original frame   */
         /* an s->current_picture should have the coded/decoded frame */
         get_psnr(pict->data, s->current_picture,
                  pict->linesize, s->linesize, avctx);
+//        printf("%f\n", avctx->psnr_y);
     }
     return pbBufPtr(&s->pb) - s->pb.buf;
 }
 
-static inline int clip(int a, int amin, int amax)
-{
-    if (a < amin)
-        return amin;
-    else if (a > amax)
-        return amax;
-    else
-        return a;
-}
-
 static inline void gmc1_motion(MpegEncContext *s,
                                UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr,
                                int dest_offset,
@@ -626,7 +810,7 @@ static inline void gmc1_motion(MpegEncContext *s,
                                int h)
 {
     UINT8 *ptr;
-    int dxy, offset, mx, my, src_x, src_y, height, linesize;
+    int offset, src_x, src_y, linesize;
     int motion_x, motion_y;
 
     if(s->real_sprite_warping_points>1) printf("more than 1 warp point isnt supported\n");
@@ -705,6 +889,8 @@ if(s->quarter_sample)
     pix_op[dxy](dest_y, ptr, linesize, h);
     pix_op[dxy](dest_y + 8, ptr + 8, linesize, h);
 
+    if(s->flags&CODEC_FLAG_GRAY) return;
+
     if (s->out_format == FMT_H263) {
         dxy = 0;
         if ((motion_x & 3) != 0)
@@ -768,6 +954,8 @@ static inline void qpel_motion(MpegEncContext *s,
     qpix_op[dxy](dest_y + linesize*8    , ptr + linesize*8    , linesize, linesize, motion_x&3, motion_y&3);
     qpix_op[dxy](dest_y + linesize*8 + 8, ptr + linesize*8 + 8, linesize, linesize, motion_x&3, motion_y&3);
     
+    if(s->flags&CODEC_FLAG_GRAY) return;
+
     mx= (motion_x>>1) | (motion_x&1);
     my= (motion_y>>1) | (motion_y&1);
 
@@ -856,6 +1044,8 @@ static inline void MPV_motion(MpegEncContext *s,
             dest = dest_y + ((i & 1) * 8) + (i >> 1) * 8 * s->linesize;
             pix_op[dxy](dest, ptr, s->linesize, 8);
         }
+    
+        if(s->flags&CODEC_FLAG_GRAY) break;
         /* In case of 8X8, we construct a single chroma motion vector
            with a special rounding */
         mx = 0;
@@ -922,8 +1112,7 @@ static inline void put_dct(MpegEncContext *s,
 {
     if (!s->mpeg2)
         s->dct_unquantize(s, block, i, s->qscale);
-    ff_idct (block);
-    put_pixels_clamped(block, dest, line_size);
+    ff_idct_put (dest, line_size, block);
 }
 
 /* add block[] to dest[] */
@@ -931,14 +1120,53 @@ static inline void add_dct(MpegEncContext *s,
                            DCTELEM *block, int i, UINT8 *dest, int line_size)
 {
     if (s->block_last_index[i] >= 0) {
-        if (!s->mpeg2)
-            if(s->encoding || (!s->h263_msmpeg4))
-                s->dct_unquantize(s, block, i, s->qscale);
-        ff_idct (block);
-        add_pixels_clamped(block, dest, line_size);
+        ff_idct_add (dest, line_size, block);
     }
 }
 
+static inline void add_dequant_dct(MpegEncContext *s, 
+                           DCTELEM *block, int i, UINT8 *dest, int line_size)
+{   /* like add_dct(), but dequantizes block[] with s->qscale before the idct */
+    if (s->block_last_index[i] >= 0) { /* nothing is added when block i has a negative last index */
+        s->dct_unquantize(s, block, i, s->qscale);
+        /* idct the dequantized block[] and add it to dest[] */
+        ff_idct_add (dest, line_size, block);
+    }
+}
+
+/**
+ * resets the dc, ac and coded_block entries of the current non intra MB and clears its mbintra_table flag
+ */
+void ff_clean_intra_table_entries(MpegEncContext *s)
+{
+    int wrap = s->block_wrap[0];
+    int xy = s->block_index[0];
+    /* luma: the 2x2 group of entries at xy, xy+1, xy+wrap and xy+1+wrap */
+    s->dc_val[0][xy           ] = 
+    s->dc_val[0][xy + 1       ] = 
+    s->dc_val[0][xy     + wrap] =
+    s->dc_val[0][xy + 1 + wrap] = 1024; /* same value MPV_common_init fills dc_val with */
+    /* ac pred: each memset of 32 INT16 replaces the former two 16-entry clears of xy and xy+1 */
+    memset(s->ac_val[0][xy       ], 0, 32 * sizeof(INT16));
+    memset(s->ac_val[0][xy + wrap], 0, 32 * sizeof(INT16));
+    if (s->msmpeg4_version>=3) {
+        s->coded_block[xy           ] =
+        s->coded_block[xy + 1       ] =
+        s->coded_block[xy     + wrap] =
+        s->coded_block[xy + 1 + wrap] = 0;
+    }
+    /* chroma: one entry per plane, indexed with the chroma wrap */
+    wrap = s->block_wrap[4];
+    xy = s->mb_x + 1 + (s->mb_y + 1) * wrap;
+    s->dc_val[1][xy] =
+    s->dc_val[2][xy] = 1024;
+    /* ac pred */
+    memset(s->ac_val[1][xy], 0, 16 * sizeof(INT16));
+    memset(s->ac_val[2][xy], 0, 16 * sizeof(INT16));
+    /* clear the flag so MPV_decode_mb does not call us again for this MB until it is intra again */
+    s->mbintra_table[s->mb_x + s->mb_y*s->mb_width]= 0;
+}
+
 /* generic function called after a macroblock has been parsed by the
    decoder or after it has been encoded by the encoder.
 
@@ -952,69 +1180,39 @@ static inline void add_dct(MpegEncContext *s,
 void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
 {
     int mb_x, mb_y;
-    int dct_linesize, dct_offset;
-    op_pixels_func *op_pix;
-    qpel_mc_func *op_qpix;
+    const int mb_xy = s->mb_y * s->mb_width + s->mb_x;
 
     mb_x = s->mb_x;
     mb_y = s->mb_y;
 
 #ifdef FF_POSTPROCESS
+    /* Obsolete. Exists for compatibility with mplayer only. */
     quant_store[mb_y][mb_x]=s->qscale;
     //printf("[%02d][%02d] %d\n",mb_x,mb_y,s->qscale);
+#else
+    if(s->avctx->quant_store) s->avctx->quant_store[mb_y*s->avctx->qstride+mb_x] = s->qscale;
 #endif
 
     /* update DC predictors for P macroblocks */
     if (!s->mb_intra) {
         if (s->h263_pred || s->h263_aic) {
-          if(s->mbintra_table[mb_x + mb_y*s->mb_width])
-          {
-            int wrap, xy, v;
-            s->mbintra_table[mb_x + mb_y*s->mb_width]=0;
-            wrap = 2 * s->mb_width + 2;
-            xy = 2 * mb_x + 1 +  (2 * mb_y + 1) * wrap;
-            v = 1024;
-            
-            s->dc_val[0][xy] = v;
-            s->dc_val[0][xy + 1] = v;
-            s->dc_val[0][xy + wrap] = v;
-            s->dc_val[0][xy + 1 + wrap] = v;
-            /* ac pred */
-            memset(s->ac_val[0][xy], 0, 16 * sizeof(INT16));
-            memset(s->ac_val[0][xy + 1], 0, 16 * sizeof(INT16));
-            memset(s->ac_val[0][xy + wrap], 0, 16 * sizeof(INT16));
-            memset(s->ac_val[0][xy + 1 + wrap], 0, 16 * sizeof(INT16));
-            if (s->h263_msmpeg4) {
-                s->coded_block[xy] = 0;
-                s->coded_block[xy + 1] = 0;
-                s->coded_block[xy + wrap] = 0;
-                s->coded_block[xy + 1 + wrap] = 0;
-            }
-            /* chroma */
-            wrap = s->mb_width + 2;
-            xy = mb_x + 1 + (mb_y + 1) * wrap;
-            s->dc_val[1][xy] = v;
-            s->dc_val[2][xy] = v;
-            /* ac pred */
-            memset(s->ac_val[1][xy], 0, 16 * sizeof(INT16));
-            memset(s->ac_val[2][xy], 0, 16 * sizeof(INT16));
-          }
+            if(s->mbintra_table[mb_xy])
+                ff_clean_intra_table_entries(s);
         } else {
-            s->last_dc[0] = 128 << s->intra_dc_precision;
-            s->last_dc[1] = 128 << s->intra_dc_precision;
+            s->last_dc[0] =
+            s->last_dc[1] =
             s->last_dc[2] = 128 << s->intra_dc_precision;
         }
     }
     else if (s->h263_pred || s->h263_aic)
-        s->mbintra_table[mb_x + mb_y*s->mb_width]=1;
+        s->mbintra_table[mb_xy]=1;
 
     /* update motion predictor, not for B-frames as they need the motion_val from the last P/S-Frame */
-    if (s->out_format == FMT_H263) {
-      if(s->pict_type!=B_TYPE){
-        int xy, wrap, motion_x, motion_y;
+    if (s->out_format == FMT_H263 && s->pict_type!=B_TYPE) { //FIXME move into h263.c if possible, format specific stuff shouldnt be here
+        int motion_x, motion_y;
         
-        wrap = 2 * s->mb_width + 2;
-        xy = 2 * mb_x + 1 + (2 * mb_y + 1) * wrap;
+        const int wrap = s->block_wrap[0];
+        const int xy = s->block_index[0];
         if (s->mb_intra) {
             motion_x = 0;
             motion_y = 0;
@@ -1033,20 +1231,23 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             s->motion_val[xy + 1 + wrap][0] = motion_x;
             s->motion_val[xy + 1 + wrap][1] = motion_y;
         }
-      }
     }
     
-    if (!s->intra_only) {
+    if (!(s->encoding && (s->intra_only || s->pict_type==B_TYPE))) {
         UINT8 *dest_y, *dest_cb, *dest_cr;
-        UINT8 *mbskip_ptr;
-
-        /* avoid copy if macroblock skipped in last frame too */
-        if (!s->encoding && s->pict_type != B_TYPE) {
-            mbskip_ptr = &s->mbskip_table[s->mb_y * s->mb_width + s->mb_x];
+        int dct_linesize, dct_offset;
+        op_pixels_func *op_pix;
+        qpel_mc_func *op_qpix;
+
+        /* avoid the copy if the macroblock was skipped in the last frame too;
+           don't touch it for B-frames as they need the skip info from the next P-frame */
+        if (s->pict_type != B_TYPE) {
+            UINT8 *mbskip_ptr = &s->mbskip_table[mb_xy];
             if (s->mb_skiped) {
                 s->mb_skiped = 0;
-                /* if previous was skipped too, then nothing to do ! */
-                if (*mbskip_ptr != 0) 
+                /* if previous was skipped too, then nothing to do ! 
+                   skip only during decoding as we might trash the buffers during encoding a bit */
+                if (*mbskip_ptr != 0 && !s->encoding) 
                     goto the_end;
                 *mbskip_ptr = 1; /* indicate that this time we skiped it */
             } else {
@@ -1068,33 +1269,53 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
 
         if (!s->mb_intra) {
             /* motion handling */
-            if (!s->no_rounding){
-                op_pix = put_pixels_tab;
-                op_qpix= qpel_mc_rnd_tab;
-            }else{
-                op_pix = put_no_rnd_pixels_tab;
-                op_qpix= qpel_mc_no_rnd_tab;
-            }
+            /* decoding or more than one mb_type (MC was already done otherwise) */
+            if((!s->encoding) || (s->mb_type[mb_xy]&(s->mb_type[mb_xy]-1))){
+                if ((!s->no_rounding) || s->pict_type==B_TYPE){                
+                    op_pix = put_pixels_tab;
+                    op_qpix= qpel_mc_rnd_tab;
+                }else{
+                    op_pix = put_no_rnd_pixels_tab;
+                    op_qpix= qpel_mc_no_rnd_tab;
+                }
 
-            if (s->mv_dir & MV_DIR_FORWARD) {
-                MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
-                if (!s->no_rounding) 
-                    op_pix = avg_pixels_tab;
-                else
-                    op_pix = avg_no_rnd_pixels_tab;
-            }
-            if (s->mv_dir & MV_DIR_BACKWARD) {
-                MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix);
+                if (s->mv_dir & MV_DIR_FORWARD) {
+                    MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
+                    if ((!s->no_rounding) || s->pict_type==B_TYPE)
+                        op_pix = avg_pixels_tab;
+                    else
+                        op_pix = avg_no_rnd_pixels_tab;
+                }
+                if (s->mv_dir & MV_DIR_BACKWARD) {
+                    MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix);
+                }
             }
 
-            /* add dct residue */
-            add_dct(s, block[0], 0, dest_y, dct_linesize);
-            add_dct(s, block[1], 1, dest_y + 8, dct_linesize);
-            add_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize);
-            add_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize);
+            /* skip dequant / idct if we are really late ;) */
+            if(s->hurry_up>1) goto the_end;
 
-            add_dct(s, block[4], 4, dest_cb, s->linesize >> 1);
-            add_dct(s, block[5], 5, dest_cr, s->linesize >> 1);
+            /* add dct residue */
+            if(!s->mpeg2 && (s->encoding || (!s->h263_msmpeg4))){
+                add_dequant_dct(s, block[0], 0, dest_y, dct_linesize);
+                add_dequant_dct(s, block[1], 1, dest_y + 8, dct_linesize);
+                add_dequant_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize);
+                add_dequant_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize);
+
+                if(!(s->flags&CODEC_FLAG_GRAY)){
+                    add_dequant_dct(s, block[4], 4, dest_cb, s->linesize >> 1);
+                    add_dequant_dct(s, block[5], 5, dest_cr, s->linesize >> 1);
+                }
+            } else {
+                add_dct(s, block[0], 0, dest_y, dct_linesize);
+                add_dct(s, block[1], 1, dest_y + 8, dct_linesize);
+                add_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize);
+                add_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize);
+
+                if(!(s->flags&CODEC_FLAG_GRAY)){
+                    add_dct(s, block[4], 4, dest_cb, s->linesize >> 1);
+                    add_dct(s, block[5], 5, dest_cr, s->linesize >> 1);
+                }
+            }
         } else {
             /* dct only in intra block */
             put_dct(s, block[0], 0, dest_y, dct_linesize);
@@ -1102,128 +1323,188 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             put_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize);
             put_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize);
 
-            put_dct(s, block[4], 4, dest_cb, s->linesize >> 1);
-            put_dct(s, block[5], 5, dest_cr, s->linesize >> 1);
+            if(!(s->flags&CODEC_FLAG_GRAY)){
+                put_dct(s, block[4], 4, dest_cb, s->linesize >> 1);
+                put_dct(s, block[5], 5, dest_cr, s->linesize >> 1);
+            }
         }
     }
  the_end:
     emms_c(); //FIXME remove
 }
 
-static void encode_mb(MpegEncContext *s)
+static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int threshold, int skip_dc)
+{   /* Zero block n when it holds only +-1 coefficients whose summed tab[] score is below threshold. */
+    static const char tab[64]=   /* score contributed by a +-1 coefficient, indexed by the zero run before it */
+        {3,2,2,1,1,1,1,1,
+         1,1,1,1,1,1,1,1,
+         1,1,1,1,1,1,1,1,
+         0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0};
+    int score=0;   /* sum of tab[run] over the +-1 coefficients seen so far */
+    int run=0;     /* zero coefficients counted since the last scored +-1 */
+    int i;
+    DCTELEM *block= s->block[n];
+    const int last_index= s->block_last_index[n];
+
+    if(skip_dc) skip_dc=1;   /* normalize to 0/1: it is also the first scan index that gets cleared */
+    
+    /* is everything we could set to zero already zero? */
+    if(last_index<=skip_dc - 1) return;
+
+    for(i=0; i<=last_index; i++){
+        const int j = zigzag_direct[i];
+        const int level = ABS(block[j]);
+        if(level==1){
+            if(skip_dc && i==0) continue;   /* a +-1 at scan index 0 is not scored when skip_dc is set */
+            score+= tab[run];
+            run=0;
+        }else if(level>1){
+            return;   /* any coefficient with magnitude above 1 leaves the block untouched */
+        }else{
+            run++;
+        }
+    }
+    if(score >= threshold) return;   /* score reached the threshold: keep the block as it is */
+    for(i=skip_dc; i<=last_index; i++){
+        const int j = zigzag_direct[i];
+        block[j]=0;
+    }
+    if(block[0]) s->block_last_index[n]= 0;   /* block[0] can only be nonzero here if skip_dc spared it */
+    else         s->block_last_index[n]= -1;
+}
+
+static inline void clip_coeffs(MpegEncContext *s, DCTELEM *block, int last_index)
+{   /* Clamp the coefficients at scan positions 0..last_index of block into [s->min_qcoeff, s->max_qcoeff]. */
+    int i;
+    const int maxlevel= s->max_qcoeff;
+    const int minlevel= s->min_qcoeff;
+        
+    for(i=0;i<=last_index; i++){
+        const int j = zigzag_direct[i];   /* scan position -> index inside block */
+        int level = block[j];
+       
+        if     (level>maxlevel) level=maxlevel;
+        else if(level<minlevel) level=minlevel;
+        block[j]= level;   /* written back even when the value was already in range */
+    }
+}
+
+static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 {
-    int wrap;
     const int mb_x= s->mb_x;
     const int mb_y= s->mb_y;
-    UINT8 *ptr;
-    const int motion_x= s->mv[0][0][0];
-    const int motion_y= s->mv[0][0][1];
     int i;
+    int skip_dct[6];
+#if 0
+        if (s->interlaced_dct) {
+            dct_linesize = s->linesize * 2;
+            dct_offset = s->linesize;
+        } else {
+            dct_linesize = s->linesize;
+            dct_offset = s->linesize * 8;
+        }
+#endif
+    for(i=0; i<6; i++) skip_dct[i]=0;
 
-    /* get the pixels */
-    wrap = s->linesize;
-    ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16;
-    get_pixels(s->block[0], ptr, wrap);
-    get_pixels(s->block[1], ptr + 8, wrap);
-    get_pixels(s->block[2], ptr + 8 * wrap, wrap);
-    get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap);
-    wrap = s->linesize >> 1;
-    ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8;
-    get_pixels(s->block[4], ptr, wrap);
-
-    wrap = s->linesize >> 1;
-    ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8;
-    get_pixels(s->block[5], ptr, wrap);
-
-    /* subtract previous frame if non intra */
-    if (!s->mb_intra) {
-        int dxy, offset, mx, my;
+    if (s->mb_intra) {
+        UINT8 *ptr;
+        int wrap;
+
+        wrap = s->linesize;
+        ptr = s->new_picture[0] + (mb_y * 16 * wrap) + mb_x * 16;
+        get_pixels(s->block[0], ptr               , wrap);
+        get_pixels(s->block[1], ptr            + 8, wrap);
+        get_pixels(s->block[2], ptr + 8 * wrap    , wrap);
+        get_pixels(s->block[3], ptr + 8 * wrap + 8, wrap);
+
+        if(s->flags&CODEC_FLAG_GRAY){
+            skip_dct[4]= 1;
+            skip_dct[5]= 1;
+        }else{
+            wrap >>=1;
+            ptr = s->new_picture[1] + (mb_y * 8 * wrap) + mb_x * 8;
+            get_pixels(s->block[4], ptr, wrap);
+
+            ptr = s->new_picture[2] + (mb_y * 8 * wrap) + mb_x * 8;
+            get_pixels(s->block[5], ptr, wrap);
+        }
+    }else{
+        op_pixels_func *op_pix;
+        qpel_mc_func *op_qpix;
+        UINT8 *dest_y, *dest_cb, *dest_cr;
+        UINT8 *ptr_y, *ptr_cb, *ptr_cr;
+        int wrap_y, wrap_c;
+
+        dest_y  = s->current_picture[0] + (mb_y * 16 * s->linesize       ) + mb_x * 16;
+        dest_cb = s->current_picture[1] + (mb_y * 8  * (s->linesize >> 1)) + mb_x * 8;
+        dest_cr = s->current_picture[2] + (mb_y * 8  * (s->linesize >> 1)) + mb_x * 8;
+        wrap_y = s->linesize;
+        wrap_c = wrap_y>>1;
+        ptr_y  = s->new_picture[0] + (mb_y * 16 * wrap_y) + mb_x * 16;
+        ptr_cb = s->new_picture[1] + (mb_y * 8 * wrap_c) + mb_x * 8;
+        ptr_cr = s->new_picture[2] + (mb_y * 8 * wrap_c) + mb_x * 8;
+
+        if ((!s->no_rounding) || s->pict_type==B_TYPE){
+            op_pix = put_pixels_tab;
+            op_qpix= qpel_mc_rnd_tab;
+        }else{
+            op_pix = put_no_rnd_pixels_tab;
+            op_qpix= qpel_mc_no_rnd_tab;
+        }
+
+        if (s->mv_dir & MV_DIR_FORWARD) {
+            MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture, op_pix, op_qpix);
+           if ((!s->no_rounding) || s->pict_type==B_TYPE)
+                op_pix = avg_pixels_tab;
+            else
+                op_pix = avg_no_rnd_pixels_tab;
+        }
+        if (s->mv_dir & MV_DIR_BACKWARD) {
+            MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture, op_pix, op_qpix);
+        }
+
+        diff_pixels(s->block[0], ptr_y                 , dest_y                 , wrap_y);
+        diff_pixels(s->block[1], ptr_y              + 8, dest_y              + 8, wrap_y);
+        diff_pixels(s->block[2], ptr_y + 8 * wrap_y    , dest_y + 8 * wrap_y    , wrap_y);
+        diff_pixels(s->block[3], ptr_y + 8 * wrap_y + 8, dest_y + 8 * wrap_y + 8, wrap_y);
         
-        if(s->mv_type==MV_TYPE_16X16){
-            dxy = ((motion_y & 1) << 1) | (motion_x & 1);
-            ptr = s->last_picture[0] + 
-                ((mb_y * 16 + (motion_y >> 1)) * s->linesize) + 
-                (mb_x * 16 + (motion_x >> 1));
-
-            sub_pixels_2(s->block[0], ptr, s->linesize, dxy);
-            sub_pixels_2(s->block[1], ptr + 8, s->linesize, dxy);
-            sub_pixels_2(s->block[2], ptr + s->linesize * 8, s->linesize, dxy);
-            sub_pixels_2(s->block[3], ptr + 8 + s->linesize * 8, s->linesize ,dxy);
-
-            if (s->out_format == FMT_H263) {
-                /* special rounding for h263 */
-                dxy = 0;
-                if ((motion_x & 3) != 0)
-                    dxy |= 1;
-                if ((motion_y & 3) != 0)
-                    dxy |= 2;
-                mx = motion_x >> 2;
-                my = motion_y >> 2;
-            } else {
-                mx = motion_x / 2;
-                my = motion_y / 2;
-                dxy = ((my & 1) << 1) | (mx & 1);
-                mx >>= 1;
-                my >>= 1;
-            }
-            offset = ((mb_y * 8 + my) * (s->linesize >> 1)) + (mb_x * 8 + mx);
-            ptr = s->last_picture[1] + offset;
-            sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
-            ptr = s->last_picture[2] + offset;
-            sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
+        if(s->flags&CODEC_FLAG_GRAY){
+            skip_dct[4]= 1;
+            skip_dct[5]= 1;
         }else{
-            int src_x, src_y;
-
-            for(i=0;i<4;i++) {
-                int motion_x = s->mv[0][i][0];
-                int motion_y = s->mv[0][i][1];
-
-                dxy = ((motion_y & 1) << 1) | (motion_x & 1);
-                src_x = mb_x * 16 + (motion_x >> 1) + (i & 1) * 8;
-                src_y = mb_y * 16 + (motion_y >> 1) + (i >>1) * 8;
-                        
-                ptr = s->last_picture[0] + (src_y * s->linesize) + (src_x);
-                sub_pixels_2(s->block[i], ptr, s->linesize, dxy);
-            }
-            /* In case of 8X8, we construct a single chroma motion vector
-               with a special rounding */
-            mx = 0;
-            my = 0;
-            for(i=0;i<4;i++) {
-                mx += s->mv[0][i][0];
-                my += s->mv[0][i][1];
-            }
-            if (mx >= 0)
-                mx = (h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
-            else {
-                mx = -mx;
-                mx = -(h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1));
-            }
-            if (my >= 0)
-                my = (h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1));
-            else {
-                my = -my;
-                my = -(h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1));
-            }
-            dxy = ((my & 1) << 1) | (mx & 1);
-            mx >>= 1;
-            my >>= 1;
-
-            src_x = mb_x * 8 + mx;
-            src_y = mb_y * 8 + my;
-            src_x = clip(src_x, -8, s->width/2);
-            if (src_x == s->width/2)
-                dxy &= ~1;
-            src_y = clip(src_y, -8, s->height/2);
-            if (src_y == s->height/2)
-                dxy &= ~2;
-            
-            offset = (src_y * (s->linesize >> 1)) + src_x;
-            ptr = s->last_picture[1] + offset;
-            sub_pixels_2(s->block[4], ptr, s->linesize >> 1, dxy);
-            ptr = s->last_picture[2] + offset;
-            sub_pixels_2(s->block[5], ptr, s->linesize >> 1, dxy);
+            diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
+            diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
         }
+
+        /* pre quantization */         
+        if(s->mc_mb_var[s->mb_width*mb_y+ mb_x]<2*s->qscale*s->qscale){
+            if(pix_abs8x8(ptr_y               , dest_y               , wrap_y) < 20*s->qscale) skip_dct[0]= 1;
+            if(pix_abs8x8(ptr_y            + 8, dest_y            + 8, wrap_y) < 20*s->qscale) skip_dct[1]= 1;
+            if(pix_abs8x8(ptr_y + 8*wrap_y    , dest_y + 8*wrap_y    , wrap_y) < 20*s->qscale) skip_dct[2]= 1;
+            if(pix_abs8x8(ptr_y + 8*wrap_y + 8, dest_y + 8*wrap_y + 8, wrap_y) < 20*s->qscale) skip_dct[3]= 1;
+            if(pix_abs8x8(ptr_cb              , dest_cb              , wrap_c) < 20*s->qscale) skip_dct[4]= 1; /* chroma rows are wrap_c apart, matching the diff_pixels calls on these pointers */
+            if(pix_abs8x8(ptr_cr              , dest_cr              , wrap_c) < 20*s->qscale) skip_dct[5]= 1;
+#if 0
+{
+ static int stat[7];
+ int num=0;
+ for(i=0; i<6; i++)
+  if(skip_dct[i]) num++;
+ stat[num]++;
+ 
+ if(s->mb_x==0 && s->mb_y==0){
+  for(i=0; i<7; i++){
+   printf("%6d %1d\n", stat[i], i);
+  }
+ }
+}
+#endif
+        }
+
     }
             
 #if 0
@@ -1240,17 +1521,47 @@ static void encode_mb(MpegEncContext *s)
             }
 #endif
     /* DCT & quantize */
-    if (s->h263_msmpeg4) {
-        msmpeg4_dc_scale(s);
-    } else if (s->h263_pred) {
+    if (s->h263_pred && !(s->msmpeg4_version==1 || s->msmpeg4_version==2)) {
         h263_dc_scale(s);
+    } else if (s->h263_aic) {
+        s->y_dc_scale = 2*s->qscale;
+        s->c_dc_scale = 2*s->qscale;
     } else {
         /* default quantization values */
         s->y_dc_scale = 8;
         s->c_dc_scale = 8;
     }
-    for(i=0;i<6;i++) {
-        s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale);
+    if(s->out_format==FMT_MJPEG){
+        for(i=0;i<6;i++) {
+            int overflow;
+            s->block_last_index[i] = dct_quantize(s, s->block[i], i, 8, &overflow);
+            if (overflow) clip_coeffs(s, s->block[i], s->block_last_index[i]);
+        }
+    }else{
+        for(i=0;i<6;i++) {
+            if(!skip_dct[i]){
+                int overflow;
+                s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale, &overflow);
+            // FIXME we could decide to change to quantizer instead of clipping
+            // JS: I don't think that would be a good idea it could lower quality instead
+            //     of improve it. Just INTRADC clipping deserves changes in quantizer
+                if (overflow) clip_coeffs(s, s->block[i], s->block_last_index[i]);
+            }else
+                s->block_last_index[i]= -1;
+        }
+        if(s->luma_elim_threshold && !s->mb_intra)
+            for(i=0; i<4; i++)
+                dct_single_coeff_elimination(s, i, s->luma_elim_threshold, 0);
+        if(s->chroma_elim_threshold && !s->mb_intra)
+            for(i=4; i<6; i++)
+                dct_single_coeff_elimination(s, i, s->chroma_elim_threshold, 1);
+    }
+
+    if((s->flags&CODEC_FLAG_GRAY) && s->mb_intra){
+        s->block_last_index[4]=
+        s->block_last_index[5]= 0;
+        s->block[4][0]=
+        s->block[5][0]= 128;
     }
 
     /* huffman encode */
@@ -1272,14 +1583,113 @@ static void encode_mb(MpegEncContext *s)
     }
 }
 
-static void copy_bits(PutBitContext *pb, UINT8 *src, int length)
+void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length)
 {
+#if 1   /* append the first length bits of src to pb, one 16-bit word per put_bits call */
+    int bytes= length>>4;   /* number of complete 16-bit words (not bytes, despite the name) */
+    int bits= length&15;    /* bits left over after the complete words */
+    int i;
+
+    if(length==0) return;
+
+    for(i=0; i<bytes; i++) put_bits(pb, 16, be2me_16(((uint16_t*)src)[i]));
+    if(bits) put_bits(pb, bits, be2me_16(((uint16_t*)src)[i])>>(16-bits));   /* no tail when length is a multiple of 16: skip it so no word past the data is loaded */
+#else
     int bytes= length>>3;
     int bits= length&7;
     int i;
 
     for(i=0; i<bytes; i++) put_bits(pb, 8, src[i]);
     put_bits(pb, bits, src[i]>>(8-bits));
+#endif
+}
+
+static inline void copy_context_before_encode(MpegEncContext *d, MpegEncContext *s, int type){
+    int i;   /* copies the fields listed below from s into d; the type argument is not used */
+
+    memcpy(d->last_mv, s->last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster than a loop?
+
+    /* mpeg1 */
+    d->mb_incr= s->mb_incr;
+    for(i=0; i<3; i++)
+        d->last_dc[i]= s->last_dc[i];
+    
+    /* statistics */
+    d->mv_bits= s->mv_bits;
+    d->i_tex_bits= s->i_tex_bits;
+    d->p_tex_bits= s->p_tex_bits;
+    d->i_count= s->i_count;
+    d->p_count= s->p_count;
+    d->skip_count= s->skip_count;
+    d->misc_bits= s->misc_bits;
+    d->last_bits= 0;   /* reset rather than copied from s */
+
+    d->mb_skiped= s->mb_skiped;
+}
+
+static inline void copy_context_after_encode(MpegEncContext *d, MpegEncContext *s, int type){
+    int i;   /* copies the fields listed below from s into d; the type argument is not used */
+
+    memcpy(d->mv, s->mv, 2*4*2*sizeof(int)); /* motion vectors s holds after the encode */
+    memcpy(d->last_mv, s->last_mv, 2*2*2*sizeof(int)); //FIXME is memcpy faster than a loop?
+    
+    /* mpeg1 */
+    d->mb_incr= s->mb_incr;
+    for(i=0; i<3; i++)
+        d->last_dc[i]= s->last_dc[i];
+    
+    /* statistics */
+    d->mv_bits= s->mv_bits;
+    d->i_tex_bits= s->i_tex_bits;
+    d->p_tex_bits= s->p_tex_bits;
+    d->i_count= s->i_count;
+    d->p_count= s->p_count;
+    d->skip_count= s->skip_count;
+    d->misc_bits= s->misc_bits;
+
+    d->mb_intra= s->mb_intra;
+    d->mb_skiped= s->mb_skiped;
+    d->mv_type= s->mv_type;
+    d->mv_dir= s->mv_dir;
+    d->pb= s->pb;   /* PutBitContext copied by value */
+    if(s->data_partitioning){   /* the two extra bit writers are copied only in data-partitioning mode */
+        d->pb2= s->pb2;
+        d->tex_pb= s->tex_pb;
+    }
+    d->block= s->block;   /* pointer copy: d refers to the same coefficient storage as s */
+    for(i=0; i<6; i++)
+        d->block_last_index[i]= s->block_last_index[i];
+}
+
+static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegEncContext *best, int type, 
+                           PutBitContext pb[2], PutBitContext pb2[2], PutBitContext tex_pb[2],
+                           int *dmin, int *next_block, int motion_x, int motion_y)
+{   /* Trial-encode one macroblock into scratch slot *next_block; keep the result in best if it needs fewer bits than *dmin. */
+    int bits_count;
+    
+    copy_context_before_encode(s, backup, type);   /* reload the saved fields from backup into s before the trial */
+
+    s->block= s->blocks[*next_block];   /* direct coefficients and bit output to the selected scratch slot */
+    s->pb= pb[*next_block];
+    if(s->data_partitioning){
+        s->pb2   = pb2   [*next_block];
+        s->tex_pb= tex_pb[*next_block];
+    }
+
+    encode_mb(s, motion_x, motion_y);
+
+    bits_count= get_bit_count(&s->pb);   /* cost of this trial: bit count summed over the writers in use */
+    if(s->data_partitioning){
+        bits_count+= get_bit_count(&s->pb2);
+        bits_count+= get_bit_count(&s->tex_pb);
+    }
+
+    if(bits_count<*dmin){
+        *dmin= bits_count;
+        *next_block^=1;   /* switch slots so the following trial uses the other one */
+
+        copy_context_after_encode(best, s, type);
+    }
 }
 
 static void encode_picture(MpegEncContext *s, int picture_number)
@@ -1287,8 +1697,17 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     int mb_x, mb_y, last_gob, pdif = 0;
     int i;
     int bits;
-    MpegEncContext best_s;
-    UINT8 bit_buf[4][3000]; //FIXME check that this is ALLWAYS large enogh for a MB
+    MpegEncContext best_s, backup_s;
+    UINT8 bit_buf[2][3000];
+    UINT8 bit_buf2[2][3000];
+    UINT8 bit_buf_tex[2][3000];
+    PutBitContext pb[2], pb2[2], tex_pb[2];
+
+    for(i=0; i<2; i++){
+        init_put_bits(&pb    [i], bit_buf    [i], 3000, NULL, NULL);
+        init_put_bits(&pb2   [i], bit_buf2   [i], 3000, NULL, NULL);
+        init_put_bits(&tex_pb[i], bit_buf_tex[i], 3000, NULL, NULL);
+    }
 
     s->picture_number = picture_number;
 
@@ -1299,12 +1718,16 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     s->block_wrap[4]=
     s->block_wrap[5]= s->mb_width + 2;
     
-    s->last_mc_mb_var = s->mc_mb_var;
     /* Reset the average MB variance */
-    s->avg_mb_var = 0;
-    s->mc_mb_var = 0;
+    s->mb_var_sum = 0;
+    s->mc_mb_var_sum = 0;
+
+    /* we need to initialize some time vars before we can encode b-frames */
+    if (s->h263_pred && !s->h263_msmpeg4)
+        ff_set_mpeg4_time(s, s->picture_number); 
+
     /* Estimate motion for every MB */
-    if(s->pict_type == P_TYPE){
+    if(s->pict_type != I_TYPE){
         for(mb_y=0; mb_y < s->mb_height; mb_y++) {
             s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
             s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
@@ -1319,127 +1742,61 @@ static void encode_picture(MpegEncContext *s, int picture_number)
                 s->block_index[3]+=2;
 
                 /* compute motion vector & mb_type and store in context */
-                estimate_motion(s, mb_x, mb_y);
+                if(s->pict_type==B_TYPE)
+                    ff_estimate_b_frame_motion(s, mb_x, mb_y);
+                else
+                    ff_estimate_p_frame_motion(s, mb_x, mb_y);
 //                s->mb_type[mb_y*s->mb_width + mb_x]=MB_TYPE_INTER;
             }
         }
         emms_c();
-    }else{
+    }else /* if(s->pict_type == I_TYPE) */{
         /* I-Frame */
         //FIXME do we need to zero them?
         memset(s->motion_val[0], 0, sizeof(INT16)*(s->mb_width*2 + 2)*(s->mb_height*2 + 2)*2);
-        memset(s->mv_table[0]  , 0, sizeof(INT16)*s->mb_width*s->mb_height);
-        memset(s->mv_table[1]  , 0, sizeof(INT16)*s->mb_width*s->mb_height);
+        memset(s->p_mv_table   , 0, sizeof(INT16)*(s->mb_width+2)*(s->mb_height+2)*2);
         memset(s->mb_type      , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
     }
 
-    if(s->avg_mb_var < s->mc_mb_var && s->pict_type != B_TYPE && (!s->force_type)){ //FIXME subtract MV bits
+    if(s->mb_var_sum < s->mc_mb_var_sum && s->pict_type == P_TYPE){ //FIXME subtract MV bits
         s->pict_type= I_TYPE;
-        s->picture_in_gop_number=0;
         memset(s->mb_type   , MB_TYPE_INTRA, sizeof(UINT8)*s->mb_width*s->mb_height);
+        if(s->max_b_frames==0){
+            s->input_pict_type= I_TYPE;
+            s->input_picture_in_gop_number=0;
+        }
 //printf("Scene change detected, encoding as I Frame\n");
     }
-
-    /* find best f_code for ME which do unlimited searches */
-    if(s->pict_type==P_TYPE && s->full_search>3){
-        int mv_num[8];
-        int i;
-        int loose=0;
-        UINT8 * fcode_tab= s->fcode_tab;
-
-        for(i=0; i<8; i++) mv_num[i]=0;
-
-        for(i=0; i<s->mb_num; i++){
-            if(s->mb_type[i] & MB_TYPE_INTER){
-                mv_num[ fcode_tab[s->mv_table[0][i] + MAX_MV] ]++;
-                mv_num[ fcode_tab[s->mv_table[1][i] + MAX_MV] ]++;
-//printf("%d %d %d\n", s->mv_table[0][i], fcode_tab[s->mv_table[0][i] + MAX_MV], i);
-            }
-//else printf("I");
-        }
-
-        for(i=MAX_FCODE; i>1; i--){
-            loose+= mv_num[i];
-            if(loose > 10) break; //FIXME this is pretty ineffective
-        }
-        s->f_code= i;
-/*        for(i=0; i<=MAX_FCODE; i++){
-            printf("%d ", mv_num[i]);
-        }
-        printf("\n");*/
-    }else{
-        s->f_code= 1;
+    
+    if(s->pict_type==P_TYPE || s->pict_type==S_TYPE){ /* braces added: the MV fixup below was indented as part of this branch but ran for every picture type */
+        s->f_code= ff_get_best_fcode(s, s->p_mv_table, MB_TYPE_INTER);
+        ff_fix_long_p_mvs(s);
+    }else if(s->pict_type==B_TYPE){
+        s->f_code= ff_get_best_fcode(s, s->b_forw_mv_table, MB_TYPE_FORWARD);
+        s->b_code= ff_get_best_fcode(s, s->b_back_mv_table, MB_TYPE_BACKWARD);
+
+        ff_fix_long_b_mvs(s, s->b_forw_mv_table, s->f_code, MB_TYPE_FORWARD);
+        ff_fix_long_b_mvs(s, s->b_back_mv_table, s->b_code, MB_TYPE_BACKWARD);
+        ff_fix_long_b_mvs(s, s->b_bidir_forw_mv_table, s->f_code, MB_TYPE_BIDIR);
+        ff_fix_long_b_mvs(s, s->b_bidir_back_mv_table, s->b_code, MB_TYPE_BIDIR);
     }
-
+    
 //printf("f_code %d ///\n", s->f_code);
-    /* convert MBs with too long MVs to I-Blocks */
-    if(s->pict_type==P_TYPE){
-        int i, x, y;
-        const int f_code= s->f_code;
-        UINT8 * fcode_tab= s->fcode_tab;
-//FIXME try to clip instead of intra izing ;)
-        /* clip / convert to intra 16x16 type MVs */
-        for(i=0; i<s->mb_num; i++){
-            if(s->mb_type[i]&MB_TYPE_INTER){
-                if(   fcode_tab[s->mv_table[0][i] + MAX_MV] > f_code
-                   || fcode_tab[s->mv_table[0][i] + MAX_MV] == 0
-                   || fcode_tab[s->mv_table[1][i] + MAX_MV] > f_code
-                   || fcode_tab[s->mv_table[1][i] + MAX_MV] == 0 ){
-                    s->mb_type[i] &= ~MB_TYPE_INTER;
-                    s->mb_type[i] |= MB_TYPE_INTRA;
-                    s->mv_table[0][i] = 0;
-                    s->mv_table[1][i] = 0;
-                }
-            }
-        }
-
-        if(s->flags&CODEC_FLAG_4MV){
-            int wrap= 2+ s->mb_width*2;
-
-            /* clip / convert to intra 8x8 type MVs */
-            for(y=0; y<s->mb_height; y++){
-                int xy= (y*2 + 1)*wrap + 1;
-                i= y*s->mb_width;
-
-                for(x=0; x<s->mb_width; x++){
-                    if(s->mb_type[i]&MB_TYPE_INTER4V){
-                        int block;
-                        for(block=0; block<4; block++){
-                            int off= (block& 1) + (block>>1)*wrap;
-                            int mx= s->motion_val[ xy + off ][0];
-                            int my= s->motion_val[ xy + off ][1];
-
-                            if(   fcode_tab[mx + MAX_MV] > f_code
-                               || fcode_tab[mx + MAX_MV] == 0
-                               || fcode_tab[my + MAX_MV] > f_code
-                               || fcode_tab[my + MAX_MV] == 0 ){
-                                s->mb_type[i] &= ~MB_TYPE_INTER4V;
-                                s->mb_type[i] |= MB_TYPE_INTRA;
-                            }
-                        }
-                        xy+=2;
-                        i++;
-                    }
-                }
-            }
-        }
-    }
 
 //    printf("%d %d\n", s->avg_mb_var, s->mc_mb_var);
 
-    if (!s->fixed_qscale) 
-        s->qscale = rate_estimate_qscale(s);
+    if(s->flags&CODEC_FLAG_PASS2)
+        s->qscale = ff_rate_estimate_qscale_pass2(s);
+    else if (!s->fixed_qscale) 
+        s->qscale = ff_rate_estimate_qscale(s);
 
-    /* precompute matrix */
     if (s->out_format == FMT_MJPEG) {
         /* for mjpeg, we do include qscale in the matrix */
         s->intra_matrix[0] = default_intra_matrix[0];
         for(i=1;i<64;i++)
-            s->intra_matrix[i] = (default_intra_matrix[i] * s->qscale) >> 3;
-        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, 8);
-    } else {
-        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, s->qscale);
-        convert_matrix(s->q_non_intra_matrix, s->q_non_intra_matrix16, s->non_intra_matrix, s->qscale);
+            s->intra_matrix[i] = CLAMP_TO_8BIT((default_intra_matrix[i] * s->qscale) >> 3);
+        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, 
+                       s->q_intra_matrix16_bias, s->intra_matrix, s->intra_quant_bias);
     }
 
     s->last_bits= get_bit_count(&s->pb);
@@ -1489,21 +1846,31 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->gob_index = 2;
         else
             s->gob_index = 4;
+    }else if(s->codec_id==CODEC_ID_MPEG4){
+        s->gob_index = 1;
     }
-        
-    s->avg_mb_var = s->avg_mb_var / s->mb_num;        
-    
+
+    if(s->codec_id==CODEC_ID_MPEG4 && s->data_partitioning && s->pict_type!=B_TYPE)
+        ff_mpeg4_init_partitions(s);
+
+    s->resync_mb_x=0;
+    s->resync_mb_y=0;
     for(mb_y=0; mb_y < s->mb_height; mb_y++) {
-        /* Put GOB header based on RTP MTU */
+        /* Put GOB header based on RTP MTU for formats which support it per line (H263*)*/
         /* TODO: Put all this stuff in a separate generic function */
         if (s->rtp_mode) {
             if (!mb_y) {
                 s->ptr_lastgob = s->pb.buf;
                 s->ptr_last_mb_line = s->pb.buf;
             } else if (s->out_format == FMT_H263 && !s->h263_pred && !s->h263_msmpeg4 && !(mb_y % s->gob_index)) {
+                // MN: we could move the space check from h263 -> here, as its not h263 specific
                 last_gob = h263_encode_gob_header(s, mb_y);
                 if (last_gob) {
-                    s->first_gob_line = 1;
+                    s->first_slice_line = 1;
+                }else{
+                    /*MN: we reset it here instead at the end of each line cuz mpeg4 can have 
+                          slice lines starting & ending in the middle*/
+                    s->first_slice_line = 0;
                 }
             }
         }
@@ -1516,10 +1883,9 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
             const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x];
-            PutBitContext pb;
-            int d;
+            const int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1;
+//            int d;
             int dmin=10000000;
-            int best=0;
 
             s->mb_x = mb_x;
             s->mb_y = mb_y;
@@ -1529,114 +1895,216 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->block_index[3]+=2;
             s->block_index[4]++;
             s->block_index[5]++;
+            
+            /* write gob / video packet header for formats which support it at any MB (MPEG4) */
+            if(s->rtp_mode && s->mb_y>0 && s->codec_id==CODEC_ID_MPEG4){
+                int pdif= pbBufPtr(&s->pb) - s->ptr_lastgob;
+
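+                //mb_line_avgsize/mb_width is added to the current size so we stay below the requested size (NOTE(review): the "*2" this comment used to mention is not in the check below)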
+                if(pdif + s->mb_line_avgsize/s->mb_width >= s->rtp_payload_size){ 
+                    if(s->codec_id==CODEC_ID_MPEG4){
+                        if(s->data_partitioning && s->pict_type!=B_TYPE){
+                            ff_mpeg4_merge_partitions(s);
+                            ff_mpeg4_init_partitions(s);
+                        }
+                        ff_mpeg4_encode_video_packet_header(s);
+
+                        if(s->flags&CODEC_FLAG_PASS1){
+                            int bits= get_bit_count(&s->pb);
+                            s->misc_bits+= bits - s->last_bits;
+                            s->last_bits= bits;
+                        }
+                        ff_mpeg4_clean_buffers(s);
+                    }
+                    s->ptr_lastgob = pbBufPtr(&s->pb);
+                    s->first_slice_line=1;
+                    s->resync_mb_x=mb_x;
+                    s->resync_mb_y=mb_y;
+                }
+
+                if(  (s->resync_mb_x   == s->mb_x)
+                   && s->resync_mb_y+1 == s->mb_y){
+                    s->first_slice_line=0; 
+                }
+            }
 
-            s->mv_dir = MV_DIR_FORWARD;
             if(mb_type & (mb_type-1)){ // more than 1 MB type possible
-                pb= s->pb;
+                int next_block=0;
+                int pb_bits_count, pb2_bits_count, tex_pb_bits_count;
+
+                copy_context_before_encode(&backup_s, s, -1);
+                backup_s.pb= s->pb;
+                best_s.data_partitioning= s->data_partitioning;
+                if(s->data_partitioning){
+                    backup_s.pb2= s->pb2;
+                    backup_s.tex_pb= s->tex_pb;
+                }
+
                 if(mb_type&MB_TYPE_INTER){
+                    s->mv_dir = MV_DIR_FORWARD;
                     s->mv_type = MV_TYPE_16X16;
                     s->mb_intra= 0;
-                    s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x];
-                    s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x];
-                    init_put_bits(&s->pb, bit_buf[1], 3000, NULL, NULL);
-                    s->block= s->inter_block;
-
-                    encode_mb(s);
-                    d= get_bit_count(&s->pb);
-                    if(d<dmin){
-                        flush_put_bits(&s->pb);
-                        dmin=d;
-                        best_s.mv[0][0][0]= s->mv[0][0][0];
-                        best_s.mv[0][0][1]= s->mv[0][0][1];
-                        best_s.mb_intra= 0;
-                        best_s.mv_type = MV_TYPE_16X16;
-                        best_s.pb=s->pb;
-                        best_s.block= s->block;
-                        best=1;
-                        for(i=0; i<6; i++)
-                            best_s.block_last_index[i]= s->block_last_index[i];
-                    }
+                    s->mv[0][0][0] = s->p_mv_table[xy][0];
+                    s->mv[0][0][1] = s->p_mv_table[xy][1];
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_INTER, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, s->mv[0][0][0], s->mv[0][0][1]);
                 }
-                if(mb_type&MB_TYPE_INTER4V){
+                if(mb_type&MB_TYPE_INTER4V){                 
+                    s->mv_dir = MV_DIR_FORWARD;
                     s->mv_type = MV_TYPE_8X8;
                     s->mb_intra= 0;
                     for(i=0; i<4; i++){
                         s->mv[0][i][0] = s->motion_val[s->block_index[i]][0];
                         s->mv[0][i][1] = s->motion_val[s->block_index[i]][1];
                     }
-                    init_put_bits(&s->pb, bit_buf[2], 3000, NULL, NULL);
-                    s->block= s->inter4v_block;
-
-                    encode_mb(s);
-                    d= get_bit_count(&s->pb);
-                    if(d<dmin){
-                        flush_put_bits(&s->pb);
-                        dmin=d;
-                        for(i=0; i<4; i++){
-                            best_s.mv[0][i][0] = s->mv[0][i][0];
-                            best_s.mv[0][i][1] = s->mv[0][i][1];
-                        }
-                        best_s.mb_intra= 0;
-                        best_s.mv_type = MV_TYPE_8X8;
-                        best_s.pb=s->pb;
-                        best_s.block= s->block;
-                        best=2;
-                        for(i=0; i<6; i++)
-                            best_s.block_last_index[i]= s->block_last_index[i];
-                    }
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_INTER4V, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, 0, 0);
+                }
+                if(mb_type&MB_TYPE_FORWARD){
+                    s->mv_dir = MV_DIR_FORWARD;
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->b_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_forw_mv_table[xy][1];
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_FORWARD, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, s->mv[0][0][0], s->mv[0][0][1]);
+                }
+                if(mb_type&MB_TYPE_BACKWARD){
+                    s->mv_dir = MV_DIR_BACKWARD;
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[1][0][0] = s->b_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_back_mv_table[xy][1];
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_BACKWARD, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, s->mv[1][0][0], s->mv[1][0][1]);
+                }
+                if(mb_type&MB_TYPE_BIDIR){
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
+                    s->mv_type = MV_TYPE_16X16;
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->b_bidir_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_bidir_back_mv_table[xy][1];
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_BIDIR, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, 0, 0);
+                }
+                if(mb_type&MB_TYPE_DIRECT){
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+                    s->mv_type = MV_TYPE_16X16; //FIXME
+                    s->mb_intra= 0;
+                    s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1];
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_DIRECT, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, s->b_direct_mv_table[xy][0], s->b_direct_mv_table[xy][1]);
                 }
                 if(mb_type&MB_TYPE_INTRA){
+                    s->mv_dir = MV_DIR_FORWARD;
                     s->mv_type = MV_TYPE_16X16;
                     s->mb_intra= 1;
                     s->mv[0][0][0] = 0;
                     s->mv[0][0][1] = 0;
-                    init_put_bits(&s->pb, bit_buf[0], 3000, NULL, NULL);
-                    s->block= s->intra_block;
-                   
-                    encode_mb(s);
-                    d= get_bit_count(&s->pb);
-                    if(d<dmin){
-                        flush_put_bits(&s->pb);
-                        dmin=d;
-                        best_s.mv[0][0][0]= 0;
-                        best_s.mv[0][0][1]= 0;
-                        best_s.mb_intra= 1;
-                        best_s.mv_type = MV_TYPE_16X16;
-                        best_s.pb=s->pb;
-                        best_s.block= s->block;
-                        for(i=0; i<6; i++)
-                            best_s.block_last_index[i]= s->block_last_index[i];
-                        best=0;
-                    }
-                    /* force cleaning of ac/dc if needed ... */
-                    s->mbintra_table[mb_x + mb_y*s->mb_width]=1;
+                    encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_INTRA, pb, pb2, tex_pb, 
+                                 &dmin, &next_block, 0, 0);
+                    /* force cleaning of ac/dc pred stuff if needed ... */
+                    if(s->h263_pred || s->h263_aic)
+                        s->mbintra_table[mb_x + mb_y*s->mb_width]=1;
                 }
-                for(i=0; i<4; i++){
-                   s->mv[0][i][0] =  best_s.mv[0][i][0];
-                   s->mv[0][i][1] =  best_s.mv[0][i][1];
+                copy_context_after_encode(s, &best_s, -1);
+                
+                pb_bits_count= get_bit_count(&s->pb);
+                flush_put_bits(&s->pb);
+                ff_copy_bits(&backup_s.pb, bit_buf[next_block^1], pb_bits_count);
+                s->pb= backup_s.pb;
+                
+                if(s->data_partitioning){
+                    pb2_bits_count= get_bit_count(&s->pb2);
+                    flush_put_bits(&s->pb2);
+                    ff_copy_bits(&backup_s.pb2, bit_buf2[next_block^1], pb2_bits_count);
+                    s->pb2= backup_s.pb2;
+                    
+                    tex_pb_bits_count= get_bit_count(&s->tex_pb);
+                    flush_put_bits(&s->tex_pb);
+                    ff_copy_bits(&backup_s.tex_pb, bit_buf_tex[next_block^1], tex_pb_bits_count);
+                    s->tex_pb= backup_s.tex_pb;
                 }
-                s->mb_intra= best_s.mb_intra;
-                s->mv_type= best_s.mv_type;
-                for(i=0; i<6; i++)
-                   s->block_last_index[i]= best_s.block_last_index[i];
-                copy_bits(&pb, bit_buf[best], dmin);
-                s->block= best_s.block;
-                s->pb= pb;
+                s->last_bits= get_bit_count(&s->pb);
             } else {
+                int motion_x, motion_y;
+                s->mv_type=MV_TYPE_16X16;
                 // only one MB-Type possible
-                if(mb_type&MB_TYPE_INTRA){
+                switch(mb_type){
+                case MB_TYPE_INTRA:
+                    s->mv_dir = MV_DIR_FORWARD;
                     s->mb_intra= 1;
-                    s->mv[0][0][0] = 0;
-                    s->mv[0][0][1] = 0;
-                }else{
+                    motion_x= s->mv[0][0][0] = 0;
+                    motion_y= s->mv[0][0][1] = 0;
+                    break;
+                case MB_TYPE_INTER:
+                    s->mv_dir = MV_DIR_FORWARD;
+                    s->mb_intra= 0;
+                    motion_x= s->mv[0][0][0] = s->p_mv_table[xy][0];
+                    motion_y= s->mv[0][0][1] = s->p_mv_table[xy][1];
+                    break;
+                case MB_TYPE_INTER4V:
+                    s->mv_dir = MV_DIR_FORWARD;
+                    s->mv_type = MV_TYPE_8X8;
                     s->mb_intra= 0;
-                    s->mv[0][0][0] = s->mv_table[0][mb_y * s->mb_width + mb_x];
-                    s->mv[0][0][1] = s->mv_table[1][mb_y * s->mb_width + mb_x];
+                    for(i=0; i<4; i++){
+                        s->mv[0][i][0] = s->motion_val[s->block_index[i]][0];
+                        s->mv[0][i][1] = s->motion_val[s->block_index[i]][1];
+                    }
+                    motion_x= motion_y= 0;
+                    break;
+                case MB_TYPE_DIRECT:
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+                    s->mb_intra= 0;
+                    motion_x=s->b_direct_mv_table[xy][0];
+                    motion_y=s->b_direct_mv_table[xy][1];
+                    s->mv[0][0][0] = s->b_direct_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_direct_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_direct_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_direct_back_mv_table[xy][1];
+                    break;
+                case MB_TYPE_BIDIR:
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
+                    s->mb_intra= 0;
+                    motion_x=0;
+                    motion_y=0;
+                    s->mv[0][0][0] = s->b_bidir_forw_mv_table[xy][0];
+                    s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1];
+                    s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0];
+                    s->mv[1][0][1] = s->b_bidir_back_mv_table[xy][1];
+                    break;
+                case MB_TYPE_BACKWARD:
+                    s->mv_dir = MV_DIR_BACKWARD;
+                    s->mb_intra= 0;
+                    motion_x= s->mv[1][0][0] = s->b_back_mv_table[xy][0];
+                    motion_y= s->mv[1][0][1] = s->b_back_mv_table[xy][1];
+                    break;
+                case MB_TYPE_FORWARD:
+                    s->mv_dir = MV_DIR_FORWARD;
+                    s->mb_intra= 0;
+                    motion_x= s->mv[0][0][0] = s->b_forw_mv_table[xy][0];
+                    motion_y= s->mv[0][0][1] = s->b_forw_mv_table[xy][1];
+//                    printf(" %d %d ", motion_x, motion_y);
+                    break;
+                default:
+                    motion_x=motion_y=0; //gcc warning fix
+                    printf("illegal MB type\n");
                 }
-                encode_mb(s);
+                encode_mb(s, motion_x, motion_y);
+            }
+            /* clean the MV table in IPS frames for direct mode in B frames */
+            if(s->mb_intra /* && I,P,S_TYPE */){
+                s->p_mv_table[xy][0]=0;
+                s->p_mv_table[xy][1]=0;
             }
 
             MPV_decode_mb(s, s->block);
+//printf("MB %d %d bits\n", s->mb_x+s->mb_y*s->mb_width, get_bit_count(&s->pb));
         }
 
 
@@ -1650,14 +2118,20 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             }
             //fprintf(stderr, "\nMB line: %d\tSize: %u\tAvg. Size: %u", s->mb_y, 
             //                    (s->pb.buf_ptr - s->ptr_last_mb_line), s->mb_line_avgsize);
-            s->first_gob_line = 0;
+            if(s->codec_id!=CODEC_ID_MPEG4) s->first_slice_line = 0; //FIXME clean
         }
     }
     emms_c();
 
-    if (s->h263_msmpeg4 && s->pict_type == I_TYPE)
+    if(s->codec_id==CODEC_ID_MPEG4 && s->data_partitioning && s->pict_type!=B_TYPE)
+        ff_mpeg4_merge_partitions(s);
+
+    if (s->msmpeg4_version && s->msmpeg4_version<4 && s->pict_type == I_TYPE)
         msmpeg4_encode_ext_header(s);
 
+    if(s->codec_id==CODEC_ID_MPEG4) 
+        ff_mpeg4_stuffing(&s->pb);
+
     //if (s->gob_number)
     //    fprintf(stderr,"\nNumber of GOB: %d", s->gob_number);
     
@@ -1675,30 +2149,14 @@ static void encode_picture(MpegEncContext *s, int picture_number)
 
 static int dct_quantize_c(MpegEncContext *s, 
                         DCTELEM *block, int n,
-                        int qscale)
+                        int qscale, int *overflow)
 {
     int i, j, level, last_non_zero, q;
     const int *qmat;
-    int minLevel, maxLevel;
-
-    if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){
-	/* mpeg4 */
-        minLevel= -2048;
-	maxLevel= 2047;
-    }else if(s->out_format==FMT_MPEG1){
-	/* mpeg1 */
-        minLevel= -255;
-	maxLevel= 255;
-    }else if(s->out_format==FMT_MJPEG){
-	/* (m)jpeg */
-        minLevel= -1023;
-	maxLevel= 1023;
-    }else{
-	/* h263 / msmpeg4 */
-        minLevel= -128;
-	maxLevel= 127;
-    }
-
+    int bias;
+    int max=0;
+    unsigned int threshold1, threshold2;
+    
     av_fdct (block);
 
     /* we need this permutation so that we correct the IDCT
@@ -1706,81 +2164,54 @@ static int dct_quantize_c(MpegEncContext *s,
     block_permute(block);
 
     if (s->mb_intra) {
-        if (n < 4)
-            q = s->y_dc_scale;
-        else
-            q = s->c_dc_scale;
-        q = q << 3;
-        
+        if (!s->h263_aic) {
+            if (n < 4)
+                q = s->y_dc_scale;
+            else
+                q = s->c_dc_scale;
+            q = q << 3;
+        } else
+            /* For AIC we skip quant/dequant of INTRADC */
+            q = 1 << 3;
+            
         /* note: block[0] is assumed to be positive */
         block[0] = (block[0] + (q >> 1)) / q;
         i = 1;
         last_non_zero = 0;
-        if (s->out_format == FMT_H263) {
-            qmat = s->q_non_intra_matrix;
-        } else {
-            qmat = s->q_intra_matrix;
-        }
+        qmat = s->q_intra_matrix[qscale];
+        bias= s->intra_quant_bias<<(QMAT_SHIFT - 3 - QUANT_BIAS_SHIFT);
     } else {
         i = 0;
         last_non_zero = -1;
-        qmat = s->q_non_intra_matrix;
+        qmat = s->q_inter_matrix[qscale];
+        bias= s->inter_quant_bias<<(QMAT_SHIFT - 3 - QUANT_BIAS_SHIFT);
     }
+    threshold1= (1<<(QMAT_SHIFT - 3)) - bias - 1;
+    threshold2= threshold1<<1;
 
     for(;i<64;i++) {
         j = zigzag_direct[i];
         level = block[j];
         level = level * qmat[j];
-#ifdef PARANOID
-        {
-            static int count = 0;
-            int level1, level2, qmat1;
-            double val;
-            if (qmat == s->q_non_intra_matrix) {
-                qmat1 = default_non_intra_matrix[j] * s->qscale;
-            } else {
-                qmat1 = default_intra_matrix[j] * s->qscale;
-            }
-            if (av_fdct != jpeg_fdct_ifast)
-                val = ((double)block[j] * 8.0) / (double)qmat1;
-            else
-                val = ((double)block[j] * 8.0 * 2048.0) / 
-                    ((double)qmat1 * aanscales[j]);
-            level1 = (int)val;
-            level2 = level / (1 << (QMAT_SHIFT - 3));
-            if (level1 != level2) {
-                fprintf(stderr, "%d: quant error qlevel=%d wanted=%d level=%d qmat1=%d qmat=%d wantedf=%0.6f\n", 
-                        count, level2, level1, block[j], qmat1, qmat[j],
-                        val);
-                count++;
-            }
 
-        }
-#endif
-        /* XXX: slight error for the low range. Test should be equivalent to
-           (level <= -(1 << (QMAT_SHIFT - 3)) || level >= (1 <<
-           (QMAT_SHIFT - 3)))
-        */
-        if (((level << (31 - (QMAT_SHIFT - 3))) >> (31 - (QMAT_SHIFT - 3))) != 
-            level) {
-            level = level / (1 << (QMAT_SHIFT - 3));
-            /* XXX: currently, this code is not optimal. the range should be:
-               mpeg1: -255..255
-               mpeg2: -2048..2047
-               h263:  -128..127
-               mpeg4: -2048..2047
-            */
-            if (level > maxLevel)
-                level = maxLevel;
-            else if (level < minLevel)
-                level = minLevel;
-
-            block[j] = level;
+//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
+//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
+        if(((unsigned)(level+threshold1))>threshold2){
+            if(level>0){
+                level= (bias + level)>>(QMAT_SHIFT - 3);
+                block[j]= level;
+            }else{
+                level= (bias - level)>>(QMAT_SHIFT - 3);
+                block[j]= -level;
+            }
+            max |=level;
             last_non_zero = i;
-        } else {
-            block[j] = 0;
+        }else{
+            block[j]=0;
         }
     }
+    *overflow= s->max_qcoeff < max; //overflow might have happend
+    
     return last_non_zero;
 }
 
@@ -1822,7 +2253,7 @@ static void dct_unquantize_mpeg1_c(MpegEncContext *s,
         }
     } else {
         i = 0;
-        quant_matrix = s->non_intra_matrix;
+        quant_matrix = s->inter_matrix;
         for(;i<nCoeffs;i++) {
             int j= zigzag_direct[i];
             level = block[j];
@@ -1848,6 +2279,69 @@ static void dct_unquantize_mpeg1_c(MpegEncContext *s,
     }
 }
 
+static void dct_unquantize_mpeg2_c(MpegEncContext *s, 
+                                   DCTELEM *block, int n, int qscale)
+{   /* Dequantizes block number n in place, scaling each nonzero level by qscale and the intra/inter matrix entry. */
+    int i, level, nCoeffs;
+    const UINT16 *quant_matrix;
+
+    if(s->alternate_scan) nCoeffs= 64;      /* alternate scan: walk all 64 positions */
+    else nCoeffs= s->block_last_index[n]+1; /* otherwise stop after the last index recorded for block n */
+    
+    if (s->mb_intra) {
+        if (n < 4)  /* blocks 0..3 use y_dc_scale, the remaining blocks c_dc_scale */
+            block[0] = block[0] * s->y_dc_scale;
+        else
+            block[0] = block[0] * s->c_dc_scale;
+        quant_matrix = s->intra_matrix;
+        for(i=1;i<nCoeffs;i++) { /* starts at 1: block[0] was already scaled above */
+            int j= zigzag_direct[i];
+            level = block[j];
+            if (level) {
+                if (level < 0) { /* scale the magnitude, then restore the sign, so the shift truncates toward zero */
+                    level = -level;
+                    level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                    level = -level;
+                } else {
+                    level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                }
+#ifdef PARANOID
+                if (level < -2048 || level > 2047) /* debug-only report; the value is still stored unclamped */
+                    fprintf(stderr, "unquant error %d %d\n", i, level);
+#endif
+                block[j] = level;
+            }
+        }
+    } else {
+        int sum=-1; /* running sum of the dequantized levels, offset by -1 for the parity test at the end */
+        i = 0;
+        quant_matrix = s->inter_matrix;
+        for(;i<nCoeffs;i++) {
+            int j= zigzag_direct[i];
+            level = block[j];
+            if (level) {
+                if (level < 0) { /* same sign handling as the intra path: operate on the magnitude */
+                    level = -level;
+                    level = (((level << 1) + 1) * qscale *
+                             ((int) (quant_matrix[j]))) >> 4; /* (2*|level|+1) * qscale * matrix / 16 */
+                    level = -level;
+                } else {
+                    level = (((level << 1) + 1) * qscale *
+                             ((int) (quant_matrix[j]))) >> 4;
+                }
+#ifdef PARANOID
+                if (level < -2048 || level > 2047) /* debug-only report; the value is still stored unclamped */
+                    fprintf(stderr, "unquant error %d %d\n", i, level);
+#endif
+                block[j] = level;
+                sum+=level;
+            }
+        }
+        block[63]^=sum&1; /* sum started at -1, so this flips the LSB of block[63] exactly when the levels summed to an even value */
+    }
+}
+
+
 static void dct_unquantize_h263_c(MpegEncContext *s, 
                                   DCTELEM *block, int n, int qscale)
 {
@@ -1891,188 +2385,117 @@ static void dct_unquantize_h263_c(MpegEncContext *s,
     }
 }
 
-/* rate control */
-
-/* an I frame is I_FRAME_SIZE_RATIO bigger than a P frame */
-#define I_FRAME_SIZE_RATIO 3.0
-#define QSCALE_K           20
-
-static void rate_control_init(MpegEncContext *s)
+static void remove_ac(MpegEncContext *s, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int mb_x, int mb_y)
 {
-#if 1
-    emms_c();
-
-    //initial values, they dont really matter as they will be totally different within a few frames
-    s->i_pred.coeff= s->p_pred.coeff= 7.0;
-    s->i_pred.count= s->p_pred.count= 1.0;
-    
-    s->i_pred.decay= s->p_pred.decay= 0.4;
-    
-    // use more bits at the beginning, otherwise high motion at the begin will look like shit
-    s->qsum=100;
-    s->qcount=100;
-
-    s->short_term_qsum=0.001;
-    s->short_term_qcount=0.001;
-#else
-    s->wanted_bits = 0;
-
-    if (s->intra_only) {
-        s->I_frame_bits = ((INT64)s->bit_rate * FRAME_RATE_BASE) / s->frame_rate;
-        s->P_frame_bits = s->I_frame_bits;
-    } else {
-        s->P_frame_bits = (int) ((float)(s->gop_size * s->bit_rate) / 
-                                 (float)((float)s->frame_rate / FRAME_RATE_BASE * (I_FRAME_SIZE_RATIO + s->gop_size - 1)));
-        s->I_frame_bits = (int)(s->P_frame_bits * I_FRAME_SIZE_RATIO);
+    int dc, dcb, dcr, y, i;
+    for(i=0; i<4; i++){
+        dc= s->dc_val[0][mb_x*2+1 + (i&1) + (mb_y*2+1 + (i>>1))*(s->mb_width*2+2)];
+        for(y=0; y<8; y++){
+            int x;
+            for(x=0; x<8; x++){
+                dest_y[x + (i&1)*8 + (y + (i>>1)*8)*s->linesize]= dc/8;
+            }
+        }
+    }
+    dcb = s->dc_val[1][mb_x+1 + (mb_y+1)*(s->mb_width+2)];
+    dcr= s->dc_val[2][mb_x+1 + (mb_y+1)*(s->mb_width+2)];
+    for(y=0; y<8; y++){
+        int x;
+        for(x=0; x<8; x++){
+            dest_cb[x + y*(s->linesize>>1)]= dcb/8;
+            dest_cr[x + y*(s->linesize>>1)]= dcr/8;
+        }
     }
-
-#if defined(DEBUG)
-    printf("I_frame_size=%d P_frame_size=%d\n",
-           s->I_frame_bits, s->P_frame_bits);
-#endif
-#endif
-}
-
-static double predict(Predictor *p, double q, double var)
-{
-    return p->coeff*var / (q*p->count);
-}
-
-static void update_predictor(Predictor *p, double q, double var, double size)
-{
-    double new_coeff= size*q / (var + 1);
-    if(var<1000) return;
-/*{
-int pred= predict(p, q, var);
-int error= abs(pred-size);
-static double sum=0;
-static int count=0;
-if(count>5) sum+=error;
-count++;
-if(256*256*256*64%count==0){
-    printf("%d %f %f\n", count, sum/count, p->coeff);
-}
-}*/
-    p->count*= p->decay;
-    p->coeff*= p->decay;
-    p->count++;
-    p->coeff+= new_coeff;
 }
 
-static int rate_estimate_qscale(MpegEncContext *s)
+/**
+ * will conceal past errors, and also drop b frames if needed
+ *
+ */
+void ff_conceal_past_errors(MpegEncContext *s, int unknown_pos)
 {
-#if 1
-    int qmin= s->qmin;
-    int qmax= s->qmax;
-    int rate_q=5;
-    float q;
-    int qscale;
-    float br_compensation;
-    double diff;
-    double short_term_q;
-    double long_term_q;
-    int last_qscale= s->qscale;
-    double fps;
-    INT64 wanted_bits;
-    emms_c();
-
-    fps= (double)s->frame_rate / FRAME_RATE_BASE;
-    wanted_bits= s->bit_rate*(double)s->picture_number/fps;
-
+    int mb_x= s->mb_x;
+    int mb_y= s->mb_y;
+    int mb_dist=0;
+    int i, intra_count=0, inter_count=0;
+    int intra_conceal= s->msmpeg4_version ? 50 : 50; //FIXME finetune
+    int inter_conceal= s->msmpeg4_version ? 50 : 50;
     
-    if(s->picture_number>2){
-        /* update predictors */
-        if(s->last_pict_type == I_TYPE){
-        //FIXME
-        }else{ //P Frame
-//printf("%d %d %d %f\n", s->qscale, s->last_mc_mb_var, s->frame_bits, s->p_pred.coeff);
-            update_predictor(&s->p_pred, s->qscale, s->last_mc_mb_var, s->frame_bits);
-        }
-    }
+    // for last block
+    if(mb_x>=s->mb_width)  mb_x= s->mb_width -1;
+    if(mb_y>=s->mb_height) mb_y= s->mb_height-1;
 
-    if(s->pict_type == I_TYPE){
-        //FIXME
-        rate_q= s->qsum/s->qcount;
-    }else{ //P Frame
-        int i;
-        int diff, best_diff=1000000000;
-        for(i=1; i<=31; i++){
-            diff= predict(&s->p_pred, i, s->mc_mb_var) - (double)s->bit_rate/fps;
-            if(diff<0) diff= -diff;
-            if(diff<best_diff){
-                best_diff= diff;
-                rate_q= i;
-            }
-        }
+    if(s->decoding_error==0 && unknown_pos){
+        if(s->data_partitioning && s->pict_type!=B_TYPE)
+                s->decoding_error= DECODING_AC_LOST;
+        else
+                s->decoding_error= DECODING_DESYNC;
     }
 
-    s->short_term_qsum*=s->qblur;
-    s->short_term_qcount*=s->qblur;
+    if(s->decoding_error==DECODING_DESYNC && s->pict_type!=B_TYPE) s->next_p_frame_damaged=1;
 
-    s->short_term_qsum+= rate_q;
-    s->short_term_qcount++;
-    short_term_q= s->short_term_qsum/s->short_term_qcount;
+    for(i=mb_x + mb_y*s->mb_width; i>=0; i--){
+        if(s->mbintra_table[i]) intra_count++;
+        else                    inter_count++;
+    }
     
-    long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0
-
-//    q= (long_term_q - short_term_q)*s->qcompress + short_term_q;
-    q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q);
+    if(s->decoding_error==DECODING_AC_LOST){
+        intra_conceal*=2;
+        inter_conceal*=2;
+    }else if(s->decoding_error==DECODING_ACDC_LOST){
+        intra_conceal*=2;
+        inter_conceal*=2;
+    }
 
-    diff= s->total_bits - wanted_bits;
-    br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance;
-    if(br_compensation<=0.0) br_compensation=0.001;
-    q/=br_compensation;
+    if(unknown_pos && (intra_count<inter_count)){
+        intra_conceal= inter_conceal= s->mb_num; 
+//        printf("%d %d\n",intra_count, inter_count);
+    }
 
-    qscale= (int)(q + 0.5);
-    if     (qscale<qmin) qscale=qmin;
-    else if(qscale>qmax) qscale=qmax;
-    
-    if     (qscale<last_qscale-s->max_qdiff) qscale=last_qscale-s->max_qdiff;
-    else if(qscale>last_qscale+s->max_qdiff) qscale=last_qscale+s->max_qdiff;
+    fprintf(stderr, "concealing errors\n");
+
+    /* for all MBs from the current one back until the last resync marker */
+    for(; mb_y>=0 && mb_y>=s->resync_mb_y; mb_y--){
+        for(; mb_x>=0; mb_x--){
+            uint8_t *dest_y  = s->current_picture[0] + (mb_y * 16*  s->linesize      ) + mb_x * 16;
+            uint8_t *dest_cb = s->current_picture[1] + (mb_y * 8 * (s->linesize >> 1)) + mb_x * 8;
+            uint8_t *dest_cr = s->current_picture[2] + (mb_y * 8 * (s->linesize >> 1)) + mb_x * 8;
+            int mb_x_backup= s->mb_x; //FIXME pass xy to mpeg_motion
+            int mb_y_backup= s->mb_y;
+            s->mb_x=mb_x;
+            s->mb_y=mb_y;
+            if(s->mbintra_table[mb_y*s->mb_width + mb_x] && mb_dist<intra_conceal){
+                if(s->decoding_error==DECODING_AC_LOST){
+                    remove_ac(s, dest_y, dest_cb, dest_cr, mb_x, mb_y);
+//                    printf("remove ac to %d %d\n", mb_x, mb_y);
+                }else{
+                    mpeg_motion(s, dest_y, dest_cb, dest_cr, 0, 
+                                s->last_picture, 0, 0, put_pixels_tab,
+                                0/*mx*/, 0/*my*/, 16);
+                }
+            }
+            else if(!s->mbintra_table[mb_y*s->mb_width + mb_x] && mb_dist<inter_conceal){
+                int mx=0;
+                int my=0;
+
+                if(s->decoding_error!=DECODING_DESYNC){
+                    int xy= mb_x*2+1 + (mb_y*2+1)*(s->mb_width*2+2);
+                    mx= s->motion_val[ xy ][0];
+                    my= s->motion_val[ xy ][1];
+                }
 
-    s->qsum+= qscale;
-    s->qcount++;
+                mpeg_motion(s, dest_y, dest_cb, dest_cr, 0, 
+                            s->last_picture, 0, 0, put_pixels_tab,
+                            mx, my, 16);
+            }
+            s->mb_x= mb_x_backup;
+            s->mb_y= mb_y_backup;
 
-    s->last_pict_type= s->pict_type;
-//printf("q:%d diff:%d comp:%f rate_q:%d st_q:%f fvar:%d last_size:%d\n", qscale, (int)diff, br_compensation, 
-//       rate_q, short_term_q, s->mc_mb_var, s->frame_bits);
-//printf("%d %d\n", s->bit_rate, (int)fps);
-    return qscale;
-#else
-    INT64 diff, total_bits = s->total_bits;
-    float q;
-    int qscale;
-    if (s->pict_type == I_TYPE) {
-        s->wanted_bits += s->I_frame_bits;
-    } else {
-        s->wanted_bits += s->P_frame_bits;
-    }
-    diff = s->wanted_bits - total_bits;
-    q = 31.0 - (float)diff / (QSCALE_K * s->mb_height * s->mb_width);
-    /* adjust for I frame */
-    if (s->pict_type == I_TYPE && !s->intra_only) {
-        q /= I_FRAME_SIZE_RATIO;
-    }
-
-    /* using a too small Q scale leeds to problems in mpeg1 and h263
-       because AC coefficients are clamped to 255 or 127 */
-    qmin = 3;
-    if (q < qmin)
-        q = qmin;
-    else if (q > 31)
-        q = 31;
-    qscale = (int)(q + 0.5);
-#if defined(DEBUG)
-    printf("\n%d: total=%0.0f wanted=%0.0f br=%0.1f diff=%d qest=%2.1f\n", 
-           s->picture_number, 
-           (double)total_bits, 
-           (double)s->wanted_bits,
-           (float)s->frame_rate / FRAME_RATE_BASE * 
-           total_bits / s->picture_number, 
-           (int)diff, q);
-#endif
-    return qscale;
-#endif
+            if(mb_x== s->resync_mb_x && mb_y== s->resync_mb_y) return;
+            if(!s->mbskip_table[mb_x + mb_y*s->mb_width]) mb_dist++;
+        }
+        mb_x=s->mb_width-1;
+    }
 }
 
 AVCodec mpeg1video_encoder = {
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index f809a1255..2e957451b 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -1,32 +1,31 @@
 /*
  * Generic DCT based hybrid video encoder
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000, 2001, 2002 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-/* Macros for picture code type. */
-#define I_TYPE 1
-#define P_TYPE 2
-#define B_TYPE 3
-#define S_TYPE 4 //S(GMC)-VOP MPEG4
+#ifndef AVCODEC_MPEGVIDEO_H
+#define AVCODEC_MPEGVIDEO_H
+
+#define FRAME_SKIPED 100 // return value for header parsers if frame is not coded
 
 enum OutputFormat {
     FMT_MPEG1,
     FMT_H263,
-    FMT_MJPEG,
+    FMT_MJPEG, 
 };
 
 #define MPEG_BUF_SIZE (16 * 1024)
@@ -36,6 +35,11 @@ enum OutputFormat {
 
 #define MAX_FCODE 7
 #define MAX_MV 2048
+#define REORDER_BUFFER_SIZE (FF_MAX_B_FRAMES+2)
+
+#define ME_MAP_SIZE 64
+#define ME_MAP_SHIFT 3
+#define ME_MAP_MV_BITS 11
 
 typedef struct Predictor{
     double coeff;
@@ -43,6 +47,33 @@ typedef struct Predictor{
     double decay;
 } Predictor;
 
+typedef struct RateControlEntry{ /* one record of rate-control statistics; NOTE(review): presumably one per picture -- confirm in ratecontrol.c */
+    int pict_type;      /* presumably I_TYPE, P_TYPE, B_TYPE, ... like MpegEncContext.pict_type -- confirm */
+    int qscale;         /* quantizer scale (QP) */
+    int mv_bits;        /* presumably bits spent on motion vectors -- confirm */
+    int i_tex_bits;     /* presumably texture bits of intra coded MBs -- confirm */
+    int p_tex_bits;     /* presumably texture bits of inter coded MBs -- confirm */
+    int misc_bits;      /* presumably the remaining bits (cbp, mb_type, ...) like MpegEncContext.misc_bits -- confirm */
+    UINT64 expected_bits; /* NOTE(review): semantics not visible in this header; see ratecontrol.c */
+    int new_pict_type;  /* NOTE(review): presumably the picture type chosen by rate control -- confirm */
+    float new_qscale;   /* NOTE(review): presumably the qscale chosen by rate control -- confirm */
+}RateControlEntry;
+
+typedef struct RateControlContext{ /* rate-control state, embedded in MpegEncContext as rc_context */
+    FILE *stats_file;        /* stdio stream for the statistics file; NOTE(review): open mode / direction is decided outside this header */
+    int num_entries;         /* presumably the number of records in entry[] -- confirm */
+    RateControlEntry *entry; /* RateControlEntry record(s); allocation happens outside this header */
+}RateControlContext;
+
+typedef struct ReorderBuffer{ /* element type of MpegEncContext.coded_order[] */
+    UINT8 *picture[3];         /* three picture pointers, same shape as MpegEncContext.new_picture[3] */
+    int pict_type;             /* presumably I_TYPE, P_TYPE, B_TYPE, ... -- confirm */
+    int qscale;                /* quantizer scale (QP) */
+    int force_type;            /* presumably 0= no force, otherwise I_TYPE, P_TYPE, ... as for MpegEncContext.force_type -- confirm */
+    int picture_number;
+    int picture_in_gop_number; /* presumably 0-> first pic in gop, as for MpegEncContext.picture_in_gop_number -- confirm */
+} ReorderBuffer;
+
 typedef struct MpegEncContext {
     struct AVCodecContext *avctx;
     /* the following parameters must be initialized before encoding */
@@ -53,11 +84,15 @@ typedef struct MpegEncContext {
     int bit_rate;        /* wanted bit rate */
     int bit_rate_tolerance; /* amount of +- bits (>0)*/
     enum OutputFormat out_format; /* output format */
+    int h263_pred;    /* use mpeg4/h263 ac/dc predictions */
+
+/* the following codec id fields are deprecated in favor of codec_id */
     int h263_plus; /* h263 plus headers */
     int h263_rv10; /* use RV10 variation for H263 */
-    int h263_pred; /* use mpeg4/h263 ac/dc predictions */
-    int h263_msmpeg4; /* generate MSMPEG4 compatible stream */
+    int h263_msmpeg4; /* generate MSMPEG4 compatible stream (deprecated, use msmpeg4_version instead)*/
     int h263_intel; /* use I263 intel h263 header */
+    
+    int codec_id;     /* see CODEC_ID_xxx */
     int fixed_qscale; /* fixed qscale if non zero */
     float qcompress;  /* amount of qscale change between easy & hard scenes (0.0-1.0) */
     float qblur;      /* amount of qscale smoothing over time (0.0-1.0) */
@@ -66,7 +101,16 @@ typedef struct MpegEncContext {
     int max_qdiff;    /* max qscale difference between frames */
     int encoding;     /* true if we are encoding (vs decoding) */
     int flags;        /* AVCodecContext.flags (HQ, MV4, ...) */
-    int force_type;   /* 0= no force, otherwise I_TYPE, P_TYPE, ... */
+    int force_input_type;/* 0= no force, otherwise I_TYPE, P_TYPE, ... */
+    int max_b_frames; /* max number of b-frames for encoding */
+    float b_quant_factor;/* qscale factor between ips and b frames */
+    float b_quant_offset;/* qscale offset between ips and b frames */
+    int rc_strategy;
+    int b_frame_strategy;
+    int luma_elim_threshold;
+    int chroma_elim_threshold;
+    int strict_std_compliance; /* strictly follow the std (MPEG4, ...) */
+    int workaround_bugs;       /* workaround bugs in encoders which cannot be detected automatically */
     /* the following fields are managed internally by the encoder */
 
     /* bit output */
@@ -74,46 +118,71 @@ typedef struct MpegEncContext {
 
     /* sequence parameters */
     int context_initialized;
+    int input_picture_number;
+    int input_picture_in_gop_number; /* 0-> first pic in gop, ... */
     int picture_number;
     int fake_picture_number; /* picture number at the bitstream frame rate */
     int gop_picture_number;  /* index of the first picture of a GOP based on fake_pic_num & mpeg1 specific */
     int picture_in_gop_number; /* 0-> first pic in gop, ... */
-    int mb_width, mb_height;
+    int b_frames_since_non_b;  /* used for encoding, relative to not yet reordered input */
+    int mb_width, mb_height;   /* number of MBs horizontally & vertically */
     int mb_num;                /* number of MBs of a picture */
     int linesize;              /* line size, in bytes, may be different from width */
     UINT8 *new_picture[3];     /* picture to be compressed */
-    UINT8 *last_picture[3];    /* previous picture */
+    UINT8 *picture_buffer[REORDER_BUFFER_SIZE][3]; /* internal buffers used for reordering of input pictures */
+    int picture_buffer_index;
+    ReorderBuffer coded_order[REORDER_BUFFER_SIZE];
+    UINT8 *last_picture[3];      /* previous picture */
     UINT8 *last_picture_base[3]; /* real start of the picture */
-    UINT8 *next_picture[3];    /* previous picture (for bidir pred) */
+    UINT8 *next_picture[3];      /* previous picture (for bidir pred) */
     UINT8 *next_picture_base[3]; /* real start of the picture */
-    UINT8 *aux_picture[3];    /* aux picture (for B frames only) */
-    UINT8 *aux_picture_base[3]; /* real start of the picture */
-    UINT8 *current_picture[3]; /* buffer to store the decompressed current picture */
-    int last_dc[3]; /* last DC values for MPEG1 */
-    INT16 *dc_val[3]; /* used for mpeg4 DC prediction, all 3 arrays must be continuous */
+    UINT8 *aux_picture[3];       /* aux picture (for B frames only) */
+    UINT8 *aux_picture_base[3];  /* real start of the picture */
+    UINT8 *current_picture[3];   /* buffer to store the decompressed current picture */
+    int num_available_buffers;   /* 0 at the start & after seeking, 1 after the first I frame, 2 after the next I/P frame */
+    int last_dc[3];              /* last DC values for MPEG1 */
+    INT16 *dc_val[3];            /* used for mpeg4 DC prediction, all 3 arrays must be continuous */
     int y_dc_scale, c_dc_scale;
-    UINT8 *coded_block; /* used for coded block pattern prediction */
-    INT16 (*ac_val[3])[16]; /* used for for mpeg4 AC prediction, all 3 arrays must be continuous */
+    UINT8 *coded_block;          /* used for coded block pattern prediction (msmpeg4v3, wmv1)*/
+    INT16 (*ac_val[3])[16];      /* used for mpeg4 AC prediction, all 3 arrays must be continuous */
     int ac_pred;
     int mb_skiped;              /* MUST BE SET only during DECODING */
-    UINT8 *mbskip_table;        /* used to avoid copy if macroblock
-                                   skipped (for black regions for example) */
-    UINT8 *mbintra_table;            /* used to kill a few memsets */
-
-    int qscale;
-    int pict_type;
-    int last_non_b_pict_type; /* used for mpeg4 gmc b-frames */
-    int last_pict_type; /* used for bit rate stuff (needs that to update the right predictor) */
+    UINT8 *mbskip_table;        /* used to avoid copy if macroblock skipped (for black regions for example) 
+                                   and used for b-frame encoding & decoding (contains skip table of next P Frame) */
+    UINT8 *mbintra_table;       /* used to avoid setting {ac, dc, cbp}-pred stuff to zero on inter MB decoding */
+    UINT8 *cbp_table;           /* used to store cbp, ac_pred for partitioned decoding */
+    UINT8 *pred_dir_table;      /* used to store pred_dir for partitioned decoding */
+    INT8 *qscale_table;         /* used to store qscale for partitioned decoding (& postprocessing FIXME export) */
+
+    int input_qscale;           /* qscale prior to reordering of frames */
+    int input_pict_type;        /* pict_type prior to reordering of frames */
+    int force_type;             /* 0= no force, otherwise I_TYPE, P_TYPE, ... */
+    int qscale;                 /* QP */
+    int last_non_b_qscale;	/* QP of last non b frame used for b frame qscale*/
+    int pict_type;              /* I_TYPE, P_TYPE, B_TYPE, ... */
+    int last_non_b_pict_type;   /* used for mpeg4 gmc b-frames & ratecontrol */
     int frame_rate_index;
     /* motion compensation */
     int unrestricted_mv;
     int h263_long_vectors; /* use horrible h263v1 long vector mode */
 
-    int f_code; /* resolution */
-    int b_code; /* backward resolution for B Frames (mpeg4) */
-    INT16 *mv_table[2];    /* MV table (1MV per MB)*/
-    INT16 (*motion_val)[2]; /* used for MV prediction (4MV per MB)*/
-    int full_search;
+    int f_code; /* forward MV resolution */
+    int b_code; /* backward MV resolution for B Frames (mpeg4) */
+    INT16 (*motion_val)[2];            /* used for MV prediction (4MV per MB) */
+    INT16 (*p_mv_table)[2];            /* MV table (1MV per MB) p-frame encoding */
+    INT16 (*b_forw_mv_table)[2];       /* MV table (1MV per MB) forward mode b-frame encoding */
+    INT16 (*b_back_mv_table)[2];       /* MV table (1MV per MB) backward mode b-frame encoding */
+    INT16 (*b_bidir_forw_mv_table)[2]; /* MV table (1MV per MB) bidir mode b-frame encoding */
+    INT16 (*b_bidir_back_mv_table)[2]; /* MV table (1MV per MB) bidir mode b-frame encoding */
+    INT16 (*b_direct_forw_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */
+    INT16 (*b_direct_back_mv_table)[2];/* MV table (1MV per MB) direct mode b-frame encoding */
+    INT16 (*b_direct_mv_table)[2];     /* MV table (1MV per MB) direct mode b-frame encoding */
+    int me_method;                     /* ME algorithm */
+    uint8_t *me_scratchpad;            /* data area for the me algo, so that the ME doesn't need to malloc/free */
+    uint32_t *me_map;                  /* map to avoid duplicate evaluations */
+    uint16_t *me_score_map;            /* map to store the SADs */
+    int me_map_generation;
+    int skip_me;                       /* set if ME is skipped for the current MB */
     int mv_dir;
 #define MV_DIR_BACKWARD  1
 #define MV_DIR_FORWARD   2
@@ -131,62 +200,76 @@ typedef struct MpegEncContext {
     */
     int mv[2][4][2];
     int field_select[2][2];
-    int last_mv[2][2][2];
+    int last_mv[2][2][2];             /* last MV, used for MV prediction in MPEG1 & B-frame MPEG4 */
     UINT16 (*mv_penalty)[MAX_MV*2+1]; /* amount of bits needed to encode a MV, used for ME */
     UINT8 *fcode_tab; /* smallest fcode needed for each MV */
 
     int has_b_frames;
-    int no_rounding; /* apply no rounding to motion estimation (MPEG4) */
+    int no_rounding; /* apply no rounding to motion compensation (MPEG4, msmpeg4, ...) 
+                        for b-frames rounding mode is always 0 */
+
+    int hurry_up;     /* when set to 1 during decoding, b frames will be skipped
+                         when set to 2 idct/dequant will be skipped too */
 
     /* macroblock layer */
     int mb_x, mb_y;
     int mb_incr;
     int mb_intra;
-    UINT16 *mb_var;    /* Table for MB variances */
-    UINT8 *mb_type;    /* Table for MB type */
+    UINT16 *mb_var;       /* Table for MB variances */
+    UINT16 *mc_mb_var;    /* Table for motion compensated MB variances */
+    UINT8 *mb_type;       /* Table for MB type */
 #define MB_TYPE_INTRA    0x01
 #define MB_TYPE_INTER    0x02
 #define MB_TYPE_INTER4V  0x04
 #define MB_TYPE_SKIPED   0x08
+#define MB_TYPE_GMC      0x10
+
 #define MB_TYPE_DIRECT   0x10
 #define MB_TYPE_FORWARD  0x20
-#define MB_TYPE_BACKWAD  0x40
+#define MB_TYPE_BACKWARD 0x40
 #define MB_TYPE_BIDIR    0x80
 
-    int block_index[6];
+    int block_index[6]; /* index to current MB in block based arrays with edges*/
     int block_wrap[6];
 
     /* matrix transmitted in the bitstream */
     UINT16 intra_matrix[64];
     UINT16 chroma_intra_matrix[64];
-    UINT16 non_intra_matrix[64];
-    UINT16 chroma_non_intra_matrix[64];
+    UINT16 inter_matrix[64];
+    UINT16 chroma_inter_matrix[64];
+#define QUANT_BIAS_SHIFT 4
+    int intra_quant_bias;    /* bias for the quantizer */
+    int inter_quant_bias;    /* bias for the quantizer */
+    int min_qcoeff;          /* minimum encodable coefficient */
+    int max_qcoeff;          /* maximum encodable coefficient */
     /* precomputed matrix (combine qscale and DCT renorm) */
-    int q_intra_matrix[64];
-    int q_non_intra_matrix[64];
+    int q_intra_matrix[32][64];
+    int q_inter_matrix[32][64];
     /* identical to the above but for MMX & these are not permutated */
-    UINT16 __align8 q_intra_matrix16[64] ;
-    UINT16 __align8 q_non_intra_matrix16[64];
+    UINT16 __align8 q_intra_matrix16[32][64];
+    UINT16 __align8 q_inter_matrix16[32][64];
+    UINT16 __align8 q_intra_matrix16_bias[32][64];
+    UINT16 __align8 q_inter_matrix16_bias[32][64];
     int block_last_index[6];  /* last non zero coefficient in block */
 
     void *opaque; /* private data for the user */
 
     /* bit rate control */
-    int I_frame_bits;    /* wanted number of bits per I frame */
-    int P_frame_bits;    /* same for P frame */
-    int avg_mb_var;        /* average MB variance for current frame */
-    int mc_mb_var;     /* motion compensated MB variance for current frame */
-    int last_mc_mb_var;     /* motion compensated MB variance for last frame */
+    int I_frame_bits; //FIXME used in mpeg12 ...
+    int mb_var_sum;          /* sum of MB variance for current frame */
+    int mc_mb_var_sum;       /* motion compensated MB variance for current frame */
+    int last_non_b_mc_mb_var;/* motion compensated MB variance for last non b frame */
     INT64 wanted_bits;
     INT64 total_bits;
-    int frame_bits;      /* bits used for the current frame */
-    int last_frame_bits; /* bits used for the last frame */
+    int frame_bits;        /* bits used for the current frame */
+    int pb_frame_bits;     /* bits of the last b...bp group */
     Predictor i_pred;
     Predictor p_pred;
     double qsum;         /* sum of qscales */
     double qcount;       /* count of qscales */
     double short_term_qsum;   /* sum of recent qscales */
     double short_term_qcount; /* count of recent qscales */
+    RateControlContext rc_context;
 
     /* statistics, used for 2-pass encoding */
     int mv_bits;
@@ -198,11 +281,24 @@ typedef struct MpegEncContext {
     int skip_count;
     int misc_bits; // cbp, mb_type
     int last_bits; //temp var used for calculating the above vars
+    
+    /* error concealment / resync */
+    int resync_mb_x;                 /* x position of last resync marker */
+    int resync_mb_y;                 /* y position of last resync marker */
+    int mb_num_left;                 /* number of MBs left in this video packet */
+    GetBitContext next_resync_gb;    /* starts at the next resync marker */
+    int next_resync_qscale;          /* qscale of next resync marker */
+    int next_resync_pos;             /* bitstream position of next resync marker */
+#define DECODING_AC_LOST -1
+#define DECODING_ACDC_LOST -2
+#define DECODING_DESYNC -3
+    int decoding_error;
+    int next_p_frame_damaged;        /* set if the next p frame is damaged, to avoid showing trashed b frames */
+    int error_resilience;
 
     /* H.263 specific */
     int gob_number;
     int gob_index;
-    int first_gob_line;
         
     /* H.263+ specific */
     int umvplus;
@@ -212,11 +308,13 @@ typedef struct MpegEncContext {
     
     /* mpeg4 specific */
     int time_increment_resolution;
-    int time_increment_bits;
-    int time_increment;
-    int time_base;
-    int time;
-    int last_non_b_time[2];
+    int time_increment_bits;        /* number of bits to represent the fractional part of time */
+    int last_time_base;
+    int time_base;                  /* time in seconds of last I,P,S Frame */
+    INT64 time;                   /* time of current frame */ 
+    INT64 last_non_b_time;
+    UINT16 pp_time;               /* time distance between the last 2 p,s,i frames */
+    UINT16 bp_time;               /* time distance between the last b and p,s,i frame */
     int shape;
     int vol_sprite_usage;
     int sprite_width;
@@ -231,21 +329,32 @@ typedef struct MpegEncContext {
     int sprite_shift[2][2];
     int mcsel;
     int quant_precision;
-    int quarter_sample;
+    int quarter_sample;              /* 1->qpel, 0->half pel ME/MC */ 
     int scalability;
     int new_pred;
     int reduced_res_vop;
     int aspect_ratio_info;
     int sprite_warping_accuracy;
     int low_latency_sprite;
-    int data_partioning;
-    int resync_marker;
-    int resync_x_pos;
+    int data_partitioning;
+    int rvlc;                        /* reversible vlc */
+    int resync_marker;               /* could this stream contain resync markers*/
+    int low_delay;                   /* no reordering needed / has no b-frames */
+    int vo_type;
+    int vol_control_parameters;      /* does the stream contain the low_delay flag, used to workaround buggy encoders */
+    PutBitContext tex_pb;            /* used for data partitioned VOPs */
+    PutBitContext pb2;               /* used for data partitioned VOPs */
+#define PB_BUFFER_SIZE (1024*256) /* parenthesized so the macro expands as one value inside larger expressions (e.g. x / PB_BUFFER_SIZE) */
+    uint8_t *tex_pb_buffer;          
+    uint8_t *pb2_buffer;
 
     /* divx specific, used to workaround (many) bugs in divx5 */
     int divx_version;
     int divx_build;
-
+#define BITSTREAM_BUFFER_SIZE (1024*256) /* parenthesized so the macro expands as one value inside larger expressions */
+    UINT8 *bitstream_buffer; //Divx 5.01 puts several frames in a single one, this is used to reorder them
+    int bitstream_buffer_size;
+    
     /* RV10 specific */
     int rv10_version; /* RV10 version: 0 or 3 */
     int rv10_first_dc_coded[3];
@@ -256,6 +365,7 @@ typedef struct MpegEncContext {
     int mjpeg_hsample[3]; /* horizontal sampling factors, default = {2, 1, 1} */
     int mjpeg_write_tables; /* do we want to have quantisation- and
 			       huffmantables in the jpeg file ? */
+    int mjpeg_data_only_frames; /* frames only with SOI, SOS and EOI markers */
 
     /* MSMPEG4 specific */
     int mv_table_index;
@@ -266,8 +376,7 @@ typedef struct MpegEncContext {
     int slice_height;      /* in macroblocks */
     int first_slice_line;  /* used in mpeg4 too to handle resync markers */
     int flipflop_rounding;
-    int bitrate;
-    int msmpeg4_version;   /* 1=mp41, 2=mp42, 3=mp43/divx3 */
+    int msmpeg4_version;   /* 0=not msmpeg4, 1=mp41, 2=mp42, 3=mp43/divx3 */
     /* decompression specific */
     GetBitContext gb;
 
@@ -306,10 +415,14 @@ typedef struct MpegEncContext {
     UINT32 mb_line_avgsize;
     
     DCTELEM (*block)[64]; /* points to one of the following blocks */
-    DCTELEM intra_block[6][64] __align8;
-    DCTELEM inter_block[6][64] __align8;
-    DCTELEM inter4v_block[6][64] __align8;
-    void (*dct_unquantize)(struct MpegEncContext *s, 
+    DCTELEM blocks[2][6][64] __align8; // for HQ mode we need to keep the best block
+    void (*dct_unquantize_mpeg1)(struct MpegEncContext *s, 
+                           DCTELEM *block, int n, int qscale);
+    void (*dct_unquantize_mpeg2)(struct MpegEncContext *s, 
+                           DCTELEM *block, int n, int qscale);
+    void (*dct_unquantize_h263)(struct MpegEncContext *s, 
+                           DCTELEM *block, int n, int qscale);
+    void (*dct_unquantize)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both)
                            DCTELEM *block, int n, int qscale);
 } MpegEncContext;
 
@@ -321,11 +434,20 @@ void MPV_frame_end(MpegEncContext *s);
 #ifdef HAVE_MMX
 void MPV_common_init_mmx(MpegEncContext *s);
 #endif
+extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
+extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
+void ff_conceal_past_errors(MpegEncContext *s, int conceal_all);
+void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length);
+void ff_clean_intra_table_entries(MpegEncContext *s);
 
 /* motion_est.c */
-
-void estimate_motion(MpegEncContext *s, 
-                    int mb_x, int mb_y);
+void ff_estimate_p_frame_motion(MpegEncContext * s,
+                             int mb_x, int mb_y);
+void ff_estimate_b_frame_motion(MpegEncContext * s,
+                             int mb_x, int mb_y);
+int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type);
+void ff_fix_long_p_mvs(MpegEncContext * s);
+void ff_fix_long_b_mvs(MpegEncContext * s, int16_t (*mv_table)[2], int f_code, int type);
 
 /* mpeg12.c */
 extern INT16 default_intra_matrix[64];
@@ -382,6 +504,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block,
                         int *px, int *py);
 void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, 
                    int dir);
+void ff_set_mpeg4_time(MpegEncContext * s, int picture_number);
 void mpeg4_encode_picture_header(MpegEncContext *s, int picture_number);
 void h263_encode_init(MpegEncContext *s);
 
@@ -393,6 +516,13 @@ int intel_h263_decode_picture_header(MpegEncContext *s);
 int h263_decode_mb(MpegEncContext *s,
                    DCTELEM block[6][64]);
 int h263_get_picture_format(int width, int height);
+int ff_mpeg4_decode_video_packet_header(MpegEncContext *s);
+int ff_mpeg4_resync(MpegEncContext *s);
+void ff_mpeg4_encode_video_packet_header(MpegEncContext *s);
+void ff_mpeg4_clean_buffers(MpegEncContext *s);
+void ff_mpeg4_stuffing(PutBitContext * pbc);
+void ff_mpeg4_init_partitions(MpegEncContext *s);
+void ff_mpeg4_merge_partitions(MpegEncContext *s);
 
 /* rv10.c */
 void rv10_encode_picture_header(MpegEncContext *s, int picture_number);
@@ -404,12 +534,12 @@ void msmpeg4_encode_ext_header(MpegEncContext * s);
 void msmpeg4_encode_mb(MpegEncContext * s, 
                        DCTELEM block[6][64],
                        int motion_x, int motion_y);
-void msmpeg4_dc_scale(MpegEncContext * s);
 int msmpeg4_decode_picture_header(MpegEncContext * s);
 int msmpeg4_decode_ext_header(MpegEncContext * s, int buf_size);
 int msmpeg4_decode_mb(MpegEncContext *s, 
                       DCTELEM block[6][64]);
 int msmpeg4_decode_init_vlc(MpegEncContext *s);
+void ff_old_msmpeg4_dc_scale(MpegEncContext *s);
 
 /* mjpegenc.c */
 
@@ -419,3 +549,12 @@ void mjpeg_encode_mb(MpegEncContext *s,
                      DCTELEM block[6][64]);
 void mjpeg_picture_header(MpegEncContext *s);
 void mjpeg_picture_trailer(MpegEncContext *s);
+
+/* rate control */
+int ff_rate_control_init(MpegEncContext *s);
+int ff_rate_estimate_qscale(MpegEncContext *s);
+int ff_rate_estimate_qscale_pass2(MpegEncContext *s);
+void ff_write_pass1_stats(MpegEncContext *s);
+void ff_rate_control_uninit(MpegEncContext *s);
+
+#endif /* AVCODEC_MPEGVIDEO_H */
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 66fc5255e..629c74497 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -1,27 +1,27 @@
 /*
  * MSMPEG4 backend for ffmpeg encoder and decoder
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * msmpeg4v1 & v2 stuff by Michael Niedermayer <michaelni@gmx.at>
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include "common.h"
+#include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
-#include "avcodec.h"
+
 
 /*
  * You can also call this codec : MPEG4 with a twist ! 
@@ -53,7 +53,7 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr);
 static int msmpeg4_decode_motion(MpegEncContext * s, 
                                  int *mx_ptr, int *my_ptr);
 static void msmpeg4v2_encode_motion(MpegEncContext * s, int val);
-static void init_h263_dc_for_msmpeg4();
+static void init_h263_dc_for_msmpeg4(void);
 
 
 extern UINT32 inverse[256];
@@ -137,7 +137,7 @@ static void init_mv_table(MVTable *tab)
 {
     int i, x, y;
 
-    tab->table_mv_index = malloc(sizeof(UINT16) * 4096);
+    tab->table_mv_index = av_malloc(sizeof(UINT16) * 4096);
     /* mark all entries as not used */
     for(i=0;i<4096;i++)
         tab->table_mv_index[i] = tab->n;
@@ -159,7 +159,7 @@ static void code012(PutBitContext *pb, int n)
     }
 }
 
-/* write MSMPEG4 V3 compatible frame header */
+/* write MSMPEG4 compatible frame header */
 void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 {
     int i;
@@ -171,7 +171,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     put_bits(&s->pb, 5, s->qscale);
 
     s->rl_table_index = 2;
-    if(s->msmpeg4_version==2)
+    if(s->msmpeg4_version<=2)
         s->rl_chroma_table_index = 2; /* only for I frame */
     else
         s->rl_chroma_table_index = 1; /* only for I frame */
@@ -183,7 +183,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
     if (s->pict_type == I_TYPE) {
         put_bits(&s->pb, 5, 0x17); /* indicate only one "slice" */
 
-        if(s->msmpeg4_version!=2){
+        if(s->msmpeg4_version>2){
             code012(&s->pb, s->rl_chroma_table_index);
             code012(&s->pb, s->rl_table_index);
 
@@ -194,7 +194,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
         put_bits(&s->pb, 1, s->use_skip_mb_code);
         
         s->rl_chroma_table_index = s->rl_table_index;
-        if(s->msmpeg4_version!=2){
+        if(s->msmpeg4_version>2){
             code012(&s->pb, s->rl_table_index);
 
             put_bits(&s->pb, 1, s->dc_table_index);
@@ -228,14 +228,16 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
 void msmpeg4_encode_ext_header(MpegEncContext * s)
 {
-        s->flipflop_rounding=1;
-        s->bitrate= 910; // FIXME
-
         put_bits(&s->pb, 5, s->frame_rate / FRAME_RATE_BASE); //yes 29.97 -> 29
 
-	put_bits(&s->pb, 11, s->bitrate);
+	put_bits(&s->pb, 11, MIN(s->bit_rate/1024, 2047)); /* 11 bit field: scale before clamping, otherwise any realistic rate saturates at 2047 (assumes s->bit_rate is in bit/s -- confirm) */
 
-	put_bits(&s->pb, 1, s->flipflop_rounding);
+        if(s->msmpeg4_version<3) /* versions < 3: no flipflop bit is written, rounding flag forced to 0 */
+            s->flipflop_rounding=0;
+        else{
+            s->flipflop_rounding=1;
+            put_bits(&s->pb, 1, s->flipflop_rounding);
+        }
 }
 
 /* predict coded block */
@@ -328,7 +330,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
         if (s->use_skip_mb_code)
             put_bits(&s->pb, 1, 0);	/* mb coded */
         
-        if(s->msmpeg4_version==2){
+        if(s->msmpeg4_version<=2){
             put_bits(&s->pb, 
                      v2_mb_type[cbp&3][1], 
                      v2_mb_type[cbp&3][0]);
@@ -373,7 +375,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
             printf("cbp=%x %x\n", cbp, coded_cbp);
 #endif
 
-        if(s->msmpeg4_version==2){
+        if(s->msmpeg4_version<=2){
             if (s->pict_type == I_TYPE) {
                 put_bits(&s->pb, 
                          v2_intra_cbpc[cbp&3][1], v2_intra_cbpc[cbp&3][0]);
@@ -410,11 +412,10 @@ void msmpeg4_encode_mb(MpegEncContext * s,
     }
 }
 
-
-/* strongly inspirated from MPEG4, but not exactly the same ! */
-void msmpeg4_dc_scale(MpegEncContext * s)
+/* old ffmpeg msmpeg4v3 mode */
+void ff_old_msmpeg4_dc_scale(MpegEncContext * s)
 {
-    if (s->qscale < 5 || s->msmpeg4_version==2){
+    if (s->qscale < 5){
         s->y_dc_scale = 8;
         s->c_dc_scale = 8;
     }else if (s->qscale < 9){
@@ -426,6 +427,21 @@ void msmpeg4_dc_scale(MpegEncContext * s)
     }
 }
 
+/* msmpeg4v1 DC prediction.
+ * The predictor is the value currently stored in the s->last_dc[] slot the
+ * block maps to (slot 0 for n < 4, slot n-3 otherwise).  *dc_val_ptr is set
+ * to that slot so that the caller can store the new DC value in it. */
+static int msmpeg4v1_pred_dc(MpegEncContext * s, int n,
+                           INT32 **dc_val_ptr)
+{
+    /* index into s->last_dc[] */
+    const int slot= (n < 4) ? 0 : n-3;
+
+    *dc_val_ptr= &s->last_dc[slot];
+
+    return s->last_dc[slot];
+}
+
 /* dir = 0: left, dir = 1: top prediction */
 static int msmpeg4_pred_dc(MpegEncContext * s, int n, 
                            INT16 **dc_val_ptr, int *dir_ptr)
@@ -439,6 +455,7 @@ static int msmpeg4_pred_dc(MpegEncContext * s, int n,
     } else {
 	scale = s->c_dc_scale;
     }
+    
     wrap = s->block_wrap[n];
     dc_val= s->dc_val[0] + s->block_index[n];
 
@@ -508,21 +525,29 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
 {
     int sign, code;
     int pred;
-    INT16 *dc_val;
 
-    pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
+    if(s->msmpeg4_version==1){
+        INT32 *dc_val;
+        pred = msmpeg4v1_pred_dc(s, n, &dc_val);
+        
+        /* update predictor */
+        *dc_val= level;
+    }else{
+        INT16 *dc_val;
+        pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
 
-    /* update predictor */
-    if (n < 4) {
-        *dc_val = level * s->y_dc_scale;
-    } else {
-        *dc_val = level * s->c_dc_scale;
+        /* update predictor */
+        if (n < 4) {
+            *dc_val = level * s->y_dc_scale;
+        } else {
+            *dc_val = level * s->c_dc_scale;
+        }
     }
 
     /* do the prediction */
     level -= pred;
 
-    if(s->msmpeg4_version==2){
+    if(s->msmpeg4_version<=2){
         if (n < 4) {
             put_bits(&s->pb, 
                      v2_dc_lum_table[level+256][1],
@@ -589,7 +614,7 @@ static void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
     } else {
         i = 0;
         rl = &rl_table[3 + s->rl_table_index];
-        if(s->msmpeg4_version==2)
+        if(s->msmpeg4_version<=2)
             run_diff = 0;
         else
             run_diff = 1;
@@ -669,9 +694,11 @@ static VLC cbpy_vlc;
 static VLC v2_intra_cbpc_vlc;
 static VLC v2_mb_type_vlc;
 static VLC v2_mv_vlc;
+static VLC v1_intra_cbpc_vlc;
+static VLC v1_inter_cbpc_vlc;
 
 /* this table is practically identical to the one from h263 except that its inverted */
-static void init_h263_dc_for_msmpeg4()
+static void init_h263_dc_for_msmpeg4(void)
 {
     static int inited=0;
     
@@ -733,60 +760,73 @@ static void init_h263_dc_for_msmpeg4()
 /* init all vlc decoding tables */
 int msmpeg4_decode_init_vlc(MpegEncContext *s)
 {
+    static int done = 0; /* nonzero once the VLC tables below have been built */
     int i;
     MVTable *mv;
 
-    for(i=0;i<NB_RL_TABLES;i++) {
-        init_rl(&rl_table[i]);
-        init_vlc_rl(&rl_table[i]);
-    }
-    for(i=0;i<2;i++) {
-        mv = &mv_tables[i];
-        init_vlc(&mv->vlc, 9, mv->n + 1, 
-                 mv->table_mv_bits, 1, 1,
-                 mv->table_mv_code, 2, 2);
-    }
 
-    init_vlc(&dc_lum_vlc[0], 9, 120, 
-             &table0_dc_lum[0][1], 8, 4,
-             &table0_dc_lum[0][0], 8, 4);
-    init_vlc(&dc_chroma_vlc[0], 9, 120, 
-             &table0_dc_chroma[0][1], 8, 4,
-             &table0_dc_chroma[0][0], 8, 4);
-    init_vlc(&dc_lum_vlc[1], 9, 120, 
-             &table1_dc_lum[0][1], 8, 4,
-             &table1_dc_lum[0][0], 8, 4);
-    init_vlc(&dc_chroma_vlc[1], 9, 120, 
-             &table1_dc_chroma[0][1], 8, 4,
-             &table1_dc_chroma[0][0], 8, 4);
+    if (!done) { /* only the first call performs the initialisation */
+        done = 1;
+
+        for(i=0;i<NB_RL_TABLES;i++) {
+            init_rl(&rl_table[i]);
+            init_vlc_rl(&rl_table[i]);
+        }
+        for(i=0;i<2;i++) {
+            mv = &mv_tables[i];
+            init_vlc(&mv->vlc, 9, mv->n + 1, 
+                     mv->table_mv_bits, 1, 1,
+                     mv->table_mv_code, 2, 2);
+        }
+
+        init_vlc(&dc_lum_vlc[0], 9, 120, 
+                 &table0_dc_lum[0][1], 8, 4,
+                 &table0_dc_lum[0][0], 8, 4);
+        init_vlc(&dc_chroma_vlc[0], 9, 120, 
+                 &table0_dc_chroma[0][1], 8, 4,
+                 &table0_dc_chroma[0][0], 8, 4);
+        init_vlc(&dc_lum_vlc[1], 9, 120, 
+                 &table1_dc_lum[0][1], 8, 4,
+                 &table1_dc_lum[0][0], 8, 4);
+        init_vlc(&dc_chroma_vlc[1], 9, 120, 
+                 &table1_dc_chroma[0][1], 8, 4,
+                 &table1_dc_chroma[0][0], 8, 4);
     
-    init_h263_dc_for_msmpeg4();
-    init_vlc(&v2_dc_lum_vlc, 9, 512, 
-             &v2_dc_lum_table[0][1], 8, 4,
-             &v2_dc_lum_table[0][0], 8, 4);
-    init_vlc(&v2_dc_chroma_vlc, 9, 512, 
-             &v2_dc_chroma_table[0][1], 8, 4,
-             &v2_dc_chroma_table[0][0], 8, 4);
+        init_h263_dc_for_msmpeg4();
+        init_vlc(&v2_dc_lum_vlc, 9, 512, 
+                 &v2_dc_lum_table[0][1], 8, 4,
+                 &v2_dc_lum_table[0][0], 8, 4);
+        init_vlc(&v2_dc_chroma_vlc, 9, 512, 
+                 &v2_dc_chroma_table[0][1], 8, 4,
+                 &v2_dc_chroma_table[0][0], 8, 4);
     
-    init_vlc(&cbpy_vlc, 6, 16,
-             &cbpy_tab[0][1], 2, 1,
-             &cbpy_tab[0][0], 2, 1);
-    init_vlc(&v2_intra_cbpc_vlc, 3, 4,
-             &v2_intra_cbpc[0][1], 2, 1,
-             &v2_intra_cbpc[0][0], 2, 1);
-    init_vlc(&v2_mb_type_vlc, 5, 8,
-             &v2_mb_type[0][1], 2, 1,
-             &v2_mb_type[0][0], 2, 1);
-    init_vlc(&v2_mv_vlc, 9, 33,
-             &mvtab[0][1], 2, 1,
-             &mvtab[0][0], 2, 1);
-
-    init_vlc(&mb_non_intra_vlc, 9, 128, 
-             &table_mb_non_intra[0][1], 8, 4,
-             &table_mb_non_intra[0][0], 8, 4);
-    init_vlc(&mb_intra_vlc, 9, 64, 
-             &table_mb_intra[0][1], 4, 2,
-             &table_mb_intra[0][0], 4, 2);
+        init_vlc(&cbpy_vlc, 6, 16,
+                 &cbpy_tab[0][1], 2, 1,
+                 &cbpy_tab[0][0], 2, 1);
+        init_vlc(&v2_intra_cbpc_vlc, 3, 4,
+                 &v2_intra_cbpc[0][1], 2, 1,
+                 &v2_intra_cbpc[0][0], 2, 1);
+        init_vlc(&v2_mb_type_vlc, 5, 8,
+                 &v2_mb_type[0][1], 2, 1,
+                 &v2_mb_type[0][0], 2, 1);
+        init_vlc(&v2_mv_vlc, 9, 33,
+                 &mvtab[0][1], 2, 1,
+                 &mvtab[0][0], 2, 1);
+
+        init_vlc(&mb_non_intra_vlc, 9, 128, 
+                 &table_mb_non_intra[0][1], 8, 4,
+                 &table_mb_non_intra[0][0], 8, 4);
+        init_vlc(&mb_intra_vlc, 9, 64, 
+                 &table_mb_intra[0][1], 4, 2,
+                 &table_mb_intra[0][0], 4, 2);
+        
+        init_vlc(&v1_intra_cbpc_vlc, 6, 8, 
+                 intra_MCBPC_bits, 1, 1,
+                 intra_MCBPC_code, 1, 1); /* cbpc table read for intra pictures when msmpeg4_version is not 2; 8 codes */
+        init_vlc(&v1_inter_cbpc_vlc, 6, 25, 
+                 inter_MCBPC_bits, 1, 1,
+                 inter_MCBPC_code, 1, 1); /* cbpc table read for P pictures when msmpeg4_version is not 2; 25 codes */
+    }
     return 0;
 }
 
@@ -802,31 +842,84 @@ static int decode012(GetBitContext *gb)
 
 int msmpeg4_decode_picture_header(MpegEncContext * s)
 {
-    int code;
+    int code, code2;
+
+#if 0
+{
+int i;
+for(i=0; i<s->gb.size*8; i++)
+    printf("%d", get_bits1(&s->gb));
+//    get_bits1(&s->gb);
+printf("END\n");
+return -1;
+}
+#endif
+
+    if(s->msmpeg4_version==1){
+        int start_code, num;
+        start_code = (get_bits(&s->gb, 16)<<16) | get_bits(&s->gb, 16);
+        if(start_code!=0x00000100){
+            fprintf(stderr, "invalid startcode\n");
+            return -1;
+        }
+
+        num= get_bits(&s->gb, 5); /* frame number */
+    }
 
     s->pict_type = get_bits(&s->gb, 2) + 1;
     if (s->pict_type != I_TYPE &&
-        s->pict_type != P_TYPE)
+        s->pict_type != P_TYPE){
+        fprintf(stderr, "invalid picture type\n");
         return -1;
+    }
 
     s->qscale = get_bits(&s->gb, 5);
 
     if (s->pict_type == I_TYPE) {
         code = get_bits(&s->gb, 5); 
-        /* 0x17: one slice, 0x18: two slices */
-        if (code < 0x17)
-            return -1;
-        s->slice_height = s->mb_height / (code - 0x16);
-        if(s->msmpeg4_version==2){
+        if(s->msmpeg4_version==1){
+            if(code==0 || code>s->mb_height){
+                fprintf(stderr, "invalid slice height %d\n", code);
+                return -1;
+            }
+
+            s->slice_height = code;
+        }else{
+            /* 0x17: one slice, 0x18: two slices, ... */
+            if (code < 0x17)
+                return -1;
+
+            s->slice_height = s->mb_height / (code - 0x16);
+        }
+
+        switch(s->msmpeg4_version){
+        case 1:
+        case 2:
             s->rl_chroma_table_index = 2;
             s->rl_table_index = 2;
 
             s->dc_table_index = 0; //not used
-        }else{
+            break;
+        case 3:
             s->rl_chroma_table_index = decode012(&s->gb);
             s->rl_table_index = decode012(&s->gb);
 
             s->dc_table_index = get_bits1(&s->gb);
+            break;
+        case 4:
+            msmpeg4_decode_ext_header(s, 999 /* buffer size (useless here) */);
+            printf("%X\n", show_bits(&s->gb, 24));
+            code= get_bits(&s->gb, 2);
+            if(code==1){
+                code2= get_bits(&s->gb, 3);
+                if(code2==7) skip_bits(&s->gb, 1);
+            }
+            printf("%X\n", show_bits(&s->gb, 24));
+            s->rl_chroma_table_index = 2;
+            s->rl_table_index = 2;
+
+            s->dc_table_index = 0;
+            break;
         }
         s->no_rounding = 1;
 /*	printf(" %d %d %d %d     \n", 
@@ -835,22 +928,28 @@ int msmpeg4_decode_picture_header(MpegEncContext * s)
 		s->rl_table_index, 
 		s->dc_table_index);*/
     } else {
-        s->use_skip_mb_code = get_bits1(&s->gb);
         
-        if(s->msmpeg4_version==2){
+        switch(s->msmpeg4_version){
+        case 1:
+        case 2:
+            if(s->msmpeg4_version==1)
+                s->use_skip_mb_code = 1;
+            else
+                s->use_skip_mb_code = get_bits1(&s->gb);
             s->rl_table_index = 2;
             s->rl_chroma_table_index = s->rl_table_index;
-
             s->dc_table_index = 0; //not used
-
             s->mv_table_index = 0;
-        }else{
+            break;
+        case 3:
+            s->use_skip_mb_code = get_bits1(&s->gb);
             s->rl_table_index = decode012(&s->gb);
             s->rl_chroma_table_index = s->rl_table_index;
 
             s->dc_table_index = get_bits1(&s->gb);
 
             s->mv_table_index = get_bits1(&s->gb);
+            break;
         }
 /*	printf(" %d %d %d %d %d     \n", 
 		s->use_skip_mb_code, 
@@ -864,6 +963,7 @@ int msmpeg4_decode_picture_header(MpegEncContext * s)
 	    s->no_rounding = 0;
 	}
 //	printf("%d", s->no_rounding);
+//return -1;
     }
     
 #if 0
@@ -886,27 +986,36 @@ return -1;
 
 int msmpeg4_decode_ext_header(MpegEncContext * s, int buf_size)
 {
+    int left= buf_size*8 - get_bits_count(&s->gb); /* buf_size*8 minus the reader's current bit count */
+    int length= s->msmpeg4_version>=3 ? 17 : 16; /* 5 + 11 bits, plus 1 flipflop bit for version>=3 */
     /* the alt_bitstream reader could read over the end so we need to check it */
-    if(get_bits_count(&s->gb) + 16 < buf_size*8)
+    if(left>=length && left<length+8) /* header fits and fewer than 8 surplus bits follow it */
     {
         int fps;
 
         fps= get_bits(&s->gb, 5);
-        s->bitrate= get_bits(&s->gb, 11);
-        s->flipflop_rounding= get_bits1(&s->gb);
+        s->bit_rate= get_bits(&s->gb, 11);
+        if(s->msmpeg4_version>=3)
+            s->flipflop_rounding= get_bits1(&s->gb);
+        else
+            s->flipflop_rounding= 0;
 
-//        printf("fps:%2d bps:%2d roundingType:%1d\n", fps, s->bitrate, s->flipflop_rounding);
+//        printf("fps:%2d bps:%2d roundingType:%1d\n", fps, s->bit_rate, s->flipflop_rounding);
     }
-    else
+    else if(left<length+8) /* reached only when left<length: not enough bits for the header */
     {
         s->flipflop_rounding= 0;
-        s->bitrate= 0;
+        fprintf(stderr, "ext header missing, %d left\n", left); /* stderr, like the other diagnostics here */
+    }
+    else /* 8 or more surplus bits: nothing is read and no field is changed */
+    {
+        fprintf(stderr, "I frame too long, ignoring ext header\n");
     }
 
     return 0;
 }
 
-static inline void memsetw(short *tab, int val, int n)
+static inline void msmpeg4_memsetw(short *tab, int val, int n)
 {
     int i;
     for(i=0;i<n;i++)
@@ -952,6 +1061,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
     int code, val, sign, shift;
 
     code = get_vlc(&s->gb, &v2_mv_vlc);
+//     printf("MV code %d at %d %d pred: %d\n", code, s->mb_x,s->mb_y, pred);
     if (code < 0)
         return 0xffff;
 
@@ -965,8 +1075,8 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
     val++;
     if (sign)
         val = -val;
-    val += pred;
 
+    val += pred;
     if (val <= -64)
         val += 64;
     else if (val >= 64)
@@ -976,7 +1086,7 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
 }
 
 
-int msmpeg4v2_decode_mb(MpegEncContext *s, 
+static int msmpeg4v12_decode_mb(MpegEncContext *s, 
                       DCTELEM block[6][64])
 {
     int cbp, code, i;
@@ -996,20 +1106,41 @@ int msmpeg4v2_decode_mb(MpegEncContext *s,
             }
         }
 
-        code = get_vlc(&s->gb, &v2_mb_type_vlc);
+        if(s->msmpeg4_version==2)
+            code = get_vlc(&s->gb, &v2_mb_type_vlc);
+        else
+            code = get_vlc(&s->gb, &v1_inter_cbpc_vlc);
+        if(code<0 || code>7){
+            fprintf(stderr, "cbpc %d invalid at %d %d\n", code, s->mb_x, s->mb_y);
+            return -1;
+        }
+
         s->mb_intra = code >>2;
     
         cbp = code & 0x3;
     } else {
         s->mb_intra = 1;
-        cbp= get_vlc(&s->gb, &v2_intra_cbpc_vlc);
+        if(s->msmpeg4_version==2)
+            cbp= get_vlc(&s->gb, &v2_intra_cbpc_vlc);
+        else
+            cbp= get_vlc(&s->gb, &v1_intra_cbpc_vlc);
+        if(cbp<0 || cbp>3){
+            fprintf(stderr, "cbpc %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y);
+            return -1;
+        }
     }
 
     if (!s->mb_intra) {
-        int mx, my;
+        int mx, my, cbpy;
+        
+        cbpy= get_vlc(&s->gb, &cbpy_vlc);
+        if(cbpy<0){ /* a negative get_vlc result is treated as a bitstream error */
+            fprintf(stderr, "cbpy %d invalid at %d %d\n", cbpy, s->mb_x, s->mb_y);
+            return -1;
+        }
 
-        cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2;
-        if((cbp&3) != 3) cbp^= 0x3C;
+        cbp|= cbpy<<2;
+        if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C;
         
         h263_pred_motion(s, 0, &mx, &my);
         mx= msmpeg4v2_decode_motion(s, mx, 1);
@@ -1020,14 +1151,20 @@ int msmpeg4v2_decode_mb(MpegEncContext *s,
         s->mv[0][0][0] = mx;
         s->mv[0][0][1] = my;
     } else {
-        s->ac_pred = get_bits1(&s->gb);
-        cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2;
+        if(s->msmpeg4_version==2){
+            s->ac_pred = get_bits1(&s->gb);
+            cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; //FIXME check errors
+        } else{
+            s->ac_pred = 0;
+            cbp|= get_vlc(&s->gb, &cbpy_vlc)<<2; //FIXME check errors
+            if(s->pict_type==P_TYPE) cbp^=0x3C;
+        }
     }
 
     for (i = 0; i < 6; i++) {
         if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0)
 	{
-             fprintf(stderr,"\nIgnoring error while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
+             fprintf(stderr,"\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
              return -1;
 	}
     }
@@ -1046,23 +1183,23 @@ int msmpeg4_decode_mb(MpegEncContext *s,
             int wrap;
             /* reset DC pred (set previous line to 1024) */
             wrap = 2 * s->mb_width + 2;
-            memsetw(&s->dc_val[0][(1) + (2 * s->mb_y) * wrap], 
-                    1024, 2 * s->mb_width);
-            wrap = s->mb_width + 2;
-            memsetw(&s->dc_val[1][(1) + (s->mb_y) * wrap], 
-                    1024, s->mb_width);
-            memsetw(&s->dc_val[2][(1) + (s->mb_y) * wrap], 
-                    1024, s->mb_width);
-            
-            /* reset AC pred (set previous line to 0) */
-            wrap = s->mb_width * 2 + 2;
-            memsetw(s->ac_val[0][0] + (1 + (2 * s->mb_y) * wrap)*16,
-                    0, 2 * s->mb_width*16);
-            wrap = s->mb_width + 2;
-            memsetw(s->ac_val[1][0] + (1 + (s->mb_y) * wrap)*16,
-                    0, s->mb_width*16);
-            memsetw(s->ac_val[2][0] + (1 + (s->mb_y) * wrap)*16,
-                    0, s->mb_width*16);
+	    msmpeg4_memsetw(&s->dc_val[0][(1) + (2 * s->mb_y) * wrap],
+			    1024, 2 * s->mb_width);
+	    wrap = s->mb_width + 2;
+	    msmpeg4_memsetw(&s->dc_val[1][(1) + (s->mb_y) * wrap],
+			    1024, s->mb_width);
+	    msmpeg4_memsetw(&s->dc_val[2][(1) + (s->mb_y) * wrap],
+			    1024, s->mb_width);
+
+	    /* reset AC pred (set previous line to 0) */
+	    wrap = s->mb_width * 2 + 2;
+	    msmpeg4_memsetw(s->ac_val[0][0] + (1 + (2 * s->mb_y) * wrap)*16,
+			    0, 2 * s->mb_width*16);
+	    wrap = s->mb_width + 2;
+	    msmpeg4_memsetw(s->ac_val[1][0] + (1 + (s->mb_y) * wrap)*16,
+			    0, s->mb_width*16);
+	    msmpeg4_memsetw(s->ac_val[2][0] + (1 + (s->mb_y) * wrap)*16,
+			    0, s->mb_width*16);
 
             s->first_slice_line = 1;
         } else {
@@ -1070,7 +1207,7 @@ int msmpeg4_decode_mb(MpegEncContext *s,
         }
     }
 
-    if(s->msmpeg4_version==2) return msmpeg4v2_decode_mb(s, block); //FIXME merge if possible
+    if(s->msmpeg4_version<=2) return msmpeg4v12_decode_mb(s, block); //FIXME export function & call from outside perhaps
     
     if (s->pict_type == P_TYPE) {
         set_stat(ST_INTER_MB);
@@ -1133,10 +1270,11 @@ int msmpeg4_decode_mb(MpegEncContext *s,
     for (i = 0; i < 6; i++) {
         if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1) < 0)
 	{
-	    fprintf(stderr,"\nIgnoring error while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
-	    // return -1;
+	    fprintf(stderr,"\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
+	    return -1;
 	}
     }
+
     return 0;
 }
 
@@ -1156,14 +1294,24 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
 	/* DC coef */
         set_stat(ST_DC);
         level = msmpeg4_decode_dc(s, n, &dc_pred_dir);
-        if (level < 0)
+        if (level < 0){
+            fprintf(stderr, "dc overflow-\n");
             return -1;
-        block[0] = level;
+        }
         if (n < 4) {
             rl = &rl_table[s->rl_table_index];
+            if(level > 256*s->y_dc_scale){
+                fprintf(stderr, "dc overflow+\n");
+                return -1;
+            }
         } else {
             rl = &rl_table[3 + s->rl_chroma_table_index];
+            if(level > 256*s->c_dc_scale){
+                fprintf(stderr, "dc overflow+\n");
+                return -1;
+            }
         }
+        block[0] = level;
 
         run_diff = 0;
 	i = 1;
@@ -1204,16 +1352,42 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
             return -1;
         if (code == rl->n) {
             /* escape */
-            if (get_bits1(&s->gb) == 0) {
-                if (get_bits1(&s->gb) == 0) {
+            if (s->msmpeg4_version==1 || get_bits1(&s->gb) == 0) {
+                if (s->msmpeg4_version==1 || get_bits1(&s->gb) == 0) {
                     /* third escape */
                     last = get_bits1(&s->gb);
                     run = get_bits(&s->gb, 6);
                     level = get_bits(&s->gb, 8);
                     level = (level << 24) >> 24; /* sign extend */
+#if 0 // waste of time / this will detect very few errors
+                    {
+                        const int abs_level= ABS(level);
+                        const int run1= run - rl->max_run[last][abs_level] - run_diff;
+                        if(abs_level<=MAX_LEVEL && run<=MAX_RUN){
+                            if(abs_level <= rl->max_level[last][run]){
+                                fprintf(stderr, "illegal 3. esc, vlc encoding possible\n");
+                                return DECODING_AC_LOST;
+                            }
+                            if(abs_level <= rl->max_level[last][run]*2){
+                                fprintf(stderr, "illegal 3. esc, esc 1 encoding possible\n");
+                                return DECODING_AC_LOST;
+                            }
+                            if(abs_level <= rl->max_level[last][run1] && 0){
+                                fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n");
+                                return DECODING_AC_LOST;
+                            }
+                        }
+                    }
+#endif
 		    //level = level * qmul + (level>0) * qadd - (level<=0) * qadd ;
 		    if (level>0) level= level * qmul + qadd;
-                    else        level= level * qmul - qadd;
+                    else         level= level * qmul - qadd;
+#if 0 // waste of time too :(
+                    if(level>2048 || level<-2048){
+                        fprintf(stderr, "|level| overflow in 3. esc\n");
+                        return DECODING_AC_LOST;
+                    }
+#endif
                 } else {
                     /* second escape */
                     code = get_vlc(&s->gb, &rl->vlc);
@@ -1250,6 +1424,7 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         i += run;
         if (i >= 64)
             return -1;
+
 	j = scan_table[i];
         block[j] = level;
         i++;
@@ -1271,9 +1446,8 @@ static int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
 static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
 {
     int level, pred;
-    INT16 *dc_val;
 
-    if(s->msmpeg4_version==2){
+    if(s->msmpeg4_version<=2){
         if (n < 4) {
             level = get_vlc(&s->gb, &v2_dc_lum_vlc);
         } else {
@@ -1288,8 +1462,10 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         } else {
             level = get_vlc(&s->gb, &dc_chroma_vlc[s->dc_table_index]);
         }
-        if (level < 0)
+        if (level < 0){
+            fprintf(stderr, "illegal dc vlc\n");
             return -1;
+        }
 
         if (level == DC_MAX) {
             level = get_bits(&s->gb, 8);
@@ -1301,14 +1477,24 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         }
     }
 
-    pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
-    level += pred;
+    if(s->msmpeg4_version==1){
+        INT32 *dc_val;
+        pred = msmpeg4v1_pred_dc(s, n, &dc_val);
+        level += pred;
+        
+        /* update predictor */
+        *dc_val= level;
+    }else{
+        INT16 *dc_val;
+        pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
+        level += pred;
 
-    /* update predictor */
-    if (n < 4) {
-        *dc_val = level * s->y_dc_scale;
-    } else {
-        *dc_val = level * s->c_dc_scale;
+        /* update predictor */
+        if (n < 4) {
+            *dc_val = level * s->y_dc_scale;
+        } else {
+            *dc_val = level * s->c_dc_scale;
+        }
     }
 
     return level;
diff --git a/src/libffmpeg/libavcodec/msmpeg4data.h b/src/libffmpeg/libavcodec/msmpeg4data.h
index 9dcb8276f..66e0a3d89 100644
--- a/src/libffmpeg/libavcodec/msmpeg4data.h
+++ b/src/libffmpeg/libavcodec/msmpeg4data.h
@@ -3,7 +3,7 @@
  */
 
 /* intra picture macro block coded block pattern */
-const UINT16 table_mb_intra[64][2] = {
+static const UINT16 table_mb_intra[64][2] = {
 { 0x1, 1 },{ 0x17, 6 },{ 0x9, 5 },{ 0x5, 5 },
 { 0x6, 5 },{ 0x47, 9 },{ 0x20, 7 },{ 0x10, 7 },
 { 0x2, 5 },{ 0x7c, 9 },{ 0x3a, 7 },{ 0x1d, 7 },
@@ -23,7 +23,7 @@ const UINT16 table_mb_intra[64][2] = {
 };
 
 /* non intra picture macro block coded block pattern + mb type */
-const UINT32 table_mb_non_intra[128][2] = {
+static const UINT32 table_mb_non_intra[128][2] = {
 { 0x40, 7 },{ 0x13c9, 13 },{ 0x9fd, 12 },{ 0x1fc, 15 },
 { 0x9fc, 12 },{ 0xa83, 18 },{ 0x12d34, 17 },{ 0x83bc, 16 },
 { 0x83a, 12 },{ 0x7f8, 17 },{ 0x3fd, 16 },{ 0x3ff, 16 },
@@ -128,7 +128,7 @@ static const UINT32 table0_dc_chroma[120][2] = {
 
 /* dc table 1 */
 
-const UINT32 table1_dc_lum[120][2] = {
+static const UINT32 table1_dc_lum[120][2] = {
 { 0x2, 2 },{ 0x3, 2 },{ 0x3, 3 },{ 0x2, 4 },
 { 0x5, 4 },{ 0x1, 5 },{ 0x3, 5 },{ 0x8, 5 },
 { 0x0, 6 },{ 0x5, 6 },{ 0xd, 6 },{ 0xf, 6 },
@@ -161,7 +161,7 @@ const UINT32 table1_dc_lum[120][2] = {
 { 0x1e6964, 26 },{ 0x1e6965, 26 },{ 0x1e6966, 26 },{ 0x1e6967, 26 },
 };
 
-const UINT32 table1_dc_chroma[120][2] = {
+static const UINT32 table1_dc_chroma[120][2] = {
 { 0x0, 2 },{ 0x1, 2 },{ 0x4, 3 },{ 0x7, 3 },
 { 0xb, 4 },{ 0xd, 4 },{ 0x15, 5 },{ 0x28, 6 },
 { 0x30, 6 },{ 0x32, 6 },{ 0x52, 7 },{ 0x62, 7 },
@@ -233,7 +233,7 @@ static const UINT16 table0_vlc[133][2] = {
 { 0x16, 7 },
 };
 
-const INT8 table0_level[132] = {
+static const INT8 table0_level[132] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14, 15, 16,
   1,  2,  3,  4,  5,  6,  7,  8,
@@ -253,7 +253,7 @@ const INT8 table0_level[132] = {
   1,  1,  1,  1,
 };
 
-const INT8 table0_run[132] = {
+static const INT8 table0_run[132] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   1,  1,  1,  1,  1,  1,  1,  1,
@@ -275,7 +275,7 @@ const INT8 table0_run[132] = {
 
 /* vlc table 1, for intra chroma and P macro blocks */
 
-const UINT16 table1_vlc[149][2] = {
+static const UINT16 table1_vlc[149][2] = {
 { 0x4, 3 },{ 0x14, 5 },{ 0x17, 7 },{ 0x7f, 8 },
 { 0x154, 9 },{ 0x1f2, 10 },{ 0xbf, 11 },{ 0x65, 12 },
 { 0xaaa, 12 },{ 0x630, 13 },{ 0x1597, 13 },{ 0x3b7, 14 },
@@ -316,7 +316,7 @@ const UINT16 table1_vlc[149][2] = {
 { 0xd, 9 },
 };
 
-const INT8 table1_level[148] = {
+static const INT8 table1_level[148] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14,  1,  2,
   3,  4,  5,  6,  7,  8,  9,  1,
@@ -338,7 +338,7 @@ const INT8 table1_level[148] = {
   1,  1,  1,  1,
 };
 
-const INT8 table1_run[148] = {
+static const INT8 table1_run[148] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  1,  1,
   1,  1,  1,  1,  1,  1,  1,  2,
@@ -362,7 +362,7 @@ const INT8 table1_run[148] = {
 
 /* third vlc table */
 
-const UINT16 table2_vlc[186][2] = {
+static const UINT16 table2_vlc[186][2] = {
 { 0x1, 2 },{ 0x5, 3 },{ 0xd, 4 },{ 0x12, 5 },
 { 0xe, 6 },{ 0x15, 7 },{ 0x13, 8 },{ 0x3f, 8 },
 { 0x4b, 9 },{ 0x11f, 9 },{ 0xb8, 10 },{ 0x3e3, 10 },
@@ -412,7 +412,7 @@ const UINT16 table2_vlc[186][2] = {
 { 0x23dc, 14 },{ 0x4a, 9 },
 };
 
-const INT8 table2_level[185] = {
+static const INT8 table2_level[185] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14, 15, 16,
  17, 18, 19,  1,  2,  3,  4,  5,
@@ -439,7 +439,7 @@ const INT8 table2_level[185] = {
   1,
 };
 
-const INT8 table2_run[185] = {
+static const INT8 table2_run[185] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  1,  1,  1,  1,  1,
@@ -467,7 +467,7 @@ const INT8 table2_run[185] = {
 };
 
 /* second non intra vlc table */
-const UINT16 table4_vlc[169][2] = {
+static const UINT16 table4_vlc[169][2] = {
 { 0x0, 3 },{ 0x3, 4 },{ 0xb, 5 },{ 0x14, 6 },
 { 0x3f, 6 },{ 0x5d, 7 },{ 0xa2, 8 },{ 0xac, 9 },
 { 0x16e, 9 },{ 0x20a, 10 },{ 0x2e2, 10 },{ 0x432, 11 },
@@ -513,7 +513,7 @@ const UINT16 table4_vlc[169][2] = {
 { 0x169, 9 },
 };
 
-const INT8 table4_level[168] = {
+static const INT8 table4_level[168] = {
   1,  2,  3,  4,  5,  6,  7,  8,
   9, 10, 11, 12, 13, 14, 15, 16,
  17, 18, 19, 20, 21, 22, 23,  1,
@@ -537,7 +537,7 @@ const INT8 table4_level[168] = {
   1,  1,  1,  1,  1,  1,  1,  1,
 };
 
-const INT8 table4_run[168] = {
+static const INT8 table4_run[168] = {
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  1,
@@ -575,6 +575,11 @@ extern const UINT8 DCtab_chrom[13][2];
 extern const UINT8 cbpy_tab[16][2];
 extern const UINT8 mvtab[33][2];
 
+extern const UINT8 intra_MCBPC_code[8];
+extern const UINT8 intra_MCBPC_bits[8];
+
+extern const UINT8 inter_MCBPC_code[]; /* bound left to the definition: msmpeg4.c reads 25 codes via init_vlc, not 8 */
+extern const UINT8 inter_MCBPC_bits[]; /* NOTE(review): confirm the array size in the defining file */
 
 #define NB_RL_TABLES  6
 
@@ -627,7 +632,7 @@ static RLTable rl_table[NB_RL_TABLES] = {
 
 /* motion vector table 0 */
 
-const UINT16 table0_mv_code[1100] = {
+static const UINT16 table0_mv_code[1100] = {
  0x0001, 0x0003, 0x0005, 0x0007, 0x0003, 0x0008, 0x000c, 0x0001,
  0x0002, 0x001b, 0x0006, 0x000b, 0x0015, 0x0002, 0x000e, 0x000f,
  0x0014, 0x0020, 0x0022, 0x0025, 0x0027, 0x0029, 0x002d, 0x004b,
@@ -768,7 +773,7 @@ const UINT16 table0_mv_code[1100] = {
  0x5f0d, 0x5f0e, 0x5f0f, 0x0000,
 };
 
-const UINT8 table0_mv_bits[1100] = {
+static const UINT8 table0_mv_bits[1100] = {
   1,  4,  4,  4,  5,  5,  5,  6,
   6,  6,  7,  7,  7,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,
@@ -909,7 +914,7 @@ const UINT8 table0_mv_bits[1100] = {
  17, 17, 17,  8,
 };
 
-const UINT8 table0_mvx[1099] = {
+static const UINT8 table0_mvx[1099] = {
  32, 32, 31, 32, 33, 31, 33, 31,
  33, 32, 34, 32, 30, 32, 31, 34,
  35, 32, 34, 33, 29, 33, 30, 30,
@@ -1050,7 +1055,7 @@ const UINT8 table0_mvx[1099] = {
  61, 19, 19,
 };
 
-const UINT8 table0_mvy[1099] = {
+static const UINT8 table0_mvy[1099] = {
  32, 31, 32, 33, 32, 31, 31, 33,
  33, 34, 32, 30, 32, 35, 34, 31,
  32, 29, 33, 30, 32, 34, 33, 31,
@@ -1192,7 +1197,7 @@ const UINT8 table0_mvy[1099] = {
 };
 
 /* motion vector table 1 */
-const UINT16 table1_mv_code[1100] = {
+static const UINT16 table1_mv_code[1100] = {
  0x0000, 0x0007, 0x0009, 0x000f, 0x000a, 0x0011, 0x001a, 0x001c,
  0x0011, 0x0031, 0x0025, 0x002d, 0x002f, 0x006f, 0x0075, 0x0041,
  0x004c, 0x004e, 0x005c, 0x0060, 0x0062, 0x0066, 0x0068, 0x0069,
@@ -1333,7 +1338,7 @@ const UINT16 table1_mv_code[1100] = {
  0x2473, 0x26a2, 0x26a3, 0x000b,
 };
 
-const UINT8 table1_mv_bits[1100] = {
+static const UINT8 table1_mv_bits[1100] = {
   2,  4,  4,  4,  5,  5,  5,  5,
   6,  6,  7,  7,  7,  7,  7,  8,
   8,  8,  8,  8,  8,  8,  8,  8,
@@ -1474,7 +1479,7 @@ const UINT8 table1_mv_bits[1100] = {
  15, 15, 15,  4,
 };
 
-const UINT8 table1_mvx[1099] = {
+static const UINT8 table1_mvx[1099] = {
  32, 31, 32, 31, 33, 32, 33, 33,
  31, 34, 30, 32, 32, 34, 35, 32,
  34, 33, 29, 30, 30, 32, 31, 31,
@@ -1615,7 +1620,7 @@ const UINT8 table1_mvx[1099] = {
   0, 12, 27,
 };
 
-const UINT8 table1_mvy[1099] = {
+static const UINT8 table1_mvy[1099] = {
  32, 32, 31, 31, 32, 33, 31, 33,
  33, 32, 32, 30, 34, 31, 32, 29,
  33, 30, 32, 33, 31, 35, 34, 30,
diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c
new file mode 100644
index 000000000..8395eefad
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ratecontrol.c
@@ -0,0 +1,402 @@
+/*
+ * Rate control for video encoders
+ *
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "avcodec.h"
+#include "dsputil.h"
+#include "mpegvideo.h"
+
+#define STATS_FILE "lavc_stats.txt"
+
+static int init_pass2(MpegEncContext *s);
+
+void ff_write_pass1_stats(MpegEncContext *s){
+    /* one text line per picture; ff_rate_control_init() reads these lines back in pass 2 */
+    FILE *stats= s->rc_context.stats_file;
+    fprintf(stats, "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d\n",
+            s->picture_number, s->input_picture_number - s->max_b_frames, s->pict_type,
+            s->qscale, s->i_tex_bits, s->p_tex_bits, s->mv_bits, s->misc_bits, s->f_code, s->b_code);
+}
+
+/**
+ * Opens the stats file for pass 1/2, loads it into rcc->entry for pass 2, seeds the 1-pass state.
+ * @return 0 on success, -1 on any failure (open, allocation, damaged stats, init_pass2)
+ */
+int ff_rate_control_init(MpegEncContext *s)
+{
+    RateControlContext *rcc= &s->rc_context;
+    emms_c();
+
+    if(s->flags&CODEC_FLAG_PASS1){
+        rcc->stats_file= fopen(STATS_FILE, "w");
+        if(!rcc->stats_file){
+            fprintf(stderr, "failed to open " STATS_FILE "\n");
+            return -1;
+        }
+    } else if(s->flags&CODEC_FLAG_PASS2){
+        int size;
+        int i;
+
+        rcc->stats_file= fopen(STATS_FILE, "r");
+        if(!rcc->stats_file){
+            fprintf(stderr, "failed to open " STATS_FILE "\n");
+            return -1;
+        }
+        /* find number of pics without reading the file twice :) */
+        fseek(rcc->stats_file, 0, SEEK_END);
+        size= ftell(rcc->stats_file);
+        fseek(rcc->stats_file, 0, SEEK_SET);
+        size/= 64; // we need at least 64 byte to store a line ...
+        if(size<=0){ fprintf(stderr, STATS_FILE " is damaged\n"); return -1; } // ftell() failed or no full line
+        rcc->entry = (RateControlEntry*)av_mallocz(size*sizeof(RateControlEntry));
+        if(!rcc->entry) return -1;
+        for(i=0; !feof(rcc->stats_file); i++){
+            RateControlEntry *rce;
+            int picture_number;
+            int e;
+
+            e= fscanf(rcc->stats_file, "in:%d ", &picture_number);
+            if(e==1 && i<size && picture_number>=0 && picture_number<size){ // the index comes from the file
+                rce= &rcc->entry[picture_number];
+                e+=fscanf(rcc->stats_file, "out:%*d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%*d bcode:%*d\n",
+                       &rce->pict_type, &rce->qscale, &rce->i_tex_bits, &rce->p_tex_bits, &rce->mv_bits, &rce->misc_bits);
+            }
+            if(e!=7){
+                fprintf(stderr, STATS_FILE " is damaged\n");
+                return -1;
+            }
+        }
+        rcc->num_entries= i;
+        if(init_pass2(s) < 0) return -1;
+    }
+    /* no 2pass stuff, just normal 1-pass */
+    //initial values, they dont really matter as they will be totally different within a few frames
+    s->i_pred.coeff= s->p_pred.coeff= 7.0;
+    s->i_pred.count= s->p_pred.count= 1.0;
+    s->i_pred.decay= s->p_pred.decay= 0.4;
+
+    // use more bits at the beginning, otherwise high motion at the begin will look like shit
+    s->qsum=100 * s->qmin;
+    s->qcount=100;
+    s->short_term_qsum=0.001;
+    s->short_term_qcount=0.001;
+    return 0;
+}
+
+/* Closes the stats file (if one is open) and frees the pass-2 entry table. */
+void ff_rate_control_uninit(MpegEncContext *s)
+{
+    emms_c();
+    if(s->rc_context.stats_file != NULL){
+        fclose(s->rc_context.stats_file);
+        s->rc_context.stats_file= NULL;
+    }
+    av_freep(&s->rc_context.entry);
+}
+
+//----------------------------------
+// 1 Pass Code
+
+static double predict(Predictor *p, double q, double var)
+{   /* size estimate for a frame of variance var coded at quantizer q */
+    return (p->coeff * var) / (p->count * q);
+}
+
+static void update_predictor(Predictor *p, double q, double var, double size)
+{   /* feed one observed frame (quantizer, variance, coded size) into the decaying average */
+    if(!(var<1000)){ // frames with var below 1000 leave the predictor untouched
+        const double sample= size*q / (var + 1);
+        p->count*= p->decay;
+        p->coeff*= p->decay;
+        p->count+= 1;
+        p->coeff+= sample;
+    }
+}
+
+/**
+ * Chooses the quantizer for the current picture in 1-pass mode.
+ * The value is clipped to [qmin,qmax] (both scaled for B frames); for non-B
+ * frames it is then kept within max_qdiff of the last non-B quantizer.
+ * @return qscale for the current picture
+ */
+int ff_rate_estimate_qscale(MpegEncContext *s)
+{
+    int qmin= s->qmin;
+    int qmax= s->qmax;
+    int rate_q=5;
+    float q;
+    int qscale;
+    float br_compensation;
+    double diff;
+    double short_term_q;
+    double long_term_q;
+    double fps;
+    int picture_number= s->input_picture_number - s->max_b_frames;
+    int64_t wanted_bits;
+    emms_c();
+
+    fps= (double)s->frame_rate / FRAME_RATE_BASE;
+    /* picture_number is negative while input_picture_number < max_b_frames; converting a
+       negative double to an unsigned type is undefined, so convert to the signed type */
+    wanted_bits= (int64_t)(s->bit_rate*(double)picture_number/fps);
+
+    if(s->pict_type==B_TYPE){
+        qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5);
+        qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5);
+    }
+    if(qmin<1) qmin=1;
+    if(qmax>31) qmax=31;
+    if(qmax<=qmin) qmax= qmin;
+
+        /* update predictors */
+    if(picture_number>2){
+        if(s->pict_type!=B_TYPE && s->last_non_b_pict_type == P_TYPE){
+            update_predictor(&s->p_pred, s->last_non_b_qscale, s->last_non_b_mc_mb_var, s->pb_frame_bits);
+        }
+    }
+
+    if(s->pict_type == I_TYPE){
+        short_term_q= s->short_term_qsum/s->short_term_qcount;
+        long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0
+
+        q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q);
+    }else if(s->pict_type==B_TYPE){
+        q= (int)(s->last_non_b_qscale*s->b_quant_factor+s->b_quant_offset + 0.5);
+    }else{ //P Frame
+        int i;
+        int err, best_err=1000000000; // named err so the outer "diff" is not shadowed
+        for(i=1; i<=31; i++){
+            err= predict(&s->p_pred, i, s->mc_mb_var_sum) - (double)s->bit_rate/fps;
+            if(err<0) err= -err;
+            if(err<best_err){
+                best_err= err;
+                rate_q= i;
+            }
+        }
+        s->short_term_qsum*=s->qblur;
+        s->short_term_qcount*=s->qblur;
+
+        s->short_term_qsum+= rate_q;
+        s->short_term_qcount++;
+        short_term_q= s->short_term_qsum/s->short_term_qcount;
+        long_term_q= s->qsum/s->qcount*(s->total_bits+1)/(wanted_bits+1); //+1 to avoid nan & 0
+
+        q= 1/((1/long_term_q - 1/short_term_q)*s->qcompress + 1/short_term_q);
+    }
+
+    /* raise q when more bits than wanted were spent so far, lower it otherwise */
+    diff= s->total_bits - wanted_bits;
+    br_compensation= (s->bit_rate_tolerance - diff)/s->bit_rate_tolerance;
+    if(br_compensation<=0.0) br_compensation=0.001;
+    q/=br_compensation;
+    qscale= (int)(q + 0.5);
+    if     (qscale<qmin) qscale=qmin;
+    else if(qscale>qmax) qscale=qmax;
+
+    if(s->pict_type!=B_TYPE){
+        s->qsum+= qscale;
+        s->qcount++;
+        if     (qscale<s->last_non_b_qscale-s->max_qdiff) qscale=s->last_non_b_qscale-s->max_qdiff;
+        else if(qscale>s->last_non_b_qscale+s->max_qdiff) qscale=s->last_non_b_qscale+s->max_qdiff;
+    }
+    return qscale;
+}
+
+//----------------------------------------------
+// 2-Pass code
+
+/**
+ * Pass-2 setup: decides the final picture types, then bisects rate_factor so that
+ * the per-picture quantizers (new_qscale) keep the expected total size within
+ * the bit budget; each entry also gets the running expected_bits total before it.
+ * @return 0 on success, -1 if the stats are invalid or the bitrate is too low
+ */
+static int init_pass2(MpegEncContext *s)
+{
+    RateControlContext *rcc= &s->rc_context;
+    int i;
+    double fps= (double)s->frame_rate / FRAME_RATE_BASE;
+    double complexity[5]={0,0,0,0,0};   // approximate bits at quant=1
+    double avg_quantizer[5]={1,1,1,1,1}; // slots without a picture type are still read below
+    uint64_t const_bits[5]={0,0,0,0,0}; // quantizer independent bits
+    uint64_t available_bits[5];
+    uint64_t all_const_bits;
+    uint64_t all_available_bits= (uint64_t)(s->bit_rate*(double)rcc->num_entries/fps);
+    int num_frames[5]={0,0,0,0,0};
+    double rate_factor=0;
+    double step;
+    int last_i_frame=-10000000;
+
+    /* find complexity & const_bits & decide the pict_types */
+    for(i=0; i<rcc->num_entries; i++){
+        RateControlEntry *rce= &rcc->entry[i];
+        if(s->b_frame_strategy==0 || s->max_b_frames==0){
+            rce->new_pict_type= rce->pict_type;
+        }else{
+            int j;
+            int next_non_b_type=P_TYPE;
+
+            switch(rce->pict_type){
+            case I_TYPE:
+                if(i-last_i_frame>s->gop_size/2){ //FIXME this is not optimal
+                    rce->new_pict_type= I_TYPE;
+                    last_i_frame= i;
+                }else{
+                    rce->new_pict_type= P_TYPE; // will be caught by the scene detection anyway
+                }
+                break;
+            case P_TYPE:
+                rce->new_pict_type= P_TYPE;
+                break;
+            case B_TYPE:
+                for(j=i+1; j<i+s->max_b_frames+2 && j<rcc->num_entries; j++){
+                    if(rcc->entry[j].pict_type != B_TYPE){
+                        next_non_b_type= rcc->entry[j].pict_type;
+                        break;
+                    }
+                }
+                if(next_non_b_type==I_TYPE)
+                    rce->new_pict_type= P_TYPE;
+                else
+                    rce->new_pict_type= B_TYPE;
+                break;
+            }
+        }
+        if(rce->new_pict_type<0 || rce->new_pict_type>=5){ // the type indexes the 5-slot arrays below
+            fprintf(stderr, STATS_FILE " is damaged\n");
+            return -1;
+        }
+        complexity[rce->new_pict_type]+= (rce->i_tex_bits+ rce->p_tex_bits)*(double)rce->qscale;
+        const_bits[rce->new_pict_type]+= rce->mv_bits + rce->misc_bits;
+        num_frames[rce->new_pict_type]++;
+    }
+    all_const_bits= const_bits[I_TYPE] + const_bits[P_TYPE] + const_bits[B_TYPE];
+
+    if(all_available_bits <= all_const_bits){ // "<=": the difference is used as a divisor below
+        fprintf(stderr, "requested bitrate is to low\n");
+        return -1;
+    }
+
+    avg_quantizer[P_TYPE]=
+    avg_quantizer[I_TYPE]=   (complexity[I_TYPE]+complexity[P_TYPE] + complexity[B_TYPE]/s->b_quant_factor)
+                           / (all_available_bits - all_const_bits);
+    avg_quantizer[B_TYPE]= avg_quantizer[P_TYPE]*s->b_quant_factor + s->b_quant_offset;
+
+    for(i=0; i<5; i++){
+        available_bits[i]= const_bits[i] + complexity[i]/avg_quantizer[i];
+    }
+
+    for(step=256*256; step>0.0000001; step*=0.5){
+        uint64_t expected_bits=0;
+        rate_factor+= step;
+        /* find qscale */
+        for(i=0; i<rcc->num_entries; i++){
+            RateControlEntry *rce= &rcc->entry[i];
+            double short_term_q, q, bits_left;
+            const int pict_type= rce->new_pict_type;
+            int qmin= s->qmin;
+            int qmax= s->qmax;
+
+            if(pict_type==B_TYPE){
+                qmin= (int)(qmin*s->b_quant_factor+s->b_quant_offset + 0.5);
+                qmax= (int)(qmax*s->b_quant_factor+s->b_quant_offset + 0.5);
+            }
+            if(qmin<1) qmin=1;
+            if(qmax>31) qmax=31;
+            if(qmax<=qmin) qmax= qmin;
+
+            switch(s->rc_strategy){
+            case 0:
+                bits_left= available_bits[pict_type]/num_frames[pict_type]*rate_factor - rce->misc_bits - rce->mv_bits;
+                if(bits_left<1.0) bits_left=1.0;
+                short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits)/bits_left;
+                break;
+            case 1:
+                bits_left= (available_bits[pict_type] - const_bits[pict_type])/num_frames[pict_type]*rate_factor;
+                if(bits_left<1.0) bits_left=1.0;
+                short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits)/bits_left;
+                break;
+            case 2:
+                bits_left= available_bits[pict_type]/num_frames[pict_type]*rate_factor;
+                if(bits_left<1.0) bits_left=1.0;
+                short_term_q= rce->qscale*(rce->i_tex_bits + rce->p_tex_bits + rce->misc_bits + rce->mv_bits)/bits_left;
+                break;
+            default:
+                fprintf(stderr, "unknown strategy\n");
+                short_term_q=3; //gcc warning fix
+            }
+
+            if(short_term_q>31.0) short_term_q=31.0;
+            else if (short_term_q<1.0) short_term_q=1.0;
+            q= 1/((1/avg_quantizer[pict_type] - 1/short_term_q)*s->qcompress + 1/short_term_q);
+            if     (q<qmin) q=qmin;
+            else if(q>qmax) q=qmax;
+            rce->new_qscale= q;
+        }
+
+        /* smooth curve */
+        /* find expected bits */
+        for(i=0; i<rcc->num_entries; i++){
+            RateControlEntry *rce= &rcc->entry[i];
+            double factor= rce->qscale / rce->new_qscale;
+            rce->expected_bits= expected_bits;
+            expected_bits += (int)(rce->misc_bits + rce->mv_bits + (rce->i_tex_bits + rce->p_tex_bits)*factor + 0.5);
+        }
+
+        if(expected_bits > all_available_bits) rate_factor-= step;
+    }
+
+    return 0;
+}
+
+/**
+ * Pass-2 quantizer selection: starts from the new_qscale stored for this picture
+ * and corrects it by how far total_bits is from the entry's expected_bits,
+ * then clips the result to [qmin,qmax] (both scaled for B frames).
+ */
+int ff_rate_estimate_qscale_pass2(MpegEncContext *s)
+{
+    RateControlEntry *entry= &s->rc_context.entry[s->picture_number];
+    int64_t target_bits= entry->expected_bits;
+    int lo= s->qmin;
+    int hi= s->qmax;
+    float q, compensation;
+    double overshoot;
+    int result;
+    emms_c();
+
+    if(s->pict_type==B_TYPE){
+        lo= (int)(lo*s->b_quant_factor+s->b_quant_offset + 0.5);
+        hi= (int)(hi*s->b_quant_factor+s->b_quant_offset + 0.5);
+    }
+    if(lo<1) lo=1;
+    if(hi>31) hi=31;
+    if(hi<=lo) hi= lo;
+
+    overshoot= s->total_bits - target_bits;
+    compensation= (s->bit_rate_tolerance - overshoot)/s->bit_rate_tolerance;
+    if(compensation<=0.0) compensation=0.001;
+
+    q= entry->new_qscale;
+    q/= compensation;
+
+    result= (int)(q + 0.5);
+    if(result<lo) return lo;
+    if(result>hi) return hi;
+    return result;
+}
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index f4ebc9016..261c889de 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -1,27 +1,23 @@
 /*
  * RV10 codec
- * Copyright (c) 2000,2001 Gerard Lantau.
+ * Copyright (c) 2000,2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "common.h"
-#include "dsputil.h"
 #include "avcodec.h"
+#include "dsputil.h"
 #include "mpegvideo.h"
 
 //#define DEBUG
@@ -337,9 +333,9 @@ static int rv10_decode_picture_header(MpegEncContext *s)
 static int rv10_decode_init(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
-    int i;
     static int done;
 
+//    s->avctx= avctx;
     s->out_format = FMT_H263;
 
     s->width = avctx->width;
@@ -351,11 +347,6 @@ static int rv10_decode_init(AVCodecContext *avctx)
     if (MPV_common_init(s) < 0)
         return -1;
 
-    /* XXX: suppress this matrix init, only needed because using mpeg1
-       dequantize in mmx case */
-    for(i=0;i<64;i++)
-        s->non_intra_matrix[i] = default_non_intra_matrix[i];
-
     h263_decode_init_vlc(s);
 
     /* init rv vlc */
@@ -439,9 +430,27 @@ static int rv10_decode_frame(AVCodecContext *avctx,
     s->rv10_first_dc_coded[0] = 0;
     s->rv10_first_dc_coded[1] = 0;
     s->rv10_first_dc_coded[2] = 0;
-    
+
+    s->block_wrap[0]=
+    s->block_wrap[1]=
+    s->block_wrap[2]=
+    s->block_wrap[3]= s->mb_width*2 + 2;
+    s->block_wrap[4]=
+    s->block_wrap[5]= s->mb_width + 2;
+    s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
+    s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1)     + s->mb_x*2;
+    s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1 + s->mb_x*2;
+    s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2)     + s->mb_x*2;
+    s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
+    s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2) + s->mb_x;
     /* decode each macroblock */
     for(i=0;i<mb_count;i++) {
+        s->block_index[0]+=2;
+        s->block_index[1]+=2;
+        s->block_index[2]+=2;
+        s->block_index[3]+=2;
+        s->block_index[4]++;
+        s->block_index[5]++;
 #ifdef DEBUG
         printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 #endif
@@ -459,6 +468,12 @@ static int rv10_decode_frame(AVCodecContext *avctx,
         if (++s->mb_x == s->mb_width) {
             s->mb_x = 0;
             s->mb_y++;
+            s->block_index[0]= s->block_wrap[0]*(s->mb_y*2 + 1) - 1;
+            s->block_index[1]= s->block_wrap[0]*(s->mb_y*2 + 1);
+            s->block_index[2]= s->block_wrap[0]*(s->mb_y*2 + 2) - 1;
+            s->block_index[3]= s->block_wrap[0]*(s->mb_y*2 + 2);
+            s->block_index[4]= s->block_wrap[4]*(s->mb_y + 1)                    + s->block_wrap[0]*(s->mb_height*2 + 2);
+            s->block_index[5]= s->block_wrap[4]*(s->mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2);
         }
     }
 
diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c
index a9653b187..0665f667a 100644
--- a/src/libffmpeg/libavcodec/simple_idct.c
+++ b/src/libffmpeg/libavcodec/simple_idct.c
@@ -1,29 +1,29 @@
 /*
-    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
 /*
-  based upon some outcommented c code from mpeg2dec (idct_mmx.c written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
-*/
-
-#include <inttypes.h>
-
+  based upon some outcommented c code from mpeg2dec (idct_mmx.c
+  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) 
+ */
+#include "avcodec.h"
+#include "dsputil.h"
 #include "simple_idct.h"
-#include "../config.h"
 
 #if 0
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
@@ -39,7 +39,7 @@
 #define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W4  16384  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 #define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
@@ -47,65 +47,33 @@
 #define COL_SHIFT 20 // 6
 #endif
 
-/* 8x8 Matrix used to do a trivial (slow) 8 point IDCT */
-static int coeff[64]={
-	W4, W4, W4, W4, W4, W4, W4, W4,
-	W1, W3, W5, W7,-W7,-W5,-W3,-W1,
-	W2, W6,-W6,-W2,-W2,-W6, W6, W2,
-	W3,-W7,-W1,-W5, W5, W1, W7,-W3,
-	W4,-W4,-W4, W4, W4,-W4,-W4, W4,
-	W5,-W1, W7, W3,-W3,-W7, W1,-W5,
-	W6,-W2, W2,-W6,-W6, W2,-W2, W6,
-	W7,-W5, W3,-W1, W1,-W3, W5,-W7
-};
-
-static int inline idctRowCondZ (int16_t * row)
-{
-	int a0, a1, a2, a3, b0, b1, b2, b3;
+#ifdef ARCH_ALPHA
+#define FAST_64BIT
+#endif
 
-	if( !( ((uint32_t*)row)[0]|((uint32_t*)row)[1] |((uint32_t*)row)[2] |((uint32_t*)row)[3])) {
-/*		row[0] = row[1] = row[2] = row[3] = row[4] =
-			row[5] = row[6] = row[7] = 0;*/
-		return 0;
-	}
+#if defined(ARCH_POWERPC_405)
 
-	if(!( ((uint32_t*)row)[2] |((uint32_t*)row)[3] )){
-		a0 = W4*row[0] + W2*row[2] + (1<<(ROW_SHIFT-1));
-		a1 = W4*row[0] + W6*row[2] + (1<<(ROW_SHIFT-1));
-		a2 = W4*row[0] - W6*row[2] + (1<<(ROW_SHIFT-1));
-		a3 = W4*row[0] - W2*row[2] + (1<<(ROW_SHIFT-1));
-
-		b0 = W1*row[1] + W3*row[3];
-		b1 = W3*row[1] - W7*row[3];
-		b2 = W5*row[1] - W1*row[3];
-		b3 = W7*row[1] - W5*row[3];
-	}else{
-		a0 = W4*row[0] + W2*row[2] + W4*row[4] + W6*row[6] + (1<<(ROW_SHIFT-1));
-		a1 = W4*row[0] + W6*row[2] - W4*row[4] - W2*row[6] + (1<<(ROW_SHIFT-1));
-		a2 = W4*row[0] - W6*row[2] - W4*row[4] + W2*row[6] + (1<<(ROW_SHIFT-1));
-		a3 = W4*row[0] - W2*row[2] + W4*row[4] - W6*row[6] + (1<<(ROW_SHIFT-1));
-
-		b0 = W1*row[1] + W3*row[3] + W5*row[5] + W7*row[7];
-		b1 = W3*row[1] - W7*row[3] - W1*row[5] - W5*row[7];
-		b2 = W5*row[1] - W1*row[3] + W7*row[5] + W3*row[7];
-		b3 = W7*row[1] - W5*row[3] + W3*row[5] - W1*row[7];
-	}
+/* signed 16x16 -> 32 multiply add accumulate */
+#define MAC16(rt, ra, rb) \
+    asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
 
-	row[0] = (a0 + b0) >> ROW_SHIFT;
-	row[1] = (a1 + b1) >> ROW_SHIFT;
-	row[2] = (a2 + b2) >> ROW_SHIFT;
-	row[3] = (a3 + b3) >> ROW_SHIFT;
-	row[4] = (a3 - b3) >> ROW_SHIFT;
-	row[5] = (a2 - b2) >> ROW_SHIFT;
-	row[6] = (a1 - b1) >> ROW_SHIFT;
-	row[7] = (a0 - b0) >> ROW_SHIFT;
-	
-	return 1;
-}
+/* signed 16x16 -> 32 multiply */
+#define MUL16(rt, ra, rb) \
+    asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));
+
+#else
+
+/* signed 16x16 -> 32 multiply add accumulate */
+#define MAC16(rt, ra, rb) rt += (ra) * (rb)
+
+/* signed 16x16 -> 32 multiply */
+#define MUL16(rt, ra, rb) rt = (ra) * (rb)
+
+#endif
 
 #ifdef ARCH_ALPHA
 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
-static int inline idctRowCondDC(int16_t *row)
+static inline int idctRowCondDC(int16_t *row)
 {
 	int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
 	uint64_t *lrow = (uint64_t *) row;
@@ -129,10 +97,10 @@ static int inline idctRowCondDC(int16_t *row)
 		}
 	}
 
-	a0 = W4 * row[0];
-	a1 = W4 * row[0];
-	a2 = W4 * row[0];
-	a3 = W4 * row[0];
+        a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+	a1 = a0;
+	a2 = a0;
+	a3 = a0;
 
 	if (row[2]) {
 		a0 += W2 * row[2];
@@ -155,11 +123,6 @@ static int inline idctRowCondDC(int16_t *row)
 		a3 -= W6 * row[6];
 	}
 
-	a0 += 1 << (ROW_SHIFT - 1);
-	a1 += 1 << (ROW_SHIFT - 1);
-	a2 += 1 << (ROW_SHIFT - 1);
-	a3 += 1 << (ROW_SHIFT - 1);
-
 	if (row[1]) {
 		b0 = W1 * row[1];
 		b1 = W3 * row[1];
@@ -205,38 +168,86 @@ static int inline idctRowCondDC(int16_t *row)
 	return 2;
 }
 #else  /* not ARCH_ALPHA */
-static int inline idctRowCondDC (int16_t * row)
+
+static inline void idctRowCondDC (int16_t * row)
 {
 	int a0, a1, a2, a3, b0, b1, b2, b3;
+#ifdef FAST_64BIT
+        uint64_t temp;
+#else
+        uint32_t temp;
+#endif
 
-	if( !( ((uint32_t*)row)[1] |((uint32_t*)row)[2] |((uint32_t*)row)[3]| row[1])) {
-//		row[0] = row[1] = row[2] = row[3] = row[4] = row[5] = row[6] = row[7] = row[0]<<3;
-		uint16_t temp= row[0]<<3;
-		((uint32_t*)row)[0]=((uint32_t*)row)[1]=
-		((uint32_t*)row)[2]=((uint32_t*)row)[3]= temp + (temp<<16);
-		return 0;
+#ifdef FAST_64BIT
+#ifdef WORDS_BIGENDIAN
+#define ROW0_MASK 0xffff000000000000LL
+#else
+#define ROW0_MASK 0xffffLL
+#endif
+	if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | 
+              ((uint64_t *)row)[1]) == 0) {
+            temp = (row[0] << 3) & 0xffff;
+            temp += temp << 16;
+            temp += temp << 32;
+            ((uint64_t *)row)[0] = temp;
+            ((uint64_t *)row)[1] = temp;
+            return;
+	}
+#else
+	if (!(((uint32_t*)row)[1] |
+              ((uint32_t*)row)[2] |
+              ((uint32_t*)row)[3] | 
+              row[1])) {
+            temp = (row[0] << 3) & 0xffff;
+            temp += temp << 16;
+            ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
+		((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
+		return;
 	}
+#endif
 
-	if(!( ((uint32_t*)row)[2] |((uint32_t*)row)[3] )){
-		a0 = W4*row[0] + W2*row[2] + (1<<(ROW_SHIFT-1));
-		a1 = W4*row[0] + W6*row[2] + (1<<(ROW_SHIFT-1));
-		a2 = W4*row[0] - W6*row[2] + (1<<(ROW_SHIFT-1));
-		a3 = W4*row[0] - W2*row[2] + (1<<(ROW_SHIFT-1));
-
-		b0 = W1*row[1] + W3*row[3];
-		b1 = W3*row[1] - W7*row[3];
-		b2 = W5*row[1] - W1*row[3];
-		b3 = W7*row[1] - W5*row[3];
-	}else{
-		a0 = W4*row[0] + W2*row[2] + W4*row[4] + W6*row[6] + (1<<(ROW_SHIFT-1));
-		a1 = W4*row[0] + W6*row[2] - W4*row[4] - W2*row[6] + (1<<(ROW_SHIFT-1));
-		a2 = W4*row[0] - W6*row[2] - W4*row[4] + W2*row[6] + (1<<(ROW_SHIFT-1));
-		a3 = W4*row[0] - W2*row[2] + W4*row[4] - W6*row[6] + (1<<(ROW_SHIFT-1));
-
-		b0 = W1*row[1] + W3*row[3] + W5*row[5] + W7*row[7];
-		b1 = W3*row[1] - W7*row[3] - W1*row[5] - W5*row[7];
-		b2 = W5*row[1] - W1*row[3] + W7*row[5] + W3*row[7];
-		b3 = W7*row[1] - W5*row[3] + W3*row[5] - W1*row[7];
+        a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+	a1 = a0;
+	a2 = a0;
+	a3 = a0;
+
+        /* no need to optimize : gcc does it */
+        a0 += W2 * row[2];
+        a1 += W6 * row[2];
+        a2 -= W6 * row[2];
+        a3 -= W2 * row[2];
+
+        MUL16(b0, W1, row[1]);
+        MAC16(b0, W3, row[3]);
+        MUL16(b1, W3, row[1]);
+        MAC16(b1, -W7, row[3]);
+        MUL16(b2, W5, row[1]);
+        MAC16(b2, -W1, row[3]);
+        MUL16(b3, W7, row[1]);
+        MAC16(b3, -W5, row[3]);
+
+#ifdef FAST_64BIT
+        temp = ((uint64_t*)row)[1];
+#else
+        temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+#endif
+	if (temp != 0) {
+            a0 += W4*row[4] + W6*row[6];
+            a1 += - W4*row[4] - W2*row[6];
+            a2 += - W4*row[4] + W2*row[6];
+            a3 += W4*row[4] - W6*row[6];
+
+            MAC16(b0, W5, row[5]);
+            MAC16(b0, W7, row[7]);
+            
+            MAC16(b1, -W1, row[5]);
+            MAC16(b1, -W5, row[7]);
+            
+            MAC16(b2, W7, row[5]);
+            MAC16(b2, W3, row[7]);
+            
+            MAC16(b3, W3, row[5]);
+            MAC16(b3, -W1, row[7]);
 	}
 
 	row[0] = (a0 + b0) >> ROW_SHIFT;
@@ -247,202 +258,151 @@ static int inline idctRowCondDC (int16_t * row)
 	row[5] = (a2 - b2) >> ROW_SHIFT;
 	row[3] = (a3 + b3) >> ROW_SHIFT;
 	row[4] = (a3 - b3) >> ROW_SHIFT;
-	
-	return 1;
 }
 #endif /* not ARCH_ALPHA */
 
-static void inline idctCol (int16_t * col)
+static inline void idctSparseColPut (UINT8 *dest, int line_size, 
+                                     int16_t * col)
 {
-
-/*
-	if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
-		col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
-			col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
-		return;
-	}*/
-
 	int a0, a1, a2, a3, b0, b1, b2, b3;
-	col[0] += (1<<(COL_SHIFT-1))/W4;
-	a0 = W4*col[8*0] + W2*col[8*2] + W4*col[8*4] + W6*col[8*6];
-	a1 = W4*col[8*0] + W6*col[8*2] - W4*col[8*4] - W2*col[8*6];
-	a2 = W4*col[8*0] - W6*col[8*2] - W4*col[8*4] + W2*col[8*6];
-	a3 = W4*col[8*0] - W2*col[8*2] + W4*col[8*4] - W6*col[8*6];
-
-	b0 = W1*col[8*1] + W3*col[8*3] + W5*col[8*5] + W7*col[8*7];
-	b1 = W3*col[8*1] - W7*col[8*3] - W1*col[8*5] - W5*col[8*7];
-	b2 = W5*col[8*1] - W1*col[8*3] + W7*col[8*5] + W3*col[8*7];
-	b3 = W7*col[8*1] - W5*col[8*3] + W3*col[8*5] - W1*col[8*7];
-
-	col[8*0] = (a0 + b0) >> COL_SHIFT;
-	col[8*7] = (a0 - b0) >> COL_SHIFT;
-	col[8*1] = (a1 + b1) >> COL_SHIFT;
-	col[8*6] = (a1 - b1) >> COL_SHIFT;
-	col[8*2] = (a2 + b2) >> COL_SHIFT;
-	col[8*5] = (a2 - b2) >> COL_SHIFT;
-	col[8*3] = (a3 + b3) >> COL_SHIFT;
-	col[8*4] = (a3 - b3) >> COL_SHIFT;
-}
+        UINT8 *cm = cropTbl + MAX_NEG_CROP;
 
-static void inline idctSparseCol (int16_t * col)
-{
-	int a0, a1, a2, a3, b0, b1, b2, b3;
-	col[0] += (1<<(COL_SHIFT-1))/W4;
-	a0 = W4*col[8*0];
-	a1 = W4*col[8*0];
-	a2 = W4*col[8*0];
-	a3 = W4*col[8*0];
-
-	if(col[8*2]){
-		a0 +=  + W2*col[8*2];
-		a1 +=  + W6*col[8*2];
-		a2 +=  - W6*col[8*2];
-		a3 +=  - W2*col[8*2];
-	}
+        /* XXX: the rounding bias is added before the W4 multiply only to give the same values as the previous code */
+	a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
+	a1 = a0;
+	a2 = a0;
+	a3 = a0;
 
-	if(col[8*4]){
-		a0 += + W4*col[8*4];
-		a1 += - W4*col[8*4];
-		a2 += - W4*col[8*4];
-		a3 += + W4*col[8*4];
-	}
-
-	if(col[8*6]){
-		a0 += + W6*col[8*6];
-		a1 += - W2*col[8*6];
-		a2 += + W2*col[8*6];
-		a3 += - W6*col[8*6];
-	}
+        a0 +=  + W2*col[8*2];
+        a1 +=  + W6*col[8*2];
+        a2 +=  - W6*col[8*2];
+        a3 +=  - W2*col[8*2];
 
-	if(col[8*1]){
-		b0 = W1*col[8*1];
-		b1 = W3*col[8*1];
-		b2 = W5*col[8*1];
-		b3 = W7*col[8*1];
-	}else{
-		b0 = 
-		b1 = 
-		b2 = 
-		b3 = 0;
-	}
+        MUL16(b0, W1, col[8*1]);
+        MUL16(b1, W3, col[8*1]);
+        MUL16(b2, W5, col[8*1]);
+        MUL16(b3, W7, col[8*1]);
 
-	if(col[8*3]){
-		b0 += + W3*col[8*3];
-		b1 += - W7*col[8*3];
-		b2 += - W1*col[8*3];
-		b3 += - W5*col[8*3];
-	}
+        MAC16(b0, + W3, col[8*3]);
+        MAC16(b1, - W7, col[8*3]);
+        MAC16(b2, - W1, col[8*3]);
+        MAC16(b3, - W5, col[8*3]);
 
-	if(col[8*5]){
-		b0 += + W5*col[8*5];
-		b1 += - W1*col[8*5];
-		b2 += + W7*col[8*5];
-		b3 += + W3*col[8*5];
+	if(col[8*4]){
+            a0 += + W4*col[8*4];
+            a1 += - W4*col[8*4];
+            a2 += - W4*col[8*4];
+            a3 += + W4*col[8*4];
 	}
 
-	if(col[8*7]){
-		b0 += + W7*col[8*7];
-		b1 += - W5*col[8*7];
-		b2 += + W3*col[8*7];
-		b3 += - W1*col[8*7];
+	if (col[8*5]) {
+            MAC16(b0, + W5, col[8*5]);
+            MAC16(b1, - W1, col[8*5]);
+            MAC16(b2, + W7, col[8*5]);
+            MAC16(b3, + W3, col[8*5]);
 	}
 
-#ifndef ARCH_ALPHA
-	if(!(b0|b1|b2|b3)){
-		col[8*0] = (a0) >> COL_SHIFT;
-		col[8*7] = (a0) >> COL_SHIFT;
-		col[8*1] = (a1) >> COL_SHIFT;
-		col[8*6] = (a1) >> COL_SHIFT;
-		col[8*2] = (a2) >> COL_SHIFT;
-		col[8*5] = (a2) >> COL_SHIFT;
-		col[8*3] = (a3) >> COL_SHIFT;
-		col[8*4] = (a3) >> COL_SHIFT;
-	}else{
-#endif
-		col[8*0] = (a0 + b0) >> COL_SHIFT;
-		col[8*7] = (a0 - b0) >> COL_SHIFT;
-		col[8*1] = (a1 + b1) >> COL_SHIFT;
-		col[8*6] = (a1 - b1) >> COL_SHIFT;
-		col[8*2] = (a2 + b2) >> COL_SHIFT;
-		col[8*5] = (a2 - b2) >> COL_SHIFT;
-		col[8*3] = (a3 + b3) >> COL_SHIFT;
-		col[8*4] = (a3 - b3) >> COL_SHIFT;
-#ifndef ARCH_ALPHA
-	}
-#endif
+	if(col[8*6]){
+            a0 += + W6*col[8*6];
+            a1 += - W2*col[8*6];
+            a2 += + W2*col[8*6];
+            a3 += - W6*col[8*6];
+	}
+
+	if (col[8*7]) {
+            MAC16(b0, + W7, col[8*7]);
+            MAC16(b1, - W5, col[8*7]);
+            MAC16(b2, + W3, col[8*7]);
+            MAC16(b3, - W1, col[8*7]);
+	}
+
+        dest[0] = cm[(a0 + b0) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a1 + b1) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a2 + b2) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a3 + b3) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a3 - b3) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a2 - b2) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a1 - b1) >> COL_SHIFT];
+        dest += line_size;
+        dest[0] = cm[(a0 - b0) >> COL_SHIFT];
 }
 
-static void inline idctSparse2Col (int16_t * col)
+static inline void idctSparseColAdd (UINT8 *dest, int line_size, 
+                                     int16_t * col)
 {
 	int a0, a1, a2, a3, b0, b1, b2, b3;
-	col[0] += (1<<(COL_SHIFT-1))/W4;
-	a0 = W4*col[8*0];
-	a1 = W4*col[8*0];
-	a2 = W4*col[8*0];
-	a3 = W4*col[8*0];
-
-	if(col[8*2]){
-		a0 +=  + W2*col[8*2];
-		a1 +=  + W6*col[8*2];
-		a2 +=  - W6*col[8*2];
-		a3 +=  - W2*col[8*2];
-	}
+        UINT8 *cm = cropTbl + MAX_NEG_CROP;
 
-	if(col[8*4]){
-		a0 += + W4*col[8*4];
-		a1 += - W4*col[8*4];
-		a2 += - W4*col[8*4];
-		a3 += + W4*col[8*4];
-	}
+        /* XXX: the rounding bias is added before the W4 multiply only to give the same values as the previous code */
+	a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
+	a1 = a0;
+	a2 = a0;
+	a3 = a0;
 
-	if(col[8*6]){
-		a0 += + W6*col[8*6];
-		a1 += - W2*col[8*6];
-		a2 += + W2*col[8*6];
-		a3 += - W6*col[8*6];
-	}
+        a0 +=  + W2*col[8*2];
+        a1 +=  + W6*col[8*2];
+        a2 +=  - W6*col[8*2];
+        a3 +=  - W2*col[8*2];
 
-	if(col[8*1] || 1){
-		b0 = W1*col[8*1];
-		b1 = W3*col[8*1];
-		b2 = W5*col[8*1];
-		b3 = W7*col[8*1];
-	}else{
-		b0 = 
-		b1 = 
-		b2 = 
-		b3 = 0;
-	}
+        MUL16(b0, W1, col[8*1]);
+        MUL16(b1, W3, col[8*1]);
+        MUL16(b2, W5, col[8*1]);
+        MUL16(b3, W7, col[8*1]);
 
-	if(col[8*3]){
-		b0 += + W3*col[8*3];
-		b1 += - W7*col[8*3];
-		b2 += - W1*col[8*3];
-		b3 += - W5*col[8*3];
-	}
+        MAC16(b0, + W3, col[8*3]);
+        MAC16(b1, - W7, col[8*3]);
+        MAC16(b2, - W1, col[8*3]);
+        MAC16(b3, - W5, col[8*3]);
 
-	if(col[8*5]){
-		b0 += + W5*col[8*5];
-		b1 += - W1*col[8*5];
-		b2 += + W7*col[8*5];
-		b3 += + W3*col[8*5];
+	if(col[8*4]){
+            a0 += + W4*col[8*4];
+            a1 += - W4*col[8*4];
+            a2 += - W4*col[8*4];
+            a3 += + W4*col[8*4];
 	}
 
-	if(col[8*7]){
-		b0 += + W7*col[8*7];
-		b1 += - W5*col[8*7];
-		b2 += + W3*col[8*7];
-		b3 += - W1*col[8*7];
+	if (col[8*5]) {
+            MAC16(b0, + W5, col[8*5]);
+            MAC16(b1, - W1, col[8*5]);
+            MAC16(b2, + W7, col[8*5]);
+            MAC16(b3, + W3, col[8*5]);
 	}
 
-	col[8*0] = (a0 + b0) >> COL_SHIFT;
-	col[8*7] = (a0 - b0) >> COL_SHIFT;
-	col[8*1] = (a1 + b1) >> COL_SHIFT;
-	col[8*6] = (a1 - b1) >> COL_SHIFT;
-	col[8*2] = (a2 + b2) >> COL_SHIFT;
-	col[8*5] = (a2 - b2) >> COL_SHIFT;
-	col[8*3] = (a3 + b3) >> COL_SHIFT;
-	col[8*4] = (a3 - b3) >> COL_SHIFT;
+	if(col[8*6]){
+            a0 += + W6*col[8*6];
+            a1 += - W2*col[8*6];
+            a2 += + W2*col[8*6];
+            a3 += - W6*col[8*6];
+	}
+
+	if (col[8*7]) {
+            MAC16(b0, + W7, col[8*7]);
+            MAC16(b1, - W5, col[8*7]);
+            MAC16(b2, + W3, col[8*7]);
+            MAC16(b3, - W1, col[8*7]);
+	}
+
+        dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
+        dest += line_size;
+        dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
 }
 
 #ifdef ARCH_ALPHA
@@ -472,82 +432,11 @@ static inline void idctCol2(int16_t *col)
 	lcol[12] = l; lcol[13] = r;
 	lcol[14] = l; lcol[15] = r;
 }
-#endif
 
 void simple_idct (short *block)
 {
 
 	int i;
-	
-#if 0
-	int nonZero[8];
-	int buffer[64];
-	int nNonZero=0;
-	
-	idctRowCondDC(block);
-	
-	for(i=1; i<8; i++)
-	{
-		nonZero[nNonZero]=i;
-		nNonZero+= idctRowCondZ(block + i*8);
-	}
-	
-	if(nNonZero==0)
-	{
-		for(i=0; i<8; i++)
-		{
-			block[i   ]=
-			block[i+8 ]=
-			block[i+16]=
-			block[i+24]=
-			block[i+32]=
-			block[i+40]=
-			block[i+48]=
-			block[i+56]= (W4*block[i] + (1<<(COL_SHIFT-1))) >> COL_SHIFT;
-		}	
-	}
-	else if(nNonZero==1)
-	{
-		int index= nonZero[0]*8;
-		for(i=0; i<8; i++)
-		{
-			int bias= W4*block[i] + (1<<(COL_SHIFT-1));
-			int c= block[i + index];
-			block[i   ]= (c*coeff[index  ] + bias) >> COL_SHIFT;
-			block[i+8 ]= (c*coeff[index+1] + bias) >> COL_SHIFT;
-			block[i+16]= (c*coeff[index+2] + bias) >> COL_SHIFT;
-			block[i+24]= (c*coeff[index+3] + bias) >> COL_SHIFT;
-			block[i+32]= (c*coeff[index+4] + bias) >> COL_SHIFT;
-			block[i+40]= (c*coeff[index+5] + bias) >> COL_SHIFT;
-			block[i+48]= (c*coeff[index+6] + bias) >> COL_SHIFT;
-			block[i+56]= (c*coeff[index+7] + bias) >> COL_SHIFT;
-		}	
-	}
-/*	else if(nNonZero==2)
-	{
-		int index1= nonZero[0]*8;
-		int index2= nonZero[1]*8;
-		for(i=0; i<8; i++)
-		{
-			int bias= W4*block[i] + (1<<(COL_SHIFT-1));
-			int c1= block[i + index1];
-			int c2= block[i + index2];
-			block[i   ]= (c1*coeff[index1  ] + c2*coeff[index2  ] + bias) >> COL_SHIFT;
-			block[i+8 ]= (c1*coeff[index1+1] + c2*coeff[index2+1] + bias) >> COL_SHIFT;
-			block[i+16]= (c1*coeff[index1+2] + c2*coeff[index2+2] + bias) >> COL_SHIFT;
-			block[i+24]= (c1*coeff[index1+3] + c2*coeff[index2+3] + bias) >> COL_SHIFT;
-			block[i+32]= (c1*coeff[index1+4] + c2*coeff[index2+4] + bias) >> COL_SHIFT;
-			block[i+40]= (c1*coeff[index1+5] + c2*coeff[index2+5] + bias) >> COL_SHIFT;
-			block[i+48]= (c1*coeff[index1+6] + c2*coeff[index2+6] + bias) >> COL_SHIFT;
-			block[i+56]= (c1*coeff[index1+7] + c2*coeff[index2+7] + bias) >> COL_SHIFT;
-		}	
-	}*/
-	else
-	{
-		for(i=0; i<8; i++)
-			idctSparse2Col(block + i);
-	}
-#elif defined(ARCH_ALPHA)
         int rowsZero = 1;       /* all rows except row 0 zero */
         int rowsConstant = 1;	/* all rows consist of a constant value */
 
@@ -579,11 +468,43 @@ void simple_idct (short *block)
 		for (i = 0; i < 8; i++)
 			idctSparseCol(block + i);
 	}
+}
+
+/* XXX: suppress this mess */
+void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    simple_idct(block);
+    put_pixels_clamped(block, dest, line_size);
+}
+
+void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    simple_idct(block);
+    add_pixels_clamped(block, dest, line_size);
+}
+
 #else
-	for(i=0; i<8; i++)
-		idctRowCondDC(block + i*8);
-	
-	for(i=0; i<8; i++)
-		idctSparseCol(block + i);
-#endif
+
+void simple_idct_put(UINT8 *dest, int line_size, INT16 *block)
+{
+    int i;
+    for(i=0; i<8; i++)
+        idctRowCondDC(block + i*8);
+    
+    for(i=0; i<8; i++)
+        idctSparseColPut(dest + i, line_size, block + i);
 }
+
+void simple_idct_add(UINT8 *dest, int line_size, INT16 *block)
+{
+    int i;
+    for(i=0; i<8; i++)
+        idctRowCondDC(block + i*8);
+    
+    for(i=0; i<8; i++)
+        idctSparseColAdd(dest + i, line_size, block + i);
+}
+
+#endif
+
+#undef COL_SHIFT
diff --git a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h
index 54dff7396..233a7b841 100644
--- a/src/libffmpeg/libavcodec/simple_idct.h
+++ b/src/libffmpeg/libavcodec/simple_idct.h
@@ -1,20 +1,23 @@
 /*
-    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
 
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
-void simple_idct(short *block);
+void simple_idct_put(UINT8 *dest, int line_size, INT16 *block);
+void simple_idct_add(UINT8 *dest, int line_size, INT16 *block);
 void simple_idct_mmx(short *block);
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index 180712314..f6d967757 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -1,35 +1,30 @@
 /*
  * utils for libavcodec
- * Copyright (c) 2001 Gerard Lantau.
+ * Copyright (c) 2001 Fabrice Bellard.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include "common.h"
-#include "dsputil.h"
 #include "avcodec.h"
+#include "dsputil.h"
+#include "mpegvideo.h"
 #ifdef HAVE_MALLOC_H
 #include <malloc.h>
-#else
-#include <stdlib.h>
 #endif
 
 /* memory alloc */
-void *av_mallocz(int size)
+void *av_malloc(int size)
 {
     void *ptr;
 #if defined ( ARCH_X86 ) && defined ( HAVE_MEMALIGN )
@@ -52,6 +47,31 @@ void *av_mallocz(int size)
     return ptr;
 }
 
+void *av_mallocz(int size)
+{
+    void *ptr;
+    ptr = av_malloc(size);
+    if (!ptr)
+        return NULL;
+    memset(ptr, 0, size);
+    return ptr;
+}
+
+/* NOTE: ptr = NULL is explicitly allowed */
+void av_free(void *ptr)
+{
+    /* XXX: this test should not be needed on most libcs */
+    if (ptr)
+        free(ptr);
+}
+
+/* cannot be called directly because 'void **' casting is not automatic */
+void __av_freep(void **ptr)
+{
+    av_free(*ptr);
+    *ptr = NULL;
+}
+
 /* encoder management */
 AVCodec *first_avcodec;
 
@@ -70,13 +90,16 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec)
 
     avctx->codec = codec;
     avctx->frame_number = 0;
-    avctx->priv_data = av_mallocz(codec->priv_data_size);
-    if (!avctx->priv_data) 
-        return -ENOMEM;
+    if (codec->priv_data_size > 0) {
+        avctx->priv_data = av_mallocz(codec->priv_data_size);
+        if (!avctx->priv_data) 
+            return -ENOMEM;
+    } else {
+        avctx->priv_data = NULL;
+    }
     ret = avctx->codec->init(avctx);
     if (ret < 0) {
-        free(avctx->priv_data);
-        avctx->priv_data = NULL;
+        av_freep(&avctx->priv_data);
         return ret;
     }
     return 0;
@@ -138,8 +161,7 @@ int avcodec_close(AVCodecContext *avctx)
 {
     if (avctx->codec->close)
         avctx->codec->close(avctx);
-    free(avctx->priv_data);
-    avctx->priv_data = NULL;
+    av_freep(&avctx->priv_data);
     avctx->codec = NULL;
     return 0;
 }
@@ -205,6 +227,7 @@ AVCodec *avcodec_find(enum CodecID id)
 }
 
 const char *pix_fmt_str[] = {
+    "??",
     "yuv420p",
     "yuv422",
     "rgb24",
@@ -218,6 +241,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
     const char *codec_name;
     AVCodec *p;
     char buf1[32];
+    char channels_str[100];
     int bitrate;
 
     if (encode)
@@ -259,19 +283,54 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
                      enc->width, enc->height, 
                      (float)enc->frame_rate / FRAME_RATE_BASE);
         }
+        snprintf(buf + strlen(buf), buf_size - strlen(buf),
+                ", q=%d-%d", enc->qmin, enc->qmax);
+
         bitrate = enc->bit_rate;
         break;
     case CODEC_TYPE_AUDIO:
         snprintf(buf, buf_size,
                  "Audio: %s",
                  codec_name);
+        switch (enc->channels) {
+            case 1:
+                strcpy(channels_str, "mono");
+                break;
+            case 2:
+                strcpy(channels_str, "stereo");
+                break;
+            case 6:
+                strcpy(channels_str, "5:1");
+                break;
+            default:
+                sprintf(channels_str, "%d channels", enc->channels);
+                break;
+        }
         if (enc->sample_rate) {
             snprintf(buf + strlen(buf), buf_size - strlen(buf),
                      ", %d Hz, %s",
                      enc->sample_rate,
-                     enc->channels == 2 ? "stereo" : "mono");
+                     channels_str);
+        }
+        
+        /* for PCM codecs, compute bitrate directly */
+        switch(enc->codec_id) {
+        case CODEC_ID_PCM_S16LE:
+        case CODEC_ID_PCM_S16BE:
+        case CODEC_ID_PCM_U16LE:
+        case CODEC_ID_PCM_U16BE:
+            bitrate = enc->sample_rate * enc->channels * 16;
+            break;
+        case CODEC_ID_PCM_S8:
+        case CODEC_ID_PCM_U8:
+        case CODEC_ID_PCM_ALAW:
+        case CODEC_ID_PCM_MULAW:
+            bitrate = enc->sample_rate * enc->channels * 8;
+            break;
+        default:
+            bitrate = enc->bit_rate;
+            break;
         }
-	bitrate = enc->bit_rate;
         break;
     default:
         abort();
@@ -364,6 +423,15 @@ int avpicture_get_size(int pix_fmt, int width, int height)
     return size;
 }
 
+unsigned avcodec_version( void )
+{
+  return LIBAVCODEC_VERSION_INT;
+}
+
+unsigned avcodec_build( void )
+{
+  return LIBAVCODEC_BUILD;
+}
 
 /* must be called before any other functions */
 void avcodec_init(void)
@@ -412,6 +480,7 @@ void avcodec_register_all(void)
     register_avcodec(&msmpeg4v1_decoder);
     register_avcodec(&msmpeg4v2_decoder);
     register_avcodec(&msmpeg4v3_decoder);
+    register_avcodec(&wmv1_decoder);
     register_avcodec(&mpeg_decoder);
     register_avcodec(&h263i_decoder);
     register_avcodec(&rv10_decoder);
@@ -423,20 +492,28 @@ void avcodec_register_all(void)
 
 }
 
-static int encode_init(AVCodecContext *s)
+/* this should be called after seeking and before trying to decode the next frame */
+void avcodec_flush_buffers(AVCodecContext *avctx)
+{
+    MpegEncContext *s = avctx->priv_data;
+    s->num_available_buffers=0;
+}
+
+
+static int raw_encode_init(AVCodecContext *s)
 {
     return 0;
 }
 
-static int decode_frame(AVCodecContext *avctx, 
-                        void *data, int *data_size,
-                        UINT8 *buf, int buf_size)
+static int raw_decode_frame(AVCodecContext *avctx,
+			    void *data, int *data_size,
+			    UINT8 *buf, int buf_size)
 {
     return -1;
 }
 
-static int encode_frame(AVCodecContext *avctx,
-                        unsigned char *frame, int buf_size, void *data)
+static int raw_encode_frame(AVCodecContext *avctx,
+			    unsigned char *frame, int buf_size, void *data)
 {
     return -1;
 }
@@ -446,8 +523,8 @@ AVCodec rawvideo_codec = {
     CODEC_TYPE_VIDEO,
     CODEC_ID_RAWVIDEO,
     0,
-    encode_init,
-    encode_frame,
+    raw_encode_init,
+    raw_encode_frame,
     NULL,
-    decode_frame,
+    raw_decode_frame,
 };