27 files changed, 2183 insertions, 3457 deletions
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index c2ffc84bc..8e2a6a4df 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -2,6 +2,8 @@
 ## Process this file with automake to produce Makefile.in
 ##
 
+SUBDIRS = armv4l i386 mlib
+
 #CFLAGS = -D_FILE_OFFSET_BITS=64 @GLOBAL_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H
 
 CFLAGS = @GLOBAL_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H
@@ -11,28 +13,22 @@ LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic
 
 noinst_LTLIBRARIES = libavcodec.la
 
-if HAVE_FFMMX
-mmx_modules = mpegvideo_mmx.c sad_mmx.s dsputil_mmx.c idct_mmx.c
-#mmx_modules = mpegvideo_mmx.c sad_mmx.s
-endif
-
-if HAVE_MLIB
-mlib_modules = dsputil_mlib.c idct_mlib.c
-endif
-
-EXTRA_DIST = mpegvideo_mmx.c sad_mmx.s dsputil_mmx.c idct_mmx.c \
-	dsputil_mlib.c idct_mlib.c
-
-libavcodec_la_SOURCES = dsputil.c   fdctref.c   jfdctfst.c  mpeg12.c   \
-		        utils.c     rv10.c      h263.c      jrevdct.c  \
-		        common.c    h263dec.c   msmpeg4.c   \
-			mpegvideo.c mjpeg.c     motion_est.c \
-			$(mmx_modules) $(mlib_modules)
+libavcodec_la_SOURCES = common.c utils.c mpegvideo.c h263.c jrevdct.c jfdctfst.c \
+			mjpeg.c dsputil.c \
+			motion_est.c imgconvert.c msmpeg4.c \
+			mpeg12.c h263dec.c rv10.c simple_idct.c
+#imgresample.c
+ 
+libavcodec_la_LDFLAGS = \
+		$(top_builddir)/src/libffmpeg/libavcodec/armv4l/libavcodec_armv4l.la \
+		$(top_builddir)/src/libffmpeg/libavcodec/i386/libavcodec_mmx.la \
+		$(top_builddir)/src/libffmpeg/libavcodec/mlib/libavcodec_mlib.la \
+		-avoid-version -module
 
 
-noinst_HEADERS = avcodec.h    dsputil.h   mpegvideo.h   dsputil_mmx_avg.h\
+noinst_HEADERS = avcodec.h    dsputil.h   mpegvideo.h \
 		 common.h     h263data.h  mpeg4data.h   msmpeg4data.h \
-		 mpeg12data.h
+		 mpeg12data.h simple_idct.h
 
 .s.lo:
 	$(ASCOMPILE) -o $@ `test -f $< || echo '$(srcdir)/'`$<
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 5155c8fd2..864ebb3ba 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -1,3 +1,6 @@
+#ifndef AVCODEC_H
+#define AVCODEC_H
+
 #include "common.h"
 
 enum CodecID {
@@ -9,11 +12,20 @@ enum CodecID {
     CODEC_ID_AC3,
     CODEC_ID_MJPEG,
     CODEC_ID_MPEG4,
-    CODEC_ID_PCM,
     CODEC_ID_RAWVIDEO,
     CODEC_ID_MSMPEG4,
     CODEC_ID_H263P,
     CODEC_ID_H263I,
+
+    /* various pcm "codecs" */
+    CODEC_ID_PCM_S16LE,
+    CODEC_ID_PCM_S16BE,
+    CODEC_ID_PCM_U16LE,
+    CODEC_ID_PCM_U16BE,
+    CODEC_ID_PCM_S8,
+    CODEC_ID_PCM_U8,
+    CODEC_ID_PCM_MULAW,
+    CODEC_ID_PCM_ALAW,
 };
 
 enum CodecType {
@@ -30,6 +42,11 @@ enum PixelFormat {
     PIX_FMT_YUV444P,
 };
 
+/* currently unused, may be used if 24/32 bits samples ever supported */
+enum SampleFormat {
+    SAMPLE_FMT_S16 = 0,         /* signed 16 bits */
+};
+
 /* in bytes */
 #define AVCODEC_MAX_AUDIO_FRAME_SIZE 18432
 
@@ -74,6 +91,7 @@ typedef struct AVCodecContext {
     /* audio only */
     int sample_rate; /* samples per sec */
     int channels;
+    int sample_fmt;  /* sample format, currenly unused */
 
     /* the following data should not be initialized */
     int frame_size; /* in samples, initialized when calling 'init' */
@@ -85,6 +103,19 @@ typedef struct AVCodecContext {
     struct AVCodec *codec;
     void *priv_data;
 
+    /* The following data is for RTP friendly coding */
+    /* By now only H.263/H.263+ coder honours this   */
+    int rtp_mode;   /* 1 for activate RTP friendly-mode           */
+                    /* highers numbers represent more error-prone */
+                    /* enviroments, by now just "1" exist         */
+    
+    int rtp_payload_size;   /* The size of the RTP payload, the coder will  */
+                            /* do it's best to deliver a chunk with size    */
+                            /* below rtp_payload_size, the chunk will start */
+                            /* with a start code on some codecs like H.263  */
+                            /* This doesn't take account of any particular  */
+                            /* headers inside the transmited RTP payload    */
+                 
     /* the following fields are ignored */
     void *opaque;   /* can be used to carry app specific stuff */
     char codec_name[32];
@@ -130,13 +161,30 @@ extern AVCodec mpeg_decoder;
 extern AVCodec h263i_decoder;
 extern AVCodec rv10_decoder;
 extern AVCodec mjpeg_decoder;
+#ifdef FF_AUDIO_CODECS
+extern AVCodec mp3_decoder;
+
+/* pcm codecs */
+#define PCM_CODEC(id, name) \
+extern AVCodec name ## _decoder; \
+extern AVCodec name ## _encoder;
 
-/* dummy raw codecs */
-extern AVCodec pcm_codec;
+PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le);
+PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be);
+PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le);
+PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be);
+PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8);
+PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8);
+PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw);
+PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw);
+
+#undef PCM_CODEC
+#endif
+
+/* dummy raw video codec */
 extern AVCodec rawvideo_codec;
 
 /* the following codecs use external GPL libs */
-extern AVCodec mp3_decoder;
 extern AVCodec ac3_decoder;
 
 /* resample.c */
@@ -184,6 +232,7 @@ void avcodec_init(void);
 
 void register_avcodec(AVCodec *format);
 AVCodec *avcodec_find_encoder(enum CodecID id);
+AVCodec *avcodec_find_encoder_by_name(const char *name);
 AVCodec *avcodec_find_decoder(enum CodecID id);
 AVCodec *avcodec_find_decoder_by_name(const char *name);
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode);
@@ -203,3 +252,13 @@ int avcodec_encode_video(AVCodecContext *avctx, UINT8 *buf, int buf_size,
 int avcodec_close(AVCodecContext *avctx);
 
 void avcodec_register_all(void);
+
+#ifdef FF_POSTPROCESS
+#ifndef MBC
+#define MBC 48
+#define MBR 36
+#endif
+extern int quant_store[MBR+1][MBC+1]; // [Review]
+#endif
+
+#endif /* AVCODEC_H */
diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c
index b3ce07c05..8f0fdc1bb 100644
--- a/src/libffmpeg/libavcodec/common.c
+++ b/src/libffmpeg/libavcodec/common.c
@@ -250,6 +250,52 @@ void align_get_bits(GetBitContext *s)
         get_bits(s, n);
     }
 }
+/* This function is identical to get_bits_long(), the */
+/* only diference is that it doesn't touch the buffer */
+/* it is usefull to see the buffer.                   */
+
+unsigned int show_bits_long(GetBitContext *s, int n)
+{
+    unsigned int val;
+    int bit_cnt;
+    unsigned int bit_buf;
+	UINT8 *buf_ptr;
+	
+    bit_buf = s->bit_buf;
+    bit_cnt = s->bit_cnt - n;
+
+    val = bit_buf >> (32 - n);
+    buf_ptr = s->buf_ptr;
+    buf_ptr += 4;
+
+    /* handle common case: we can read everything */
+    if (buf_ptr <= s->buf_end) {
+#ifdef ARCH_X86
+        bit_buf = bswap_32(*((unsigned long*)(&buf_ptr[-4])));
+#else
+        bit_buf = (buf_ptr[-4] << 24) |
+            (buf_ptr[-3] << 16) |
+            (buf_ptr[-2] << 8) |
+            (buf_ptr[-1]);	    
+#endif
+    } else {
+        buf_ptr -= 4;
+        bit_buf = 0;
+        if (buf_ptr < s->buf_end)
+            bit_buf |= *buf_ptr++ << 24;
+        if (buf_ptr < s->buf_end)
+            bit_buf |= *buf_ptr++ << 16;
+        if (buf_ptr < s->buf_end)
+            bit_buf |= *buf_ptr++ << 8;
+        if (buf_ptr < s->buf_end)
+            bit_buf |= *buf_ptr++;
+    }
+    val |= bit_buf >> (32 + bit_cnt);
+    bit_buf <<= - bit_cnt;
+    bit_cnt += 32;
+    
+    return val;
+}
 
 /* VLC decoding */
 
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 8ea58b2e7..be9678cf4 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -1,10 +1,13 @@
 #ifndef COMMON_H
 #define COMMON_H
 
-#define FFMPEG_VERSION_INT 0x000405
-#define FFMPEG_VERSION     "0.4.5"
+#define FFMPEG_VERSION_INT 0x000406
+#define FFMPEG_VERSION     "0.4.6"
+/* CVS version as 26-12-2001 */
 
-#ifdef WIN32
+#undef DEBUG
+
+#if defined(WIN32) && !defined(__MINGW32__)
 #define CONFIG_WIN32
 #endif
 
@@ -43,6 +46,7 @@ typedef INT16 int16_t;
 typedef UINT32 uint32_t;
 typedef INT32 int32_t;
 
+#ifndef __MINGW32__
 #define INT64_C(c)     (c ## i64)
 #define UINT64_C(c)    (c ## i64)
 
@@ -56,6 +60,11 @@ typedef INT32 int32_t;
 #pragma warning( disable : 4244 )
 #pragma warning( disable : 4305 )
 
+#else
+#define INT64_C(c)     (c ## LL)
+#define UINT64_C(c)    (c ## ULL)
+#endif /* __MINGW32__ */
+
 #define M_PI    3.14159265358979323846
 #define M_SQRT2 1.41421356237309504880  /* sqrt(2) */
 
@@ -71,10 +80,12 @@ typedef INT32 int32_t;
 
 #define snprintf _snprintf
 
+#ifndef __MINGW32__
+/* no config.h with VC */
 #define CONFIG_ENCODERS 1
 #define CONFIG_DECODERS 1
 #define CONFIG_AC3      1
-#define CONFIG_MPGLIB   1
+#endif
 
 #else
 
@@ -112,15 +123,35 @@ typedef signed long long INT64;
 #include "fastmemcpy.h"
 #endif
 
+#endif /* HAVE_AV_CONFIG_H */
+
+#endif /* !CONFIG_WIN32 */
+
+/* debug stuff */
+#ifdef HAVE_AV_CONFIG_H
+
 #ifndef DEBUG
 #define NDEBUG
 #endif
 #include <assert.h>
 
-#endif /* HAVE_AV_CONFIG_H */
+/* dprintf macros */
+#if defined(CONFIG_WIN32) && !defined(__MINGW32__)
+
+inline void dprintf(const char* fmt,...) {}
+
+#else
+
+#ifdef DEBUG
+#define dprintf(fmt,args...) printf(fmt, ## args)
+#else
+#define dprintf(fmt,args...)
+#endif
 
 #endif /* !CONFIG_WIN32 */
 
+#endif /* HAVE_AV_CONFIG_H */
+
 /* bit output */
 
 struct PutBitContext;
@@ -141,7 +172,7 @@ void init_put_bits(PutBitContext *s,
                    void *opaque,
                    void (*write_data)(void *, UINT8 *, int));
 void put_bits(PutBitContext *s, int n, unsigned int value);
-INT64 get_bit_count(PutBitContext *s);
+INT64 get_bit_count(PutBitContext *s); /* XXX: change function name */
 void align_put_bits(PutBitContext *s);
 void flush_put_bits(PutBitContext *s);
 
@@ -168,6 +199,7 @@ void init_get_bits(GetBitContext *s,
                    UINT8 *buffer, int buffer_size);
 
 unsigned int get_bits_long(GetBitContext *s, int n);
+unsigned int show_bits_long(GetBitContext *s, int n);
 
 static inline unsigned int get_bits(GetBitContext *s, int n){
     if(s->bit_cnt>=n){
@@ -197,6 +229,19 @@ static inline unsigned int get_bits1(GetBitContext *s){
     return get_bits_long(s,1);
 }
 
+/* This function is identical to get_bits(), the only */
+/* diference is that it doesn't touch the buffer      */
+/* it is usefull to see the buffer.                   */
+static inline unsigned int show_bits(GetBitContext *s, int n)
+{
+    if(s->bit_cnt>=n) {
+        /* most common case here */
+        unsigned int val = s->bit_buf >> (32 - n);
+        return val;
+    }
+    return show_bits_long(s,n);
+}
+
 static inline void skip_bits(GetBitContext *s, int n){
     if(s->bit_cnt>=n){
         /* most common case here */
@@ -223,6 +268,10 @@ static inline void skip_bits1(GetBitContext *s){
     }
 }
 
+static inline int get_bits_count(GetBitContext *s)
+{
+    return (s->buf_ptr - s->buf) * 8 - s->bit_cnt;
+}
 
 void align_get_bits(GetBitContext *s);
 int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index e68833b7c..f699b2ef6 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -20,7 +20,7 @@
 #include <stdio.h>
 #include "avcodec.h"
 #include "dsputil.h"
-#include "xineutils.h"
+#include "simple_idct.h"
 
 void (*ff_idct)(DCTELEM *block);
 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
@@ -389,6 +389,27 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
 
 /* permute block according so that it corresponds to the MMX idct
    order */
+#ifdef SIMPLE_IDCT
+void block_permute(INT16 *block)
+{
+	int i;
+	INT16 temp[64];
+
+//	for(i=0; i<64; i++) temp[i] = block[ block_permute_op(i) ];
+	for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
+
+	for(i=0; i<64; i++) block[i] = temp[i];
+/*
+	for(i=0; i<64; i++)
+	{
+		if((i&7)==0) printf("\n");
+		printf("%2d ", block[i]);
+	}
+*/
+}
+
+#else
+
 void block_permute(INT16 *block)
 {
     int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
@@ -410,16 +431,12 @@ void block_permute(INT16 *block)
         block += 8;
     }
 }
+#endif
 
 void dsputil_init(void)
 {
     int i, j;
-#ifdef ARCH_X86
-    uint32_t mm = xine_mm_accel();
-#endif
-    int use_permuted_mmx_idct;
-    int accel_dsputil;
-
+    int use_permuted_idct;
 
     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
     for(i=0;i<MAX_NEG_CROP;i++) {
@@ -431,7 +448,11 @@ void dsputil_init(void)
         squareTbl[i] = (i - 256) * (i - 256);
     }
 
+#ifdef SIMPLE_IDCT
+    ff_idct = simple_idct;
+#else
     ff_idct = j_rev_dct;
+#endif
     get_pixels = get_pixels_c;
     put_pixels_clamped = put_pixels_clamped_c;
     add_pixels_clamped = add_pixels_clamped_c;
@@ -442,46 +463,34 @@ void dsputil_init(void)
     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
     av_fdct = jpeg_fdct_ifast;
 
-    use_permuted_mmx_idct = 1;
-    accel_dsputil = 0;
+    use_permuted_idct = 1;
 
-#ifdef ARCH_X86
-    if (!accel_dsputil && (mm & MM_ACCEL_X86_MMX)) {
-	dsputil_init_mmx();
-	accel_dsputil = 1;
-	/* printf("AVCODEC: Using mmx idct\n"); */
-    }
+#ifdef HAVE_MMX
+    dsputil_init_mmx();
 #endif
 #ifdef ARCH_ARMV4L
-    if (!accel_dsputil) {
-	dsputil_init_armv4l();
-	accel_dsputil = 1;
-	/* printf("AVCODEC: Using armv4l idct\n"); */
-    }
+    dsputil_init_armv4l();
 #endif
 #ifdef HAVE_MLIB
-    if (!accel_dsputil) {
-	dsputil_init_mlib();
-	accel_dsputil = 1;
-	use_permuted_mmx_idct = 0;
-	/* printf("AVCODEC: Using mediaLib idct\n"); */
-    }
+    dsputil_init_mlib();
+    use_permuted_idct = 0;
 #endif
-    if (!accel_dsputil) {
-	/* printf("AVCODEC: Using C idct\n"); */
-    }
 
-    if (use_permuted_mmx_idct) {
-	/* permute for IDCT */
-	for(i=0;i<64;i++) {
-	    j = zigzag_direct[i];
-	    zigzag_direct[i] = block_permute_op(j);
-	    j = ff_alternate_horizontal_scan[i];
-	    ff_alternate_horizontal_scan[i] = block_permute_op(j);
-	    j = ff_alternate_vertical_scan[i];
-	    ff_alternate_vertical_scan[i] = block_permute_op(j);
-	}
-	block_permute(default_intra_matrix);
-	block_permute(default_non_intra_matrix);
+#ifdef SIMPLE_IDCT
+    if(ff_idct == simple_idct) use_permuted_idct=0;
+#endif
+
+    if (use_permuted_idct) {
+        /* permute for IDCT */
+        for(i=0;i<64;i++) {
+            j = zigzag_direct[i];
+            zigzag_direct[i] = block_permute_op(j);
+            j = ff_alternate_horizontal_scan[i];
+            ff_alternate_horizontal_scan[i] = block_permute_op(j);
+            j = ff_alternate_vertical_scan[i];
+            ff_alternate_vertical_scan[i] = block_permute_op(j);
+        }
+        block_permute(default_intra_matrix);
+        block_permute(default_non_intra_matrix);
     }
 }
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index e401065a0..182b1e706 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -67,19 +67,53 @@ int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx, int h);
 
+#if defined (SIMPLE_IDCT) && defined (ARCH_X86)
+static inline int block_permute_op(int j)
+{
+static const int table[64]={
+	0x00, 0x08, 0x01, 0x09, 0x04, 0x0C, 0x05, 0x0D,
+	0x10, 0x18, 0x11, 0x19, 0x14, 0x1C, 0x15, 0x1D,
+	0x02, 0x0A, 0x03, 0x0B, 0x06, 0x0E, 0x07, 0x0F,
+	0x12, 0x1A, 0x13, 0x1B, 0x16, 0x1E, 0x17, 0x1F,
+	0x20, 0x28, 0x21, 0x29, 0x24, 0x2C, 0x25, 0x2D,
+	0x30, 0x38, 0x31, 0x39, 0x34, 0x3C, 0x35, 0x3D,
+	0x22, 0x2A, 0x23, 0x2B, 0x26, 0x2E, 0x27, 0x2F,
+	0x32, 0x3A, 0x33, 0x3B, 0x36, 0x3E, 0x37, 0x3F,
+};
+
+	return table[j];
+}
+#elif defined (SIMPLE_IDCT)
+static inline int block_permute_op(int j)
+{
+    return j;
+}
+#else
 static inline int block_permute_op(int j)
 {
     return (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
 }
+#endif
 
 void block_permute(INT16 *block);
 
 #if defined(ARCH_X86)
 
+#define HAVE_MMX 1
+
 #include "xineutils.h"
 
 extern int mm_flags;
 
+#define mm_support() xine_mm_accel()
+
+#if 0
+static inline void emms(void)
+{
+    __asm __volatile ("emms;":::"memory");
+}
+#endif
+
 #define emms_c() \
 {\
     if (mm_flags & MM_MMX)\
@@ -92,36 +126,29 @@ void dsputil_init_mmx(void);
 
 #elif defined(ARCH_ARMV4L)
 
+#define emms_c()
+
 /* This is to use 4 bytes read to the IDCT pointers for some 'zero'
    line ptimizations */
 #define __align8 __attribute__ ((aligned (4)))
 
 void dsputil_init_armv4l(void);   
 
-#endif
-
-
-
-#if defined(HAVE_MLIB)
+#elif defined(HAVE_MLIB)
+ 
+#define emms_c()
 
 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
 #define __align8 __attribute__ ((aligned (8)))
 
-void dsputil_init_mlib(void);
+void dsputil_init_mlib(void);   
 
-#endif	/* HAVE_MLIB */
+#else
 
+#define emms_c()
 
-/*
- * provide empty defaults, if the target specific accelerated dsputils did
- * not define these:
- */
-#ifndef	__align8
-#define	__align8
-#endif
+#define __align8
 
-#ifndef	emms_c
-#define	emms_c()
 #endif
 
 #endif
diff --git a/src/libffmpeg/libavcodec/dsputil_mlib.c b/src/libffmpeg/libavcodec/dsputil_mlib.c
deleted file mode 100644
index 4be97d0c7..000000000
--- a/src/libffmpeg/libavcodec/dsputil_mlib.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Sun mediaLib optimized DSP utils
- * Copyright (c) 2001 Juergen Keil.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "dsputil.h"
-
-#include <mlib_types.h>
-#include <mlib_status.h>
-#include <mlib_sys.h>
-#include <mlib_video.h>
-
-
-static void put_pixels_mlib (uint8_t * dest, const uint8_t * ref,
-			     int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoCopyRef_U8_U8_8x16(dest, (uint8_t *)ref, stride);
-    else
-	mlib_VideoCopyRef_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
-}
-
-static void put_pixels_x2_mlib (uint8_t * dest, const uint8_t * ref,
-				int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoInterpX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-    else
-	mlib_VideoInterpX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-}
-
-static void put_pixels_y2_mlib (uint8_t * dest, const uint8_t * ref,
-				int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoInterpY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-    else
-	mlib_VideoInterpY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-}
-
-static void put_pixels_xy2_mlib(uint8_t * dest, const uint8_t * ref,
-				int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16) 
-	mlib_VideoInterpXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-    else
-	mlib_VideoInterpXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-}
-
-static void avg_pixels_mlib (uint8_t * dest, const uint8_t * ref,
-			     int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoCopyRefAve_U8_U8_8x16(dest, (uint8_t *)ref, stride);
-    else
-	mlib_VideoCopyRefAve_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
-}
-
-static void avg_pixels_x2_mlib (uint8_t * dest, const uint8_t * ref,
-				int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoInterpAveX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-    else
-	mlib_VideoInterpAveX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-}
-
-static void avg_pixels_y2_mlib (uint8_t * dest, const uint8_t * ref,
-				int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoInterpAveY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-    else
-	mlib_VideoInterpAveY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-}
-
-static void avg_pixels_xy2_mlib (uint8_t * dest, const uint8_t * ref,
-				 int stride, int height)
-{
-    assert(height == 16 || height == 8);
-    if (height == 16)
-	mlib_VideoInterpAveXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
-    else
-	mlib_VideoInterpAveXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
-}
-
-
-static void add_pixels_clamped_mlib(const DCTELEM *block, UINT8 *pixels, int line_size)
-{
-    mlib_VideoAddBlock_U8_S16(pixels, (mlib_s16 *)block, line_size);
-}
-
-
-extern void ff_fdct_mlib(DCTELEM *data);
-extern void ff_idct_mlib(DCTELEM *data);
-
-void dsputil_init_mlib(void)
-{
-    av_fdct = ff_fdct_mlib;
-    ff_idct = ff_idct_mlib;
-
-    put_pixels_tab[0] = put_pixels_mlib;
-    put_pixels_tab[1] = put_pixels_x2_mlib;
-    put_pixels_tab[2] = put_pixels_y2_mlib;
-    put_pixels_tab[3] = put_pixels_xy2_mlib;
-
-    avg_pixels_tab[0] = avg_pixels_mlib;
-    avg_pixels_tab[1] = avg_pixels_x2_mlib;
-    avg_pixels_tab[2] = avg_pixels_y2_mlib;
-    avg_pixels_tab[3] = avg_pixels_xy2_mlib;
-    
-    put_no_rnd_pixels_tab[0] = put_pixels_mlib;
-    
-    add_pixels_clamped = add_pixels_clamped_mlib;
-}
diff --git a/src/libffmpeg/libavcodec/dsputil_mmx.c b/src/libffmpeg/libavcodec/dsputil_mmx.c
deleted file mode 100644
index b806c34e5..000000000
--- a/src/libffmpeg/libavcodec/dsputil_mmx.c
+++ /dev/null
@@ -1,1057 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Gerard Lantau.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- */
-
-#include "dsputil.h"
-#include "xineutils.h"
-
-int mm_flags; /* multimedia extension flags */
-
-int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
-
-/* external functions, from idct_mmx.c */
-void ff_mmx_idct(DCTELEM *block);
-void ff_mmxext_idct(DCTELEM *block);
-
-/* pixel operations */
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
-static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
-//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
-//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
-
-/***********************************/
-/* 3Dnow specific */
-
-#define DEF(x) x ## _3dnow
-/* for Athlons PAVGUSB is prefered */
-#define PAVGB "pavgusb"
-
-#include "dsputil_mmx_avg.h"
-
-#undef DEF
-#undef PAVGB
-
-/***********************************/
-/* MMX2 specific */
-
-#define DEF(x) x ## _sse
-
-/* Introduced only in MMX2 set */
-#define PAVGB "pavgb"
-
-#include "dsputil_mmx_avg.h"
-
-#undef DEF
-#undef PAVGB
-
-/***********************************/
-/* standard MMX */
-
-static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
-{
-    DCTELEM *p;
-    const UINT8 *pix;
-    int i;
-
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-    __asm __volatile("pxor %%mm7, %%mm7":::"memory");
-    for(i=0;i<4;i++) {
-	__asm __volatile(
-		"movq	%1, %%mm0\n\t"
-		"movq	%2, %%mm1\n\t"
-		"movq	%%mm0, %%mm2\n\t"
-		"movq	%%mm1, %%mm3\n\t"
-		"punpcklbw %%mm7, %%mm0\n\t"
-		"punpckhbw %%mm7, %%mm2\n\t"
-		"punpcklbw %%mm7, %%mm1\n\t"
-		"punpckhbw %%mm7, %%mm3\n\t"
-		"movq	%%mm0, %0\n\t"
-		"movq	%%mm2, 8%0\n\t"
-		"movq	%%mm1, 16%0\n\t"
-		"movq	%%mm3, 24%0\n\t"
-		:"=m"(*p)
-		:"m"(*pix), "m"(*(pix+line_size))
-		:"memory");
-        pix += line_size*2;
-        p += 16;
-    }
-}
-
-static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
-{
-    const DCTELEM *p;
-    UINT8 *pix;
-    int i;
-    static int p_inc = 32;  /* hack to avoid gcc-2.95.2 loop unrolling bug */
-
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-    for(i=0;i<2;i++) {
-	__asm __volatile(
-		"movq	%4, %%mm0\n\t"
-		"movq	8%4, %%mm1\n\t"
-		"movq	16%4, %%mm2\n\t"
-		"movq	24%4, %%mm3\n\t"
-		"movq	32%4, %%mm4\n\t"
-		"movq	40%4, %%mm5\n\t"
-		"movq	48%4, %%mm6\n\t"
-		"movq	56%4, %%mm7\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"packuswb %%mm3, %%mm2\n\t"
-		"packuswb %%mm5, %%mm4\n\t"
-		"packuswb %%mm7, %%mm6\n\t"
-		"movq	%%mm0, %0\n\t"
-		"movq	%%mm2, %1\n\t"
-		"movq	%%mm4, %2\n\t"
-		"movq	%%mm6, %3\n\t"
-		:"=m"(*pix), "=m"(*(pix+line_size))
-		,"=m"(*(pix+line_size*2)), "=m"(*(pix+line_size*3))
-		:"m"(*p)
-		:"memory");
-        pix += line_size*4;
-        p += p_inc;
-    }
-}
-
-static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
-{
-    const DCTELEM *p;
-    UINT8 *pix;
-    int i;
-    static int p_inc = 16;  /* hack to avoid gcc-2.95.2 loop unrolling bug */
-
-    /* read the pixels */
-    p = block;
-    pix = pixels;
-	__asm __volatile("pxor	%%mm7, %%mm7":::"memory");
-    for(i=0;i<4;i++) {
-	__asm __volatile(
-		"movq	%2, %%mm0\n\t"
-		"movq	8%2, %%mm1\n\t"
-		"movq	16%2, %%mm2\n\t"
-		"movq	24%2, %%mm3\n\t"
-		"movq	%0, %%mm4\n\t"
-		"movq	%1, %%mm6\n\t"
-		"movq	%%mm4, %%mm5\n\t"
-		"punpcklbw %%mm7, %%mm4\n\t"
-		"punpckhbw %%mm7, %%mm5\n\t"
-		"paddsw	%%mm4, %%mm0\n\t"
-		"paddsw	%%mm5, %%mm1\n\t"
-		"movq	%%mm6, %%mm5\n\t"
-		"punpcklbw %%mm7, %%mm6\n\t"
-		"punpckhbw %%mm7, %%mm5\n\t"
-		"paddsw	%%mm6, %%mm2\n\t"
-		"paddsw	%%mm5, %%mm3\n\t"
-		"packuswb %%mm1, %%mm0\n\t"
-		"packuswb %%mm3, %%mm2\n\t"
-		"movq	%%mm0, %0\n\t"
-		"movq	%%mm2, %1\n\t"
-		:"=m"(*pix), "=m"(*(pix+line_size))
-		:"m"(*p)
-		:"memory");
-        pix += line_size*2;
-        p += p_inc;
-    }
-}
-
-static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-    int dh, hh;
-    UINT8 *p;
-    const UINT8 *pix;
-    p   = block;
-    pix = pixels;
-    hh=h>>2;
-    dh=h&3;
-    while(hh--) {
-    __asm __volatile(
-	"movq	%4, %%mm0\n\t"
-	"movq	%5, %%mm1\n\t"
-	"movq	%6, %%mm2\n\t"
-	"movq	%7, %%mm3\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	"movq	%%mm2, %2\n\t"
-	"movq	%%mm3, %3\n\t"
-	:"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
-	:"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
-	:"memory");
-        pix = pix + line_size*4;
-        p =   p   + line_size*4;
-    }
-    while(dh--) {
-     __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-        pix = pix + line_size;
-        p =   p   + line_size;
-    }
-}
-
-static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm4\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size; p += line_size;
-  } while (--h);
-}
-
-static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm4\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p += line_size;
-  } while (--h);
-}
-
-static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p += line_size;
-  } while(--h);
-}
-
-static void   put_no_rnd_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while (--h);
-}
-
-static void put_no_rnd_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while(--h);
-}
-
-static void   put_no_rnd_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while(--h);
-}
-
-static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm6, %%mm0\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  }
-  while (--h);
-}
-
-static void   avg_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"paddusw %%mm4, %%mm1\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm1\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"paddusw %%mm6, %%mm0\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
-  } while (--h);
-}
-
-static void   avg_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"paddusw %%mm4, %%mm1\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm1\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"paddusw %%mm6, %%mm0\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void   avg_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"movq	%3, %%mm5\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"movq	%0, %%mm1\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"paddusw %%mm5, %%mm0\n\t"
-	"paddusw %%mm5, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size)), "m"(mm_wone)
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void avg_no_rnd_pixels_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7\n\t":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%0, %%mm1\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while (--h);
-}
-
-static void   avg_no_rnd_pixels_x2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	"movq	%0, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm5, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   line_size;
- } while (--h);
-}
-
-static void   avg_no_rnd_pixels_y2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	%0, %%mm4\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm5, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void   avg_no_rnd_pixels_xy2_mmx( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"movq	%0, %%mm1\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"psrlw	$1, %%mm0\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p += line_size;
-  } while(--h);
-}
-
-static void sub_pixels_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile("pxor %%mm7, %%mm7":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   8;
-  } while (--h);
-}
-
-static void sub_pixels_x2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t"
-      "movq	%0, %%mm6"
-      ::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
-}
-
-static void sub_pixels_y2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7\n\t"
-      "movq	%0, %%mm6"
-      ::"m"(mm_wone):"memory");
-  do {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"movq	%%mm4, %%mm5\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpckhbw %%mm7, %%mm5\n\t"
-	"paddusw %%mm4, %%mm2\n\t"
-	"paddusw %%mm5, %%mm3\n\t"
-	"paddusw %%mm6, %%mm2\n\t"
-	"paddusw %%mm6, %%mm3\n\t"
-	"psrlw	$1, %%mm2\n\t"
-	"psrlw	$1, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"=m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
-}
-
-static void   sub_pixels_xy2_mmx( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"movq	%0, %%mm1\n\t"
-	"movq	8%0, %%mm3\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"psubsw %%mm0, %%mm1\n\t"
-	"psubsw %%mm2, %%mm3\n\t"
-	"movq	%%mm1, %0\n\t"
-	"movq	%%mm3, 8%0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   8 ;
-  } while(--h);
-}
-
-void dsputil_init_mmx(void)
-{
-    mm_flags = xine_mm_accel();
-#if 0
-    printf("CPU flags:");
-    if (mm_flags & MM_MMX)
-        printf(" mmx");
-    if (mm_flags & MM_MMXEXT)
-        printf(" mmxext");
-    if (mm_flags & MM_3DNOW)
-        printf(" 3dnow");
-    if (mm_flags & MM_SSE)
-        printf(" sse");
-    if (mm_flags & MM_SSE2)
-        printf(" sse2");
-    printf("\n");
-#endif
-
-    if (mm_flags & MM_MMX) {
-        get_pixels = get_pixels_mmx;
-        put_pixels_clamped = put_pixels_clamped_mmx;
-        add_pixels_clamped = add_pixels_clamped_mmx;
-        
-        pix_abs16x16 = pix_abs16x16_mmx;
-        pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
-        pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
-        pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
-        /* av_fdct = fdct_mmx; */
-
-        put_pixels_tab[0] = put_pixels_mmx;
-        put_pixels_tab[1] = put_pixels_x2_mmx;
-        put_pixels_tab[2] = put_pixels_y2_mmx;
-        put_pixels_tab[3] = put_pixels_xy2_mmx;
-
-        put_no_rnd_pixels_tab[0] = put_pixels_mmx;
-        put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
-        put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
-        put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
-        
-        avg_pixels_tab[0] = avg_pixels_mmx;
-        avg_pixels_tab[1] = avg_pixels_x2_mmx;
-        avg_pixels_tab[2] = avg_pixels_y2_mmx;
-        avg_pixels_tab[3] = avg_pixels_xy2_mmx;
-
-        avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
-        avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
-        avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
-        avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
-        
-        sub_pixels_tab[0] = sub_pixels_mmx;
-        sub_pixels_tab[1] = sub_pixels_x2_mmx;
-        sub_pixels_tab[2] = sub_pixels_y2_mmx;
-        sub_pixels_tab[3] = sub_pixels_xy2_mmx;
-
-        if (mm_flags & MM_MMXEXT) {
-            pix_abs16x16 = pix_abs16x16_sse;
-        }
-
-        if (mm_flags & MM_SSE) {
-            put_pixels_tab[1] = put_pixels_x2_sse;
-            put_pixels_tab[2] = put_pixels_y2_sse;
-            
-            avg_pixels_tab[0] = avg_pixels_sse;
-            avg_pixels_tab[1] = avg_pixels_x2_sse;
-            avg_pixels_tab[2] = avg_pixels_y2_sse;
-            avg_pixels_tab[3] = avg_pixels_xy2_sse;
-
-            sub_pixels_tab[1] = sub_pixels_x2_sse;
-            sub_pixels_tab[2] = sub_pixels_y2_sse;
-        } else if (mm_flags & MM_3DNOW) {
-            put_pixels_tab[1] = put_pixels_x2_3dnow;
-            put_pixels_tab[2] = put_pixels_y2_3dnow;
-            
-            avg_pixels_tab[0] = avg_pixels_3dnow;
-            avg_pixels_tab[1] = avg_pixels_x2_3dnow;
-            avg_pixels_tab[2] = avg_pixels_y2_3dnow;
-            avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
-
-            sub_pixels_tab[1] = sub_pixels_x2_3dnow;
-            sub_pixels_tab[2] = sub_pixels_y2_3dnow;
-        }
-
-        /* idct */
-        if (mm_flags & MM_MMXEXT) {
-            ff_idct = ff_mmxext_idct;
-        } else {
-            ff_idct = ff_mmx_idct;
-        }
-    }
-}
diff --git a/src/libffmpeg/libavcodec/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/dsputil_mmx_avg.h
deleted file mode 100644
index 5cd640f71..000000000
--- a/src/libffmpeg/libavcodec/dsputil_mmx_avg.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * DSP utils : average functions are compiled twice for 3dnow/mmx2
- * Copyright (c) 2000, 2001 Gerard Lantau.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- */
-
-static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>2;
-  dh=h&3;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%4, %%mm0\n\t"
-	"movq	1%4, %%mm1\n\t"
-	"movq	%5, %%mm2\n\t"
-	"movq	1%5, %%mm3\n\t"
-	"movq	%6, %%mm4\n\t"
-	"movq	1%6, %%mm5\n\t"
-	"movq	%7, %%mm6\n\t"
-	"movq	1%7, %%mm7\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm3, %%mm2\n\t"
-	PAVGB"  %%mm5, %%mm4\n\t"
-	PAVGB"  %%mm7, %%mm6\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm2, %1\n\t"
-	"movq	%%mm4, %2\n\t"
-	"movq	%%mm6, %3\n\t"
-	:"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
-	:"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
-	:"memory");
-     pix += line_size*4; p += line_size*4;
-  }
-  while(dh--) {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	1%1, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-     pix += line_size; p += line_size;
-  }
-}
-
-static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%2, %%mm0\n\t"
-	"movq	%3, %%mm1\n\t"
-	"movq	%4, %%mm2\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm2, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"=m"(*p), "=m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size)),
-	 "m"(*(pix+line_size*2))
-	:"memory");
-     pix += line_size*2;
-     p += line_size*2;
-  }
-  if(dh) {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-  }
-}
-
-static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
-{
-  int dh, hh;
-  UINT8 *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>2;
-  dh=h&3;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%4, %%mm1\n\t"
-	"movq	%1, %%mm2\n\t"
-	"movq	%5, %%mm3\n\t"
-	"movq	%2, %%mm4\n\t"
-	"movq	%6, %%mm5\n\t"
-	"movq	%3, %%mm6\n\t"
-	"movq	%7, %%mm7\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	PAVGB"  %%mm3, %%mm2\n\t"
-	PAVGB"  %%mm5, %%mm4\n\t"
-	PAVGB"  %%mm7, %%mm6\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm2, %1\n\t"
-	"movq	%%mm4, %2\n\t"
-	"movq	%%mm6, %3\n\t"
-	:"=m"(*p), "=m"(*(p+line_size)), "=m"(*(p+line_size*2)), "=m"(*(p+line_size*3))
-	:"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)), "m"(*(pix+line_size*3))
-	:"memory");
-     pix += line_size*4; p += line_size*4;
-  }
-  while(dh--) {
-    __asm __volatile(
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"  %%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-     pix += line_size; p += line_size;
-  }
-}
-
-static void DEF(avg_pixels_x2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  int dh, hh;
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	1%2, %%mm3\n\t"
-	"movq	%3, %%mm4\n\t"
-	"movq	1%3, %%mm5\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"	%%mm3, %%mm2\n\t"
-	PAVGB"	%%mm2, %%mm0\n\t"
-	PAVGB"	%%mm5, %%mm4\n\t"
-	PAVGB"	%%mm4, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"=m"(*p), "=m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size*2;
-   p +=   line_size*2;
-  }
-  if(dh) {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	1%1, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%%mm2, %%mm1\n\t"
-	PAVGB"	%%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-  }
-}
-
-static void  DEF(avg_pixels_y2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  int dh, hh;
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  hh=h>>1;
-  dh=h&1;
-  while(hh--) {
-    __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	%3, %%mm3\n\t"
-	"movq	%3, %%mm4\n\t"
-	"movq	%4, %%mm5\n\t"
-	"movq	%0, %%mm0\n\t"
-	"movq	%1, %%mm1\n\t"
-	PAVGB"	%%mm3, %%mm2\n\t"
-	PAVGB"	%%mm2, %%mm0\n\t"
-	PAVGB"	%%mm5, %%mm4\n\t"
-	PAVGB"	%%mm4, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, %1\n\t"
-	:"=m"(*p), "=m"(*(p+line_size))
-	:"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2))
-	:"memory");
-   pix += line_size*2;
-   p +=   line_size*2;
-  }
-  if(dh) {
-    __asm __volatile(
-	"movq	%1, %%mm1\n\t"
-	"movq	%2, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%%mm2, %%mm1\n\t"
-	PAVGB"	%%mm1, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-  }
-}
-
-static void DEF(avg_pixels_xy2)( UINT8  *block, const UINT8 *pixels, int line_size, int h)
-{
-  UINT8  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-	"pxor	%%mm7, %%mm7\n\t"
-	"movq	%0, %%mm6\n\t"
-	::"m"(mm_wtwo):"memory");
-  do {
-    __asm __volatile(
-	"movq	%1, %%mm0\n\t"
-	"movq	%2, %%mm1\n\t"
-	"movq	1%1, %%mm4\n\t"
-	"movq	1%2, %%mm5\n\t"
-	"movq	%%mm0, %%mm2\n\t"
-	"movq	%%mm1, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm0\n\t"
-	"punpcklbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm1, %%mm0\n\t"
-	"paddusw %%mm3, %%mm2\n\t"
-	"movq	%%mm4, %%mm1\n\t"
-	"movq	%%mm5, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm4\n\t"
-	"punpcklbw %%mm7, %%mm5\n\t"
-	"punpckhbw %%mm7, %%mm1\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"paddusw %%mm5, %%mm4\n\t"
-	"paddusw %%mm3, %%mm1\n\t"
-	"paddusw %%mm6, %%mm4\n\t"
-	"paddusw %%mm6, %%mm1\n\t"
-	"paddusw %%mm4, %%mm0\n\t"
-	"paddusw %%mm1, %%mm2\n\t"
-	"psrlw	$2, %%mm0\n\t"
-	"psrlw	$2, %%mm2\n\t"
-	"packuswb  %%mm2, %%mm0\n\t"
-	PAVGB"	%0, %%mm0\n\t"
-	"movq	%%mm0, %0\n\t"
-	:"=m"(*p)
-	:"m"(*pix),
-	 "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   line_size ;
-  } while(--h);
-}
-
-static void DEF(sub_pixels_x2)( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7":::"memory");
-  do {
-    __asm __volatile(
-	"movq	1%1, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"=m"(*p)
-	:"m"(*pix)
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
-}
-
-static void DEF(sub_pixels_y2)( DCTELEM  *block, const UINT8 *pixels, int line_size, int h)
-{
-  DCTELEM  *p;
-  const UINT8 *pix;
-  p = block;
-  pix = pixels;
-  __asm __volatile(
-      "pxor	%%mm7, %%mm7":::"memory");
-  do {
-    __asm __volatile(
-	"movq	%2, %%mm2\n\t"
-	"movq	%0, %%mm0\n\t"
-	PAVGB"	%1, %%mm2\n\t"
-	"movq	8%0, %%mm1\n\t"
-	"movq	%%mm2, %%mm3\n\t"
-	"punpcklbw %%mm7, %%mm2\n\t"
-	"punpckhbw %%mm7, %%mm3\n\t"
-	"psubsw %%mm2, %%mm0\n\t"
-	"psubsw %%mm3, %%mm1\n\t"
-	"movq	%%mm0, %0\n\t"
-	"movq	%%mm1, 8%0\n\t"
-	:"=m"(*p)
-	:"m"(*pix), "m"(*(pix+line_size))
-	:"memory");
-   pix += line_size;
-   p +=   8;
- } while (--h);
-}
-
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index 29e6ab191..da694411f 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -1,7 +1,7 @@
 /*
  * H263/MPEG4 backend for ffmpeg encoder and decoder
  * Copyright (c) 2000,2001 Gerard Lantau.
- * H263+ support for custom picture format.
+ * H263+ support.
  * Copyright (c) 2001 Juan J. Sierralta P.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -28,9 +28,11 @@
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block,
 			      int n);
 static void h263_encode_motion(MpegEncContext * s, int val);
+static void h263p_encode_umotion(MpegEncContext * s, int val);
 static void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block,
 			       int n);
 static int h263_decode_motion(MpegEncContext * s, int pred);
+static int h263p_decode_umotion(MpegEncContext * s, int pred);
 static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
                              int n, int coded);
 static int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
@@ -57,7 +59,7 @@ int h263_get_picture_format(int width, int height)
 
 void h263_encode_picture_header(MpegEncContext * s, int picture_number)
 {
-    int format, umvplus;
+    int format;
 
     align_put_bits(&s->pb);
     put_bits(&s->pb, 22, 0x20);
@@ -69,10 +71,10 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
     put_bits(&s->pb, 1, 0);	/* split screen off */
     put_bits(&s->pb, 1, 0);	/* camera  off */
     put_bits(&s->pb, 1, 0);	/* freeze picture release off */
-
+    
+    format = h263_get_picture_format(s->width, s->height);
     if (!s->h263_plus) {
         /* H.263v1 */
-        format = h263_get_picture_format(s->width, s->height);
         put_bits(&s->pb, 3, format);
         put_bits(&s->pb, 1, (s->pict_type == P_TYPE));
         /* By now UMV IS DISABLED ON H.263v1, since the restrictions
@@ -89,10 +91,14 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
         /* H.263 Plus PTYPE */
         put_bits(&s->pb, 3, 7);
         put_bits(&s->pb,3,1); /* Update Full Extended PTYPE */
-        put_bits(&s->pb,3,6); /* Custom Source Format */
+        if (format == 7)
+            put_bits(&s->pb,3,6); /* Custom Source Format */
+        else
+            put_bits(&s->pb, 3, format);
+            
         put_bits(&s->pb,1,0); /* Custom PCF: off */
-        umvplus = (s->pict_type == P_TYPE) && s->unrestricted_mv;
-        put_bits(&s->pb, 1, umvplus); /* Unrestricted Motion Vector */
+        s->umvplus = (s->pict_type == P_TYPE) && s->unrestricted_mv;
+        put_bits(&s->pb, 1, s->umvplus); /* Unrestricted Motion Vector */
         put_bits(&s->pb,1,0); /* SAC: off */
         put_bits(&s->pb,1,0); /* Advanced Prediction Mode: off */
         put_bits(&s->pb,1,0); /* Advanced Intra Coding: off */
@@ -116,14 +122,17 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
         /* This should be here if PLUSPTYPE */
         put_bits(&s->pb, 1, 0);	/* Continuous Presence Multipoint mode: off */
 		
-        /* Custom Picture Format (CPFMT) */
+		if (format == 7) {
+            /* Custom Picture Format (CPFMT) */
 		
-        put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */
-        put_bits(&s->pb,9,(s->width >> 2) - 1);
-        put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */
-        put_bits(&s->pb,9,(s->height >> 2));
+            put_bits(&s->pb,4,2); /* Aspect ratio: CIF 12:11 (4:3) picture */
+            put_bits(&s->pb,9,(s->width >> 2) - 1);
+            put_bits(&s->pb,1,1); /* "1" to prevent start code emulation */
+            put_bits(&s->pb,9,(s->height >> 2));
+        }
+        
         /* Unlimited Unrestricted Motion Vectors Indicator (UUI) */
-        if (umvplus)
+        if (s->umvplus)
             put_bits(&s->pb,1,1); /* Limited according tables of Annex D */
         put_bits(&s->pb, 5, s->qscale);
     }
@@ -131,40 +140,82 @@ void h263_encode_picture_header(MpegEncContext * s, int picture_number)
     put_bits(&s->pb, 1, 0);	/* no PEI */
 }
 
+int h263_encode_gob_header(MpegEncContext * s, int mb_line)
+{
+    int pdif=0;
+    
+    /* Check to see if we need to put a new GBSC */
+    /* for RTP packetization                    */
+    if (s->rtp_mode) {
+        pdif = s->pb.buf_ptr - s->ptr_lastgob;
+        if (pdif >= s->rtp_payload_size) {
+            /* Bad luck, packet must be cut before */
+            align_put_bits(&s->pb);
+            s->ptr_lastgob = s->pb.buf_ptr;
+            put_bits(&s->pb, 17, 1); /* GBSC */
+            s->gob_number = mb_line;
+            put_bits(&s->pb, 5, s->gob_number); /* GN */
+            put_bits(&s->pb, 2, 1); /* GFID */
+            put_bits(&s->pb, 5, s->qscale); /* GQUANT */
+            return pdif;
+       } else if (pdif + s->mb_line_avgsize >= s->rtp_payload_size) {
+           /* Cut the packet before we can't */
+           align_put_bits(&s->pb);
+           s->ptr_lastgob = s->pb.buf_ptr;
+           put_bits(&s->pb, 17, 1); /* GBSC */
+           s->gob_number = mb_line;
+           put_bits(&s->pb, 5, s->gob_number); /* GN */
+           put_bits(&s->pb, 2, 1); /* GFID */
+           put_bits(&s->pb, 5, s->qscale); /* GQUANT */
+           return pdif;
+       }
+   }
+   return 0;
+}
+    
 void h263_encode_mb(MpegEncContext * s,
 		    DCTELEM block[6][64],
 		    int motion_x, int motion_y)
 {
     int cbpc, cbpy, i, cbp, pred_x, pred_y;
-
+   
     //    printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
-    if (!s->mb_intra) {
-	/* compute cbp */
-	cbp = 0;
-	for (i = 0; i < 6; i++) {
-	    if (s->block_last_index[i] >= 0)
-		cbp |= 1 << (5 - i);
-	}
-	if ((cbp | motion_x | motion_y) == 0) {
-	    /* skip macroblock */
-	    put_bits(&s->pb, 1, 1);
-	    return;
-	}
-	put_bits(&s->pb, 1, 0);	/* mb coded */
-	cbpc = cbp & 3;
-	put_bits(&s->pb,
-		 inter_MCBPC_bits[cbpc],
-		 inter_MCBPC_code[cbpc]);
-	cbpy = cbp >> 2;
-	cbpy ^= 0xf;
-	put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
-
-	/* motion vectors: 16x16 mode only now */
-        h263_pred_motion(s, 0, &pred_x, &pred_y);
-        
-        h263_encode_motion(s, motion_x - pred_x);
-        h263_encode_motion(s, motion_y - pred_y);
-    } else {
+   if (!s->mb_intra) {
+	   /* compute cbp */
+	   cbp = 0;
+	   for (i = 0; i < 6; i++) {
+	      if (s->block_last_index[i] >= 0)
+		   cbp |= 1 << (5 - i);
+	   }
+	   if ((cbp | motion_x | motion_y) == 0) {
+	      /* skip macroblock */
+	      put_bits(&s->pb, 1, 1);
+	      return;
+	   }
+	   put_bits(&s->pb, 1, 0);	/* mb coded */
+	   cbpc = cbp & 3;
+	   put_bits(&s->pb,
+		inter_MCBPC_bits[cbpc],
+		inter_MCBPC_code[cbpc]);
+	   cbpy = cbp >> 2;
+	   cbpy ^= 0xf;
+	   put_bits(&s->pb, cbpy_tab[cbpy][1], cbpy_tab[cbpy][0]);
+
+	   /* motion vectors: 16x16 mode only now */
+      h263_pred_motion(s, 0, &pred_x, &pred_y);
+      
+      if (!s->umvplus) {  
+         h263_encode_motion(s, motion_x - pred_x);
+         h263_encode_motion(s, motion_y - pred_y);
+      }
+      else {
+         h263p_encode_umotion(s, motion_x - pred_x);
+         h263p_encode_umotion(s, motion_y - pred_y);
+         if (((motion_x - pred_x) == 1) && ((motion_y - pred_y) == 1))
+            /* To prevent Start Code emulation */
+            put_bits(&s->pb,1,1);
+      }
+   } else {
 	/* compute cbp */
 	cbp = 0;
 	for (i = 0; i < 6; i++) {
@@ -234,7 +285,7 @@ INT16 *h263_pred_motion(MpegEncContext * s, int block,
     mot_val = s->motion_val[(x) + (y) * wrap];
 
     /* special case for first line */
-    if (y == 1 || s->first_slice_line) {
+    if (y == 1 || s->first_slice_line || s->first_gob_line) {
         A = s->motion_val[(x-1) + (y) * wrap];
         *px = A[0];
         *py = A[1];
@@ -305,6 +356,45 @@ static void h263_encode_motion(MpegEncContext * s, int val)
     }
 }
 
+/* Encode MV differences on H.263+ with Unrestricted MV mode */
+static void h263p_encode_umotion(MpegEncContext * s, int val)
+{
+    short sval = 0; 
+    short i = 0;
+    short n_bits = 0;
+    short temp_val;
+    int code = 0;
+    int tcode;
+    
+    if ( val == 0)
+        put_bits(&s->pb, 1, 1);
+    else if (val == 1)
+        put_bits(&s->pb, 3, 0);
+    else if (val == -1)
+        put_bits(&s->pb, 3, 2);
+    else {
+        
+        sval = ((val < 0) ? (short)(-val):(short)val);
+        temp_val = sval;
+        
+        while (temp_val != 0) {
+            temp_val = temp_val >> 1;
+            n_bits++;
+        }
+        
+        i = n_bits - 1;
+        while (i > 0) {
+            tcode = (sval & (1 << (i-1))) >> (i-1);
+            tcode = (tcode << 1) | 1;
+            code = (code << 2) | tcode;
+            i--;
+        }
+        code = ((code << 1) | (val < 0)) << 1;
+        put_bits(&s->pb, (2*n_bits)+1, code);
+        //printf("\nVal = %d\tCode = %d", sval, code);
+    }
+}
+
 void h263_encode_init_vlc(MpegEncContext *s)
 {
     static int done = 0;
@@ -464,7 +554,7 @@ static int mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *di
     return pred;
 }
 
-void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, 
+void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n,
                    int dir)
 {
     int x, y, wrap, i;
@@ -489,22 +579,22 @@ void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n,
             /* left prediction */
             ac_val -= 16;
             for(i=1;i<8;i++) {
-                block[i*8] += ac_val[i];
+                block[block_permute_op(i*8)] += ac_val[i];
             }
         } else {
             /* top prediction */
             ac_val -= 16 * wrap;
             for(i=1;i<8;i++) {
-                block[i] += ac_val[i + 8];
+                block[block_permute_op(i)] += ac_val[i + 8];
             }
         }
     }
     /* left copy */
     for(i=1;i<8;i++)
-        ac_val1[i] = block[i * 8];
+        ac_val1[i] = block[block_permute_op(i * 8)];
     /* top copy */
     for(i=1;i<8;i++)
-        ac_val1[8 + i] = block[i];
+        ac_val1[8 + i] = block[block_permute_op(i)];
 }
 
 static inline void mpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr)
@@ -693,7 +783,7 @@ void h263_decode_init_vlc(MpegEncContext *s)
         init_vlc(&intra_MCBPC_vlc, 6, 8, 
                  intra_MCBPC_bits, 1, 1,
                  intra_MCBPC_code, 1, 1);
-        init_vlc(&inter_MCBPC_vlc, 9, 20, 
+        init_vlc(&inter_MCBPC_vlc, 9, 25, 
                  inter_MCBPC_bits, 1, 1,
                  inter_MCBPC_code, 1, 1);
         init_vlc(&cbpy_vlc, 6, 16,
@@ -715,13 +805,38 @@ void h263_decode_init_vlc(MpegEncContext *s)
     }
 }
 
+int h263_decode_gob_header(MpegEncContext *s)
+{
+    unsigned int val, gfid;
+    
+    /* Check for GOB Start Code */
+    val = show_bits(&s->gb, 16);
+    if (val == 0) {
+        /* We have a GBSC probably with GSTUFF */
+        skip_bits(&s->gb, 16); /* Drop the zeros */
+        while (get_bits1(&s->gb) == 0); /* Seek the '1' bit */
+#ifdef DEBUG
+        fprintf(stderr,"\nGOB Start Code at MB %d\n", (s->mb_y * s->mb_width) + s->mb_x);
+#endif
+        s->gob_number = get_bits(&s->gb, 5); /* GN */
+        gfid = get_bits(&s->gb, 2); /* GFID */
+        s->qscale = get_bits(&s->gb, 5); /* GQUANT */
+#ifdef DEBUG
+        fprintf(stderr, "\nGN: %u GFID: %u Quant: %u\n", gn, gfid, s->qscale);
+#endif
+        return 1;
+    }
+    return 0;
+            
+}
+
 int h263_decode_mb(MpegEncContext *s,
                    DCTELEM block[6][64])
 {
     int cbpc, cbpy, i, cbp, pred_x, pred_y, mx, my, dquant;
     INT16 *mot_val;
     static INT8 quant_tab[4] = { -1, -2, 1, 2 };
-
+    
     if (s->pict_type == P_TYPE) {
         if (get_bits1(&s->gb)) {
             /* skip mb */
@@ -736,8 +851,14 @@ int h263_decode_mb(MpegEncContext *s,
             return 0;
         }
         cbpc = get_vlc(&s->gb, &inter_MCBPC_vlc);
+        //fprintf(stderr, "\tCBPC: %d", cbpc);
         if (cbpc < 0)
             return -1;
+        if (cbpc > 20)
+            cbpc+=3;
+        else if (cbpc == 20)
+            fprintf(stderr, "Stuffing !");
+        
         dquant = cbpc & 8;
         s->mb_intra = ((cbpc & 4) != 0);
     } else {
@@ -763,33 +884,55 @@ int h263_decode_mb(MpegEncContext *s,
             /* 16x16 motion prediction */
             s->mv_type = MV_TYPE_16X16;
             h263_pred_motion(s, 0, &pred_x, &pred_y);
-            mx = h263_decode_motion(s, pred_x);
+            if (s->umvplus_dec)
+               mx = h263p_decode_umotion(s, pred_x);
+            else
+               mx = h263_decode_motion(s, pred_x);
             if (mx >= 0xffff)
                 return -1;
-            my = h263_decode_motion(s, pred_y);
+            
+            if (s->umvplus_dec)
+               my = h263p_decode_umotion(s, pred_y);
+            else    
+               my = h263_decode_motion(s, pred_y);
             if (my >= 0xffff)
                 return -1;
             s->mv[0][0][0] = mx;
             s->mv[0][0][1] = my;
+            /*fprintf(stderr, "\n MB %d", (s->mb_y * s->mb_width) + s->mb_x);
+            fprintf(stderr, "\n\tmvx: %d\t\tpredx: %d", mx, pred_x);
+            fprintf(stderr, "\n\tmvy: %d\t\tpredy: %d", my, pred_y);*/
+            if (s->umvplus_dec && (mx - pred_x) == 1 && (my - pred_y) == 1)
+               skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
+                           
         } else {
             s->mv_type = MV_TYPE_8X8;
             for(i=0;i<4;i++) {
                 mot_val = h263_pred_motion(s, i, &pred_x, &pred_y);
-                mx = h263_decode_motion(s, pred_x);
+                if (s->umvplus_dec)
+                  mx = h263p_decode_umotion(s, pred_x);
+                else
+                  mx = h263_decode_motion(s, pred_x);
                 if (mx >= 0xffff)
                     return -1;
-                my = h263_decode_motion(s, pred_y);
+                
+                if (s->umvplus_dec)
+                  my = h263p_decode_umotion(s, pred_y);
+                else    
+                  my = h263_decode_motion(s, pred_y);
                 if (my >= 0xffff)
                     return -1;
                 s->mv[0][i][0] = mx;
                 s->mv[0][i][1] = my;
+                if (s->umvplus_dec && (mx - pred_x) == 1 && (my - pred_y) == 1)
+                  skip_bits1(&s->gb); /* Bit stuffing to prevent PSC */
                 mot_val[0] = mx;
                 mot_val[1] = my;
             }
         }
     } else {
         s->ac_pred = 0;
-	if (s->h263_pred) {
+	    if (s->h263_pred) {
             s->ac_pred = get_bits1(&s->gb);
         }
         cbpy = get_vlc(&s->gb, &cbpy_vlc);
@@ -853,10 +996,37 @@ static int h263_decode_motion(MpegEncContext * s, int pred)
             val += 64;
         if (pred > 32 && val > 63)
             val -= 64;
+        
     }
     return val;
 }
 
+/* Decodes RVLC of H.263+ UMV */
+static int h263p_decode_umotion(MpegEncContext * s, int pred)
+{
+   int code = 0, sign;
+   
+   if (get_bits1(&s->gb)) /* Motion difference = 0 */
+      return pred;
+   
+   code = 2 + get_bits1(&s->gb);
+   
+   while (get_bits1(&s->gb))
+   {
+      code <<= 1;
+      code += get_bits1(&s->gb);
+   }
+   sign = code & 1;
+   code >>= 1;
+   
+   code = (sign) ? (pred - code) : (pred + code);
+#ifdef DEBUG
+   fprintf(stderr,"H.263+ UMV Motion = %d\n", code);
+#endif
+   return code;   
+
+}
+
 static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
                              int n, int coded)
 {
@@ -1081,16 +1251,21 @@ int h263_decode_picture_header(MpegEncContext *s)
     skip_bits1(&s->gb);	/* camera  off */
     skip_bits1(&s->gb);	/* freeze picture release off */
 
+    /* Reset GOB number */
+    s->gob_number = 0;
+        
     format = get_bits(&s->gb, 3);
 
-    if (format != 7) {
+    if (format != 7 && format != 6) {
         s->h263_plus = 0;
         /* H.263v1 */
         width = h263_format[format][0];
         height = h263_format[format][1];
         if (!width)
             return -1;
-
+        
+        s->width = width;
+        s->height = height;
         s->pict_type = I_TYPE + get_bits1(&s->gb);
 
         s->unrestricted_mv = get_bits1(&s->gb); 
@@ -1098,33 +1273,68 @@ int h263_decode_picture_header(MpegEncContext *s)
 
         if (get_bits1(&s->gb) != 0)
             return -1;	/* SAC: off */
-        if (get_bits1(&s->gb) != 0)
-            return -1;	/* advanced prediction mode: off */
+        if (get_bits1(&s->gb) != 0) {
+            s->mv_type = MV_TYPE_8X8; /* Advanced prediction mode */
+        }   
+        
         if (get_bits1(&s->gb) != 0)
             return -1;	/* not PB frame */
 
         s->qscale = get_bits(&s->gb, 5);
         skip_bits1(&s->gb);	/* Continuous Presence Multipoint mode: off */
     } else {
-        s->h263_plus = 1;
+        int ufep;
+        
         /* H.263v2 */
-        if (get_bits(&s->gb, 3) != 1)
-            return -1;
-        if (get_bits(&s->gb, 3) != 6) /* custom source format */
+        s->h263_plus = 1;
+        ufep = get_bits(&s->gb, 3); /* Update Full Extended PTYPE */
+        
+        if (ufep == 1) {
+            /* OPPTYPE */       
+            format = get_bits(&s->gb, 3);
+            skip_bits(&s->gb,1); /* Custom PCF */
+            s->umvplus_dec = get_bits(&s->gb, 1); /* Unrestricted Motion Vector */
+            skip_bits1(&s->gb); /* Syntax-based Arithmetic Coding (SAC) */
+            if (get_bits1(&s->gb) != 0) {
+                s->mv_type = MV_TYPE_8X8; /* Advanced prediction mode */
+            }
+            skip_bits(&s->gb, 8);
+            skip_bits(&s->gb, 3); /* Reserved */
+        } else if (ufep != 0)
             return -1;
-        skip_bits(&s->gb, 12);
-        skip_bits(&s->gb, 3);
+            
+        /* MPPTYPE */
         s->pict_type = get_bits(&s->gb, 3) + 1;
         if (s->pict_type != I_TYPE &&
             s->pict_type != P_TYPE)
             return -1;
         skip_bits(&s->gb, 7);
-        skip_bits(&s->gb, 4); /* aspect ratio */
-        width = (get_bits(&s->gb, 9) + 1) * 4;
-        skip_bits1(&s->gb);
-        height = get_bits(&s->gb, 9) * 4;
-        if (height == 0)
-            return -1;
+        
+        /* Get the picture dimensions */
+        if (ufep) {
+            if (format == 6) {
+                /* Custom Picture Format (CPFMT) */
+                skip_bits(&s->gb, 4); /* aspect ratio */
+                width = (get_bits(&s->gb, 9) + 1) * 4;
+                skip_bits1(&s->gb);
+                height = get_bits(&s->gb, 9) * 4;
+#ifdef DEBUG 
+                fprintf(stderr,"\nH.263+ Custom picture: %dx%d\n",width,height);
+#endif            
+            }
+            else {
+                width = h263_format[format][0];
+                height = h263_format[format][1];
+            }
+            if ((width == 0) || (height == 0))
+                return -1;
+            s->width = width;
+            s->height = height;
+            if (s->umvplus_dec) {
+                skip_bits1(&s->gb); /* Unlimited Unrestricted Motion Vectors Indicator (UUI) */
+            }
+        }
+            
         s->qscale = get_bits(&s->gb, 5);
     }
     /* PEI */
@@ -1132,8 +1342,6 @@ int h263_decode_picture_header(MpegEncContext *s)
         skip_bits(&s->gb, 8);
     }
     s->f_code = 1;
-    s->width = width;
-    s->height = height;
     return 0;
 }
 
@@ -1335,3 +1543,4 @@ int intel_h263_decode_picture_header(MpegEncContext *s)
     s->f_code = 1;
     return 0;
 }
+
diff --git a/src/libffmpeg/libavcodec/h263data.h b/src/libffmpeg/libavcodec/h263data.h
index 4fd9d3629..d41996a3b 100644
--- a/src/libffmpeg/libavcodec/h263data.h
+++ b/src/libffmpeg/libavcodec/h263data.h
@@ -4,6 +4,27 @@ static const UINT8 intra_MCBPC_code[8] = { 1, 1, 2, 3, 1, 1, 2, 3 };
 static const UINT8 intra_MCBPC_bits[8] = { 1, 3, 3, 3, 4, 6, 6, 6 };
 
 /* inter MCBPC, mb_type = (inter), (intra), (interq), (intraq), (inter4v) */
+/* Changed the tables for interq and inter4v+q, following the standard ** Juanjo ** */
+static const UINT8 inter_MCBPC_code[25] = { 
+    1, 3, 2, 5, 
+    3, 4, 3, 3, 
+    3, 7, 6, 5,
+    4, 4, 3, 2,
+    2, 5, 4, 5,
+    1, /* Stuffing */
+    2, 12, 14, 15,
+};
+static const UINT8 inter_MCBPC_bits[25] = { 
+    1, 4, 4, 6, 
+    5, 8, 8, 7,
+    3, 7, 7, 9,
+    6, 9, 9, 9,
+    3, 7, 7, 8,
+    9, /* Stuffing */
+    11, 13, 13, 13,
+};
+
+/* This is the old table 
 static const UINT8 inter_MCBPC_code[20] = { 
     1, 3, 2, 5, 
     3, 4, 3, 3, 
@@ -17,7 +38,7 @@ static const UINT8 inter_MCBPC_bits[20] = {
     12, 12, 12, 12,
     6, 9, 9, 9,
     3, 7, 7, 8,
-};
+};*/
 
 static const UINT8 cbpy_tab[16][2] =
 {
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index ec14c09e2..be76e3030 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -23,7 +23,7 @@
 #include "avcodec.h"
 #include "mpegvideo.h"
 
-#undef DEBUG
+//#define DEBUG
 
 static int h263_decode_init(AVCodecContext *avctx)
 {
@@ -39,6 +39,8 @@ static int h263_decode_init(AVCodecContext *avctx)
     /* select sub codec */
     switch(avctx->codec->id) {
     case CODEC_ID_H263:
+        s->gob_number = 0;
+        s->first_gob_line = 0;
         break;
     case CODEC_ID_MPEG4:
         s->time_increment_bits = 4; /* default value for broken headers */
@@ -56,8 +58,9 @@ static int h263_decode_init(AVCodecContext *avctx)
     }
 
     /* for h263, we allocate the images after having read the header */
-    if (MPV_common_init(s) < 0)
-        return -1;
+    if (avctx->codec->id != CODEC_ID_H263)
+        if (MPV_common_init(s) < 0)
+            return -1;
 
     /* XXX: suppress this matrix init, only needed because using mpeg1
        dequantize in mmx case */
@@ -92,7 +95,7 @@ static int h263_decode_frame(AVCodecContext *avctx,
     printf("*****frame %d size=%d\n", avctx->frame_number, buf_size);
     printf("bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]);
 #endif
-
+    
     /* no supplementary picture */
     if (buf_size == 0) {
         *data_size = 0;
@@ -110,18 +113,25 @@ static int h263_decode_frame(AVCodecContext *avctx,
         ret = intel_h263_decode_picture_header(s);
     } else {
         ret = h263_decode_picture_header(s);
+        /* After H263 header decode we have the height, width,       */
+        /* and other parameters. So then we could init the picture   */
+        /* FIXME: By the way H263 decoder is evolving it should have */
+        /* an H263EncContext                                         */
+        if (!s->context_initialized) {
+            avctx->width = s->width;
+            avctx->height = s->height;
+            if (MPV_common_init(s) < 0)
+                return -1;
+        } else if (s->width != avctx->width || s->height != avctx->height) {
+            /* H.263 could change picture size any time */
+            MPV_common_end(s);
+            if (MPV_common_init(s) < 0)
+                return -1;
+        }
     }
     if (ret < 0)
         return -1;
 
-    /* make sure we start with an I-frame */
-    if (s->waiting_for_keyframe) {
-      if (s->pict_type != I_TYPE)
-	return -1;
-      else
-	s->waiting_for_keyframe = 0;
-    }
-
     MPV_frame_start(s);
 
 #ifdef DEBUG
@@ -130,10 +140,17 @@ static int h263_decode_frame(AVCodecContext *avctx,
 
     /* decode each macroblock */
     for(s->mb_y=0; s->mb_y < s->mb_height; s->mb_y++) {
+        /* Check for GOB headers on H.263 */
+        /* FIXME: In the future H.263+ will have intra prediction */
+        /* and we are gonna need another way to detect MPEG4      */
+        if (s->mb_y && !s->h263_pred) {
+            s->first_gob_line = h263_decode_gob_header(s);
+        }
         for(s->mb_x=0; s->mb_x < s->mb_width; s->mb_x++) {
 #ifdef DEBUG
             printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 #endif
+            //fprintf(stderr,"\nFrame: %d\tMB: %d",avctx->frame_number, (s->mb_y * s->mb_width) + s->mb_x);
             /* DCT & quantize */
             if (s->h263_msmpeg4) {
                 msmpeg4_dc_scale(s);
@@ -152,8 +169,10 @@ static int h263_decode_frame(AVCodecContext *avctx,
                 if (msmpeg4_decode_mb(s, s->block) < 0)
                     return -1;
             } else {
-                if (h263_decode_mb(s, s->block) < 0)
+                if (h263_decode_mb(s, s->block) < 0) {
+                    fprintf(stderr,"\nError at MB: %d\n", (s->mb_y * s->mb_width) + s->mb_x);
                     return -1;
+                }
             }
             MPV_decode_mb(s, s->block);
         }
diff --git a/src/libffmpeg/libavcodec/idct_mlib.c b/src/libffmpeg/libavcodec/idct_mlib.c
deleted file mode 100644
index 63421273f..000000000
--- a/src/libffmpeg/libavcodec/idct_mlib.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Sun mediaLib optimized DSP utils
- * Copyright (c) 2001 Juergen Keil.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include "dsputil.h"
-
-#include <mlib_types.h>
-#include <mlib_status.h>
-#include <mlib_sys.h>
-#include <mlib_video.h>
-
-
-void ff_idct_mlib(DCTELEM *data)
-{
-    mlib_VideoIDCT8x8_S16_S16 (data, data);
-}
-
-
-void ff_fdct_mlib(DCTELEM *data)
-{
-    mlib_VideoDCT8x8_S16_S16 (data, data);
-}
diff --git a/src/libffmpeg/libavcodec/idct_mmx.c b/src/libffmpeg/libavcodec/idct_mmx.c
deleted file mode 100644
index 7141347ec..000000000
--- a/src/libffmpeg/libavcodec/idct_mmx.c
+++ /dev/null
@@ -1,597 +0,0 @@
-/*
- * Note: For libavcodec, this code can also be used under the LGPL license
- */
-/*
- * idct_mmx.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#include <inttypes.h>
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "xineutils.h"
-
-#ifdef ATTR_ALIGN
-#undef ATTR_ALIGN
-#endif
-#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 6
-
-#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
-#define rounder(bias) {round (bias), round (bias)}
-
-#if 0
-/* C row IDCT - its just here to document the MMXEXT and MMX versions */
-static inline void idct_row (int16_t * row, int offset,
-			     int16_t * table, int32_t * rounder)
-{
-    int C1, C2, C3, C4, C5, C6, C7;
-    int a0, a1, a2, a3, b0, b1, b2, b3;
-
-    row += offset;
-
-    C1 = table[1];
-    C2 = table[2];
-    C3 = table[3];
-    C4 = table[4];
-    C5 = table[5];
-    C6 = table[6];
-    C7 = table[7];
-
-    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
-    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
-    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
-    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
-
-    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
-    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
-    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
-    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
-
-    row[0] = (a0 + b0) >> ROW_SHIFT;
-    row[1] = (a1 + b1) >> ROW_SHIFT;
-    row[2] = (a2 + b2) >> ROW_SHIFT;
-    row[3] = (a3 + b3) >> ROW_SHIFT;
-    row[4] = (a3 - b3) >> ROW_SHIFT;
-    row[5] = (a2 - b2) >> ROW_SHIFT;
-    row[6] = (a1 - b1) >> ROW_SHIFT;
-    row[7] = (a0 - b0) >> ROW_SHIFT;
-}
-#endif
-
-
-/* MMXEXT row IDCT */
-
-#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
-						   c4,  c6,  c4,  c6,	\
-						   c1,  c3, -c1, -c5,	\
-						   c5,  c7,  c3, -c7,	\
-						   c4, -c6,  c4, -c6,	\
-						  -c4,  c2,  c4, -c2,	\
-						   c5, -c1,  c3, -c1,	\
-						   c7,  c3,  c7, -c5 }
-
-static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
-{
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
-
-    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
-
-    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
-
-    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
-}
-
-static inline void mmxext_row (int16_t * table, int32_t * rounder)
-{
-    movq_m2r (*(table+8), mm1);		// mm1 = -C5 -C1 C3 C1
-    pmaddwd_r2r (mm2, mm4);		// mm4 = C4*x0+C6*x2 C4*x4+C6*x6
-
-    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x4-C6*x6 C4*x0-C6*x2
-    pshufw_r2r (mm6, mm6, 0x4e);	// mm6 = x3 x1 x7 x5
-
-    movq_m2r (*(table+12), mm7);	// mm7 = -C7 C3 C7 C5
-    pmaddwd_r2r (mm5, mm1);		// mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
-
-    paddd_m2r (*rounder, mm3);		// mm3 += rounder
-    pmaddwd_r2r (mm6, mm7);		// mm7 = C3*x1-C7*x3 C5*x5+C7*x7
-
-    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
-    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
-
-    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C3*x5-C1*x7 C5*x1-C1*x3
-    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
-
-    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C7*x1-C5*x3 C7*x5+C3*x7
-    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
-
-    paddd_m2r (*rounder, mm0);		// mm0 += rounder
-    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
-
-    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
-    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
-
-    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
-    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
-
-    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
-    movq_r2r (mm0, mm4);		// mm4 = a3 a2 + rounder
-
-    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
-    psubd_r2r (mm5, mm4);		// mm4 = a3-b3 a2-b2 + rounder
-}
-
-static inline void mmxext_row_tail (int16_t * row, int store)
-{
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
-
-    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
-
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-
-    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
-
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
-
-    /* slot */
-
-    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
-}
-
-static inline void mmxext_row_mid (int16_t * row, int store,
-				   int offset, int16_t * table)
-{
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
-
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
-
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
-
-    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
-
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
-
-    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
-    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
-
-    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
-
-    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
-    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
-}
-
-
-/* MMX row IDCT */
-
-#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
-					   c4,  c6, -c4, -c2,	\
-					   c1,  c3,  c3, -c7,	\
-					   c5,  c7, -c1, -c5,	\
-					   c4, -c6,  c4, -c2,	\
-					  -c4,  c2,  c4, -c6,	\
-					   c5, -c1,  c7, -c5,	\
-					   c7,  c3,  c3, -c1 }
-
-static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
-{
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
-
-    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
-
-    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
-
-    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
-
-    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
-    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
-}
-
-static inline void mmx_row (int16_t * table, int32_t * rounder)
-{
-    pmaddwd_r2r (mm2, mm4);		// mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
-    punpckldq_r2r (mm5, mm5);		// mm5 = x3 x1 x3 x1
-
-    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x0-C2*x2 C4*x0-C6*x2
-    punpckhdq_r2r (mm6, mm6);		// mm6 = x7 x5 x7 x5
-
-    movq_m2r (*(table+12), mm7);	// mm7 = -C5 -C1 C7 C5
-    pmaddwd_r2r (mm5, mm1);		// mm1 = C3*x1-C7*x3 C1*x1+C3*x3
-
-    paddd_m2r (*rounder, mm3);		// mm3 += rounder
-    pmaddwd_r2r (mm6, mm7);		// mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
-
-    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
-    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
-
-    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C7*x1-C5*x3 C5*x1-C1*x3
-    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
-
-    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C3*x5-C1*x7 C7*x5+C3*x7
-    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
-
-    paddd_m2r (*rounder, mm0);		// mm0 += rounder
-    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
-
-    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
-    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
-
-    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
-    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
-
-    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
-    movq_r2r (mm0, mm7);		// mm7 = a3 a2 + rounder
-
-    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
-    psubd_r2r (mm5, mm7);		// mm7 = a3-b3 a2-b2 + rounder
-}
-
-static inline void mmx_row_tail (int16_t * row, int store)
-{
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
-
-    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
-
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-
-    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
-
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    movq_r2r (mm7, mm4);		// mm4 = y6 y7 y4 y5
-
-    pslld_i2r (16, mm7);		// mm7 = y7 0 y5 0
-
-    psrld_i2r (16, mm4);		// mm4 = 0 y6 0 y4
-
-    por_r2r (mm4, mm7);			// mm7 = y7 y6 y5 y4
-
-    /* slot */
-
-    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
-}
-
-static inline void mmx_row_mid (int16_t * row, int store,
-				int offset, int16_t * table)
-{
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
-
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
-
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
-
-    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
-
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    movq_r2r (mm7, mm1);		// mm1 = y6 y7 y4 y5
-
-    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
-    psrld_i2r (16, mm7);		// mm7 = 0 y6 0 y4
-
-    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
-    pslld_i2r (16, mm1);		// mm1 = y7 0 y5 0
-
-    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
-    por_r2r (mm1, mm7);			// mm7 = y7 y6 y5 y4
-
-    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
-    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
-
-    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
-}
-
-
-#if 0
-// C column IDCT - its just here to document the MMXEXT and MMX versions
-static inline void idct_col (int16_t * col, int offset)
-{
-/* multiplication - as implemented on mmx */
-#define F(c,x) (((c) * (x)) >> 16)
-
-/* saturation - it helps us handle torture test cases */
-#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
-
-    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
-    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
-    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
-    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
-
-    col += offset;
-
-    x0 = col[0*8];
-    x1 = col[1*8];
-    x2 = col[2*8];
-    x3 = col[3*8];
-    x4 = col[4*8];
-    x5 = col[5*8];
-    x6 = col[6*8];
-    x7 = col[7*8];
-
-    u04 = S (x0 + x4);
-    v04 = S (x0 - x4);
-    u26 = S (F (T2, x6) + x2);
-    v26 = S (F (T2, x2) - x6);
-
-    a0 = S (u04 + u26);
-    a1 = S (v04 + v26);
-    a2 = S (v04 - v26);
-    a3 = S (u04 - u26);
-
-    u17 = S (F (T1, x7) + x1);
-    v17 = S (F (T1, x1) - x7);
-    u35 = S (F (T3, x5) + x3);
-    v35 = S (F (T3, x3) - x5);
-
-    b0 = S (u17 + u35);
-    b3 = S (v17 - v35);
-    u12 = S (u17 - u35);
-    v12 = S (v17 + v35);
-    u12 = S (2 * F (C4, u12));
-    v12 = S (2 * F (C4, v12));
-    b1 = S (u12 + v12);
-    b2 = S (u12 - v12);
-
-    y0 = S (a0 + b0) >> COL_SHIFT;
-    y1 = S (a1 + b1) >> COL_SHIFT;
-    y2 = S (a2 + b2) >> COL_SHIFT;
-    y3 = S (a3 + b3) >> COL_SHIFT;
-
-    y4 = S (a3 - b3) >> COL_SHIFT;
-    y5 = S (a2 - b2) >> COL_SHIFT;
-    y6 = S (a1 - b1) >> COL_SHIFT;
-    y7 = S (a0 - b0) >> COL_SHIFT;
-
-    col[0*8] = y0;
-    col[1*8] = y1;
-    col[2*8] = y2;
-    col[3*8] = y3;
-    col[4*8] = y4;
-    col[5*8] = y5;
-    col[6*8] = y6;
-    col[7*8] = y7;
-}
-#endif
-
-
-// MMX column IDCT
-static inline void idct_col (int16_t * col, int offset)
-{
-#define T1 13036
-#define T2 27146
-#define T3 43790
-#define C4 23170
-
-    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
-    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
-    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
-    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
-
-    /* column code adapted from peter gubanov */
-    /* http://www.elecard.com/peter/idct.shtml */
-
-    movq_m2r (*_T1, mm0);		// mm0 = T1
-
-    movq_m2r (*(col+offset+1*8), mm1);	// mm1 = x1
-    movq_r2r (mm0, mm2);		// mm2 = T1
-
-    movq_m2r (*(col+offset+7*8), mm4);	// mm4 = x7
-    pmulhw_r2r (mm1, mm0);		// mm0 = T1*x1
-
-    movq_m2r (*_T3, mm5);		// mm5 = T3
-    pmulhw_r2r (mm4, mm2);		// mm2 = T1*x7
-
-    movq_m2r (*(col+offset+5*8), mm6);	// mm6 = x5
-    movq_r2r (mm5, mm7);		// mm7 = T3-1
-
-    movq_m2r (*(col+offset+3*8), mm3);	// mm3 = x3
-    psubsw_r2r (mm4, mm0);		// mm0 = v17
-
-    movq_m2r (*_T2, mm4);		// mm4 = T2
-    pmulhw_r2r (mm3, mm5);		// mm5 = (T3-1)*x3
-
-    paddsw_r2r (mm2, mm1);		// mm1 = u17
-    pmulhw_r2r (mm6, mm7);		// mm7 = (T3-1)*x5
-
-    /* slot */
-
-    movq_r2r (mm4, mm2);		// mm2 = T2
-    paddsw_r2r (mm3, mm5);		// mm5 = T3*x3
-
-    pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2
-    paddsw_r2r (mm6, mm7);		// mm7 = T3*x5
-
-    psubsw_r2r (mm6, mm5);		// mm5 = v35
-    paddsw_r2r (mm3, mm7);		// mm7 = u35
-
-    movq_m2r (*(col+offset+6*8), mm3);	// mm3 = x6
-    movq_r2r (mm0, mm6);		// mm6 = v17
-
-    pmulhw_r2r (mm3, mm2);		// mm2 = T2*x6
-    psubsw_r2r (mm5, mm0);		// mm0 = b3
-
-    psubsw_r2r (mm3, mm4);		// mm4 = v26
-    paddsw_r2r (mm6, mm5);		// mm5 = v12
-
-    movq_r2m (mm0, *(col+offset+3*8));	// save b3 in scratch0
-    movq_r2r (mm1, mm6);		// mm6 = u17
-
-    paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26
-    paddsw_r2r (mm7, mm6);		// mm6 = b0
-
-    psubsw_r2r (mm7, mm1);		// mm1 = u12
-    movq_r2r (mm1, mm7);		// mm7 = u12
-
-    movq_m2r (*(col+offset+0*8), mm3);	// mm3 = x0
-    paddsw_r2r (mm5, mm1);		// mm1 = u12+v12
-
-    movq_m2r (*_C4, mm0);		// mm0 = C4/2
-    psubsw_r2r (mm5, mm7);		// mm7 = u12-v12
-
-    movq_r2m (mm6, *(col+offset+5*8));	// save b0 in scratch1
-    pmulhw_r2r (mm0, mm1);		// mm1 = b1/2
-
-    movq_r2r (mm4, mm6);		// mm6 = v26
-    pmulhw_r2r (mm0, mm7);		// mm7 = b2/2
-
-    movq_m2r (*(col+offset+4*8), mm5);	// mm5 = x4
-    movq_r2r (mm3, mm0);		// mm0 = x0
-
-    psubsw_r2r (mm5, mm3);		// mm3 = v04
-    paddsw_r2r (mm5, mm0);		// mm0 = u04
-
-    paddsw_r2r (mm3, mm4);		// mm4 = a1
-    movq_r2r (mm0, mm5);		// mm5 = u04
-
-    psubsw_r2r (mm6, mm3);		// mm3 = a2
-    paddsw_r2r (mm2, mm5);		// mm5 = a0
-
-    paddsw_r2r (mm1, mm1);		// mm1 = b1
-    psubsw_r2r (mm2, mm0);		// mm0 = a3
-
-    paddsw_r2r (mm7, mm7);		// mm7 = b2
-    movq_r2r (mm3, mm2);		// mm2 = a2
-
-    movq_r2r (mm4, mm6);		// mm6 = a1
-    paddsw_r2r (mm7, mm3);		// mm3 = a2+b2
-
-    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y2
-    paddsw_r2r (mm1, mm4);		// mm4 = a1+b1
-
-    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y1
-    psubsw_r2r (mm1, mm6);		// mm6 = a1-b1
-
-    movq_m2r (*(col+offset+5*8), mm1);	// mm1 = b0
-    psubsw_r2r (mm7, mm2);		// mm2 = a2-b2
-
-    psraw_i2r (COL_SHIFT, mm6);		// mm6 = y6
-    movq_r2r (mm5, mm7);		// mm7 = a0
-
-    movq_r2m (mm4, *(col+offset+1*8));	// save y1
-    psraw_i2r (COL_SHIFT, mm2);		// mm2 = y5
-
-    movq_r2m (mm3, *(col+offset+2*8));	// save y2
-    paddsw_r2r (mm1, mm5);		// mm5 = a0+b0
-
-    movq_m2r (*(col+offset+3*8), mm4);	// mm4 = b3
-    psubsw_r2r (mm1, mm7);		// mm7 = a0-b0
-
-    psraw_i2r (COL_SHIFT, mm5);		// mm5 = y0
-    movq_r2r (mm0, mm3);		// mm3 = a3
-
-    movq_r2m (mm2, *(col+offset+5*8));	// save y5
-    psubsw_r2r (mm4, mm3);		// mm3 = a3-b3
-
-    psraw_i2r (COL_SHIFT, mm7);		// mm7 = y7
-    paddsw_r2r (mm0, mm4);		// mm4 = a3+b3
-
-    movq_r2m (mm5, *(col+offset+0*8));	// save y0
-    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y4
-
-    movq_r2m (mm6, *(col+offset+6*8));	// save y6
-    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y3
-
-    movq_r2m (mm7, *(col+offset+7*8));	// save y7
-
-    movq_r2m (mm3, *(col+offset+4*8));	// save y4
-
-    movq_r2m (mm4, *(col+offset+3*8));	// save y3
-}
-
-
-static int32_t rounder0[] ATTR_ALIGN(8) =
-    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
-static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
-static int32_t rounder1[] ATTR_ALIGN(8) =
-    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
-static int32_t rounder7[] ATTR_ALIGN(8) =
-    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
-static int32_t rounder2[] ATTR_ALIGN(8) =
-    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
-static int32_t rounder6[] ATTR_ALIGN(8) =
-    rounder (-0.25);		/* C2 * (C6-C2)/2 */
-static int32_t rounder3[] ATTR_ALIGN(8) =
-    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
-static int32_t rounder5[] ATTR_ALIGN(8) =
-    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
-
-
-#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
-void idct (int16_t * block)					\
-{									\
-    static int16_t table04[] ATTR_ALIGN(16) =				\
-	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
-    static int16_t table17[] ATTR_ALIGN(16) =				\
-	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
-    static int16_t table26[] ATTR_ALIGN(16) =				\
-	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
-    static int16_t table35[] ATTR_ALIGN(16) =				\
-	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
-									\
-    idct_row_head (block, 0*8, table04);				\
-    idct_row (table04, rounder0);					\
-    idct_row_mid (block, 0*8, 4*8, table04);				\
-    idct_row (table04, rounder4);					\
-    idct_row_mid (block, 4*8, 1*8, table17);				\
-    idct_row (table17, rounder1);					\
-    idct_row_mid (block, 1*8, 7*8, table17);				\
-    idct_row (table17, rounder7);					\
-    idct_row_mid (block, 7*8, 2*8, table26);				\
-    idct_row (table26, rounder2);					\
-    idct_row_mid (block, 2*8, 6*8, table26);				\
-    idct_row (table26, rounder6);					\
-    idct_row_mid (block, 6*8, 3*8, table35);				\
-    idct_row (table35, rounder3);					\
-    idct_row_mid (block, 3*8, 5*8, table35);				\
-    idct_row (table35, rounder5);					\
-    idct_row_tail (block, 5*8);						\
-									\
-    idct_col (block, 0);						\
-    idct_col (block, 4);						\
-}
-
-
-declare_idct (ff_mmxext_idct, mmxext_table,
-	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
-
-declare_idct (ff_mmx_idct, mmx_table,
-	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c
new file mode 100644
index 000000000..d39b6c1e9
--- /dev/null
+++ b/src/libffmpeg/libavcodec/imgconvert.c
@@ -0,0 +1,548 @@
+/*
+ * Misc image convertion routines
+ * Copyright (c) 2001 Gerard Lantau.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include "avcodec.h"
+#include "dsputil.h"
+
+#ifdef USE_FASTMEMCPY
+#include "fastmemcpy.h"
+#endif
+/* XXX: totally non optimized */
+
+static void yuv422_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
+                              UINT8 *src, int width, int height)
+{
+    int x, y;
+    UINT8 *p = src;
+
+    for(y=0;y<height;y+=2) {
+        for(x=0;x<width;x+=2) {
+            lum[0] = p[0];
+            cb[0] = p[1];
+            lum[1] = p[2];
+            cr[0] = p[3];
+            p += 4;
+            lum += 2;
+            cb++;
+            cr++;
+        }
+        for(x=0;x<width;x+=2) {
+            lum[0] = p[0];
+            lum[1] = p[2];
+            p += 4;
+            lum += 2;
+        }
+    }
+}
+
+#define SCALEBITS 8
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+#define FIX(x)		((int) ((x) * (1L<<SCALEBITS) + 0.5))
+
+static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
+                              UINT8 *src, int width, int height)
+{
+    int wrap, wrap3, x, y;
+    int r, g, b, r1, g1, b1;
+    UINT8 *p;
+
+    wrap = width;
+    wrap3 = width * 3;
+    p = src;
+    for(y=0;y<height;y+=2) {
+        for(x=0;x<width;x+=2) {
+            r = p[0];
+            g = p[1];
+            b = p[2];
+            r1 = r;
+            g1 = g;
+            b1 = b;
+            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            r = p[3];
+            g = p[4];
+            b = p[5];
+            r1 += r;
+            g1 += g;
+            b1 += b;
+            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            p += wrap3;
+            lum += wrap;
+
+            r = p[0];
+            g = p[1];
+            b = p[2];
+            r1 += r;
+            g1 += g;
+            b1 += b;
+            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            r = p[3];
+            g = p[4];
+            b = p[5];
+            r1 += r;
+            g1 += g;
+            b1 += b;
+            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            
+            cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + 
+                      FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
+            cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - 
+                     FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
+
+            cb++;
+            cr++;
+            p += -wrap3 + 2 * 3;
+            lum += -wrap + 2;
+        }
+        p += wrap3;
+        lum += wrap;
+    }
+}
+
+static void bgr24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr,
+                              UINT8 *src, int width, int height)
+{
+    int wrap, wrap3, x, y;
+    int r, g, b, r1, g1, b1;
+    UINT8 *p;
+
+    wrap = width;
+    wrap3 = width * 3;
+    p = src;
+    for(y=0;y<height;y+=2) {
+        for(x=0;x<width;x+=2) {
+            b = p[0];
+            g = p[1];
+            r = p[2];
+            r1 = r;
+            g1 = g;
+            b1 = b;
+            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            b = p[3];
+            g = p[4];
+            r = p[5];
+            r1 += r;
+            g1 += g;
+            b1 += b;
+            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            p += wrap3;
+            lum += wrap;
+
+            b = p[0];
+            g = p[1];
+            r = p[2];
+            r1 += r;
+            g1 += g;
+            b1 += b;
+            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            b = p[3];
+            g = p[4];
+            r = p[5];
+            r1 += r;
+            g1 += g;
+            b1 += b;
+            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 
+                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
+            
+            cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + 
+                      FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
+            cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - 
+                     FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128;
+
+            cb++;
+            cr++;
+            p += -wrap3 + 2 * 3;
+            lum += -wrap + 2;
+        }
+        p += wrap3;
+        lum += wrap;
+    }
+}
+
+/* XXX: use generic filter ? */
+/* 1x2 -> 1x1 */
+static void shrink2(UINT8 *dst, int dst_wrap, 
+                    UINT8 *src, int src_wrap,
+                    int width, int height)
+{
+    int w;
+    UINT8 *s1, *s2, *d;
+
+    for(;height > 0; height--) {
+        s1 = src;
+        s2 = s1 + src_wrap;
+        d = dst;
+        for(w = width;w >= 4; w-=4) {
+            d[0] = (s1[0] + s2[0]) >> 1;
+            d[1] = (s1[1] + s2[1]) >> 1;
+            d[2] = (s1[2] + s2[2]) >> 1;
+            d[3] = (s1[3] + s2[3]) >> 1;
+            s1 += 4;
+            s2 += 4;
+            d += 4;
+        }
+        for(;w > 0; w--) {
+            d[0] = (s1[0] + s2[0]) >> 1;
+            s1++;
+            s2++;
+            d++;
+        }
+        src += 2 * src_wrap;
+        dst += dst_wrap;
+    }
+}
+
+/* 2x2 -> 1x1 */
+static void shrink22(UINT8 *dst, int dst_wrap, 
+                     UINT8 *src, int src_wrap,
+                     int width, int height)
+{
+    int w;
+    UINT8 *s1, *s2, *d;
+
+    for(;height > 0; height--) {
+        s1 = src;
+        s2 = s1 + src_wrap;
+        d = dst;
+        for(w = width;w >= 4; w-=4) {
+            d[0] = (s1[0] + s1[1] + s2[0] + s2[1] + 2) >> 1;
+            d[1] = (s1[2] + s1[3] + s2[2] + s2[3] + 2) >> 1;
+            d[2] = (s1[4] + s1[5] + s2[4] + s2[5] + 2) >> 1;
+            d[3] = (s1[6] + s1[7] + s2[6] + s2[7] + 2) >> 1;
+            s1 += 8;
+            s2 += 8;
+            d += 4;
+        }
+        for(;w > 0; w--) {
+            d[0] = (s1[0] + s1[1] + s2[0] + s2[1] + 2) >> 1;
+            s1 += 2;
+            s2 += 2;
+            d++;
+        }
+        src += 2 * src_wrap;
+        dst += dst_wrap;
+    }
+}
+
+static void img_copy(UINT8 *dst, int dst_wrap, 
+                     UINT8 *src, int src_wrap,
+                     int width, int height)
+{
+    for(;height > 0; height--) {
+        memcpy(dst, src, width);
+        dst += dst_wrap;
+        src += src_wrap;
+    }
+}
+
+#define SCALE_BITS 10
+
+#define C_Y  (76309 >> (16 - SCALE_BITS))
+#define C_RV (117504 >> (16 - SCALE_BITS))
+#define C_BU (138453 >> (16 - SCALE_BITS))
+#define C_GU (13954 >> (16 - SCALE_BITS))
+#define C_GV (34903 >> (16 - SCALE_BITS))
+
+#define RGBOUT(r, g, b, y1)\
+{\
+    y = (y1 - 16) * C_Y;\
+    r = cm[(y + r_add) >> SCALE_BITS];\
+    g = cm[(y + g_add) >> SCALE_BITS];\
+    b = cm[(y + b_add) >> SCALE_BITS];\
+}
+
+/* XXX: no chroma interpolating is done */
+static void yuv420p_to_rgb24(AVPicture *dst, AVPicture *src, 
+                             int width, int height)
+{
+    UINT8 *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2;
+    int w, y, cb, cr, r_add, g_add, b_add, width2;
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;
+
+    d = dst->data[0];
+    y1_ptr = src->data[0];
+    cb_ptr = src->data[1];
+    cr_ptr = src->data[2];
+    width2 = width >> 1;
+    for(;height > 0; height -= 2) {
+        d1 = d;
+        d2 = d + dst->linesize[0];
+        y2_ptr = y1_ptr + src->linesize[0];
+        for(w = width2; w > 0; w --) {
+            cb = cb_ptr[0] - 128;
+            cr = cr_ptr[0] - 128;
+            r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
+            g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
+            b_add = C_BU * cb + (1 << (SCALE_BITS - 1));
+            
+            /* output 4 pixels */
+            RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]);
+            RGBOUT(d1[3], d1[4], d1[5], y1_ptr[1]);
+            RGBOUT(d2[0], d2[1], d2[2], y2_ptr[0]);
+            RGBOUT(d2[3], d2[4], d2[5], y2_ptr[1]);
+
+            d1 += 6;
+            d2 += 6;
+            y1_ptr += 2;
+            y2_ptr += 2;
+            cb_ptr++;
+            cr_ptr++;
+        }
+        d += 2 * dst->linesize[0];
+        y1_ptr += 2 * src->linesize[0] - width;
+        cb_ptr += src->linesize[1] - width2;
+        cr_ptr += src->linesize[2] - width2;
+    }
+}
+
+/* XXX: no chroma interpolating is done */
+static void yuv422p_to_rgb24(AVPicture *dst, AVPicture *src, 
+                             int width, int height)
+{
+    UINT8 *y1_ptr, *cb_ptr, *cr_ptr, *d, *d1;
+    int w, y, cb, cr, r_add, g_add, b_add, width2;
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;
+
+    d = dst->data[0];
+    y1_ptr = src->data[0];
+    cb_ptr = src->data[1];
+    cr_ptr = src->data[2];
+    width2 = width >> 1;
+    for(;height > 0; height --) {
+        d1 = d;
+        for(w = width2; w > 0; w --) {
+            cb = cb_ptr[0] - 128;
+            cr = cr_ptr[0] - 128;
+            r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
+            g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
+            b_add = C_BU * cb + (1 << (SCALE_BITS - 1));
+            
+            /* output 2 pixels */
+            RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]);
+            RGBOUT(d1[3], d1[4], d1[5], y1_ptr[1]);
+
+            d1 += 6;
+            y1_ptr += 2;
+            cb_ptr++;
+            cr_ptr++;
+        }
+        d += dst->linesize[0];
+        y1_ptr += src->linesize[0] - width;
+        cb_ptr += src->linesize[1] - width2;
+        cr_ptr += src->linesize[2] - width2;
+    }
+}
+
+/* XXX: always use linesize. Return -1 if not supported */
+int img_convert(AVPicture *dst, int dst_pix_fmt,
+                AVPicture *src, int pix_fmt, 
+                int width, int height)
+{
+    int i;
+
+    if (dst_pix_fmt == pix_fmt) {
+        switch(pix_fmt) {
+        case PIX_FMT_YUV420P:
+            for(i=0;i<3;i++) {
+                if (i == 1) {
+                    width >>= 1;
+                    height >>= 1;
+                }
+                img_copy(dst->data[i], dst->linesize[i],
+                         src->data[i], src->linesize[i],
+                         width, height);
+            }
+            break;
+        default:
+            return -1;
+        }
+    } else if (dst_pix_fmt == PIX_FMT_YUV420P) {
+        
+        switch(pix_fmt) {
+        case PIX_FMT_YUV420P:
+            for(i=0;i<3;i++) {
+                img_copy(dst->data[i], dst->linesize[i],
+                         src->data[i], src->linesize[i],
+                         width, height);
+            }
+            break;
+        case PIX_FMT_YUV422P:
+            img_copy(dst->data[0], dst->linesize[0],
+                     src->data[0], src->linesize[0],
+                     width, height);
+            width >>= 1;
+            height >>= 1;
+            for(i=1;i<3;i++) {
+                shrink2(dst->data[i], dst->linesize[i],
+                        src->data[i], src->linesize[i],
+                        width, height);
+            }
+            break;
+        case PIX_FMT_YUV444P:
+            img_copy(dst->data[0], dst->linesize[0],
+                     src->data[0], src->linesize[0],
+                     width, height);
+            width >>= 1;
+            height >>= 1;
+            for(i=1;i<3;i++) {
+                shrink22(dst->data[i], dst->linesize[i],
+                         src->data[i], src->linesize[i],
+                         width, height);
+            }
+            break;
+        case PIX_FMT_YUV422:
+            yuv422_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], 
+                              src->data[0], width, height);
+            break;
+        case PIX_FMT_RGB24:
+            rgb24_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], 
+                             src->data[0], width, height);
+            break;
+        case PIX_FMT_BGR24:
+            bgr24_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], 
+                             src->data[0], width, height);
+            break;
+        default:
+            return -1;
+        }
+    } else if (dst_pix_fmt == PIX_FMT_RGB24) {
+        switch(pix_fmt) {
+        case PIX_FMT_YUV420P:
+            yuv420p_to_rgb24(dst, src, width, height);
+            break;
+        case PIX_FMT_YUV422P:
+            yuv422p_to_rgb24(dst, src, width, height);
+            break;
+        default:
+            return -1;
+        }
+    } else {
+        return -1;
+    }
+    return 0;
+}
+
+/* filter parameters: [-1 4 2 4 -1] // 8 */
+static void deinterlace_line(UINT8 *dst, UINT8 *src, int src_wrap,
+                             int size)
+{
+    UINT8 *cm = cropTbl + MAX_NEG_CROP;
+    int sum;
+    UINT8 *s;
+
+    for(;size > 0;size--) {
+        s = src;
+        sum = -s[0];
+        s += src_wrap;
+        sum += s[0] << 2;
+        s += src_wrap;
+        sum += s[0] << 1;
+        s += src_wrap;
+        sum += s[0] << 2;
+        s += src_wrap;
+        sum += -s[0];
+        dst[0] = cm[(sum + 4) >> 3];
+        dst++;
+        src++;
+    }
+}
+
+/* deinterlacing : 2 temporal taps, 3 spatial taps linear filter. The
+   top field is copied as is, but the bottom field is deinterlaced
+   against the top field. */
+static void deinterlace_bottom_field(UINT8 *dst, int dst_wrap,
+                                     UINT8 *src1, int src_wrap,
+                                     int width, int height)
+{
+    UINT8 *src, *ptr;
+    int y, y1, i;
+    UINT8 *buf;
+
+    buf= (UINT8*) malloc(5 * width);
+
+    src = src1;
+    for(y=0;y<height;y+=2) {
+        /* copy top field line */
+        memcpy(dst, src, width);
+        dst += dst_wrap;
+        src += (1 - 2) * src_wrap;
+        y1 = y - 2;
+        if (y1 >= 0 && (y1 + 4) < height) {
+            /* fast case : no edges */
+            deinterlace_line(dst, src, src_wrap, width);
+        } else {
+            /* in order to use the same function, we use an intermediate buffer */
+            ptr = buf;
+            for(i=0;i<5;i++) {
+                if (y1 < 0)
+                    memcpy(ptr, src1, width);
+                else if (y1 >= height) 
+                    memcpy(ptr, src1 + (height - 1) * src_wrap, width);
+                else
+                    memcpy(ptr, src1 + y1 * src_wrap, width);
+                y1++;
+                ptr += width;
+            }
+            deinterlace_line(dst, buf, width, width);
+        }
+        dst += dst_wrap;
+        src += (2 + 1) * src_wrap;
+    }
+    free(buf);
+}
+
+
+/* deinterlace, return -1 if format not handled */
+int avpicture_deinterlace(AVPicture *dst, AVPicture *src,
+                          int pix_fmt, int width, int height)
+{
+    int i;
+
+    if (pix_fmt != PIX_FMT_YUV420P &&
+        pix_fmt != PIX_FMT_YUV422P &&
+        pix_fmt != PIX_FMT_YUV444P)
+        return -1;
+    if ((width & 1) != 0 || (height & 3) != 0)
+        return -1;
+    
+    for(i=0;i<3;i++) {
+        if (i == 1) {
+            switch(pix_fmt) {
+            case PIX_FMT_YUV420P:
+                width >>= 1;
+                height >>= 1;
+                break;
+            case PIX_FMT_YUV422P:
+                width >>= 1;
+                break;
+            default:
+                break;
+            }
+        }
+        deinterlace_bottom_field(dst->data[i], dst->linesize[i],
+                                 src->data[i], src->linesize[i],
+                                 width, height);
+    }
+    return 0;
+}
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
new file mode 100644
index 000000000..8c69de2de
--- /dev/null
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -0,0 +1,619 @@
+/*
+ * High quality image resampling with polyphase filters 
+ * Copyright (c) 2001 Gerard Lantau.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "dsputil.h"
+#include "avcodec.h"
+
+#ifdef USE_FASTMEMCPY
+#include "fastmemcpy.h"
+#endif
+
+
+#define NB_COMPONENTS 3
+
+#define PHASE_BITS 4
+#define NB_PHASES  (1 << PHASE_BITS)
+#define NB_TAPS    4
+#define FCENTER    1  /* index of the center of the filter */
+
+#define POS_FRAC_BITS 16
+#define POS_FRAC      (1 << POS_FRAC_BITS)
+/* 6 bits precision is needed for MMX */
+#define FILTER_BITS   8
+
+#define LINE_BUF_HEIGHT (NB_TAPS * 4)
+
+struct ImgReSampleContext {
+    int iwidth, iheight, owidth, oheight;
+    int h_incr, v_incr;
+    INT16 h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
+    INT16 v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
+    UINT8 *line_buf;
+};
+
+static inline int get_phase(int pos)
+{
+    return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
+}
+
+/* This function must be optimized */
+static void h_resample_fast(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
+                            int src_start, int src_incr, INT16 *filters)
+{
+    int src_pos, phase, sum, i;
+    UINT8 *s;
+    INT16 *filter;
+
+    src_pos = src_start;
+    for(i=0;i<dst_width;i++) {
+#ifdef TEST
+        /* test */
+        if ((src_pos >> POS_FRAC_BITS) < 0 ||
+            (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
+            abort();
+#endif
+        s = src + (src_pos >> POS_FRAC_BITS);
+        phase = get_phase(src_pos);
+        filter = filters + phase * NB_TAPS;
+#if NB_TAPS == 4
+        sum = s[0] * filter[0] +
+            s[1] * filter[1] +
+            s[2] * filter[2] +
+            s[3] * filter[3];
+#else
+        {
+            int j;
+            sum = 0;
+            for(j=0;j<NB_TAPS;j++)
+                sum += s[j] * filter[j];
+        }
+#endif
+        sum = sum >> FILTER_BITS;
+        if (sum < 0)
+            sum = 0;
+        else if (sum > 255)
+            sum = 255;
+        dst[0] = sum;
+        src_pos += src_incr;
+        dst++;
+    }
+}
+
+/* This function must be optimized */
+static void v_resample(UINT8 *dst, int dst_width, UINT8 *src, int wrap, 
+                       INT16 *filter)
+{
+    int sum, i;
+    UINT8 *s;
+
+    s = src;
+    for(i=0;i<dst_width;i++) {
+#if NB_TAPS == 4
+        sum = s[0 * wrap] * filter[0] +
+            s[1 * wrap] * filter[1] +
+            s[2 * wrap] * filter[2] +
+            s[3 * wrap] * filter[3];
+#else
+        {
+            int j;
+            UINT8 *s1 = s;
+
+            sum = 0;
+            for(j=0;j<NB_TAPS;j++) {
+                sum += s1[0] * filter[j];
+                s1 += wrap;
+            }
+        }
+#endif
+        sum = sum >> FILTER_BITS;
+        if (sum < 0)
+            sum = 0;
+        else if (sum > 255)
+            sum = 255;
+        dst[0] = sum;
+        dst++;
+        s++;
+    }
+}
+
+#ifdef HAVE_MMX
+
+#include "i386/mmx.h"
+
+#define FILTER4(reg) \
+{\
+        s = src + (src_pos >> POS_FRAC_BITS);\
+        phase = get_phase(src_pos);\
+        filter = filters + phase * NB_TAPS;\
+        movq_m2r(*s, reg);\
+        punpcklbw_r2r(mm7, reg);\
+        movq_m2r(*filter, mm6);\
+        pmaddwd_r2r(reg, mm6);\
+        movq_r2r(mm6, reg);\
+        psrlq_i2r(32, reg);\
+        paddd_r2r(mm6, reg);\
+        psrad_i2r(FILTER_BITS, reg);\
+        src_pos += src_incr;\
+}
+
+#define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
+
+/* XXX: do four pixels at a time */
+static void h_resample_fast4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
+                                 int src_start, int src_incr, INT16 *filters)
+{
+    int src_pos, phase;
+    UINT8 *s;
+    INT16 *filter;
+    mmx_t tmp;
+    
+    src_pos = src_start;
+    pxor_r2r(mm7, mm7);
+
+    while (dst_width >= 4) {
+
+        FILTER4(mm0);
+        FILTER4(mm1);
+        FILTER4(mm2);
+        FILTER4(mm3);
+
+        packuswb_r2r(mm7, mm0);
+        packuswb_r2r(mm7, mm1);
+        packuswb_r2r(mm7, mm3);
+        packuswb_r2r(mm7, mm2);
+        movq_r2m(mm0, tmp);
+        dst[0] = tmp.ub[0];
+        movq_r2m(mm1, tmp);
+        dst[1] = tmp.ub[0];
+        movq_r2m(mm2, tmp);
+        dst[2] = tmp.ub[0];
+        movq_r2m(mm3, tmp);
+        dst[3] = tmp.ub[0];
+        dst += 4;
+        dst_width -= 4;
+    }
+    while (dst_width > 0) {
+        FILTER4(mm0);
+        packuswb_r2r(mm7, mm0);
+        movq_r2m(mm0, tmp);
+        dst[0] = tmp.ub[0];
+        dst++;
+        dst_width--;
+    }
+    emms();
+}
+
+static void v_resample4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int wrap, 
+                            INT16 *filter)
+{
+    int sum, i, v;
+    UINT8 *s;
+    mmx_t tmp;
+    mmx_t coefs[4];
+    
+    for(i=0;i<4;i++) {
+        v = filter[i];
+        coefs[i].uw[0] = v;
+        coefs[i].uw[1] = v;
+        coefs[i].uw[2] = v;
+        coefs[i].uw[3] = v;
+    }
+    
+    pxor_r2r(mm7, mm7);
+    s = src;
+    while (dst_width >= 4) {
+        movq_m2r(s[0 * wrap], mm0);
+        punpcklbw_r2r(mm7, mm0);
+        movq_m2r(s[1 * wrap], mm1);
+        punpcklbw_r2r(mm7, mm1);
+        movq_m2r(s[2 * wrap], mm2);
+        punpcklbw_r2r(mm7, mm2);
+        movq_m2r(s[3 * wrap], mm3);
+        punpcklbw_r2r(mm7, mm3);
+
+        pmullw_m2r(coefs[0], mm0);
+        pmullw_m2r(coefs[1], mm1);
+        pmullw_m2r(coefs[2], mm2);
+        pmullw_m2r(coefs[3], mm3);
+
+        paddw_r2r(mm1, mm0);
+        paddw_r2r(mm3, mm2);
+        paddw_r2r(mm2, mm0);
+        psraw_i2r(FILTER_BITS, mm0);
+        
+        packuswb_r2r(mm7, mm0);
+        movq_r2m(mm0, tmp);
+
+        *(UINT32 *)dst = tmp.ud[0];
+        dst += 4;
+        s += 4;
+        dst_width -= 4;
+    }
+    while (dst_width > 0) {
+        sum = s[0 * wrap] * filter[0] +
+            s[1 * wrap] * filter[1] +
+            s[2 * wrap] * filter[2] +
+            s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum < 0)
+            sum = 0;
+        else if (sum > 255)
+            sum = 255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+    }
+    emms();
+}
+#endif
+
+/* slow version to handle limit cases. Does not need optimisation */
+static void h_resample_slow(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
+                            int src_start, int src_incr, INT16 *filters)
+{
+    int src_pos, phase, sum, j, v, i;
+    UINT8 *s, *src_end;
+    INT16 *filter;
+
+    src_end = src + src_width;
+    src_pos = src_start;
+    for(i=0;i<dst_width;i++) {
+        s = src + (src_pos >> POS_FRAC_BITS);
+        phase = get_phase(src_pos);
+        filter = filters + phase * NB_TAPS;
+        sum = 0;
+        for(j=0;j<NB_TAPS;j++) {
+            if (s < src)
+                v = src[0];
+            else if (s >= src_end)
+                v = src_end[-1];
+            else
+                v = s[0];
+            sum += v * filter[j];
+            s++;
+        }
+        sum = sum >> FILTER_BITS;
+        if (sum < 0)
+            sum = 0;
+        else if (sum > 255)
+            sum = 255;
+        dst[0] = sum;
+        src_pos += src_incr;
+        dst++;
+    }
+}
+
+static void h_resample(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
+                       int src_start, int src_incr, INT16 *filters)
+{
+    int n, src_end;
+
+    if (src_start < 0) {
+        n = (0 - src_start + src_incr - 1) / src_incr;
+        h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
+        dst += n;
+        dst_width -= n;
+        src_start += n * src_incr;
+    }
+    src_end = src_start + dst_width * src_incr;
+    if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
+        n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) / 
+            src_incr;
+    } else {
+        n = dst_width;
+    }
+#ifdef HAVE_MMX
+    if ((mm_flags & MM_MMX) && NB_TAPS == 4)
+        h_resample_fast4_mmx(dst, n, 
+                             src, src_width, src_start, src_incr, filters);
+    else
+#endif
+        h_resample_fast(dst, n, 
+                        src, src_width, src_start, src_incr, filters);
+    if (n < dst_width) {
+        dst += n;
+        dst_width -= n;
+        src_start += n * src_incr;
+        h_resample_slow(dst, dst_width, 
+                        src, src_width, src_start, src_incr, filters);
+    }
+}
+
+static void component_resample(ImgReSampleContext *s, 
+                               UINT8 *output, int owrap, int owidth, int oheight,
+                               UINT8 *input, int iwrap, int iwidth, int iheight)
+{
+    int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
+    UINT8 *new_line, *src_line;
+
+    last_src_y = - FCENTER - 1;
+    /* position of the bottom of the filter in the source image */
+    src_y = (last_src_y + NB_TAPS) * POS_FRAC; 
+    ring_y = NB_TAPS; /* position in ring buffer */
+    for(y=0;y<oheight;y++) {
+        /* apply horizontal filter on new lines from input if needed */
+        src_y1 = src_y >> POS_FRAC_BITS;
+        while (last_src_y < src_y1) {
+            if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
+                ring_y = NB_TAPS;
+            last_src_y++;
+            /* handle limit conditions : replicate line (slighly
+               inefficient because we filter multiple times */
+            y1 = last_src_y;
+            if (y1 < 0) {
+                y1 = 0;
+            } else if (y1 >= iheight) {
+                y1 = iheight - 1;
+            }
+            src_line = input + y1 * iwrap;
+            new_line = s->line_buf + ring_y * owidth;
+            /* apply filter and handle limit cases correctly */
+            h_resample(new_line, owidth, 
+                       src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr, 
+                       &s->h_filters[0][0]);
+            /* handle ring buffer wraping */
+            if (ring_y >= LINE_BUF_HEIGHT) {
+                memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
+                       new_line, owidth);
+            }
+        }
+        /* apply vertical filter */
+        phase_y = get_phase(src_y);
+#ifdef HAVE_MMX
+        /* desactivated MMX because loss of precision */
+        if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
+            v_resample4_mmx(output, owidth, 
+                            s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, 
+                            &s->v_filters[phase_y][0]);
+        else
+#endif
+            v_resample(output, owidth, 
+                       s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth, 
+                       &s->v_filters[phase_y][0]);
+            
+        src_y += s->v_incr;
+        output += owrap;
+    }
+}
+
+/* XXX: the following filter is quite naive, but it seems to suffice
+   for 4 taps */
+static void build_filter(INT16 *filter, float factor)
+{
+    int ph, i, v;
+    float x, y, tab[NB_TAPS], norm, mult;
+
+    /* if upsampling, only need to interpolate, no filter */
+    if (factor > 1.0)
+        factor = 1.0;
+
+    for(ph=0;ph<NB_PHASES;ph++) {
+        norm = 0;
+        for(i=0;i<NB_TAPS;i++) {
+            
+            x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
+            if (x == 0)
+                y = 1.0;
+            else
+                y = sin(x) / x;
+            tab[i] = y;
+            norm += y;
+        }
+
+        /* normalize so that an uniform color remains the same */
+        mult = (float)(1 << FILTER_BITS) / norm;
+        for(i=0;i<NB_TAPS;i++) {
+            v = (int)(tab[i] * mult);
+            filter[ph * NB_TAPS + i] = v;
+        }
+    }
+}
+
+ImgReSampleContext *img_resample_init(int owidth, int oheight,
+                                      int iwidth, int iheight)
+{
+    ImgReSampleContext *s;
+
+    s = av_mallocz(sizeof(ImgReSampleContext));
+    if (!s)
+        return NULL;
+    s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
+    if (!s->line_buf) 
+        goto fail;
+    
+    s->owidth = owidth;
+    s->oheight = oheight;
+    s->iwidth = iwidth;
+    s->iheight = iheight;
+    
+    s->h_incr = (iwidth * POS_FRAC) / owidth;
+    s->v_incr = (iheight * POS_FRAC) / oheight;
+    
+    build_filter(&s->h_filters[0][0], (float)owidth / (float)iwidth);
+    build_filter(&s->v_filters[0][0], (float)oheight / (float)iheight);
+
+    return s;
+ fail:
+    free(s);
+    return NULL;
+}
+
+void img_resample(ImgReSampleContext *s, 
+                  AVPicture *output, AVPicture *input)
+{
+    int i, shift;
+
+    for(i=0;i<3;i++) {
+        shift = (i == 0) ? 0 : 1;
+        component_resample(s, output->data[i], output->linesize[i], 
+                           s->owidth >> shift, s->oheight >> shift,
+                           input->data[i], input->linesize[i], 
+                           s->iwidth >> shift, s->iheight >> shift);
+    }
+}
+
+void img_resample_close(ImgReSampleContext *s)
+{
+    free(s->line_buf);
+    free(s);
+}
+
+#ifdef TEST
+
+void *av_mallocz(int size)
+{
+    void *ptr;
+    ptr = malloc(size);
+    memset(ptr, 0, size);
+    return ptr;
+}
+
+/* input */
+#define XSIZE 256
+#define YSIZE 256
+UINT8 img[XSIZE * YSIZE];
+
+/* output */
+#define XSIZE1 512
+#define YSIZE1 512
+UINT8 img1[XSIZE1 * YSIZE1];
+UINT8 img2[XSIZE1 * YSIZE1];
+
+void save_pgm(const char *filename, UINT8 *img, int xsize, int ysize)
+{
+    FILE *f;
+    f=fopen(filename,"w");
+    fprintf(f,"P5\n%d %d\n%d\n", xsize, ysize, 255);
+    fwrite(img,1, xsize * ysize,f);
+    fclose(f);
+}
+
+static void dump_filter(INT16 *filter)
+{
+    int i, ph;
+
+    for(ph=0;ph<NB_PHASES;ph++) {
+        printf("%2d: ", ph);
+        for(i=0;i<NB_TAPS;i++) {
+            printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
+        }
+        printf("\n");
+    }
+}
+
+#ifdef HAVE_MMX
+int mm_flags;
+#endif
+
+int main(int argc, char **argv)
+{
+    int x, y, v, i, xsize, ysize;
+    ImgReSampleContext *s;
+    float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
+    char buf[256];
+
+    /* build test image */
+    for(y=0;y<YSIZE;y++) {
+        for(x=0;x<XSIZE;x++) {
+            if (x < XSIZE/2 && y < YSIZE/2) {
+                if (x < XSIZE/4 && y < YSIZE/4) {
+                    if ((x % 10) <= 6 &&
+                        (y % 10) <= 6)
+                        v = 0xff;
+                    else
+                        v = 0x00;
+                } else if (x < XSIZE/4) {
+                    if (x & 1) 
+                        v = 0xff;
+                    else 
+                        v = 0;
+                } else if (y < XSIZE/4) {
+                    if (y & 1) 
+                        v = 0xff;
+                    else 
+                        v = 0;
+                } else {
+                    if (y < YSIZE*3/8) {
+                        if ((y+x) & 1) 
+                            v = 0xff;
+                        else 
+                            v = 0;
+                    } else {
+                        if (((x+3) % 4) <= 1 &&
+                            ((y+3) % 4) <= 1)
+                            v = 0xff;
+                        else
+                            v = 0x00;
+                    }
+                }
+            } else if (x < XSIZE/2) {
+                v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
+            } else if (y < XSIZE/2) {
+                v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
+            } else {
+                v = ((x + y - XSIZE) * 255) / XSIZE;
+            }
+            img[y * XSIZE + x] = v;
+        }
+    }
+    save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
+    for(i=0;i<sizeof(factors)/sizeof(float);i++) {
+        fact = factors[i];
+        xsize = (int)(XSIZE * fact);
+        ysize = (int)(YSIZE * fact);
+        s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
+        printf("Factor=%0.2f\n", fact);
+        dump_filter(&s->h_filters[0][0]);
+        component_resample(s, img1, xsize, xsize, ysize,
+                           img, XSIZE, XSIZE, YSIZE);
+        img_resample_close(s);
+
+        sprintf(buf, "/tmp/out%d.pgm", i);
+        save_pgm(buf, img1, xsize, ysize);
+    }
+
+    /* mmx test */
+#ifdef HAVE_MMX
+    printf("MMX test\n");
+    fact = 0.72;
+    xsize = (int)(XSIZE * fact);
+    ysize = (int)(YSIZE * fact);
+    mm_flags = MM_MMX;
+    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
+    component_resample(s, img1, xsize, xsize, ysize,
+                       img, XSIZE, XSIZE, YSIZE);
+
+    mm_flags = 0;
+    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
+    component_resample(s, img2, xsize, xsize, ysize,
+                       img, XSIZE, XSIZE, YSIZE);
+    if (memcmp(img1, img2, xsize * ysize) != 0) {
+        fprintf(stderr, "mmx error\n");
+        exit(1);
+    }
+    printf("MMX OK\n");
+#endif
+    return 0;
+}
+
+#endif
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index 66d018e69..08281a501 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -16,6 +16,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
+//#define DEBUG
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
@@ -419,30 +420,6 @@ void mjpeg_encode_mb(MpegEncContext *s,
 /******************************************/
 /* decoding */
 
-//#define DEBUG
-
-#ifndef CONFIG_WIN32
-
-#ifdef DEBUG
-# if __GNUC__
-#  define dprintf(fmt,args...) printf(fmt, ## args)
-# else
-#  define dprintf(...)	       printf(__VA_ARGS__)
-# endif
-#else
-# if __GNUC__
-#  define dprintf(fmt,args...)
-# else
-#  define dprintf(...)
-# endif
-#endif
-
-#else
-
-inline void dprintf(const char* fmt,...) {}
-
-#endif
-
 /* compressed picture size */
 #define PICTURE_BUFFER_SIZE 100000
 
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index 013e1b935..758124eea 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -16,36 +16,13 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
+//#define DEBUG
 #include "avcodec.h"
 #include "dsputil.h"
 #include "mpegvideo.h"
 
 #include "mpeg12data.h"
 
-//#define DEBUG
-
-#ifndef CONFIG_WIN32
-
-#ifdef DEBUG
-# if __GNUC__
-#  define dprintf(fmt,args...) printf(fmt, ## args)
-# else
-#  define dprintf(...)	       printf(__VA_ARGS__)
-# endif
-#else
-# if __GNUC__
-#  define dprintf(fmt,args...)
-# else
-#  define dprintf(...)
-# endif
-#endif
-
-#else
-
-inline void dprintf(const char* fmt,...) {}
-
-#endif
-
 /* Start codes. */
 #define SEQ_END_CODE		0x000001b7
 #define SEQ_START_CODE		0x000001b3
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 9f7236f98..80a35635d 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -101,14 +101,12 @@ int MPV_common_init(MpegEncContext *s)
     int c_size, i;
     UINT8 *pict;
 
-    s->waiting_for_keyframe = 1;
-
     if (s->out_format == FMT_H263) 
         s->dct_unquantize = dct_unquantize_h263_c;
     else
         s->dct_unquantize = dct_unquantize_mpeg1_c;
         
-#ifdef ARCH_X86
+#ifdef HAVE_MMX
     MPV_common_init_mmx(s);
 #endif
     s->mb_width = (s->width + 15) / 16;
@@ -214,7 +212,6 @@ int MPV_common_init(MpegEncContext *s)
         if (s->aux_picture_base[i])
             free(s->aux_picture_base[i]);
     }
-
     return -1;
 }
 
@@ -252,6 +249,9 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->width = avctx->width;
     s->height = avctx->height;
     s->gop_size = avctx->gop_size;
+    s->rtp_mode = avctx->rtp_mode;
+    s->rtp_payload_size = avctx->rtp_payload_size;
+    
     if (s->gop_size <= 1) {
         s->intra_only = 1;
         s->gop_size = 12;
@@ -273,14 +273,22 @@ int MPV_encode_init(AVCodecContext *avctx)
             return -1;
         break;
     case CODEC_ID_H263:
-        if (h263_get_picture_format(s->width, s->height) == 7)
+        if (h263_get_picture_format(s->width, s->height) == 7){
+	    printf("Input picture size isn't suitable for h263 codec! try h263+\n");
             return -1;
+	}
         s->out_format = FMT_H263;
         break;
     case CODEC_ID_H263P:
         s->out_format = FMT_H263;
+        s->rtp_mode = 1;
+        s->rtp_payload_size = 1200; 
         s->h263_plus = 1;
-        /* XXX: not unrectricted mv yet */
+        s->unrestricted_mv = 1;
+        
+        /* These are just to be sure */
+        s->umvplus = 0;
+        s->umvplus_dec = 0;
         break;
     case CODEC_ID_RV10:
         s->out_format = FMT_H263;
@@ -395,9 +403,15 @@ void MPV_frame_end(MpegEncContext *s)
 {
     /* draw edge for correct motion prediction if outside */
     if (s->pict_type != B_TYPE) {
+#if 1
+        draw_edges(s->current_picture[0], s->linesize, s->mb_width*16, s->mb_height*16, EDGE_WIDTH);
+        draw_edges(s->current_picture[1], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2);
+        draw_edges(s->current_picture[2], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2);
+#else
         draw_edges(s->current_picture[0], s->linesize, s->width, s->height, EDGE_WIDTH);
         draw_edges(s->current_picture[1], s->linesize/2, s->width/2, s->height/2, EDGE_WIDTH/2);
         draw_edges(s->current_picture[2], s->linesize/2, s->width/2, s->height/2, EDGE_WIDTH/2);
+#endif
     }
 }
 
@@ -674,6 +688,11 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
     mb_x = s->mb_x;
     mb_y = s->mb_y;
 
+#ifdef FF_POSTPROCESS
+    quant_store[mb_y][mb_x]=s->qscale;
+    //printf("[%02d][%02d] %d\n",mb_x,mb_y,s->qscale);
+#endif
+
     /* update DC predictors for P macroblocks */
     if (!s->mb_intra) {
         if (s->h263_pred) {
@@ -813,7 +832,7 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
 
 static void encode_picture(MpegEncContext *s, int picture_number)
 {
-    int mb_x, mb_y, wrap;
+    int mb_x, mb_y, wrap, last_gob;
     UINT8 *ptr;
     int i, motion_x, motion_y;
 
@@ -863,7 +882,29 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     s->mv_type = MV_TYPE_16X16;
     s->mv_dir = MV_DIR_FORWARD;
 
+    /* Get the GOB height based on picture height */
+    if (s->out_format == FMT_H263 && s->h263_plus) {
+        if (s->height <= 400)
+            s->gob_index = 1;
+        else if (s->height <= 800)
+            s->gob_index = 2;
+        else
+            s->gob_index = 4;
+    }
+        
     for(mb_y=0; mb_y < s->mb_height; mb_y++) {
+        /* Put GOB header based on RTP MTU */
+        if (!mb_y) {
+            s->ptr_lastgob = s->pb.buf_ptr;
+            s->ptr_last_mb_line = s->pb.buf_ptr;
+        } else if (s->out_format == FMT_H263 && s->h263_plus) {
+            last_gob = h263_encode_gob_header(s, mb_y);
+            if (last_gob) {
+                //fprintf(stderr,"\nLast GOB size: %d", last_gob);
+                s->first_gob_line = 1;
+            } else
+                s->first_gob_line = 0;
+        }
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
 
             s->mb_x = mb_x;
@@ -975,7 +1016,17 @@ static void encode_picture(MpegEncContext *s, int picture_number)
 
             MPV_decode_mb(s, s->block);
         }
+        /* Obtain average MB line size for RTP */
+        if (!mb_y)
+            s->mb_line_avgsize = s->pb.buf_ptr - s->ptr_last_mb_line;
+        else    
+            s->mb_line_avgsize = (s->mb_line_avgsize + s->pb.buf_ptr - s->ptr_last_mb_line) >> 1;
+        //fprintf(stderr, "\nMB line: %d\tSize: %u\tAvg. Size: %u", s->mb_y, 
+        //                    (s->pb.buf_ptr - s->ptr_last_mb_line), s->mb_line_avgsize);
+        s->ptr_last_mb_line = s->pb.buf_ptr;
     }
+    //if (s->gob_number)
+    //    fprintf(stderr,"\nNumber of GOB: %d", s->gob_number);
 }
 
 static int dct_quantize(MpegEncContext *s, 
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 9abe75a7c..e653edb9b 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -128,7 +128,16 @@ typedef struct MpegEncContext {
     int P_frame_bits;    /* same for P frame */
     INT64 wanted_bits;
     INT64 total_bits;
-
+    
+    /* H.263 specific */
+    int gob_number;
+    int gob_index;
+    int first_gob_line;
+    
+    /* H.263+ specific */
+    int umvplus;
+    int umvplus_dec;
+    
     /* mpeg4 specific */
     int time_increment_bits;
     int shape;
@@ -177,12 +186,17 @@ typedef struct MpegEncContext {
     int interlaced_dct;
     int last_qscale;
     int first_slice;
-
+    
+    /* RTP specific */
+    int rtp_mode;
+    int rtp_payload_size;
+    UINT8 *ptr_lastgob;
+    UINT8 *ptr_last_mb_line;
+    UINT32 mb_line_avgsize;
+    
     DCTELEM block[6][64] __align8;
     void (*dct_unquantize)(struct MpegEncContext *s, 
                            DCTELEM *block, int n, int qscale);
-
-    int waiting_for_keyframe;
 } MpegEncContext;
 
 int MPV_common_init(MpegEncContext *s);
@@ -190,7 +204,7 @@ void MPV_common_end(MpegEncContext *s);
 void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 void MPV_frame_start(MpegEncContext *s);
 void MPV_frame_end(MpegEncContext *s);
-#ifdef ARCH_X86
+#ifdef HAVE_MMX
 void MPV_common_init_mmx(MpegEncContext *s);
 #endif
 
@@ -230,7 +244,7 @@ typedef struct RLTable {
 void init_rl(RLTable *rl);
 void init_vlc_rl(RLTable *rl);
 
-static inline int get_rl_index(const RLTable *rl, int last, int run, int level)
+extern inline int get_rl_index(const RLTable *rl, int last, int run, int level)
 {
     int index;
     index = rl->index_run[last][run];
@@ -245,6 +259,7 @@ void h263_encode_mb(MpegEncContext *s,
                     DCTELEM block[6][64],
                     int motion_x, int motion_y);
 void h263_encode_picture_header(MpegEncContext *s, int picture_number);
+int h263_encode_gob_header(MpegEncContext * s, int mb_line);
 void h263_dc_scale(MpegEncContext *s);
 INT16 *h263_pred_motion(MpegEncContext * s, int block, 
                         int *px, int *py);
@@ -255,6 +270,7 @@ void h263_encode_init_vlc(MpegEncContext *s);
 
 void h263_decode_init_vlc(MpegEncContext *s);
 int h263_decode_picture_header(MpegEncContext *s);
+int h263_decode_gob_header(MpegEncContext *s);
 int mpeg4_decode_picture_header(MpegEncContext * s);
 int intel_h263_decode_picture_header(MpegEncContext *s);
 int h263_decode_mb(MpegEncContext *s,
diff --git a/src/libffmpeg/libavcodec/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/mpegvideo_mmx.c
deleted file mode 100644
index 7b3ba8aff..000000000
--- a/src/libffmpeg/libavcodec/mpegvideo_mmx.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * The simplest mpeg encoder (well, it was the simplest!)
- * Copyright (c) 2000,2001 Gerard Lantau.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
- */
-
-#include "dsputil.h"
-#include "mpegvideo.h"
-#include "xineutils.h"
-
-#if 0
-
-/* XXX: GL: I don't understand why this function needs optimization
-   (it is called only once per frame!), so I disabled it */
-
-void MPV_frame_start(MpegEncContext *s)
-{
-    if (s->pict_type == B_TYPE) {
-	__asm __volatile(
-	    "movl	(%1), %%eax\n\t"
-	    "movl	4(%1), %%edx\n\t"
-	    "movl	8(%1), %%ecx\n\t"
-	    "movl	%%eax, (%0)\n\t"
-	    "movl	%%edx, 4(%0)\n\t"
-	    "movl	%%ecx, 8(%0)\n\t"
-	    :
-	    :"r"(s->current_picture), "r"(s->aux_picture)
-	    :"eax","edx","ecx","memory");
-    } else {
-            /* swap next and last */
-	__asm __volatile(
-	    "movl	(%1), %%eax\n\t"
-	    "movl	4(%1), %%edx\n\t"
-	    "movl	8(%1), %%ecx\n\t"
-	    "xchgl	(%0), %%eax\n\t"
-	    "xchgl	4(%0), %%edx\n\t"
-	    "xchgl	8(%0), %%ecx\n\t"
-	    "movl	%%eax, (%1)\n\t"
-	    "movl	%%edx, 4(%1)\n\t"
-	    "movl	%%ecx, 8(%1)\n\t"
-	    "movl	%%eax, (%2)\n\t"
-	    "movl	%%edx, 4(%2)\n\t"
-	    "movl	%%ecx, 8(%2)\n\t"
-	    :
-	    :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture)
-	    :"eax","edx","ecx","memory");
-    }
-}
-#endif
-
-static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
-static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
-
-/*
-  NK:
-  Note: looking at PARANOID:
-  "enable all paranoid tests for rounding, overflows, etc..."
-
-#ifdef PARANOID
-                if (level < -2048 || level > 2047)
-                    fprintf(stderr, "unquant error %d %d\n", i, level);
-#endif
-  We can suppose that result of two multiplications can't be greate of 0xFFFF
-  i.e. is 16-bit, so we use here only PMULLW instruction and can avoid
-  a complex multiplication.
-=====================================================
- Full formula for multiplication of 2 integer numbers
- which are represent as high:low words:
- input: value1 = high1:low1
-        value2 = high2:low2
- output: value3 = value1*value2
- value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
- this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4
- but this algorithm will compute only 0x66cb0ce4
- this limited by 16-bit size of operands
- ---------------------------------
- tlow1 = high1*low2
- tlow2 = high2*low1
- tlow1 = tlow1 + tlow2
- high3:low3 = low1*low2
- high3 += tlow1
-*/
-static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
-                                     DCTELEM *block, int n, int qscale)
-{
-    int i, level;
-    const UINT16 *quant_matrix;
-    if (s->mb_intra) {
-        if (n < 4) 
-            block[0] = block[0] * s->y_dc_scale;
-        else
-            block[0] = block[0] * s->c_dc_scale;
-        if (s->out_format == FMT_H263) {
-            i = 1;
-            goto unquant_even;
-        }
-        /* XXX: only mpeg1 */
-        quant_matrix = s->intra_matrix;
-	i=1;
-	/* Align on 4 elements boundary */
-	while(i&3)
-	{
-            level = block[i];
-            if (level) {
-                if (level < 0) level = -level;
-                    level = (int)(level * qscale * quant_matrix[i]) >> 3;
-                    level = (level - 1) | 1;
-                if (block[i] < 0) level = -level;
-                block[i] = level;
-            }
-	    i++;
-	}
-	__asm __volatile(
-	"movd	%0, %%mm6\n\t"       /* mm6 = qscale | 0  */
-	"punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm6, %%mm7\n\t"
-	"movq	%1, %%mm5\n\t"
-	"packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */
-	"pxor	%%mm6, %%mm6\n\t"
-	::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory");
-        for(;i<64;i+=4) {
-		__asm __volatile(
-			"movq	%1, %%mm0\n\t"
-			"movq	%%mm7, %%mm1\n\t"
-			"movq	%%mm0, %%mm2\n\t"
-			"movq	%%mm0, %%mm3\n\t"
-			"pcmpgtw %%mm6, %%mm2\n\t"
-			"pmullw	%2, %%mm1\n\t"
-			"pandn	%%mm4, %%mm2\n\t"
-			"por	%%mm5, %%mm2\n\t"  
-			"pmullw	%%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */
-			
-			"pcmpeqw %%mm6, %%mm3\n\t"
-			"pmullw	%%mm0, %%mm1\n\t"
-			"psraw	$3, %%mm1\n\t"
-			"psubw	%%mm5, %%mm1\n\t"   /* block[i] --; */
-			"pandn	%%mm4, %%mm3\n\t"  /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */
-			"por	%%mm5, %%mm1\n\t"   /* block[i] |= 1 */
-			"pmullw %%mm2, %%mm1\n\t"   /* change signs again */
-
-			"pand	%%mm3, %%mm1\n\t" /* nullify if was zero */
-			"movq	%%mm1, %0"
-			:"=m"(block[i])
-			:"m"(block[i]), "m"(quant_matrix[i])
-			:"memory");
-        }
-    } else {
-        i = 0;
-    unquant_even:
-        quant_matrix = s->non_intra_matrix;
-	/* Align on 4 elements boundary */
-	while(i&3)
-	{
-	    level = block[i];
-            if (level) {
-                if (level < 0) level = -level;
-                    level = (((level << 1) + 1) * qscale *
-                             ((int) quant_matrix[i])) >> 4;
-                    level = (level - 1) | 1;
-                if(block[i] < 0) level = -level;
-                block[i] = level;
-	    }
-	    i++;
-	}
-	__asm __volatile(
-	"movd	%0, %%mm6\n\t"       /* mm6 = qscale | 0  */
-	"punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */
-	"movq	%2, %%mm4\n\t"
-	"movq	%%mm6, %%mm7\n\t"
-	"movq	%1, %%mm5\n\t"
-	"packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */
-	"pxor	%%mm6, %%mm6\n\t"
-	::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory");
-        for(;i<64;i+=4) {
-		__asm __volatile(
-			"movq	%1, %%mm0\n\t"
-			"movq	%%mm7, %%mm1\n\t"
-			"movq	%%mm0, %%mm2\n\t"
-			"movq	%%mm0, %%mm3\n\t"
-			"pcmpgtw %%mm6, %%mm2\n\t"
-			"pmullw	%2, %%mm1\n\t"
-			"pandn	%%mm4, %%mm2\n\t"
-			"por	%%mm5, %%mm2\n\t"  
-			"pmullw	%%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */
-			"psllw	$1, %%mm0\n\t" /* block[i] <<= 1 */
-			"paddw	%%mm5, %%mm0\n\t" /* block[i] ++ */
-			
-			"pmullw	%%mm0, %%mm1\n\t"
-			"psraw	$4, %%mm1\n\t"
-			"pcmpeqw %%mm6, %%mm3\n\t"
-			"psubw	%%mm5, %%mm1\n\t"   /* block[i] --; */
-			"pandn	%%mm4, %%mm3\n\t"  /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */
-			"por	%%mm5, %%mm1\n\t"   /* block[i] |= 1 */
-			"pmullw %%mm2, %%mm1\n\t"   /* change signs again */
-
-			"pand	%%mm3, %%mm1\n\t" /* nullify if was zero */
-			"movq	%%mm1, %0"
-			:"=m"(block[i])
-			:"m"(block[i]), "m"(quant_matrix[i])
-			:"memory");
-        }
-    }
-}
-
-void MPV_common_init_mmx(MpegEncContext *s)
-{
-    int mm_flags;
-
-    mm_flags = xine_mm_accel();
-
-    if (mm_flags & MM_ACCEL_X86_MMX) {
-        /* XXX: should include h263 optimization too. It would go even
-           faster! */
-        s->dct_unquantize = dct_unquantize_mpeg1_mmx;
-    }
-}
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 63f111bdd..01e3d5438 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -30,7 +30,7 @@
  *        - (encoding) select best vlc/dc table 
  *        - (decoding) handle slice indication
  */
-#undef DEBUG
+//#define DEBUG
 
 /* motion vector table */
 typedef struct MVTable {
@@ -664,7 +664,7 @@ int msmpeg4_decode_mb(MpegEncContext *s,
 
     /* special slice handling */
     if (s->mb_x == 0) {
-        if ((s->mb_y % s->slice_height) == 0) {
+        if (s->slice_height && (s->mb_y % s->slice_height) == 0) {
             int wrap;
             /* reset DC pred (set previous line to 1024) */
             wrap = 2 * s->mb_width + 2;
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 6d9376c54..f4ebc9016 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -24,7 +24,7 @@
 #include "avcodec.h"
 #include "mpegvideo.h"
 
-#undef DEBUG
+//#define DEBUG
 
 static const UINT16 rv_lum_code[256] =
 {
diff --git a/src/libffmpeg/libavcodec/sad_mmx.s b/src/libffmpeg/libavcodec/sad_mmx.s
deleted file mode 100644
index 42c7ade59..000000000
--- a/src/libffmpeg/libavcodec/sad_mmx.s
+++ /dev/null
@@ -1,799 +0,0 @@
-#  MMX/SSE optimized routines for SAD of 16*16 macroblocks
-#       Copyright (C) Juan J. Sierralta P. <juanjo@atmlab.utfsm.cl>
-#
-#  dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
-#  Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
-
-#
-#  This program is free software; you can redistribute it and/or
-#  modify it under the terms of the GNU General Public License
-#  as published by the Free Software Foundation; either version 2
-#  of the License, or (at your option) any later version.
-#
-#  This program is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#  GNU General Public License for more details.
-#
-#  You should have received a copy of the GNU General Public License
-#  along with this program; if not, write to the Free Software
-#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
-#
-
-.global pix_abs16x16_mmx
-
-# int  pix_abs16x16_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
-# esi = p1 (init:               blk1)
-# edi = p2 (init:               blk2)
-# ecx = rowsleft (init:  h)
-# edx = lx;
-
-# mm0 = distance accumulators (4 words)
-# mm1 = distance accumulators (4 words) 
-# mm2 = temp 
-# mm3 = temp
-# mm4 = temp
-# mm5 = temp 
-# mm6 = 0
-# mm7 = temp
-
-
-.align 32
-pix_abs16x16_mmx: 
-        pushl                           %ebp                                    # save frame pointer
-        movl                            %esp,%ebp
-
-        pushl                           %ebx                                    # Saves registers (called saves convention in
-        pushl                           %ecx                                    # x86 GCC it seems)
-        pushl                           %edx                                    # 
-        pushl                           %esi
-        pushl                           %edi
-
-        pxor                            %mm0,%mm0                               # zero acculumators
-        pxor                            %mm1,%mm1
-        pxor                            %mm6,%mm6
-        movl                            8(%ebp),%esi            # get pix1
-        movl                            12(%ebp),%edi           # get pix2
-        movl                            16(%ebp),%edx           # get lx
-        movl                            20(%ebp),%ecx           # get rowsleft
-        jmp                             pix_abs16x16_mmx.nextrow
-.align 32
-
-pix_abs16x16_mmx.nextrow: 
-        # First 8 bytes of the row
-
-        movq                            (%edi),%mm4     # load first 8 bytes of pix2 row 
-        movq                            (%esi),%mm5     # load first 8 bytes of pix1 row
-        movq                            %mm4,%mm3               # mm4 := abs(mm4-mm5)
-        movq                            8(%esi),%mm2    # load last 8 bytes of pix1 row 
-        psubusb                 %mm5,%mm4
-        movq                            8(%edi),%mm7    # load last 8 bytes of pix2 row                 
-        psubusb                 %mm3,%mm5
-        por                             %mm5,%mm4
-
-        # Last 8 bytes of the row       
-
-        movq                            %mm7,%mm3               # mm7 := abs(mm7-mm2)
-        psubusb                 %mm2,%mm7
-        psubusb                 %mm3,%mm2
-        por                             %mm2,%mm7
-
-        # Now mm4 and mm7 have 16 absdiffs to add
-
-        # First 8 bytes of the row2
-
-
-        addl                            %edx,%edi
-        movq                            (%edi),%mm2     # load first 8 bytes of pix2 row 
-        addl                            %edx,%esi
-        movq                            (%esi),%mm5     # load first 8 bytes of pix1 row
-
-
-
-        movq                            %mm2,%mm3               # mm2 := abs(mm2-mm5)
-        psubusb                 %mm5,%mm2
-        movq                            8(%esi),%mm6    # load last 8 bytes of pix1 row
-        psubusb                 %mm3,%mm5
-        por                             %mm5,%mm2
-
-        # Last 8 bytes of the row2
-
-        movq                            8(%edi),%mm5    # load last 8 bytes of pix2 row 
-
-
-        movq                            %mm5,%mm3               # mm5 := abs(mm5-mm6)
-        psubusb                 %mm6,%mm5
-        psubusb                 %mm3,%mm6
-        por                             %mm6,%mm5
-
-        # Now mm2, mm4, mm5, mm7 have 32 absdiffs
-
-        movq                            %mm7,%mm3
-
-        pxor                            %mm6,%mm6               # Zero mm6
-
-        punpcklbw               %mm6,%mm3               # Unpack to words and add
-        punpckhbw               %mm6,%mm7
-        paddusw                 %mm3,%mm7
-
-        movq                            %mm5,%mm3
-
-        punpcklbw               %mm6,%mm3               # Unpack to words and add
-        punpckhbw               %mm6,%mm5
-        paddusw                 %mm3,%mm5
-
-        paddusw                 %mm7,%mm0               # Add to the acumulator (mm0)
-        paddusw                 %mm5,%mm1               # Add to the acumulator (mm1)
-
-        movq                            %mm4,%mm3
-
-        punpcklbw               %mm6,%mm3               # Unpack to words and add
-        punpckhbw               %mm6,%mm4
-        movq                            %mm2,%mm5
-        paddusw                 %mm3,%mm4
-
-
-
-        punpcklbw               %mm6,%mm5               # Unpack to words and add
-        punpckhbw               %mm6,%mm2
-        paddusw                 %mm5,%mm2
-
-        # Loop termination
-
-        addl                            %edx,%esi               # update pointers to next row
-        paddusw                 %mm4,%mm0               # Add to the acumulator (mm0)
-        addl                            %edx,%edi
-        subl                            $2,%ecx
-        paddusw                 %mm2,%mm1               # Add to the acumulator (mm1)
-        testl                           %ecx,%ecx               # check rowsleft
-        jnz                             pix_abs16x16_mmx.nextrow
-
-        paddusw                 %mm1,%mm0
-        movq                            %mm0,%mm2               # Copy mm0 to mm2
-        psrlq                           $32,%mm2
-        paddusw                 %mm2,%mm0               # Add 
-        movq                            %mm0,%mm3
-        psrlq                           $16,%mm3
-        paddusw                 %mm3,%mm0
-        movd                            %mm0,%eax               # Store return value
-        andl                            $0xffff,%eax
-
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-
-        popl %ebp                                               # restore stack pointer
-
-        #emms                                                           ; clear mmx registers
-        ret                                                             # return
-
-.global pix_abs16x16_sse
-
-# int  pix_abs16x16_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
-# esi = p1 (init:               blk1)
-# edi = p2 (init:               blk2)
-# ecx = rowsleft (init:  h)
-# edx = lx;
-
-# mm0 = distance accumulators (4 words)
-# mm1 = distance accumulators (4 words) 
-# mm2 = temp 
-# mm3 = temp
-# mm4 = temp
-# mm5 = temp 
-# mm6 = temp
-# mm7 = temp
-
-
-.align 32
-pix_abs16x16_sse: 
-        pushl                           %ebp                                    # save frame pointer
-        movl                            %esp,%ebp
-
-        pushl                           %ebx                                    # Saves registers (called saves convention in
-        pushl                           %ecx                                    # x86 GCC it seems)
-        pushl                           %edx                                    # 
-        pushl                           %esi
-        pushl                           %edi
-
-        pxor                            %mm0,%mm0                               # zero acculumators
-        pxor                            %mm1,%mm1
-        movl                            8(%ebp),%esi            # get pix1
-        movl                            12(%ebp),%edi           # get pix2
-        movl                            16(%ebp),%edx           # get lx
-        movl                            20(%ebp),%ecx           # get rowsleft
-        jmp                             pix_abs16x16_sse.next4row
-.align 32
-
-pix_abs16x16_sse.next4row: 
-        # First row
-
-        movq                            (%edi),%mm4             # load first 8 bytes of pix2 row 
-        movq                            8(%edi),%mm5    # load last 8 bytes of pix2 row
-        psadbw                  (%esi),%mm4             # SAD of first 8 bytes
-        psadbw                  8(%esi),%mm5    # SAD of last 8 bytes
-        paddw                           %mm4,%mm0                       # Add to acumulators
-        paddw                           %mm5,%mm1
-
-        # Second row    
-
-        addl                            %edx,%edi #
-        addl                            %edx,%esi #
-
-        movq                            (%edi),%mm6             # load first 8 bytes of pix2 row 
-        movq                            8(%edi),%mm7    # load last 8 bytes of pix2 row
-        psadbw                  (%esi),%mm6             # SAD of first 8 bytes
-        psadbw                  8(%esi),%mm7    # SAD of last 8 bytes
-        paddw                           %mm6,%mm0                       # Add to acumulators
-        paddw                           %mm7,%mm1
-
-        # Third row
-
-        addl                            %edx,%edi #
-        addl                            %edx,%esi #
-
-        movq                            (%edi),%mm4             # load first 8 bytes of pix2 row 
-        movq                            8(%edi),%mm5    # load last 8 bytes of pix2 row
-        psadbw                  (%esi),%mm4             # SAD of first 8 bytes
-        psadbw                  8(%esi),%mm5    # SAD of last 8 bytes
-        paddw                           %mm4,%mm0                       # Add to acumulators
-        paddw                           %mm5,%mm1
-
-        # Fourth row    
-
-        addl                            %edx,%edi #
-        addl                            %edx,%esi #
-
-        movq                            (%edi),%mm6             # load first 8 bytes of pix2 row 
-        movq                            8(%edi),%mm7    # load last 8 bytes of pix2 row
-        psadbw                  (%esi),%mm6             # SAD of first 8 bytes
-        psadbw                  8(%esi),%mm7    # SAD of last 8 bytes
-        paddw                           %mm6,%mm0                       # Add to acumulators
-        paddw                           %mm7,%mm1
-
-        # Loop termination
-
-        addl                            %edx,%esi               # update pointers to next row
-        addl                            %edx,%edi
-        subl                            $4,%ecx
-        testl                           %ecx,%ecx               # check rowsleft
-        jnz                             pix_abs16x16_sse.next4row
-
-        paddd                           %mm1,%mm0               # Sum acumulators
-        movd                            %mm0,%eax               # Store return value
-
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-
-        popl %ebp                                               # restore stack pointer
-
-        #emms                                                           ; clear mmx registers
-        ret                                                             # return
-
-.global pix_abs16x16_x2_mmx
-
-# int  pix_abs16x16_x2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
-# esi = p1 (init:               blk1)
-# edi = p2 (init:               blk2)
-# ecx = rowsleft (init:  h)
-# edx = lx;
-
-# mm0 = distance accumulators (4 words)
-# mm1 = distance accumulators (4 words) 
-# mm2 = temp 
-# mm3 = temp
-# mm4 = temp
-# mm5 = temp 
-# mm6 = 0
-# mm7 = temp
-
-
-.align 32
-pix_abs16x16_x2_mmx: 
-        pushl                           %ebp                                    # save frame pointer
-        movl                            %esp,%ebp
-
-        pushl                           %ebx                                    # Saves registers (called saves convention in
-        pushl                           %ecx                                    # x86 GCC it seems)
-        pushl                           %edx                                    # 
-        pushl                           %esi
-        pushl                           %edi
-
-        pxor                            %mm0,%mm0                               # zero acculumators
-        pxor                            %mm1,%mm1
-        pxor                            %mm6,%mm6
-        movl                            8(%ebp),%esi            # get pix1
-        movl                            12(%ebp),%edi           # get pix2
-        movl                            16(%ebp),%edx           # get lx
-        movl                            20(%ebp),%ecx           # get rowsleft
-        jmp                             pix_abs16x16_x2_mmx.nextrow_x2
-.align 32
-
-pix_abs16x16_x2_mmx.nextrow_x2: 
-        # First 8 bytes of the row
-
-        movq                            (%edi),%mm4                     # load first 8 bytes of pix2 row 
-        movq                            1(%edi),%mm5            # load bytes 1-8 of pix2 row
-
-        movq                            %mm4,%mm2               # copy mm4 on mm2
-        movq                            %mm5,%mm3               # copy mm5 on mm3
-        punpcklbw               %mm6,%mm4               # first 4 bytes of [edi] on mm4
-        punpcklbw               %mm6,%mm5               # first 4 bytes of [edi+1] on mm5
-        paddusw                 %mm5,%mm4               # mm4 := first 4 bytes interpolated in words
-        psrlw                           $1,%mm4
-
-        punpckhbw               %mm6,%mm2               # last 4 bytes of [edi] on mm2
-        punpckhbw               %mm6,%mm3               # last 4 bytes of [edi+1] on mm3
-        paddusw                 %mm3,%mm2               # mm2 := last 4 bytes interpolated in words
-        psrlw                           $1,%mm2
-
-        packuswb                        %mm2,%mm4       # pack 8 bytes interpolated on mm4
-        movq                            (%esi),%mm5     # load first 8 bytes of pix1 row
-
-        movq                            %mm4,%mm3               # mm4 := abs(mm4-mm5)
-        psubusb                 %mm5,%mm4
-        psubusb                 %mm3,%mm5
-        por                             %mm5,%mm4
-
-        # Last 8 bytes of the row       
-
-        movq    8(%edi),%mm7                    # load last 8 bytes of pix2 row 
-        movq    9(%edi),%mm5                    # load bytes 10-17 of pix2 row
-
-        movq                            %mm7,%mm2               # copy mm7 on mm2
-        movq                            %mm5,%mm3               # copy mm5 on mm3
-        punpcklbw               %mm6,%mm7               # first 4 bytes of [edi+8] on mm7
-        punpcklbw               %mm6,%mm5               # first 4 bytes of [edi+9] on mm5
-        paddusw                 %mm5,%mm7               # mm1 := first 4 bytes interpolated in words
-        psrlw                           $1,%mm7
-
-        punpckhbw               %mm6,%mm2               # last 4 bytes of [edi] on mm2
-        punpckhbw               %mm6,%mm3               # last 4 bytes of [edi+1] on mm3
-        paddusw                 %mm3,%mm2               # mm2 := last 4 bytes interpolated in words
-        psrlw                           $1,%mm2
-
-        packuswb                        %mm2,%mm7       # pack 8 bytes interpolated on mm1
-        movq                            8(%esi),%mm5    # load last 8 bytes of pix1 row
-
-        movq                            %mm7,%mm3               # mm7 := abs(mm1-mm5)
-        psubusb                 %mm5,%mm7
-        psubusb                 %mm3,%mm5
-        por                             %mm5,%mm7
-
-        # Now mm4 and mm7 have 16 absdiffs to add
-
-        movq                            %mm4,%mm3               # Make copies of these bytes
-        movq                            %mm7,%mm2
-
-        punpcklbw               %mm6,%mm4               # Unpack to words and add
-        punpcklbw               %mm6,%mm7
-        paddusw                 %mm7,%mm4
-        paddusw                 %mm4,%mm0               # Add to the acumulator (mm0)
-
-        punpckhbw               %mm6,%mm3               # Unpack to words and add
-        punpckhbw               %mm6,%mm2
-        paddusw                 %mm2,%mm3
-        paddusw                 %mm3,%mm1               # Add to the acumulator (mm1)
-
-        # Loop termination
-
-        addl                            %edx,%esi               # update pointers to next row
-        addl                            %edx,%edi
-
-        subl                            $1,%ecx
-        testl                           %ecx,%ecx               # check rowsleft
-        jnz                             pix_abs16x16_x2_mmx.nextrow_x2
-
-        paddusw                 %mm1,%mm0
-
-        movq                            %mm0,%mm1               # Copy mm0 to mm1
-        psrlq                           $32,%mm1
-        paddusw                 %mm1,%mm0               # Add 
-        movq                            %mm0,%mm2
-        psrlq                           $16,%mm2
-        paddusw                 %mm2,%mm0
-        movd                            %mm0,%eax               # Store return value
-        andl                            $0xffff,%eax
-
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-
-        popl %ebp                                               # restore stack pointer
-
-        emms                                                            # clear mmx registers
-        ret                                                             # return
-
-.global pix_abs16x16_y2_mmx
-
-# int  pix_abs16x16_y2_mmx(unsigned char *pix1,unsigned char *pix2, int lx, int h);
-# esi = p1 (init:               blk1)
-# edi = p2 (init:               blk2)
-# ebx = p2 + lx
-# ecx = rowsleft (init:  h)
-# edx = lx;
-
-# mm0 = distance accumulators (4 words)
-# mm1 = distance accumulators (4 words) 
-# mm2 = temp 
-# mm3 = temp
-# mm4 = temp
-# mm5 = temp 
-# mm6 = 0
-# mm7 = temp
-
-
-.align 32
-pix_abs16x16_y2_mmx: 
-        pushl                           %ebp                                    # save frame pointer
-        movl                            %esp,%ebp
-
-        pushl                           %ebx                                    # Saves registers (called saves convention in
-        pushl                           %ecx                                    # x86 GCC it seems)
-        pushl                           %edx                                    # 
-        pushl                           %esi
-        pushl                           %edi
-
-        pxor                            %mm0,%mm0                               # zero acculumators
-        pxor                            %mm1,%mm1
-        pxor                            %mm6,%mm6
-        movl                            8(%ebp),%esi            # get pix1
-        movl                            12(%ebp),%edi           # get pix2
-        movl                            16(%ebp),%edx           # get lx
-        movl                            20(%ebp),%ecx           # get rowsleft
-        movl                            %edi,%ebx
-        addl                            %edx,%ebx
-        jmp                             pix_abs16x16_y2_mmx.nextrow_y2
-.align 32
-
-pix_abs16x16_y2_mmx.nextrow_y2: 
-        # First 8 bytes of the row
-
-        movq                            (%edi),%mm4                     # load first 8 bytes of pix2 row 
-        movq                            (%ebx),%mm5                     # load bytes 1-8 of pix2 row
-
-        movq                            %mm4,%mm2               # copy mm4 on mm2
-        movq                            %mm5,%mm3               # copy mm5 on mm3
-        punpcklbw               %mm6,%mm4               # first 4 bytes of [edi] on mm4
-        punpcklbw               %mm6,%mm5               # first 4 bytes of [ebx] on mm5
-        paddusw                 %mm5,%mm4               # mm4 := first 4 bytes interpolated in words
-        psrlw                           $1,%mm4
-
-        punpckhbw               %mm6,%mm2               # last 4 bytes of [edi] on mm2
-        punpckhbw               %mm6,%mm3               # last 4 bytes of [edi+1] on mm3
-        paddusw                 %mm3,%mm2               # mm2 := last 4 bytes interpolated in words
-        psrlw                           $1,%mm2
-
-        packuswb                        %mm2,%mm4       # pack 8 bytes interpolated on mm4
-        movq                            (%esi),%mm5     # load first 8 bytes of pix1 row
-
-        movq                            %mm4,%mm3               # mm4 := abs(mm4-mm5)
-        psubusb                 %mm5,%mm4
-        psubusb                 %mm3,%mm5
-        por                             %mm5,%mm4
-
-        # Last 8 bytes of the row       
-
-        movq    8(%edi),%mm7                    # load last 8 bytes of pix2 row 
-        movq    8(%ebx),%mm5                    # load bytes 10-17 of pix2 row
-
-        movq                            %mm7,%mm2               # copy mm7 on mm2
-        movq                            %mm5,%mm3               # copy mm5 on mm3
-        punpcklbw               %mm6,%mm7               # first 4 bytes of [edi+8] on mm7
-        punpcklbw               %mm6,%mm5               # first 4 bytes of [ebx+8] on mm5
-        paddusw                 %mm5,%mm7               # mm1 := first 4 bytes interpolated in words
-        psrlw                           $1,%mm7
-
-        punpckhbw               %mm6,%mm2               # last 4 bytes of [edi+8] on mm2
-        punpckhbw               %mm6,%mm3               # last 4 bytes of [ebx+8] on mm3
-        paddusw                 %mm3,%mm2               # mm2 := last 4 bytes interpolated in words
-        psrlw                           $1,%mm2
-
-        packuswb                        %mm2,%mm7       # pack 8 bytes interpolated on mm1
-        movq                            8(%esi),%mm5    # load last 8 bytes of pix1 row
-
-        movq                            %mm7,%mm3               # mm7 := abs(mm1-mm5)
-        psubusb                 %mm5,%mm7
-        psubusb                 %mm3,%mm5
-        por                             %mm5,%mm7
-
-        # Now mm4 and mm7 have 16 absdiffs to add
-
-        movq                            %mm4,%mm3               # Make copies of these bytes
-        movq                            %mm7,%mm2
-
-        punpcklbw               %mm6,%mm4               # Unpack to words and add
-        punpcklbw               %mm6,%mm7
-        paddusw                 %mm7,%mm4
-        paddusw                 %mm4,%mm0               # Add to the acumulator (mm0)
-
-        punpckhbw               %mm6,%mm3               # Unpack to words and add
-        punpckhbw               %mm6,%mm2
-        paddusw                 %mm2,%mm3
-        paddusw                 %mm3,%mm1               # Add to the acumulator (mm1)
-
-        # Loop termination
-
-        addl                            %edx,%esi               # update pointers to next row
-        addl                            %edx,%edi
-        addl                            %edx,%ebx
-        subl                            $1,%ecx
-        testl                           %ecx,%ecx               # check rowsleft
-        jnz                             pix_abs16x16_y2_mmx.nextrow_y2
-
-        paddusw                 %mm1,%mm0
-
-        movq                            %mm0,%mm1               # Copy mm0 to mm1
-        psrlq                           $32,%mm1
-        paddusw                 %mm1,%mm0               # Add 
-        movq                            %mm0,%mm2
-        psrlq                           $16,%mm2
-        paddusw                 %mm2,%mm0
-        movd                            %mm0,%eax               # Store return value
-        andl                            $0xffff,%eax
-
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-
-        popl %ebp                                               # restore stack pointer
-
-        emms                                                            # clear mmx registers
-        ret                                                             # return
-
-.global pix_abs16x16_xy2_mmx
-
-# int pix_abs16x16_xy2_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
-
-# esi = p1 (init:               blk1)
-# edi = p2 (init:               blk2)
-# ebx = p1+lx
-# ecx = rowsleft (init:  h)
-# edx = lx;
-
-# mm0 = distance accumulators (4 words)
-# mm1 = bytes p2
-# mm2 = bytes p1
-# mm3 = bytes p1+lx
-# I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
-# but I don't think thats going to happen in iA32-land...
-# mm4 = temp 4 bytes in words interpolating p1, p1+1
-# mm5 = temp 4 bytes in words from p2
-# mm6 = temp comparison bit mask p1,p2
-# mm7 = temp comparison bit mask p2,p1
-
-
-.align 32
-pix_abs16x16_xy2_mmx: 
-        pushl %ebp              # save stack pointer
-        movl %esp,%ebp  # so that we can do this
-
-        pushl %ebx              # Saves registers (called saves convention in
-        pushl %ecx              # x86 GCC it seems)
-        pushl %edx              # 
-        pushl %esi
-        pushl %edi
-
-        pxor %mm0,%mm0                          # zero acculumators
-
-        movl 12(%ebp),%esi                      # get p1
-        movl 8(%ebp),%edi                       # get p2
-        movl 16(%ebp),%edx                      # get lx
-        movl 20(%ebp),%ecx                      # rowsleft := h
-        movl %esi,%ebx
-    addl %edx,%ebx
-        jmp pix_abs16x16_xy2_mmx.nextrowmm11                    # snap to it
-.align 32
-pix_abs16x16_xy2_mmx.nextrowmm11: 
-
-                ## 
-                ## First 8 bytes of row
-                ## 
-
-                ## First 4 bytes of 8
-
-        movq (%esi),%mm4            # mm4 := first 4 bytes p1
-        pxor %mm7,%mm7
-        movq %mm4,%mm2                          #  mm2 records all 8 bytes
-        punpcklbw %mm7,%mm4           #  First 4 bytes p1 in Words...
-
-        movq (%ebx),%mm6                    #  mm6 := first 4 bytes p1+lx
-        movq %mm6,%mm3              #  mm3 records all 8 bytes
-        punpcklbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-
-        movq 1(%esi),%mm5                       # mm5 := first 4 bytes p1+1
-        punpcklbw %mm7,%mm5           #  First 4 bytes p1 in Words...
-        paddw %mm5,%mm4
-        movq 1(%ebx),%mm6           #  mm6 := first 4 bytes p1+lx+1
-        punpcklbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-        psrlw $2,%mm4               # mm4 := First 4 bytes interpolated in words
-
-        movq (%edi),%mm5                        # mm5:=first 4 bytes of p2 in words
-        movq %mm5,%mm1
-        punpcklbw %mm7,%mm5
-
-        movq  %mm4,%mm7
-        pcmpgtw %mm5,%mm7       # mm7 := [i : W0..3,mm4>mm5]
-
-        movq  %mm4,%mm6         # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
-        psubw %mm5,%mm6
-        pand  %mm7,%mm6
-
-        paddw %mm6,%mm0                         # Add to accumulator
-
-        movq  %mm5,%mm6     # mm6 := [i : W0..3,mm5>mm4]
-        pcmpgtw %mm4,%mm6
-        psubw %mm4,%mm5         # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
-        pand  %mm6,%mm5
-
-        paddw %mm5,%mm0                         # Add to accumulator
-
-                ## Second 4 bytes of 8
-
-        movq %mm2,%mm4              # mm4 := Second 4 bytes p1 in words
-        pxor  %mm7,%mm7
-        punpckhbw %mm7,%mm4
-        movq %mm3,%mm6                  # mm6 := Second 4 bytes p1+1 in words  
-        punpckhbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-        movq 1(%esi),%mm5                       # mm5 := first 4 bytes p1+1
-        punpckhbw %mm7,%mm5         #  First 4 bytes p1 in Words...
-        paddw %mm5,%mm4
-        movq 1(%ebx),%mm6           #  mm6 := first 4 bytes p1+lx+1
-        punpckhbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-        psrlw $2,%mm4               # mm4 := First 4 bytes interpolated in words
-
-        movq %mm1,%mm5                  # mm5:= second 4 bytes of p2 in words
-        punpckhbw %mm7,%mm5
-
-        movq  %mm4,%mm7
-        pcmpgtw %mm5,%mm7       # mm7 := [i : W0..3,mm4>mm5]
-
-        movq  %mm4,%mm6         # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
-        psubw %mm5,%mm6
-        pand  %mm7,%mm6
-
-        paddw %mm6,%mm0                         # Add to accumulator
-
-        movq  %mm5,%mm6     # mm6 := [i : W0..3,mm5>mm4]
-        pcmpgtw %mm4,%mm6
-        psubw %mm4,%mm5         # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
-        pand  %mm6,%mm5
-
-        paddw %mm5,%mm0                         # Add to accumulator
-
-
-                ##
-                ## Second 8 bytes of row
-                ## 
-                ## First 4 bytes of 8
-
-        movq 8(%esi),%mm4             # mm4 := first 4 bytes p1+8
-        pxor %mm7,%mm7
-        movq %mm4,%mm2                          #  mm2 records all 8 bytes
-        punpcklbw %mm7,%mm4           #  First 4 bytes p1 in Words...
-
-        movq 8(%ebx),%mm6                           #  mm6 := first 4 bytes p1+lx+8
-        movq %mm6,%mm3              #  mm3 records all 8 bytes
-        punpcklbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-
-        movq 9(%esi),%mm5                       # mm5 := first 4 bytes p1+9
-        punpcklbw %mm7,%mm5           #  First 4 bytes p1 in Words...
-        paddw %mm5,%mm4
-        movq 9(%ebx),%mm6           #  mm6 := first 4 bytes p1+lx+9
-        punpcklbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-        psrlw $2,%mm4               # mm4 := First 4 bytes interpolated in words
-
-        movq 8(%edi),%mm5                               # mm5:=first 4 bytes of p2+8 in words
-        movq %mm5,%mm1
-        punpcklbw %mm7,%mm5
-
-        movq  %mm4,%mm7
-        pcmpgtw %mm5,%mm7       # mm7 := [i : W0..3,mm4>mm5]
-
-        movq  %mm4,%mm6         # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
-        psubw %mm5,%mm6
-        pand  %mm7,%mm6
-
-        paddw %mm6,%mm0                         # Add to accumulator
-
-        movq  %mm5,%mm6     # mm6 := [i : W0..3,mm5>mm4]
-        pcmpgtw %mm4,%mm6
-        psubw %mm4,%mm5         # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
-        pand  %mm6,%mm5
-
-        paddw %mm5,%mm0                         # Add to accumulator
-
-                ## Second 4 bytes of 8
-
-        movq %mm2,%mm4              # mm4 := Second 4 bytes p1 in words
-        pxor  %mm7,%mm7
-        punpckhbw %mm7,%mm4
-        movq %mm3,%mm6                  # mm6 := Second 4 bytes p1+1 in words  
-        punpckhbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-        movq 9(%esi),%mm5                       # mm5 := first 4 bytes p1+1
-        punpckhbw %mm7,%mm5         #  First 4 bytes p1 in Words...
-        paddw %mm5,%mm4
-        movq 9(%ebx),%mm6           #  mm6 := first 4 bytes p1+lx+1
-        punpckhbw %mm7,%mm6
-        paddw %mm6,%mm4
-
-        psrlw $2,%mm4               # mm4 := First 4 bytes interpolated in words
-
-        movq %mm1,%mm5                  # mm5:= second 4 bytes of p2 in words
-        punpckhbw %mm7,%mm5
-
-        movq  %mm4,%mm7
-        pcmpgtw %mm5,%mm7       # mm7 := [i : W0..3,mm4>mm5]
-
-        movq  %mm4,%mm6         # mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
-        psubw %mm5,%mm6
-        pand  %mm7,%mm6
-
-        paddw %mm6,%mm0                         # Add to accumulator
-
-        movq  %mm5,%mm6     # mm6 := [i : W0..3,mm5>mm4]
-        pcmpgtw %mm4,%mm6
-        psubw %mm4,%mm5         # mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
-        pand  %mm6,%mm5
-
-        paddw %mm5,%mm0                         # Add to accumulator
-
-
-                ##
-                ##      Loop termination condition... and stepping
-                ##              
-
-        addl %edx,%esi          # update pointer to next row
-        addl %edx,%edi          # ditto
-        addl %edx,%ebx
-
-        subl $1,%ecx
-        testl %ecx,%ecx         # check rowsleft
-        jnz pix_abs16x16_xy2_mmx.nextrowmm11
-
-                ## Sum the Accumulators
-        movq  %mm0,%mm4
-        psrlq $32,%mm4
-        paddw %mm4,%mm0
-        movq  %mm0,%mm6
-        psrlq $16,%mm6
-        paddw %mm6,%mm0
-        movd %mm0,%eax          # store return value
-        andl $0xffff,%eax
-
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-
-        popl %ebp               # restore stack pointer
-
-        emms                    # clear mmx registers
-        ret                     # we now return you to your regular programming
-
-
-
diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c
new file mode 100644
index 000000000..5459b81b3
--- /dev/null
+++ b/src/libffmpeg/libavcodec/simple_idct.c
@@ -0,0 +1,231 @@
+/*
+    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+/*
+  based upon some outcommented c code from mpeg2dec (idct_mmx.c written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
+*/
+
+#include <inttypes.h>
+
+#include "simple_idct.h"
+
+#if 0
+#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
+#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
+#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
+#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
+#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
+#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
+#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
+#define ROW_SHIFT 8
+#define COL_SHIFT 17
+#else
+#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16384  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define ROW_SHIFT 11
+#define COL_SHIFT 20 // 6
+#endif
+#if 1
+static void inline idctRow (int16_t * row)
+{
+	int a0, a1, a2, a3, b0, b1, b2, b3;
+	const int C1 =W1;
+	const int C2 =W2;
+	const int C3 =W3;
+	const int C4 =W4;
+	const int C5 =W5;
+	const int C6 =W6;
+	const int C7 =W7;
+
+	if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7])) {
+		row[0] = row[1] = row[2] = row[3] = row[4] =
+			row[5] = row[6] = row[7] = row[0]<<3;
+		return;
+	}
+
+	a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
+	a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
+	a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
+	a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
+
+	b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+	b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+	b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+	b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+	row[0] = (a0 + b0) >> ROW_SHIFT;
+	row[1] = (a1 + b1) >> ROW_SHIFT;
+	row[2] = (a2 + b2) >> ROW_SHIFT;
+	row[3] = (a3 + b3) >> ROW_SHIFT;
+	row[4] = (a3 - b3) >> ROW_SHIFT;
+	row[5] = (a2 - b2) >> ROW_SHIFT;
+	row[6] = (a1 - b1) >> ROW_SHIFT;
+	row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+
+static void inline idctCol (int16_t * col)
+{
+	int a0, a1, a2, a3, b0, b1, b2, b3;
+	const int C1 =W1;
+	const int C2 =W2;
+	const int C3 =W3;
+	const int C4 =W4;
+	const int C5 =W5;
+	const int C6 =W6;
+	const int C7 =W7;
+/*
+	if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
+		col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
+			col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
+		return;
+	}*/
+	col[0] += (1<<(COL_SHIFT-1))/W4;
+	a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6];
+	a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6];
+	a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6];
+	a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6];
+
+	b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
+	b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
+	b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
+	b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
+
+	col[8*0] = (a0 + b0) >> COL_SHIFT;
+	col[8*1] = (a1 + b1) >> COL_SHIFT;
+	col[8*2] = (a2 + b2) >> COL_SHIFT;
+	col[8*3] = (a3 + b3) >> COL_SHIFT;
+	col[8*4] = (a3 - b3) >> COL_SHIFT;
+	col[8*5] = (a2 - b2) >> COL_SHIFT;
+	col[8*6] = (a1 - b1) >> COL_SHIFT;
+	col[8*7] = (a0 - b0) >> COL_SHIFT;
+}
+
+void simple_idct (short *block)
+{
+	int i;
+	for(i=0; i<8; i++)
+		idctRow(block + 8*i);
+
+	for(i=0; i<8; i++)
+		idctCol(block + i);
+
+}
+
+#else
+
+#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16384  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define COL_SHIFT 31 // 6
+
+static void inline idctRow (int32_t *out, int16_t * row)
+{
+	int a0, a1, a2, a3, b0, b1, b2, b3;
+	const int C1 =W1;
+	const int C2 =W2;
+	const int C3 =W3;
+	const int C4 =W4;
+	const int C5 =W5;
+	const int C6 =W6;
+	const int C7 =W7;
+/*
+	if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7])) {
+		row[0] = row[1] = row[2] = row[3] = row[4] =
+			row[5] = row[6] = row[7] = row[0]<<14;
+		return;
+	}
+*/
+	a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6];
+	a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6];
+	a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6];
+	a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6];
+
+	b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+	b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+	b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+	b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+	out[0] = (a0 + b0);
+	out[1] = (a1 + b1);
+	out[2] = (a2 + b2);
+	out[3] = (a3 + b3);
+	out[4] = (a3 - b3);
+	out[5] = (a2 - b2);
+	out[6] = (a1 - b1);
+	out[7] = (a0 - b0);
+}
+
+static void inline idctCol (int32_t *in, int16_t * col)
+{
+	int64_t a0, a1, a2, a3, b0, b1, b2, b3;
+	const int64_t C1 =W1;
+	const int64_t C2 =W2;
+	const int64_t C3 =W3;
+	const int64_t C4 =W4;
+	const int64_t C5 =W5;
+	const int64_t C6 =W6;
+	const int64_t C7 =W7;
+/*
+	if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
+		col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
+			col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
+		return;
+	}*/
+	in[0] += (1<<(COL_SHIFT-1))/W4;
+	a0 = C4*in[8*0] + C2*in[8*2] + C4*in[8*4] + C6*in[8*6];
+	a1 = C4*in[8*0] + C6*in[8*2] - C4*in[8*4] - C2*in[8*6];
+	a2 = C4*in[8*0] - C6*in[8*2] - C4*in[8*4] + C2*in[8*6];
+	a3 = C4*in[8*0] - C2*in[8*2] + C4*in[8*4] - C6*in[8*6];
+
+	b0 = C1*in[8*1] + C3*in[8*3] + C5*in[8*5] + C7*in[8*7];
+	b1 = C3*in[8*1] - C7*in[8*3] - C1*in[8*5] - C5*in[8*7];
+	b2 = C5*in[8*1] - C1*in[8*3] + C7*in[8*5] + C3*in[8*7];
+	b3 = C7*in[8*1] - C5*in[8*3] + C3*in[8*5] - C1*in[8*7];
+
+	col[8*0] = (a0 + b0) >> COL_SHIFT;
+	col[8*1] = (a1 + b1) >> COL_SHIFT;
+	col[8*2] = (a2 + b2) >> COL_SHIFT;
+	col[8*3] = (a3 + b3) >> COL_SHIFT;
+	col[8*4] = (a3 - b3) >> COL_SHIFT;
+	col[8*5] = (a2 - b2) >> COL_SHIFT;
+	col[8*6] = (a1 - b1) >> COL_SHIFT;
+	col[8*7] = (a0 - b0) >> COL_SHIFT;
+}
+
+void simple_idct (short *block)
+{
+	int i;
+	int32_t temp[64];
+	for(i=0; i<8; i++)
+		idctRow(temp+8*i, block + 8*i);
+
+	for(i=0; i<8; i++)
+		idctCol(temp+i, block + i);
+
+}
+
+#endif
diff --git a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h
new file mode 100644
index 000000000..54dff7396
--- /dev/null
+++ b/src/libffmpeg/libavcodec/simple_idct.h
@@ -0,0 +1,20 @@
+/*
+    Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+void simple_idct(short *block);
+void simple_idct_mmx(short *block);
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index 26510d87c..aef27e342 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -16,19 +16,36 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
-#include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <errno.h>
 #include "common.h"
 #include "dsputil.h"
 #include "avcodec.h"
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#else
+#include <stdlib.h>
+#endif
 
 /* memory alloc */
 void *av_mallocz(int size)
 {
     void *ptr;
+#if defined ( ARCH_X86 ) && defined ( HAVE_MEMALIGN )
+    ptr = memalign(64,size);
+    /* Why 64? 
+       Indeed, we should align it:
+         on 4 for 386
+         on 16 for 486
+	 on 32 for 586, PPro - k6-III
+	 on 64 for K7 (maybe for P3 too).
+       Because L1 and L2 caches are aligned on those values.
+       But I don't want to code such logic here!
+     */
+#else
     ptr = malloc(size);
+#endif
     if (!ptr)
         return NULL;
     memset(ptr, 0, size);
@@ -138,6 +155,18 @@ AVCodec *avcodec_find_encoder(enum CodecID id)
     return NULL;
 }
 
+AVCodec *avcodec_find_encoder_by_name(const char *name)
+{
+    AVCodec *p;
+    p = first_avcodec;
+    while (p) {
+        if (p->encode != NULL && strcmp(name,p->name) == 0)
+            return p;
+        p = p->next;
+    }
+    return NULL;
+}
+
 AVCodec *avcodec_find_decoder(enum CodecID id)
 {
     AVCodec *p;
@@ -188,6 +217,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
     const char *codec_name;
     AVCodec *p;
     char buf1[32];
+    int bitrate;
 
     if (encode)
         p = avcodec_find_encoder(enc->codec_id);
@@ -228,6 +258,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
                      enc->width, enc->height, 
                      (float)enc->frame_rate / FRAME_RATE_BASE);
         }
+        bitrate = enc->bit_rate;
         break;
     case CODEC_TYPE_AUDIO:
         snprintf(buf, buf_size,
@@ -239,13 +270,31 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
                      enc->sample_rate,
                      enc->channels == 2 ? "stereo" : "mono");
         }
+        /* for PCM codecs, compute bitrate directly */
+        switch(enc->codec_id) {
+        case CODEC_ID_PCM_S16LE:
+        case CODEC_ID_PCM_S16BE:
+        case CODEC_ID_PCM_U16LE:
+        case CODEC_ID_PCM_U16BE:
+            bitrate = enc->sample_rate * enc->channels * 16;
+            break;
+        case CODEC_ID_PCM_S8:
+        case CODEC_ID_PCM_U8:
+        case CODEC_ID_PCM_ALAW:
+        case CODEC_ID_PCM_MULAW:
+            bitrate = enc->sample_rate * enc->channels * 8;
+            break;
+        default:
+            bitrate = enc->bit_rate;
+            break;
+        }
         break;
     default:
         abort();
     }
-    if (enc->bit_rate != 0) {
+    if (bitrate != 0) {
         snprintf(buf + strlen(buf), buf_size - strlen(buf), 
-                 ", %d kb/s", enc->bit_rate / 1000);
+                 ", %d kb/s", bitrate / 1000);
     }
 }
 
@@ -353,7 +402,6 @@ void avcodec_register_all(void)
     register_avcodec(&mpeg4_encoder);
     register_avcodec(&msmpeg4_encoder);
 #endif /* CONFIG_ENCODERS */
-    register_avcodec(&pcm_codec);
     register_avcodec(&rawvideo_codec);
 
     /* decoders */
@@ -365,13 +413,32 @@ void avcodec_register_all(void)
     register_avcodec(&h263i_decoder);
     register_avcodec(&rv10_decoder);
     register_avcodec(&mjpeg_decoder);
-#ifdef CONFIG_MPGLIB
+#ifdef FF_AUDIO_CODECS
     register_avcodec(&mp3_decoder);
-#endif
 #ifdef CONFIG_AC3
     register_avcodec(&ac3_decoder);
 #endif
+#endif
 #endif /* CONFIG_DECODERS */
+
+#ifdef FF_AUDIO_CODECS
+    /* pcm codecs */
+
+#define PCM_CODEC(id, name) \
+    register_avcodec(& name ## _encoder); \
+    register_avcodec(& name ## _decoder); \
+
+PCM_CODEC(CODEC_ID_PCM_S16LE, pcm_s16le);
+PCM_CODEC(CODEC_ID_PCM_S16BE, pcm_s16be);
+PCM_CODEC(CODEC_ID_PCM_U16LE, pcm_u16le);
+PCM_CODEC(CODEC_ID_PCM_U16BE, pcm_u16be);
+PCM_CODEC(CODEC_ID_PCM_S8, pcm_s8);
+PCM_CODEC(CODEC_ID_PCM_U8, pcm_u8);
+PCM_CODEC(CODEC_ID_PCM_ALAW, pcm_alaw);
+PCM_CODEC(CODEC_ID_PCM_MULAW, pcm_mulaw);
+
+#undef PCM_CODEC
+#endif
 }
 
 static int encode_init(AVCodecContext *s)
@@ -392,18 +459,6 @@ static int encode_frame(AVCodecContext *avctx,
     return -1;
 }
 
-/* dummy pcm codec */
-AVCodec pcm_codec = {
-    "pcm",
-    CODEC_TYPE_AUDIO,
-    CODEC_ID_PCM,
-    0,
-    encode_init,
-    encode_frame,
-    NULL,
-    decode_frame,
-};
-
 AVCodec rawvideo_codec = {
     "rawvideo",
     CODEC_TYPE_VIDEO,