| author | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2003-01-31 18:29:43 +0000 |
|---|---|---|
| committer | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2003-01-31 18:29:43 +0000 |
| commit | 5350f2b7701f01bc4f234d3971fb8a623a8cd72a (patch) | |
| tree | 5f6cd350778863ad8d2612bce4ac2f6270919115 | |
| parent | 8b0e8647a0d0c279b6a355362452dff4bd6f5c05 (diff) | |
| download | xine-lib-5350f2b7701f01bc4f234d3971fb8a623a8cd72a.tar.gz, xine-lib-5350f2b7701f01bc4f234d3971fb8a623a8cd72a.tar.bz2 | |
update ffmpeg
CVS patchset: 4068
CVS date: 2003/01/31 18:29:43
46 files changed, 4210 insertions, 1710 deletions
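A note on the most invasive API change carried by this update (a minimal caller-side sketch, not part of the diff itself): `init_get_bits()` now takes the buffer size in bits instead of bytes, and the `GetBitContext.size` field becomes `size_in_bits`, which is why every caller in the diff below multiplies its byte count by 8. The `parse_header()` wrapper and its arguments are hypothetical, used only to show the adjustment:

```c
#include "common.h"   /* libavcodec's GetBitContext / init_get_bits() */

/* Hypothetical caller illustrating the LIBAVCODEC_BUILD 4652 -> 4654 change. */
static int parse_header(UINT8 *buf, int buf_size /* in bytes */)
{
    GetBitContext gb;

    /* before this update (build 4652): size was passed in bytes */
    /* init_get_bits(&gb, buf, buf_size); */

    /* after this update (build 4654): size is passed in bits */
    init_get_bits(&gb, buf, buf_size * 8);

    return get_bits(&gb, 8);   /* e.g. dv.c reads its section id byte this way */
}
```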
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c index 706462a59..5cbc00167 100644 --- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c +++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c @@ -285,6 +285,16 @@ void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); } +static int sad16x16_mvi(void *s, uint8_t *a, uint8_t *b, int stride) +{ + return pix_abs16x16_mvi_asm(a, b, stride); +} + +static int sad8x8_mvi(void *s, uint8_t *a, uint8_t *b, int stride) +{ + return pix_abs8x8_mvi(a, b, stride); +} + void dsputil_init_alpha(DSPContext* c, unsigned mask) { c->put_pixels_tab[0][0] = put_pixels16_axp_asm; @@ -336,6 +346,8 @@ void dsputil_init_alpha(DSPContext* c, unsigned mask) c->get_pixels = get_pixels_mvi; c->diff_pixels = diff_pixels_mvi; + c->sad[0] = sad16x16_mvi; + c->sad[1] = sad8x8_mvi; c->pix_abs8x8 = pix_abs8x8_mvi; c->pix_abs16x16 = pix_abs16x16_mvi_asm; c->pix_abs16x16_x2 = pix_abs16x16_x2_mvi; diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h index fd7eafbd4..6ee2b84cd 100644 --- a/src/libffmpeg/libavcodec/avcodec.h +++ b/src/libffmpeg/libavcodec/avcodec.h @@ -1,12 +1,16 @@ #ifndef AVCODEC_H #define AVCODEC_H +#ifdef __cplusplus +extern "C" { +#endif + #include "common.h" #define LIBAVCODEC_VERSION_INT 0x000406 #define LIBAVCODEC_VERSION "0.4.6" -#define LIBAVCODEC_BUILD 4652 -#define LIBAVCODEC_BUILD_STR "4652" +#define LIBAVCODEC_BUILD 4654 +#define LIBAVCODEC_BUILD_STR "4654" enum CodecID { CODEC_ID_NONE, @@ -62,21 +66,19 @@ enum CodecType { enum PixelFormat { PIX_FMT_YUV420P, PIX_FMT_YUV422, - PIX_FMT_RGB24, - PIX_FMT_BGR24, + PIX_FMT_RGB24, /* 3 bytes, R is first */ + PIX_FMT_BGR24, /* 3 bytes, B is first */ PIX_FMT_YUV422P, PIX_FMT_YUV444P, - PIX_FMT_RGBA32, - PIX_FMT_BGRA32, + PIX_FMT_RGBA32, /* always stored in cpu endianness */ PIX_FMT_YUV410P, PIX_FMT_YUV411P, - PIX_FMT_RGB565, - PIX_FMT_RGB555, -// PIX_FMT_RGB5551, - PIX_FMT_BGR565, - PIX_FMT_BGR555, -// PIX_FMT_GBR565, -// PIX_FMT_GBR555 + PIX_FMT_RGB565, /* always stored in cpu endianness */ + PIX_FMT_RGB555, /* always stored in cpu endianness, most significant bit to 1 */ + PIX_FMT_GRAY8, + PIX_FMT_MONOWHITE, /* 0 is white */ + PIX_FMT_MONOBLACK, /* 0 is black */ + PIX_FMT_NB, }; /* currently unused, may be used if 24/32 bits samples ever supported */ @@ -520,6 +522,7 @@ typedef struct AVCodecContext { #define FF_BUG_NO_PADDING 16 #define FF_BUG_AC_VLC 32 #define FF_BUG_QPEL_CHROMA 64 +#define FF_BUG_STD_QPEL 128 //#define FF_BUG_FAKE_SCALABILITY 16 //autodetection should work 100% /** @@ -924,6 +927,16 @@ typedef struct AVCodecContext { */ int me_subpel_quality; + /** + * callback to negotiate the pixelFormat + * @param fmt is the list of formats which are supported by the codec, + * its terminated by -1 as 0 is a valid format, the formats are ordered by quality + * the first is allways the native one + * @return the choosen format + * encoding: unused + * decoding: set by user, if not set then the native format will always be choosen + */ + enum PixelFormat (*get_format)(struct AVCodecContext *s, enum PixelFormat * fmt); } AVCodecContext; typedef struct AVCodec { @@ -1048,10 +1061,11 @@ void img_resample(ImgReSampleContext *s, void img_resample_close(ImgReSampleContext *s); -void avpicture_fill(AVPicture *picture, UINT8 *ptr, - int pix_fmt, int width, int height); +int avpicture_fill(AVPicture *picture, UINT8 *ptr, + int pix_fmt, int width, int height); 
int avpicture_get_size(int pix_fmt, int width, int height); -void avcodec_get_chroma_sub_sample(int fmt, int *h_shift, int *v_shift); +void avcodec_get_chroma_sub_sample(int pix_fmt, int *h_shift, int *v_shift); +const char *avcodec_get_pix_fmt_name(int pix_fmt); /* convert among pixel formats */ int img_convert(AVPicture *dst, int dst_pix_fmt, @@ -1142,7 +1156,7 @@ typedef struct { const char* supported; } avc_config_t; -void avcodec_getopt(AVCodecContext* avctx, char* str, avc_config_t** config); +void avcodec_getopt(AVCodecContext* avctx, const char* str, avc_config_t** config); /** * Interface for 0.5.0 version @@ -1223,13 +1237,20 @@ int avcodec(void* handle, avc_cmd_t cmd, void* pin, void* pout); /* memory */ void *av_malloc(unsigned int size); void *av_mallocz(unsigned int size); +void *av_realloc(void *ptr, unsigned int size); void av_free(void *ptr); +char *av_strdup(const char *s); void __av_freep(void **ptr); #define av_freep(p) __av_freep((void **)(p)) +void *av_fast_realloc(void *ptr, int *size, int min_size); /* for static data only */ /* call av_free_static to release all staticaly allocated tables */ void av_free_static(void); void *__av_mallocz_static(void** location, unsigned int size); #define av_mallocz_static(p, s) __av_mallocz_static((void **)(p), s) +#ifdef __cplusplus +} +#endif + #endif /* AVCODEC_H */ diff --git a/src/libffmpeg/libavcodec/common.c b/src/libffmpeg/libavcodec/common.c index 40ba49811..aa766280b 100644 --- a/src/libffmpeg/libavcodec/common.c +++ b/src/libffmpeg/libavcodec/common.c @@ -27,6 +27,17 @@ const UINT8 ff_sqrt_tab[128]={ 9, 9, 9, 9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11 }; +const uint8_t ff_log2_tab[256]={ + 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + void init_put_bits(PutBitContext *s, UINT8 *buffer, int buffer_size, void *opaque, @@ -99,10 +110,12 @@ void put_string(PutBitContext * pbc, char *s) /* bit input functions */ void init_get_bits(GetBitContext *s, - UINT8 *buffer, int buffer_size) + UINT8 *buffer, int bit_size) { + const int buffer_size= (bit_size+7)>>3; + s->buffer= buffer; - s->size= buffer_size; + s->size_in_bits= bit_size; s->buffer_end= buffer + buffer_size; #ifdef ALT_BITSTREAM_READER s->index=0; @@ -169,8 +182,8 @@ static int alloc_table(VLC *vlc, int size) vlc->table_size += size; if (vlc->table_size > vlc->table_allocated) { vlc->table_allocated += (1 << vlc->bits); - vlc->table = realloc(vlc->table, - sizeof(VLC_TYPE) * 2 * vlc->table_allocated); + vlc->table = av_realloc(vlc->table, + sizeof(VLC_TYPE) * 2 * vlc->table_allocated); if (!vlc->table) return -1; } diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h index 405ddaa09..c2305b45e 100644 --- a/src/libffmpeg/libavcodec/common.h +++ b/src/libffmpeg/libavcodec/common.h @@ -88,8 +88,40 @@ typedef INT64 int64_t; # endif # define snprintf _snprintf +# define vsnprintf _vsnprintf -#else /* CONFIG_WIN32 */ +/* CONFIG_WIN32 end */ +#elif defined (CONFIG_OS2) +/* OS/2 EMX */ + +#include <inttypes.h> + +typedef 
unsigned char UINT8; +typedef unsigned short UINT16; +typedef unsigned int UINT32; +typedef unsigned long long UINT64; +typedef signed char INT8; +typedef signed short INT16; +typedef signed int INT32; +typedef signed long long INT64; + +#ifdef HAVE_AV_CONFIG_H + +#ifndef INT64_C +#define INT64_C(c) (c ## LL) +#define UINT64_C(c) (c ## ULL) +#endif + +#ifdef USE_FASTMEMCPY +#include "fastmemcpy.h" +#endif + +#include <float.h> + +#endif /* HAVE_AV_CONFIG_H */ + +/* CONFIG_OS2 end */ +#else /* unix */ @@ -119,7 +151,7 @@ typedef signed long long INT64; # endif # endif /* HAVE_AV_CONFIG_H */ -#endif /* !CONFIG_WIN32 */ +#endif /* !CONFIG_WIN32 && !CONFIG_OS2 */ #ifdef HAVE_AV_CONFIG_H @@ -238,7 +270,7 @@ typedef struct GetBitContext { UINT32 cache1; int bit_count; #endif - int size; + int size_in_bits; } GetBitContext; static inline int get_bits_count(GetBitContext *s); @@ -667,6 +699,12 @@ int init_vlc(VLC *vlc, int nb_bits, int nb_codes, const void *codes, int codes_wrap, int codes_size); void free_vlc(VLC *vlc); +/** + * + * if the vlc code is invalid and max_depth=1 than no bits will be removed + * if the vlc code is invalid and max_depth>1 than the number of bits removed + * is undefined + */ #define GET_VLC(code, name, gb, table, bits, max_depth)\ {\ int n, index, nb_bits;\ @@ -775,6 +813,7 @@ void print_stats(void); #endif /* misc math functions */ +extern const uint8_t ff_log2_tab[256]; static inline int av_log2(unsigned int v) { @@ -789,20 +828,26 @@ static inline int av_log2(unsigned int v) v >>= 8; n += 8; } - if (v & 0xf0) { - v >>= 4; - n += 4; - } - if (v & 0xc) { - v >>= 2; - n += 2; - } - if (v & 0x2) { - n++; + n += ff_log2_tab[v]; + + return n; +} + +static inline int av_log2_16bit(unsigned int v) +{ + int n; + + n = 0; + if (v & 0xff00) { + v >>= 8; + n += 8; } + n += ff_log2_tab[v]; + return n; } + /* median of 3 */ static inline int mid_pred(int a, int b, int c) { @@ -832,7 +877,7 @@ static inline int clip(int a, int amin, int amax) } /* math */ -extern const UINT8 ff_sqrt_tab[128]; +extern const uint8_t ff_sqrt_tab[128]; int ff_gcd(int a, int b); @@ -902,6 +947,11 @@ if((y)<(x)){\ #define CLAMP_TO_8BIT(d) ((d > 0xff) ? 0xff : (d < 0) ? 
0 : d) +/* avoid usage of various functions */ +#define malloc please_use_av_malloc +#define free please_use_av_free +#define realloc please_use_av_realloc + #endif /* HAVE_AV_CONFIG_H */ #endif /* COMMON_H */ diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c index 0d7556f65..06da93ba7 100644 --- a/src/libffmpeg/libavcodec/dsputil.c +++ b/src/libffmpeg/libavcodec/dsputil.c @@ -129,6 +129,7 @@ static int pix_norm1_c(UINT8 * pix, int line_size) s = 0; for (i = 0; i < 16; i++) { for (j = 0; j < 16; j += 8) { +#if 0 s += sq[pix[0]]; s += sq[pix[1]]; s += sq[pix[2]]; @@ -137,6 +138,30 @@ static int pix_norm1_c(UINT8 * pix, int line_size) s += sq[pix[5]]; s += sq[pix[6]]; s += sq[pix[7]]; +#else +#if LONG_MAX > 2147483647 + register uint64_t x=*(uint64_t*)pix; + s += sq[x&0xff]; + s += sq[(x>>8)&0xff]; + s += sq[(x>>16)&0xff]; + s += sq[(x>>24)&0xff]; + s += sq[(x>>32)&0xff]; + s += sq[(x>>40)&0xff]; + s += sq[(x>>48)&0xff]; + s += sq[(x>>56)&0xff]; +#else + register uint32_t x=*(uint32_t*)pix; + s += sq[x&0xff]; + s += sq[(x>>8)&0xff]; + s += sq[(x>>16)&0xff]; + s += sq[(x>>24)&0xff]; + x=*(uint32_t*)(pix+4); + s += sq[x&0xff]; + s += sq[(x>>8)&0xff]; + s += sq[(x>>16)&0xff]; + s += sq[(x>>24)&0xff]; +#endif +#endif pix += 8; } pix += line_size - 16; @@ -166,27 +191,32 @@ static int sse8_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) return s; } -static int sse16_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) +static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i, j; - UINT32 *sq = squareTbl + 256; + int s, i; + uint32_t *sq = squareTbl + 256; s = 0; for (i = 0; i < 16; i++) { - for (j = 0; j < 16; j += 8) { - s += sq[pix1[0] - pix2[0]]; - s += sq[pix1[1] - pix2[1]]; - s += sq[pix1[2] - pix2[2]]; - s += sq[pix1[3] - pix2[3]]; - s += sq[pix1[4] - pix2[4]]; - s += sq[pix1[5] - pix2[5]]; - s += sq[pix1[6] - pix2[6]]; - s += sq[pix1[7] - pix2[7]]; - pix1 += 8; - pix2 += 8; - } - pix1 += line_size - 16; - pix2 += line_size - 16; + s += sq[pix1[ 0] - pix2[ 0]]; + s += sq[pix1[ 1] - pix2[ 1]]; + s += sq[pix1[ 2] - pix2[ 2]]; + s += sq[pix1[ 3] - pix2[ 3]]; + s += sq[pix1[ 4] - pix2[ 4]]; + s += sq[pix1[ 5] - pix2[ 5]]; + s += sq[pix1[ 6] - pix2[ 6]]; + s += sq[pix1[ 7] - pix2[ 7]]; + s += sq[pix1[ 8] - pix2[ 8]]; + s += sq[pix1[ 9] - pix2[ 9]]; + s += sq[pix1[10] - pix2[10]]; + s += sq[pix1[11] - pix2[11]]; + s += sq[pix1[12] - pix2[12]]; + s += sq[pix1[13] - pix2[13]]; + s += sq[pix1[14] - pix2[14]]; + s += sq[pix1[15] - pix2[15]]; + + pix1 += line_size; + pix2 += line_size; } return s; } @@ -801,7 +831,8 @@ static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStrid }\ }\ \ -static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\ +static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride){\ + const int w=8;\ UINT8 *cm = cropTbl + MAX_NEG_CROP;\ int i;\ for(i=0; i<w; i++)\ @@ -923,107 +954,163 @@ static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 half[64];\ copy_block9(full, src, 16, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\ OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\ }\ \ static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ copy_block9(full, src, 16, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 
8);\ + OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\ }\ \ static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 half[64];\ copy_block9(full, src, 16, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\ OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\ }\ -static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\ +void ff_ ## OPNAME ## qpel8_mc11_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 halfH[72];\ UINT8 halfV[64];\ UINT8 halfHV[64];\ copy_block9(full, src, 16, stride, 9);\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ }\ -static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[16*9];\ + UINT8 halfH[72];\ + UINT8 halfHV[64];\ + copy_block9(full, src, 16, stride, 9);\ + put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ + put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\ +}\ +void ff_ ## OPNAME ## qpel8_mc31_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 halfH[72];\ UINT8 halfV[64];\ UINT8 halfHV[64];\ copy_block9(full, src, 16, stride, 9);\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ }\ -static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[16*9];\ + UINT8 halfH[72];\ + UINT8 halfHV[64];\ + copy_block9(full, src, 16, stride, 9);\ + put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ + put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\ +}\ +void ff_ ## OPNAME ## qpel8_mc13_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 halfH[72];\ UINT8 halfV[64];\ UINT8 halfHV[64];\ copy_block9(full, src, 16, stride, 9);\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ }\ -static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[16*9];\ + UINT8 halfH[72];\ + UINT8 halfHV[64];\ + copy_block9(full, src, 16, stride, 9);\ + put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 
8, 16, 9);\ + put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\ +}\ +void ff_ ## OPNAME ## qpel8_mc33_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 halfH[72];\ UINT8 halfV[64];\ UINT8 halfHV[64];\ copy_block9(full, src, 16, stride, 9);\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ }\ +static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[16*9];\ + UINT8 halfH[72];\ + UINT8 halfHV[64];\ + copy_block9(full, src, 16, stride, 9);\ + put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ + put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\ +}\ static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 halfH[72];\ UINT8 halfHV[64];\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\ }\ static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 halfH[72];\ UINT8 halfHV[64];\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\ }\ -static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\ +void ff_ ## OPNAME ## qpel8_mc12_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 halfH[72];\ UINT8 halfV[64];\ UINT8 halfHV[64];\ copy_block9(full, src, 16, stride, 9);\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\ }\ -static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[16*9];\ + UINT8 halfH[72];\ + copy_block9(full, src, 16, stride, 9);\ + put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ + put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\ +}\ +void ff_ ## OPNAME ## qpel8_mc32_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[16*9];\ UINT8 halfH[72];\ UINT8 halfV[64];\ UINT8 halfHV[64];\ copy_block9(full, src, 16, stride, 9);\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\ - put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\ + put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2(dst, halfV, 
halfHV, stride, 8, 8, 8);\ }\ +static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[16*9];\ + UINT8 halfH[72];\ + copy_block9(full, src, 16, stride, 9);\ + put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ + put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\ +}\ static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 halfH[72];\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\ + OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\ }\ static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\ OPNAME ## pixels16_c(dst, src, stride, 16);\ @@ -1066,7 +1153,7 @@ static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\ OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\ }\ -static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\ +void ff_ ## OPNAME ## qpel16_mc11_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[24*17];\ UINT8 halfH[272];\ UINT8 halfV[256];\ @@ -1077,7 +1164,17 @@ static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ }\ -static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[24*17];\ + UINT8 halfH[272];\ + UINT8 halfHV[256];\ + copy_block17(full, src, 24, stride, 17);\ + put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ + put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\ +}\ +void ff_ ## OPNAME ## qpel16_mc31_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[24*17];\ UINT8 halfH[272];\ UINT8 halfV[256];\ @@ -1088,7 +1185,17 @@ static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ }\ -static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[24*17];\ + UINT8 halfH[272];\ + UINT8 halfHV[256];\ + copy_block17(full, src, 24, stride, 17);\ + put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ + put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\ +}\ +void ff_ ## OPNAME ## qpel16_mc13_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[24*17];\ UINT8 halfH[272];\ UINT8 halfV[256];\ @@ -1099,7 +1206,17 @@ static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ }\ -static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[24*17];\ + UINT8 halfH[272];\ + UINT8 halfHV[256];\ + copy_block17(full, src, 24, stride, 17);\ + put ## 
RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ + put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\ +}\ +void ff_ ## OPNAME ## qpel16_mc33_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[24*17];\ UINT8 halfH[272];\ UINT8 halfV[256];\ @@ -1110,6 +1227,16 @@ static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ }\ +static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[24*17];\ + UINT8 halfH[272];\ + UINT8 halfHV[256];\ + copy_block17(full, src, 24, stride, 17);\ + put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ + put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\ +}\ static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 halfH[272];\ UINT8 halfHV[256];\ @@ -1124,7 +1251,7 @@ static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\ }\ -static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\ +void ff_ ## OPNAME ## qpel16_mc12_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[24*17];\ UINT8 halfH[272];\ UINT8 halfV[256];\ @@ -1135,7 +1262,15 @@ static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\ }\ -static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[24*17];\ + UINT8 halfH[272];\ + copy_block17(full, src, 24, stride, 17);\ + put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ + put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\ +}\ +void ff_ ## OPNAME ## qpel16_mc32_old_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 full[24*17];\ UINT8 halfH[272];\ UINT8 halfV[256];\ @@ -1146,6 +1281,14 @@ static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\ }\ +static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\ + UINT8 full[24*17];\ + UINT8 halfH[272];\ + copy_block17(full, src, 24, stride, 17);\ + put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ + put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\ +}\ static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\ UINT8 halfH[272];\ put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\ @@ -1498,7 +1641,7 @@ static void clear_blocks_c(DCTELEM *blocks) static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ int i; - for(i=0; i+7<w; i++){ + for(i=0; i+7<w; i+=8){ dst[i+0] += src[i+0]; dst[i+1] += src[i+1]; dst[i+2] += src[i+2]; @@ -1514,7 +1657,7 @@ static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ static void 
diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ int i; - for(i=0; i+7<w; i++){ + for(i=0; i+7<w; i+=8){ dst[i+0] = src1[i+0]-src2[i+0]; dst[i+1] = src1[i+1]-src2[i+1]; dst[i+2] = src1[i+2]-src2[i+2]; @@ -1639,7 +1782,8 @@ static int hadamard8_abs_c(uint8_t *src, int stride, int mean){ static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){ MpegEncContext * const s= (MpegEncContext *)c; - DCTELEM temp[64]; + uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8]; + DCTELEM * const temp= (DCTELEM*)aligned_temp; int sum=0, i; s->dsp.diff_pixels(temp, src1, src2, stride); @@ -1651,11 +1795,13 @@ static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2 return sum; } -void simple_idct(INT16 *block); //FIXME +void simple_idct(DCTELEM *block); //FIXME static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){ MpegEncContext * const s= (MpegEncContext *)c; - DCTELEM temp[64], bak[64]; + uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8]; + DCTELEM * const temp= (DCTELEM*)aligned_temp; + DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64; int sum=0, i; s->mb_intra=0; @@ -1664,7 +1810,7 @@ static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *s memcpy(bak, temp, 64*sizeof(DCTELEM)); - s->dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); + s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); s->dct_unquantize(s, temp, 0, s->qscale); simple_idct(temp); //FIXME @@ -1674,9 +1820,144 @@ static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *s return sum; } +static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){ + MpegEncContext * const s= (MpegEncContext *)c; + const UINT8 *scantable= s->intra_scantable.permutated; + uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8]; + uint64_t __align8 aligned_bak[stride]; + DCTELEM * const temp= (DCTELEM*)aligned_temp; + uint8_t * const bak= (uint8_t*)aligned_bak; + int i, last, run, bits, level, distoration, start_i; + const int esc_length= s->ac_esc_length; + uint8_t * length; + uint8_t * last_length; + + for(i=0; i<8; i++){ + ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0]; + ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1]; + } + + s->dsp.diff_pixels(temp, src1, src2, stride); + + s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); + + bits=0; + + if (s->mb_intra) { + start_i = 1; + length = s->intra_ac_vlc_length; + last_length= s->intra_ac_vlc_last_length; + bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma + } else { + start_i = 0; + length = s->inter_ac_vlc_length; + last_length= s->inter_ac_vlc_last_length; + } + + if(last>=start_i){ + run=0; + for(i=start_i; i<last; i++){ + int j= scantable[i]; + level= temp[j]; + + if(level){ + level+=64; + if((level&(~127)) == 0){ + bits+= length[UNI_AC_ENC_INDEX(run, level)]; + }else + bits+= esc_length; + run=0; + }else + run++; + } + i= scantable[last]; + + level= temp[i] + 64; + + assert(level - 64); + + if((level&(~127)) == 0){ + bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; + }else + bits+= esc_length; + + } + + if(last>=0){ + s->dct_unquantize(s, temp, 0, s->qscale); + } + + s->idct_add(bak, stride, temp); + + distoration= s->dsp.sse[1](NULL, bak, src1, stride); + + return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7); +} + +static int bit8x8_c(/*MpegEncContext*/ void *c, 
uint8_t *src1, uint8_t *src2, int stride){ + MpegEncContext * const s= (MpegEncContext *)c; + const UINT8 *scantable= s->intra_scantable.permutated; + uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8]; + DCTELEM * const temp= (DCTELEM*)aligned_temp; + int i, last, run, bits, level, start_i; + const int esc_length= s->ac_esc_length; + uint8_t * length; + uint8_t * last_length; + + s->dsp.diff_pixels(temp, src1, src2, stride); + + s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); + + bits=0; + + if (s->mb_intra) { + start_i = 1; + length = s->intra_ac_vlc_length; + last_length= s->intra_ac_vlc_last_length; + bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma + } else { + start_i = 0; + length = s->inter_ac_vlc_length; + last_length= s->inter_ac_vlc_last_length; + } + + if(last>=start_i){ + run=0; + for(i=start_i; i<last; i++){ + int j= scantable[i]; + level= temp[j]; + + if(level){ + level+=64; + if((level&(~127)) == 0){ + bits+= length[UNI_AC_ENC_INDEX(run, level)]; + }else + bits+= esc_length; + run=0; + }else + run++; + } + i= scantable[last]; + + level= temp[i] + 64; + + assert(level - 64); + + if((level&(~127)) == 0){ + bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; + }else + bits+= esc_length; + } + + return bits; +} + + WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c) WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c) WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c) +WARPER88_1616(rd8x8_c, rd16x16_c) +WARPER88_1616(bit8x8_c, bit16x16_c) void dsputil_init(DSPContext* c, unsigned mask) { @@ -1790,7 +2071,13 @@ void dsputil_init(DSPContext* c, unsigned mask) c->quant_psnr[0]= quant_psnr16x16_c; c->quant_psnr[1]= quant_psnr8x8_c; - + + c->rd[0]= rd16x16_c; + c->rd[1]= rd8x8_c; + + c->bit[0]= bit16x16_c; + c->bit[1]= bit8x8_c; + c->add_bytes= add_bytes_c; c->diff_bytes= diff_bytes_c; diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h index f34a8f078..2220b4871 100644 --- a/src/libffmpeg/libavcodec/dsputil.h +++ b/src/libffmpeg/libavcodec/dsputil.h @@ -30,6 +30,7 @@ #undef DEBUG /* dct code */ typedef short DCTELEM; +//typedef int DCTELEM; void fdct_ifast (DCTELEM *data); void ff_jpeg_fdct_islow (DCTELEM *data); @@ -74,7 +75,23 @@ void clear_blocks_c(DCTELEM *blocks); typedef void (*op_pixels_func)(UINT8 *block/*align width (8 or 16)*/, const UINT8 *pixels/*align 1*/, int line_size, int h); typedef void (*qpel_mc_func)(UINT8 *dst/*align width (8 or 16)*/, UINT8 *src/*align 1*/, int stride); - +#define DEF_OLD_QPEL(name)\ +void ff_put_ ## name (UINT8 *dst/*align width (8 or 16)*/, UINT8 *src/*align 1*/, int stride);\ +void ff_put_no_rnd_ ## name (UINT8 *dst/*align width (8 or 16)*/, UINT8 *src/*align 1*/, int stride);\ +void ff_avg_ ## name (UINT8 *dst/*align width (8 or 16)*/, UINT8 *src/*align 1*/, int stride); + +DEF_OLD_QPEL(qpel16_mc11_old_c) +DEF_OLD_QPEL(qpel16_mc31_old_c) +DEF_OLD_QPEL(qpel16_mc12_old_c) +DEF_OLD_QPEL(qpel16_mc32_old_c) +DEF_OLD_QPEL(qpel16_mc13_old_c) +DEF_OLD_QPEL(qpel16_mc33_old_c) +DEF_OLD_QPEL(qpel8_mc11_old_c) +DEF_OLD_QPEL(qpel8_mc31_old_c) +DEF_OLD_QPEL(qpel8_mc12_old_c) +DEF_OLD_QPEL(qpel8_mc32_old_c) +DEF_OLD_QPEL(qpel8_mc13_old_c) +DEF_OLD_QPEL(qpel8_mc33_old_c) #define CALL_2X_PIXELS(a, b, n)\ static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ @@ -105,6 +122,8 @@ typedef struct DSPContext { me_cmp_func hadamard8_diff[2]; me_cmp_func dct_sad[2]; me_cmp_func quant_psnr[2]; + me_cmp_func bit[2]; + me_cmp_func rd[2]; int (*hadamard8_abs 
)(uint8_t *src, int stride, int mean); me_cmp_func me_pre_cmp[11]; @@ -143,10 +162,14 @@ void dsputil_init(DSPContext* p, unsigned mask); * permute block according to permuatation. * @param last last non zero element in scantable order */ -void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last); +void ff_block_permute(DCTELEM *block, UINT8 *permutation, const UINT8 *scantable, int last); #define emms_c() +/* should be defined by architectures supporting + one or more MultiMedia extension */ +int mm_support(void); + #if defined(HAVE_MMX) #undef emms_c @@ -161,7 +184,6 @@ void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, extern int mm_flags; -int mm_support(void); void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size); void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size); @@ -211,6 +233,10 @@ void dsputil_init_alpha(DSPContext* c, unsigned mask); extern int mm_flags; +#if defined(HAVE_ALTIVEC) && !defined(CONFIG_DARWIN) +#include <altivec.h> +#endif + #define __align8 __attribute__ ((aligned (16))) void dsputil_init_ppc(DSPContext* c, unsigned mask); @@ -314,7 +340,12 @@ static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int st /* btw, rintf() is existing on fbsd too -- alex */ static inline long int lrintf(float x) { +#ifdef CONFIG_WIN32 + /* XXX: incorrect, but make it compile */ + return (int)(x); +#else return (int)(rint(x)); +#endif } #endif diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c index f436caf12..554b593e7 100644 --- a/src/libffmpeg/libavcodec/dv.c +++ b/src/libffmpeg/libavcodec/dv.c @@ -157,7 +157,7 @@ static const UINT16 block_sizes[6] = { /* decode ac coefs */ static void dv_decode_ac(DVVideoDecodeContext *s, - BlockInfo *mb, INT16 *block, int last_index) + BlockInfo *mb, DCTELEM *block, int last_index) { int last_re_index; int shift_offset = mb->shift_offset; @@ -195,7 +195,7 @@ static void dv_decode_ac(DVVideoDecodeContext *s, v, partial_bit_count, (mb->partial_bit_buffer << l)); #endif /* try to read the codeword */ - init_get_bits(&gb1, buf, 4); + init_get_bits(&gb1, buf, 4*8); { OPEN_READER(re1, &gb1); UPDATE_CACHE(re1, &gb1); @@ -333,7 +333,7 @@ static inline void dv_decode_video_segment(DVVideoDecodeContext *s, block = block1; for(j = 0;j < 6; j++) { /* NOTE: size is not important here */ - init_get_bits(&s->gb, buf_ptr, 14); + init_get_bits(&s->gb, buf_ptr, 14*8); /* get the dc */ dc = get_bits(&s->gb, 9); @@ -382,7 +382,7 @@ static inline void dv_decode_video_segment(DVVideoDecodeContext *s, #endif block = block1; mb = mb1; - init_get_bits(&s->gb, mb_bit_buffer, 80); + init_get_bits(&s->gb, mb_bit_buffer, 80*8); for(j = 0;j < 6; j++) { if (!mb->eob_reached && s->gb.index < mb_bit_count) { dv_decode_ac(s, mb, block, mb_bit_count); @@ -421,7 +421,7 @@ static inline void dv_decode_video_segment(DVVideoDecodeContext *s, #endif block = &s->block[0][0]; mb = mb_data; - init_get_bits(&s->gb, vs_bit_buffer, 5 * 80); + init_get_bits(&s->gb, vs_bit_buffer, 5 * 80*8); for(mb_index = 0; mb_index < 5; mb_index++) { for(j = 0;j < 6; j++) { if (!mb->eob_reached) { @@ -501,7 +501,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, const UINT16 *mb_pos_ptr; /* parse id */ - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); sct = get_bits(&s->gb, 3); if (sct != 0) return -1; @@ -634,7 +634,6 @@ AVCodec dvvideo_decoder = { typedef struct DVAudioDecodeContext { AVCodecContext *avctx; 
GetBitContext gb; - } DVAudioDecodeContext; static int dvaudio_decode_init(AVCodecContext *avctx) @@ -643,13 +642,126 @@ static int dvaudio_decode_init(AVCodecContext *avctx) return 0; } +static UINT16 dv_audio_12to16(UINT16 sample) +{ + UINT16 shift, result; + + sample = (sample < 0x800) ? sample : sample | 0xf000; + shift = (sample & 0xf00) >> 8; + + if (shift < 0x2 || shift > 0xd) { + result = sample; + } else if (shift < 0x8) { + shift--; + result = (sample - (256 * shift)) << shift; + } else { + shift = 0xe - shift; + result = ((sample + ((256 * shift) + 1)) << shift) - 1; + } + + return result; +} + /* NOTE: exactly one frame must be given (120000 bytes for NTSC, - 144000 bytes for PAL) */ + 144000 bytes for PAL) + + There's a couple of assumptions being made here: + 1. We don't do any kind of audio error correction. It means, + that erroneous samples 0x8000 are being passed upwards. + Do we need to silence erroneous samples ? Average them ? + 2. We don't do software emphasis. + 3. We are not checking for 'speed' argument being valid. + 4. Audio is always returned as 16bit linear samples: 12bit + nonlinear samples are converted into 16bit linear ones. +*/ static int dvaudio_decode_frame(AVCodecContext *avctx, void *data, int *data_size, UINT8 *buf, int buf_size) { - // DVAudioDecodeContext *s = avctx->priv_data; + DVVideoDecodeContext *s = avctx->priv_data; + const UINT16 (*unshuffle)[9]; + int smpls, freq, quant, sys, stride, difseg, ad, dp, nb_dif_segs, i; + UINT16 lc, rc; + UINT8 *buf_ptr; + + /* parse id */ + init_get_bits(&s->gb, &buf[AAUX_OFFSET], 5*8); + i = get_bits(&s->gb, 8); + if (i != 0x50) { /* No audio ? */ + *data_size = 0; + return buf_size; + } + + get_bits(&s->gb, 1); /* 0 - locked audio, 1 - unlocked audio */ + skip_bits(&s->gb, 1); + smpls = get_bits(&s->gb, 6); /* samples in this frame - min. 
samples */ + + skip_bits(&s->gb, 8); + + skip_bits(&s->gb, 2); + sys = get_bits(&s->gb, 1); /* 0 - 60 fields, 1 = 50 fields */ + skip_bits(&s->gb, 5); + + get_bits(&s->gb, 1); /* 0 - emphasis on, 1 - emphasis off */ + get_bits(&s->gb, 1); /* 0 - reserved, 1 - emphasis time constant 50/15us */ + freq = get_bits(&s->gb, 3); /* 0 - 48KHz, 1 - 44,1kHz, 2 - 32 kHz */ + quant = get_bits(&s->gb, 3); /* 0 - 16bit linear, 1 - 12bit nonlinear */ + + if (quant > 1) + return -1; /* Unsupported quantization */ + + avctx->sample_rate = dv_audio_frequency[freq]; + // What about: + // avctx->bit_rate = + // avctx->frame_size = + + *data_size = (dv_audio_min_samples[sys][freq] + smpls) * + avctx->channels * 2; + + if (sys) { + nb_dif_segs = 12; + stride = 108; + unshuffle = dv_place_audio50; + } else { + nb_dif_segs = 10; + stride = 90; + unshuffle = dv_place_audio60; + } + + /* for each DIF segment */ + buf_ptr = buf; + for (difseg = 0; difseg < nb_dif_segs; difseg++) { + buf_ptr += 6 * 80; /* skip DIF segment header */ + for (ad = 0; ad < 9; ad++) { + + for (dp = 8; dp < 80; dp+=2) { + if (quant == 0) { /* 16bit quantization */ + i = unshuffle[difseg][ad] + (dp - 8)/2 * stride; + ((short *)data)[i] = (buf_ptr[dp] << 8) | buf_ptr[dp+1]; + } else { /* 12bit quantization */ + if (difseg >= nb_dif_segs/2) + goto out; /* We're not doing 4ch at this time */ + + lc = ((UINT16)buf_ptr[dp] << 4) | + ((UINT16)buf_ptr[dp+2] >> 4); + rc = ((UINT16)buf_ptr[dp+1] << 4) | + ((UINT16)buf_ptr[dp+2] & 0x0f); + lc = dv_audio_12to16(lc); + rc = dv_audio_12to16(rc); + + i = unshuffle[difseg][ad] + (dp - 8)/3 * stride; + ((short *)data)[i] = lc; + i = unshuffle[difseg+nb_dif_segs/2][ad] + (dp - 8)/3 * stride; + ((short *)data)[i] = rc; + ++dp; + } + } + + buf_ptr += 16 * 80; /* 15 Video DIFs + 1 Audio DIF */ + } + } + +out: return buf_size; } diff --git a/src/libffmpeg/libavcodec/dvdata.h b/src/libffmpeg/libavcodec/dvdata.h index b5c1f5607..4e1fc39c7 100644 --- a/src/libffmpeg/libavcodec/dvdata.h +++ b/src/libffmpeg/libavcodec/dvdata.h @@ -18,6 +18,7 @@ */ #define NB_DV_VLC 409 +#define AAUX_OFFSET (80*6 + 80*16*3 + 3) static const UINT16 dv_vlc_bits[409] = { 0x0000, 0x0002, 0x0007, 0x0008, 0x0009, 0x0014, 0x0015, 0x0016, @@ -905,3 +906,41 @@ static const UINT16 dv_place_411[1350] = { 0x0834, 0x2320, 0x2f44, 0x3810, 0x1658, }; +static const UINT16 dv_place_audio60[10][9] = { + { 0, 30, 60, 20, 50, 80, 10, 40, 70 }, /* 1st channel */ + { 6, 36, 66, 26, 56, 86, 16, 46, 76 }, + { 12, 42, 72, 2, 32, 62, 22, 52, 82 }, + { 18, 48, 78, 8, 38, 68, 28, 58, 88 }, + { 24, 54, 84, 14, 44, 74, 4, 34, 64 }, + + { 1, 31, 61, 21, 51, 81, 11, 41, 71 }, /* 2nd channel */ + { 7, 37, 67, 27, 57, 87, 17, 47, 77 }, + { 13, 43, 73, 3, 33, 63, 23, 53, 83 }, + { 19, 49, 79, 9, 39, 69, 29, 59, 89 }, + { 25, 55, 85, 15, 45, 75, 5, 35, 65 }, +}; + +static const UINT16 dv_place_audio50[12][9] = { + { 0, 36, 72, 26, 62, 98, 16, 52, 88}, /* 1st channel */ + { 6, 42, 78, 32, 68, 104, 22, 58, 94}, + { 12, 48, 84, 2, 38, 74, 28, 64, 100}, + { 18, 54, 90, 8, 44, 80, 34, 70, 106}, + { 24, 60, 96, 14, 50, 86, 4, 40, 76}, + { 30, 66, 102, 20, 56, 92, 10, 46, 82}, + + { 1, 37, 73, 27, 63, 99, 17, 53, 89}, /* 2nd channel */ + { 7, 43, 79, 33, 69, 105, 23, 59, 95}, + { 13, 49, 85, 3, 39, 75, 29, 65, 101}, + { 19, 55, 91, 9, 45, 81, 35, 71, 107}, + { 25, 61, 97, 15, 51, 87, 5, 41, 77}, + { 31, 67, 103, 21, 57, 93, 11, 47, 83}, +}; + +static const int dv_audio_frequency[3] = { + 48000, 44100, 32000, +}; + +static const int dv_audio_min_samples[2][3] = { + { 1580, 
1452, 1053 }, /* 60 fields */ + { 1896, 1742, 1264 }, /* 50 fileds */ +}; diff --git a/src/libffmpeg/libavcodec/fft.c b/src/libffmpeg/libavcodec/fft.c index f060992f4..65eb575f3 100644 --- a/src/libffmpeg/libavcodec/fft.c +++ b/src/libffmpeg/libavcodec/fft.c @@ -53,13 +53,13 @@ int fft_init(FFTContext *s, int nbits, int inverse) /* compute constant table for HAVE_SSE version */ #if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(HAVE_ALTIVEC) { - int has_vectors; + int has_vectors = 0; #if defined(HAVE_MMX) has_vectors = mm_support() & MM_SSE; -#else - /* XXX: should also use mm_support() ? */ - has_vectors = has_altivec() & MM_ALTIVEC; +#endif +#if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) + has_vectors = mm_support() & MM_ALTIVEC; #endif if (has_vectors) { int np, nblocks, np2, l; diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c index bc21e0cd8..63bf19059 100644 --- a/src/libffmpeg/libavcodec/h263.c +++ b/src/libffmpeg/libavcodec/h263.c @@ -22,6 +22,12 @@ * qpel decoding, gmc decoding, interlaced decoding, * by Michael Niedermayer <michaelni@gmx.at> */ + +/** + * @file h263.c + * @brief h263/mpeg4 codec + * + */ //#define DEBUG #include "common.h" @@ -67,15 +73,17 @@ static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr); static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block, int n, int coded, int intra); static int h263_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr); -static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n, +static void mpeg4_inv_pred_ac(MpegEncContext * s, DCTELEM *block, int n, int dir); static void mpeg4_decode_sprite_trajectory(MpegEncContext * s); static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr); extern UINT32 inverse[256]; -static UINT16 uni_DCtab_lum [512][2]; -static UINT16 uni_DCtab_chrom[512][2]; +static UINT8 uni_DCtab_lum_len[512]; +static UINT8 uni_DCtab_chrom_len[512]; +static UINT16 uni_DCtab_lum_bits[512]; +static UINT16 uni_DCtab_chrom_bits[512]; #ifdef CONFIG_ENCODERS static UINT16 (*mv_penalty)[MAX_MV*2+1]= NULL; @@ -999,7 +1007,7 @@ static int h263_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr) } -void h263_pred_acdc(MpegEncContext * s, INT16 *block, int n) +void h263_pred_acdc(MpegEncContext * s, DCTELEM *block, int n) { int x, y, wrap, a, c, pred_dc, scale, i; INT16 *dc_val, *ac_val, *ac_val1; @@ -1309,8 +1317,8 @@ static void init_uni_dc_tab(void) uni_len++; } } - uni_DCtab_lum[level+256][0]= uni_code; - uni_DCtab_lum[level+256][1]= uni_len; + uni_DCtab_lum_bits[level+256]= uni_code; + uni_DCtab_lum_len [level+256]= uni_len; /* chrominance */ uni_code= DCtab_chrom[size][0]; @@ -1324,8 +1332,8 @@ static void init_uni_dc_tab(void) uni_len++; } } - uni_DCtab_chrom[level+256][0]= uni_code; - uni_DCtab_chrom[level+256][1]= uni_len; + uni_DCtab_chrom_bits[level+256]= uni_code; + uni_DCtab_chrom_len [level+256]= uni_len; } } @@ -1446,6 +1454,8 @@ void h263_encode_init(MpegEncContext *s) s->intra_ac_vlc_last_length= uni_mpeg4_intra_rl_len + 128*64; s->inter_ac_vlc_length = uni_mpeg4_inter_rl_len; s->inter_ac_vlc_last_length= uni_mpeg4_inter_rl_len + 128*64; + s->luma_dc_vlc_length= uni_DCtab_lum_len; + s->chroma_dc_vlc_length= uni_DCtab_chrom_len; s->ac_esc_length= 7+2+1+6+1+12+1; break; case CODEC_ID_H263P: @@ -1470,6 +1480,11 @@ void h263_encode_init(MpegEncContext *s) } } +/** + * encodes a 8x8 block. 
+ * @param block the 8x8 block + * @param n block index (0-3 are luma, 4-5 are chroma) + */ static void h263_encode_block(MpegEncContext * s, DCTELEM * block, int n) { int level, run, last, i, j, last_index, last_non_zero, sign, slevel, code; @@ -1795,6 +1810,13 @@ static void change_qscale(MpegEncContext * s, int dquant) s->c_dc_scale= s->c_dc_scale_table[ s->qscale ]; } +/** + * predicts the dc. + * @param n block index (0-3 are luma, 4-5 are chroma) + * @param dc_val_ptr a pointer to the dc_val entry for the current MB will be stored here + * @param dir_ptr pointer to an integer where the prediction direction will be stored + * @return the quantized predicted dc + */ static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_ptr, int *dir_ptr) { int a, b, c, wrap, pred, scale; @@ -1852,7 +1874,12 @@ static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, UINT16 **dc_val_pt return pred; } -void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, +/** + * predicts the ac. + * @param n block index (0-3 are luma, 4-5 are chroma) + * @param dir the ac prediction direction + */ +void mpeg4_pred_ac(MpegEncContext * s, DCTELEM *block, int n, int dir) { int i; @@ -1907,7 +1934,7 @@ void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, } -static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n, +static void mpeg4_inv_pred_ac(MpegEncContext * s, DCTELEM *block, int n, int dir) { int i; @@ -1950,6 +1977,10 @@ static void mpeg4_inv_pred_ac(MpegEncContext * s, INT16 *block, int n, } } +/** + * encodes the dc value. + * @param n block index (0-3 are luma, 4-5 are chroma) + */ static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n) { #if 1 @@ -1957,10 +1988,10 @@ static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n) level+=256; if (n < 4) { /* luminance */ - put_bits(s, uni_DCtab_lum[level][1], uni_DCtab_lum[level][0]); + put_bits(s, uni_DCtab_lum_len[level], uni_DCtab_lum_bits[level]); } else { /* chrominance */ - put_bits(s, uni_DCtab_chrom[level][1], uni_DCtab_chrom[level][0]); + put_bits(s, uni_DCtab_chrom_len[level], uni_DCtab_chrom_bits[level]); } #else int size, v; @@ -1991,6 +2022,10 @@ static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n) #endif } #ifdef CONFIG_ENCODERS +/** + * encodes a 8x8 block + * @param n block index (0-3 are luma, 4-5 are chroma) + */ static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, UINT8 *scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb) { @@ -2175,44 +2210,6 @@ static VLC dc_lum, dc_chrom; static VLC sprite_trajectory; static VLC mb_type_b_vlc; -void init_rl(RLTable *rl) -{ - INT8 max_level[MAX_RUN+1], max_run[MAX_LEVEL+1]; - UINT8 index_run[MAX_RUN+1]; - int last, run, level, start, end, i; - - /* compute max_level[], max_run[] and index_run[] */ - for(last=0;last<2;last++) { - if (last == 0) { - start = 0; - end = rl->last; - } else { - start = rl->last; - end = rl->n; - } - - memset(max_level, 0, MAX_RUN + 1); - memset(max_run, 0, MAX_LEVEL + 1); - memset(index_run, rl->n, MAX_RUN + 1); - for(i=start;i<end;i++) { - run = rl->table_run[i]; - level = rl->table_level[i]; - if (index_run[run] == rl->n) - index_run[run] = i; - if (level > max_level[run]) - max_level[run] = level; - if (run > max_run[level]) - max_run[level] = run; - } - rl->max_level[last] = av_malloc(MAX_RUN + 1); - memcpy(rl->max_level[last], max_level, MAX_RUN + 1); - rl->max_run[last] = av_malloc(MAX_LEVEL + 1); - memcpy(rl->max_run[last], max_run, 
MAX_LEVEL + 1); - rl->index_run[last] = av_malloc(MAX_RUN + 1); - memcpy(rl->index_run[last], index_run, MAX_RUN + 1); - } -} - void init_vlc_rl(RLTable *rl) { int i, q; @@ -2331,7 +2328,7 @@ static int h263_decode_gob_header(MpegEncContext *s) /* We have a GBSC probably with GSTUFF */ skip_bits(&s->gb, 16); /* Drop the zeros */ - left= s->gb.size*8 - get_bits_count(&s->gb); + left= s->gb.size_in_bits - get_bits_count(&s->gb); //MN: we must check the bits left or we might end in a infinite loop (or segfault) for(;left>13; left--){ if(get_bits1(&s->gb)) break; /* Seek the '1' bit */ @@ -2431,7 +2428,7 @@ static inline int mpeg4_is_resync(MpegEncContext *s){ return 0; } - if(bits_count + 8 >= s->gb.size*8){ + if(bits_count + 8 >= s->gb.size_in_bits){ int v= show_bits(&s->gb, 8); v|= 0x7F >> (7-(bits_count&7)); @@ -2468,7 +2465,7 @@ static int mpeg4_decode_video_packet_header(MpegEncContext *s) int header_extension=0, mb_num, len; /* is there enough space left for a video packet + header */ - if( get_bits_count(&s->gb) > s->gb.size*8-20) return -1; + if( get_bits_count(&s->gb) > s->gb.size_in_bits-20) return -1; for(len=0; len<32; len++){ if(get_bits1(&s->gb)) break; @@ -2600,7 +2597,7 @@ int ff_h263_resync(MpegEncContext *s){ //ok, its not where its supposed to be ... s->gb= s->last_resync_gb; align_get_bits(&s->gb); - left= s->gb.size*8 - get_bits_count(&s->gb); + left= s->gb.size_in_bits - get_bits_count(&s->gb); for(;left>16+1+5+5; left-=8){ if(show_bits(&s->gb, 16)==0){ @@ -2622,6 +2619,7 @@ int ff_h263_resync(MpegEncContext *s){ } /** + * gets the average motion vector for a GMC MB. * @param n either 0 for the x component or 1 for y * @returns the average MV for a GMC MB */ @@ -2654,8 +2652,7 @@ static inline int get_amv(MpegEncContext *s, int n){ v+= dx; } } - sum /= 256; - sum= RSHIFT(sum<<s->quarter_sample, a); + sum= RSHIFT(sum, a+8-s->quarter_sample); } if (sum < -len) sum= -len; @@ -3055,7 +3052,7 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, DCTELEM block[6][64]) /* per-MB end of slice check */ if(--s->mb_num_left <= 0){ -//printf("%06X %d\n", show_bits(&s->gb, 24), s->gb.size*8 - get_bits_count(&s->gb)); +//printf("%06X %d\n", show_bits(&s->gb, 24), s->gb.size_in_bits - get_bits_count(&s->gb)); if(mpeg4_is_resync(s)) return SLICE_END; else @@ -3419,8 +3416,8 @@ end: }else{ int v= show_bits(&s->gb, 16); - if(get_bits_count(&s->gb) + 16 > s->gb.size*8){ - v>>= get_bits_count(&s->gb) + 16 - s->gb.size*8; + if(get_bits_count(&s->gb) + 16 > s->gb.size_in_bits){ + v>>= get_bits_count(&s->gb) + 16 - s->gb.size_in_bits; } if(v==0) @@ -3588,6 +3585,12 @@ not_coded: return 0; } +/** + * decodes the dc value. + * @param n block index (0-3 are luma, 4-5 are chroma) + * @param dir_ptr the prediction direction will be stored here + * @return the quantized dc + */ static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr) { int level, pred, code; @@ -4473,6 +4476,10 @@ static int decode_vol_header(MpegEncContext *s, GetBitContext *gb){ return 0; } +/** + * decodes the user data stuff in the header. + * allso inits divx/xvid/lavc_version/build + */ static int decode_user_data(MpegEncContext *s, GetBitContext *gb){ char buf[256]; int i; @@ -4685,7 +4692,7 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){ printf("qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d\n", s->qscale, s->f_code, s->b_code, s->pict_type == I_TYPE ? "I" : (s->pict_type == P_TYPE ? "P" : (s->pict_type == B_TYPE ? 
"B" : "S")), - gb->size,s->progressive_sequence, s->alternate_scan, s->top_field_first, + gb->size_in_bits,s->progressive_sequence, s->alternate_scan, s->top_field_first, s->quarter_sample ? "q" : "h", s->data_partitioning, s->resync_marker, s->num_sprite_warping_points, s->sprite_warping_accuracy); } @@ -4740,9 +4747,9 @@ int ff_mpeg4_decode_picture_header(MpegEncContext * s, GetBitContext *gb) v = get_bits(gb, 8); startcode = ((startcode << 8) | v) & 0xffffffff; - if(get_bits_count(gb) >= gb->size*8){ - if(gb->size==1 && s->divx_version){ - printf("frame skip %d\n", gb->size); + if(get_bits_count(gb) >= gb->size_in_bits){ + if(gb->size_in_bits==8 && s->divx_version){ + printf("frame skip %d\n", gb->size_in_bits); return FRAME_SKIPED; //divx bug }else return -1; //end of stream diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c index 93a14a06e..a5dadeec4 100644 --- a/src/libffmpeg/libavcodec/h263dec.c +++ b/src/libffmpeg/libavcodec/h263dec.c @@ -249,15 +249,17 @@ static int decode_slice(MpegEncContext *s){ /* try to detect the padding bug */ if( s->codec_id==CODEC_ID_MPEG4 && (s->workaround_bugs&FF_BUG_AUTODETECT) - && s->gb.size*8 - get_bits_count(&s->gb) >=0 - && s->gb.size*8 - get_bits_count(&s->gb) < 48 + && s->gb.size_in_bits - get_bits_count(&s->gb) >=0 + && s->gb.size_in_bits - get_bits_count(&s->gb) < 48 // && !s->resync_marker && !s->data_partitioning){ const int bits_count= get_bits_count(&s->gb); - const int bits_left = s->gb.size*8 - bits_count; + const int bits_left = s->gb.size_in_bits - bits_count; - if(bits_left==0 || bits_left>8){ + if(bits_left==0){ + s->padding_bug_score+=16; + }else if(bits_left>8){ s->padding_bug_score++; } else if(bits_left != 1){ int v= show_bits(&s->gb, 8); @@ -267,17 +269,12 @@ static int decode_slice(MpegEncContext *s){ s->padding_bug_score--; else s->padding_bug_score++; - } - - if(s->padding_bug_score > -2) - s->workaround_bugs |= FF_BUG_NO_PADDING; - else - s->workaround_bugs &= ~FF_BUG_NO_PADDING; + } } // handle formats which dont have unique end markers if(s->msmpeg4_version || (s->workaround_bugs&FF_BUG_NO_PADDING)){ //FIXME perhaps solve this more cleanly - int left= s->gb.size*8 - get_bits_count(&s->gb); + int left= s->gb.size_in_bits - get_bits_count(&s->gb); int max_extra=7; /* no markers in M$ crap */ @@ -302,7 +299,7 @@ static int decode_slice(MpegEncContext *s){ } fprintf(stderr, "slice end not reached but screenspace end (%d left %06X)\n", - s->gb.size*8 - get_bits_count(&s->gb), + s->gb.size_in_bits - get_bits_count(&s->gb), show_bits(&s->gb, 24)); return -1; } @@ -344,6 +341,61 @@ static int mpeg4_find_frame_end(MpegEncContext *s, UINT8 *buf, int buf_size){ return -1; } +static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){ + int t, x, y, f; + + ex= clip(ex, 0, w-1); + ey= clip(ey, 0, h-1); + + buf[sy*stride + sx]+= color; + + if(ABS(ex - sx) > ABS(ey - sy)){ + if(sx > ex){ + t=sx; sx=ex; ex=t; + t=sy; sy=ey; ey=t; + } + buf+= sx + sy*stride; + ex-= sx; + f= ((ey-sy)<<16)/ex; + for(x= 0; x <= ex; x++){ + y= ((x*f) + (1<<15))>>16; + buf[y*stride + x]+= color; + } + }else{ + if(sy > ey){ + t=sx; sx=ex; ex=t; + t=sy; sy=ey; ey=t; + } + buf+= sx + sy*stride; + ey-= sy; + if(ey) f= ((ex-sx)<<16)/ey; + else f= 0; + for(y= 0; y <= ey; y++){ + x= ((y*f) + (1<<15))>>16; + buf[y*stride + x]+= color; + } + } +} + +static void draw_arrow(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){ + int dx= ex - sx; + int dy= ey - sy; 
+ + if(dx*dx + dy*dy > 3*3){ + int rx= dx + dy; + int ry= -dx + dy; + int length= ff_sqrt((rx*rx + ry*ry)<<8); + + //FIXME subpixel accuracy + rx= ROUNDED_DIV(rx*3<<4, length); + ry= ROUNDED_DIV(ry*3<<4, length); + + draw_line(buf, sx, sy, sx + rx, sy + ry, w, h, stride, color); + draw_line(buf, sx, sy, sx - ry, sy + rx, w, h, stride, color); + } + draw_line(buf, sx, sy, ex, ey, w, h, stride, color); +} + int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *data_size, UINT8 *buf, int buf_size) @@ -368,49 +420,27 @@ uint64_t time= rdtsc(); if (buf_size == 0) { return 0; } - + if(s->flags&CODEC_FLAG_TRUNCATED){ int next; - ParseContext *pc= &s->parse_context; - pc->last_index= pc->index; - if(s->codec_id==CODEC_ID_MPEG4){ next= mpeg4_find_frame_end(s, buf, buf_size); }else{ fprintf(stderr, "this codec doesnt support truncated bitstreams\n"); return -1; } - if(next==-1){ - if(buf_size + FF_INPUT_BUFFER_PADDING_SIZE + pc->index > pc->buffer_size){ - pc->buffer_size= buf_size + pc->index + 10*1024; - pc->buffer= realloc(pc->buffer, pc->buffer_size); - } - - memcpy(&pc->buffer[pc->index], buf, buf_size); - pc->index += buf_size; + + if( ff_combine_frame(s, next, &buf, &buf_size) < 0 ) return buf_size; - } - - if(pc->index){ - if(next + FF_INPUT_BUFFER_PADDING_SIZE + pc->index > pc->buffer_size){ - pc->buffer_size= next + pc->index + 10*1024; - pc->buffer= realloc(pc->buffer, pc->buffer_size); - } - - memcpy(&pc->buffer[pc->index], buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); - pc->index = 0; - buf= pc->buffer; - buf_size= pc->last_index + next; - } } retry: if(s->bitstream_buffer_size && buf_size<20){ //divx 5.01+ frame reorder - init_get_bits(&s->gb, s->bitstream_buffer, s->bitstream_buffer_size); + init_get_bits(&s->gb, s->bitstream_buffer, s->bitstream_buffer_size*8); }else - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); s->bitstream_buffer_size=0; if (!s->context_initialized) { @@ -427,7 +457,7 @@ retry: if(s->avctx->extradata_size && s->picture_number==0){ GetBitContext gb; - init_get_bits(&gb, s->avctx->extradata, s->avctx->extradata_size); + init_get_bits(&gb, s->avctx->extradata, s->avctx->extradata_size*8); ret = ff_mpeg4_decode_picture_header(s, &gb); } ret = ff_mpeg4_decode_picture_header(s, &s->gb); @@ -442,6 +472,11 @@ retry: avctx->has_b_frames= !s->low_delay; if(s->workaround_bugs&FF_BUG_AUTODETECT){ + if(s->padding_bug_score > -2 && !s->data_partitioning) + s->workaround_bugs |= FF_BUG_NO_PADDING; + else + s->workaround_bugs &= ~FF_BUG_NO_PADDING; + if(s->avctx->fourcc == ff_get_fourcc("XVIX")) s->workaround_bugs|= FF_BUG_XVID_ILACE; #if 0 @@ -472,6 +507,14 @@ retry: if(s->xvid_build && s->xvid_build<=1) s->workaround_bugs|= FF_BUG_QPEL_CHROMA; +#define SET_QPEL_FUNC(postfix1, postfix2) \ + s->dsp.put_ ## postfix1 = ff_put_ ## postfix2;\ + s->dsp.put_no_rnd_ ## postfix1 = ff_put_no_rnd_ ## postfix2;\ + s->dsp.avg_ ## postfix1 = ff_avg_ ## postfix2; + + if(s->lavc_build && s->lavc_build<4653) + s->workaround_bugs|= FF_BUG_STD_QPEL; + //printf("padding_bug_score: %d\n", s->padding_bug_score); #if 0 if(s->divx_version==500) @@ -489,6 +532,21 @@ retry: #endif } + if(s->workaround_bugs& FF_BUG_STD_QPEL){ + SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[0][15], 
qpel16_mc33_old_c) + + SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_old_c) + SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_old_c) + } #if 0 // dump bits per frame / qp / complexity { @@ -571,7 +629,7 @@ retry: decode_slice(s); s->error_status_table[0]|= VP_START; - while(s->mb_y<s->mb_height && s->gb.size*8 - get_bits_count(&s->gb)>16){ + while(s->mb_y<s->mb_height && s->gb.size_in_bits - get_bits_count(&s->gb)>16){ if(s->msmpeg4_version){ if(s->mb_x!=0 || (s->mb_y%s->slice_height)!=0) break; @@ -580,7 +638,7 @@ retry: break; } - if(s->msmpeg4_version!=4 && s->h263_pred) + if(s->msmpeg4_version<4 && s->h263_pred) ff_mpeg4_clean_buffers(s); decode_slice(s); @@ -645,41 +703,40 @@ retry: } MPV_frame_end(s); -#if 0 //dirty show MVs, we should export the MV tables and write a filter to show them -{ - int mb_y; - s->has_b_frames=1; - for(mb_y=0; mb_y<s->mb_height; mb_y++){ - int mb_x; - int y= mb_y*16 + 8; - for(mb_x=0; mb_x<s->mb_width; mb_x++){ - int x= mb_x*16 + 8; - uint8_t *ptr= s->last_picture.data[0]; - int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2); - int mx= (s->motion_val[xy][0]>>1) + x; - int my= (s->motion_val[xy][1]>>1) + y; - int i; - int max; - - if(mx<0) mx=0; - if(my<0) my=0; - if(mx>=s->width) mx= s->width -1; - if(my>=s->height) my= s->height-1; - max= ABS(mx-x); - if(ABS(my-y) > max) max= ABS(my-y); - /* the ugliest linedrawing routine ... */ - for(i=0; i<max; i++){ - int x1= x + (mx-x)*i/max; - int y1= y + (my-y)*i/max; - ptr[y1*s->linesize + x1]+=100; - } - ptr[y*s->linesize + x]+=100; - s->mbskip_table[mb_x + mb_y*s->mb_width]=0; + + if((avctx->debug&FF_DEBUG_VIS_MV) && s->last_picture.data[0]){ + const int shift= 1 + s->quarter_sample; + int mb_y; + uint8_t *ptr= s->last_picture.data[0]; + s->low_delay=0; //needed to see the vectors without trashing the buffers + + for(mb_y=0; mb_y<s->mb_height; mb_y++){ + int mb_x; + for(mb_x=0; mb_x<s->mb_width; mb_x++){ + const int mb_index= mb_x + mb_y*s->mb_width; + if(s->co_located_type_table[mb_index] == MV_TYPE_8X8){ + int i; + for(i=0; i<4; i++){ + int sx= mb_x*16 + 4 + 8*(i&1); + int sy= mb_y*16 + 4 + 8*(i>>1); + int xy= 1 + mb_x*2 + (i&1) + (mb_y*2 + 1 + (i>>1))*(s->mb_width*2 + 2); + int mx= (s->motion_val[xy][0]>>shift) + sx; + int my= (s->motion_val[xy][1]>>shift) + sy; + draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100); + } + }else{ + int sx= mb_x*16 + 8; + int sy= mb_y*16 + 8; + int xy= 1 + mb_x*2 + (mb_y*2 + 1)*(s->mb_width*2 + 2); + int mx= (s->motion_val[xy][0]>>shift) + sx; + int my= (s->motion_val[xy][1]>>shift) + sy; + draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100); + } + s->mbskip_table[mb_index]=0; + } + } } - } -} -#endif if(s->pict_type==B_TYPE || s->low_delay){ *pict= *(AVFrame*)&s->current_picture; diff --git a/src/libffmpeg/libavcodec/huffyuv.c b/src/libffmpeg/libavcodec/huffyuv.c index 0eb701037..cff642d11 100644 --- a/src/libffmpeg/libavcodec/huffyuv.c +++ b/src/libffmpeg/libavcodec/huffyuv.c @@ -1,7 +1,7 @@ /* * huffyuv codec for libavcodec * - * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2002-2003 Michael Niedermayer <michaelni@gmx.at> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -271,7 +271,7 @@ static 
int read_huffman_tables(HYuvContext *s, uint8_t *src, int length){ GetBitContext gb; int i; - init_get_bits(&gb, src, length); + init_get_bits(&gb, src, length*8); for(i=0; i<3; i++){ read_len_table(s->len[i], &gb); @@ -295,9 +295,9 @@ static int read_old_huffman_tables(HYuvContext *s){ GetBitContext gb; int i; - init_get_bits(&gb, classic_shift_luma, sizeof(classic_shift_luma)); + init_get_bits(&gb, classic_shift_luma, sizeof(classic_shift_luma)*8); read_len_table(s->len[0], &gb); - init_get_bits(&gb, classic_shift_chroma, sizeof(classic_shift_chroma)); + init_get_bits(&gb, classic_shift_chroma, sizeof(classic_shift_chroma)*8); read_len_table(s->len[1], &gb); for(i=0; i<256; i++) s->bits[0][i] = classic_add_luma [i]; @@ -403,7 +403,7 @@ s->bgr32=1; case 24: case 32: if(s->bgr32){ - avctx->pix_fmt = PIX_FMT_BGRA32; + avctx->pix_fmt = PIX_FMT_RGBA32; }else{ avctx->pix_fmt = PIX_FMT_BGR24; } @@ -461,8 +461,6 @@ static int encode_init(AVCodecContext *avctx) s->version=2; avctx->coded_frame= &s->picture; - s->picture.pict_type= FF_I_TYPE; - s->picture.key_frame= 1; switch(avctx->pix_fmt){ case PIX_FMT_YUV420P: @@ -682,7 +680,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8 bswap_buf((uint32_t*)s->bitstream_buffer, (uint32_t*)buf, buf_size/4); - init_get_bits(&s->gb, s->bitstream_buffer, buf_size); + init_get_bits(&s->gb, s->bitstream_buffer, buf_size*8); p->reference= 0; if(avctx->get_buffer(avctx, p) < 0){ @@ -933,6 +931,8 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, init_put_bits(&s->pb, buf, buf_size, NULL, NULL); *p = *pict; + p->pict_type= FF_I_TYPE; + p->key_frame= 1; if(avctx->pix_fmt == PIX_FMT_YUV422P || avctx->pix_fmt == PIX_FMT_YUV420P){ int lefty, leftu, leftv, y, cy; diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index 5fce7f914..857f1d398 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -485,6 +485,107 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ dst[i+0] += src[i+0]; } +static int pix_norm1_mmx(uint8_t *pix, int line_size) { + int tmp; + asm volatile ( + "movl $16,%%ecx\n" + "pxor %%mm0,%%mm0\n" + "pxor %%mm7,%%mm7\n" + "1:\n" + "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ + "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ + + "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ + + "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ + "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ + + "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ + "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ + "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ + + "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ + "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ + + "pmaddwd %%mm3,%%mm3\n" + "pmaddwd %%mm4,%%mm4\n" + + "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, + pix2^2+pix3^2+pix6^2+pix7^2) */ + "paddd %%mm3,%%mm4\n" + "paddd %%mm2,%%mm7\n" + + "addl %2, %0\n" + "paddd %%mm4,%%mm7\n" + "dec %%ecx\n" + "jnz 1b\n" + + "movq %%mm7,%%mm1\n" + "psrlq $32, %%mm7\n" /* shift hi dword to lo */ + "paddd %%mm7,%%mm1\n" + "movd %%mm1,%1\n" + : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" ); + return tmp; +} + +static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) { + int tmp; + asm volatile ( + "movl $16,%%ecx\n" + "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ + "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ + "1:\n" + "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ + 
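/* annotation: sse16_mmx returns the sum of squared differences between
+       two 16x16 blocks: each pass computes |pix1-pix2| per byte with the
+       two saturated subtractions or-ed together, widens to 16 bits,
+       squares and pair-sums via pmaddwd, and accumulates the 32-bit
+       partial sums in mm7, which is folded into a scalar at the end. */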
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ + "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ + "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: substract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movq %%mm1,%%mm5\n" + "movq %%mm3,%%mm6\n" + "psubusb %%mm2,%%mm1\n" + "psubusb %%mm4,%%mm3\n" + "psubusb %%mm5,%%mm2\n" + "psubusb %%mm6,%%mm4\n" + + "por %%mm1,%%mm2\n" + "por %%mm3,%%mm4\n" + + /* now convert to 16-bit vectors so we can square them */ + "movq %%mm2,%%mm1\n" + "movq %%mm4,%%mm3\n" + + "punpckhbw %%mm0,%%mm2\n" + "punpckhbw %%mm0,%%mm4\n" + "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ + "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ + + "pmaddwd %%mm2,%%mm2\n" + "pmaddwd %%mm4,%%mm4\n" + "pmaddwd %%mm1,%%mm1\n" + "pmaddwd %%mm3,%%mm3\n" + + "addl %3,%0\n" + "addl %3,%1\n" + + "paddd %%mm2,%%mm1\n" + "paddd %%mm4,%%mm3\n" + "paddd %%mm1,%%mm7\n" + "paddd %%mm3,%%mm7\n" + + "decl %%ecx\n" + "jnz 1b\n" + + "movq %%mm7,%%mm1\n" + "psrlq $32, %%mm7\n" /* shift hi dword to lo */ + "paddd %%mm7,%%mm1\n" + "movd %%mm1,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx"); + return tmp; +} + static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ int i=0; asm volatile( @@ -1085,7 +1186,7 @@ static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ }\ \ static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t temp[32];\ + uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ @@ -1096,14 +1197,14 @@ static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ }\ \ static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t temp[32];\ + uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\ }\ \ static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t temp[32];\ + uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ @@ -1114,53 +1215,49 @@ static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ }\ \ static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t temp[32];\ + uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\ }\ static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 8*2 + 18*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 64;\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\ + put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## 
pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\ + OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ }\ static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 8*2 + 18*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 64;\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\ + put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\ + OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ }\ static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 8*2 + 9*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 64;\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\ + put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\ + OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ }\ static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 8*2 + 9*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 64;\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src , 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\ + OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ }\ static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 9*2];\ + uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ @@ -1168,7 +1265,7 @@ static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ }\ static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 9*2];\ + uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ @@ -1176,27 +1273,21 @@ static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ }\ static void OPNAME ## qpel8_mc12_ ## 
MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 8*2 + 9*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 64;\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ + put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ }\ static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[8*2 + 8*2 + 9*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*64;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 64;\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\ + put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ }\ static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[9*2];\ + uint64_t half[9];\ uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ @@ -1241,44 +1332,40 @@ static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\ }\ static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[16*2 + 16*2 + 18*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 256;\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\ + put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\ + OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ }\ static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[16*2 + 16*2 + 18*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 256;\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\ + put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\ + OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, 
stride, 16, 16);\ }\ static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[16*2 + 16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 256;\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\ + put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\ + OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ }\ static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[16*2 + 16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 256;\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src , 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\ + OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ }\ static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ uint64_t half[16*2 + 17*2];\ @@ -1297,24 +1384,18 @@ static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ }\ static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[16*2 + 16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 256;\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ + put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ }\ static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ - uint64_t half[16*2 + 16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 2*256;\ - uint8_t * const halfV= ((uint8_t*)half);\ - uint8_t * const halfHV= ((uint8_t*)half) + 256;\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\ + put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 
16);\ }\ static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ uint64_t half[17*2];\ @@ -1436,6 +1517,9 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->sad[0]= sad16x16_mmx; c->sad[1]= sad8x8_mmx; + + c->pix_norm1 = pix_norm1_mmx; + c->sse[0] = sse16_mmx; if (mm_flags & MM_MMXEXT) { c->pix_abs16x16 = pix_abs16x16_mmx2; @@ -1525,7 +1609,7 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; - + SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index 0ae1cd99d..956edf798 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -58,6 +58,16 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int { MOVQ_BFE(mm6); __asm __volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "addl %4, %1 \n\t" + "addl $8, %2 \n\t" + PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) + "movq %%mm4, (%3) \n\t" + "addl %5, %3 \n\t" + "decl %0 \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" @@ -144,6 +154,19 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in { MOVQ_BFE(mm6); __asm __volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "addl %4, %1 \n\t" + "addl $16, %2 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "addl %5, %3 \n\t" + "decl %0 \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" @@ -271,124 +294,6 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } -static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm __volatile( - ".balign 8 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 64(%2), %%mm2 \n\t" - "movq 136(%2), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm3 \n\t" - "paddusw %%mm1, %%mm3 \n\t" - "psrlw $2, %%mm3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 64(%2), %%mm2 \n\t" - "movq 136(%2), %%mm4 \n\t" - "punpckhbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm1, %%mm4 \n\t" - "psrlw $2, %%mm4 \n\t" - "packuswb %%mm4, %%mm3 \n\t" - "movq %%mm3, (%0) \n\t" - "addl %4, %0 \n\t" - "addl %4, %1 \n\t" - "addl $8, %2 \n\t" - "decl %3 \n\t" - "jnz 1b \n\t" - :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) - :"r"(stride) - :"memory"); -} - -static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm __volatile( - ".balign 8 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), 
%%mm1 \n\t" - "movq 256(%2), %%mm2 \n\t" - "movq 528(%2), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm3 \n\t" - "paddusw %%mm1, %%mm3 \n\t" - "psrlw $2, %%mm3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 256(%2), %%mm2 \n\t" - "movq 528(%2), %%mm4 \n\t" - "punpckhbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm1, %%mm4 \n\t" - "psrlw $2, %%mm4 \n\t" - "packuswb %%mm4, %%mm3 \n\t" - "movq %%mm3, (%0) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 264(%2), %%mm2 \n\t" - "movq 536(%2), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm3 \n\t" - "paddusw %%mm1, %%mm3 \n\t" - "psrlw $2, %%mm3 \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 264(%2), %%mm2 \n\t" - "movq 536(%2), %%mm4 \n\t" - "punpckhbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm1, %%mm4 \n\t" - "psrlw $2, %%mm4 \n\t" - "packuswb %%mm4, %%mm3 \n\t" - "movq %%mm3, 8(%0) \n\t" - "addl %4, %0 \n\t" - "addl %4, %1 \n\t" - "addl $16, %2 \n\t" - "decl %3 \n\t" - "jnz 1b \n\t" - :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) - :"r"(stride) - :"memory"); -} - // avg_pixels // in case more speed is needed - unroling would certainly help static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) @@ -641,133 +546,6 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } -static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - MOVQ_BFE(mm5); - __asm __volatile( - ".balign 8 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 64(%2), %%mm2 \n\t" - "movq 136(%2), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm3 \n\t" - "paddusw %%mm1, %%mm3 \n\t" - "psrlw $2, %%mm3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 64(%2), %%mm2 \n\t" - "movq 136(%2), %%mm4 \n\t" - "punpckhbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm1, %%mm4 \n\t" - "psrlw $2, %%mm4 \n\t" - "packuswb %%mm4, %%mm3 \n\t" - "movq (%0), %%mm4 \n\t" - PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) - "movq %%mm0, (%0) \n\t" - "addl %4, %0 \n\t" - "addl %4, %1 \n\t" - "addl $8, %2 \n\t" - "decl %3 \n\t" - "jnz 1b \n\t" - :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) - :"r"(stride) - :"memory"); -} - -static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - 
MOVQ_BFE(mm5); - __asm __volatile( - ".balign 8 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 256(%2), %%mm2 \n\t" - "movq 528(%2), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm3 \n\t" - "paddusw %%mm1, %%mm3 \n\t" - "psrlw $2, %%mm3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 256(%2), %%mm2 \n\t" - "movq 528(%2), %%mm4 \n\t" - "punpckhbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm1, %%mm4 \n\t" - "psrlw $2, %%mm4 \n\t" - "packuswb %%mm4, %%mm3 \n\t" - "movq (%0), %%mm4 \n\t" - PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) - "movq %%mm0, (%0) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 264(%2), %%mm2 \n\t" - "movq 536(%2), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm3 \n\t" - "paddusw %%mm1, %%mm3 \n\t" - "psrlw $2, %%mm3 \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 264(%2), %%mm2 \n\t" - "movq 536(%2), %%mm4 \n\t" - "punpckhbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm0, %%mm1 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm1, %%mm4 \n\t" - "psrlw $2, %%mm4 \n\t" - "packuswb %%mm4, %%mm3 \n\t" - "movq 8(%0), %%mm4 \n\t" - PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) - "movq %%mm0, 8(%0) \n\t" - "addl %4, %0 \n\t" - "addl %4, %1 \n\t" - "addl $16, %2 \n\t" - "decl %3 \n\t" - "jnz 1b \n\t" - :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) - :"r"(stride) - :"memory"); -} - - //FIXME optimize static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ DEF(put, pixels8_y2)(block , pixels , line_size, h); diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index 799ff1666..ead30ed31 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -53,8 +53,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if (!s->h263_aic) { #if 1 asm volatile ( - "xorl %%edx, %%edx \n\t" - "mul %%ecx \n\t" + "imul %%ecx \n\t" : "=d" (level), "=a"(dummy) : "a" ((block[0]>>2) + q), "c" (inverse[q<<1]) ); diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c index bdf6fe65d..2304092fd 100644 --- a/src/libffmpeg/libavcodec/imgconvert.c +++ b/src/libffmpeg/libavcodec/imgconvert.c @@ -1,6 +1,6 @@ /* * Misc image convertion routines - * Copyright (c) 2001, 2002 Fabrice Bellard. + * Copyright (c) 2001, 2002, 2003 Fabrice Bellard. 
* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -26,14 +26,220 @@ #ifdef HAVE_MMX #include "i386/mmx.h" #endif + +typedef struct PixFmtInfo { + const char *name; + UINT8 nb_components; /* number of components in AVPicture array */ + UINT8 is_yuv : 1; /* true if YUV instead of RGB color space */ + UINT8 is_packed : 1; /* true if multiple components in same word */ + UINT8 is_paletted : 1; /* true if paletted */ + UINT8 is_alpha : 1; /* true if alpha can be specified */ + UINT8 is_gray : 1; /* true if gray or monochrome format */ + UINT8 x_chroma_shift; /* X chroma subsampling factor is 2 ^ shift */ + UINT8 y_chroma_shift; /* Y chroma subsampling factor is 2 ^ shift */ +} PixFmtInfo; + +/* this table gives more information about formats */ +static PixFmtInfo pix_fmt_info[PIX_FMT_NB] = { + /* YUV formats */ + [PIX_FMT_YUV420P] = { + .name = "yuv420p", + .nb_components = 3, .is_yuv = 1, + .x_chroma_shift = 1, .y_chroma_shift = 1, + }, + [PIX_FMT_YUV422P] = { + .name = "yuv422p", + .nb_components = 3, .is_yuv = 1, + .x_chroma_shift = 1, .y_chroma_shift = 0, + }, + [PIX_FMT_YUV444P] = { + .name = "yuv444p", + .nb_components = 3, .is_yuv = 1, + .x_chroma_shift = 0, .y_chroma_shift = 0, + }, + [PIX_FMT_YUV422] = { + .name = "yuv422", + .nb_components = 1, .is_yuv = 1, .is_packed = 1, + .x_chroma_shift = 1, .y_chroma_shift = 0, + }, + [PIX_FMT_YUV410P] = { + .name = "yuv410p", + .nb_components = 3, .is_yuv = 1, + .x_chroma_shift = 2, .y_chroma_shift = 2, + }, + [PIX_FMT_YUV411P] = { + .name = "yuv411p", + .nb_components = 3, .is_yuv = 1, + .x_chroma_shift = 2, .y_chroma_shift = 0, + }, + + /* RGB formats */ + [PIX_FMT_RGB24] = { + .name = "rgb24", + .nb_components = 1, .is_packed = 1, + }, + [PIX_FMT_BGR24] = { + .name = "bgr24", + .nb_components = 1, .is_packed = 1, + }, + [PIX_FMT_RGBA32] = { + .name = "rgba32", + .nb_components = 1, .is_packed = 1, .is_alpha = 1, + }, + [PIX_FMT_RGB565] = { + .name = "rgb565", + .nb_components = 1, .is_packed = 1, + }, + [PIX_FMT_RGB555] = { + .name = "rgb555", + .nb_components = 1, .is_packed = 1, .is_alpha = 1, + }, + + /* gray / mono formats */ + [PIX_FMT_GRAY8] = { + .name = "gray", + .nb_components = 1, .is_gray = 1, + }, + [PIX_FMT_MONOWHITE] = { + .name = "monow", + .nb_components = 1, .is_packed = 1, .is_gray = 1, + }, + [PIX_FMT_MONOBLACK] = { + .name = "monob", + .nb_components = 1, .is_packed = 1, .is_gray = 1, + }, +}; + +void avcodec_get_chroma_sub_sample(int pix_fmt, int *h_shift, int *v_shift) +{ + if (pix_fmt_info[pix_fmt].is_yuv) { + *h_shift = pix_fmt_info[pix_fmt].x_chroma_shift; + *v_shift = pix_fmt_info[pix_fmt].y_chroma_shift; + } else { + *h_shift=0; + *v_shift=0; + } +} + +const char *avcodec_get_pix_fmt_name(int pix_fmt) +{ + if (pix_fmt < 0 || pix_fmt >= PIX_FMT_NB) + return "???"; + else + return pix_fmt_info[pix_fmt].name; +} + +/* Picture field are filled with 'ptr' addresses. 
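   (Worked example, added as an illustration: for PIX_FMT_YUV420P with a
   16x16 picture, data[0]=ptr, data[1]=ptr+256, data[2]=ptr+320,
   linesize[] = {16, 8, 8}, and the return value is 384 = 16*16*3/2 bytes.)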
Also return size */ +int avpicture_fill(AVPicture *picture, UINT8 *ptr, + int pix_fmt, int width, int height) +{ + int size; + + size = width * height; + switch(pix_fmt) { + case PIX_FMT_YUV420P: + picture->data[0] = ptr; + picture->data[1] = picture->data[0] + size; + picture->data[2] = picture->data[1] + size / 4; + picture->linesize[0] = width; + picture->linesize[1] = width / 2; + picture->linesize[2] = width / 2; + return (size * 3) / 2; + case PIX_FMT_RGB24: + case PIX_FMT_BGR24: + picture->data[0] = ptr; + picture->data[1] = NULL; + picture->data[2] = NULL; + picture->linesize[0] = width * 3; + return size * 3; + case PIX_FMT_YUV422P: + picture->data[0] = ptr; + picture->data[1] = picture->data[0] + size; + picture->data[2] = picture->data[1] + size / 2; + picture->linesize[0] = width; + picture->linesize[1] = width / 2; + picture->linesize[2] = width / 2; + return (size * 2); + case PIX_FMT_YUV444P: + picture->data[0] = ptr; + picture->data[1] = picture->data[0] + size; + picture->data[2] = picture->data[1] + size; + picture->linesize[0] = width; + picture->linesize[1] = width; + picture->linesize[2] = width; + return size * 3; + case PIX_FMT_RGBA32: + picture->data[0] = ptr; + picture->data[1] = NULL; + picture->data[2] = NULL; + picture->linesize[0] = width * 4; + return size * 4; + case PIX_FMT_YUV410P: + picture->data[0] = ptr; + picture->data[1] = picture->data[0] + size; + picture->data[2] = picture->data[1] + size / 16; + picture->linesize[0] = width; + picture->linesize[1] = width / 4; + picture->linesize[2] = width / 4; + return size + (size / 8); + case PIX_FMT_YUV411P: + picture->data[0] = ptr; + picture->data[1] = picture->data[0] + size; + picture->data[2] = picture->data[1] + size / 4; + picture->linesize[0] = width; + picture->linesize[1] = width / 4; + picture->linesize[2] = width / 4; + return size + (size / 2); + case PIX_FMT_RGB555: + case PIX_FMT_RGB565: + case PIX_FMT_YUV422: + picture->data[0] = ptr; + picture->data[1] = NULL; + picture->data[2] = NULL; + picture->linesize[0] = width * 2; + return size * 2; + case PIX_FMT_GRAY8: + picture->data[0] = ptr; + picture->data[1] = NULL; + picture->data[2] = NULL; + picture->linesize[0] = width; + return size; + case PIX_FMT_MONOWHITE: + case PIX_FMT_MONOBLACK: + picture->data[0] = ptr; + picture->data[1] = NULL; + picture->data[2] = NULL; + picture->linesize[0] = (width + 7) >> 3; + return picture->linesize[0] * height; + default: + picture->data[0] = NULL; + picture->data[1] = NULL; + picture->data[2] = NULL; + return -1; + } +} + +int avpicture_get_size(int pix_fmt, int width, int height) +{ + AVPicture dummy_pict; + return avpicture_fill(&dummy_pict, NULL, pix_fmt, width, height); +} + + /* XXX: totally non optimized */ -static void yuv422_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height) +static void yuv422_to_yuv420p(AVPicture *dst, AVPicture *src, + int width, int height) { + UINT8 *lum, *cb, *cr; int x, y; - UINT8 *p = src; - + const UINT8 *p; + + lum = dst->data[0]; + cb = dst->data[1]; + cr = dst->data[2]; + p = src->data[0]; + for(y=0;y<height;y+=2) { for(x=0;x<width;x+=2) { lum[0] = p[0]; @@ -58,342 +264,6 @@ static void yuv422_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, #define ONE_HALF (1 << (SCALEBITS - 1)) #define FIX(x) ((int) ((x) * (1L<<SCALEBITS) + 0.5)) -static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height) -{ - int wrap, wrap3, x, y; - int r, g, b, r1, g1, b1; - UINT8 *p; - - wrap = width; - wrap3 = width * 3; - p = 
src; - for(y=0;y<height;y+=2) { - for(x=0;x<width;x+=2) { - r = p[0]; - g = p[1]; - b = p[2]; - r1 = r; - g1 = g; - b1 = b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - r = p[3]; - g = p[4]; - b = p[5]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - p += wrap3; - lum += wrap; - - r = p[0]; - g = p[1]; - b = p[2]; - r1 += r; - g1 += g; - b1 += b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - r = p[3]; - g = p[4]; - b = p[5]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - - cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + - FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - - cb++; - cr++; - p += -wrap3 + 2 * 3; - lum += -wrap + 2; - } - p += wrap3; - lum += wrap; - } -} - -static void rgba32_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height) -{ - int wrap, wrap4, x, y; - int r, g, b, r1, g1, b1; - UINT8 *p; - - wrap = width; - wrap4 = width * 4; - p = src; - for(y=0;y<height;y+=2) { - for(x=0;x<width;x+=2) { - r = p[0]; - g = p[1]; - b = p[2]; - r1 = r; - g1 = g; - b1 = b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - r = p[4]; - g = p[5]; - b = p[6]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - p += wrap4; - lum += wrap; - - r = p[0]; - g = p[1]; - b = p[2]; - r1 += r; - g1 += g; - b1 += b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - r = p[4]; - g = p[5]; - b = p[6]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - - cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + - FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - - cb++; - cr++; - p += -wrap4 + 2 * 4; - lum += -wrap + 2; - } - p += wrap4; - lum += wrap; - } -} - -#define rgb565_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0800,31, 0x0020,63,0x0001,31) -#define rgb555_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0400,31, 0x0020,31,0x0001,31) -#define rgb5551_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0800,31, 0x0040,31,0x0002,31) -#define bgr565_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0001,31, 0x0020,63,0x0800,31) -#define bgr555_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0001,31, 0x0020,31,0x0400,31) -#define gbr565_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0001,31, 0x0800,31,0x0040,63) -#define gbr555_to_yuv420p(lum,cb,cr,src,width,height) rgbmisc_to_yuv420p((lum),(cb),(cr),(src),(width),(height),0x0001,31, 0x0400,31,0x0020,31) - -static void rgbmisc_to_yuv420p - (UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height, - - UINT16 R_LOWMASK, UINT16 R_MAX, - 
UINT16 G_LOWMASK, UINT16 G_MAX, - UINT16 B_LOWMASK, UINT16 B_MAX - ) -{ - int wrap, wrap2, x, y; - int r, g, b, r1, g1, b1; - UINT8 *p; - UINT16 pixel; - - wrap = width; - wrap2 = width * 2; - p = src; - for(y=0;y<height;y+=2) { - for(x=0;x<width;x+=2) { - pixel = p[0] | (p[1]<<8); - r = (((pixel/R_LOWMASK) & R_MAX) * (0x100 / (R_MAX+1))); - g = (((pixel/G_LOWMASK) & G_MAX) * (0x100 / (G_MAX+1))); - b = (((pixel/B_LOWMASK) & B_MAX) * (0x100 / (B_MAX+1))); - r1 = r; - g1 = g; - b1 = b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - - pixel = p[2] | (p[3]<<8); - r = (((pixel/R_LOWMASK) & R_MAX) * (0x100 / (R_MAX+1))); - g = (((pixel/G_LOWMASK) & G_MAX) * (0x100 / (G_MAX+1))); - b = (((pixel/B_LOWMASK) & B_MAX) * (0x100 / (B_MAX+1))); - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - p += wrap2; - lum += wrap; - - pixel = p[0] | (p[1]<<8); - r = (((pixel/R_LOWMASK) & R_MAX) * (0x100 / (R_MAX+1))); - g = (((pixel/G_LOWMASK) & G_MAX) * (0x100 / (G_MAX+1))); - b = (((pixel/B_LOWMASK) & B_MAX) * (0x100 / (B_MAX+1))); - r1 += r; - g1 += g; - b1 += b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - pixel = p[2] | (p[3]<<8); - r = (((pixel/R_LOWMASK) & R_MAX) * (0x100 / (R_MAX+1))); - g = (((pixel/G_LOWMASK) & G_MAX) * (0x100 / (G_MAX+1))); - b = (((pixel/B_LOWMASK) & B_MAX) * (0x100 / (B_MAX+1))); - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - - cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + - FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - - cb++; - cr++; - p += -wrap2 + 2 * 2; - lum += -wrap + 2; - } - p += wrap2; - lum += wrap; - } -} - - -static void bgr24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height) -{ - int wrap, wrap3, x, y; - int r, g, b, r1, g1, b1; - UINT8 *p; - - wrap = width; - wrap3 = width * 3; - p = src; - for(y=0;y<height;y+=2) { - for(x=0;x<width;x+=2) { - b = p[0]; - g = p[1]; - r = p[2]; - r1 = r; - g1 = g; - b1 = b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - b = p[3]; - g = p[4]; - r = p[5]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - p += wrap3; - lum += wrap; - - b = p[0]; - g = p[1]; - r = p[2]; - r1 += r; - g1 += g; - b1 += b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - b = p[3]; - g = p[4]; - r = p[5]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - - cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + - FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - - cb++; - cr++; - p += -wrap3 + 2 * 3; - lum += -wrap + 2; - } - p += wrap3; - lum += wrap; - } -} - -static void bgra32_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, - UINT8 *src, int width, int height) -{ - int wrap, wrap4, x, y; - int r, g, b, r1, g1, b1; - UINT8 *p; - - wrap = width; - wrap4 = width * 4; - p = src; - for(y=0;y<height;y+=2) { - for(x=0;x<width;x+=2) { - b = p[0]; - g = 
p[1]; - r = p[2]; - r1 = r; - g1 = g; - b1 = b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - b = p[4]; - g = p[5]; - r = p[6]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - p += wrap4; - lum += wrap; - - b = p[0]; - g = p[1]; - r = p[2]; - r1 += r; - g1 += g; - b1 += b; - lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - b = p[4]; - g = p[5]; - r = p[6]; - r1 += r; - g1 += g; - b1 += b; - lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + - FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; - - cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + - FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; - - cb++; - cr++; - p += -wrap4 + 2 * 4; - lum += -wrap + 2; - } - p += wrap4; - lum += wrap; - } -} - /* XXX: use generic filter ? */ /* 1x2 -> 1x1 */ static void shrink2(UINT8 *dst, int dst_wrap, @@ -487,7 +357,7 @@ static void grow22(UINT8 *dst, int dst_wrap, } } -/* 1x2 -> 2x1. width and height are given for the source picture */ +/* 1x2 -> 2x1 */ static void conv411(UINT8 *dst, int dst_wrap, UINT8 *src, int src_wrap, int width, int height) @@ -495,7 +365,7 @@ static void conv411(UINT8 *dst, int dst_wrap, int w, c; UINT8 *s1, *s2, *d; - for(;height > 0; height -= 2) { + for(;height > 0; height--) { s1 = src; s2 = src + src_wrap; d = dst; @@ -531,7 +401,7 @@ static void img_copy(UINT8 *dst, int dst_wrap, #define C_GU (13954 >> (16 - SCALE_BITS)) #define C_GV (34903 >> (16 - SCALE_BITS)) -#define RGBOUT(r, g, b, y1)\ +#define YUV_TO_RGB2(r, g, b, y1)\ {\ y = (y1 - 16) * C_Y;\ r = cm[(y + r_add) >> SCALE_BITS];\ @@ -540,340 +410,816 @@ static void img_copy(UINT8 *dst, int dst_wrap, } /* XXX: no chroma interpolating is done */ -static void yuv420p_to_bgra32(AVPicture *dst, AVPicture *src, - int width, int height) +#define RGB_FUNCTIONS(rgb_name) \ + \ +static void yuv420p_to_ ## rgb_name (AVPicture *dst, AVPicture *src, \ + int width, int height) \ +{ \ + UINT8 *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2; \ + int w, y, cb, cr, r_add, g_add, b_add, width2; \ + UINT8 *cm = cropTbl + MAX_NEG_CROP; \ + unsigned int r, g, b; \ + \ + d = dst->data[0]; \ + y1_ptr = src->data[0]; \ + cb_ptr = src->data[1]; \ + cr_ptr = src->data[2]; \ + width2 = width >> 1; \ + for(;height > 0; height -= 2) { \ + d1 = d; \ + d2 = d + dst->linesize[0]; \ + y2_ptr = y1_ptr + src->linesize[0]; \ + for(w = width2; w > 0; w --) { \ + cb = cb_ptr[0] - 128; \ + cr = cr_ptr[0] - 128; \ + r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); \ + g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); \ + b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); \ + \ + /* output 4 pixels */ \ + YUV_TO_RGB2(r, g, b, y1_ptr[0]); \ + RGB_OUT(d1, r, g, b); \ + \ + YUV_TO_RGB2(r, g, b, y1_ptr[1]); \ + RGB_OUT(d1 + BPP, r, g, b); \ + \ + YUV_TO_RGB2(r, g, b, y2_ptr[0]); \ + RGB_OUT(d2, r, g, b); \ + \ + YUV_TO_RGB2(r, g, b, y2_ptr[1]); \ + RGB_OUT(d2 + BPP, r, g, b); \ + \ + d1 += 2 * BPP; \ + d2 += 2 * BPP; \ + \ + y1_ptr += 2; \ + y2_ptr += 2; \ + cb_ptr++; \ + cr_ptr++; \ + } \ + d += 2 * dst->linesize[0]; \ + y1_ptr += 2 * src->linesize[0] - width; \ + cb_ptr += src->linesize[1] - width2; \ + cr_ptr += src->linesize[2] - width2; \ + } \ +} \ + \ +/* XXX: no chroma interpolating is done */ \ +static void yuv422p_to_ ## rgb_name (AVPicture *dst, 
AVPicture *src, \ + int width, int height) \ +{ \ + UINT8 *y1_ptr, *cb_ptr, *cr_ptr, *d, *d1; \ + int w, y, cb, cr, r_add, g_add, b_add, width2; \ + UINT8 *cm = cropTbl + MAX_NEG_CROP; \ + unsigned int r, g, b; \ + \ + d = dst->data[0]; \ + y1_ptr = src->data[0]; \ + cb_ptr = src->data[1]; \ + cr_ptr = src->data[2]; \ + width2 = width >> 1; \ + for(;height > 0; height --) { \ + d1 = d; \ + for(w = width2; w > 0; w --) { \ + cb = cb_ptr[0] - 128; \ + cr = cr_ptr[0] - 128; \ + r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); \ + g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); \ + b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); \ + \ + /* output 2 pixels */ \ + YUV_TO_RGB2(r, g, b, y1_ptr[0]); \ + RGB_OUT(d, r, g, b); \ + \ + YUV_TO_RGB2(r, g, b, y1_ptr[1]); \ + RGB_OUT(d + BPP, r, g, b); \ + \ + d += 2 * BPP; \ + \ + y1_ptr += 2; \ + cb_ptr++; \ + cr_ptr++; \ + } \ + d += dst->linesize[0]; \ + y1_ptr += src->linesize[0] - width; \ + cb_ptr += src->linesize[1] - width2; \ + cr_ptr += src->linesize[2] - width2; \ + } \ +} \ + \ +static void rgb_name ## _to_yuv420p(AVPicture *dst, AVPicture *src, \ + int width, int height) \ +{ \ + int wrap, wrap3, x, y; \ + int r, g, b, r1, g1, b1; \ + UINT8 *lum, *cb, *cr; \ + const UINT8 *p; \ + \ + lum = dst->data[0]; \ + cb = dst->data[1]; \ + cr = dst->data[2]; \ + \ + wrap = width; \ + wrap3 = width * BPP; \ + p = src->data[0]; \ + for(y=0;y<height;y+=2) { \ + for(x=0;x<width;x+=2) { \ + RGB_IN(r, g, b, p); \ + r1 = r; \ + g1 = g; \ + b1 = b; \ + lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + \ + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; \ + RGB_IN(r, g, b, p + BPP); \ + r1 += r; \ + g1 += g; \ + b1 += b; \ + lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + \ + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; \ + p += wrap3; \ + lum += wrap; \ + \ + RGB_IN(r, g, b, p); \ + r1 += r; \ + g1 += g; \ + b1 += b; \ + lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + \ + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; \ + \ + RGB_IN(r, g, b, p + BPP); \ + r1 += r; \ + g1 += g; \ + b1 += b; \ + lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + \ + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; \ + \ + cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + \ + FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> \ + (SCALEBITS + 2)) + 128; \ + cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - \ + FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> \ + (SCALEBITS + 2)) + 128; \ + \ + cb++; \ + cr++; \ + p += -wrap3 + 2 * BPP; \ + lum += -wrap + 2; \ + } \ + p += wrap3; \ + lum += wrap; \ + } \ +} \ + \ +static void rgb_name ## _to_gray(AVPicture *dst, AVPicture *src, \ + int width, int height) \ +{ \ + const unsigned char *p; \ + unsigned char *q; \ + int r, g, b, dst_wrap, src_wrap; \ + int x, y; \ + \ + p = src->data[0]; \ + src_wrap = src->linesize[0] - BPP * width; \ + \ + q = dst->data[0]; \ + dst_wrap = dst->linesize[0] - width; \ + \ + for(y=0;y<height;y++) { \ + for(x=0;x<width;x++) { \ + RGB_IN(r, g, b, p); \ + q[0] = (FIX(0.29900) * r + FIX(0.58700) * g + \ + FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; \ + q++; \ + p += BPP; \ + } \ + p += src_wrap; \ + q += dst_wrap; \ + } \ +} \ + \ +static void gray_to_ ## rgb_name(AVPicture *dst, AVPicture *src, \ + int width, int height) \ +{ \ + const unsigned char *p; \ + unsigned char *q; \ + int r, dst_wrap, src_wrap; \ + int x, y; \ + \ + p = src->data[0]; \ + src_wrap = src->linesize[0] - width; \ + \ + q = dst->data[0]; \ + dst_wrap = dst->linesize[0] - BPP * width; \ + \ + for(y=0;y<height;y++) { \ + for(x=0;x<width;x++) { \ + r = p[0]; \ + 
RGB_OUT(q, r, r, r); \ + q += BPP; \ + p ++; \ + } \ + p += src_wrap; \ + q += dst_wrap; \ + } \ +} + +/* copy bit n to bits 0 ... n - 1 */ +static inline unsigned int bitcopy_n(unsigned int a, int n) { - UINT8 *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2; - int w, y, cb, cr, r_add, g_add, b_add, width2; - UINT8 *cm = cropTbl + MAX_NEG_CROP; + int mask; + mask = (1 << n) - 1; + return (a & (0xff & ~mask)) | ((-((a >> n) & 1)) & mask); +} - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - width2 = width >> 1; - for(;height > 0; height -= 2) { - d1 = d; - d2 = d + dst->linesize[0]; - y2_ptr = y1_ptr + src->linesize[0]; - for(w = width2; w > 0; w --) { - cb = cb_ptr[0] - 128; - cr = cr_ptr[0] - 128; - r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); - g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); - b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); - - /* output 4 pixels */ - RGBOUT(d1[2], d1[1], d1[0], y1_ptr[0]); - RGBOUT(d1[6], d1[5], d1[4], y1_ptr[1]); - RGBOUT(d2[2], d2[1], d2[0], y2_ptr[0]); - RGBOUT(d2[6], d2[5], d2[4], y2_ptr[1]); - - d1[3] = d1[7] = d2[3] = d2[7] = 255; - - d1 += 8; - d2 += 8; - y1_ptr += 2; - y2_ptr += 2; - cb_ptr++; - cr_ptr++; +/* rgb555 handling */ + +#define RGB_IN(r, g, b, s)\ +{\ + unsigned int v = ((UINT16 *)(s))[0];\ + r = bitcopy_n(v >> (10 - 3), 3);\ + g = bitcopy_n(v >> (5 - 3), 3);\ + b = bitcopy_n(v << 3, 3);\ +} + +#define RGB_OUT(d, r, g, b)\ +{\ + ((UINT16 *)(d))[0] = ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3) | 0x8000;\ +} + +#define BPP 2 + +RGB_FUNCTIONS(rgb555) + +#undef RGB_IN +#undef RGB_OUT +#undef BPP + +/* rgb565 handling */ + +#define RGB_IN(r, g, b, s)\ +{\ + unsigned int v = ((UINT16 *)(s))[0];\ + r = bitcopy_n(v >> (11 - 3), 3);\ + g = bitcopy_n(v >> (5 - 2), 2);\ + b = bitcopy_n(v << 3, 3);\ +} + +#define RGB_OUT(d, r, g, b)\ +{\ + ((UINT16 *)(d))[0] = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);\ +} + +#define BPP 2 + +RGB_FUNCTIONS(rgb565) + +#undef RGB_IN +#undef RGB_OUT +#undef BPP + +/* bgr24 handling */ + +#define RGB_IN(r, g, b, s)\ +{\ + b = (s)[0];\ + g = (s)[1];\ + r = (s)[2];\ +} + +#define RGB_OUT(d, r, g, b)\ +{\ + (d)[0] = b;\ + (d)[1] = g;\ + (d)[2] = r;\ +} + +#define BPP 3 + +RGB_FUNCTIONS(bgr24) + +#undef RGB_IN +#undef RGB_OUT +#undef BPP + +/* rgb24 handling */ + +#define RGB_IN(r, g, b, s)\ +{\ + r = (s)[0];\ + g = (s)[1];\ + b = (s)[2];\ +} + +#define RGB_OUT(d, r, g, b)\ +{\ + (d)[0] = r;\ + (d)[1] = g;\ + (d)[2] = b;\ +} + +#define BPP 3 + +RGB_FUNCTIONS(rgb24) + +#undef RGB_IN +#undef RGB_OUT +#undef BPP + +/* rgba32 handling */ + +#define RGB_IN(r, g, b, s)\ +{\ + unsigned int v = ((UINT32 *)(s))[0];\ + r = (v >> 16) & 0xff;\ + g = (v >> 8) & 0xff;\ + b = v & 0xff;\ +} + +#define RGB_OUT(d, r, g, b)\ +{\ + ((UINT32 *)(d))[0] = (0xff << 24) | (r << 16) | (g << 8) | b;\ +} + +#define BPP 4 + +RGB_FUNCTIONS(rgba32) + +#undef RGB_IN +#undef RGB_OUT +#undef BPP + + +static void rgb24_to_rgb565(AVPicture *dst, AVPicture *src, + int width, int height) +{ + const unsigned char *p; + unsigned char *q; + int r, g, b, dst_wrap, src_wrap; + int x, y; + + p = src->data[0]; + src_wrap = src->linesize[0] - 3 * width; + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - 2 * width; + + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + r = p[0]; + g = p[1]; + b = p[2]; + + ((unsigned short *)q)[0] = + ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3); + q += 2; + p += 3; } - d += 2 * dst->linesize[0]; - y1_ptr += 2 * src->linesize[0] - width; - cb_ptr += src->linesize[1] - 
width2; - cr_ptr += src->linesize[2] - width2; + p += src_wrap; + q += dst_wrap; } } -/* XXX: no chroma interpolating is done */ -static void yuv420p_to_rgba32(AVPicture *dst, AVPicture *src, - int width, int height) +/* NOTE: we also add a dummy alpha bit */ +static void rgb24_to_rgb555(AVPicture *dst, AVPicture *src, + int width, int height) { - UINT8 *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2; - int w, y, cb, cr, r_add, g_add, b_add, width2; - UINT8 *cm = cropTbl + MAX_NEG_CROP; + const unsigned char *p; + unsigned char *q; + int r, g, b, dst_wrap, src_wrap; + int x, y; - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - width2 = width >> 1; - for(;height > 0; height -= 2) { - d1 = d; - d2 = d + dst->linesize[0]; - y2_ptr = y1_ptr + src->linesize[0]; - for(w = width2; w > 0; w --) { - cb = cb_ptr[0] - 128; - cr = cr_ptr[0] - 128; - r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); - g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); - b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); - - /* output 4 pixels */ - RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]); - RGBOUT(d1[4], d1[5], d1[6], y1_ptr[1]); - RGBOUT(d2[0], d2[1], d2[2], y2_ptr[0]); - RGBOUT(d2[4], d2[5], d2[6], y2_ptr[1]); - - d1[3] = d1[7] = d2[3] = d2[7] = 255; - - d1 += 8; - d2 += 8; - y1_ptr += 2; - y2_ptr += 2; - cb_ptr++; - cr_ptr++; + p = src->data[0]; + src_wrap = src->linesize[0] - 3 * width; + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - 2 * width; + + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + r = p[0]; + g = p[1]; + b = p[2]; + + ((unsigned short *)q)[0] = + ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3) | 0x8000; + q += 2; + p += 3; } - d += 2 * dst->linesize[0]; - y1_ptr += 2 * src->linesize[0] - width; - cb_ptr += src->linesize[1] - width2; - cr_ptr += src->linesize[2] - width2; + p += src_wrap; + q += dst_wrap; } } -/* XXX: no chroma interpolating is done */ -static void yuv420p_to_rgb24(AVPicture *dst, AVPicture *src, - int width, int height) +static void mono_to_gray(AVPicture *dst, AVPicture *src, + int width, int height, int xor_mask) { - UINT8 *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2; - int w, y, cb, cr, r_add, g_add, b_add, width2; - UINT8 *cm = cropTbl + MAX_NEG_CROP; - - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - width2 = width >> 1; - for(;height > 0; height -= 2) { - d1 = d; - d2 = d + dst->linesize[0]; - y2_ptr = y1_ptr + src->linesize[0]; - for(w = width2; w > 0; w --) { - cb = cb_ptr[0] - 128; - cr = cr_ptr[0] - 128; - r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); - g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); - b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); - - /* output 4 pixels */ - RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]); - RGBOUT(d1[3], d1[4], d1[5], y1_ptr[1]); - RGBOUT(d2[0], d2[1], d2[2], y2_ptr[0]); - RGBOUT(d2[3], d2[4], d2[5], y2_ptr[1]); - - d1 += 6; - d2 += 6; - y1_ptr += 2; - y2_ptr += 2; - cb_ptr++; - cr_ptr++; + const unsigned char *p; + unsigned char *q; + int v, dst_wrap, src_wrap; + int y, w; + + p = src->data[0]; + src_wrap = src->linesize[0] - ((width + 7) >> 3); + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - width; + for(y=0;y<height;y++) { + w = width; + while (w >= 8) { + v = *p++ ^ xor_mask; + q[0] = -(v >> 7); + q[1] = -((v >> 6) & 1); + q[2] = -((v >> 5) & 1); + q[3] = -((v >> 4) & 1); + q[4] = -((v >> 3) & 1); + q[5] = -((v >> 2) & 1); + q[6] = -((v >> 1) & 1); + q[7] = -((v >> 0) & 1); + w -= 8; + q += 8; + } + if (w > 0) { + v = *p++ ^ 
xor_mask; + do { + q[0] = -((v >> 7) & 1); + q++; + v <<= 1; + } while (--w); } - d += 2 * dst->linesize[0]; - y1_ptr += 2 * src->linesize[0] - width; - cb_ptr += src->linesize[1] - width2; - cr_ptr += src->linesize[2] - width2; + p += src_wrap; + q += dst_wrap; } } -/* XXX: no chroma interpolating is done */ -static void yuv422p_to_rgb24(AVPicture *dst, AVPicture *src, - int width, int height) +static void monowhite_to_gray(AVPicture *dst, AVPicture *src, + int width, int height) { - UINT8 *y1_ptr, *cb_ptr, *cr_ptr, *d, *d1; - int w, y, cb, cr, r_add, g_add, b_add, width2; - UINT8 *cm = cropTbl + MAX_NEG_CROP; + mono_to_gray(dst, src, width, height, 0xff); +} + +static void monoblack_to_gray(AVPicture *dst, AVPicture *src, + int width, int height) +{ + mono_to_gray(dst, src, width, height, 0x00); +} + +static void gray_to_mono(AVPicture *dst, AVPicture *src, + int width, int height, int xor_mask) +{ + int n; + const UINT8 *s; + UINT8 *d; + int j, b, v, n1, src_wrap, dst_wrap, y; + + s = src->data[0]; + src_wrap = src->linesize[0] - width; d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - width2 = width >> 1; - for(;height > 0; height --) { - d1 = d; - for(w = width2; w > 0; w --) { - cb = cb_ptr[0] - 128; - cr = cr_ptr[0] - 128; - r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); - g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); - b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); - - /* output 2 pixels */ - RGBOUT(d1[0], d1[1], d1[2], y1_ptr[0]); - RGBOUT(d1[3], d1[4], d1[5], y1_ptr[1]); - - d1 += 6; - y1_ptr += 2; - cb_ptr++; - cr_ptr++; + dst_wrap = dst->linesize[0] - ((width + 7) >> 3); + printf("%d %d\n", width, height); + + for(y=0;y<height;y++) { + n = width; + while (n >= 8) { + v = 0; + for(j=0;j<8;j++) { + b = s[0]; + s++; + v = (v << 1) | (b >> 7); + } + d[0] = v ^ xor_mask; + d++; + n -= 8; + } + if (n > 0) { + n1 = n; + v = 0; + while (n > 0) { + b = s[0]; + s++; + v = (v << 1) | (b >> 7); + n--; + } + d[0] = (v << (8 - (n1 & 7))) ^ xor_mask; + d++; } - d += dst->linesize[0]; - y1_ptr += src->linesize[0] - width; - cb_ptr += src->linesize[1] - width2; - cr_ptr += src->linesize[2] - width2; + s += src_wrap; + d += dst_wrap; } } +static void gray_to_monowhite(AVPicture *dst, AVPicture *src, + int width, int height) +{ + gray_to_mono(dst, src, width, height, 0xff); +} + +static void gray_to_monoblack(AVPicture *dst, AVPicture *src, + int width, int height) +{ + gray_to_mono(dst, src, width, height, 0x00); +} + +typedef struct ConvertEntry { + void (*convert)(AVPicture *dst, AVPicture *src, int width, int height); +} ConvertEntry; + +/* add each new convertion function in this table */ +/* constraints; + - all non YUV modes must convert at least to and from PIX_FMT_RGB24 +*/ +static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = { + [PIX_FMT_YUV420P] = { + [PIX_FMT_RGB555] = { + .convert = yuv420p_to_rgb555 + }, + [PIX_FMT_RGB565] = { + .convert = yuv420p_to_rgb565 + }, + [PIX_FMT_BGR24] = { + .convert = yuv420p_to_bgr24 + }, + [PIX_FMT_RGB24] = { + .convert = yuv420p_to_rgb24 + }, + [PIX_FMT_RGBA32] = { + .convert = yuv420p_to_rgba32 + }, + }, + [PIX_FMT_YUV422P] = { + [PIX_FMT_RGB555] = { + .convert = yuv422p_to_rgb555 + }, + [PIX_FMT_RGB565] = { + .convert = yuv422p_to_rgb565 + }, + [PIX_FMT_BGR24] = { + .convert = yuv422p_to_bgr24 + }, + [PIX_FMT_RGB24] = { + .convert = yuv422p_to_rgb24 + }, + [PIX_FMT_RGBA32] = { + .convert = yuv422p_to_rgba32 + }, + }, + [PIX_FMT_YUV422] = { + [PIX_FMT_YUV420P] = { + .convert = 
yuv422_to_yuv420p, + }, + }, + + [PIX_FMT_RGB24] = { + [PIX_FMT_YUV420P] = { + .convert = rgb24_to_yuv420p + }, + [PIX_FMT_RGB565] = { + .convert = rgb24_to_rgb565 + }, + [PIX_FMT_RGB555] = { + .convert = rgb24_to_rgb555 + }, + [PIX_FMT_GRAY8] = { + .convert = rgb24_to_gray + }, + }, + [PIX_FMT_RGBA32] = { + [PIX_FMT_YUV420P] = { + .convert = rgba32_to_yuv420p + }, + [PIX_FMT_GRAY8] = { + .convert = rgba32_to_gray + }, + }, + [PIX_FMT_BGR24] = { + [PIX_FMT_YUV420P] = { + .convert = bgr24_to_yuv420p + }, + [PIX_FMT_GRAY8] = { + .convert = bgr24_to_gray + }, + }, + [PIX_FMT_RGB555] = { + [PIX_FMT_YUV420P] = { + .convert = rgb555_to_yuv420p + }, + [PIX_FMT_GRAY8] = { + .convert = rgb555_to_gray + }, + }, + [PIX_FMT_RGB565] = { + [PIX_FMT_YUV420P] = { + .convert = rgb565_to_yuv420p + }, + [PIX_FMT_GRAY8] = { + .convert = rgb565_to_gray + }, + }, + [PIX_FMT_GRAY8] = { + [PIX_FMT_RGB555] = { + .convert = gray_to_rgb555 + }, + [PIX_FMT_RGB565] = { + .convert = gray_to_rgb565 + }, + [PIX_FMT_RGB24] = { + .convert = gray_to_rgb24 + }, + [PIX_FMT_BGR24] = { + .convert = gray_to_bgr24 + }, + [PIX_FMT_RGBA32] = { + .convert = gray_to_rgba32 + }, + [PIX_FMT_MONOWHITE] = { + .convert = gray_to_monowhite + }, + [PIX_FMT_MONOBLACK] = { + .convert = gray_to_monoblack + }, + }, + [PIX_FMT_MONOWHITE] = { + [PIX_FMT_GRAY8] = { + .convert = monowhite_to_gray + }, + }, + [PIX_FMT_MONOBLACK] = { + [PIX_FMT_GRAY8] = { + .convert = monoblack_to_gray + }, + }, +}; + +static int avpicture_alloc(AVPicture *picture, + int pix_fmt, int width, int height) +{ + int size; + void *ptr; + + size = avpicture_get_size(pix_fmt, width, height); + if (size < 0) + goto fail; + ptr = av_malloc(size); + if (!ptr) + goto fail; + avpicture_fill(picture, ptr, pix_fmt, width, height); + return 0; + fail: + memset(picture, 0, sizeof(AVPicture)); + return -1; +} + +static void avpicture_free(AVPicture *picture) +{ + av_free(picture->data[0]); +} + /* XXX: always use linesize. 
Return -1 if not supported */ int img_convert(AVPicture *dst, int dst_pix_fmt, - AVPicture *src, int pix_fmt, - int width, int height) + AVPicture *src, int src_pix_fmt, + int src_width, int src_height) { - int i; + int i, ret, dst_width, dst_height, int_pix_fmt; + PixFmtInfo *src_pix, *dst_pix; + ConvertEntry *ce; + AVPicture tmp1, *tmp = &tmp1; - if (dst_pix_fmt == pix_fmt) { - switch(pix_fmt) { - case PIX_FMT_YUV420P: - for(i=0;i<3;i++) { - if (i == 1) { - width >>= 1; - height >>= 1; - } - img_copy(dst->data[i], dst->linesize[i], - src->data[i], src->linesize[i], - width, height); + if (src_pix_fmt < 0 || src_pix_fmt >= PIX_FMT_NB || + dst_pix_fmt < 0 || dst_pix_fmt >= PIX_FMT_NB) + return -1; + if (src_width <= 0 || src_height <= 0) + return 0; + + dst_width = src_width; + dst_height = src_height; + + dst_pix = &pix_fmt_info[dst_pix_fmt]; + src_pix = &pix_fmt_info[src_pix_fmt]; + if (src_pix_fmt == dst_pix_fmt) { + /* XXX: incorrect */ + /* same format: just copy */ + for(i = 0; i < dst_pix->nb_components; i++) { + int w, h; + w = dst_width; + h = dst_height; + if (dst_pix->is_yuv && (i == 1 || i == 2)) { + w >>= dst_pix->x_chroma_shift; + h >>= dst_pix->y_chroma_shift; } - break; - default: - return -1; + img_copy(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + w, h); } - } else if (dst_pix_fmt == PIX_FMT_YUV420P) { - - switch(pix_fmt) { - case PIX_FMT_YUV411P: - img_copy(dst->data[0], dst->linesize[0], - src->data[0], src->linesize[0], - width, height); - conv411(dst->data[1], dst->linesize[1], - src->data[1], src->linesize[1], - width / 4, height); - conv411(dst->data[2], dst->linesize[2], - src->data[2], src->linesize[2], - width / 4, height); - break; - case PIX_FMT_YUV410P: - img_copy(dst->data[0], dst->linesize[0], - src->data[0], src->linesize[0], - width, height); - grow22(dst->data[1], dst->linesize[1], - src->data[1], src->linesize[1], - width/2, height/2); - grow22(dst->data[2], dst->linesize[2], - src->data[2], src->linesize[2], - width/2, height/2); - break; - case PIX_FMT_YUV420P: - for(i=0;i<3;i++) { - img_copy(dst->data[i], dst->linesize[i], - src->data[i], src->linesize[i], - width, height); - } - break; - case PIX_FMT_YUV422P: - img_copy(dst->data[0], dst->linesize[0], - src->data[0], src->linesize[0], - width, height); - width >>= 1; - height >>= 1; - for(i=1;i<3;i++) { - shrink2(dst->data[i], dst->linesize[i], - src->data[i], src->linesize[i], - width, height); - } - break; - case PIX_FMT_YUV444P: - img_copy(dst->data[0], dst->linesize[0], - src->data[0], src->linesize[0], - width, height); - width >>= 1; - height >>= 1; - for(i=1;i<3;i++) { - shrink22(dst->data[i], dst->linesize[i], - src->data[i], src->linesize[i], - width, height); + return 0; + } + + ce = &convert_table[src_pix_fmt][dst_pix_fmt]; + if (ce->convert) { + /* specific convertion routine */ + ce->convert(dst, src, dst_width, dst_height); + return 0; + } + + /* gray to YUV */ + if (dst_pix->is_yuv && src_pix_fmt == PIX_FMT_GRAY8) { + int w, h, y; + uint8_t *d; + + img_copy(dst->data[0], dst->linesize[0], + src->data[0], src->linesize[0], + dst_width, dst_height); + /* fill U and V with 128 */ + w = dst_width; + h = dst_height; + w >>= dst_pix->x_chroma_shift; + h >>= dst_pix->y_chroma_shift; + for(i = 1; i <= 2; i++) { + d = dst->data[i]; + for(y = 0; y< h; y++) { + memset(d, 128, w); + d += dst->linesize[i]; } - break; - case PIX_FMT_YUV422: - yuv422_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_RGB24: - 
rgb24_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_RGBA32: - rgba32_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_BGR24: - bgr24_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_BGRA32: - bgra32_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_RGB565: - rgb565_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_RGB555: - rgb555_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; -/* case PIX_FMT_RGB5551: - rgb5551_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break;*/ - case PIX_FMT_BGR565: - bgr565_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_BGR555: - bgr555_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; -/* case PIX_FMT_GBR565: - gbr565_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break; - case PIX_FMT_GBR555: - gbr555_to_yuv420p(dst->data[0], dst->data[1], dst->data[2], - src->data[0], width, height); - break;*/ - default: - return -1; - } - } else if (dst_pix_fmt == PIX_FMT_RGB24) { - switch(pix_fmt) { - case PIX_FMT_YUV420P: - yuv420p_to_rgb24(dst, src, width, height); - break; - case PIX_FMT_YUV422P: - yuv422p_to_rgb24(dst, src, width, height); - break; - default: - return -1; - } - } else if (dst_pix_fmt == PIX_FMT_RGBA32) { - switch(pix_fmt) { - case PIX_FMT_YUV420P: - yuv420p_to_rgba32(dst, src, width, height); - break; - default: - return -1; } - } else if (dst_pix_fmt == PIX_FMT_BGRA32) { - switch(pix_fmt) { - case PIX_FMT_YUV420P: - yuv420p_to_bgra32(dst, src, width, height); - break; - default: + return 0; + } + + /* YUV to gray */ + if (src_pix->is_yuv && dst_pix_fmt == PIX_FMT_GRAY8) { + img_copy(dst->data[0], dst->linesize[0], + src->data[0], src->linesize[0], + dst_width, dst_height); + return 0; + } + + /* YUV to YUV */ + if (dst_pix->is_yuv && src_pix->is_yuv) { + int x_shift, y_shift, w, h; + void (*resize_func)(UINT8 *dst, int dst_wrap, + UINT8 *src, int src_wrap, + int width, int height); + + /* compute chroma size of the smallest dimensions */ + w = dst_width; + h = dst_height; + if (dst_pix->x_chroma_shift >= src_pix->x_chroma_shift) + w >>= dst_pix->x_chroma_shift; + else + w >>= src_pix->x_chroma_shift; + if (dst_pix->y_chroma_shift >= src_pix->y_chroma_shift) + h >>= dst_pix->y_chroma_shift; + else + h >>= src_pix->y_chroma_shift; + + x_shift = (dst_pix->x_chroma_shift - src_pix->x_chroma_shift); + y_shift = (dst_pix->y_chroma_shift - src_pix->y_chroma_shift); + if (x_shift == 0 && y_shift == 0) { + resize_func = img_copy; /* should never happen */ + } else if (x_shift == 0 && y_shift == 1) { + resize_func = shrink2; + } else if (x_shift == 1 && y_shift == 1) { + resize_func = shrink22; + } else if (x_shift == -1 && y_shift == -1) { + resize_func = grow22; + } else if (x_shift == -1 && y_shift == 1) { + resize_func = conv411; + } else { + /* currently not handled */ return -1; } + + img_copy(dst->data[0], dst->linesize[0], + src->data[0], src->linesize[0], + dst_width, dst_height); + + for(i = 1;i <= 2; i++) + resize_func(dst->data[i], dst->linesize[i], + src->data[i], src->linesize[i], + w, h); + return 0; + } + + /* try to use an 
intermediate format */ + if (src_pix_fmt == PIX_FMT_MONOWHITE || + src_pix_fmt == PIX_FMT_MONOBLACK || + dst_pix_fmt == PIX_FMT_MONOWHITE || + dst_pix_fmt == PIX_FMT_MONOBLACK) { + int_pix_fmt = PIX_FMT_GRAY8; } else { - return -1; + int_pix_fmt = PIX_FMT_RGB24; } - return 0; + if (avpicture_alloc(tmp, int_pix_fmt, dst_width, dst_height) < 0) + return -1; + ret = -1; + if (img_convert(tmp, int_pix_fmt, + src, src_pix_fmt, src_width, src_height) < 0) + goto fail1; + if (img_convert(dst, dst_pix_fmt, + tmp, int_pix_fmt, dst_width, dst_height) < 0) + goto fail1; + ret = 0; + fail1: + avpicture_free(tmp); + return ret; } @@ -948,6 +1294,15 @@ static void deinterlace_line(UINT8 *dst, UINT8 *lum_m4, UINT8 *lum_m3, UINT8 *lu } #else + { + mmx_t rounder; + rounder.uw[0]=4; + rounder.uw[1]=4; + rounder.uw[2]=4; + rounder.uw[3]=4; + pxor_r2r(mm7,mm7); + movq_m2r(rounder,mm6); + } for (;size > 3; size-=4) { DEINT_LINE_LUM lum_m4+=4; @@ -982,6 +1337,15 @@ static void deinterlace_line_inplace(UINT8 *lum_m4, UINT8 *lum_m3, UINT8 *lum_m2 } #else + { + mmx_t rounder; + rounder.uw[0]=4; + rounder.uw[1]=4; + rounder.uw[2]=4; + rounder.uw[3]=4; + pxor_r2r(mm7,mm7); + movq_m2r(rounder,mm6); + } for (;size > 3; size-=4) { DEINT_INPLACE_LINE_LUM lum_m4+=4; @@ -1064,19 +1428,6 @@ int avpicture_deinterlace(AVPicture *dst, AVPicture *src, if ((width & 3) != 0 || (height & 3) != 0) return -1; -#ifdef HAVE_MMX - { - mmx_t rounder; - rounder.uw[0]=4; - rounder.uw[1]=4; - rounder.uw[2]=4; - rounder.uw[3]=4; - pxor_r2r(mm7,mm7); - movq_m2r(rounder,mm6); - } -#endif - - for(i=0;i<3;i++) { if (i == 1) { switch(pix_fmt) { diff --git a/src/libffmpeg/libavcodec/mem.c b/src/libffmpeg/libavcodec/mem.c index a9b5e0afa..a36952fd7 100644 --- a/src/libffmpeg/libavcodec/mem.c +++ b/src/libffmpeg/libavcodec/mem.c @@ -17,6 +17,12 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "avcodec.h" + +/* here we can use OS dependant allocation functions */ +#undef malloc +#undef free +#undef realloc + #ifdef HAVE_MALLOC_H #include <malloc.h> #endif @@ -25,10 +31,15 @@ memory allocator. You do not need to suppress this file because the linker will do it automatically */ -/* memory alloc */ +/** + * Memory allocation of size byte with alignment suitable for all + * memory accesses (including vectors if available on the + * CPU). av_malloc(0) must return a non NULL pointer. + */ void *av_malloc(unsigned int size) { void *ptr; + #if defined (HAVE_MEMALIGN) ptr = memalign(16,size); /* Why 64? @@ -60,14 +71,19 @@ void *av_malloc(unsigned int size) #else ptr = malloc(size); #endif - if (!ptr) - return NULL; -//fprintf(stderr, "%X %d\n", (int)ptr, size); - /* NOTE: this memset should not be present */ - memset(ptr, 0, size); return ptr; } +/** + * av_realloc semantics (same as glibc): if ptr is NULL and size > 0, + * identical to malloc(size). If size is zero, it is identical to + * free(ptr) and NULL is returned. 
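Since av_realloc() above is documented to follow glibc realloc() semantics, a caller can fold allocation and growth into one code path. A minimal usage sketch under that assumption (the helper below is illustrative and not part of the patch; it only relies on the av_malloc/av_realloc entry points shown here):

    /* grow a byte buffer on demand; av_realloc(NULL, n) behaves like av_malloc(n) */
    static uint8_t *grow_buffer(uint8_t *buf, unsigned int *cur_size, unsigned int needed)
    {
        uint8_t *tmp;
        if (needed <= *cur_size)
            return buf;
        tmp = av_realloc(buf, needed);
        if (!tmp)
            return buf;          /* reallocation failed, the old buffer is still valid */
        *cur_size = needed;
        return tmp;
    }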
+ */ +void *av_realloc(void *ptr, unsigned int size) +{ + return realloc(ptr, size); +} + /* NOTE: ptr = NULL is explicetly allowed */ void av_free(void *ptr) { diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c index 9b4943582..9617816bb 100644 --- a/src/libffmpeg/libavcodec/mjpeg.c +++ b/src/libffmpeg/libavcodec/mjpeg.c @@ -735,7 +735,7 @@ static int mjpeg_decode_init(AVCodecContext *avctx) if (avctx->flags & CODEC_FLAG_EXTERN_HUFF) { printf("mjpeg: using external huffman table\n"); - init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size); + init_get_bits(&s->gb, avctx->extradata, avctx->extradata_size*8); mjpeg_decode_dht(s); /* should check for error - but dunno */ } @@ -1404,13 +1404,13 @@ static int mjpeg_decode_frame(AVCodecContext *avctx, break; } } - init_get_bits(&s->gb, s->buffer, dst - s->buffer); + init_get_bits(&s->gb, s->buffer, (dst - s->buffer)*8); dprintf("escaping removed %d bytes\n", (buf_end - buf_ptr) - (dst - s->buffer)); } else - init_get_bits(&s->gb, buf_ptr, buf_end - buf_ptr); + init_get_bits(&s->gb, buf_ptr, (buf_end - buf_ptr)*8); s->start_code = start_code; @@ -1548,7 +1548,7 @@ read_header: /* reset on every SOI */ s->restart_interval = 0; - init_get_bits(&hgb, buf_ptr, /*buf_size*/buf_end - buf_ptr); + init_get_bits(&hgb, buf_ptr, /*buf_size*/(buf_end - buf_ptr)*8); skip_bits(&hgb, 32); /* reserved zeros */ @@ -1570,7 +1570,7 @@ read_header: dprintf("dqt offs: 0x%x\n", dqt_offs); if (dqt_offs) { - init_get_bits(&s->gb, buf+dqt_offs, buf_end - (buf+dqt_offs)); + init_get_bits(&s->gb, buf+dqt_offs, (buf_end - (buf+dqt_offs))*8); s->start_code = DQT; mjpeg_decode_dqt(s); } @@ -1579,7 +1579,7 @@ read_header: dprintf("dht offs: 0x%x\n", dht_offs); if (dht_offs) { - init_get_bits(&s->gb, buf+dht_offs, buf_end - (buf+dht_offs)); + init_get_bits(&s->gb, buf+dht_offs, (buf_end - (buf+dht_offs))*8); s->start_code = DHT; mjpeg_decode_dht(s); } @@ -1588,7 +1588,7 @@ read_header: dprintf("sof offs: 0x%x\n", sof_offs); if (sof_offs) { - init_get_bits(&s->gb, buf+sof_offs, buf_end - (buf+sof_offs)); + init_get_bits(&s->gb, buf+sof_offs, (buf_end - (buf+sof_offs))*8); s->start_code = SOF0; if (mjpeg_decode_sof0(s) < 0) return -1; @@ -1598,8 +1598,8 @@ read_header: dprintf("sos offs: 0x%x\n", sos_offs); if (sos_offs) { -// init_get_bits(&s->gb, buf+sos_offs, buf_end - (buf+sos_offs)); - init_get_bits(&s->gb, buf+sos_offs, field_size); +// init_get_bits(&s->gb, buf+sos_offs, (buf_end - (buf+sos_offs))*8); + init_get_bits(&s->gb, buf+sos_offs, field_size*8); s->start_code = SOS; mjpeg_decode_sos(s); } diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c index 8310db8d5..e4b67b22f 100644 --- a/src/libffmpeg/libavcodec/motion_est.c +++ b/src/libffmpeg/libavcodec/motion_est.c @@ -1,7 +1,7 @@ /* * Motion estimation * Copyright (c) 2000,2001 Fabrice Bellard. 
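The repeated "*8" edits in the mjpeg.c hunks above (and again in the mpeg12 and mpegaudio hunks further down) reflect a change of calling convention: init_get_bits() now takes the buffer length in bits rather than bytes. A hedged sketch of the adjusted call, with everything except the GetBitContext argument treated as illustrative:

    /* old convention (bytes): init_get_bits(&gb, buf, buf_size);     */
    /* new convention (bits):  init_get_bits(&gb, buf, buf_size * 8); */
    static void bitstream_start(GetBitContext *gb, const uint8_t *buf, int buf_size_in_bytes)
    {
        init_get_bits(gb, buf, buf_size_in_bytes * 8);  /* length is now expressed in bits */
    }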
- * Copyright (c) 2002 Michael Niedermayer + * Copyright (c) 2002-2003 Michael Niedermayer * * * This library is free software; you can redistribute it and/or @@ -286,6 +286,14 @@ static void set_cmp(MpegEncContext *s, me_cmp_func *cmp, int type){ cmp[0]= c->quant_psnr[0]; cmp[1]= c->quant_psnr[1]; break; + case FF_CMP_BIT: + cmp[0]= c->bit[0]; + cmp[1]= c->bit[1]; + break; + case FF_CMP_RD: + cmp[0]= c->rd[0]; + cmp[1]= c->rd[1]; + break; case FF_CMP_ZERO: for(i=0; i<7; i++){ cmp[i]= zero_cmp; @@ -294,19 +302,24 @@ static void set_cmp(MpegEncContext *s, me_cmp_func *cmp, int type){ default: fprintf(stderr,"internal error in cmp function selection\n"); } -}; +} static inline int get_penalty_factor(MpegEncContext *s, int type){ - - switch(type){ + switch(type&0xFF){ default: case FF_CMP_SAD: - return s->qscale; - case FF_CMP_SSE: -// return s->qscale*8; + return s->qscale*2; case FF_CMP_DCT: + return s->qscale*3; case FF_CMP_SATD: - return s->qscale*8; + return s->qscale*6; + case FF_CMP_SSE: + return s->qscale*s->qscale*2; + case FF_CMP_BIT: + return 1; + case FF_CMP_RD: + case FF_CMP_PSNR: + return (s->qscale*s->qscale*185 + 64)>>7; } } @@ -324,7 +337,9 @@ void ff_init_me(MpegEncContext *s){ }else{ if(s->avctx->me_sub_cmp&FF_CMP_CHROMA) s->me.sub_motion_search= simple_chroma_hpel_motion_search; - else if(s->avctx->me_sub_cmp == FF_CMP_SAD && s->avctx->me_cmp == FF_CMP_SAD) + else if( s->avctx->me_sub_cmp == FF_CMP_SAD + && s->avctx-> me_cmp == FF_CMP_SAD + && s->avctx-> mb_cmp == FF_CMP_SAD) s->me.sub_motion_search= sad_hpel_motion_search; else s->me.sub_motion_search= simple_hpel_motion_search; @@ -343,6 +358,18 @@ void ff_init_me(MpegEncContext *s){ }else{ s->me.pre_motion_search= simple_epzs_motion_search; } + + if(s->flags&CODEC_FLAG_QPEL){ + if(s->avctx->mb_cmp&FF_CMP_CHROMA) + s->me.get_mb_score= simple_chroma_qpel_get_mb_score; + else + s->me.get_mb_score= simple_qpel_get_mb_score; + }else{ + if(s->avctx->mb_cmp&FF_CMP_CHROMA) + s->me.get_mb_score= simple_chroma_hpel_get_mb_score; + else + s->me.get_mb_score= simple_hpel_get_mb_score; + } } static int pix_dev(UINT8 * pix, int line_size, int mean) @@ -776,12 +803,11 @@ static inline void get_limits(MpegEncContext *s, int *range, int *xmin, int *ymi } } -static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift) +static inline int h263_mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, int ymax, int mx, int my, int shift) { int block; int P[10][2]; - uint8_t *ref_picture= s->last_picture.data[0]; - int dmin_sum=0; + int dmin_sum=0, mx4_sum=0, my4_sum=0; uint16_t * const mv_penalty= s->me.mv_penalty[s->f_code] + MAX_MV; for(block=0; block<4; block++){ @@ -826,13 +852,15 @@ static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, in P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); - if(s->out_format == FMT_H263){ +// if(s->out_format == FMT_H263){ pred_x4 = P_MEDIAN[0]; pred_y4 = P_MEDIAN[1]; +#if 0 }else { /* mpeg1 at least */ pred_x4= P_LEFT[0]; pred_y4= P_LEFT[1]; } +#endif } P_MV1[0]= mx; P_MV1[1]= my; @@ -842,12 +870,80 @@ static inline int mv4_search(MpegEncContext *s, int xmin, int ymin, int xmax, in dmin4= s->me.sub_motion_search(s, &mx4, &my4, dmin4, rel_xmin4, rel_ymin4, rel_xmax4, rel_ymax4, pred_x4, pred_y4, &s->last_picture, block, 1, mv_penalty); - + + if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ + int dxy; + const int offset= ((block&1) + (block>>1)*s->linesize)*8; + 
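The reworked get_penalty_factor() scales the motion-vector coding cost to whichever comparison metric is active (roughly 2*qscale for SAD up to qscale-squared terms for SSE and RD), so distortion and vector cost stay in comparable units. The vector cost itself is applied the same way throughout this file; a reduced sketch of that pattern, using the mv_penalty table and penalty factor fields that appear in the patch (the helper name is illustrative):

    /* cost of coding (mx,my) relative to the predictor, weighted by the active metric;
       mv_penalty is assumed to point at the centre of the table, as in the patch (... + MAX_MV) */
    static inline int mv_cost(const uint16_t *mv_penalty, int penalty_factor,
                              int mx, int my, int pred_x, int pred_y)
    {
        return (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y]) * penalty_factor;
    }

    /* e.g. dmin_sum += mv_cost(mv_penalty, s->me.mb_penalty_factor, mx4, my4, pred_x4, pred_y4); */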
uint8_t *dest_y = s->me.scratchpad + offset; + + if(s->quarter_sample){ + uint8_t *ref= s->last_picture.data[0] + (s->mb_x*16 + (mx4>>2)) + (s->mb_y*16 + (my4>>2))*s->linesize + offset; + dxy = ((my4 & 3) << 2) | (mx4 & 3); + + if(s->no_rounding) + s->dsp.put_no_rnd_qpel_pixels_tab[1][dxy](dest_y , ref , s->linesize); + else + s->dsp.put_qpel_pixels_tab [1][dxy](dest_y , ref , s->linesize); + }else{ + uint8_t *ref= s->last_picture.data[0] + (s->mb_x*16 + (mx4>>1)) + (s->mb_y*16 + (my4>>1))*s->linesize + offset; + dxy = ((my4 & 1) << 1) | (mx4 & 1); + + if(s->no_rounding) + s->dsp.put_no_rnd_pixels_tab[1][dxy](dest_y , ref , s->linesize, 8); + else + s->dsp.put_pixels_tab [1][dxy](dest_y , ref , s->linesize, 8); + } + dmin_sum+= (mv_penalty[mx4-pred_x4] + mv_penalty[my4-pred_y4])*s->me.mb_penalty_factor; + }else + dmin_sum+= dmin4; + + if(s->quarter_sample){ + mx4_sum+= mx4/2; + my4_sum+= my4/2; + }else{ + mx4_sum+= mx4; + my4_sum+= my4; + } + s->motion_val[ s->block_index[block] ][0]= mx4; s->motion_val[ s->block_index[block] ][1]= my4; - dmin_sum+= dmin4; } - return dmin_sum; + + if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ + dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.data[0] + s->mb_x*16 + s->mb_y*16*s->linesize, s->me.scratchpad, s->linesize); + } + + if(s->avctx->mb_cmp&FF_CMP_CHROMA){ + int dxy; + int mx, my; + int offset; + + mx= ff_h263_round_chroma(mx4_sum); + my= ff_h263_round_chroma(my4_sum); + dxy = ((my & 1) << 1) | (mx & 1); + + offset= (s->mb_x*8 + (mx>>1)) + (s->mb_y*8 + (my>>1))*s->uvlinesize; + + if(s->no_rounding){ + s->dsp.put_no_rnd_pixels_tab[1][dxy](s->me.scratchpad , s->last_picture.data[1] + offset, s->uvlinesize, 8); + s->dsp.put_no_rnd_pixels_tab[1][dxy](s->me.scratchpad+8 , s->last_picture.data[2] + offset, s->uvlinesize, 8); + }else{ + s->dsp.put_pixels_tab [1][dxy](s->me.scratchpad , s->last_picture.data[1] + offset, s->uvlinesize, 8); + s->dsp.put_pixels_tab [1][dxy](s->me.scratchpad+8 , s->last_picture.data[2] + offset, s->uvlinesize, 8); + } + + dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad , s->uvlinesize); + dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, s->me.scratchpad+8, s->uvlinesize); + } + + switch(s->avctx->mb_cmp&0xFF){ + /*case FF_CMP_SSE: + return dmin_sum+ 32*s->qscale*s->qscale;*/ + case FF_CMP_RD: + return dmin_sum; + default: + return dmin_sum+ 11*s->me.mb_penalty_factor; + } } void ff_estimate_p_frame_motion(MpegEncContext * s, @@ -869,6 +965,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, s->me.penalty_factor = get_penalty_factor(s, s->avctx->me_cmp); s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp); + s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp); get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code); rel_xmin= xmin - mb_x*16; @@ -959,6 +1056,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, pic->mb_var [s->mb_width * mb_y + mb_x] = varc; pic->mc_mb_var[s->mb_width * mb_y + mb_x] = vard; pic->mb_mean [s->mb_width * mb_y + mb_x] = (sum+128)>>8; +// pic->mb_cmp_score[s->mb_width * mb_y + mb_x] = dmin; pic->mb_var_sum += varc; pic->mc_mb_var_sum += vard; //printf("E%d %d %d %X %X %X\n", s->mb_width, mb_x, mb_y,(int)s, (int)s->mb_var, (int)s->mc_mb_var); fflush(stdout); @@ -985,44 +1083,36 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, } if((s->flags&CODEC_FLAG_4MV) && !s->me.skip && varc>50 && vard>10){ - mv4_search(s, rel_xmin, rel_ymin, rel_xmax, 
rel_ymax, mx, my, shift); + h263_mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift); mb_type|=MB_TYPE_INTER4V; set_p_mv_tables(s, mx, my, 0); }else set_p_mv_tables(s, mx, my, 1); }else{ - if (vard <= 64 || vard < varc) { -// if (sadP <= 32 || sadP < sadI + 500) { - s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc); - mb_type|= MB_TYPE_INTER; - if (s->me_method != ME_ZERO) { - dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, - pred_x, pred_y, &s->last_picture, 0, 0, mv_penalty); - if((s->flags&CODEC_FLAG_4MV) - && !s->me.skip && varc>50 && vard>10){ - int dmin4= mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift); - if(dmin4 + 128 <dmin) - mb_type= MB_TYPE_INTER4V; - } - set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V); + mb_type= MB_TYPE_INTER; - } else { - mx <<=shift; - my <<=shift; - } -#if 0 - if (vard < 10) { - skip++; - fprintf(stderr,"\nEarly skip: %d vard: %2d varc: %5d dmin: %d", - skip, vard, varc, dmin); + dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, + pred_x, pred_y, &s->last_picture, 0, 0, mv_penalty); + + if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) + dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, &s->last_picture, mv_penalty); + + if((s->flags&CODEC_FLAG_4MV) + && !s->me.skip && varc>50 && vard>10){ + int dmin4= h263_mv4_search(s, rel_xmin, rel_ymin, rel_xmax, rel_ymax, mx, my, shift); + if(dmin4 < dmin){ + mb_type= MB_TYPE_INTER4V; + dmin=dmin4; } -#endif + } + pic->mb_cmp_score[s->mb_width * mb_y + mb_x] = dmin; + set_p_mv_tables(s, mx, my, mb_type!=MB_TYPE_INTER4V); + + if (vard <= 64 || vard < varc) { + s->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc); }else{ - s->scene_change_score+= 20; - mb_type|= MB_TYPE_INTRA; - mx = 0; - my = 0; + s->scene_change_score+= s->qscale; } } @@ -1105,6 +1195,7 @@ int ff_estimate_motion_b(MpegEncContext * s, s->me.penalty_factor = get_penalty_factor(s, s->avctx->me_cmp); s->me.sub_penalty_factor= get_penalty_factor(s, s->avctx->me_sub_cmp); + s->me.mb_penalty_factor = get_penalty_factor(s, s->avctx->mb_cmp); get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, f_code); rel_xmin= xmin - mb_x*16; @@ -1174,6 +1265,10 @@ int ff_estimate_motion_b(MpegEncContext * s, dmin= s->me.sub_motion_search(s, &mx, &my, dmin, rel_xmin, rel_ymin, rel_xmax, rel_ymax, pred_x, pred_y, picture, 0, 0, mv_penalty); + + if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) + dmin= s->me.get_mb_score(s, mx, my, pred_x, pred_y, picture, mv_penalty); + //printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my); // s->mb_type[mb_y*s->mb_width + mb_x]= mb_type; mv_table[mot_xy][0]= mx; @@ -1237,10 +1332,14 @@ static inline int check_bidir_mv(MpegEncContext * s, s->dsp.avg_pixels_tab[0][dxy](dest_y , ptr , s->linesize, 16); } - fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->me.sub_penalty_factor - +(mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->me.sub_penalty_factor; - + s->dsp.me_sub_cmp[0](s, s->new_picture.data[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize); - + fbmin = (mv_penalty[motion_fx-pred_fx] + mv_penalty[motion_fy-pred_fy])*s->me.mb_penalty_factor + +(mv_penalty[motion_bx-pred_bx] + mv_penalty[motion_by-pred_by])*s->me.mb_penalty_factor + + s->dsp.mb_cmp[0](s, s->new_picture.data[0] + mb_x*16 + mb_y*16*s->linesize, dest_y, s->linesize); + + if(s->avctx->mb_cmp&FF_CMP_CHROMA){ + } + //FIXME CHROMA !!! 
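check_bidir_mv() above builds the bidirectional candidate by averaging the forward and backward predictions (via avg_pixels_tab) and then charges the macroblock comparison score plus both vector costs, all weighted by mb_penalty_factor. A reduced stand-alone sketch of that scoring step, assuming a plain SAD as the comparison function; everything below is illustrative rather than the dsputil call the patch actually uses:

    /* score a 16x16 bidirectional prediction: SAD against the rounded average of fwd/bwd */
    static int bidir_sad(const uint8_t *src, const uint8_t *fwd, const uint8_t *bwd,
                         int stride, int mv_bits_cost)
    {
        int x, y, score = mv_bits_cost;                  /* vector cost, already weighted */
        for (y = 0; y < 16; y++) {
            for (x = 0; x < 16; x++) {
                int pred = (fwd[x] + bwd[x] + 1) >> 1;   /* rounded average, like avg_pixels */
                int d    = src[x] - pred;
                score   += d < 0 ? -d : d;
            }
            src += stride; fwd += stride; bwd += stride;
        }
        return score;
    }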
+ return fbmin; } @@ -1344,17 +1443,24 @@ static inline int direct_search(MpegEncContext * s, P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]); P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]); } - + + //FIXME direct_search ptr in context!!! (needed for chroma anyway or this will get messy) if(s->flags&CODEC_FLAG_QPEL){ dmin = simple_direct_qpel_epzs_motion_search(s, 0, &mx, &my, P, 0, 0, xmin, ymin, xmax, ymax, &s->last_picture, mv_table, 1<<14, mv_penalty); dmin = simple_direct_qpel_qpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, 0, 0, &s->last_picture, 0, 0, mv_penalty); + + if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) + dmin= simple_direct_qpel_qpel_get_mb_score(s, mx, my, 0, 0, &s->last_picture, mv_penalty); }else{ dmin = simple_direct_hpel_epzs_motion_search(s, 0, &mx, &my, P, 0, 0, xmin, ymin, xmax, ymax, &s->last_picture, mv_table, 1<<15, mv_penalty); dmin = simple_direct_hpel_hpel_motion_search(s, &mx, &my, dmin, xmin, ymin, xmax, ymax, 0, 0, &s->last_picture, 0, 0, mv_penalty); + + if(s->avctx->me_sub_cmp != s->avctx->mb_cmp && !s->me.skip) + dmin= simple_direct_hpel_hpel_get_mb_score(s, mx, my, 0, 0, &s->last_picture, mv_penalty); } s->b_direct_mv_table[mot_xy][0]= mx; @@ -1365,18 +1471,18 @@ static inline int direct_search(MpegEncContext * s, void ff_estimate_b_frame_motion(MpegEncContext * s, int mb_x, int mb_y) { - const int penalty_factor= s->me.penalty_factor; + const int penalty_factor= s->me.mb_penalty_factor; int fmin, bmin, dmin, fbmin; int type=0; dmin= direct_search(s, mb_x, mb_y); - fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, &s->last_picture, s->f_code); - bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, &s->next_picture, s->b_code) - penalty_factor; + fmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_forw_mv_table, &s->last_picture, s->f_code) + 3*penalty_factor; + bmin= ff_estimate_motion_b(s, mb_x, mb_y, s->b_back_mv_table, &s->next_picture, s->b_code) + 2*penalty_factor; //printf(" %d %d ", s->b_forw_mv_table[xy][0], s->b_forw_mv_table[xy][1]); - fbmin= bidir_refine(s, mb_x, mb_y); - + fbmin= bidir_refine(s, mb_x, mb_y) + penalty_factor; +//printf("%d %d %d %d\n", dmin, fmin, bmin, fbmin); { int score= dmin; type=MB_TYPE_DIRECT; @@ -1393,9 +1499,10 @@ void ff_estimate_b_frame_motion(MpegEncContext * s, score=fbmin; type= MB_TYPE_BIDIR; } + score= ((unsigned)(score*score + 128*256))>>16; s->current_picture.mc_mb_var_sum += score; - s->current_picture.mc_mb_var[mb_y*s->mb_width + mb_x] = score; //FIXME use SSD + s->current_picture.mc_mb_var[mb_y*s->mb_width + mb_x] = score; //FIXME use SSE } if(s->flags&CODEC_FLAG_HQ){ diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c index d1ca6e7fb..4725ed994 100644 --- a/src/libffmpeg/libavcodec/motion_est_template.c +++ b/src/libffmpeg/libavcodec/motion_est_template.c @@ -39,7 +39,7 @@ qpel_mc_func (*qpel_put)[16];\ qpel_mc_func (*qpel_avg)[16]= &s->dsp.avg_qpel_pixels_tab[size];\ const __attribute__((unused)) int unu= time_pp + time_pb + (int)src_u + (int)src_v + (int)ref_u + (int)ref_v\ - + (int)ref2_y + (int)hpel_avg + (int)qpel_avg;\ + + (int)ref2_y + (int)hpel_avg + (int)qpel_avg + (int)score_map;\ if(s->no_rounding /*FIXME b_type*/){\ hpel_put= &s->dsp.put_no_rnd_pixels_tab[size];\ chroma_hpel_put= &s->dsp.put_no_rnd_pixels_tab[size+1];\ @@ -144,6 +144,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, const int my = *my_ptr; const int penalty_factor= s->me.sub_penalty_factor; 
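In ff_estimate_b_frame_motion() the winning mode's score is now squared and rescaled before being accumulated into mc_mb_var_sum (the FIXME in the same hunk notes that a true SSE would be the better measure). The mapping is small enough to show in isolation, with a worked value added for illustration:

    /* map a SAD-like mode score onto a compressed scale, as done for B-frame macroblocks */
    static inline unsigned squash_score(unsigned score)
    {
        return (score * score + 128 * 256) >> 16;
    }

    /* squash_score(1000) == (1000000 + 32768) >> 16 == 15 */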
me_cmp_func cmp_sub, chroma_cmp_sub; + int bx=2*mx, by=2*my; LOAD_COMMON(xx, yy); @@ -160,13 +161,12 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, if(s->avctx->me_cmp != s->avctx->me_sub_cmp){ CMP_HPEL(dmin, 0, 0, mx, my, size); - if(mx || my) + if(mx || my || size>0) dmin += (mv_penalty[2*mx - pred_x] + mv_penalty[2*my - pred_y])*penalty_factor; } if (mx > xmin && mx < xmax && my > ymin && my < ymax) { - int bx=2*mx, by=2*my; int d= dmin; const int index= (my<<ME_MAP_SHIFT) + mx; const int t= score_map[(index-(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] @@ -178,7 +178,7 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, const int b= score_map[(index+(1<<ME_MAP_SHIFT))&(ME_MAP_SIZE-1)] + (mv_penalty[bx - pred_x] + mv_penalty[by+2 - pred_y])*s->me.penalty_factor; -#if 0 +#if 1 int key; int map_generation= s->me.map_generation; uint32_t *map= s->me.map; @@ -231,20 +231,50 @@ static int RENAME(hpel_motion_search)(MpegEncContext * s, CHECK_HALF_MV(0, 1, mx , my) } assert(bx >= xmin*2 && bx <= xmax*2 && by >= ymin*2 && by <= ymax*2); - - *mx_ptr = bx; - *my_ptr = by; - }else{ - *mx_ptr =2*mx; - *my_ptr =2*my; } + *mx_ptr = bx; + *my_ptr = by; + return dmin; } #endif +static int RENAME(hpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, Picture *ref_picture, + uint16_t * const mv_penalty) +{ +// const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp; + const int size= 0; + const int xx = 16 * s->mb_x; + const int yy = 16 * s->mb_y; + const int penalty_factor= s->me.mb_penalty_factor; + const int xmin= -256*256, ymin= -256*256, xmax= 256*256, ymax= 256*256; //assume that the caller checked these + const __attribute__((unused)) int unu2= xmin + xmax +ymin + ymax; //no unused warning shit + me_cmp_func cmp_sub, chroma_cmp_sub; + int d; + + LOAD_COMMON(xx, yy); + + //FIXME factorize + + cmp_sub= s->dsp.mb_cmp[size]; + chroma_cmp_sub= s->dsp.mb_cmp[size+1]; + + assert(!s->me.skip); + assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp); + + CMP_HPEL(d, mx&1, my&1, mx>>1, my>>1, size); + //FIXME check cbp before adding penalty for (0,0) vector + if(mx || my || size>0) + d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor; + + return d; +} + #endif /* CMP_HPEL */ + + #ifdef CMP_QPEL #define CHECK_QUARTER_MV(dx, dy, x, y)\ @@ -290,7 +320,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s, if(s->avctx->me_cmp != s->avctx->me_sub_cmp){ CMP_QPEL(dmin, 0, 0, mx, my, size); - if(mx || my) + if(mx || my || size>0) dmin += (mv_penalty[4*mx - pred_x] + mv_penalty[4*my - pred_y])*penalty_factor; } @@ -477,6 +507,37 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s, return dmin; } +static int RENAME(qpel_get_mb_score)(MpegEncContext * s, int mx, int my, int pred_x, int pred_y, Picture *ref_picture, + uint16_t * const mv_penalty) +{ + const int size= 0; + const int xx = 16 * s->mb_x; + const int yy = 16 * s->mb_y; + const int penalty_factor= s->me.mb_penalty_factor; + const int xmin= -256*256, ymin= -256*256, xmax= 256*256, ymax= 256*256; //assume that the caller checked these + const __attribute__((unused)) int unu2= xmin + xmax +ymin + ymax; //no unused warning shit + me_cmp_func cmp_sub, chroma_cmp_sub; + int d; + + LOAD_COMMON(xx, yy); + + //FIXME factorize + + cmp_sub= s->dsp.mb_cmp[size]; + chroma_cmp_sub= s->dsp.mb_cmp[size+1]; + + assert(!s->me.skip); + assert(s->avctx->me_sub_cmp != s->avctx->mb_cmp); + + CMP_QPEL(d, mx&3, my&3, mx>>2, my>>2, size); + //FIXME check cbp before adding penalty for (0,0) vector + if(mx || my || 
size>0) + d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor; + + return d; +} + + #endif /* CMP_QPEL */ #define CHECK_MV(x,y)\ diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c index 10abf1024..fecb097bd 100644 --- a/src/libffmpeg/libavcodec/mpeg12.c +++ b/src/libffmpeg/libavcodec/mpeg12.c @@ -504,7 +504,7 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val) void ff_mpeg1_encode_init(MpegEncContext *s) { -#ifdef CONFIG_ENCODERS +#if 0 static int done=0; common_init(s); @@ -769,6 +769,8 @@ static int mpeg_decode_mb(MpegEncContext *s, dprintf("decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y); + assert(s->mb_skiped==0); + if (--s->mb_incr != 0) { /* skip mb */ s->mb_intra = 0; @@ -781,15 +783,18 @@ static int mpeg_decode_mb(MpegEncContext *s, s->mv[0][0][0] = s->mv[0][0][1] = 0; s->last_mv[0][0][0] = s->last_mv[0][0][1] = 0; s->last_mv[0][1][0] = s->last_mv[0][1][1] = 0; + s->mb_skiped = 1; } else { /* if B type, reuse previous vectors and directions */ s->mv[0][0][0] = s->last_mv[0][0][0]; s->mv[0][0][1] = s->last_mv[0][0][1]; s->mv[1][0][0] = s->last_mv[1][0][0]; s->mv[1][0][1] = s->last_mv[1][0][1]; + + if((s->mv[0][0][0]|s->mv[0][0][1]|s->mv[1][0][0]|s->mv[1][0][1])==0) + s->mb_skiped = 1; } - s->mb_skiped = 1; return 0; } @@ -1464,7 +1469,7 @@ static int mpeg1_decode_picture(AVCodecContext *avctx, MpegEncContext *s = &s1->mpeg_enc_ctx; int ref, f_code; - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); ref = get_bits(&s->gb, 10); /* temporal ref */ s->pict_type = get_bits(&s->gb, 3); @@ -1616,7 +1621,7 @@ static void mpeg_decode_extension(AVCodecContext *avctx, MpegEncContext *s = &s1->mpeg_enc_ctx; int ext_type; - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); ext_type = get_bits(&s->gb, 4); switch(ext_type) { @@ -1672,7 +1677,7 @@ static int mpeg_decode_slice(AVCodecContext *avctx, return DECODE_SLICE_FATAL_ERROR; if(s->avctx->debug&FF_DEBUG_PICT_INFO){ - printf("qp:%d fc:%d%d%d%d %s %s %s %s dc:%d pstruct:%d fdct:%d cmv:%d qtype:%d ivlc:%d rff:%d %s\n", + printf("qp:%d fc:%2d%2d%2d%2d %s %s %s %s dc:%d pstruct:%d fdct:%d cmv:%d qtype:%d ivlc:%d rff:%d %s\n", s->qscale, s->mpeg_f_code[0][0],s->mpeg_f_code[0][1],s->mpeg_f_code[1][0],s->mpeg_f_code[1][1], s->pict_type == I_TYPE ? "I" : (s->pict_type == P_TYPE ? "P" : (s->pict_type == B_TYPE ? "B" : "S")), s->progressive_sequence ? "pro" :"", s->alternate_scan ? "alt" :"", s->top_field_first ? 
"top" :"", @@ -1681,7 +1686,7 @@ static int mpeg_decode_slice(AVCodecContext *avctx, } } - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); s->qscale = get_qscale(s); /* extra slice info */ @@ -1790,7 +1795,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, int width, height, i, v, j; float aspect; - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); width = get_bits(&s->gb, 12); height = get_bits(&s->gb, 12); diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c index b2c0966aa..9a066c905 100644 --- a/src/libffmpeg/libavcodec/mpegaudiodec.c +++ b/src/libffmpeg/libavcodec/mpegaudiodec.c @@ -507,7 +507,7 @@ static int decode_init(AVCodecContext * avctx) return 0; } -/* tab[i][j] = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))) */; +/* tab[i][j] = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))) */ /* cos(i*pi/64) */ @@ -1460,7 +1460,7 @@ static void seek_to_maindata(MPADecodeContext *s, long backstep) memcpy(ptr, s->inbuf1[s->inbuf_index ^ 1] + BACKSTEP_SIZE + s->old_frame_size - backstep, backstep); /* init get bits again */ - init_get_bits(&s->gb, ptr, s->frame_size + backstep); + init_get_bits(&s->gb, ptr, (s->frame_size + backstep)*8); /* prepare next buffer */ s->inbuf_index ^= 1; @@ -2280,7 +2280,7 @@ static int mp_decode_frame(MPADecodeContext *s, short *samples_ptr; init_get_bits(&s->gb, s->inbuf + HEADER_SIZE, - s->inbuf_ptr - s->inbuf - HEADER_SIZE); + (s->inbuf_ptr - s->inbuf - HEADER_SIZE)*8); /* skip error protection field */ if (s->error_protection) diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c index 8206df470..d721647a5 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.c +++ b/src/libffmpeg/libavcodec/mpegvideo.c @@ -20,6 +20,7 @@ */ #include <ctype.h> +#include <limits.h> #include "avcodec.h" #include "dsputil.h" #include "mpegvideo.h" @@ -80,12 +81,15 @@ static const uint8_t simple_mmx_permutation[64]={ }; static const uint8_t h263_chroma_roundtab[16] = { +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, }; static UINT16 (*default_mv_penalty)[MAX_MV*2+1]=NULL; static UINT8 default_fcode_tab[MAX_MV*2+1]; +enum PixelFormat ff_yuv420p_list[2]= {PIX_FMT_YUV420P, -1}; + static void convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64], const UINT16 *quant_matrix, int bias, int qmin, int qmax) { @@ -230,6 +234,8 @@ int DCT_common_init(MpegEncContext *s) MPV_common_init_ppc(s); #endif + s->fast_dct_quantize= s->dct_quantize; + if(s->flags&CODEC_FLAG_TRELLIS_QUANT){ s->dct_quantize= dct_quantize_trellis_c; //move before MPV_common_init_* } @@ -286,7 +292,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){ r= s->avctx->get_buffer(s->avctx, (AVFrame*)pic); if(r<0 || !pic->age || !pic->type || !pic->data[0]){ - fprintf(stderr, "get_buffer() failed (%d %d %d %X)\n", r, pic->age, pic->type, (int)pic->data[0]); + fprintf(stderr, "get_buffer() failed (%d %d %d %p)\n", r, pic->age, pic->type, pic->data[0]); return -1; } @@ -309,6 +315,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){ CHECKED_ALLOCZ(pic->mb_var , s->mb_num * sizeof(INT16)) CHECKED_ALLOCZ(pic->mc_mb_var, s->mb_num * sizeof(INT16)) CHECKED_ALLOCZ(pic->mb_mean , s->mb_num * sizeof(INT8)) + CHECKED_ALLOCZ(pic->mb_cmp_score, s->mb_num * sizeof(int32_t)) } CHECKED_ALLOCZ(pic->mbskip_table , s->mb_num * sizeof(UINT8)+1) //the +1 is for the slice end check 
@@ -316,6 +323,12 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){ pic->qstride= s->mb_width; } + //it might be nicer if the application would keep track of these but it would require a API change + memmove(s->prev_pict_types+1, s->prev_pict_types, PREV_PICT_TYPES_BUFFER_SIZE-1); + s->prev_pict_types[0]= s->pict_type; + if(pic->age < PREV_PICT_TYPES_BUFFER_SIZE && s->prev_pict_types[pic->age] == B_TYPE) + pic->age= INT_MAX; // skiped MBs in b frames are quite rare in mpeg1/2 and its a bit tricky to skip them anyway + return 0; fail: //for the CHECKED_ALLOCZ macro return -1; @@ -334,6 +347,7 @@ static void free_picture(MpegEncContext *s, Picture *pic){ av_freep(&pic->mb_var); av_freep(&pic->mc_mb_var); av_freep(&pic->mb_mean); + av_freep(&pic->mb_cmp_score); av_freep(&pic->mbskip_table); av_freep(&pic->qscale_table); @@ -472,6 +486,7 @@ int MPV_common_init(MpegEncContext *s) /* init macroblock skip table */ CHECKED_ALLOCZ(s->mbskip_table, s->mb_num+1); //Note the +1 is for a quicker mpeg4 slice_end detection + CHECKED_ALLOCZ(s->prev_pict_types, PREV_PICT_TYPES_BUFFER_SIZE); s->block= s->blocks[0]; @@ -511,6 +526,7 @@ void MPV_common_end(MpegEncContext *s) av_freep(&s->me.score_map); av_freep(&s->mbskip_table); + av_freep(&s->prev_pict_types); av_freep(&s->bitstream_buffer); av_freep(&s->tex_pb_buffer); av_freep(&s->pb2_buffer); @@ -609,6 +625,7 @@ int MPV_encode_init(AVCodecContext *avctx) avctx->delay=0; s->low_delay=1; break; +#ifdef CONFIG_RISKY case CODEC_ID_H263: if (h263_get_picture_format(s->width, s->height) == 7) { printf("Input picture size isn't suitable for h263 codec! try h263+\n"); @@ -688,6 +705,7 @@ int MPV_encode_init(AVCodecContext *avctx) avctx->delay=0; s->low_delay=1; break; +#endif default: return -1; } @@ -725,24 +743,29 @@ int MPV_encode_init(AVCodecContext *avctx) ff_init_me(s); #ifdef CONFIG_ENCODERS +#ifdef CONFIG_RISKY if (s->out_format == FMT_H263) h263_encode_init(s); - else if (s->out_format == FMT_MPEG1) - ff_mpeg1_encode_init(s); if(s->msmpeg4_version) ff_msmpeg4_encode_init(s); #endif + if (s->out_format == FMT_MPEG1) + ff_mpeg1_encode_init(s); +#endif /* init default q matrix */ for(i=0;i<64;i++) { int j= s->idct_permutation[i]; +#ifdef CONFIG_RISKY if(s->codec_id==CODEC_ID_MPEG4 && s->mpeg_quant){ s->intra_matrix[j] = ff_mpeg4_default_intra_matrix[i]; s->inter_matrix[j] = ff_mpeg4_default_non_intra_matrix[i]; }else if(s->out_format == FMT_H263){ s->intra_matrix[j] = s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i]; - }else{ /* mpeg1 */ + }else +#endif + { /* mpeg1 */ s->intra_matrix[j] = ff_mpeg1_default_intra_matrix[i]; s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i]; } @@ -787,6 +810,44 @@ int MPV_encode_end(AVCodecContext *avctx) return 0; } +void init_rl(RLTable *rl) +{ + INT8 max_level[MAX_RUN+1], max_run[MAX_LEVEL+1]; + UINT8 index_run[MAX_RUN+1]; + int last, run, level, start, end, i; + + /* compute max_level[], max_run[] and index_run[] */ + for(last=0;last<2;last++) { + if (last == 0) { + start = 0; + end = rl->last; + } else { + start = rl->last; + end = rl->n; + } + + memset(max_level, 0, MAX_RUN + 1); + memset(max_run, 0, MAX_LEVEL + 1); + memset(index_run, rl->n, MAX_RUN + 1); + for(i=start;i<end;i++) { + run = rl->table_run[i]; + level = rl->table_level[i]; + if (index_run[run] == rl->n) + index_run[run] = i; + if (level > max_level[run]) + max_level[run] = level; + if (run > max_run[level]) + max_run[level] = run; + } + rl->max_level[last] = av_malloc(MAX_RUN + 1); + memcpy(rl->max_level[last], 
max_level, MAX_RUN + 1); + rl->max_run[last] = av_malloc(MAX_LEVEL + 1); + memcpy(rl->max_run[last], max_run, MAX_LEVEL + 1); + rl->index_run[last] = av_malloc(MAX_RUN + 1); + memcpy(rl->index_run[last], index_run, MAX_RUN + 1); + } +} + /* draw the edges of width 'w' of an image of size width, height */ //FIXME check that this is ok for mpeg4 interlaced static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w) @@ -1292,11 +1353,10 @@ static inline void gmc1_motion(MpegEncContext *s, dest_y+=dest_offset; if(s->flags&CODEC_FLAG_EMU_EDGE){ - if(src_x<0 || src_y<0 || src_x + (motion_x&15) + 16 > s->h_edge_pos - || src_y + (motion_y&15) + 16 > s->v_edge_pos){ + if(src_x<0 || src_y<0 || src_x + 17 >= s->h_edge_pos + || src_y + 17 >= s->v_edge_pos){ ff_emulated_edge_mc(s, ptr, linesize, 17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos); ptr= s->edge_emu_buffer; - emu=1; } } @@ -1331,9 +1391,13 @@ static inline void gmc1_motion(MpegEncContext *s, offset = (src_y * uvlinesize) + src_x + (src_offset>>1); ptr = ref_picture[1] + offset; - if(emu){ - ff_emulated_edge_mc(s, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); - ptr= s->edge_emu_buffer; + if(s->flags&CODEC_FLAG_EMU_EDGE){ + if(src_x<0 || src_y<0 || src_x + 9 >= s->h_edge_pos>>1 + || src_y + 9 >= s->v_edge_pos>>1){ + ff_emulated_edge_mc(s, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); + ptr= s->edge_emu_buffer; + emu=1; + } } s->dsp.gmc1(dest_cb + (dest_offset>>1), ptr, uvlinesize, 8, motion_x&15, motion_y&15, 128 - s->no_rounding); @@ -1656,6 +1720,14 @@ static inline void qpel_motion(MpegEncContext *s, pix_op[1][dxy](dest_cr + (dest_offset >> 1), ptr, uvlinesize, h >> 1); } +inline int ff_h263_round_chroma(int x){ + if (x >= 0) + return (h263_chroma_roundtab[x & 0xf] + ((x >> 3) & ~1)); + else { + x = -x; + return -(h263_chroma_roundtab[x & 0xf] + ((x >> 3) & ~1)); + } +} static inline void MPV_motion(MpegEncContext *s, UINT8 *dest_y, UINT8 *dest_cb, UINT8 *dest_cr, @@ -1672,6 +1744,7 @@ static inline void MPV_motion(MpegEncContext *s, switch(s->mv_type) { case MV_TYPE_16X16: +#ifdef CONFIG_RISKY if(s->mcsel){ if(s->real_sprite_warping_points==1){ gmc1_motion(s, dest_y, dest_cb, dest_cr, 0, @@ -1689,7 +1762,9 @@ static inline void MPV_motion(MpegEncContext *s, ff_mspel_motion(s, dest_y, dest_cb, dest_cr, ref_picture, pix_op, s->mv[dir][0][0], s->mv[dir][0][1], 16); - }else{ + }else +#endif + { mpeg_motion(s, dest_y, dest_cb, dest_cr, 0, ref_picture, 0, 0, pix_op, @@ -1766,20 +1841,8 @@ static inline void MPV_motion(MpegEncContext *s, if(s->flags&CODEC_FLAG_GRAY) break; /* In case of 8X8, we construct a single chroma motion vector with a special rounding */ - for(i=0;i<4;i++) { - } - if (mx >= 0) - mx = (h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1)); - else { - mx = -mx; - mx = -(h263_chroma_roundtab[mx & 0xf] + ((mx >> 3) & ~1)); - } - if (my >= 0) - my = (h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1)); - else { - my = -my; - my = -(h263_chroma_roundtab[my & 0xf] + ((my >> 3) & ~1)); - } + mx= ff_h263_round_chroma(mx); + my= ff_h263_round_chroma(my); dxy = ((my & 1) << 1) | (mx & 1); mx >>= 1; my >>= 1; @@ -2010,14 +2073,13 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) if(*mbskip_ptr >99) *mbskip_ptr= 99; /* if previous was skipped too, then nothing to do ! 
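ff_h263_round_chroma() above factors out the rounding that MPV_motion previously open-coded when deriving the single chroma vector from four luma vectors: the quotient of the sum is forced even and the low four bits are mapped through h263_chroma_roundtab. A worked illustration under that reading, with arbitrarily chosen values:

    /* four 8x8 luma vectors, half-pel units */
    int mx_sum = 4 + 5 + 6 + 5;                             /* = 20                       */
    int even   = (mx_sum >> 3) & ~1;                        /* 20 >> 3 = 2, already even  */
    int mx_c   = even + h263_chroma_roundtab[mx_sum & 0xf]; /* 2 + tab[4] = 2 + 1 = 3     */
    /* equivalent to mx_c = ff_h263_round_chroma(mx_sum); negative sums mirror the sign */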
*/ - if (*mbskip_ptr >= age){ -//if(s->pict_type!=B_TYPE && s->mb_x==0) printf("\n"); -//if(s->pict_type!=B_TYPE) printf("%d%d ", *mbskip_ptr, age); - if(s->pict_type!=B_TYPE) return; - if(s->avctx->draw_horiz_band==NULL && *mbskip_ptr > age) return; - /* we dont draw complete frames here so we cant skip */ + if (*mbskip_ptr >= age && s->current_picture.reference){ + return; } - } else { + } else if(!s->current_picture.reference){ + (*mbskip_ptr) ++; /* increase counter so the age can be compared cleanly */ + if(*mbskip_ptr >99) *mbskip_ptr= 99; + } else{ *mbskip_ptr = 0; /* not skipped */ } }else @@ -2088,9 +2150,12 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]) add_dct(s, block[4], 4, dest_cb, s->uvlinesize); add_dct(s, block[5], 5, dest_cr, s->uvlinesize); } - } else{ + } +#ifdef CONFIG_RISKY + else{ ff_wmv2_add_mb(s, block, dest_y, dest_cb, dest_cr); } +#endif } else { /* dct only in intra block */ if(s->encoding || !(s->mpeg2 || s->codec_id==CODEC_ID_MPEG1VIDEO)){ @@ -2585,7 +2650,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y) s->block_last_index[4]= s->block_last_index[5]= 0; s->block[4][0]= - s->block[5][0]= 128; + s->block[5][0]= (1024 + s->c_dc_scale/2)/ s->c_dc_scale; } #ifdef CONFIG_ENCODERS @@ -2593,6 +2658,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y) switch(s->codec_id){ //FIXME funct ptr could be slightly faster case CODEC_ID_MPEG1VIDEO: mpeg1_encode_mb(s, s->block, motion_x, motion_y); break; +#ifdef CONFIG_RISKY case CODEC_ID_MPEG4: mpeg4_encode_mb(s, s->block, motion_x, motion_y); break; case CODEC_ID_MSMPEG4V2: @@ -2601,18 +2667,48 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y) msmpeg4_encode_mb(s, s->block, motion_x, motion_y); break; case CODEC_ID_WMV2: ff_wmv2_encode_mb(s, s->block, motion_x, motion_y); break; - case CODEC_ID_MJPEG: - mjpeg_encode_mb(s, s->block); break; case CODEC_ID_H263: case CODEC_ID_H263P: case CODEC_ID_RV10: h263_encode_mb(s, s->block, motion_x, motion_y); break; +#endif + case CODEC_ID_MJPEG: + mjpeg_encode_mb(s, s->block); break; default: assert(0); } #endif } +/** + * combines the (truncated) bitstream to a complete frame + * @returns -1 if no complete frame could be created + */ +int ff_combine_frame( MpegEncContext *s, int next, uint8_t **buf, int *buf_size){ + ParseContext *pc= &s->parse_context; + + pc->last_index= pc->index; + + if(next==-1){ + pc->buffer= av_fast_realloc(pc->buffer, &pc->buffer_size, (*buf_size) + pc->index + FF_INPUT_BUFFER_PADDING_SIZE); + + memcpy(&pc->buffer[pc->index], *buf, *buf_size); + pc->index += *buf_size; + return -1; + } + + if(pc->index){ + pc->buffer= av_fast_realloc(pc->buffer, &pc->buffer_size, next + pc->index + FF_INPUT_BUFFER_PADDING_SIZE); + + memcpy(&pc->buffer[pc->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); + pc->index = 0; + *buf= pc->buffer; + *buf_size= pc->last_index + next; + } + + return 0; +} + void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length) { int bytes= length>>4; @@ -2769,10 +2865,12 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->current_picture.mb_var_sum = 0; s->current_picture.mc_mb_var_sum = 0; +#ifdef CONFIG_RISKY /* we need to initialize some time vars before we can encode b-frames */ if (s->h263_pred && !s->h263_msmpeg4) ff_set_mpeg4_time(s, s->picture_number); - +#endif + s->scene_change_score=0; s->qscale= (int)(s->frame_qscale + 0.5); //FIXME qscale / ... 
stuff for ME ratedistoration @@ -2789,6 +2887,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->no_rounding ^= 1; } /* Estimate motion for every MB */ + s->mb_intra=0; //for the rate distoration & bit compare functions if(s->pict_type != I_TYPE){ if(s->pict_type != B_TYPE){ if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){ @@ -2880,6 +2979,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->frame_qscale = ff_rate_estimate_qscale(s); if(s->adaptive_quant){ +#ifdef CONFIG_RISKY switch(s->codec_id){ case CODEC_ID_MPEG4: ff_clean_mpeg4_qscales(s); @@ -2889,6 +2989,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) ff_clean_h263_qscales(s); break; } +#endif s->qscale= s->current_picture.qscale_table[0]; }else @@ -2918,6 +3019,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) case FMT_MJPEG: mjpeg_picture_header(s); break; +#ifdef CONFIG_RISKY case FMT_H263: if (s->codec_id == CODEC_ID_WMV2) ff_wmv2_encode_picture_header(s, picture_number); @@ -2930,6 +3032,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) else h263_encode_picture_header(s, picture_number); break; +#endif case FMT_MPEG1: mpeg1_encode_picture_header(s, picture_number); break; @@ -2957,11 +3060,13 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->last_mv[0][0][0] = 0; s->last_mv[0][0][1] = 0; +#ifdef CONFIG_RISKY if (s->codec_id==CODEC_ID_H263 || s->codec_id==CODEC_ID_H263P) s->gob_index = ff_h263_get_gob_height(s); if(s->codec_id==CODEC_ID_MPEG4 && s->partitioned_frame) ff_mpeg4_init_partitions(s); +#endif s->resync_mb_x=0; s->resync_mb_y=0; @@ -2979,7 +3084,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->block_index[4]= s->block_wrap[4]*(mb_y + 1) + s->block_wrap[0]*(s->mb_height*2 + 2); s->block_index[5]= s->block_wrap[4]*(mb_y + 1 + s->mb_height + 2) + s->block_wrap[0]*(s->mb_height*2 + 2); for(mb_x=0; mb_x < s->mb_width; mb_x++) { - const int mb_type= s->mb_type[mb_y * s->mb_width + mb_x]; + int mb_type= s->mb_type[mb_y * s->mb_width + mb_x]; const int xy= (mb_y+1) * (s->mb_width+2) + mb_x + 1; // int d; int dmin=10000000; @@ -2994,6 +3099,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->block_index[5]++; /* write gob / video packet header */ +#ifdef CONFIG_RISKY if(s->rtp_mode){ int current_packet_size, is_gob_start; @@ -3034,6 +3140,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->resync_mb_y=mb_y; } } +#endif if( (s->resync_mb_x == s->mb_x) && s->resync_mb_y+1 == s->mb_y){ @@ -3108,7 +3215,9 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT; s->mb_intra= 0; +#ifdef CONFIG_RISKY ff_mpeg4_set_direct_mv(s, mx, my); +#endif encode_mb_hq(s, &backup_s, &best_s, MB_TYPE_DIRECT, pb, pb2, tex_pb, &dmin, &next_block, mx, my); } @@ -3145,8 +3254,93 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->last_bits= get_bit_count(&s->pb); } else { int motion_x, motion_y; + int intra_score; + int inter_score= s->current_picture.mb_cmp_score[mb_x + mb_y*s->mb_width]; + + if(!(s->flags&CODEC_FLAG_HQ) && s->pict_type==P_TYPE){ + /* get luma score */ + if((s->avctx->mb_cmp&0xFF)==FF_CMP_SSE){ + intra_score= (s->current_picture.mb_var[mb_x + mb_y*s->mb_width]<<8) - 500; //FIXME dont scale it down so we dont have to fix it + }else{ + uint8_t *dest_y; + + int mean= s->current_picture.mb_mean[mb_x + mb_y*s->mb_width]; 
//FIXME + mean*= 0x01010101; + + dest_y = s->new_picture.data[0] + (mb_y * 16 * s->linesize ) + mb_x * 16; + + for(i=0; i<16; i++){ + *(uint32_t*)(&s->me.scratchpad[i*s->linesize+ 0]) = mean; + *(uint32_t*)(&s->me.scratchpad[i*s->linesize+ 4]) = mean; + *(uint32_t*)(&s->me.scratchpad[i*s->linesize+ 8]) = mean; + *(uint32_t*)(&s->me.scratchpad[i*s->linesize+12]) = mean; + } + + s->mb_intra=1; + intra_score= s->dsp.mb_cmp[0](s, s->me.scratchpad, dest_y, s->linesize); + +/* printf("intra:%7d inter:%7d var:%7d mc_var.%7d\n", intra_score>>8, inter_score>>8, + s->current_picture.mb_var[mb_x + mb_y*s->mb_width], + s->current_picture.mc_mb_var[mb_x + mb_y*s->mb_width]);*/ + } + + /* get chroma score */ + if(s->avctx->mb_cmp&FF_CMP_CHROMA){ + int i; + + s->mb_intra=1; + for(i=1; i<3; i++){ + uint8_t *dest_c; + int mean; + + if(s->out_format == FMT_H263){ + mean= (s->dc_val[i][mb_x + (mb_y+1)*(s->mb_width+2)] + 4)>>3; //FIXME not exact but simple ;) + }else{ + mean= (s->last_dc[i] + 4)>>3; + } + dest_c = s->new_picture.data[i] + (mb_y * 8 * (s->uvlinesize)) + mb_x * 8; + + mean*= 0x01010101; + for(i=0; i<8; i++){ + *(uint32_t*)(&s->me.scratchpad[i*s->uvlinesize+ 0]) = mean; + *(uint32_t*)(&s->me.scratchpad[i*s->uvlinesize+ 4]) = mean; + } + + intra_score+= s->dsp.mb_cmp[1](s, s->me.scratchpad, dest_c, s->uvlinesize); + } + } + + /* bias */ + switch(s->avctx->mb_cmp&0xFF){ + default: + case FF_CMP_SAD: + intra_score+= 32*s->qscale; + break; + case FF_CMP_SSE: + intra_score+= 24*s->qscale*s->qscale; + break; + case FF_CMP_SATD: + intra_score+= 96*s->qscale; + break; + case FF_CMP_DCT: + intra_score+= 48*s->qscale; + break; + case FF_CMP_BIT: + intra_score+= 16; + break; + case FF_CMP_PSNR: + case FF_CMP_RD: + intra_score+= (s->qscale*s->qscale*109*8 + 64)>>7; + break; + } + + if(intra_score < inter_score) + mb_type= MB_TYPE_INTRA; + } + s->mv_type=MV_TYPE_16X16; // only one MB-Type possible + switch(mb_type){ case MB_TYPE_INTRA: s->mv_dir = MV_DIR_FORWARD; @@ -3175,7 +3369,9 @@ static void encode_picture(MpegEncContext *s, int picture_number) s->mb_intra= 0; motion_x=s->b_direct_mv_table[xy][0]; motion_y=s->b_direct_mv_table[xy][1]; +#ifdef CONFIG_RISKY ff_mpeg4_set_direct_mv(s, motion_x, motion_y); +#endif break; case MB_TYPE_BIDIR: s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD; @@ -3253,6 +3449,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) } emms_c(); +#ifdef CONFIG_RISKY if(s->codec_id==CODEC_ID_MPEG4 && s->partitioned_frame) ff_mpeg4_merge_partitions(s); @@ -3261,6 +3458,7 @@ static void encode_picture(MpegEncContext *s, int picture_number) if(s->codec_id==CODEC_ID_MPEG4) ff_mpeg4_stuffing(&s->pb); +#endif //if (s->gob_number) // fprintf(stderr,"\nNumber of GOB: %d", s->gob_number); @@ -3376,7 +3574,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s, return last_non_zero; } - lambda= (qscale*qscale*64*82 + 50)/100; //FIXME finetune + lambda= (qscale*qscale*64*105 + 64)>>7; //FIXME finetune score_tab[0]= 0; for(i=0; i<=last_non_zero - start_i; i++){ @@ -3783,6 +3981,8 @@ AVCodec mpeg1video_encoder = { MPV_encode_end, }; +#ifdef CONFIG_RISKY + AVCodec h263_encoder = { "h263", CODEC_TYPE_VIDEO, @@ -3813,16 +4013,6 @@ AVCodec rv10_encoder = { MPV_encode_end, }; -AVCodec mjpeg_encoder = { - "mjpeg", - CODEC_TYPE_VIDEO, - CODEC_ID_MJPEG, - sizeof(MpegEncContext), - MPV_encode_init, - MPV_encode_picture, - MPV_encode_end, -}; - AVCodec mpeg4_encoder = { "mpeg4", CODEC_TYPE_VIDEO, @@ -3873,3 +4063,14 @@ AVCodec wmv1_encoder = { MPV_encode_end, }; +#endif + +AVCodec 
mjpeg_encoder = { + "mjpeg", + CODEC_TYPE_VIDEO, + CODEC_ID_MJPEG, + sizeof(MpegEncContext), + MPV_encode_init, + MPV_encode_picture, + MPV_encode_end, +}; diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h index e6909817a..7ecc6fd38 100644 --- a/src/libffmpeg/libavcodec/mpegvideo.h +++ b/src/libffmpeg/libavcodec/mpegvideo.h @@ -98,7 +98,6 @@ typedef struct RateControlContext{ int last_non_b_pict_type; }RateControlContext; - typedef struct ScanTable{ const UINT8 *scantable; UINT8 permutated[64]; @@ -117,6 +116,7 @@ typedef struct Picture{ uint16_t *mb_var; /* Table for MB variances */ uint16_t *mc_mb_var; /* Table for motion compensated MB variances */ uint8_t *mb_mean; /* Table for MB luminance */ + int32_t *mb_cmp_score; /* Table for MB cmp scores, for mb decission */ int b_frame_score; /* */ } Picture; @@ -142,6 +142,7 @@ typedef struct MotionEstContext{ int pre_penalty_factor; int penalty_factor; int sub_penalty_factor; + int mb_penalty_factor; int pre_pass; /* = 1 for the pre pass */ int dia_size; UINT16 (*mv_penalty)[MAX_MV*2+1]; /* amount of bits needed to encode a MV */ @@ -160,6 +161,8 @@ typedef struct MotionEstContext{ int P[10][2], int pred_x, int pred_y, int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], int ref_mv_scale, uint16_t * const mv_penalty); + int (*get_mb_score)(struct MpegEncContext * s, int mx, int my, int pred_x, int pred_y, Picture *ref_picture, + uint16_t * const mv_penalty); }MotionEstContext; typedef struct MpegEncContext { @@ -226,6 +229,8 @@ typedef struct MpegEncContext { UINT8 *coded_block; /* used for coded block pattern prediction (msmpeg4v3, wmv1)*/ INT16 (*ac_val[3])[16]; /* used for for mpeg4 AC prediction, all 3 arrays must be continuous */ int ac_pred; + uint8_t *prev_pict_types; /* previous picture types in bitstream order, used for mb skip */ +#define PREV_PICT_TYPES_BUFFER_SIZE 256 int mb_skiped; /* MUST BE SET only during DECODING */ UINT8 *mbskip_table; /* used to avoid copy if macroblock skipped (for black regions for example) and used for b-frame encoding & decoding (contains skip table of next P Frame) */ @@ -321,6 +326,8 @@ typedef struct MpegEncContext { uint8_t *intra_ac_vlc_last_length; uint8_t *inter_ac_vlc_length; uint8_t *inter_ac_vlc_last_length; + uint8_t *luma_dc_vlc_length; + uint8_t *chroma_dc_vlc_length; #define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level)) /* precomputed matrix (combine qscale and DCT renorm) */ @@ -544,14 +551,15 @@ typedef struct MpegEncContext { #define SLICE_NOEND -3 //no end marker or error found but mb count exceeded void (*dct_unquantize_mpeg1)(struct MpegEncContext *s, - DCTELEM *block, int n, int qscale); + DCTELEM *block/*align 16*/, int n, int qscale); void (*dct_unquantize_mpeg2)(struct MpegEncContext *s, - DCTELEM *block, int n, int qscale); + DCTELEM *block/*align 16*/, int n, int qscale); void (*dct_unquantize_h263)(struct MpegEncContext *s, - DCTELEM *block, int n, int qscale); + DCTELEM *block/*align 16*/, int n, int qscale); void (*dct_unquantize)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both) - DCTELEM *block, int n, int qscale); - int (*dct_quantize)(struct MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow); + DCTELEM *block/*align 16*/, int n, int qscale); + int (*dct_quantize)(struct MpegEncContext *s, DCTELEM *block/*align 16*/, int n, int qscale, int *overflow); + int (*fast_dct_quantize)(struct MpegEncContext *s, DCTELEM *block/*align 16*/, int n, int qscale, int 
*overflow); void (*fdct)(DCTELEM *block/* align 16*/); void (*idct_put)(UINT8 *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); void (*idct_add)(UINT8 *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); @@ -596,7 +604,9 @@ void ff_draw_horiz_band(MpegEncContext *s); void ff_emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, int src_x, int src_y, int w, int h); char ff_get_pict_type_char(int pict_type); +int ff_combine_frame( MpegEncContext *s, int next, uint8_t **buf, int *buf_size); +extern enum PixelFormat ff_yuv420p_list[2]; extern int ff_bit_exact; @@ -690,7 +700,7 @@ void h263_encode_picture_header(MpegEncContext *s, int picture_number); int h263_encode_gob_header(MpegEncContext * s, int mb_line); INT16 *h263_pred_motion(MpegEncContext * s, int block, int *px, int *py); -void mpeg4_pred_ac(MpegEncContext * s, INT16 *block, int n, +void mpeg4_pred_ac(MpegEncContext * s, DCTELEM *block, int n, int dir); void ff_set_mpeg4_time(MpegEncContext * s, int picture_number); void mpeg4_encode_picture_header(MpegEncContext *s, int picture_number); @@ -717,6 +727,7 @@ int ff_mpeg4_get_video_packet_prefix_length(MpegEncContext *s); int ff_h263_resync(MpegEncContext *s); int ff_h263_get_gob_height(MpegEncContext *s); void ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my); +inline int ff_h263_round_chroma(int x); /* rv10.c */ diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c index a08418874..2c524a067 100644 --- a/src/libffmpeg/libavcodec/msmpeg4.c +++ b/src/libffmpeg/libavcodec/msmpeg4.c @@ -502,7 +502,7 @@ static void msmpeg4_encode_motion(MpegEncContext * s, static inline void handle_slices(MpegEncContext *s){ if (s->mb_x == 0) { if (s->slice_height && (s->mb_y % s->slice_height) == 0) { - if(s->msmpeg4_version != 4){ + if(s->msmpeg4_version < 4){ ff_mpeg4_clean_buffers(s); } s->first_slice_line = 1; @@ -691,7 +691,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n, b = dc_val[ - 1 - wrap]; c = dc_val[ - wrap]; - if(s->first_slice_line && (n&2)==0 && s->msmpeg4_version!=4){ + if(s->first_slice_line && (n&2)==0 && s->msmpeg4_version<4){ b=c=1024; } @@ -1195,7 +1195,7 @@ int msmpeg4_decode_picture_header(MpegEncContext * s) #if 0 { int i; -for(i=0; i<s->gb.size*8; i++) +for(i=0; i<s->gb.size_in_bits; i++) printf("%d", get_bits1(&s->gb)); // get_bits1(&s->gb); printf("END\n"); @@ -1869,7 +1869,7 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block, if (i > 62){ i-= 192; if(i&(~63)){ - const int left= s->gb.size*8 - get_bits_count(&s->gb); + const int left= s->gb.size_in_bits - get_bits_count(&s->gb); if(((i+192 == 64 && level/qmul==-1) || s->error_resilience<=1) && left>=0){ fprintf(stderr, "ignoring overflow at %d %d\n", s->mb_x, s->mb_y); break; diff --git a/src/libffmpeg/libavcodec/msmpeg4data.h b/src/libffmpeg/libavcodec/msmpeg4data.h index 3490fc08c..2c3a28f0e 100644 --- a/src/libffmpeg/libavcodec/msmpeg4data.h +++ b/src/libffmpeg/libavcodec/msmpeg4data.h @@ -1868,7 +1868,10 @@ static const uint8_t *wmv1_scantable[WMV1_SCANTABLE_COUNT+1]={ }; static const uint8_t table_inter_intra[4][2]={ - {0,1},{2,2},{6,3},{7,3} + {0,1} /*Luma-Left Chroma-Left*/, + {2,2} /*Luma-Top Chroma-Left*/, + {6,3} /*luma-Left Chroma-Top */, + {7,3} /*luma-Top Chroma-Top */ }; #define WMV2_INTER_CBP_TABLE_COUNT 4 diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c index 5f14ed0eb..dc62e70f4 100644 --- 
a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2002 Brian Foley * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -20,21 +21,39 @@ #include "../dsputil.h" #include "dsputil_altivec.h" -#if CONFIG_DARWIN +#ifdef CONFIG_DARWIN #include <sys/sysctl.h> -#endif +#else /* CONFIG_DARWIN */ +#include <signal.h> +#include <setjmp.h> + +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void sigill_handler (int sig) +{ + if (!canjump) { + signal (sig, SIG_DFL); + raise (sig); + } + + canjump = 0; + siglongjmp (jmpbuf, 1); +} +#endif /* CONFIG_DARWIN */ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; - zero = vec_splat_u8(0); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: @@ -72,16 +91,17 @@ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; - zero = vec_splat_u8(0); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one @@ -131,20 +151,21 @@ int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; + int i; + int s __attribute__((aligned(16))); uint8_t *pix3 = pix2 + line_size; - vector unsigned char *tv, avgv, t5, zero; + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); + vector unsigned char *tv, avgv, t5; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv, two; + vector unsigned short avghv, avglv; vector unsigned short t1, t2, t3, t4; vector unsigned int sad; vector signed int sumdiffs; - zero = vec_splat_u8(0); - two = vec_splat_u16(2); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); s = 0; @@ -231,14 +252,15 @@ int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int i, s; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad, zero; + vector unsigned int sad; vector 
signed int sumdiffs; - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); + sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { @@ -272,15 +294,20 @@ int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int i, s; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); - permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); + sad = (vector unsigned int)vec_splat_u32(0); +#ifdef CONFIG_DARWIN + permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); +#else + permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; +#endif for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 @@ -315,14 +342,15 @@ int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_norm1_altivec(uint8_t *pix, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char *tv; vector unsigned char pixv; vector unsigned int sv; vector signed int sum; - - zero = vec_splat_u8(0); - sv = vec_splat_u32(0); + + sv = (vector unsigned int)vec_splat_u32(0); s = 0; for (i = 0; i < 16; i++) { @@ -343,18 +371,127 @@ int pix_norm1_altivec(uint8_t *pix, int line_size) return s; } -int pix_sum_altivec(UINT8 * pix, int line_size) +/** + * Sum of Squared Errors for a 8x8 block. + * AltiVec-enhanced. + * It's the pix_abs8x8_altivec code above w/ squaring added. + */ +int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; + vector unsigned char t1, t2, t3,t4, t5; + vector unsigned int sum; + vector signed int sumsqr; + + sum = (vector unsigned int)vec_splat_u32(0); +#ifdef CONFIG_DARWIN + permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); +#else + permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; +#endif + + for(i=0;i<8;i++) { + /* Read potentially unaligned pixels into t1 and t2 + Since we're reading 16 pixels, and actually only want 8, + mask out the last 8 pixels. The 0s don't change the sum. */ + perm1 = vec_lvsl(0, pix1); + pix1v = (vector unsigned char *) pix1; + perm2 = vec_lvsl(0, pix2); + pix2v = (vector unsigned char *) pix2; + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); + /* + Since we want to use unsigned chars, we can take advantage + of the fact that abs(a-b)^2 = (a-b)^2. 
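      Because the operands are unsigned bytes, |a-b| is formed below as
      vec_sub(vec_max(t1,t2), vec_min(t1,t2)) with no risk of wrap-around;
      vec_msum then multiplies each byte difference by itself and adds groups
      of four products into the 32-bit partial sums, which vec_sums folds into
      a single total at the end.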
+ */ + + /* Calculate abs differences vector */ + t3 = vec_max(t1, t2); + t4 = vec_min(t1, t2); + t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +/** + * Sum of Squared Errors for a 16x16 block. + * AltiVec-enhanced. + * It's the pix_abs16x16_altivec code above w/ squaring added. + */ +int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) +{ + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char perm1, perm2, *pix1v, *pix2v; + vector unsigned char t1, t2, t3,t4, t5; + vector unsigned int sum; + vector signed int sumsqr; + + sum = (vector unsigned int)vec_splat_u32(0); + + for(i=0;i<16;i++) { + /* Read potentially unaligned pixels into t1 and t2 */ + perm1 = vec_lvsl(0, pix1); + pix1v = (vector unsigned char *) pix1; + perm2 = vec_lvsl(0, pix2); + pix2v = (vector unsigned char *) pix2; + t1 = vec_perm(pix1v[0], pix1v[1], perm1); + t2 = vec_perm(pix2v[0], pix2v[1], perm2); + + /* + Since we want to use unsigned chars, we can take advantage + of the fact that abs(a-b)^2 = (a-b)^2. + */ + + /* Calculate abs differences vector */ + t3 = vec_max(t1, t2); + t4 = vec_min(t1, t2); + t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +int pix_sum_altivec(UINT8 * pix, int line_size) +{ + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm, *pixv; vector unsigned char t1; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - int s, i; - - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); + int i; + int s __attribute__((aligned(16))); + + sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < 16; i++) { /* Read the potentially unaligned 16 pixels into t1 */ @@ -380,7 +517,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_s { int i; vector unsigned char perm, bytes, *pixv; - vector unsigned char zero = (vector unsigned char) (0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts; for(i=0;i<8;i++) @@ -407,7 +544,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, { int i; vector unsigned char perm, bytes, *pixv; - vector unsigned char zero = (vector unsigned char) (0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts1, shorts2; for(i=0;i<4;i++) @@ -474,10 +611,675 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, } } +int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { + return pix_abs16x16_altivec(a,b,stride); +} + +int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { + return pix_abs8x8_altivec(a,b,stride); +} + +void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + for(i=0; i+7<w; 
i++){ + dst[i+0] += src[i+0]; + dst[i+1] += src[i+1]; + dst[i+2] += src[i+2]; + dst[i+3] += src[i+3]; + dst[i+4] += src[i+4]; + dst[i+5] += src[i+5]; + dst[i+6] += src[i+6]; + dst[i+7] += src[i+7]; + } + for(; i<w; i++) + dst[i+0] += src[i+0]; +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char vdst, vsrc; + + /* dst and src are 16 bytes-aligned (guaranteed) */ + for(i = 0 ; (i + 15) < w ; i++) + { + vdst = vec_ld(i << 4, (unsigned char*)dst); + vsrc = vec_ld(i << 4, (unsigned char*)src); + vdst = vec_add(vsrc, vdst); + vec_st(vdst, i << 4, (unsigned char*)dst); + } + /* if w is not a multiple of 16 */ + for (; (i < w) ; i++) + { + dst[i] = src[i]; + } +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2; + register vector unsigned char perm = vec_lvsl(0, pixels); + int i; + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + vec_st(vec_perm(pixelsv1, pixelsv2, perm), + 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + register vector unsigned char perm = vec_lvsl(0, pixels); + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + blockv = vec_ld(0, block); + pixelsv = vec_perm(pixelsv1, pixelsv2, perm); + blockv = vec_avg(blockv,pixelsv); + vec_st(blockv, 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next 
one assumes that ((line_size % 8) == 0) */ +void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; +POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); + for (i = 0; i < h; i++) { + *((uint32_t *) (block)) = + (((*((uint32_t *) (block))) | + ((((const struct unaligned_32 *) (pixels))->l))) - + ((((*((uint32_t *) (block))) ^ + ((((const struct unaligned_32 *) (pixels))-> + l))) & 0xFEFEFEFEUL) >> 1)); + *((uint32_t *) (block + 4)) = + (((*((uint32_t *) (block + 4))) | + ((((const struct unaligned_32 *) (pixels + 4))->l))) - + ((((*((uint32_t *) (block + 4))) ^ + ((((const struct unaligned_32 *) (pixels + + 4))-> + l))) & 0xFEFEFEFEUL) >> 1)); + pixels += line_size; + block += line_size; + } +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); + + for (i = 0; i < h; i++) { + /* + block is 8 bytes-aligned, so we're either in the + left block (16 bytes-aligned) or in the right block (not) + */ + int rightside = ((unsigned long)block & 0x0000000F); + + blockv = vec_ld(0, block); + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); + + if (rightside) + { + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); + } + else + { + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); + } + + blockv = vec_avg(blockv, pixelsv); + + vec_st(blockv, 0, block); + + pixels += line_size; + block += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 8) == 0) */ +void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, + pixelsavg; + register vector unsigned char + 
blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + +POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) + { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } + else + { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 8) == 0) */ +void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + 
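/* Note (not a patch line): the scalar fallback above is the usual packed
   half-pel average. Each 32-bit word holds four pixels; the low two bits
   (& 0x03030303) and the high six bits (& 0xFCFCFCFC, then >> 2) of the
   neighbouring samples are summed separately, so four (a+b+c+d+round)>>2
   averages are produced per word without carries spilling between bytes.
   This no_rnd variant adds 0x01010101 where the rounding version above adds
   0x02020202. */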
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, + pixelsavg; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vcone); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) + { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } + else + { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + for (j = 0; j < 4; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) 
+ 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3, + pixelssum3, pixelssum4, temp4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vctwo); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + for (j = 0; j < 4; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = 
(((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3, + pixelssum3, pixelssum4, temp4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vcone); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vcone); + pixelssum1 = vec_add(pixelssum2, vcone); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} int has_altivec(void) { -#if CONFIG_DARWIN +#ifdef CONFIG_DARWIN int sels[2] = {CTL_HW, HW_VECTORUNIT}; int has_vu = 0; size_t len = 
sizeof(has_vu); @@ -486,7 +1288,25 @@ int has_altivec(void) err = sysctl(sels, 2, &has_vu, &len, NULL, 0); if (err == 0) return (has_vu != 0); -#endif +#else /* CONFIG_DARWIN */ +/* no Darwin, do it the brute-force way */ +/* this is borrowed from the libmpeg2 library */ + { + signal (SIGILL, sigill_handler); + if (sigsetjmp (jmpbuf, 1)) { + signal (SIGILL, SIG_DFL); + } else { + canjump = 1; + + asm volatile ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); + + signal (SIGILL, SIG_DFL); + return 1; + } + } +#endif /* CONFIG_DARWIN */ return 0; } - diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h index d4d259d9e..61dbec548 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.h @@ -17,14 +17,79 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#ifndef _DSPUTIL_ALTIVEC_ +#define _DSPUTIL_ALTIVEC_ + +#include "dsputil_ppc.h" + +#ifdef HAVE_ALTIVEC + extern int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size); extern int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size); extern int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size); extern int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size); extern int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size); +extern int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride); +extern int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride); extern int pix_norm1_altivec(uint8_t *pix, int line_size); +extern int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size); +extern int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size); extern int pix_sum_altivec(UINT8 * pix, int line_size); extern void diff_pixels_altivec(DCTELEM* block, const UINT8* s1, const UINT8* s2, int stride); extern void get_pixels_altivec(DCTELEM* block, const UINT8 * pixels, int line_size); +extern void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w); +extern void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels, int line_size); +extern void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); +extern void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); +extern void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); +extern void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); +extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); +extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); +extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h); + +extern void gmc1_altivec(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder); + extern int has_altivec(void); + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f + +#ifdef CONFIG_DARWIN +#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ 
## d) +#else +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} +#endif + +// vcprmle is used to keep the same index as in the SSE version. +// it's the same as vcprm, with the index inversed +// ('le' is Little Endian) +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) + +// used to build inverse/identity vectors (vcii) +// n is _n_egative, p is _p_ositive +#define FLOAT_n -1. +#define FLOAT_p 1. + + +#ifdef CONFIG_DARWIN +#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) +#else +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} +#endif + +#else /* HAVE_ALTIVEC */ +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +#error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC" +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +#endif /* HAVE_ALTIVEC */ + +#endif /* _DSPUTIL_ALTIVEC_ */ diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c index 733d0c156..c502f5819 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c @@ -19,18 +19,168 @@ #include "../dsputil.h" +#include "dsputil_ppc.h" + #ifdef HAVE_ALTIVEC #include "dsputil_altivec.h" #endif int mm_flags = 0; +int mm_support(void) +{ + int result = 0; +#if HAVE_ALTIVEC + if (has_altivec()) { + result |= MM_ALTIVEC; + } +#endif /* result */ + return result; +} + +#ifdef POWERPC_TBL_PERFORMANCE_REPORT +unsigned long long perfdata[powerpc_perf_total][powerpc_data_total]; +/* list below must match enum in dsputil_ppc.h */ +static unsigned char* perfname[] = { + "fft_calc_altivec", + "gmc1_altivec", + "dct_unquantize_h263_altivec", + "idct_add_altivec", + "idct_put_altivec", + "put_pixels16_altivec", + "avg_pixels16_altivec", + "avg_pixels8_altivec", + "put_pixels8_xy2_altivec", + "put_no_rnd_pixels8_xy2_altivec", + "put_pixels16_xy2_altivec", + "put_no_rnd_pixels16_xy2_altivec", + "clear_blocks_dcbz32_ppc" +}; +#ifdef POWERPC_PERF_USE_PMC +unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; +#endif +#include <stdio.h> +#endif + +#ifdef POWERPC_TBL_PERFORMANCE_REPORT +void powerpc_display_perf_report(void) +{ + int i; +#ifndef POWERPC_PERF_USE_PMC + fprintf(stderr, "PowerPC performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n"); +#else /* POWERPC_PERF_USE_PMC */ + fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); +#endif /* POWERPC_PERF_USE_PMC */ + for(i = 0 ; i < powerpc_perf_total ; i++) + { + if (perfdata[i][powerpc_data_num] != (unsigned long long)0) + fprintf(stderr, " Function \"%s\" (pmc1):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", + perfname[i], + perfdata[i][powerpc_data_min], + perfdata[i][powerpc_data_max], + (double)perfdata[i][powerpc_data_sum] / + (double)perfdata[i][powerpc_data_num], + perfdata[i][powerpc_data_num]); +#ifdef POWERPC_PERF_USE_PMC + if (perfdata_miss[i][powerpc_data_num] != (unsigned long long)0) + fprintf(stderr, " Function \"%s\" (pmc2):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", + perfname[i], + perfdata_miss[i][powerpc_data_min], + perfdata_miss[i][powerpc_data_max], + (double)perfdata_miss[i][powerpc_data_sum] / + (double)perfdata_miss[i][powerpc_data_num], + perfdata_miss[i][powerpc_data_num]); +#endif + } +} +#endif /* POWERPC_TBL_PERFORMANCE_REPORT */ + +/* ***** WARNING ***** WARNING ***** WARNING ***** */ +/* + 
clear_blocks_dcbz32_ppc will not work properly + on PowerPC processors with a cache line size + not equal to 32 bytes. + Fortunately all processor used by Apple up to + at least the 7450 (aka second generation G4) + use 32 bytes cache line. + This is due to the use of the 'dcbz' instruction. + It simply clear to zero a single cache line, + so you need to know the cache line size to use it ! + It's absurd, but it's fast... +*/ +void clear_blocks_dcbz32_ppc(DCTELEM *blocks) +{ +POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1); + register int misal = ((unsigned long)blocks & 0x00000010); + register int i = 0; +POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz32, 1); +#if 1 + if (misal) { + ((unsigned long*)blocks)[0] = 0L; + ((unsigned long*)blocks)[1] = 0L; + ((unsigned long*)blocks)[2] = 0L; + ((unsigned long*)blocks)[3] = 0L; + i += 16; + } + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { + asm volatile("dcbz %0,%1" : : "r" (blocks), "r" (i) : "memory"); + } + if (misal) { + ((unsigned long*)blocks)[188] = 0L; + ((unsigned long*)blocks)[189] = 0L; + ((unsigned long*)blocks)[190] = 0L; + ((unsigned long*)blocks)[191] = 0L; + i += 16; + } +#else + memset(blocks, 0, sizeof(DCTELEM)*6*64); +#endif +POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); +} + +/* check dcbz report how many bytes are set to 0 by dcbz */ +long check_dcbz_effect(void) +{ + register char *fakedata = (char*)av_malloc(1024); + register char *fakedata_middle; + register long zero = 0; + register long i = 0; + long count = 0; + + if (!fakedata) + { + return 0L; + } + + fakedata_middle = (fakedata + 512); + + memset(fakedata, 0xFF, 1024); + + asm volatile("dcbz %0, %1" : : "r" (fakedata_middle), "r" (zero)); + + for (i = 0; i < 1024 ; i ++) + { + if (fakedata[i] == (char)0) + count++; + } + + av_free(fakedata); + + return count; +} + void dsputil_init_ppc(DSPContext* c, unsigned mask) { // Common optimisations whether Altivec or not - // ... pending ... - + switch (check_dcbz_effect()) { + case 32: + c->clear_blocks = clear_blocks_dcbz32_ppc; + break; + default: + break; + } + #if HAVE_ALTIVEC if (has_altivec()) { mm_flags |= MM_ALTIVEC; @@ -41,12 +191,51 @@ void dsputil_init_ppc(DSPContext* c, unsigned mask) c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec; c->pix_abs16x16 = pix_abs16x16_altivec; c->pix_abs8x8 = pix_abs8x8_altivec; + c->sad[0]= sad16x16_altivec; + c->sad[1]= sad8x8_altivec; c->pix_norm1 = pix_norm1_altivec; + c->sse[1]= sse8_altivec; + c->sse[0]= sse16_altivec; c->pix_sum = pix_sum_altivec; c->diff_pixels = diff_pixels_altivec; c->get_pixels = get_pixels_altivec; +// next one disabled as it's untested. +#if 0 + c->add_bytes= add_bytes_altivec; +#endif /* 0 */ + c->put_pixels_tab[0][0] = put_pixels16_altivec; + c->avg_pixels_tab[0][0] = avg_pixels16_altivec; +// next one disabled as it's untested. 
+#if 0 + c->avg_pixels_tab[1][0] = avg_pixels8_altivec; +#endif /* 0 */ + c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; + c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; + + c->gmc1 = gmc1_altivec; + +#ifdef POWERPC_TBL_PERFORMANCE_REPORT + { + int i; + for (i = 0 ; i < powerpc_perf_total ; i++) + { + perfdata[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF; + perfdata[i][powerpc_data_max] = 0x0000000000000000; + perfdata[i][powerpc_data_sum] = 0x0000000000000000; + perfdata[i][powerpc_data_num] = 0x0000000000000000; +#ifdef POWERPC_PERF_USE_PMC + perfdata_miss[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF; + perfdata_miss[i][powerpc_data_max] = 0x0000000000000000; + perfdata_miss[i][powerpc_data_sum] = 0x0000000000000000; + perfdata_miss[i][powerpc_data_num] = 0x0000000000000000; +#endif /* POWERPC_PERF_USE_PMC */ + } + } +#endif /* POWERPC_TBL_PERFORMANCE_REPORT */ } else -#endif +#endif /* HAVE_ALTIVEC */ { // Non-AltiVec PPC optimisations diff --git a/src/libffmpeg/libavcodec/ppc/fft_altivec.c b/src/libffmpeg/libavcodec/ppc/fft_altivec.c index 1a926b77c..992be5b8e 100644 --- a/src/libffmpeg/libavcodec/ppc/fft_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/fft_altivec.c @@ -1,7 +1,7 @@ /* * FFT/IFFT transforms * AltiVec-enabled - * Copyright (c) 2002 Romain Dolbeau <romain@dolbeau.org> + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> * Based on code Copyright (c) 2002 Fabrice Bellard. * * This library is free software; you can redistribute it and/or @@ -22,30 +22,30 @@ #include "dsputil_altivec.h" -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. +/* + those three macros are from libavcodec/fft.c + and are required for the reference C code +*/ +/* butter fly op */ +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ +{\ + FFTSample ax, ay, bx, by;\ + bx=pre1;\ + by=pim1;\ + ax=qre1;\ + ay=qim1;\ + pre = (bx + ax);\ + pim = (by + ay);\ + qre = (bx - ax);\ + qim = (by - ay);\ +} +#define MUL16(a,b) ((a) * (b)) +#define CMUL(pre, pim, are, aim, bre, bim) \ +{\ + pre = (MUL16(are, bre) - MUL16(aim, bim));\ + pim = (MUL16(are, bim) + MUL16(bre, aim));\ +} -#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) /** * Do a complex FFT with the parameters defined in fft_init(). 
The @@ -55,20 +55,94 @@ * This code assumes that the 'z' pointer is 16 bytes-aligned * It also assumes all FFTComplex are 8 bytes-aligned pair of float * The code is exactly the same as the SSE version, except - * that successive MUL + ADD/SUB have been fusionned into + * that successive MUL + ADD/SUB have been merged into * fused multiply-add ('vec_madd' in altivec) - * - * To test this code you can use fft-test in libavcodec ; use - * the following line in libavcodec to compile (MacOS X): - * ##### - * gcc -I. -Ippc -no-cpp-precomp -pipe -O3 -fomit-frame-pointer -mdynamic-no-pic -Wall - * -faltivec -DARCH_POWERPC -DHAVE_ALTIVEC -DCONFIG_DARWIN fft-test.c fft.c - * ppc/fft_altivec.c ppc/dsputil_altivec.c mdct.c -DHAVE_LRINTF -o fft-test - * ##### */ void fft_calc_altivec(FFTContext *s, FFTComplex *z) { - register const vector float vczero = (vector float)( 0., 0., 0., 0.); +POWERPC_TBL_DECLARE(altivec_fft_num, s->nbits >= 6); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *exptab = s->exptab; + int l; + FFTSample tmp_re, tmp_im; + +POWERPC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); + + np = 1 << ln; + + /* pass 0 */ + + p=&z[0]; + j=(np >> 1); + do { + BF(p[0].re, p[0].im, p[1].re, p[1].im, + p[0].re, p[0].im, p[1].re, p[1].im); + p+=2; + } while (--j != 0); + + /* pass 1 */ + + + p=&z[0]; + j=np >> 2; + if (s->inverse) { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, -p[3].im, p[3].re); + p+=4; + } while (--j != 0); + } else { + do { + BF(p[0].re, p[0].im, p[2].re, p[2].im, + p[0].re, p[0].im, p[2].re, p[2].im); + BF(p[1].re, p[1].im, p[3].re, p[3].im, + p[1].re, p[1].im, p[3].im, -p[3].re); + p+=4; + } while (--j != 0); + } + /* pass 2 .. 
ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + do { + p = z; + q = z + nloops; + for (j = 0; j < nblocks; ++j) { + BF(p->re, p->im, q->re, q->im, + p->re, p->im, q->re, q->im); + + p++; + q++; + for(l = nblocks; l < np2; l += nblocks) { + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); + BF(p->re, p->im, q->re, q->im, + p->re, p->im, tmp_re, tmp_im); + p++; + q++; + } + + p += nloops; + q += nloops; + } + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); + +POWERPC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ +#ifdef CONFIG_DARWIN + register const vector float vczero = (const vector float)(0.); +#else + register const vector float vczero = (const vector float){0.,0.,0.,0.}; +#endif int ln = s->nbits; int j, np, np2; @@ -77,6 +151,8 @@ void fft_calc_altivec(FFTContext *s, FFTComplex *z) FFTComplex *cptr, *cptr1; int k; +POWERPC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); + np = 1 << ln; { @@ -162,5 +238,8 @@ void fft_calc_altivec(FFTContext *s, FFTComplex *z) nblocks = nblocks >> 1; nloops = nloops << 1; } while (nblocks != 0); -} +POWERPC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} diff --git a/src/libffmpeg/libavcodec/ppc/idct_altivec.c b/src/libffmpeg/libavcodec/ppc/idct_altivec.c index 8036d403f..1619f1731 100644 --- a/src/libffmpeg/libavcodec/ppc/idct_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/idct_altivec.c @@ -38,6 +38,7 @@ #include <stdlib.h> /* malloc(), free() */ #include <string.h> #include "../dsputil.h" +#include "dsputil_altivec.h" #define vector_s16_t vector signed short #define vector_u16_t vector unsigned short @@ -150,6 +151,8 @@ vx6 = vec_sra (vy6, shift); \ vx7 = vec_sra (vy7, shift); + +#ifdef CONFIG_DARWIN static const vector_s16_t constants[5] = { (vector_s16_t)(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), (vector_s16_t)(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), @@ -157,11 +160,30 @@ static const vector_s16_t constants[5] = { (vector_s16_t)(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), (vector_s16_t)(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) }; +#else +// broken gcc +static const vector_s16_t constants[5] = { + (vector_s16_t){23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, + (vector_s16_t){16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, + (vector_s16_t){22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, + (vector_s16_t){21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, + (vector_s16_t){19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} +}; +#endif void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) { +POWERPC_TBL_DECLARE(altivec_idct_put_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +POWERPC_TBL_START_COUNT(altivec_idct_put_num, 1); + void simple_idct_put(UINT8 *dest, int line_size, INT16 *block); + simple_idct_put(dest, stride, (INT16*)block); +POWERPC_TBL_STOP_COUNT(altivec_idct_put_num, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ vector_u8_t tmp; +POWERPC_TBL_START_COUNT(altivec_idct_put_num, 1); + IDCT #define COPY(dest,src) \ @@ -177,16 +199,28 @@ void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) COPY (dest, vx5) dest += stride; COPY (dest, vx6) dest += stride; COPY (dest, vx7) + +POWERPC_TBL_STOP_COUNT(altivec_idct_put_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) { 
+POWERPC_TBL_DECLARE(altivec_idct_add_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE +POWERPC_TBL_START_COUNT(altivec_idct_add_num, 1); + void simple_idct_add(UINT8 *dest, int line_size, INT16 *block); + simple_idct_add(dest, stride, (INT16*)block); +POWERPC_TBL_STOP_COUNT(altivec_idct_add_num, 1); +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ vector_u8_t tmp; vector_s16_t tmp2, tmp3; vector_u8_t perm0; vector_u8_t perm1; vector_u8_t p0, p1, p; +POWERPC_TBL_START_COUNT(altivec_idct_add_num, 1); + IDCT p0 = vec_lvsl (0, dest); @@ -212,5 +246,8 @@ void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) + +POWERPC_TBL_STOP_COUNT(altivec_idct_add_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c index bcbc1e6ba..dd898e158 100644 --- a/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_altivec.c @@ -20,10 +20,7 @@ #include <stdio.h> #include "../dsputil.h" #include "../mpegvideo.h" - - -// Used when initializing constant vectors -#define FOUR_INSTANCES(x) x,x,x,x +#include "dsputil_altivec.h" // Swaps two variables (used for altivec registers) #define SWAP(a,b) \ @@ -93,6 +90,13 @@ do { \ vec = vec_splat(vec, 0); \ } + +#ifdef CONFIG_DARWIN +#define FOUROF(a) (a) +#else +// slower, for dumb non-apple GCC +#define FOUROF(a) {a,a,a,a} +#endif int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow) @@ -100,7 +104,7 @@ int dct_quantize_altivec(MpegEncContext* s, int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; - const vector float zero = {FOUR_INSTANCES(0.0f)}; + const vector float zero = (const vector float)FOUROF(0.); // Load the data into the row/alt vectors { @@ -144,18 +148,18 @@ int dct_quantize_altivec(MpegEncContext* s, // in the vector local variables, as floats, which we'll use during the // quantize step... 
{ - const vector float vec_0_298631336 = {FOUR_INSTANCES(0.298631336f)}; - const vector float vec_0_390180644 = {FOUR_INSTANCES(-0.390180644f)}; - const vector float vec_0_541196100 = {FOUR_INSTANCES(0.541196100f)}; - const vector float vec_0_765366865 = {FOUR_INSTANCES(0.765366865f)}; - const vector float vec_0_899976223 = {FOUR_INSTANCES(-0.899976223f)}; - const vector float vec_1_175875602 = {FOUR_INSTANCES(1.175875602f)}; - const vector float vec_1_501321110 = {FOUR_INSTANCES(1.501321110f)}; - const vector float vec_1_847759065 = {FOUR_INSTANCES(-1.847759065f)}; - const vector float vec_1_961570560 = {FOUR_INSTANCES(-1.961570560f)}; - const vector float vec_2_053119869 = {FOUR_INSTANCES(2.053119869f)}; - const vector float vec_2_562915447 = {FOUR_INSTANCES(-2.562915447f)}; - const vector float vec_3_072711026 = {FOUR_INSTANCES(3.072711026f)}; + const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); + const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); + const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); + const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); + const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); + const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); + const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); + const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); + const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); + const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); + const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); + const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); int whichPass, whichHalf; @@ -309,7 +313,7 @@ int dct_quantize_altivec(MpegEncContext* s, // rounding when we convert to int, instead of flooring.) 
{ vector signed int biasInt; - const vector float negOneFloat = (vector float)(FOUR_INSTANCES(-1.0f)); + const vector float negOneFloat = (vector float)FOUROF(-1.0f); LOAD4(biasInt, biasAddr); bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); negBias = vec_madd(bias, negOneFloat, zero); @@ -506,4 +510,133 @@ int dct_quantize_altivec(MpegEncContext* s, return lastNonZero; } +#undef FOUROF + +/* + AltiVec version of dct_unquantize_h263 + this code assumes `block' is 16 bytes-aligned +*/ +void dct_unquantize_h263_altivec(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ +POWERPC_TBL_DECLARE(altivec_dct_unquantize_h263_num, 1); + int i, level, qmul, qadd; + int nCoeffs; + + assert(s->block_last_index[n]>=0); + +POWERPC_TBL_START_COUNT(altivec_dct_unquantize_h263_num, 1); + + qadd = (qscale - 1) | 1; + qmul = qscale << 1; + + if (s->mb_intra) { + if (!s->h263_aic) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + }else + qadd = 0; + i = 1; + nCoeffs= 63; //does not allways use zigzag table + } else { + i = 0; + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; + } +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + for(;i<=nCoeffs;i++) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + { + register const vector short vczero = (const vector short)vec_splat_s16(0); + short __attribute__ ((aligned(16))) qmul8[] = + { + qmul, qmul, qmul, qmul, + qmul, qmul, qmul, qmul + }; + short __attribute__ ((aligned(16))) qadd8[] = + { + qadd, qadd, qadd, qadd, + qadd, qadd, qadd, qadd + }; + short __attribute__ ((aligned(16))) nqadd8[] = + { + -qadd, -qadd, -qadd, -qadd, + -qadd, -qadd, -qadd, -qadd + }; + register vector short blockv, qmulv, qaddv, nqaddv, temp1; + register vector bool short blockv_null, blockv_neg; + register short backup_0 = block[0]; + register int j = 0; + + qmulv = vec_ld(0, qmul8); + qaddv = vec_ld(0, qadd8); + nqaddv = vec_ld(0, nqadd8); + +#if 0 // block *is* 16 bytes-aligned, it seems. + // first make sure block[j] is 16 bytes-aligned + for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { + level = block[j]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[j] = level; + } + } +#endif + + // vectorize all the 16 bytes-aligned blocks + // of 8 elements + for(; (j + 7) <= nCoeffs ; j+=8) + { + blockv = vec_ld(j << 1, block); + blockv_neg = vec_cmplt(blockv, vczero); + blockv_null = vec_cmpeq(blockv, vczero); + // choose between +qadd or -qadd as the third operand + temp1 = vec_sel(qaddv, nqaddv, blockv_neg); + // multiply & add (block{i,i+7} * qmul [+-] qadd) + temp1 = vec_mladd(blockv, qmulv, temp1); + // put 0 where block[{i,i+7} used to have 0 + blockv = vec_sel(temp1, blockv, blockv_null); + vec_st(blockv, j << 1, block); + } + + // if nCoeffs isn't a multiple of 8, finish the job + // using good old scalar units. + // (we could do it using a truncated vector, + // but I'm not sure it's worth the hassle) + for(; j <= nCoeffs ; j++) { + level = block[j]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[j] = level; + } + } + + if (i == 1) + { // cheat. 
this avoid special-casing the first iteration
+      block[0] = backup_0;
+    }
+  }
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
+
+POWERPC_TBL_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
+}
diff --git a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
index 94d608b63..9757f5f39 100644
--- a/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/mpegvideo_ppc.c
@@ -27,6 +27,8 @@ extern int dct_quantize_altivec(MpegEncContext *s,
DCTELEM *block, int n,
int qscale, int *overflow);
+extern void dct_unquantize_h263_altivec(MpegEncContext *s,
+                                        DCTELEM *block, int n, int qscale);
extern void idct_put_altivec(UINT8 *dest, int line_size, INT16 *block);
extern void idct_add_altivec(UINT8 *dest, int line_size, INT16 *block);
@@ -42,7 +44,11 @@ void MPV_common_init_ppc(MpegEncContext *s) {
s->idct_put = idct_put_altivec;
s->idct_add = idct_add_altivec;
+#ifndef ALTIVEC_USE_REFERENCE_C_CODE
s->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
+        s->idct_permutation_type = FF_NO_IDCT_PERM;
+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
// Test to make sure that the dct required alignments are met.
@@ -66,6 +72,7 @@ void MPV_common_init_ppc(MpegEncContext *s)
        (s->avctx->dct_algo == FF_DCT_ALTIVEC))
{
s->dct_quantize = dct_quantize_altivec;
+            s->dct_unquantize_h263 = dct_unquantize_h263_altivec;
}
} else
#endif
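
For context on dct_unquantize_h263_altivec, which the mpegvideo_ppc.c hunk above wires into MpegEncContext, the following is a minimal standalone sketch of the scalar H.263 dequantization that the AltiVec version vectorizes with vec_mladd and vec_sel. It restates the ALTIVEC_USE_REFERENCE_C_CODE branch of the patch (the intra DC scaling and AC-prediction special case are omitted); the name dequant_h263_scalar, the main() harness and the sample coefficients are illustrative additions, not code from the commit.

/*
 * Standalone sketch of the scalar H.263 dequantization that
 * dct_unquantize_h263_altivec vectorizes.  Mirrors the reference
 * C path in the patch; the demo harness is illustrative only.
 */
#include <stdio.h>

typedef short DCTELEM;   /* local stand-in for libavcodec's DCTELEM */

static void dequant_h263_scalar(DCTELEM *block, int first, int last, int qscale)
{
    /* same derivation as in the patch: qadd = (qscale - 1) | 1, qmul = 2*qscale */
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;
    int i;

    for (i = first; i <= last; i++) {
        int level = block[i];
        if (level) {
            /* negative levels get -qadd, positive ones +qadd; zeros stay zero */
            if (level < 0)
                level = level * qmul - qadd;
            else
                level = level * qmul + qadd;
            block[i] = level;
        }
    }
}

int main(void)
{
    DCTELEM block[8] = { 12, -3, 0, 1, 0, 0, -1, 2 };
    int i;

    dequant_h263_scalar(block, 0, 7, 4);   /* qscale = 4 -> qmul = 8, qadd = 3 */
    for (i = 0; i < 8; i++)
        printf("%d ", block[i]);
    printf("\n");
    return 0;
}

With qscale = 4 the sketch prints 99 -27 0 11 0 0 -11 19, i.e. level*qmul plus or minus qadd with qmul = 8 and qadd = 3, which is exactly what the vectorized loop computes eight coefficients at a time.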
diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c index bda408dfe..6bcbe1c67 100644 --- a/src/libffmpeg/libavcodec/ratecontrol.c +++ b/src/libffmpeg/libavcodec/ratecontrol.c @@ -751,8 +751,8 @@ static int init_pass2(MpegEncContext *s) } //printf("%lld %lld %lld %lld\n", available_bits[I_TYPE], available_bits[P_TYPE], available_bits[B_TYPE], all_available_bits); - qscale= malloc(sizeof(double)*rcc->num_entries); - blured_qscale= malloc(sizeof(double)*rcc->num_entries); + qscale= av_malloc(sizeof(double)*rcc->num_entries); + blured_qscale= av_malloc(sizeof(double)*rcc->num_entries); for(step=256*256; step>0.0000001; step*=0.5){ expected_bits=0; @@ -809,8 +809,8 @@ static int init_pass2(MpegEncContext *s) // printf("%f %d %f\n", expected_bits, (int)all_available_bits, rate_factor); if(expected_bits > all_available_bits) rate_factor-= step; } - free(qscale); - free(blured_qscale); + av_free(qscale); + av_free(blured_qscale); if(abs(expected_bits/all_available_bits - 1.0) > 0.01 ){ fprintf(stderr, "Error: 2pass curve failed to converge\n"); diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c index 4907c2347..012b1dc5c 100644 --- a/src/libffmpeg/libavcodec/rv10.c +++ b/src/libffmpeg/libavcodec/rv10.c @@ -395,7 +395,7 @@ static int rv10_decode_packet(AVCodecContext *avctx, MpegEncContext *s = avctx->priv_data; int i, mb_count, mb_pos, left; - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); mb_count = rv10_decode_picture_header(s); if (mb_count < 0) { diff --git a/src/libffmpeg/libavcodec/simple_idct.c b/src/libffmpeg/libavcodec/simple_idct.c index 8c9ce7b93..703e94f21 100644 --- a/src/libffmpeg/libavcodec/simple_idct.c +++ b/src/libffmpeg/libavcodec/simple_idct.c @@ -67,7 +67,7 @@ #endif -static inline void idctRowCondDC (int16_t * row) +static inline void idctRowCondDC (DCTELEM * row) { int a0, a1, a2, a3, b0, b1, b2, b3; #ifdef FAST_64BIT @@ -82,26 +82,40 @@ static inline void idctRowCondDC (int16_t * row) #else #define ROW0_MASK 0xffffLL #endif - if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | - ((uint64_t *)row)[1]) == 0) { - temp = (row[0] << 3) & 0xffff; - temp += temp << 16; - temp += temp << 32; - ((uint64_t *)row)[0] = temp; - ((uint64_t *)row)[1] = temp; - return; - } + if(sizeof(DCTELEM)==2){ + if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | + ((uint64_t *)row)[1]) == 0) { + temp = (row[0] << 3) & 0xffff; + temp += temp << 16; + temp += temp << 32; + ((uint64_t *)row)[0] = temp; + ((uint64_t *)row)[1] = temp; + return; + } + }else{ + if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { + row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; + return; + } + } #else - if (!(((uint32_t*)row)[1] | - ((uint32_t*)row)[2] | - ((uint32_t*)row)[3] | - row[1])) { - temp = (row[0] << 3) & 0xffff; - temp += temp << 16; - ((uint32_t*)row)[0]=((uint32_t*)row)[1] = - ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp; - return; - } + if(sizeof(DCTELEM)==2){ + if (!(((uint32_t*)row)[1] | + ((uint32_t*)row)[2] | + ((uint32_t*)row)[3] | + row[1])) { + temp = (row[0] << 3) & 0xffff; + temp += temp << 16; + ((uint32_t*)row)[0]=((uint32_t*)row)[1] = + ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp; + return; + } + }else{ + if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { + row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; + return; + } + } #endif a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); @@ -159,7 +173,7 @@ static inline void idctRowCondDC (int16_t * row) } static 
inline void idctSparseColPut (UINT8 *dest, int line_size, - int16_t * col) + DCTELEM * col) { int a0, a1, a2, a3, b0, b1, b2, b3; UINT8 *cm = cropTbl + MAX_NEG_CROP; @@ -231,7 +245,7 @@ static inline void idctSparseColPut (UINT8 *dest, int line_size, } static inline void idctSparseColAdd (UINT8 *dest, int line_size, - int16_t * col) + DCTELEM * col) { int a0, a1, a2, a3, b0, b1, b2, b3; UINT8 *cm = cropTbl + MAX_NEG_CROP; @@ -302,7 +316,7 @@ static inline void idctSparseColAdd (UINT8 *dest, int line_size, dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)]; } -static inline void idctSparseCol (int16_t * col) +static inline void idctSparseCol (DCTELEM * col) { int a0, a1, a2, a3, b0, b1, b2, b3; @@ -365,7 +379,7 @@ static inline void idctSparseCol (int16_t * col) col[56] = ((a0 - b0) >> COL_SHIFT); } -void simple_idct_put(UINT8 *dest, int line_size, INT16 *block) +void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block) { int i; for(i=0; i<8; i++) @@ -375,7 +389,7 @@ void simple_idct_put(UINT8 *dest, int line_size, INT16 *block) idctSparseColPut(dest + i, line_size, block + i); } -void simple_idct_add(UINT8 *dest, int line_size, INT16 *block) +void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block) { int i; for(i=0; i<8; i++) @@ -385,7 +399,7 @@ void simple_idct_add(UINT8 *dest, int line_size, INT16 *block) idctSparseColAdd(dest + i, line_size, block + i); } -void simple_idct(INT16 *block) +void simple_idct(DCTELEM *block) { int i; for(i=0; i<8; i++) @@ -406,7 +420,7 @@ void simple_idct(INT16 *block) and the butterfly must be multiplied by 0.5 * sqrt(2.0) */ #define C_SHIFT (4+1+12) -static inline void idct4col(UINT8 *dest, int line_size, const INT16 *col) +static inline void idct4col(UINT8 *dest, int line_size, const DCTELEM *col) { int c0, c1, c2, c3, a0, a1, a2, a3; const UINT8 *cm = cropTbl + MAX_NEG_CROP; @@ -443,10 +457,10 @@ static inline void idct4col(UINT8 *dest, int line_size, const INT16 *col) /* XXX: I think a 1.0/sqrt(2) normalization should be needed to compensate the extra butterfly stage - I don't have the full DV specification */ -void simple_idct248_put(UINT8 *dest, int line_size, INT16 *block) +void simple_idct248_put(UINT8 *dest, int line_size, DCTELEM *block) { int i; - INT16 *ptr; + DCTELEM *ptr; /* butterfly */ ptr = block; @@ -486,7 +500,7 @@ void simple_idct248_put(UINT8 *dest, int line_size, INT16 *block) #define C2 C_FIX(0.2705980501) #define C3 C_FIX(0.5) #define C_SHIFT (4+1+12) -static inline void idct4col_add(UINT8 *dest, int line_size, const INT16 *col) +static inline void idct4col_add(UINT8 *dest, int line_size, const DCTELEM *col) { int c0, c1, c2, c3, a0, a1, a2, a3; const UINT8 *cm = cropTbl + MAX_NEG_CROP; @@ -514,7 +528,7 @@ static inline void idct4col_add(UINT8 *dest, int line_size, const INT16 *col) #define R2 R_FIX(0.2705980501) #define R3 R_FIX(0.5) #define R_SHIFT 11 -static inline void idct4row(INT16 *row) +static inline void idct4row(DCTELEM *row) { int c0, c1, c2, c3, a0, a1, a2, a3; const UINT8 *cm = cropTbl + MAX_NEG_CROP; @@ -533,7 +547,7 @@ static inline void idct4row(INT16 *row) row[3]= (c0 - c1) >> R_SHIFT; } -void simple_idct84_add(UINT8 *dest, int line_size, INT16 *block) +void simple_idct84_add(UINT8 *dest, int line_size, DCTELEM *block) { int i; @@ -548,7 +562,7 @@ void simple_idct84_add(UINT8 *dest, int line_size, INT16 *block) } } -void simple_idct48_add(UINT8 *dest, int line_size, INT16 *block) +void simple_idct48_add(UINT8 *dest, int line_size, DCTELEM *block) { int i; diff --git 
a/src/libffmpeg/libavcodec/simple_idct.h b/src/libffmpeg/libavcodec/simple_idct.h index 428c6072c..0ee1e05ed 100644 --- a/src/libffmpeg/libavcodec/simple_idct.h +++ b/src/libffmpeg/libavcodec/simple_idct.h @@ -18,14 +18,14 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -void simple_idct_put(UINT8 *dest, int line_size, INT16 *block); -void simple_idct_add(UINT8 *dest, int line_size, INT16 *block); -void ff_simple_idct_mmx(short *block); -void ff_simple_idct_add_mmx(UINT8 *dest, int line_size, INT16 *block); -void ff_simple_idct_put_mmx(UINT8 *dest, int line_size, INT16 *block); -void simple_idct(short *block); +void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block); +void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block); +void ff_simple_idct_mmx(int16_t *block); +void ff_simple_idct_add_mmx(UINT8 *dest, int line_size, int16_t *block); +void ff_simple_idct_put_mmx(UINT8 *dest, int line_size, int16_t *block); +void simple_idct(DCTELEM *block); -void simple_idct248_put(UINT8 *dest, int line_size, INT16 *block); +void simple_idct248_put(UINT8 *dest, int line_size, DCTELEM *block); -void simple_idct84_add(UINT8 *dest, int line_size, INT16 *block); -void simple_idct48_add(UINT8 *dest, int line_size, INT16 *block); +void simple_idct84_add(UINT8 *dest, int line_size, DCTELEM *block); +void simple_idct48_add(UINT8 *dest, int line_size, DCTELEM *block); diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c index 77035f1f9..5a9a290b8 100644 --- a/src/libffmpeg/libavcodec/svq1.c +++ b/src/libffmpeg/libavcodec/svq1.c @@ -1066,7 +1066,7 @@ static int svq1_decode_frame(AVCodecContext *avctx, AVFrame *pict = data; /* initialize bit buffer */ - init_get_bits(&s->gb,buf,buf_size); + init_get_bits(&s->gb,buf,buf_size*8); /* decode frame header */ s->f_code = get_bits (&s->gb, 22); @@ -1093,6 +1093,10 @@ static int svq1_decode_frame(AVCodecContext *avctx, return result; } + //FIXME this avoids some confusion for "B frames" without 2 references + //this should be removed after libavcodec can handle more flaxible picture types & ordering + if(s->pict_type==B_TYPE && s->last_picture.data[0]==NULL) return buf_size; + if(avctx->hurry_up && s->pict_type==B_TYPE) return buf_size; if(MPV_frame_start(s, avctx) < 0) diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c index af6ba986b..ca71807f7 100644 --- a/src/libffmpeg/libavcodec/utils.c +++ b/src/libffmpeg/libavcodec/utils.c @@ -24,8 +24,6 @@ void *av_mallocz(unsigned int size) { void *ptr; - if(size == 0) fprintf(stderr, "Warning, allocating 0 bytes\n"); - ptr = av_malloc(size); if (!ptr) return NULL; @@ -33,6 +31,32 @@ void *av_mallocz(unsigned int size) return ptr; } +char *av_strdup(const char *s) +{ + char *ptr; + int len; + len = strlen(s) + 1; + ptr = av_malloc(len); + if (!ptr) + return NULL; + memcpy(ptr, s, len); + return ptr; +} + +/** + * realloc which does nothing if the block is large enough + */ +void *av_fast_realloc(void *ptr, int *size, int min_size) +{ + if(min_size < *size) + return ptr; + + *size= min_size + 10*1024; + + return av_realloc(ptr, *size); +} + + /* allocation of static arrays - do not use for normal allocation */ static unsigned int last_static = 0; static char*** array_static = NULL; @@ -47,7 +71,7 @@ void *__av_mallocz_static(void** location, unsigned int size) if (location) { if (l > last_static) - array_static = realloc(array_static, l); + array_static = av_realloc(array_static, l); array_static[last_static++] = (char**) 
location; *location = ptr; } @@ -61,10 +85,10 @@ void av_free_static() unsigned i; for (i = 0; i < last_static; i++) { - free(*array_static[i]); + av_free(*array_static[i]); *array_static[i] = NULL; } - free(array_static); + av_free(array_static); array_static = 0; } last_static = 0; @@ -89,32 +113,6 @@ void register_avcodec(AVCodec *format) format->next = NULL; } -void avcodec_get_chroma_sub_sample(int fmt, int *h_shift, int *v_shift){ - switch(fmt){ - case PIX_FMT_YUV410P: - *h_shift=2; - *v_shift=2; - break; - case PIX_FMT_YUV420P: - *h_shift=1; - *v_shift=1; - break; - case PIX_FMT_YUV411P: - *h_shift=2; - *v_shift=0; - break; - case PIX_FMT_YUV422P: - case PIX_FMT_YUV422: - *h_shift=1; - *v_shift=0; - break; - default: //RGB/... - *h_shift=0; - *v_shift=0; - break; - } -} - typedef struct DefaultPicOpaque{ int last_pic_num; uint8_t *data[4]; @@ -125,10 +123,10 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){ const int width = s->width; const int height= s->height; DefaultPicOpaque *opaque; - +/* assert(pic->data[0]==NULL); - /* assert(pic->type==0 || pic->type==FF_TYPE_INTERNAL); */ - + assert(pic->type==0 || pic->type==FF_TYPE_INTERNAL); +*/ if(pic->opaque){ opaque= (DefaultPicOpaque *)pic->opaque; for(i=0; i<3; i++) @@ -152,7 +150,6 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){ case PIX_FMT_BGR24: pixel_size=3; break; - case PIX_FMT_BGRA32: case PIX_FMT_RGBA32: pixel_size=4; break; @@ -212,6 +209,10 @@ void avcodec_default_release_buffer(AVCodecContext *s, AVFrame *pic){ //printf("R%X\n", pic->opaque); } +enum PixelFormat avcodec_default_get_format(struct AVCodecContext *s, enum PixelFormat * fmt){ + return fmt[0]; +} + void avcodec_get_context_defaults(AVCodecContext *s){ s->bit_rate= 800*1000; s->bit_rate_tolerance= s->bit_rate*10; @@ -234,6 +235,7 @@ void avcodec_get_context_defaults(AVCodecContext *s){ s->me_method= ME_EPZS; s->get_buffer= avcodec_default_get_buffer; s->release_buffer= avcodec_default_release_buffer; + s->get_format= avcodec_default_get_format; s->me_subpel_quality=8; } @@ -410,19 +412,6 @@ AVCodec *avcodec_find(enum CodecID id) return NULL; } -const char *pix_fmt_str[] = { - "yuv420p", - "yuv422", - "rgb24", - "bgr24", - "yuv422p", - "yuv444p", - "rgba32", - "bgra32", - "yuv410p", - "yuv411p", -}; - void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) { const char *codec_name; @@ -462,7 +451,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) if (enc->codec_id == CODEC_ID_RAWVIDEO) { snprintf(buf + strlen(buf), buf_size - strlen(buf), ", %s", - pix_fmt_str[enc->pix_fmt]); + avcodec_get_pix_fmt_name(enc->pix_fmt)); } if (enc->width) { snprintf(buf + strlen(buf), buf_size - strlen(buf), @@ -537,99 +526,6 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) } } -/* Picture field are filled with 'ptr' addresses */ -void avpicture_fill(AVPicture *picture, UINT8 *ptr, - int pix_fmt, int width, int height) -{ - int size; - - size = width * height; - switch(pix_fmt) { - case PIX_FMT_YUV420P: - picture->data[0] = ptr; - picture->data[1] = picture->data[0] + size; - picture->data[2] = picture->data[1] + size / 4; - picture->linesize[0] = width; - picture->linesize[1] = width / 2; - picture->linesize[2] = width / 2; - break; - case PIX_FMT_YUV422P: - picture->data[0] = ptr; - picture->data[1] = picture->data[0] + size; - picture->data[2] = picture->data[1] + size / 2; - picture->linesize[0] = width; - picture->linesize[1] = width / 2; - 
picture->linesize[2] = width / 2; - break; - case PIX_FMT_YUV444P: - picture->data[0] = ptr; - picture->data[1] = picture->data[0] + size; - picture->data[2] = picture->data[1] + size; - picture->linesize[0] = width; - picture->linesize[1] = width; - picture->linesize[2] = width; - break; - case PIX_FMT_RGB24: - case PIX_FMT_BGR24: - picture->data[0] = ptr; - picture->data[1] = NULL; - picture->data[2] = NULL; - picture->linesize[0] = width * 3; - break; - case PIX_FMT_RGBA32: - case PIX_FMT_BGRA32: - picture->data[0] = ptr; - picture->data[1] = NULL; - picture->data[2] = NULL; - picture->linesize[0] = width * 4; - break; - case PIX_FMT_YUV422: - picture->data[0] = ptr; - picture->data[1] = NULL; - picture->data[2] = NULL; - picture->linesize[0] = width * 2; - break; - default: - picture->data[0] = NULL; - picture->data[1] = NULL; - picture->data[2] = NULL; - break; - } -} - -int avpicture_get_size(int pix_fmt, int width, int height) -{ - int size; - - size = width * height; - switch(pix_fmt) { - case PIX_FMT_YUV420P: - size = (size * 3) / 2; - break; - case PIX_FMT_YUV422P: - size = (size * 2); - break; - case PIX_FMT_YUV444P: - size = (size * 3); - break; - case PIX_FMT_RGB24: - case PIX_FMT_BGR24: - size = (size * 3); - break; - case PIX_FMT_RGBA32: - case PIX_FMT_BGRA32: - size = (size * 4); - break; - case PIX_FMT_YUV422: - size = (size * 2); - break; - default: - size = -1; - break; - } - return size; -} - unsigned avcodec_version( void ) { return LIBAVCODEC_VERSION_INT; diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c index a6fa2f8b2..5305e1c5d 100644 --- a/src/libffmpeg/libavcodec/wmadec.c +++ b/src/libffmpeg/libavcodec/wmadec.c @@ -92,7 +92,7 @@ typedef struct WMADecodeContext { int16_t coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE]; float coefs[MAX_CHANNELS][BLOCK_MAX_SIZE] __attribute__((aligned(16))); MDCTContext mdct_ctx[BLOCK_NB_SIZES]; - float *windows[BLOCK_NB_SIZES] __attribute__((aligned(16))); + float *windows[BLOCK_NB_SIZES]; FFTSample mdct_tmp[BLOCK_MAX_SIZE] __attribute__((aligned(16))); /* temporary storage for imdct */ /* output buffer for one frame and the last for IMDCT windowing */ float frame_out[MAX_CHANNELS][BLOCK_MAX_SIZE * 2] __attribute__((aligned(16))); @@ -212,8 +212,8 @@ static void init_coef_vlc(VLC *vlc, init_vlc(vlc, 9, n, table_bits, 1, 1, table_codes, 4, 4); - run_table = malloc(n * sizeof(uint16_t)); - level_table = malloc(n * sizeof(uint16_t)); + run_table = av_malloc(n * sizeof(uint16_t)); + level_table = av_malloc(n * sizeof(uint16_t)); p = levels_table; i = 2; level = 1; @@ -1226,7 +1226,7 @@ static int wma_decode_superframe(AVCodecContext *avctx, samples = data; - init_get_bits(&s->gb, buf, buf_size); + init_get_bits(&s->gb, buf, buf_size*8); if (s->use_bit_reservoir) { /* read super frame header */ @@ -1252,7 +1252,7 @@ static int wma_decode_superframe(AVCodecContext *avctx, } /* XXX: bit_offset bits into last frame */ - init_get_bits(&s->gb, s->last_superframe, MAX_CODED_SUPERFRAME_SIZE); + init_get_bits(&s->gb, s->last_superframe, MAX_CODED_SUPERFRAME_SIZE*8); /* skip unused bits */ if (s->last_bitoffset > 0) skip_bits(&s->gb, s->last_bitoffset); @@ -1265,7 +1265,7 @@ static int wma_decode_superframe(AVCodecContext *avctx, /* read each frame starting from bit_offset */ pos = bit_offset + 4 + 4 + s->byte_offset_bits + 3; - init_get_bits(&s->gb, buf + (pos >> 3), MAX_CODED_SUPERFRAME_SIZE - (pos >> 3)); + init_get_bits(&s->gb, buf + (pos >> 3), (MAX_CODED_SUPERFRAME_SIZE - (pos >> 3))*8); len = pos & 7; if (len > 0) 
skip_bits(&s->gb, len); diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c index d25b7a5f1..6def6f2a8 100644 --- a/src/libffmpeg/libavcodec/wmv2.c +++ b/src/libffmpeg/libavcodec/wmv2.c @@ -313,7 +313,7 @@ static int decode_ext_header(Wmv2Context *w){ if(s->avctx->extradata_size<4) return -1; - init_get_bits(&gb, s->avctx->extradata, s->avctx->extradata_size); + init_get_bits(&gb, s->avctx->extradata, s->avctx->extradata_size*8); fps = get_bits(&gb, 5); s->bit_rate = get_bits(&gb, 11)*1024; @@ -330,8 +330,9 @@ static int decode_ext_header(Wmv2Context *w){ s->slice_height = s->mb_height / code; if(s->avctx->debug&FF_DEBUG_PICT_INFO){ - printf("fps:%d, br:%d, qpbit:%d, abt_flag:%d, j_type_bit:%d, tl_mv_flag:%d, mbrl_bit:%d, code:%d, flag3:%d\n", - fps, s->bit_rate, w->mspel_bit, w->abt_flag, w->j_type_bit, w->top_left_mv_flag, w->per_mb_rl_bit, code, w->flag3); + printf("fps:%d, br:%d, qpbit:%d, abt_flag:%d, j_type_bit:%d, tl_mv_flag:%d, mbrl_bit:%d, code:%d, flag3:%d, slices:%d\n", + fps, s->bit_rate, w->mspel_bit, w->abt_flag, w->j_type_bit, w->top_left_mv_flag, w->per_mb_rl_bit, code, w->flag3, + code); } return 0; } @@ -503,8 +504,7 @@ static int16_t *wmv2_pred_motion(Wmv2Context *w, int *px, int *py){ diff= FFMAX(ABS(A[0] - B[0]), ABS(A[1] - B[1])); - if(s->mb_x && s->mb_y && !s->mspel && w->top_left_mv_flag && diff >= 8) - //FIXME top/left bit too if y=!0 && first_slice_line? + if(s->mb_x && !s->first_slice_line && !s->mspel && w->top_left_mv_flag && diff >= 8) type= get_bits1(&s->gb); else type= 2; @@ -577,16 +577,7 @@ static void wmv2_add_block(Wmv2Context *w, DCTELEM *block1, uint8_t *dst, int st MpegEncContext * const s= &w->s; uint8_t temp[2][64]; int i; - - if(w->abt_type_table[n] && 0){ - int a,b; - a= block1[0]; - b= w->abt_block2[n][0]; - block1[0]= a+b; - w->abt_block2[n][0]= a-b; - } - switch(w->abt_type_table[n]){ case 0: if (s->block_last_index[n] >= 0) { diff --git a/src/libffmpeg/xine_decoder.c b/src/libffmpeg/xine_decoder.c index 8be7844c2..fe1aca44d 100644 --- a/src/libffmpeg/xine_decoder.c +++ b/src/libffmpeg/xine_decoder.c @@ -17,7 +17,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * - * $Id: xine_decoder.c,v 1.87 2003/01/10 23:33:09 holstsn Exp $ + * $Id: xine_decoder.c,v 1.88 2003/01/31 18:29:43 miguelfreitas Exp $ * * xine decoder plugin using ffmpeg * @@ -667,7 +667,7 @@ void avcodec_register_all(void) register_avcodec(&rv10_decoder); register_avcodec(&svq1_decoder); register_avcodec(&dvvideo_decoder); - // register_avcodec(&dvaudio_decoder); + register_avcodec(&dvaudio_decoder); register_avcodec(&mjpeg_decoder); register_avcodec(&mjpegb_decoder); register_avcodec(&mp2_decoder); @@ -810,6 +810,16 @@ static void ff_audio_decode_data (audio_decoder_t *this_gen, buf_element_t *buf) this->stream->meta_info[XINE_META_INFO_AUDIOCODEC] = strdup ("Windows Media Audio v2 (ffmpeg)"); break; + case BUF_AUDIO_DV: + this->codec = avcodec_find_decoder (CODEC_ID_DVAUDIO); + this->stream->meta_info[XINE_META_INFO_AUDIOCODEC] + = strdup ("DV Audio (ffmpeg)"); + break; + case BUF_AUDIO_MPEG: + this->codec = avcodec_find_decoder (CODEC_ID_MP3LAME); + this->stream->meta_info[XINE_META_INFO_AUDIOCODEC] + = strdup ("MP3 (ffmpeg)"); + break; } if (!this->codec) { @@ -1034,14 +1044,14 @@ static uint32_t supported_video_types[] = { BUF_VIDEO_MSMPEG4_V2, BUF_VIDEO_MSMPEG4_V3, BUF_VIDEO_WMV7, - /*BUF_VIDEO_WMV8,*/ + /* BUF_VIDEO_WMV8, */ BUF_VIDEO_MPEG4, 
BUF_VIDEO_XVID, BUF_VIDEO_DIVX5, BUF_VIDEO_MJPEG, BUF_VIDEO_H263, BUF_VIDEO_RV10, - /* BUF_VIDEO_SORENSON_V1, -- ffmpeg svq1 decoder is segfaulting */ + BUF_VIDEO_SORENSON_V1, BUF_VIDEO_JPEG, BUF_VIDEO_MPEG, BUF_VIDEO_DV, @@ -1051,6 +1061,8 @@ static uint32_t supported_video_types[] = { static uint32_t supported_audio_types[] = { BUF_AUDIO_WMAV1, BUF_AUDIO_WMAV2, + BUF_AUDIO_DV, + /* BUF_AUDIO_MPEG, */ 0 }; diff --git a/src/libmad/xine_decoder.c b/src/libmad/xine_decoder.c index b53682387..a4f30eee3 100644 --- a/src/libmad/xine_decoder.c +++ b/src/libmad/xine_decoder.c @@ -17,7 +17,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * - * $Id: xine_decoder.c,v 1.40 2003/01/11 11:29:22 esnel Exp $ + * $Id: xine_decoder.c,v 1.41 2003/01/31 18:29:47 miguelfreitas Exp $ * * stuff needed to turn libmad into a xine decoder plugin */ @@ -355,7 +355,7 @@ static uint32_t audio_types[] = { static decoder_info_t dec_info_audio = { audio_types, /* supported types */ - 5 /* priority */ + 6 /* priority */ }; plugin_info_t xine_plugin_info[] = { diff --git a/src/libxinevdec/svq1.c b/src/libxinevdec/svq1.c index 935172ff2..c92795b6a 100644 --- a/src/libxinevdec/svq1.c +++ b/src/libxinevdec/svq1.c @@ -17,7 +17,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * - * $Id: svq1.c,v 1.23 2003/01/08 01:02:32 miguelfreitas Exp $ + * $Id: svq1.c,v 1.24 2003/01/31 18:29:47 miguelfreitas Exp $ */ #include <stdio.h> @@ -1495,7 +1495,7 @@ static uint32_t video_types[] = { static decoder_info_t dec_info_video = { video_types, /* supported types */ - 4 /* priority */ + 6 /* priority */ }; plugin_info_t xine_plugin_info[] = { diff --git a/src/xine-engine/buffer.h b/src/xine-engine/buffer.h index 64bbaa8d2..057e3222b 100644 --- a/src/xine-engine/buffer.h +++ b/src/xine-engine/buffer.h @@ -17,7 +17,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * - * $Id: buffer.h,v 1.98 2003/01/26 23:36:46 f1rmb Exp $ + * $Id: buffer.h,v 1.99 2003/01/31 18:29:47 miguelfreitas Exp $ * * * contents: @@ -193,6 +193,7 @@ extern "C" { #define BUF_AUDIO_DIALOGIC_IMA 0x032A0000 #define BUF_AUDIO_NSF 0x032B0000 #define BUF_AUDIO_FLAC 0x032C0000 +#define BUF_AUDIO_DV 0x032D0000 /* spu buffer types: */ diff --git a/src/xine-engine/buffer_types.c b/src/xine-engine/buffer_types.c index 5b82579ed..9850e4a68 100644 --- a/src/xine-engine/buffer_types.c +++ b/src/xine-engine/buffer_types.c @@ -17,7 +17,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * - * $Id: buffer_types.c,v 1.52 2003/01/23 16:12:19 miguelfreitas Exp $ + * $Id: buffer_types.c,v 1.53 2003/01/31 18:29:47 miguelfreitas Exp $ * * * contents: @@ -802,6 +802,13 @@ static audio_db_t audio_db[] = { BUF_AUDIO_FLAC, "Free Lossless Audio Codec (FLAC)" }, +{ + { + 0 + }, + BUF_AUDIO_DV, + "DV Audio" +}, { { 0 }, 0, "last entry" } }; |
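
A closing note on the utils.c hunk earlier in this patch: the new av_fast_realloc() only reallocates when the current buffer is too small, and over-allocates by 10 KB so repeated small growth requests stay cheap. The sketch below restates that logic with plain realloc() so it compiles on its own; the name fast_realloc_sketch and the buffer-growth scenario in main() are illustrative and do not appear in the commit.

/*
 * Usage sketch for the av_fast_realloc() helper added in utils.c.
 * The helper copies the patch's logic but uses plain realloc()
 * instead of av_realloc() so the sketch is self-contained.
 */
#include <stdlib.h>
#include <stdio.h>

static void *fast_realloc_sketch(void *ptr, int *size, int min_size)
{
    if (min_size < *size)
        return ptr;                 /* current block is already big enough */
    *size = min_size + 10 * 1024;   /* over-allocate to avoid frequent growth */
    return realloc(ptr, *size);
}

int main(void)
{
    void *buf = NULL;
    int buf_size = 0;

    buf = fast_realloc_sketch(buf, &buf_size, 4000);   /* grows to 14240      */
    printf("after first call:  size=%d\n", buf_size);

    buf = fast_realloc_sketch(buf, &buf_size, 6000);   /* fits, no realloc    */
    printf("after second call: size=%d\n", buf_size);

    buf = fast_realloc_sketch(buf, &buf_size, 20000);  /* grows again, 30240  */
    printf("after third call:  size=%d\n", buf_size);

    free(buf);
    return 0;
}

The second call asks for 6000 bytes but the buffer already holds 14240, so the pointer and size come back unchanged; only the third call, asking for 20000 bytes, triggers another reallocation.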