115 files changed, 27562 insertions, 6551 deletions
diff --git a/CREDITS b/CREDITS
index 35ae72796..8028e1f34 100644
--- a/CREDITS
+++ b/CREDITS
@@ -12,7 +12,7 @@ updates (the word 'maintainer' is intentionally avoided here).
 project				version			mediator
 -----------------------------------------------------------------------
 
-ffmpeg				build 4715		Mike Melanson
+ffmpeg				build 4752		Mike Melanson
 goom				2k4-dev21
 gsm610				1.0.10			Mike Melanson
 liba52				0.7.4
diff --git a/src/libffmpeg/libavcodec/4xm.c b/src/libffmpeg/libavcodec/4xm.c
index fd84f8968..6932d52ab 100644
--- a/src/libffmpeg/libavcodec/4xm.c
+++ b/src/libffmpeg/libavcodec/4xm.c
@@ -220,17 +220,12 @@ static void idct(DCTELEM block[64]){
 }
 
 static void init_vlcs(FourXContext *f){
-    static int done = 0;
     int i;
 
-    if (!done) {
-        done = 1;
-
-        for(i=0; i<4; i++){
-            init_vlc(&block_type_vlc[i], BLOCK_TYPE_VLC_BITS, 7, 
-                     &block_type_tab[i][0][1], 2, 1,
-                     &block_type_tab[i][0][0], 2, 1);
-        }
+    for(i=0; i<4; i++){
+        init_vlc(&block_type_vlc[i], BLOCK_TYPE_VLC_BITS, 7, 
+                 &block_type_tab[i][0][1], 2, 1,
+                 &block_type_tab[i][0][0], 2, 1, 1);
     }
 }
 
@@ -328,13 +323,19 @@ static int decode_p_frame(FourXContext *f, uint8_t *buf, int length){
     uint16_t *src= (uint16_t*)f->last_picture.data[0];
     uint16_t *dst= (uint16_t*)f->current_picture.data[0];
     const int stride= f->current_picture.linesize[0]>>1;
-    const int bitstream_size= get32(buf+8);
-    const int bytestream_size= get32(buf+16);
-    const int wordstream_size= get32(buf+12);
+    const unsigned int bitstream_size= get32(buf+8);
+    const unsigned int bytestream_size= get32(buf+16);
+    const unsigned int wordstream_size= get32(buf+12);
     
-    if(bitstream_size+ bytestream_size+ wordstream_size + 20 != length)
+    if(bitstream_size+ bytestream_size+ wordstream_size + 20 != length
+       || bitstream_size  > (1<<26)
+       || bytestream_size > (1<<26)
+       || wordstream_size > (1<<26)
+       ){
         av_log(f->avctx, AV_LOG_ERROR, "lengths %d %d %d %d\n", bitstream_size, bytestream_size, wordstream_size, 
         bitstream_size+ bytestream_size+ wordstream_size - length);
+        return -1;
+    }
     
     f->bitstream_buffer= av_fast_realloc(f->bitstream_buffer, &f->bitstream_buffer_size, bitstream_size + FF_INPUT_BUFFER_PADDING_SIZE);
     f->dsp.bswap_buf((uint32_t*)f->bitstream_buffer, (uint32_t*)(buf + 20), bitstream_size/4);
@@ -544,7 +545,7 @@ static uint8_t *read_huffman_tables(FourXContext *f, uint8_t * const buf){
     
     init_vlc(&f->pre_vlc, ACDC_VLC_BITS, 257, 
              len_tab , 1, 1,
-             bits_tab, 4, 4);
+             bits_tab, 4, 4, 0);
              
     return ptr;
 }
@@ -555,13 +556,17 @@ static int decode_i_frame(FourXContext *f, uint8_t *buf, int length){
     const int height= f->avctx->height;
     uint16_t *dst= (uint16_t*)f->current_picture.data[0];
     const int stride= f->current_picture.linesize[0]>>1;
-    const int bitstream_size= get32(buf);
+    const unsigned int bitstream_size= get32(buf);
     const int token_count __attribute__((unused)) = get32(buf + bitstream_size + 8);
-    int prestream_size= 4*get32(buf + bitstream_size + 4);
+    unsigned int prestream_size= 4*get32(buf + bitstream_size + 4);
     uint8_t *prestream= buf + bitstream_size + 12;
     
-    if(prestream_size + bitstream_size + 12 != length)
+    if(prestream_size + bitstream_size + 12 != length
+       || bitstream_size > (1<<26)
+       || prestream_size > (1<<26)){
         av_log(f->avctx, AV_LOG_ERROR, "size missmatch %d %d %d\n", prestream_size, bitstream_size, length);
+        return -1;
+    }
    
     prestream= read_huffman_tables(f, prestream);
 
@@ -600,11 +605,6 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame *p, temp;
     int i, frame_4cc, frame_size;
 
-    /* special case for last picture */
-    if (buf_size == 0) {
-        return 0;
-    }
-
     frame_4cc= get32(buf);
     if(buf_size != get32(buf+4)+8){
         av_log(f->avctx, AV_LOG_ERROR, "size missmatch %d %d\n", buf_size, get32(buf+4));
diff --git a/src/libffmpeg/libavcodec/8bps.c b/src/libffmpeg/libavcodec/8bps.c
index 9509f42ad..3898ac5dd 100644
--- a/src/libffmpeg/libavcodec/8bps.c
+++ b/src/libffmpeg/libavcodec/8bps.c
@@ -61,7 +61,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
 {
 	EightBpsContext * const c = (EightBpsContext *)avctx->priv_data;
 	unsigned char *encoded = (unsigned char *)buf;
-	unsigned char *pixptr;
+	unsigned char *pixptr, *pixptr_end;
 	unsigned int height = avctx->height; // Real image height
 	unsigned int dlen, p, row;
 	unsigned char *lp, *dp;
@@ -70,11 +70,6 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
 	unsigned int planes = c->planes;
 	unsigned char *planemap = c->planemap;
   
-  
-	/* no supplementary picture */
-	if (buf_size == 0)
-		return 0;
-
 	if(c->pic.data[0])
 		avctx->release_buffer(avctx, &c->pic);
 
@@ -101,18 +96,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
 		/* Decode a plane */
 		for(row = 0; row < height; row++) {
 			pixptr = c->pic.data[0] + row * c->pic.linesize[0] + planemap[p];
+			pixptr_end = pixptr + c->pic.linesize[0];
 			dlen = be2me_16(*(unsigned short *)(lp+row*2));
 			/* Decode a row of this plane */
 			while(dlen > 0) {
 				if ((count = *dp++) <= 127) {
 					count++;
 					dlen -= count + 1;
+					if (pixptr + count * px_inc > pixptr_end)
+					    break;
 					while(count--) {
 						*pixptr = *dp++;
 						pixptr += px_inc;
 					}
 				} else {
 					count = 257 - count;
+					if (pixptr + count * px_inc > pixptr_end)
+					    break;
 					while(count--) {
 						*pixptr = *dp;
 						pixptr += px_inc;
@@ -155,6 +155,10 @@ static int decode_init(AVCodecContext *avctx)
 
 	c->pic.data[0] = NULL;
 
+    if (avcodec_check_dimensions(avctx, avctx->width, avctx->height) < 0) {
+        return 1;
+    }
+
 	switch (avctx->bits_per_sample) {
 		case 8:
 			avctx->pix_fmt = PIX_FMT_PAL8;
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index 5d9c0d2d0..8b4ae4fef 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -17,11 +17,13 @@ noinst_LTLIBRARIES = libavcodec.la
 libavcodec_la_SOURCES = \
 	4xm.c \
 	8bps.c \
+	aasc.c \
 	adpcm.c \
+	alac.c \
 	asv1.c \
+	bitstream.c \
 	cabac.c \
 	cinepak.c \
-	common.c \
 	cyuv.c \
 	dpcm.c \
 	dsputil.c \
@@ -33,9 +35,11 @@ libavcodec_la_SOURCES = \
 	flicvideo.c \
 	fft.c \
 	golomb.c \
+	h261.c \
 	h263.c \
 	h263dec.c \
 	h264.c \
+	h264idct.c \
 	huffyuv.c \
 	idcinvideo.c \
 	imgconvert.c \
@@ -46,6 +50,7 @@ libavcodec_la_SOURCES = \
 	jfdctint.c \
 	jrevdct.c \
 	lcl.c \
+	loco.c \
 	mdct.c \
 	mace.c \
 	mem.c \
@@ -57,21 +62,27 @@ libavcodec_la_SOURCES = \
 	msmpeg4.c \
 	msrle.c \
 	msvideo1.c \
-	opts.c \
 	parser.c \
 	pcm.c \
+	qdrw.c \
+	qpeg.c \
 	qtrle.c \
 	ra144.c \
 	ra288.c \
+	rangecoder.c \
 	ratecontrol.c \
 	rational.c \
 	roqvideo.c \
 	rpza.c \
 	rv10.c \
+	shorten.c \
 	simple_idct.c \
 	smc.c \
+	snow.c \
 	svq1.c \
+	tscc.c \
 	truemotion1.c \
+	ulti.c \
 	utils.c \
 	vcr1.c \
 	vmdav.c \
@@ -79,7 +90,9 @@ libavcodec_la_SOURCES = \
 	vp3dsp.c \
 	vqavideo.c \
 	wmadec.c \
-	xan.c
+	wnv1.c \
+	xan.c \
+	xl.c
 
 libavcodec_la_LDFLAGS = \
 	$(top_builddir)/src/libffmpeg/libavcodec/armv4l/libavcodec_armv4l.la \
@@ -92,6 +105,7 @@ libavcodec_la_LDFLAGS = \
 
 noinst_HEADERS = \
 	avcodec.h \
+	bitstream.h \
 	bswap.h \
 	cabac.h \
 	common.h \
@@ -103,6 +117,7 @@ noinst_HEADERS = \
 	imgconvert_template.h \
 	indeo3data.h \
 	integer.h \
+	h261data.h \
 	h263data.h \
 	h264data.h \
 	mpeg4data.h \
@@ -114,11 +129,13 @@ noinst_HEADERS = \
 	msmpeg4data.h \
 	ra144.h \
 	ra288.h \
+	rangecoder.h \
 	rational.h \
 	simple_idct.h \
 	sp5x.h \
 	svq1_cb.h \
 	svq1_vlc.h \
 	truemotion1data.h \
+	ulti_cb.h \
 	vp3data.h \
 	wmadata.h
diff --git a/src/libffmpeg/libavcodec/aasc.c b/src/libffmpeg/libavcodec/aasc.c
new file mode 100644
index 000000000..d2419e98c
--- /dev/null
+++ b/src/libffmpeg/libavcodec/aasc.c
@@ -0,0 +1,174 @@
+/*
+ * Autodesk RLE Decoder
+ * Copyright (C) 2005 the ffmpeg project
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**
+ * @file aasc.c
+ * Autodesk RLE Video Decoder by Konstantin Shishkov
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "avcodec.h"
+#include "dsputil.h"
+
+typedef struct AascContext { /* private state of the AASC decoder */
+    AVCodecContext *avctx;   /* codec context, stored by aasc_decode_init() */
+    AVFrame frame;           /* frame written and handed back by aasc_decode_frame() */
+} AascContext;
+
+#define FETCH_NEXT_STREAM_BYTE() /* uses s, buf, buf_size, stream_ptr and stream_byte from the caller's scope */ \
+    if (stream_ptr >= buf_size) /* no input byte left */ \
+    { \
+      av_log(s->avctx, AV_LOG_ERROR, " AASC: stream ptr just went out of bounds (fetch)\n"); \
+      break; /* leaves the innermost loop or switch around the macro use */ \
+    } \
+    stream_byte = buf[stream_ptr++];
+
+/* Decoder init: bind the private context and select BGR24 output. */
+static int aasc_decode_init(AVCodecContext *avctx)
+{
+    AascContext *ctx = avctx->priv_data;
+
+    /* mark the frame as holding no buffer (tested in aasc_decode_end) */
+    ctx->frame.data[0] = NULL;
+    ctx->avctx = avctx;
+    avctx->has_b_frames = 0;
+    avctx->pix_fmt = PIX_FMT_BGR24;
+    return 0;
+}
+
+/* Decode one AASC frame: RLE data starts at byte 4 of buf and rows are filled bottom-up.
+ * Returns buf_size (input reported as consumed), or -1 when reget_buffer() fails. */
+static int aasc_decode_frame(AVCodecContext *avctx,
+                              void *data, int *data_size,
+                              uint8_t *buf, int buf_size)
+{
+    AascContext *s = (AascContext *)avctx->priv_data;
+    int stream_ptr = 4;          /* read offset into buf */
+    unsigned char rle_code;
+    unsigned char stream_byte;
+    int pixel_ptr = 0;           /* byte offset inside the current row */
+    int row_dec, row_ptr;        /* row stride and byte offset of the current row */
+    int i;
+
+    s->frame.reference = 1;
+    s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE | FF_BUFFER_HINTS_REUSABLE;
+    if (avctx->reget_buffer(avctx, &s->frame)) {
+        av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n");
+        return -1;
+    }
+
+    row_dec = s->frame.linesize[0];
+    row_ptr = (s->avctx->height - 1) * row_dec; /* start at the last row */
+
+    while (row_ptr >= 0) {
+        FETCH_NEXT_STREAM_BYTE();
+        rle_code = stream_byte;
+        if (rle_code == 0) {
+            /* fetch the next byte to see how to handle escape code */
+            FETCH_NEXT_STREAM_BYTE();
+            if (stream_byte == 0) {
+                /* line is done, goto the next one */
+                row_ptr -= row_dec;
+                pixel_ptr = 0;
+            } else if (stream_byte == 1) {
+                /* decode is done */
+                break;
+            } else if (stream_byte == 2) {
+                /* reposition frame decode coordinates */
+                FETCH_NEXT_STREAM_BYTE();
+                pixel_ptr += stream_byte;
+                FETCH_NEXT_STREAM_BYTE();
+                row_ptr -= stream_byte * row_dec;
+            } else {
+                /* copy pixels from encoded stream */
+                if ((pixel_ptr + stream_byte > avctx->width * 3) ||
+                    (row_ptr < 0)) {
+                    av_log(s->avctx, AV_LOG_ERROR, " AASC: frame ptr just went out of bounds (copy1)\n");
+                    break;
+                }
+
+                rle_code = stream_byte;
+                if (stream_ptr + rle_code > buf_size) {
+                    av_log(s->avctx, AV_LOG_ERROR, " AASC: stream ptr just went out of bounds (copy2)\n");
+                    break;
+                }
+
+                for (i = 0; i < rle_code; i++) {
+                    FETCH_NEXT_STREAM_BYTE(); /* cannot overrun: length checked above (copy2) */
+                    s->frame.data[0][row_ptr + pixel_ptr] = stream_byte;
+                    pixel_ptr++;
+                }
+                if (rle_code & 1)
+                    stream_ptr++; /* skip the byte following an odd-length copy */
+            }
+        } else {
+            /* decode a run of data */
+            if ((pixel_ptr + rle_code > avctx->width * 3) ||
+                (row_ptr < 0)) {
+                av_log(s->avctx, AV_LOG_ERROR, " AASC: frame ptr just went out of bounds (run1)\n");
+                break;
+            }
+
+            FETCH_NEXT_STREAM_BYTE();
+
+            while(rle_code--) {
+                s->frame.data[0][row_ptr + pixel_ptr] = stream_byte;
+                pixel_ptr++;
+            }
+        }
+    }
+
+    /* one last sanity check on the way out */
+    if (stream_ptr < buf_size)
+        av_log(s->avctx, AV_LOG_ERROR, " AASC: ended frame decode with bytes left over (%d < %d)\n",
+            stream_ptr, buf_size);
+
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = s->frame;
+
+    /* report that the buffer was completely consumed */
+    return buf_size;
+}
+
+/* Decoder teardown: give back the frame buffer if one is still held. */
+static int aasc_decode_end(AVCodecContext *avctx)
+{
+    AascContext *ctx = avctx->priv_data;
+
+    if (ctx->frame.data[0] != NULL) {
+        avctx->release_buffer(avctx, &ctx->frame);
+    }
+    return 0;
+}
+
+AVCodec aasc_decoder = { /* codec table entry for the AASC decoder; initializers are positional */
+    "aasc",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_AASC,
+    sizeof(AascContext), /* NOTE(review): presumably the size of the priv_data the callbacks use — confirm against AVCodec */
+    aasc_decode_init,
+    NULL, /* NOTE(review): presumably the encode callback slot (this is decode-only) — confirm against AVCodec */
+    aasc_decode_end,
+    aasc_decode_frame,
+    CODEC_CAP_DR1,
+};
diff --git a/src/libffmpeg/libavcodec/adpcm.c b/src/libffmpeg/libavcodec/adpcm.c
index 0755e24fe..043c4d4b2 100644
--- a/src/libffmpeg/libavcodec/adpcm.c
+++ b/src/libffmpeg/libavcodec/adpcm.c
@@ -17,6 +17,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include "avcodec.h"
+#include "bitstream.h"
 
 /**
  * @file adpcm.c
@@ -103,6 +104,19 @@ static int ea_adpcm_table[] = {
     3, 4, 7, 8, 10, 11, 0, -1, -3, -4
 };
 
+static int ct_adpcm_table[8] = { /* step multipliers in 1/256 units, indexed by (nibble & 7) in adpcm_ct_expand_nibble() */
+    0x00E6, 0x00E6, 0x00E6, 0x00E6,
+    0x0133, 0x0199, 0x0200, 0x0266
+};
+
+// step_index deltas, one row per code width (row = nb_bits-2); padded to zero where table size is less than 16
+static int swf_index_tables[4][16] = {
+    /*2*/ { -1, 2 },
+    /*3*/ { -1, -1, 2, 4 },
+    /*4*/ { -1, -1, -1, -1, 2, 4, 6, 8 },
+    /*5*/ { -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 4, 6, 8, 10, 13, 16 }
+};
+
 /* end of tables */
 
 typedef struct ADPCMChannelStatus {
@@ -124,6 +138,10 @@ typedef struct ADPCMContext {
     int channel; /* for stereo MOVs, decode left, then decode right, then tell it's decoded */
     ADPCMChannelStatus status[2];
     short sample_buffer[32]; /* hold left samples while waiting for right samples */
+
+    /* SWF only */
+    int nb_bits;
+    int nb_samples;
 } ADPCMContext;
 
 /* XXX: implement encoding */
@@ -361,6 +379,9 @@ static int adpcm_decode_init(AVCodecContext * avctx)
     c->status[0].step = c->status[1].step = 0;
 
     switch(avctx->codec->id) {
+    case CODEC_ID_ADPCM_CT:
+	c->status[0].step = c->status[1].step = 511;
+	break;
     default:
         break;
     }
@@ -411,6 +432,37 @@ static inline short adpcm_ms_expand_nibble(ADPCMChannelStatus *c, char nibble)
     return (short)predictor;
 }
 
+/* Expand one 4-bit CT ADPCM code (bit 3 = sign, bits 0-2 = magnitude) into a
+ * 16-bit sample, updating c->predictor and c->step; returns the new predictor. */
+static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
+{
+    const int negative = nibble & 8;
+    const int magnitude = nibble & 7;
+    /* perform direct multiplication instead of series of jumps proposed by
+     * the reference ADPCM implementation since modern CPUs can do the mults
+     * quickly enough */
+    const int diff = ((2 * magnitude + 1) * c->step) >> 3;
+    int sample = c->predictor;
+
+    /* the old predictor is scaled by 254/256 before the difference is applied */
+    sample = (sample * 254) >> 8;
+    if (negative)
+        sample -= diff;
+    else
+        sample += diff;
+
+    /* calculate new step and clamp it to range 511..32767 */
+    c->step = (ct_adpcm_table[magnitude] * c->step) >> 8;
+    if (c->step < 511)
+        c->step = 511;
+    if (c->step > 32767)
+        c->step = 32767;
+
+    CLAMP_TO_SHORT(sample);
+    c->predictor = sample;
+    return (short)sample;
+}
+
 static void xa_decode(short *out, const unsigned char *in, 
     ADPCMChannelStatus *left, ADPCMChannelStatus *right, int inc)
 {
@@ -840,6 +892,92 @@ static int adpcm_decode_frame(AVCodecContext *avctx,
             src++;
         }
         break;
+    case CODEC_ID_ADPCM_CT:
+	while (src < buf + buf_size) {
+            if (st) {
+                *samples++ = adpcm_ct_expand_nibble(&c->status[0], 
+                    (src[0] >> 4) & 0x0F);
+                *samples++ = adpcm_ct_expand_nibble(&c->status[1], 
+                    src[0] & 0x0F);
+            } else {
+                *samples++ = adpcm_ct_expand_nibble(&c->status[0], 
+                    (src[0] >> 4) & 0x0F);
+                *samples++ = adpcm_ct_expand_nibble(&c->status[0], 
+                    src[0] & 0x0F);
+            }
+	    src++;
+        }
+        break;
+    case CODEC_ID_ADPCM_SWF:
+    {
+	GetBitContext gb;
+	int *table;
+	int k0, signmask;
+	int size = buf_size*8;
+	
+	init_get_bits(&gb, buf, size);
+
+	// first frame, read bits & initial values
+	if (!c->nb_bits)
+	{
+	    c->nb_bits = get_bits(&gb, 2)+2;
+//	    av_log(NULL,AV_LOG_INFO,"nb_bits: %d\n", c->nb_bits);
+	}
+	
+	table = swf_index_tables[c->nb_bits-2];
+	k0 = 1 << (c->nb_bits-2);
+	signmask = 1 << (c->nb_bits-1);
+	
+	while (get_bits_count(&gb) <= size)
+	{
+	    int i;
+
+	    c->nb_samples++;
+	    // wrap around at every 4096 samples...
+	    if ((c->nb_samples & 0xfff) == 1)
+	    {
+		for (i = 0; i <= st; i++)
+		{
+		    *samples++ = c->status[i].predictor = get_sbits(&gb, 16);
+		    c->status[i].step_index = get_bits(&gb, 6);
+		}
+	    }
+
+	    // similar to IMA adpcm
+	    for (i = 0; i <= st; i++)
+	    {
+		int delta = get_bits(&gb, c->nb_bits);
+		int step = step_table[c->status[i].step_index];
+		long vpdiff = 0; // vpdiff = (delta+0.5)*step/4
+		int k = k0;
+		
+		do {
+		    if (delta & k)
+			vpdiff += step;
+		    step >>= 1;
+		    k >>= 1;
+		} while(k);
+		vpdiff += step;
+		
+		if (delta & signmask)
+		    c->status[i].predictor -= vpdiff;
+		else
+		    c->status[i].predictor += vpdiff;
+		
+		c->status[i].step_index += table[delta & (~signmask)];
+		
+		c->status[i].step_index = clip(c->status[i].step_index, 0, 88);
+		c->status[i].predictor = clip(c->status[i].predictor, -32768, 32767);
+		
+		*samples++ = c->status[i].predictor;
+	    }
+	}
+	
+//	src += get_bits_count(&gb)*8;
+	src += size;
+	
+	break;
+    }
     default:
         return -1;
     }
@@ -895,5 +1033,7 @@ ADPCM_CODEC(CODEC_ID_ADPCM_4XM, adpcm_4xm);
 ADPCM_CODEC(CODEC_ID_ADPCM_XA, adpcm_xa);
 ADPCM_CODEC(CODEC_ID_ADPCM_ADX, adpcm_adx);
 ADPCM_CODEC(CODEC_ID_ADPCM_EA, adpcm_ea);
+ADPCM_CODEC(CODEC_ID_ADPCM_CT, adpcm_ct);
+ADPCM_CODEC(CODEC_ID_ADPCM_SWF, adpcm_swf);
 
 #undef ADPCM_CODEC
diff --git a/src/libffmpeg/libavcodec/adx.c b/src/libffmpeg/libavcodec/adx.c
index e41a75726..a52575c13 100644
--- a/src/libffmpeg/libavcodec/adx.c
+++ b/src/libffmpeg/libavcodec/adx.c
@@ -314,6 +314,7 @@ static int adx_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
+#if 0
 static void dump(unsigned char *buf,size_t len)
 {
     int i;
@@ -324,6 +325,8 @@ static void dump(unsigned char *buf,size_t len)
     }
     av_log(NULL, AV_LOG_ERROR, "\n");
 }
+#endif
+
 static int adx_decode_frame(AVCodecContext *avctx,
                 void *data, int *data_size,
                 uint8_t *buf0, int buf_size)
diff --git a/src/libffmpeg/libavcodec/alac.c b/src/libffmpeg/libavcodec/alac.c
new file mode 100644
index 000000000..5ae2e00f4
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alac.c
@@ -0,0 +1,833 @@
+/*
+ * ALAC (Apple Lossless Audio Codec) decoder
+ * Copyright (c) 2005 David Hammerton
+ * All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**
+ * @file alac.c
+ * ALAC (Apple Lossless Audio Codec) decoder
+ * @author 2005 David Hammerton
+ *
+ * For more information on the ALAC format, visit:
+ *  http://crazney.net/programs/itunes/alac.html
+ *
+ * Note: This decoder expects a 36- (0x24-)byte QuickTime atom to be
+ * passed through the extradata[_size] fields. This atom is tacked onto
+ * the end of an 'alac' stsd atom and has the following format:
+ *  bytes 0-3   atom size (0x24), big-endian
+ *  bytes 4-7   atom type ('alac', not the 'alac' tag from start of stsd)
+ *  bytes 8-35  data bytes needed by decoder
+ */
+
+
+#include "avcodec.h"
+#include "bitstream.h"
+
+#define ALAC_EXTRADATA_SIZE 36
+
+typedef struct { /* private state of the ALAC decoder */
+
+    AVCodecContext *avctx;
+    GetBitContext gb; /* bit reader consumed by bastardized_rice_decompress() */
+    /* init to 0; first frame decode should initialize from extradata and
+     * set this to 1 */
+    int context_initialized;
+
+    int samplesize;
+    int numchannels;
+    int bytespersample;
+
+    /* buffers: each is setinfo_max_samples_per_frame * 4 bytes, see allocate_buffers() */
+    int32_t *predicterror_buffer_a;
+    int32_t *predicterror_buffer_b;
+
+    int32_t *outputsamples_buffer_a;
+    int32_t *outputsamples_buffer_b;
+
+    /* stuff from setinfo: filled from avctx->extradata by alac_set_info() */
+    uint32_t setinfo_max_samples_per_frame; /* 0x1000 = 4096 */    /* max samples per frame? */
+    uint8_t setinfo_7a; /* 0x00 */
+    uint8_t setinfo_sample_size; /* 0x10 */
+    uint8_t setinfo_rice_historymult; /* 0x28 */
+    uint8_t setinfo_rice_initialhistory; /* 0x0a */
+    uint8_t setinfo_rice_kmodifier; /* 0x0e */
+    uint8_t setinfo_7f; /* 0x02 */
+    uint16_t setinfo_80; /* 0x00ff */
+    uint32_t setinfo_82; /* 0x000020e7 */
+    uint32_t setinfo_86; /* 0x00069fe4 */
+    uint32_t setinfo_8a_rate; /* 0x0000ac44 */
+    /* end setinfo stuff */
+
+} ALACContext;
+
+static void allocate_buffers(ALACContext *alac)
+{
+    const uint32_t nbytes = alac->setinfo_max_samples_per_frame * 4; /* 4 bytes per sample; NOTE(review): results below are not NULL-checked */
+    alac->predicterror_buffer_a = av_malloc(nbytes);
+    alac->predicterror_buffer_b = av_malloc(nbytes);
+    alac->outputsamples_buffer_a = av_malloc(nbytes);
+    alac->outputsamples_buffer_b = av_malloc(nbytes);
+}
+
+void alac_set_info(ALACContext *alac) /* parse the setinfo fields out of extradata, then allocate the work buffers */
+{
+    unsigned char *ptr = alac->avctx->extradata; /* NOTE(review): 36 bytes are read below with no size check here — confirm the caller validates extradata_size */
+
+    ptr += 4; /* size */
+    ptr += 4; /* alac */
+    ptr += 4; /* 0 ? */
+
+    alac->setinfo_max_samples_per_frame = BE_32(ptr); /* buffer size / 2 ? */
+    ptr += 4;
+    alac->setinfo_7a = *ptr++;
+    alac->setinfo_sample_size = *ptr++;
+    alac->setinfo_rice_historymult = *ptr++;
+    alac->setinfo_rice_initialhistory = *ptr++;
+    alac->setinfo_rice_kmodifier = *ptr++;
+    alac->setinfo_7f = *ptr++;
+    alac->setinfo_80 = BE_16(ptr);
+    ptr += 2;
+    alac->setinfo_82 = BE_32(ptr);
+    ptr += 4;
+    alac->setinfo_86 = BE_32(ptr);
+    ptr += 4;
+    alac->setinfo_8a_rate = BE_32(ptr);
+    ptr += 4;
+
+    allocate_buffers(alac); /* sizes depend on setinfo_max_samples_per_frame read above */
+}
+
+/* Count the leading zero bits of a 32-bit value (returns 32 for input 0).
+ * Shifts an unsigned copy: left-shifting a signed int32_t into the sign bit is undefined.
+ */
+static int count_leading_zeros(int32_t input)
+{
+    uint32_t bits = (uint32_t)input;
+    int i;
+
+    for (i = 0; i < 32 && !(bits & 0x80000000u); i++)
+        bits <<= 1;
+    return i;
+}
+
+/* Decode output_size Rice-coded values from alac->gb into output_buffer.
+ * Runs of zeros are signalled in-band whenever the adaptive history drops
+ * below 128; such a run is clamped so it never extends past output_size. */
+void bastardized_rice_decompress(ALACContext *alac,
+                                 int32_t *output_buffer,
+                                 int output_size,
+                                 int readsamplesize, /* arg_10 */
+                                 int rice_initialhistory, /* arg424->b */
+                                 int rice_kmodifier, /* arg424->d */
+                                 int rice_historymult, /* arg424->c */
+                                 int rice_kmodifier_mask /* arg424->e */)
+{
+    int output_count;
+    unsigned int history = rice_initialhistory;
+    int sign_modifier = 0;
+
+    for (output_count = 0; output_count < output_size; output_count++) {
+        int32_t x = 0;
+        int32_t x_modified;
+        int32_t final_val;
+
+        /* read x - number of 1s before 0 represent the rice */
+        while (x <= 8 && get_bits1(&alac->gb)) {
+            x++;
+        }
+
+        if (x > 8) { /* RICE THRESHOLD */
+          /* use alternative encoding */
+            int32_t value;
+            value = get_bits(&alac->gb, readsamplesize);
+
+            /* mask value to readsamplesize size */
+            if (readsamplesize != 32)
+                value &= (0xffffffff >> (32 - readsamplesize));
+
+            x = value;
+        } else {
+          /* standard rice encoding */
+            int extrabits;
+            int k; /* size of extra bits */
+
+            /* read k, that is bits as is */
+            k = 31 - rice_kmodifier - count_leading_zeros((history >> 9) + 3);
+
+            if (k < 0)
+                k += rice_kmodifier;
+            else
+                k = rice_kmodifier;
+
+            if (k != 1) {
+                extrabits = show_bits(&alac->gb, k);
+
+                /* multiply x by 2^k - 1, as part of their strange algorithm */
+                x = (x << k) - x;
+
+                if (extrabits > 1) {
+                    x += extrabits - 1;
+                    get_bits(&alac->gb, k);
+                } else {
+                    get_bits(&alac->gb, k - 1);
+                }
+            }
+        }
+
+        x_modified = sign_modifier + x;
+        final_val = (x_modified + 1) / 2;
+        if (x_modified & 1) final_val *= -1;
+
+        output_buffer[output_count] = final_val;
+
+        sign_modifier = 0;
+
+        /* now update the history */
+        history += (x_modified * rice_historymult)
+                 - ((history * rice_historymult) >> 9);
+
+        if (x_modified > 0xffff)
+            history = 0xffff;
+
+        /* special case: there may be compressed blocks of 0 */
+        if ((history < 128) && (output_count+1 < output_size)) {
+            int block_size;
+
+            sign_modifier = 1;
+
+            x = 0;
+            while (x <= 8 && get_bits1(&alac->gb)) {
+                x++;
+            }
+
+            if (x > 8) {
+                block_size = get_bits(&alac->gb, 16);
+                block_size &= 0xffff;
+            } else {
+                int k;
+                int extrabits;
+                k = count_leading_zeros(history) + ((history + 16) >> 6 /* / 64 */) - 24;
+
+                extrabits = show_bits(&alac->gb, k);
+                block_size = (((1 << k) - 1) & rice_kmodifier_mask) * x
+                           + extrabits - 1;
+
+                if (extrabits < 2) {
+                    x = 1 - extrabits;
+                    block_size += x;
+                    get_bits(&alac->gb, k - 1);
+                } else {
+                    get_bits(&alac->gb, k);
+                }
+            }
+
+            if (block_size > 0) {
+                /* the run length comes from the bitstream: never zero-fill past output_size */
+                if (block_size >= output_size - output_count)
+                    block_size = output_size - output_count - 1;
+                memset(&output_buffer[output_count+1], 0, block_size * 4);
+                output_count += block_size;
+            }
+
+            if (block_size > 0xffff)
+                sign_modifier = 0;
+
+            history = 0;
+        }
+    }
+}
+
+#define SIGN_EXTENDED32(val, bits) /* sign-extend the low `bits` bits of val; arguments parenthesized */ (((val) << (32 - (bits))) >> (32 - (bits)))
+
+#define SIGN_ONLY(v) /* -1, 0 or 1 according to the sign of v */ \
+                     (((v) < 0) ? (-1) : \
+                                (((v) > 0) ? (1) : \
+                                           (0)))
+
+static void predictor_decompress_fir_adapt(int32_t *error_buffer,
+                                           int32_t *buffer_out,
+                                           int output_size,
+                                           int readsamplesize,
+                                           int16_t *predictor_coef_table,
+                                           int predictor_coef_num,
+                                           int predictor_quantitization)
+{ /* rebuilds samples in buffer_out from the values in error_buffer; predictor_coef_table is adjusted in place */
+    int i;
+
+    /* first sample always copies */
+    *buffer_out = *error_buffer;
+
+    if (!predictor_coef_num) { /* no coefficients: output is a plain copy of error_buffer */
+        if (output_size <= 1) return;
+        memcpy(buffer_out+1, error_buffer+1, (output_size-1) * 4);
+        return;
+    }
+
+    if (predictor_coef_num == 0x1f) { /* 11111 - max value of predictor_coef_num */
+      /* second-best case scenario for fir decompression,
+       * error describes a small difference from the previous sample only
+       */
+        if (output_size <= 1) return;
+        for (i = 0; i < output_size - 1; i++) {
+            int32_t prev_value;
+            int32_t error_value;
+
+            prev_value = buffer_out[i];
+            error_value = error_buffer[i+1];
+            buffer_out[i+1] = SIGN_EXTENDED32((prev_value + error_value), readsamplesize);
+        }
+        return;
+    }
+
+    /* read warm-up samples */
+    if (predictor_coef_num > 0) { /* NOTE(review): writes buffer_out[1..predictor_coef_num] without comparing against output_size */
+        int i; /* shadows the function-level i */
+        for (i = 0; i < predictor_coef_num; i++) {
+            int32_t val;
+
+            val = buffer_out[i] + error_buffer[i+1];
+
+            val = SIGN_EXTENDED32(val, readsamplesize);
+
+            buffer_out[i+1] = val;
+        }
+    }
+
+#if 0
+    /* 4 and 8 are very common cases (the only ones i've seen). these
+     * should be unrolled and optimised
+     */
+    if (predictor_coef_num == 4) {
+        /* FIXME: optimised general case */
+        return;
+    }
+
+    if (predictor_coef_table == 8) { /* NOTE(review): compares the pointer; presumably predictor_coef_num was meant (dead code under #if 0) */
+        /* FIXME: optimised general case */
+        return;
+    }
+#endif
+
+
+    /* general case */
+    if (predictor_coef_num > 0) {
+        for (i = predictor_coef_num + 1;
+             i < output_size;
+             i++) {
+            int j;
+            int sum = 0;
+            int outval;
+            int error_val = error_buffer[i];
+
+            for (j = 0; j < predictor_coef_num; j++) { /* weighted sum of differences against buffer_out[0] */
+                sum += (buffer_out[predictor_coef_num-j] - buffer_out[0]) *
+                       predictor_coef_table[j];
+            }
+
+            outval = (1 << (predictor_quantitization-1)) + sum; /* add half of the divisor so the shift below rounds */
+            outval = outval >> predictor_quantitization;
+            outval = outval + buffer_out[0] + error_val;
+            outval = SIGN_EXTENDED32(outval, readsamplesize);
+
+            buffer_out[predictor_coef_num+1] = outval;
+
+            if (error_val > 0) { /* adapt coefficients until error_val is used up or all were visited */
+                int predictor_num = predictor_coef_num - 1;
+
+                while (predictor_num >= 0 && error_val > 0) {
+                    int val = buffer_out[0] - buffer_out[predictor_coef_num - predictor_num];
+                    int sign = SIGN_ONLY(val);
+
+                    predictor_coef_table[predictor_num] -= sign;
+
+                    val *= sign; /* absolute value */
+
+                    error_val -= ((val >> predictor_quantitization) *
+                                  (predictor_coef_num - predictor_num));
+
+                    predictor_num--;
+                }
+            } else if (error_val < 0) { /* mirror of the branch above with the sign inverted */
+                int predictor_num = predictor_coef_num - 1;
+
+                while (predictor_num >= 0 && error_val < 0) {
+                    int val = buffer_out[0] - buffer_out[predictor_coef_num - predictor_num];
+                    int sign = - SIGN_ONLY(val);
+
+                    predictor_coef_table[predictor_num] -= sign;
+
+                    val *= sign; /* neg value */
+
+                    error_val -= ((val >> predictor_quantitization) *
+                                  (predictor_coef_num - predictor_num));
+
+                    predictor_num--;
+                }
+            }
+
+            buffer_out++; /* slide the window: buffer_out[0] is now the next older sample */
+        }
+    }
+}
+
+/* Interleave the two decoded channel buffers into 16 bit output samples.
+ *
+ * A non-zero interlacing_leftweight means buffer_a / buffer_b hold a weighted
+ * mid / difference pair that is un-mixed here; otherwise they are copied as
+ * left / right. Sample pair i is stored at buffer_out[i*numchannels], so only
+ * the first two slots of every output frame are written.
+ */
+void deinterlace_16(int32_t *buffer_a, int32_t *buffer_b,
+                    int16_t *buffer_out,
+                    int numchannels, int numsamples,
+                    uint8_t interlacing_shift,
+                    uint8_t interlacing_leftweight)
+{
+    int n;
+
+    if (numsamples <= 0)
+        return;
+
+    if (!interlacing_leftweight) {
+        /* basic interlacing: narrow each channel to 16 bits as it is */
+        for (n = 0; n < numsamples; n++) {
+            int16_t *pair = buffer_out + n * numchannels;
+
+            pair[0] = (int16_t)buffer_a[n];
+            pair[1] = (int16_t)buffer_b[n];
+        }
+        return;
+    }
+
+    /* weighted interlacing */
+    for (n = 0; n < numsamples; n++) {
+        int16_t *pair = buffer_out + n * numchannels;
+        const int32_t mid = buffer_a[n];
+        const int32_t diff = buffer_b[n];
+        /* the right channel is mid minus the weighted share of diff */
+        const int32_t base = mid - ((diff * interlacing_leftweight) >> interlacing_shift);
+
+        /* left is derived from the untruncated 32 bit value, then narrowed */
+        pair[0] = (int16_t)(base + diff);
+        pair[1] = (int16_t)base;
+    }
+}
+
+/**
+ * Decode one ALAC packet (mono or stereo) into 16 bit samples.
+ * Once parsing starts, *outputsize holds the number of bytes written to
+ * outbuffer, or 0 for a bad sample count or an unsupported layout / sample size.
+ * @return input_buffer_size (packet consumed), or -1 for a bad sample count
+ */
+static int alac_decode_frame(AVCodecContext *avctx,
+                             void *outbuffer, int *outputsize,
+                             uint8_t *inbuffer, int input_buffer_size)
+{
+    ALACContext *alac = avctx->priv_data;
+    int channels;
+    int32_t outputsamples;
+
+    /* short-circuit null buffers */
+    if (!inbuffer || !input_buffer_size)
+        return input_buffer_size;
+
+    /* initialize from the extradata */
+    if (!alac->context_initialized) {
+        if (alac->avctx->extradata_size != ALAC_EXTRADATA_SIZE) {
+            av_log(NULL, AV_LOG_ERROR, "alac: expected %d extradata bytes\n",
+                ALAC_EXTRADATA_SIZE);
+            return input_buffer_size;
+        }
+        alac_set_info(alac);
+        alac->context_initialized = 1;
+    }
+
+    outputsamples = alac->setinfo_max_samples_per_frame; /* default frame length */
+    init_get_bits(&alac->gb, inbuffer, input_buffer_size * 8);
+
+    channels = get_bits(&alac->gb, 3);
+    *outputsize = outputsamples * alac->bytespersample;
+
+    switch(channels) {
+    case 0: { /* 1 channel */
+        int hassize;
+        int isnotcompressed;
+        int readsamplesize;
+        int wasted_bytes;
+        int ricemodifier;
+
+        /* 2^result = something to do with output waiting.
+         * perhaps matters if we read > 1 frame in a pass? */
+        get_bits(&alac->gb, 4);
+        get_bits(&alac->gb, 12); /* unknown, skip 12 bits */
+        hassize = get_bits(&alac->gb, 1); /* the output sample size is stored soon */
+        wasted_bytes = get_bits(&alac->gb, 2); /* unknown ? */
+        isnotcompressed = get_bits(&alac->gb, 1); /* whether the frame is compressed */
+        /* explicit 32bit sample count, read in two halves (cf. the wide samples
+         * below); bounded as the work buffers presumably hold at most the maximum */
+        if (hassize) {
+            outputsamples  = get_bits(&alac->gb, 16) << 16;
+            outputsamples |= get_bits(&alac->gb, 16);
+            if (outputsamples < 0 || outputsamples > alac->setinfo_max_samples_per_frame) {
+                av_log(NULL, AV_LOG_ERROR, "alac: invalid sample count %d\n", outputsamples);
+                *outputsize = 0;
+                return -1;
+            }
+            *outputsize = outputsamples * alac->bytespersample;
+        }
+
+        readsamplesize = alac->setinfo_sample_size - (wasted_bytes * 8);
+
+        if (!isnotcompressed) { /* so it is compressed */
+            int16_t predictor_coef_table[32];
+            int predictor_coef_num;
+            int prediction_type;
+            int prediction_quantitization;
+            int i;
+
+            /* skip 16 bits, not sure what they are. seem to be used in
+             * two channel case */
+            get_bits(&alac->gb, 8);
+            get_bits(&alac->gb, 8);
+
+            prediction_type = get_bits(&alac->gb, 4);
+            prediction_quantitization = get_bits(&alac->gb, 4);
+
+            ricemodifier = get_bits(&alac->gb, 3);
+            predictor_coef_num = get_bits(&alac->gb, 5);
+
+            /* read the predictor table */
+            for (i = 0; i < predictor_coef_num; i++) {
+                predictor_coef_table[i] = (int16_t)get_bits(&alac->gb, 16);
+            }
+
+            if (wasted_bytes) {
+                /* these bytes seem to have something to do with
+                 * > 2 channel files.
+                 */
+                av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented, unhandling of wasted_bytes\n");
+            }
+
+            bastardized_rice_decompress(alac,
+                                        alac->predicterror_buffer_a,
+                                        outputsamples,
+                                        readsamplesize,
+                                        alac->setinfo_rice_initialhistory,
+                                        alac->setinfo_rice_kmodifier,
+                                        ricemodifier * alac->setinfo_rice_historymult / 4,
+                                        (1 << alac->setinfo_rice_kmodifier) - 1);
+
+            if (prediction_type == 0) {
+              /* adaptive fir */
+                predictor_decompress_fir_adapt(alac->predicterror_buffer_a,
+                                               alac->outputsamples_buffer_a,
+                                               outputsamples,
+                                               readsamplesize,
+                                               predictor_coef_table,
+                                               predictor_coef_num,
+                                               prediction_quantitization);
+            } else {
+                av_log(NULL, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type);
+                /* i think the only other prediction type (or perhaps this is just a
+                 * boolean?) runs adaptive fir twice.. like:
+                 * predictor_decompress_fir_adapt(predictor_error, tempout, ...)
+                 * predictor_decompress_fir_adapt(predictor_error, outputsamples ...)
+                 * little strange..
+                 */
+            }
+
+        } else {
+          /* not compressed, easy case */
+            if (readsamplesize <= 16) {
+                int i;
+                for (i = 0; i < outputsamples; i++) {
+                    int32_t audiobits = get_bits(&alac->gb, readsamplesize);
+
+                    audiobits = SIGN_EXTENDED32(audiobits, readsamplesize);
+
+                    alac->outputsamples_buffer_a[i] = audiobits;
+                }
+            } else {
+                int i;
+                for (i = 0; i < outputsamples; i++) {
+                    int32_t audiobits;
+
+                    audiobits = get_bits(&alac->gb, 16);
+                    /* special case of sign extension..
+                     * as we'll be ORing the low 16bits into this */
+                    audiobits = audiobits << 16;
+                    audiobits = audiobits >> (32 - readsamplesize);
+
+                    audiobits |= get_bits(&alac->gb, readsamplesize - 16);
+
+                    alac->outputsamples_buffer_a[i] = audiobits;
+                }
+            }
+        }
+
+        switch(alac->setinfo_sample_size) {
+        case 16: {
+            int i;
+            for (i = 0; i < outputsamples; i++) {
+                int16_t sample = alac->outputsamples_buffer_a[i]; /* native order, as in deinterlace_16() */
+                ((int16_t*)outbuffer)[i * alac->numchannels] = sample;
+            }
+            break;
+        }
+        case 20:
+        case 24:
+        case 32:
+            av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented sample size %i\n", alac->setinfo_sample_size);
+            /* fall through: nothing was written to outbuffer */
+        default:
+            *outputsize = 0;
+            break;
+        }
+        break;
+    }
+    case 1: { /* 2 channels */
+        int hassize;
+        int isnotcompressed;
+        int readsamplesize;
+        int wasted_bytes;
+
+        uint8_t interlacing_shift;
+        uint8_t interlacing_leftweight;
+
+        /* 2^result = something to do with output waiting.
+         * perhaps matters if we read > 1 frame in a pass? */
+        get_bits(&alac->gb, 4);
+        get_bits(&alac->gb, 12); /* unknown, skip 12 bits */
+        hassize = get_bits(&alac->gb, 1); /* the output sample size is stored soon */
+        wasted_bytes = get_bits(&alac->gb, 2); /* unknown ? */
+        isnotcompressed = get_bits(&alac->gb, 1); /* whether the frame is compressed */
+        /* explicit 32bit sample count, read in two halves (cf. the wide samples
+         * below); bounded as the work buffers presumably hold at most the maximum */
+        if (hassize) {
+            outputsamples  = get_bits(&alac->gb, 16) << 16;
+            outputsamples |= get_bits(&alac->gb, 16);
+            if (outputsamples < 0 || outputsamples > alac->setinfo_max_samples_per_frame) {
+                av_log(NULL, AV_LOG_ERROR, "alac: invalid sample count %d\n", outputsamples);
+                *outputsize = 0;
+                return -1;
+            }
+            *outputsize = outputsamples * alac->bytespersample;
+        }
+
+        readsamplesize = alac->setinfo_sample_size - (wasted_bytes * 8) + 1;
+
+        if (!isnotcompressed) {
+         /* compressed */
+            int16_t predictor_coef_table_a[32];
+            int predictor_coef_num_a;
+            int prediction_type_a;
+            int prediction_quantitization_a;
+            int ricemodifier_a;
+
+            int16_t predictor_coef_table_b[32];
+            int predictor_coef_num_b;
+            int prediction_type_b;
+            int prediction_quantitization_b;
+            int ricemodifier_b;
+
+            int i;
+
+            interlacing_shift = get_bits(&alac->gb, 8);
+            interlacing_leftweight = get_bits(&alac->gb, 8);
+
+            /******** channel 1 ***********/
+            prediction_type_a = get_bits(&alac->gb, 4);
+            prediction_quantitization_a = get_bits(&alac->gb, 4);
+
+            ricemodifier_a = get_bits(&alac->gb, 3);
+            predictor_coef_num_a = get_bits(&alac->gb, 5);
+
+            /* read the predictor table */
+            for (i = 0; i < predictor_coef_num_a; i++) {
+                predictor_coef_table_a[i] = (int16_t)get_bits(&alac->gb, 16);
+            }
+
+            /******** channel 2 *********/
+            prediction_type_b = get_bits(&alac->gb, 4);
+            prediction_quantitization_b = get_bits(&alac->gb, 4);
+
+            ricemodifier_b = get_bits(&alac->gb, 3);
+            predictor_coef_num_b = get_bits(&alac->gb, 5);
+
+            /* read the predictor table */
+            for (i = 0; i < predictor_coef_num_b; i++) {
+                predictor_coef_table_b[i] = (int16_t)get_bits(&alac->gb, 16);
+            }
+
+            if (wasted_bytes) {
+              /* see mono case */
+                av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented, unhandling of wasted_bytes\n");
+            }
+
+            /* channel 1 */
+            bastardized_rice_decompress(alac,
+                                        alac->predicterror_buffer_a,
+                                        outputsamples,
+                                        readsamplesize,
+                                        alac->setinfo_rice_initialhistory,
+                                        alac->setinfo_rice_kmodifier,
+                                        ricemodifier_a * alac->setinfo_rice_historymult / 4,
+                                        (1 << alac->setinfo_rice_kmodifier) - 1);
+
+            if (prediction_type_a == 0) {
+              /* adaptive fir */
+                predictor_decompress_fir_adapt(alac->predicterror_buffer_a,
+                                               alac->outputsamples_buffer_a,
+                                               outputsamples,
+                                               readsamplesize,
+                                               predictor_coef_table_a,
+                                               predictor_coef_num_a,
+                                               prediction_quantitization_a);
+            } else {
+              /* see mono case */
+                av_log(NULL, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type_a);
+            }
+
+            /* channel 2 */
+            bastardized_rice_decompress(alac,
+                                        alac->predicterror_buffer_b,
+                                        outputsamples,
+                                        readsamplesize,
+                                        alac->setinfo_rice_initialhistory,
+                                        alac->setinfo_rice_kmodifier,
+                                        ricemodifier_b * alac->setinfo_rice_historymult / 4,
+                                        (1 << alac->setinfo_rice_kmodifier) - 1);
+
+            if (prediction_type_b == 0) {
+              /* adaptive fir */
+                predictor_decompress_fir_adapt(alac->predicterror_buffer_b,
+                                               alac->outputsamples_buffer_b,
+                                               outputsamples,
+                                               readsamplesize,
+                                               predictor_coef_table_b,
+                                               predictor_coef_num_b,
+                                               prediction_quantitization_b);
+            } else {
+                av_log(NULL, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type_b);
+            }
+        } else {
+         /* not compressed, easy case */
+            if (alac->setinfo_sample_size <= 16) {
+                int i;
+                for (i = 0; i < outputsamples; i++) {
+                    int32_t audiobits_a, audiobits_b;
+
+                    audiobits_a = get_bits(&alac->gb, alac->setinfo_sample_size);
+                    audiobits_b = get_bits(&alac->gb, alac->setinfo_sample_size);
+
+                    audiobits_a = SIGN_EXTENDED32(audiobits_a, alac->setinfo_sample_size);
+                    audiobits_b = SIGN_EXTENDED32(audiobits_b, alac->setinfo_sample_size);
+
+                    alac->outputsamples_buffer_a[i] = audiobits_a;
+                    alac->outputsamples_buffer_b[i] = audiobits_b;
+                }
+            } else {
+                int i;
+                for (i = 0; i < outputsamples; i++) {
+                    int32_t audiobits_a, audiobits_b;
+
+                    audiobits_a = get_bits(&alac->gb, 16);
+                    audiobits_a = audiobits_a << 16;
+                    audiobits_a = audiobits_a >> (32 - alac->setinfo_sample_size);
+                    audiobits_a |= get_bits(&alac->gb, alac->setinfo_sample_size - 16);
+
+                    audiobits_b = get_bits(&alac->gb, 16);
+                    audiobits_b = audiobits_b << 16;
+                    audiobits_b = audiobits_b >> (32 - alac->setinfo_sample_size);
+                    audiobits_b |= get_bits(&alac->gb, alac->setinfo_sample_size - 16);
+
+                    alac->outputsamples_buffer_a[i] = audiobits_a;
+                    alac->outputsamples_buffer_b[i] = audiobits_b;
+                }
+            }
+            interlacing_shift = 0; /* zero weight makes deinterlace_16() copy a/b as left/right */
+            interlacing_leftweight = 0;
+        }
+
+        switch(alac->setinfo_sample_size) {
+        case 16: {
+            deinterlace_16(alac->outputsamples_buffer_a,
+                           alac->outputsamples_buffer_b,
+                           (int16_t*)outbuffer,
+                           alac->numchannels,
+                           outputsamples,
+                           interlacing_shift,
+                           interlacing_leftweight);
+            break;
+        }
+        case 20:
+        case 24:
+        case 32:
+            av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented sample size %i\n", alac->setinfo_sample_size);
+            /* fall through: nothing was written to outbuffer */
+        default:
+            *outputsize = 0;
+            break;
+        }
+        break;
+    }
+    default: /* only the mono (0) and stereo (1) codes are decoded */
+        av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented channel code %i\n", channels);
+        *outputsize = 0;
+        break;
+    }
+
+    return input_buffer_size;
+}
+
+/* Codec init hook: remember avctx and cache the stream's sample geometry. */
+static int alac_decode_init(AVCodecContext * avctx)
+{
+    ALACContext *ctx = avctx->priv_data;
+
+    ctx->avctx = avctx;
+    ctx->context_initialized = 0; /* checked by the frame decoder before first use */
+    ctx->samplesize = avctx->bits_per_sample;
+    ctx->numchannels = avctx->channels;
+    ctx->bytespersample = (ctx->samplesize / 8) * ctx->numchannels; /* all channels */
+    return 0;
+}
+
+/* Codec close hook: release the per-channel work buffers. Always returns 0. */
+static int alac_decode_close(AVCodecContext *avctx)
+{
+    ALACContext *ctx = avctx->priv_data;
+
+    /* prediction error buffers, then decoded sample buffers */
+    av_free(ctx->predicterror_buffer_a);
+    av_free(ctx->predicterror_buffer_b);
+    av_free(ctx->outputsamples_buffer_a);
+    av_free(ctx->outputsamples_buffer_b);
+    return 0;
+}
+
+AVCodec alac_decoder = { /* codec table entry for the ALAC decoder (positional initializer) */
+    "alac",               /* codec name string */
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_ALAC,
+    sizeof(ALACContext),  /* size of the private context the callbacks read from avctx->priv_data */
+    alac_decode_init,
+    NULL,                 /* NOTE(review): presumably the encode slot (decoder only) - confirm against AVCodec */
+    alac_decode_close,
+    alac_decode_frame,
+};
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
index 7ec6757d7..6519a9590 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
@@ -23,9 +23,6 @@
  */
 
 #include "regdef.h"
-#ifdef HAVE_AV_CONFIG_H	
-#include "config.h"
-#endif
 
 /* Some nicer register names.  */
 #define ta t10
diff --git a/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S b/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S
index 0042e7e82..9e6b75f53 100644
--- a/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S
+++ b/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S
@@ -18,9 +18,6 @@
  */
 
 #include "regdef.h"
-#ifdef HAVE_AV_CONFIG_H	
-#include "config.h"
-#endif
 
 /* Some nicer register names.  */
 #define ta t10
diff --git a/src/libffmpeg/libavcodec/asv1.c b/src/libffmpeg/libavcodec/asv1.c
index 2ab729c17..4ab2518ab 100644
--- a/src/libffmpeg/libavcodec/asv1.c
+++ b/src/libffmpeg/libavcodec/asv1.c
@@ -63,7 +63,7 @@ static const uint8_t scantab[64]={
 };
 
 
-static const uint8_t reverse[256]={
+const uint8_t ff_reverse[256]={
 0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
 0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
 0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
@@ -137,29 +137,29 @@ static void init_vlcs(ASV1Context *a){
 
         init_vlc(&ccp_vlc, VLC_BITS, 17, 
                  &ccp_tab[0][1], 2, 1,
-                 &ccp_tab[0][0], 2, 1);
+                 &ccp_tab[0][0], 2, 1, 1);
         init_vlc(&dc_ccp_vlc, VLC_BITS, 8, 
                  &dc_ccp_tab[0][1], 2, 1,
-                 &dc_ccp_tab[0][0], 2, 1);
+                 &dc_ccp_tab[0][0], 2, 1, 1);
         init_vlc(&ac_ccp_vlc, VLC_BITS, 16, 
                  &ac_ccp_tab[0][1], 2, 1,
-                 &ac_ccp_tab[0][0], 2, 1);
+                 &ac_ccp_tab[0][0], 2, 1, 1);
         init_vlc(&level_vlc,  VLC_BITS, 7, 
                  &level_tab[0][1], 2, 1,
-                 &level_tab[0][0], 2, 1);
+                 &level_tab[0][0], 2, 1, 1);
         init_vlc(&asv2_level_vlc, ASV2_LEVEL_VLC_BITS, 63, 
                  &asv2_level_tab[0][1], 2, 1,
-                 &asv2_level_tab[0][0], 2, 1);
+                 &asv2_level_tab[0][0], 2, 1, 1);
     }
 }
 
 //FIXME write a reversed bitstream reader to avoid the double reverse
 static inline int asv2_get_bits(GetBitContext *gb, int n){
-    return reverse[ get_bits(gb, n) << (8-n) ];
+    return ff_reverse[ get_bits(gb, n) << (8-n) ];
 }
 
 static inline void asv2_put_bits(PutBitContext *pb, int n, int v){
-    put_bits(pb, n, reverse[ v << (8-n) ]);
+    put_bits(pb, n, ff_reverse[ v << (8-n) ]);
 }
 
 static inline int asv1_get_level(GetBitContext *gb){
@@ -339,8 +339,13 @@ static inline int decode_mb(ASV1Context *a, DCTELEM block[6][64]){
     return 0;
 }
 
-static inline void encode_mb(ASV1Context *a, DCTELEM block[6][64]){
+static inline int encode_mb(ASV1Context *a, DCTELEM block[6][64]){
     int i;
+    
+    if(a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb)>>3) < 30*16*16*3/2/8){
+        av_log(a->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+        return -1;
+    }
 
     if(a->avctx->codec_id == CODEC_ID_ASV1){
         for(i=0; i<6; i++)
@@ -349,6 +354,7 @@ static inline void encode_mb(ASV1Context *a, DCTELEM block[6][64]){
         for(i=0; i<6; i++)
             asv2_encode_block(a, block[i]);
     }
+    return 0;
 }
 
 static inline void idct_put(ASV1Context *a, int mb_x, int mb_y){
@@ -403,11 +409,6 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p= (AVFrame*)&a->picture;
     int mb_x, mb_y;
 
-    /* special case for last picture */
-    if (buf_size == 0) {
-        return 0;
-    }
-
     if(p->data[0])
         avctx->release_buffer(avctx, p);
 
@@ -426,7 +427,7 @@ static int decode_frame(AVCodecContext *avctx,
     else{
         int i;
         for(i=0; i<buf_size; i++)
-            a->bitstream_buffer[i]= reverse[ buf[i] ];
+            a->bitstream_buffer[i]= ff_reverse[ buf[i] ];
     }
 
     init_get_bits(&a->gb, a->bitstream_buffer, buf_size*8);
@@ -527,7 +528,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     else{
         int i;
         for(i=0; i<4*size; i++)
-            buf[i]= reverse[ buf[i] ];
+            buf[i]= ff_reverse[ buf[i] ];
     }
     
     return size*4;
@@ -573,7 +574,7 @@ static int decode_init(AVCodecContext *avctx){
     }
 
     p->qstride= a->mb_width;
-    p->qscale_table= av_mallocz( p->qstride * a->mb_height);
+    p->qscale_table= av_malloc( p->qstride * a->mb_height);
     p->quality= (32*scale + a->inv_qscale/2)/a->inv_qscale;
     memset(p->qscale_table, p->quality, p->qstride*a->mb_height);
 
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 26fee27ef..064f58df2 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -15,16 +15,9 @@ extern "C" {
 #include "rational.h"
 #include <sys/types.h> /* size_t */
 
-/* FIXME: We cannot use ffmpeg's XvMC capabilities, since that would require
- * linking the ffmpeg plugin against XvMC libraries, which is a bad thing,
- * since they are output dependend.
- * The correct fix would be to reimplement the XvMC functions libavcodec uses
- * and do the necessary talking with our XvMC output plugin there. */
-#undef HAVE_XVMC
-
-#define FFMPEG_VERSION_INT     0x000408
-#define FFMPEG_VERSION         "0.4.8"
-#define LIBAVCODEC_BUILD       4715
+#define FFMPEG_VERSION_INT     0x000409
+#define FFMPEG_VERSION         "0.4.9-pre1"
+#define LIBAVCODEC_BUILD       4752
 
 #define LIBAVCODEC_VERSION_INT FFMPEG_VERSION_INT
 #define LIBAVCODEC_VERSION     FFMPEG_VERSION
@@ -36,18 +29,22 @@ extern "C" {
 #define AV_NOPTS_VALUE int64_t_C(0x8000000000000000)
 #define AV_TIME_BASE 1000000
 
+/* FIXME: We cannot use ffmpeg's XvMC capabilities, since that would require
+ * linking the ffmpeg plugin against XvMC libraries, which is a bad thing,
+ * since they are output dependent.
+ * The correct fix would be to reimplement the XvMC functions libavcodec uses
+ * and do the necessary talking with our XvMC output plugin there. */
+#undef HAVE_XVMC
+
 enum CodecID {
     CODEC_ID_NONE, 
     CODEC_ID_MPEG1VIDEO,
     CODEC_ID_MPEG2VIDEO, /* prefered ID for MPEG Video 1 or 2 decoding */
     CODEC_ID_MPEG2VIDEO_XVMC,
+    CODEC_ID_H261,
     CODEC_ID_H263,
     CODEC_ID_RV10,
     CODEC_ID_RV20,
-    CODEC_ID_MP2,
-    CODEC_ID_MP3, /* prefered ID for MPEG Audio layer 1, 2 or3 decoding */
-    CODEC_ID_VORBIS,
-    CODEC_ID_AC3,
     CODEC_ID_MJPEG,
     CODEC_ID_MJPEGB,
     CODEC_ID_LJPEG,
@@ -65,19 +62,12 @@ enum CodecID {
     CODEC_ID_SVQ1,
     CODEC_ID_SVQ3,
     CODEC_ID_DVVIDEO,
-    CODEC_ID_DVAUDIO,
-    CODEC_ID_WMAV1,
-    CODEC_ID_WMAV2,
-    CODEC_ID_MACE3,
-    CODEC_ID_MACE6,
     CODEC_ID_HUFFYUV,
     CODEC_ID_CYUV,
     CODEC_ID_H264,
     CODEC_ID_INDEO3,
     CODEC_ID_VP3,
     CODEC_ID_THEORA,
-    CODEC_ID_AAC,
-    CODEC_ID_MPEG4AAC,
     CODEC_ID_ASV1,
     CODEC_ID_ASV2,
     CODEC_ID_FFV1,
@@ -100,13 +90,33 @@ enum CodecID {
     CODEC_ID_FLIC,
     CODEC_ID_TRUEMOTION1,
     CODEC_ID_VMDVIDEO,
-    CODEC_ID_VMDAUDIO,
     CODEC_ID_MSZH,
     CODEC_ID_ZLIB,
     CODEC_ID_QTRLE,
+    CODEC_ID_SNOW,
+    CODEC_ID_TSCC,
+    CODEC_ID_ULTI,
+    CODEC_ID_QDRAW,
+    CODEC_ID_VIXL,
+    CODEC_ID_QPEG,
+    CODEC_ID_XVID,
+    CODEC_ID_PNG,
+    CODEC_ID_PPM,
+    CODEC_ID_PBM,
+    CODEC_ID_PGM,
+    CODEC_ID_PGMYUV,
+    CODEC_ID_PAM,
+    CODEC_ID_FFVHUFF,
+    CODEC_ID_RV30,
+    CODEC_ID_RV40,
+    CODEC_ID_VC9,
+    CODEC_ID_WMV3,
+    CODEC_ID_LOCO,
+    CODEC_ID_WNV1,
+    CODEC_ID_AASC,
 
     /* various pcm "codecs" */
-    CODEC_ID_PCM_S16LE,
+    CODEC_ID_PCM_S16LE= 0x10000,
     CODEC_ID_PCM_S16BE,
     CODEC_ID_PCM_U16LE,
     CODEC_ID_PCM_U16BE,
@@ -116,7 +126,7 @@ enum CodecID {
     CODEC_ID_PCM_ALAW,
 
     /* various adpcm codecs */
-    CODEC_ID_ADPCM_IMA_QT,
+    CODEC_ID_ADPCM_IMA_QT= 0x11000,
     CODEC_ID_ADPCM_IMA_WAV,
     CODEC_ID_ADPCM_IMA_DK3,
     CODEC_ID_ADPCM_IMA_DK4,
@@ -128,23 +138,48 @@ enum CodecID {
     CODEC_ID_ADPCM_ADX,
     CODEC_ID_ADPCM_EA,
     CODEC_ID_ADPCM_G726,
+    CODEC_ID_ADPCM_CT,
+    CODEC_ID_ADPCM_SWF,
 
-	/* AMR */
-    CODEC_ID_AMR_NB,
+    /* AMR */
+    CODEC_ID_AMR_NB= 0x12000,
     CODEC_ID_AMR_WB,
 
     /* RealAudio codecs*/
-    CODEC_ID_RA_144,
+    CODEC_ID_RA_144= 0x13000,
     CODEC_ID_RA_288,
 
     /* various DPCM codecs */
-    CODEC_ID_ROQ_DPCM,
+    CODEC_ID_ROQ_DPCM= 0x14000,
     CODEC_ID_INTERPLAY_DPCM,
     CODEC_ID_XAN_DPCM,
+    CODEC_ID_SOL_DPCM,
     
+    CODEC_ID_MP2= 0x15000,
+    CODEC_ID_MP3, /* prefered ID for MPEG Audio layer 1, 2 or3 decoding */
+    CODEC_ID_AAC,
+    CODEC_ID_MPEG4AAC,
+    CODEC_ID_AC3,
+    CODEC_ID_DTS,
+    CODEC_ID_VORBIS,
+    CODEC_ID_DVAUDIO,
+    CODEC_ID_WMAV1,
+    CODEC_ID_WMAV2,
+    CODEC_ID_MACE3,
+    CODEC_ID_MACE6,
+    CODEC_ID_VMDAUDIO,
+    CODEC_ID_SONIC,
+    CODEC_ID_SONIC_LS,
     CODEC_ID_FLAC,
+    CODEC_ID_MP3ADU,
+    CODEC_ID_MP3ON4,
+    CODEC_ID_SHORTEN,
+    CODEC_ID_ALAC,
+    CODEC_ID_WESTWOOD_SND1,
+    
+    CODEC_ID_OGGTHEORA= 0x16000, 
     
-    CODEC_ID_MPEG2TS, /* _FAKE_ codec to indicate a raw MPEG2 transport
+    CODEC_ID_MPEG2TS= 0x20000, /* _FAKE_ codec to indicate a raw MPEG2 transport
                          stream (only used by libavformat) */
 };
 
@@ -178,7 +213,7 @@ enum CodecType {
  */
 enum PixelFormat {
     PIX_FMT_YUV420P,   ///< Planar YUV 4:2:0 (1 Cr & Cb sample per 2x2 Y samples)
-    PIX_FMT_YUV422,    
+    PIX_FMT_YUV422,    ///< Packed pixel, Y0 Cb Y1 Cr 
     PIX_FMT_RGB24,     ///< Packed pixel, 3 bytes per pixel, RGBRGB...
     PIX_FMT_BGR24,     ///< Packed pixel, 3 bytes per pixel, BGRBGR...
     PIX_FMT_YUV422P,   ///< Planar YUV 4:2:2 (1 Cr & Cb sample per 2x1 Y samples)
@@ -197,12 +232,17 @@ enum PixelFormat {
     PIX_FMT_YUVJ444P,  ///< Planar YUV 4:4:4 full scale (jpeg)
     PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing(xvmc_render.h)
     PIX_FMT_XVMC_MPEG2_IDCT,
+    PIX_FMT_UYVY422,   ///< Packed pixel, Cb Y0 Cr Y1 
+    PIX_FMT_UYVY411,   ///< Packed pixel, Cb Y0 Y1 Cr Y2 Y3
     PIX_FMT_NB,
 };
 
 /* currently unused, may be used if 24/32 bits samples ever supported */
 enum SampleFormat {
     SAMPLE_FMT_S16 = 0,         ///< signed 16 bits 
+    SAMPLE_FMT_S32,             ///< signed 32 bits 
+    SAMPLE_FMT_FLT,             ///< float
+    SAMPLE_FMT_DBL,             ///< double
 };
 
 /* in bytes */
@@ -217,6 +257,12 @@ enum SampleFormat {
  */
 #define FF_INPUT_BUFFER_PADDING_SIZE 8
 
+/**
+ * minimum encoding buffer size.
+ * used to avoid some checks during header writing
+ */
+#define FF_MIN_BUFFER_SIZE 16384
+
 /* motion estimation type, EPZS by default */
 enum Motion_Est_ID {
     ME_ZERO = 1,
@@ -227,6 +273,14 @@ enum Motion_Est_ID {
     ME_X1
 };
 
+enum AVRounding {
+    AV_ROUND_ZERO     = 0, ///< round toward zero
+    AV_ROUND_INF      = 1, ///< round away from zero
+    AV_ROUND_DOWN     = 2, ///< round toward -infinity
+    AV_ROUND_UP       = 3, ///< round toward +infinity
+    AV_ROUND_NEAR_INF = 5, ///< round to nearest and halfway cases away from zero
+};
+
 typedef struct RcOverride{
     int start_frame;
     int end_frame;
@@ -281,6 +335,10 @@ extern int motion_estimation_method;
 #define CODEC_FLAG_INTERLACED_ME  0x20000000 ///< interlaced motion estimation
 #define CODEC_FLAG_SVCD_SCAN_OFFSET 0x40000000 ///< will reserve space for SVCD scan offset user data
 #define CODEC_FLAG_CLOSED_GOP     0x80000000
+#define CODEC_FLAG2_FAST          0x00000001 ///< allow non spec compliant speedup tricks
+#define CODEC_FLAG2_STRICT_GOP    0x00000002 ///< strictly enforce GOP size
+#define CODEC_FLAG2_NO_OUTPUT     0x00000004 ///< skip bitstream encoding
+
 /* Unsupported options :
  * 		Syntax Arithmetic coding (SAC)
  * 		Reference Picture Selection
@@ -300,6 +358,11 @@ extern int motion_estimation_method;
 #define CODEC_CAP_TRUNCATED       0x0008
 /* codec can export data for HW decoding (XvMC) */
 #define CODEC_CAP_HWACCEL         0x0010
+/** 
+ * codec has a non zero delay and needs to be fed with NULL at the end to get the delayed data.
+ * if this is not set, the codec is guaranteed to never be fed with NULL data
+ */
+#define CODEC_CAP_DELAY           0x0020
 
 //the following defines might change, so dont expect compatibility if u use them
 #define MB_TYPE_INTRA4x4   0x0001
@@ -448,7 +511,14 @@ typedef struct AVPanScan{
     uint8_t *mbskip_table;\
 \
     /**\
-     * Motion vector table\
+     * Motion vector table.\
+     * @code\
+     * example:\
+     * int mv_sample_log2= 4 - motion_subsample_log2;\
+     * int mb_width= (width+15)>>4;\
+     * int mv_stride= (mb_width << mv_sample_log2) + 1;\
+     * motion_val[direction][x + y*mv_stride][0->mv_x, 1->mv_y];\
+     * @endcode\
      * - encoding: set by user\
      * - decoding: set by lavc\
      */\
@@ -463,7 +533,8 @@ typedef struct AVPanScan{
     uint32_t *mb_type;\
 \
     /**\
-     * Macroblock size: (0->16x16, 1->8x8, 2-> 4x4, 3-> 2x2)\
+     * log2 of the size of the block which a single vector in motion_val represents: \
+     * (4->16x16, 3->8x8, 2-> 4x4, 1-> 2x2)\
      * - encoding: unused\
      * - decoding: set by lavc\
      */\
@@ -645,6 +716,8 @@ typedef struct AVCodecContext {
      * mjpeg: huffman tables
      * rv10: additional flags
      * mpeg4: global headers (they can be in the bitstream or here)
+     * the allocated memory should be FF_INPUT_BUFFER_PADDING_SIZE bytes larger
+     * than extradata_size to avoid problems if it is read with the bitstream reader
      * - encoding: set/allocated/freed by lavc.
      * - decoding: set/allocated/freed by user.
      */
@@ -662,9 +735,11 @@ typedef struct AVCodecContext {
     int frame_rate;
     
     /**
-     * width / height.
+     * picture width / height.
      * - encoding: MUST be set by user. 
-     * - decoding: set by user if known, codec should override / dynamically change if needed
+     * - decoding: set by lavc.
+     * Note, for compatibility it is possible to set this instead of
+     * coded_width/height before decoding
      */
     int width, height;
     
@@ -679,10 +754,7 @@ typedef struct AVCodecContext {
 
     /**
      * pixel format, see PIX_FMT_xxx.
-     * - encoding: FIXME: used by ffmpeg to decide whether an pix_fmt
-     *                    conversion is in order. This only works for
-     *                    codecs with one supported pix_fmt, we should
-     *                    do something for a generic case as well.
+     * - encoding: set by user.
      * - decoding: set by lavc.
      */
     enum PixelFormat pix_fmt;
@@ -714,7 +786,13 @@ typedef struct AVCodecContext {
     /* audio only */
     int sample_rate; ///< samples per sec 
     int channels;
-    int sample_fmt;  ///< sample format, currenly unused 
+
+    /**
+     * audio sample format.
+     * - encoding: set by user.
+     * - decoding: set by lavc.
+     */
+    enum SampleFormat sample_fmt;  ///< sample format, currently unused 
 
     /* the following data should not be initialized */
     int frame_size;     ///< in samples, initialized when calling 'init' 
@@ -798,8 +876,10 @@ typedef struct AVCodecContext {
     /* The RTP callcack: This function is called  */
     /* every time the encoder as a packet to send */
     /* Depends on the encoder if the data starts  */
-    /* with a Start Code (it should) H.263 does   */
-    void (*rtp_callback)(struct AVCodecContext *avctx, void *data, int size, int packet_number); 
+    /* with a Start Code (it should) H.263 does.  */
+    /* mb_nb contains the number of macroblocks   */
+    /* encoded in the RTP payload                 */
+    void (*rtp_callback)(struct AVCodecContext *avctx, void *data, int size, int mb_nb); 
 
     /* statistics, used for 2-pass encoding */
     int mv_bits;
@@ -839,7 +919,7 @@ typedef struct AVCodecContext {
     
     /**
      * workaround bugs in encoders which sometimes cannot be detected automatically.
-     * - encoding: unused
+     * - encoding: set by user
      * - decoding: set by user
      */
     int workaround_bugs;
@@ -857,6 +937,7 @@ typedef struct AVCodecContext {
 #define FF_BUG_EDGE             1024
 #define FF_BUG_HPEL_CHROMA      2048
 #define FF_BUG_DC_CLIP          4096
+#define FF_BUG_MS               8192 ///< workaround various bugs in Microsoft's broken decoders
 //#define FF_BUG_FAKE_SCALABILITY 16 //autodetection should work 100%
         
     /**
@@ -921,8 +1002,8 @@ typedef struct AVCodecContext {
     void (*release_buffer)(struct AVCodecContext *c, AVFrame *pic);
 
     /**
-     * is 1 if the decoded stream contains b frames, 0 otherwise.
-     * - encoding: unused
+     * if 1 the stream has a 1 frame delay during decoding.
+     * - encoding: set by lavc
      * - decoding: set by lavc
      */
     int has_b_frames;
@@ -1096,6 +1177,7 @@ typedef struct AVCodecContext {
 #define FF_IDCT_ALTIVEC      8
 #define FF_IDCT_SH4          9
 #define FF_IDCT_SIMPLEARM    10
+#define FF_IDCT_H264         11
 
     /**
      * slice count.
@@ -1136,6 +1218,7 @@ typedef struct AVCodecContext {
 #define FF_MM_MMXEXT	0x0002 /* SSE integer functions or AMD MMX ext */
 #define FF_MM_SSE	0x0008 /* SSE functions */
 #define FF_MM_SSE2	0x0010 /* PIV SSE2 functions */
+#define FF_MM_3DNOWEXT	0x0020 /* AMD 3DNowExt */
 #endif /* HAVE_MMX */
 
     /**
@@ -1211,14 +1294,14 @@ typedef struct AVCodecContext {
     
     /**
      * minimum MB quantizer.
-     * - encoding: set by user.
+     * - encoding: unused
      * - decoding: unused
      */
     int mb_qmin;
 
     /**
      * maximum MB quantizer.
-     * - encoding: set by user.
+     * - encoding: unused
      * - decoding: unused
      */
     int mb_qmax;
@@ -1257,6 +1340,10 @@ typedef struct AVCodecContext {
 #define FF_CMP_ZERO 7
 #define FF_CMP_VSAD 8
 #define FF_CMP_VSSE 9
+#define FF_CMP_NSSE 10
+#define FF_CMP_W53  11
+#define FF_CMP_W97  12
+#define FF_CMP_DCTMAX 13
 #define FF_CMP_CHROMA 256
     
     /**
@@ -1596,11 +1683,120 @@ typedef struct AVCodecContext {
      int mb_threshold;
 
     /**
-     * 
+     * precision of the intra dc coefficient - 8.
      * - encoding: set by user
      * - decoding: unused
      */
      int intra_dc_precision;
+
+    /**
+     * noise vs. sse weight for the nsse comparison function.
+     * - encoding: set by user
+     * - decoding: unused
+     */
+     int nsse_weight;
+
+    /**
+     * number of macroblock rows at the top which are skipped.
+     * - encoding: unused
+     * - decoding: set by user
+     */
+     int skip_top;
+
+    /**
+     * number of macroblock rows at the bottom which are skipped.
+     * - encoding: unused
+     * - decoding: set by user
+     */
+     int skip_bottom;
+
+    /**
+     * profile
+     * - encoding: set by user
+     * - decoding: set by lavc
+     */
+     int profile;
+#define FF_PROFILE_UNKNOWN -99
+
+    /**
+     * level
+     * - encoding: set by user
+     * - decoding: set by lavc
+     */
+     int level;
+#define FF_LEVEL_UNKNOWN -99
+
+    /**
+     * low resolution decoding. 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: set by user
+     */
+     int lowres;
+
+    /**
+     * bitstream width / height. may be different from width/height if lowres
+     * or other things are used
+     * - encoding: unused
+     * - decoding: set by user before init if known, codec should override / dynamically change if needed
+     */
+    int coded_width, coded_height;
+
+    /**
+     * frame skip threshold
+     * - encoding: set by user
+     * - decoding: unused
+     */
+    int frame_skip_threshold;
+
+    /**
+     * frame skip factor
+     * - encoding: set by user
+     * - decoding: unused
+     */
+    int frame_skip_factor;
+
+    /**
+     * frame skip exponent
+     * - encoding: set by user
+     * - decoding: unused
+     */
+    int frame_skip_exp;
+
+    /**
+     * frame skip comparison function
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int frame_skip_cmp;
+
+    /**
+     * border processing masking. raises the quantizer for mbs on the borders
+     * of the picture.
+     * - encoding: set by user
+     * - decoding: unused
+     */
+    float border_masking;
+
+    /**
+     * minimum MB lagrange multiplier.
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int mb_lmin;
+
+    /**
+     * maximum MB lagrange multiplier.
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int mb_lmax;
+
+    /**
+     * 
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int me_penalty_compensation;
 } AVCodecContext;
 
 
@@ -1640,21 +1836,12 @@ typedef struct AVOption {
 } AVOption;
 
 /**
- * Parse option(s) and sets fields in passed structure
- * @param strct	structure where the parsed results will be written
- * @param list  list with AVOptions
- * @param opts	string with options for parsing
- */
-int avoption_parse(void* strct, const AVOption* list, const char* opts);
-
-
-/**
  * AVCodec.
  */
 typedef struct AVCodec {
     const char *name;
     enum CodecType type;
-    int id;
+    enum CodecID id;
     int priv_data_size;
     int (*init)(AVCodecContext *);
     int (*encode)(AVCodecContext *, uint8_t *buf, int buf_size, void *data);
@@ -1662,7 +1849,7 @@ typedef struct AVCodec {
     int (*decode)(AVCodecContext *, void *outdata, int *outdata_size,
                   uint8_t *buf, int buf_size);
     int capabilities;
-    const AVOption *options;
+    void *dummy; // FIXME remove next time we break binary compatibility
     struct AVCodec *next;
     void (*flush)(AVCodecContext *);
     const AVRational *supported_framerates; ///array of supported framerates, or NULL if any, array is terminated by {0,0}
@@ -1703,9 +1890,12 @@ extern AVCodec ac3_encoder;
 extern AVCodec mp2_encoder;
 extern AVCodec mp3lame_encoder;
 extern AVCodec oggvorbis_encoder;
+extern AVCodec oggtheora_encoder;
 extern AVCodec faac_encoder;
+extern AVCodec xvid_encoder;
 extern AVCodec mpeg1video_encoder;
 extern AVCodec mpeg2video_encoder;
+extern AVCodec h261_encoder;
 extern AVCodec h263_encoder;
 extern AVCodec h263p_encoder;
 extern AVCodec flv_encoder;
@@ -1714,6 +1904,12 @@ extern AVCodec rv20_encoder;
 extern AVCodec dvvideo_encoder;
 extern AVCodec mjpeg_encoder;
 extern AVCodec ljpeg_encoder;
+extern AVCodec png_encoder;
+extern AVCodec ppm_encoder;
+extern AVCodec pgm_encoder;
+extern AVCodec pgmyuv_encoder;
+extern AVCodec pbm_encoder;
+extern AVCodec pam_encoder;
 extern AVCodec mpeg4_encoder;
 extern AVCodec msmpeg4v1_encoder;
 extern AVCodec msmpeg4v2_encoder;
@@ -1721,22 +1917,30 @@ extern AVCodec msmpeg4v3_encoder;
 extern AVCodec wmv1_encoder;
 extern AVCodec wmv2_encoder;
 extern AVCodec huffyuv_encoder;
+extern AVCodec ffvhuff_encoder;
 extern AVCodec h264_encoder;
 extern AVCodec asv1_encoder;
 extern AVCodec asv2_encoder;
 extern AVCodec vcr1_encoder;
 extern AVCodec ffv1_encoder;
+extern AVCodec snow_encoder;
 extern AVCodec mdec_encoder;
 extern AVCodec zlib_encoder;
+extern AVCodec sonic_encoder;
+extern AVCodec sonic_ls_encoder;
 extern AVCodec svq1_encoder;
+extern AVCodec x264_encoder;
 
 extern AVCodec h263_decoder;
+extern AVCodec h261_decoder;
 extern AVCodec mpeg4_decoder;
 extern AVCodec msmpeg4v1_decoder;
 extern AVCodec msmpeg4v2_decoder;
 extern AVCodec msmpeg4v3_decoder;
 extern AVCodec wmv1_decoder;
 extern AVCodec wmv2_decoder;
+extern AVCodec vc9_decoder;
+extern AVCodec wmv3_decoder;
 extern AVCodec mpeg1video_decoder;
 extern AVCodec mpeg2video_decoder;
 extern AVCodec mpegvideo_decoder;
@@ -1745,6 +1949,8 @@ extern AVCodec h263i_decoder;
 extern AVCodec flv_decoder;
 extern AVCodec rv10_decoder;
 extern AVCodec rv20_decoder;
+extern AVCodec rv30_decoder;
+extern AVCodec rv40_decoder;
 extern AVCodec svq1_decoder;
 extern AVCodec svq3_decoder;
 extern AVCodec dvvideo_decoder;
@@ -1753,12 +1959,17 @@ extern AVCodec wmav2_decoder;
 extern AVCodec mjpeg_decoder;
 extern AVCodec mjpegb_decoder;
 extern AVCodec sp5x_decoder;
+extern AVCodec png_decoder;
 extern AVCodec mp2_decoder;
 extern AVCodec mp3_decoder;
+extern AVCodec mp3adu_decoder;
+extern AVCodec mp3on4_decoder;
 extern AVCodec mace3_decoder;
 extern AVCodec mace6_decoder;
 extern AVCodec huffyuv_decoder;
+extern AVCodec ffvhuff_decoder;
 extern AVCodec oggvorbis_decoder;
+extern AVCodec oggtheora_decoder;
 extern AVCodec cyuv_decoder;
 extern AVCodec h264_decoder;
 extern AVCodec indeo3_decoder;
@@ -1775,6 +1986,7 @@ extern AVCodec asv2_decoder;
 extern AVCodec vcr1_decoder;
 extern AVCodec cljr_decoder;
 extern AVCodec ffv1_decoder;
+extern AVCodec snow_decoder;
 extern AVCodec fourxm_decoder;
 extern AVCodec mdec_decoder;
 extern AVCodec roq_decoder;
@@ -1799,8 +2011,21 @@ extern AVCodec ra_288_decoder;
 extern AVCodec roq_dpcm_decoder;
 extern AVCodec interplay_dpcm_decoder;
 extern AVCodec xan_dpcm_decoder;
+extern AVCodec sol_dpcm_decoder;
+extern AVCodec sonic_decoder;
 extern AVCodec qtrle_decoder;
 extern AVCodec flac_decoder;
+extern AVCodec tscc_decoder;
+extern AVCodec ulti_decoder;
+extern AVCodec qdraw_decoder;
+extern AVCodec xl_decoder;
+extern AVCodec qpeg_decoder;
+extern AVCodec shorten_decoder;
+extern AVCodec loco_decoder;
+extern AVCodec wnv1_decoder;
+extern AVCodec aasc_decoder;
+extern AVCodec alac_decoder;
+extern AVCodec ws_snd1_decoder;
 
 /* pcm codecs */
 #define PCM_CODEC(id, name) \
@@ -1830,6 +2055,8 @@ PCM_CODEC(CODEC_ID_ADPCM_XA, adpcm_xa);
 PCM_CODEC(CODEC_ID_ADPCM_ADX, adpcm_adx);
 PCM_CODEC(CODEC_ID_ADPCM_EA, adpcm_ea);
 PCM_CODEC(CODEC_ID_ADPCM_G726, adpcm_g726);
+PCM_CODEC(CODEC_ID_ADPCM_CT, adpcm_ct);
+PCM_CODEC(CODEC_ID_ADPCM_SWF, adpcm_swf);
 
 #undef PCM_CODEC
 
@@ -1839,10 +2066,12 @@ extern AVCodec rawvideo_decoder;
 
 /* the following codecs use external GPL libs */
 extern AVCodec ac3_decoder;
+extern AVCodec dts_decoder;
 
 /* resample.c */
 
 struct ReSampleContext;
+struct AVResampleContext;
 
 typedef struct ReSampleContext ReSampleContext;
 
@@ -1851,6 +2080,11 @@ ReSampleContext *audio_resample_init(int output_channels, int input_channels,
 int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples);
 void audio_resample_close(ReSampleContext *s);
 
+struct AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_length, int log2_phase_count, int linear, double cutoff);
+int av_resample(struct AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx);
+void av_resample_compensate(struct AVResampleContext *c, int sample_delta, int compensation_distance);
+void av_resample_close(struct AVResampleContext *c);
+
 /* YUV420 format is assumed ! */
 
 struct ImgReSampleContext;
@@ -1894,7 +2128,9 @@ int avpicture_layout(const AVPicture* src, int pix_fmt, int width, int height,
 int avpicture_get_size(int pix_fmt, int width, int height);
 void avcodec_get_chroma_sub_sample(int pix_fmt, int *h_shift, int *v_shift);
 const char *avcodec_get_pix_fmt_name(int pix_fmt);
+void avcodec_set_dimensions(AVCodecContext *s, int width, int height);
 enum PixelFormat avcodec_get_pix_fmt(const char* name);
+unsigned int avcodec_pix_fmt_to_codec_tag(enum PixelFormat p);
 
 #define FF_LOSS_RESOLUTION  0x0001 /* loss due to resolution change */
 #define FF_LOSS_DEPTH       0x0002 /* loss due to color depth change */
@@ -1948,6 +2184,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic);
 void avcodec_default_release_buffer(AVCodecContext *s, AVFrame *pic);
 int avcodec_default_reget_buffer(AVCodecContext *s, AVFrame *pic);
 void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height);
+int avcodec_check_dimensions(void *av_log_ctx, unsigned int w, unsigned int h);
 enum PixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum PixelFormat * fmt);
 
 int avcodec_thread_init(AVCodecContext *s, int thread_count);
@@ -1982,6 +2219,8 @@ void avcodec_register_all(void);
 
 void avcodec_flush_buffers(AVCodecContext *avctx);
 
+void avcodec_default_free_buffers(AVCodecContext *s);
+
 /* misc usefull functions */
 
 /**
@@ -1998,87 +2237,16 @@ char av_get_pict_type_char(int pict_type);
 int av_reduce(int *dst_nom, int *dst_den, int64_t nom, int64_t den, int64_t max);
 
 /**
- * rescale a 64bit integer.
+ * rescale a 64bit integer with rounding to nearest.
  * a simple a*b/c isnt possible as it can overflow
  */
 int64_t av_rescale(int64_t a, int64_t b, int64_t c);
 
-
 /**
- * Interface for 0.5.0 version
- *
- * do not even think about it's usage for this moment
- */
-
-typedef struct {
-    /// compressed size used from given memory buffer
-    int size;
-    /// I/P/B frame type
-    int frame_type;
-} avc_enc_result_t;
-
-/**
- * Commands
- * order can't be changed - once it was defined
- */
-typedef enum {
-    // general commands
-    AVC_OPEN_BY_NAME = 0xACA000,
-    AVC_OPEN_BY_CODEC_ID,
-    AVC_OPEN_BY_FOURCC,
-    AVC_CLOSE,
-
-    AVC_FLUSH,
-    // pin - struct { uint8_t* src, uint_t src_size }
-    // pout - struct { AVPicture* img, consumed_bytes,
-    AVC_DECODE,
-    // pin - struct { AVPicture* img, uint8_t* dest, uint_t dest_size }
-    // pout - uint_t used_from_dest_size
-    AVC_ENCODE, 
-
-    // query/get video commands
-    AVC_GET_VERSION = 0xACB000,
-    AVC_GET_WIDTH,
-    AVC_GET_HEIGHT,
-    AVC_GET_DELAY,
-    AVC_GET_QUANT_TABLE,
-    // ...
-
-    // query/get audio commands
-    AVC_GET_FRAME_SIZE = 0xABC000,
-
-    // maybe define some simple structure which
-    // might be passed to the user - but they can't
-    // contain any codec specific parts and these
-    // calls are usualy necessary only few times
-
-    // set video commands
-    AVC_SET_WIDTH = 0xACD000,
-    AVC_SET_HEIGHT,
-
-    // set video encoding commands
-    AVC_SET_FRAME_RATE = 0xACD800,
-    AVC_SET_QUALITY,
-    AVC_SET_HURRY_UP,
-
-    // set audio commands
-    AVC_SET_SAMPLE_RATE = 0xACE000,
-    AVC_SET_CHANNELS,
-
-} avc_cmd_t;
-
-/**
- * \param handle  allocated private structure by libavcodec
- *                for initialization pass NULL - will be returned pout
- *                user is supposed to know nothing about its structure
- * \param cmd     type of operation to be performed
- * \param pint    input parameter
- * \param pout    output parameter
- *
- * \returns  command status - eventually for query command it might return
- * integer resulting value
+ * rescale a 64bit integer with specified rounding.
+ * a simple a*b/c isn't possible as it can overflow
  */
-int avcodec(void* handle, avc_cmd_t cmd, void* pin, void* pout);
+int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding);
 
 /* frame parsing */
 typedef struct AVCodecParserContext {
@@ -2107,7 +2275,7 @@ typedef struct AVCodecParserContext {
 } AVCodecParserContext;
 
 typedef struct AVCodecParser {
-    int codec_ids[3]; /* several codec IDs are permitted */
+    int codec_ids[5]; /* several codec IDs are permitted */
     int priv_data_size;
     int (*parser_init)(AVCodecParserContext *s);
     int (*parser_parse)(AVCodecParserContext *s, 
@@ -2131,8 +2299,11 @@ void av_parser_close(AVCodecParserContext *s);
 
 extern AVCodecParser mpegvideo_parser;
 extern AVCodecParser mpeg4video_parser;
+extern AVCodecParser h261_parser;
 extern AVCodecParser h263_parser;
 extern AVCodecParser h264_parser;
+extern AVCodecParser mjpeg_parser;
+extern AVCodecParser pnm_parser;
 extern AVCodecParser mpegaudio_parser;
 extern AVCodecParser ac3_parser;
 
@@ -2148,6 +2319,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
 /* call av_free_static to release all staticaly allocated tables */
 void av_free_static(void);
 void *av_mallocz_static(unsigned int size);
+void *av_realloc_static(void *ptr, unsigned int size);
 
 /* add by bero : in adx.c */
 int is_adx(const unsigned char *buf,size_t bufsize);
@@ -2164,7 +2336,12 @@ void img_copy(AVPicture *dst, const AVPicture *src,
 #define AV_LOG_INFO 1
 #define AV_LOG_DEBUG 2
 
+#ifdef __GNUC__
 extern void av_log(void*, int level, const char *fmt, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+#else
+extern void av_log(void*, int level, const char *fmt, ...);
+#endif
+
 extern void av_vlog(void*, int level, const char *fmt, va_list);
 extern int av_log_get_level(void);
 extern void av_log_set_level(int);
diff --git a/src/libffmpeg/libavcodec/bitstream.c b/src/libffmpeg/libavcodec/bitstream.c
new file mode 100755
index 000000000..2678772c4
--- /dev/null
+++ b/src/libffmpeg/libavcodec/bitstream.c
@@ -0,0 +1,287 @@
+/*
+ * Common bit i/o utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at>
+ */
+
+/**
+ * @file bitstream.c
+ * bitstream api.
+ */
+ 
+#include "avcodec.h"
+#include "bitstream.h"
+
+void align_put_bits(PutBitContext *s) /* pads the stream with zero bits up to the next byte boundary */
+{
+#ifdef ALT_BITSTREAM_WRITER
+    put_bits(s,(  - s->index) & 7,0); /* index counts the bits written so far */
+#else
+    put_bits(s,s->bit_left & 7,0); /* bit_left: bits still free in the 32 bit buffer */
+#endif
+}
+
+void put_string(PutBitContext * pbc, char *s, int put_zero)
+{
+    /* write each character as one 8 bit symbol; the cast keeps bytes >= 0x80
+       from being sign extended beyond the 8 bits put_bits() is asked to write */
+    for(; *s; s++)
+        put_bits(pbc, 8, (uint8_t)*s);
+    if(put_zero)
+        put_bits(pbc, 8, 0);    /* optional terminating zero byte */
+}
+
+/* bit input functions */
+
+/** 
+ * reads 0-32 bits; more than 17 bits are fetched with two get_bits() calls.
+ */
+unsigned int get_bits_long(GetBitContext *s, int n){
+    int high;
+
+    if(n<=17) return get_bits(s, n);
+    high= get_bits(s, 16) << (n-16);     /* upper 16 bits first */
+    return high | get_bits(s, n-16);     /* then the remaining n-16 bits */
+}
+
+/** 
+ * shows 0-32 bits.
+ */
+unsigned int show_bits_long(GetBitContext *s, int n){
+    GetBitContext saved;
+    int peeked;
+    if(n<=17) return show_bits(s, n);
+    saved= *s;                       /* snapshot of the reader state */
+    peeked= get_bits_long(s, n);     /* this advances s ... */
+    *s= saved;                       /* ... so put the snapshot back */
+    return peeked;
+}
+
+void align_get_bits(GetBitContext *s)
+{
+    const int misalign= get_bits_count(s) & 7;   /* position inside the current byte */
+    if(misalign) skip_bits(s, 8 - misalign);     /* skip what is left of that byte */
+}
+
+int check_marker(GetBitContext *s, const char *msg)
+{
+    const int marker= get_bits1(s);   /* the marker is a single bit */
+
+    if(marker == 0)    /* only reported here, the caller gets the bit back */
+        av_log(NULL, AV_LOG_INFO, "Marker bit missing %s\n", msg);
+    return marker;
+}
+
+/* VLC decoding */
+
+//#define DEBUG_VLC
+
+#define GET_DATA(v, table, i, wrap, size) /* v= entry i of table; entries are wrap bytes apart and size bytes wide */\
+{\
+    const uint8_t *ptr = (const uint8_t *)(table) + (i) * (wrap);\
+    switch(size) {\
+    case 1:\
+        (v) = *(const uint8_t *)ptr;\
+        break;\
+    case 2:\
+        (v) = *(const uint16_t *)ptr;\
+        break;\
+    default:\
+        (v) = *(const uint32_t *)ptr;\
+        break;\
+    }\
+}
+
+
+static int alloc_table(VLC *vlc, int size, int use_static) /* reserves size entries; returns the index of the first one or -1 */
+{
+    const int index = vlc->table_size; /* the new range starts behind the entries in use */
+    void *grown;
+    vlc->table_size += size;
+    if (vlc->table_size <= vlc->table_allocated)
+        return index;
+    vlc->table_allocated += (1 << vlc->bits);
+    if(use_static) {
+        vlc->table = av_realloc_static(vlc->table, sizeof(VLC_TYPE) * 2 * vlc->table_allocated);
+        return vlc->table ? index : -1;
+    }
+    grown = av_realloc(vlc->table, sizeof(VLC_TYPE) * 2 * vlc->table_allocated);
+    if (!grown)
+        return -1; /* old table kept instead of leaked: init_vlc() frees it on failure */
+    vlc->table = grown;
+    return index;
+}
+
+static int build_table(VLC *vlc, int table_nb_bits,
+                       int nb_codes,
+                       const void *bits, int bits_wrap, int bits_size,
+                       const void *codes, int codes_wrap, int codes_size,
+                       uint32_t code_prefix, int n_prefix, int use_static) /* returns the index of the built table in vlc->table, or -1 */
+{
+    int i, j, k, n, table_size, table_index, nb, n1, index;
+    uint32_t code;
+    VLC_TYPE (*table)[2];
+
+    table_size = 1 << table_nb_bits;
+    table_index = alloc_table(vlc, table_size, use_static);
+#ifdef DEBUG_VLC
+    printf("new table index=%d size=%d code_prefix=%x n=%d\n",
+           table_index, table_size, code_prefix, n_prefix);
+#endif
+    if (table_index < 0)
+        return -1;
+    table = &vlc->table[table_index];
+
+    for(i=0;i<table_size;i++) {
+        table[i][1] = 0; //bits
+        table[i][0] = -1; //codes
+    }
+
+    /* first pass: map codes and compute auxiliary table sizes */
+    for(i=0;i<nb_codes;i++) {
+        GET_DATA(n, bits, i, bits_wrap, bits_size);
+        GET_DATA(code, codes, i, codes_wrap, codes_size);
+        /* we accept tables with holes */
+        if (n <= 0)
+            continue;
+#if defined(DEBUG_VLC) && 0
+        printf("i=%d n=%d code=0x%x\n", i, n, code);
+#endif
+        /* if code matches the prefix, it is in the table */
+        n -= n_prefix; /* bits of this code that remain after the prefix */
+        if (n > 0 && (code >> n) == code_prefix) {
+            if (n <= table_nb_bits) {
+                /* no need to add another table */
+                j = (code << (table_nb_bits - n)) & (table_size - 1);
+                nb = 1 << (table_nb_bits - n);
+                for(k=0;k<nb;k++) {
+#ifdef DEBUG_VLC
+                    av_log(NULL, AV_LOG_DEBUG, "%4x: code=%d n=%d\n",
+                           j, i, n);
+#endif
+                    if (table[j][1] /*bits*/ != 0) {
+                        av_log(NULL, AV_LOG_ERROR, "incorrect codes\n");
+                        return -1;
+                    }
+                    table[j][1] = n; //bits
+                    table[j][0] = i; //code
+                    j++;
+                }
+            } else {
+                n -= table_nb_bits;
+                j = (code >> n) & ((1 << table_nb_bits) - 1);
+#ifdef DEBUG_VLC
+                printf("%4x: n=%d (subtable)\n",
+                       j, n);
+#endif
+                /* compute table size */
+                n1 = -table[j][1]; //bits (stored negated: largest subtable size seen so far)
+                if (n > n1)
+                    n1 = n;
+                table[j][1] = -n1; //bits
+            }
+        }
+    }
+
+    /* second pass : fill auxiliary tables recursively */
+    for(i=0;i<table_size;i++) {
+        n = table[i][1]; //bits
+        if (n < 0) {
+            n = -n;
+            if (n > table_nb_bits) {
+                n = table_nb_bits;
+                table[i][1] = -n; //bits
+            }
+            index = build_table(vlc, n, nb_codes,
+                                bits, bits_wrap, bits_size,
+                                codes, codes_wrap, codes_size,
+                                (code_prefix << table_nb_bits) | i,
+                                n_prefix + table_nb_bits, use_static);
+            if (index < 0)
+                return -1;
+            /* note: realloc has been done, so reload tables */
+            table = &vlc->table[table_index];
+            table[i][0] = index; //code
+        }
+    }
+    return table_index;
+}
+
+
+/* Build VLC decoding tables suitable for use with get_vlc().
+
+   'nb_bits' sets the decoding table size (2^nb_bits entries). The
+   bigger it is, the faster is the decoding. But it should not be too
+   big to save memory and L1 cache. '9' is a good compromise.
+   
+   'nb_codes' : number of vlcs codes
+   'bits' : table which gives the size (in bits) of each vlc code.
+   'codes' : table which gives the bit pattern of each vlc code.
+   'xxx_wrap' : give the number of bytes between each entry of the
+   'bits' or 'codes' tables.
+   'xxx_size' : gives the number of bytes of each entry of the 'bits'
+   or 'codes' tables.
+
+   'wrap' and 'size' allows to use any memory configuration and types
+   (byte/word/long) to store the 'bits' and 'codes' tables.  
+
+   'use_static' should be set to 1 for tables, which should be freed
+   with av_free_static(), 0 if free_vlc() will be used.
+
+   Returns 0 on success, -1 on failure (the VLC is then left empty).
+*/
+int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
+             const void *bits, int bits_wrap, int bits_size,
+             const void *codes, int codes_wrap, int codes_size,
+             int use_static)
+{
+    vlc->bits = nb_bits;
+    if(!use_static) {
+        vlc->table = NULL;
+        vlc->table_allocated = 0;
+        vlc->table_size = 0;
+    } else {
+        /* Static tables are initially always NULL, return
+           if vlc->table != NULL to avoid double allocation */
+        if(vlc->table)
+            return 0;
+    }
+
+#ifdef DEBUG_VLC
+    printf("build table nb_codes=%d\n", nb_codes);
+#endif
+    if (build_table(vlc, nb_bits, nb_codes,
+                    bits, bits_wrap, bits_size,
+                    codes, codes_wrap, codes_size,
+                    0, 0, use_static) < 0) {
+        av_free(vlc->table);
+        /* reset, else a static VLC would look initialized on the next call */
+        vlc->table = NULL;
+        vlc->table_size = vlc->table_allocated = 0;
+        return -1;
+    }
+    return 0;
+}
+
+
+void free_vlc(VLC *vlc) /* releases a table built by init_vlc() with use_static == 0 */
+{
+    av_free(vlc->table);
+    vlc->table = NULL; /* leave no dangling pointer behind */
+}
diff --git a/src/libffmpeg/libavcodec/bitstream.h b/src/libffmpeg/libavcodec/bitstream.h
new file mode 100644
index 000000000..fd69915d8
--- /dev/null
+++ b/src/libffmpeg/libavcodec/bitstream.h
@@ -0,0 +1,854 @@
+/**
+ * @file bitstream.h
+ * bitstream api header.
+ */
+
+#ifndef BITSTREAM_H
+#define BITSTREAM_H
+
+//#define ALT_BITSTREAM_WRITER
+//#define ALIGNED_BITSTREAM_WRITER
+
+#define ALT_BITSTREAM_READER
+//#define LIBMPEG2_BITSTREAM_READER
+//#define A32_BITSTREAM_READER
+#define LIBMPEG2_BITSTREAM_READER_HACK //add BERO
+ 
+extern const uint8_t ff_reverse[256];
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+// avoid +32 for shift optimization (gcc should do that ...)
+static inline  int32_t NEG_SSR32( int32_t a, int8_t s){ /* signed variant; the shift count is passed as -s rather than 32-s */
+    asm ("sarl %1, %0\n\t"
+         : "+r" (a)
+         : "ic" ((uint8_t)(-s))
+    );
+    return a;
+}
+static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ /* unsigned variant of the above */
+    asm ("shrl %1, %0\n\t"
+         : "+r" (a)
+         : "ic" ((uint8_t)(-s))
+    );
+    return a;
+}
+#else /* portable form: a shifted right by (32 - s) bits */
+#    define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
+#    define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
+#endif /* NOTE(review): the asm form presumably relies on the CPU using only the low 5 bits of the count - confirm */
+
+/* bit output */
+
+/* buf and buf_end must be present and used by every alternative writer. */
+typedef struct PutBitContext {
+#ifdef ALT_BITSTREAM_WRITER
+    uint8_t *buf, *buf_end;
+    int index; /* number of bits written so far, see put_bits_count() */
+#else
+    uint32_t bit_buf; /* bits collected but not yet stored to the buffer */
+    int bit_left; /* free bits in bit_buf, 32 when it is empty */
+    uint8_t *buf, *buf_ptr, *buf_end; /* start, current write position and end of the buffer */
+#endif
+} PutBitContext;
+
+static inline void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size) /* starts writing at the beginning of buffer (buffer_size bytes) */
+{
+    s->buf = buffer;
+    s->buf_end = s->buf + buffer_size;
+#ifdef ALT_BITSTREAM_WRITER
+    s->index=0;
+    ((uint32_t*)(s->buf))[0]=0; /* this writer ORs bits into the buffer, so the first word has to start out zero */
+//    memset(buffer, 0, buffer_size);
+#else
+    s->buf_ptr = s->buf;
+    s->bit_left=32; /* all 32 bits of bit_buf are free */
+    s->bit_buf=0;
+#endif
+}
+
+/* return the number of bits output */
+static inline int put_bits_count(PutBitContext *s)
+{
+#ifdef ALT_BITSTREAM_WRITER
+    return s->index;
+#else
+    return (s->buf_ptr - s->buf) * 8 + 32 - s->bit_left; /* bytes already stored plus the bits pending in bit_buf */
+#endif
+}
+
+void align_put_bits(PutBitContext *s); /* declared first: flush_put_bits() calls it with ALT_BITSTREAM_WRITER */
+void put_string(PutBitContext * pbc, char *s, int put_zero);
+
+/* pad the end of the output stream with zeros */
+static inline void flush_put_bits(PutBitContext *s)
+{
+#ifdef ALT_BITSTREAM_WRITER
+    align_put_bits(s);
+#else
+    s->bit_buf<<= s->bit_left;
+    while (s->bit_left < 32) {
+        /* XXX: should test end of buffer */
+        *s->buf_ptr++=s->bit_buf >> 24;
+        s->bit_buf<<=8;
+        s->bit_left+=8;
+    }
+    s->bit_left=32;
+    s->bit_buf=0;
+#endif
+}
+
+/* bit input */
+/* buffer, buffer_end and size_in_bits must be present and used by every reader */
+typedef struct GetBitContext {
+    const uint8_t *buffer, *buffer_end;
+#ifdef ALT_BITSTREAM_READER
+    int index; /* NOTE(review): presumably the read position in bits - confirm against get_bits() */
+#elif defined LIBMPEG2_BITSTREAM_READER
+    uint8_t *buffer_ptr;
+    uint32_t cache;
+    int bit_count;
+#elif defined A32_BITSTREAM_READER
+    uint32_t *buffer_ptr;
+    uint32_t cache0;
+    uint32_t cache1;
+    int bit_count;
+#endif
+    int size_in_bits; /* NOTE(review): presumably the size of buffer in bits - confirm against init_get_bits() */
+} GetBitContext;
+
+#define VLC_TYPE int16_t
+
+typedef struct VLC {
+    int bits; ///< nb_bits given to init_vlc(): log2 of the first level table size
+    VLC_TYPE (*table)[2]; ///< code, bits
+    int table_size, table_allocated; ///< entries in use / entries allocated in table
+} VLC;
+
+typedef struct RL_VLC_ELEM { /* NOTE(review): presumably one run/level table entry - its users are not visible here */
+    int16_t level;
+    int8_t len;
+    uint8_t run;
+} RL_VLC_ELEM;
+
+#ifdef ARCH_SPARC
+#define UNALIGNED_STORES_ARE_BAD
+#endif
+
+/* used to avoid misaligned exceptions on some archs (alpha, ...) */
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#    define unaligned32(a) (*(const uint32_t*)(a)) /* plain load on x86 */
+#else
+#    ifdef __GNUC__
+static inline uint32_t unaligned32(const void *v) {
+    struct Unaligned {
+	uint32_t i;
+    } __attribute__((packed)); /* packed: the compiler may not assume natural alignment */
+
+    return ((const struct Unaligned *) v)->i;
+}
+#    elif defined(__DECC)
+static inline uint32_t unaligned32(const void *v) {
+    return *(const __unaligned uint32_t *) v;
+}
+#    else
+static inline uint32_t unaligned32(const void *v) { /* fallback without any alignment annotation */
+    return *(const uint32_t *) v;
+}
+#    endif
+#endif //!ARCH_X86
+
+#ifndef ALT_BITSTREAM_WRITER
+static inline void put_bits(PutBitContext *s, int n, unsigned int value) /* appends value to the stream as an n bit field */
+{
+    unsigned int bit_buf;
+    int bit_left;
+
+#ifdef STATS
+    st_out_bit_counts[st_current_index] += n;
+#endif
+    //    printf("put_bits=%d %x\n", n, value);
+    assert(n == 32 || value < (1U << n)); /* value has to fit into n bits */
+    
+    bit_buf = s->bit_buf;
+    bit_left = s->bit_left;
+
+    //    printf("n=%d value=%x cnt=%d buf=%x\n", n, value, bit_cnt, bit_buf);
+    /* XXX: optimize */
+    if (n < bit_left) { /* the field fits into the pending word */
+        bit_buf = (bit_buf<<n) | value;
+        bit_left-=n;
+    } else { /* the pending word gets full: complete it, store it, keep the remaining bits */
+	bit_buf<<=bit_left; /* NOTE(review): shifts by 32 when the word is empty and n == 32 */
+        bit_buf |= value >> (n - bit_left);
+#ifdef UNALIGNED_STORES_ARE_BAD
+        if (3 & (intptr_t) s->buf_ptr) { /* unaligned pointer: store byte by byte, most significant first */
+            s->buf_ptr[0] = bit_buf >> 24;
+            s->buf_ptr[1] = bit_buf >> 16;
+            s->buf_ptr[2] = bit_buf >>  8;
+            s->buf_ptr[3] = bit_buf      ;
+        } else
+#endif
+        *(uint32_t *)s->buf_ptr = be2me_32(bit_buf);
+        //printf("bitbuf = %08x\n", bit_buf);
+        s->buf_ptr+=4;
+	bit_left+=32 - n;
+        bit_buf = value; /* bits above the new fill level are shifted out before they are stored */
+    }
+
+    s->bit_buf = bit_buf;
+    s->bit_left = bit_left;
+}
+#endif
+
+
+#ifdef ALT_BITSTREAM_WRITER
+static inline void put_bits(PutBitContext *s, int n, unsigned int value) /* appends value as an n bit field at bit position s->index */
+{
+#    ifdef ALIGNED_BITSTREAM_WRITER
+#        if defined(ARCH_X86) || defined(ARCH_X86_64)
+    asm volatile(
+	"movl %0, %%ecx			\n\t"
+	"xorl %%eax, %%eax		\n\t"
+	"shrdl %%cl, %1, %%eax		\n\t"
+	"shrl %%cl, %1			\n\t"
+	"movl %0, %%ecx			\n\t"
+	"shrl $3, %%ecx			\n\t"
+	"andl $0xFFFFFFFC, %%ecx	\n\t"
+	"bswapl %1			\n\t"
+	"orl %1, (%2, %%ecx)		\n\t"
+	"bswapl %%eax			\n\t"
+	"addl %3, %0			\n\t"
+	"movl %%eax, 4(%2, %%ecx)	\n\t"
+	: "=&r" (s->index), "=&r" (value)
+	: "r" (s->buf), "r" (n), "0" (s->index), "1" (value<<(-n))
+	: "%eax", "%ecx"
+    );
+#        else
+    int index= s->index;
+    uint32_t *ptr= ((uint32_t *)s->buf)+(index>>5); /* 32 bit word that holds bit position index */
+    
+    value<<= 32-n; /* move the field to the top of the word */
+    
+    ptr[0] |= be2me_32(value>>(index&31));
+    ptr[1]  = be2me_32(value<<(32-(index&31))); /* NOTE(review): shifts by 32 when index is a multiple of 32 */
+//if(n>24) printf("%d %d\n", n, value);
+    index+= n;
+    s->index= index;
+#        endif
+#    else //ALIGNED_BITSTREAM_WRITER
+#        if defined(ARCH_X86) || defined(ARCH_X86_64)
+    asm volatile(
+	"movl $7, %%ecx			\n\t"
+	"andl %0, %%ecx			\n\t"
+	"addl %3, %%ecx			\n\t"
+	"negl %%ecx			\n\t"
+	"shll %%cl, %1			\n\t"
+	"bswapl %1			\n\t"
+	"movl %0, %%ecx			\n\t"
+	"shrl $3, %%ecx			\n\t"
+	"orl %1, (%%ecx, %2)		\n\t"
+	"addl %3, %0			\n\t"
+	"movl $0, 4(%%ecx, %2)		\n\t"
+	: "=&r" (s->index), "=&r" (value)
+	: "r" (s->buf), "r" (n), "0" (s->index), "1" (value)
+	: "%ecx"
+    );
+#        else
+    int index= s->index;
+    uint32_t *ptr= (uint32_t*)(((uint8_t *)s->buf)+(index>>3)); /* word starting at the byte that holds bit position index */
+    
+    ptr[0] |= be2me_32(value<<(32-n-(index&7) ));
+    ptr[1] = 0; /* zero the following word, later calls OR their bits into it */
+//if(n>24) printf("%d %d\n", n, value);
+    index+= n;
+    s->index= index;
+#        endif
+#    endif //!ALIGNED_BITSTREAM_WRITER
+}
+#endif
+
+
+static inline uint8_t* pbBufPtr(PutBitContext *s)
+{
+#ifdef ALT_BITSTREAM_WRITER
+	return s->buf + (s->index>>3);
+#else
+	return s->buf_ptr;
+#endif
+}
+
+/**
+ *
+ * PutBitContext must be flushed & aligned to a byte boundary before calling this.
+ */
+static inline void skip_put_bytes(PutBitContext *s, int n){
+        assert((put_bits_count(s)&7)==0);
+#ifdef ALT_BITSTREAM_WRITER
+        FIXME may need some cleaning of the buffer
+	s->index += n<<3;
+#else
+        assert(s->bit_left==32);
+	s->buf_ptr += n;
+#endif    
+}
+
+/**
+ * Skips the given number of bits without writing any data for them.
+ * Must only be used if the actual values in the bitstream don't matter.
+ */
+static inline void skip_put_bits(PutBitContext *s, int n){
+#ifdef ALT_BITSTREAM_WRITER
+    s->index += n;
+#else
+    s->bit_left -= n;                /* may go <= 0 when whole 32-bit words are skipped */
+    s->buf_ptr-= 4*(s->bit_left>>5); /* buf_ptr counts bytes: move 4 bytes per 32-bit word */
+    s->bit_left &= 31;
+#endif        
+}
+
+/**
+ * Changes the end of the buffer.
+ */
+static inline void set_put_bits_buffer_size(PutBitContext *s, int size){
+    s->buf_end= s->buf + size;
+}
+
+/* Bitstream reader API docs:
+name
+    arbitrary name which is used as prefix for the internal variables
+
+gb
+    getbitcontext
+
+OPEN_READER(name, gb)
+    loads gb into local variables
+
+CLOSE_READER(name, gb)
+    stores local vars in gb
+
+UPDATE_CACHE(name, gb)
+    refills the internal cache from the bitstream
+    after this call at least MIN_CACHE_BITS will be available,
+
+GET_CACHE(name, gb)
+    will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit)
+
+SHOW_UBITS(name, gb, num)
+    will return the next num bits
+
+SHOW_SBITS(name, gb, num)
+    will return the next num bits and do sign extension
+
+SKIP_BITS(name, gb, num)
+    will skip over the next num bits
+    note, this is equivalent to SKIP_CACHE; SKIP_COUNTER
+
+SKIP_CACHE(name, gb, num)
+    will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER)
+
+SKIP_COUNTER(name, gb, num)
+    will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS)
+
+LAST_SKIP_CACHE(name, gb, num)
+    will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing
+
+LAST_SKIP_BITS(name, gb, num)
+    is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER
+
+for examples see get_bits, show_bits, skip_bits, get_vlc
+*/
+
+static inline int unaligned32_be(const void *v) /* read a big-endian 32-bit word from v */
+{
+#ifdef CONFIG_ALIGN
+	const uint8_t *p=v; /* bytewise access, so no aligned 32-bit load is issued */
+	return (int)(((uint32_t)((p[0]<<8) | p[1])<<16) | (p[2]<<8) | (p[3])); /* unsigned: no shift into int's sign bit */
+#else
+	return be2me_32( unaligned32(v)); //original
+#endif
+}
+
+#ifdef ALT_BITSTREAM_READER
+#   define MIN_CACHE_BITS 25
+
+#   define OPEN_READER(name, gb)\
+        int name##_index= (gb)->index;\
+        int name##_cache= 0;\
+
+#   define CLOSE_READER(name, gb)\
+        (gb)->index= name##_index;\
+
+#   define UPDATE_CACHE(name, gb)\
+        name##_cache= unaligned32_be( ((const uint8_t *)(gb)->buffer)+(name##_index>>3) ) << (name##_index&0x07);\
+
+#   define SKIP_CACHE(name, gb, num)\
+        name##_cache <<= (num);\
+
+// FIXME name?
+#   define SKIP_COUNTER(name, gb, num)\
+        name##_index += (num);\
+
+#   define SKIP_BITS(name, gb, num)\
+        {\
+            SKIP_CACHE(name, gb, num)\
+            SKIP_COUNTER(name, gb, num)\
+        }\
+
+#   define LAST_SKIP_BITS(name, gb, num) SKIP_COUNTER(name, gb, num)
+#   define LAST_SKIP_CACHE(name, gb, num) ;
+
+#   define SHOW_UBITS(name, gb, num)\
+        NEG_USR32(name##_cache, num)
+
+#   define SHOW_SBITS(name, gb, num)\
+        NEG_SSR32(name##_cache, num)
+
+#   define GET_CACHE(name, gb)\
+        ((uint32_t)name##_cache)
+
+static inline int get_bits_count(GetBitContext *s){
+    return s->index;
+}
+#elif defined LIBMPEG2_BITSTREAM_READER
+//libmpeg2 like reader
+
+#   define MIN_CACHE_BITS 17
+
+#   define OPEN_READER(name, gb)\
+        int name##_bit_count=(gb)->bit_count;\
+        int name##_cache= (gb)->cache;\
+        uint8_t * name##_buffer_ptr=(gb)->buffer_ptr;\
+
+#   define CLOSE_READER(name, gb)\
+        (gb)->bit_count= name##_bit_count;\
+        (gb)->cache= name##_cache;\
+        (gb)->buffer_ptr= name##_buffer_ptr;\
+
+#ifdef LIBMPEG2_BITSTREAM_READER_HACK
+
+#   define UPDATE_CACHE(name, gb)\
+    if(name##_bit_count >= 0){\
+        name##_cache+= (int)be2me_16(*(uint16_t*)name##_buffer_ptr) << name##_bit_count;\
+        name##_buffer_ptr += 2;\
+        name##_bit_count-= 16;\
+    }\
+
+#else
+
+#   define UPDATE_CACHE(name, gb)\
+    if(name##_bit_count >= 0){\
+        name##_cache+= ((name##_buffer_ptr[0]<<8) + name##_buffer_ptr[1]) << name##_bit_count;\
+        name##_buffer_ptr+=2;\
+        name##_bit_count-= 16;\
+    }\
+
+#endif
+
+#   define SKIP_CACHE(name, gb, num)\
+        name##_cache <<= (num);\
+
+#   define SKIP_COUNTER(name, gb, num)\
+        name##_bit_count += (num);\
+
+#   define SKIP_BITS(name, gb, num)\
+        {\
+            SKIP_CACHE(name, gb, num)\
+            SKIP_COUNTER(name, gb, num)\
+        }\
+
+#   define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num)
+#   define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num)
+
+#   define SHOW_UBITS(name, gb, num)\
+        NEG_USR32(name##_cache, num)
+
+#   define SHOW_SBITS(name, gb, num)\
+        NEG_SSR32(name##_cache, num)
+
+#   define GET_CACHE(name, gb)\
+        ((uint32_t)name##_cache)
+
+static inline int get_bits_count(GetBitContext *s){
+    return (s->buffer_ptr - s->buffer)*8 - 16 + s->bit_count;
+}
+
+#elif defined A32_BITSTREAM_READER
+
+#   define MIN_CACHE_BITS 32
+
+#   define OPEN_READER(name, gb)\
+        int name##_bit_count=(gb)->bit_count;\
+        uint32_t name##_cache0= (gb)->cache0;\
+        uint32_t name##_cache1= (gb)->cache1;\
+        uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\
+
+#   define CLOSE_READER(name, gb)\
+        (gb)->bit_count= name##_bit_count;\
+        (gb)->cache0= name##_cache0;\
+        (gb)->cache1= name##_cache1;\
+        (gb)->buffer_ptr= name##_buffer_ptr;\
+
+#   define UPDATE_CACHE(name, gb)\
+    if(name##_bit_count > 0){\
+        const uint32_t next= be2me_32( *name##_buffer_ptr );\
+        name##_cache0 |= NEG_USR32(next,name##_bit_count);\
+        name##_cache1 |= next<<name##_bit_count;\
+        name##_buffer_ptr++;\
+        name##_bit_count-= 32;\
+    }\
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#   define SKIP_CACHE(name, gb, num)\
+        asm(\
+            "shldl %2, %1, %0		\n\t"\
+            "shll %2, %1		\n\t"\
+            : "+r" (name##_cache0), "+r" (name##_cache1)\
+            : "Ic" ((uint8_t)num)\
+           );
+#else
+#   define SKIP_CACHE(name, gb, num)\
+        name##_cache0 <<= (num);\
+        name##_cache0 |= NEG_USR32(name##_cache1,num);\
+        name##_cache1 <<= (num);
+#endif
+
+#   define SKIP_COUNTER(name, gb, num)\
+        name##_bit_count += (num);\
+
+#   define SKIP_BITS(name, gb, num)\
+        {\
+            SKIP_CACHE(name, gb, num)\
+            SKIP_COUNTER(name, gb, num)\
+        }\
+
+#   define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num)
+#   define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num)
+
+#   define SHOW_UBITS(name, gb, num)\
+        NEG_USR32(name##_cache0, num)
+
+#   define SHOW_SBITS(name, gb, num)\
+        NEG_SSR32(name##_cache0, num)
+
+#   define GET_CACHE(name, gb)\
+        (name##_cache0)
+
+static inline int get_bits_count(GetBitContext *s){
+    return ((uint8_t*)s->buffer_ptr - s->buffer)*8 - 32 + s->bit_count;
+}
+
+#endif
+
+/**
+ * read mpeg1 dc style vlc (sign bit + mantissa with no MSB).
+ * if MSB not set it is negative 
+ * @param n length in bits
+ * @author BERO  
+ */
+static inline int get_xbits(GetBitContext *s, int n){
+    register int tmp;
+    register int32_t cache;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    cache = GET_CACHE(re,s);
+    if ((int32_t)cache<0) { //MSB=1
+        tmp = NEG_USR32(cache,n);
+    } else {
+    //   tmp = (-1<<n) | NEG_USR32(cache,n) + 1; mpeg12.c algo
+    //   tmp = - (NEG_USR32(cache,n) ^ ((1 << n) - 1)); h263.c algo
+        tmp = - NEG_USR32(~cache,n);
+    }
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+    return tmp;
+}
+
+static inline int get_sbits(GetBitContext *s, int n){
+    register int tmp;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    tmp= SHOW_SBITS(re, s, n);
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+    return tmp;
+}
+
+/**
+ * reads 0-17 bits.
+ * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't
+ */
+static inline unsigned int get_bits(GetBitContext *s, int n){
+    register int tmp;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    tmp= SHOW_UBITS(re, s, n);
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+    return tmp;
+}
+
+unsigned int get_bits_long(GetBitContext *s, int n);
+
+/**
+ * shows 0-17 bits.
+ * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't
+ */
+static inline unsigned int show_bits(GetBitContext *s, int n){
+    register int tmp;
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    tmp= SHOW_UBITS(re, s, n);
+//    CLOSE_READER(re, s)
+    return tmp;
+}
+
+unsigned int show_bits_long(GetBitContext *s, int n);
+
+static inline void skip_bits(GetBitContext *s, int n){
+ //Note gcc seems to optimize this to s->index+=n for the ALT_READER :))
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+    LAST_SKIP_BITS(re, s, n)
+    CLOSE_READER(re, s)
+}
+
+static inline unsigned int get_bits1(GetBitContext *s){
+#ifdef ALT_BITSTREAM_READER
+    int index= s->index;
+    uint8_t result= s->buffer[ index>>3 ];
+    result<<= (index&0x07);
+    result>>= 8 - 1;
+    index++;
+    s->index= index;
+
+    return result;
+#else
+    return get_bits(s, 1);
+#endif
+}
+
+static inline unsigned int show_bits1(GetBitContext *s){
+    return show_bits(s, 1);
+}
+
+static inline void skip_bits1(GetBitContext *s){
+    skip_bits(s, 1);
+}
+
+/**
+ * init GetBitContext.
+ * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger than the actual read bits
+ * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end
+ * @param bit_size the size of the buffer in bits
+ */
+static inline void init_get_bits(GetBitContext *s,
+                   const uint8_t *buffer, int bit_size)
+{
+    const int buffer_size= (bit_size+7)>>3; /* size in bytes, rounded up */
+
+    s->buffer= buffer;
+    s->size_in_bits= bit_size;
+    s->buffer_end= buffer + buffer_size;
+#ifdef ALT_BITSTREAM_READER
+    s->index=0;
+#elif defined LIBMPEG2_BITSTREAM_READER
+#ifdef LIBMPEG2_BITSTREAM_READER_HACK
+  if ((intptr_t)buffer&1) { /* intptr_t: casting a pointer to int truncates on 64-bit targets */
+     /* word alignment */
+    s->cache = (*buffer++)<<24;
+    s->buffer_ptr = buffer;
+    s->bit_count = 16-8;
+  } else
+#endif
+  {
+    s->buffer_ptr = buffer;
+    s->bit_count = 16;
+    s->cache = 0;
+  }
+#elif defined A32_BITSTREAM_READER
+    s->buffer_ptr = (uint32_t*)buffer;
+    s->bit_count = 32;
+    s->cache0 = 0;
+    s->cache1 = 0;
+#endif
+    {
+        OPEN_READER(re, s)
+        UPDATE_CACHE(re, s)
+        UPDATE_CACHE(re, s)
+        CLOSE_READER(re, s)
+    }
+#ifdef A32_BITSTREAM_READER
+    s->cache1 = 0;
+#endif
+}
+
+int check_marker(GetBitContext *s, const char *msg);
+void align_get_bits(GetBitContext *s);
+int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
+             const void *bits, int bits_wrap, int bits_size,
+             const void *codes, int codes_wrap, int codes_size,
+             int use_static);
+void free_vlc(VLC *vlc);
+
+/**
+ *
+ * if the vlc code is invalid and max_depth=1 then no bits will be removed
+ * if the vlc code is invalid and max_depth>1 then the number of bits removed
+ * is undefined
+ */
+#define GET_VLC(code, name, gb, table, bits, max_depth)\
+{\
+    int n, index, nb_bits;\
+\
+    index= SHOW_UBITS(name, gb, bits);\
+    code = table[index][0];\
+    n    = table[index][1];\
+\
+    if(max_depth > 1 && n < 0){\
+        LAST_SKIP_BITS(name, gb, bits)\
+        UPDATE_CACHE(name, gb)\
+\
+        nb_bits = -n;\
+\
+        index= SHOW_UBITS(name, gb, nb_bits) + code;\
+        code = table[index][0];\
+        n    = table[index][1];\
+        if(max_depth > 2 && n < 0){\
+            LAST_SKIP_BITS(name, gb, nb_bits)\
+            UPDATE_CACHE(name, gb)\
+\
+            nb_bits = -n;\
+\
+            index= SHOW_UBITS(name, gb, nb_bits) + code;\
+            code = table[index][0];\
+            n    = table[index][1];\
+        }\
+    }\
+    SKIP_BITS(name, gb, n)\
+}
+
+#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
+{\
+    int n, index, nb_bits;\
+\
+    index= SHOW_UBITS(name, gb, bits);\
+    level = table[index].level;\
+    n     = table[index].len;\
+\
+    if(max_depth > 1 && n < 0){\
+        SKIP_BITS(name, gb, bits)\
+        if(need_update){\
+            UPDATE_CACHE(name, gb)\
+        }\
+\
+        nb_bits = -n;\
+\
+        index= SHOW_UBITS(name, gb, nb_bits) + level;\
+        level = table[index].level;\
+        n     = table[index].len;\
+    }\
+    run= table[index].run;\
+    SKIP_BITS(name, gb, n)\
+}
+
+// deprecated, dont use get_vlc for new code, use get_vlc2 instead or use GET_VLC directly
+static inline int get_vlc(GetBitContext *s, VLC *vlc)
+{
+    int code;
+    VLC_TYPE (*table)[2]= vlc->table;
+    
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+
+    GET_VLC(code, re, s, table, vlc->bits, 3)    
+
+    CLOSE_READER(re, s)
+    return code;
+}
+
+/**
+ * parses a vlc code, faster than get_vlc()
+ * @param bits is the number of bits which will be read at once, must be 
+ *             identical to nb_bits in init_vlc()
+ * @param max_depth is the number of times bits bits must be read to completely
+ *                  read the longest vlc code 
+ *                  = (max_vlc_length + bits - 1) / bits
+ */
+static always_inline int get_vlc2(GetBitContext *s, VLC_TYPE (*table)[2],
+                                  int bits, int max_depth)
+{
+    int code;
+    
+    OPEN_READER(re, s)
+    UPDATE_CACHE(re, s)
+
+    GET_VLC(code, re, s, table, bits, max_depth)
+
+    CLOSE_READER(re, s)
+    return code;
+}
+
+//#define TRACE
+
+#ifdef TRACE
+#include "avcodec.h"
+static inline void print_bin(int bits, int n){
+    int i;
+    
+    for(i=n-1; i>=0; i--){
+        av_log(NULL, AV_LOG_DEBUG, "%d", (bits>>i)&1);
+    }
+    for(i=n; i<24; i++)
+        av_log(NULL, AV_LOG_DEBUG, " ");
+}
+
+static inline int get_bits_trace(GetBitContext *s, int n, char *file, const char *func, int line){
+    int r= get_bits(s, n);
+    
+    print_bin(r, n);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d bit @%5d in %s %s:%d\n", r, n, r, get_bits_count(s)-n, file, func, line);
+    return r;
+}
+static inline int get_vlc_trace(GetBitContext *s, VLC_TYPE (*table)[2], int bits, int max_depth, char *file, const char *func, int line){
+    int show= show_bits(s, 24);
+    int pos= get_bits_count(s);
+    int r= get_vlc2(s, table, bits, max_depth);
+    int len= get_bits_count(s) - pos;
+    int bits2= show>>(24-len);
+    
+    print_bin(bits2, len);
+    
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d vlc @%5d in %s %s:%d\n", bits2, len, r, pos, file, func, line);
+    return r;
+}
+static inline int get_xbits_trace(GetBitContext *s, int n, char *file, const char *func, int line){
+    int show= show_bits(s, n);
+    int r= get_xbits(s, n);
+    
+    print_bin(show, n);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d xbt @%5d in %s %s:%d\n", show, n, r, get_bits_count(s)-n, file, func, line);
+    return r;
+}
+
+#define get_bits(s, n)  get_bits_trace(s, n, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_bits1(s)    get_bits_trace(s, 1, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_xbits(s, n) get_xbits_trace(s, n, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_vlc(s, vlc)            get_vlc_trace(s, (vlc)->table, (vlc)->bits, 3, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+#define get_vlc2(s, tab, bits, max) get_vlc_trace(s, tab, bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__)
+
+#define tprintf(...) av_log(NULL, AV_LOG_DEBUG, __VA_ARGS__)
+
+#else //TRACE
+#define tprintf(...) {}
+#endif
+
+static inline int decode012(GetBitContext *gb){
+    int n;
+    n = get_bits1(gb);
+    if (n == 0)
+        return 0;
+    else
+        return get_bits1(gb) + 1;
+}
+
+#endif /* BITSTREAM_H */
diff --git a/src/libffmpeg/libavcodec/bswap.h b/src/libffmpeg/libavcodec/bswap.h
index 460f7abd4..eb1d87a55 100644
--- a/src/libffmpeg/libavcodec/bswap.h
+++ b/src/libffmpeg/libavcodec/bswap.h
@@ -10,17 +10,23 @@
 #include <byteswap.h>
 #else
 
-#ifdef ARCH_X86
-static inline unsigned short ByteSwap16(unsigned short x)
+#ifdef ARCH_X86_64
+#  define LEGACY_REGS "=Q"
+#else
+#  define LEGACY_REGS "=q"
+#endif
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline uint16_t ByteSwap16(uint16_t x)
 {
   __asm("xchgb %b0,%h0"	:
-        "=q" (x)	:
+        LEGACY_REGS (x)	:
         "0" (x));
     return x;
 }
 #define bswap_16(x) ByteSwap16(x)
 
-static inline unsigned int ByteSwap32(unsigned int x)
+static inline uint32_t ByteSwap32(uint32_t x)
 {
 #if __CPU__ > 386
  __asm("bswap	%0":
@@ -29,21 +35,28 @@ static inline unsigned int ByteSwap32(unsigned int x)
  __asm("xchgb	%b0,%h0\n"
       "	rorl	$16,%0\n"
       "	xchgb	%b0,%h0":
-      "=q" (x)		:
+      LEGACY_REGS (x)		:
 #endif
       "0" (x));
   return x;
 }
 #define bswap_32(x) ByteSwap32(x)
 
-static inline unsigned long long int ByteSwap64(unsigned long long int x)
+static inline uint64_t ByteSwap64(uint64_t x)
 {
+#ifdef ARCH_X86_64
+  __asm("bswap	%0":
+        "=r" (x)     :
+        "0" (x));
+  return x;
+#else
   register union { __extension__ uint64_t __ll;
           uint32_t __l[2]; } __x;
   asm("xchgl	%0,%1":
       "=r"(__x.__l[0]),"=r"(__x.__l[1]):
-      "0"(bswap_32((unsigned long)x)),"1"(bswap_32((unsigned long)(x>>32))));
+      "0"(bswap_32((uint32_t)x)),"1"(bswap_32((uint32_t)(x>>32))));
   return __x.__ll;
+#endif
 }
 #define bswap_64(x) ByteSwap64(x)
 
diff --git a/src/libffmpeg/libavcodec/cabac.c b/src/libffmpeg/libavcodec/cabac.c
index 9d56e23fc..9a598fa47 100644
--- a/src/libffmpeg/libavcodec/cabac.c
+++ b/src/libffmpeg/libavcodec/cabac.c
@@ -26,6 +26,7 @@
 #include <string.h>
 
 #include "common.h"
+#include "bitstream.h"
 #include "cabac.h"
 
 const uint8_t ff_h264_lps_range[64][4]= {
@@ -69,6 +70,25 @@ const uint8_t ff_h264_lps_state[64]= {
  36,36,37,37,37,38,38,63,
 };
 
+const uint8_t ff_h264_norm_shift[256]= {
+ 8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
+};
+
 /**
  *
  * @param buf_size size of buf in bits
@@ -95,10 +115,14 @@ void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
     c->bytestream= buf;
     c->bytestream_end= buf + buf_size;
 
-    c->low= *c->bytestream++;
-    c->low= (c->low<<9) + ((*c->bytestream++)<<1);
-    c->range= 0x1FE00;
-    c->bits_left= 7;
+#if CABAC_BITS == 16
+    c->low =  (*c->bytestream++)<<18;
+    c->low+=  (*c->bytestream++)<<10;
+#else
+    c->low =  (*c->bytestream++)<<10;
+#endif
+    c->low+= ((*c->bytestream++)<<2) + 2;
+    c->range= 0x1FE<<(CABAC_BITS + 1);
 }
 
 void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4], 
@@ -107,8 +131,8 @@ void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4],
     
     for(i=0; i<state_count; i++){
         for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
-            c->lps_range[2*i+0][j]=
-            c->lps_range[2*i+1][j]= lps_range[i][j];
+            c->lps_range[2*i+0][j+4]=
+            c->lps_range[2*i+1][j+4]= lps_range[i][j];
         }
 
         c->mps_state[2*i+0]= 2*mps_state[i];
@@ -126,6 +150,9 @@ void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4],
 
 #if 0 //selftest
 #define SIZE 10240
+
+#include "avcodec.h"
+
 int main(){
     CABACContext c;
     uint8_t b[9*SIZE];
@@ -173,33 +200,33 @@ STOP_TIMER("put_cabac_ueg")
     for(i=0; i<SIZE; i++){
 START_TIMER
         if( (r[i]&1) != get_cabac_bypass(&c) )
-            printf("CABAC bypass failure at %d\n", i);
+            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
 STOP_TIMER("get_cabac_bypass")
     }
     
     for(i=0; i<SIZE; i++){
 START_TIMER
         if( (r[i]&1) != get_cabac(&c, state) )
-            printf("CABAC failure at %d\n", i);
+            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
 STOP_TIMER("get_cabac")
     }
-
+#if 0
     for(i=0; i<SIZE; i++){
 START_TIMER
         if( r[i] != get_cabac_u(&c, state, (i&1) ? 6 : 7, 3, i&1) )
-            printf("CABAC unary (truncated) binarization failure at %d\n", i);
+            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
 STOP_TIMER("get_cabac_u")
     }
 
     for(i=0; i<SIZE; i++){
 START_TIMER
         if( r[i] != get_cabac_ueg(&c, state, 3, 0, 1, 2))
-            printf("CABAC unary (truncated) binarization failure at %d\n", i);
+            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
 STOP_TIMER("get_cabac_ueg")
     }
-
+#endif
     if(!get_cabac_terminate(&c))
-        printf("where's the Terminator?\n");
+        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
     
     return 0;
 }
diff --git a/src/libffmpeg/libavcodec/cabac.h b/src/libffmpeg/libavcodec/cabac.h
index 21085b21e..cc03eba96 100644
--- a/src/libffmpeg/libavcodec/cabac.h
+++ b/src/libffmpeg/libavcodec/cabac.h
@@ -27,6 +27,9 @@
 #undef NDEBUG
 #include <assert.h>
 
+#define CABAC_BITS 8
+#define CABAC_MASK ((1<<CABAC_BITS)-1)
+
 typedef struct CABACContext{
     int low;
     int range;
@@ -34,19 +37,20 @@ typedef struct CABACContext{
 #ifdef STRICT_LIMITS
     int symCount;
 #endif
-    uint8_t lps_range[2*64][4];   ///< rangeTabLPS
+    uint8_t lps_range[2*65][4];   ///< rangeTabLPS
     uint8_t lps_state[2*64];      ///< transIdxLPS
     uint8_t mps_state[2*64];      ///< transIdxMPS
     const uint8_t *bytestream_start;
     const uint8_t *bytestream;
     const uint8_t *bytestream_end;
-    int bits_left;                ///<
     PutBitContext pb;
 }CABACContext;
 
 extern const uint8_t ff_h264_lps_range[64][4];
 extern const uint8_t ff_h264_mps_state[64];
 extern const uint8_t ff_h264_lps_state[64];
+extern const uint8_t ff_h264_norm_shift[256];
+
 
 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
@@ -80,7 +84,7 @@ static inline void renorm_cabac_encoder(CABACContext *c){
 }
 
 static inline void put_cabac(CABACContext *c, uint8_t * const state, int bit){
-    int RangeLPS= c->lps_range[*state][((c->range)>>6)&3];
+    int RangeLPS= c->lps_range[*state][c->range>>6];
     
     if(bit == ((*state)&1)){
         c->range -= RangeLPS;
@@ -249,63 +253,102 @@ static inline void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int ma
     }
 }
 
+static void refill(CABACContext *c){ /* merge the next CABAC_BITS/8 input byte(s) into c->low */
+    if(c->bytestream < c->bytestream_end) /* past the end nothing is added; only the bias below applies */
+#if CABAC_BITS == 16
+        c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); /* each byte shifted on its own, as in refill2 */
+#else
+        c->low+= c->bytestream[0]<<1;
+#endif
+    c->low -= CABAC_MASK;
+    c->bytestream+= CABAC_BITS/8;
+}
+
+#if 0 /* all use commented */
+static void refill2(CABACContext *c){
+    int i, x;
+
+    x= c->low ^ (c->low-1);
+    i= 8 - ff_h264_norm_shift[x>>(CABAC_BITS+1)];
+
+    x= -CABAC_MASK;
+    
+    if(c->bytestream < c->bytestream_end)
+#if CABAC_BITS == 16
+        x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
+#else
+        x+= c->bytestream[0]<<1;
+#endif
+    
+    c->low += x<<i;
+    c->bytestream+= CABAC_BITS/8;
+}
+#endif
+
 static inline void renorm_cabac_decoder(CABACContext *c){
-    while(c->range < 0x10000){
+    while(c->range < (0x200 << CABAC_BITS)){
         c->range+= c->range;
         c->low+= c->low;
-        if(--c->bits_left == 0){
-            if(c->bytestream < c->bytestream_end)
-                c->low+= *c->bytestream;
-            c->bytestream++;
-            c->bits_left= 8;
-        }
+        if(!(c->low & CABAC_MASK))
+            refill(c);
     }
 }
 
+static inline void renorm_cabac_decoder_once(CABACContext *c){
+    int mask= (c->range - (0x200 << CABAC_BITS))>>31;
+    c->range+= c->range&mask;
+    c->low  += c->low  &mask;
+    if(!(c->low & CABAC_MASK))
+        refill(c);
+}
+
 static inline int get_cabac(CABACContext *c, uint8_t * const state){
-    int RangeLPS= c->lps_range[*state][((c->range)>>14)&3]<<8;
-    int bit;
+    int RangeLPS= c->lps_range[*state][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1);
+    int bit, lps_mask attribute_unused;
     
     c->range -= RangeLPS;
+#if 1
     if(c->low < c->range){
         bit= (*state)&1;
         *state= c->mps_state[*state];
+        renorm_cabac_decoder_once(c);
     }else{
+//        int shift= ff_h264_norm_shift[RangeLPS>>17];
         bit= ((*state)&1)^1;
         c->low -= c->range;
-        c->range = RangeLPS;
         *state= c->lps_state[*state];
+        c->range = RangeLPS;
+        renorm_cabac_decoder(c);
+/*        c->range = RangeLPS<<shift;
+        c->low <<= shift;
+        if(!(c->low & 0xFFFF)){
+            refill2(c);
+        }*/
     }
-    renorm_cabac_decoder(c);
+#else
+    lps_mask= (c->range - c->low)>>31;
     
-    return bit;    
-}
-
-static inline int get_cabac_static(CABACContext *c, int RangeLPS){
-    int bit;
+    c->low -= c->range & lps_mask;
+    c->range += (RangeLPS - c->range) & lps_mask;
     
-    c->range -= RangeLPS;
-    if(c->low < c->range){
-        bit= 0;
-    }else{
-        bit= 1;
-        c->low -= c->range;
-        c->range = RangeLPS;
-    }
-    renorm_cabac_decoder(c);
+    bit= ((*state)^lps_mask)&1;
+    *state= c->mps_state[(*state) - (128&lps_mask)];
     
+    lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+2)];
+    c->range<<= lps_mask;
+    c->low  <<= lps_mask;
+    if(!(c->low & CABAC_MASK))
+        refill2(c);
+#endif
+
     return bit;    
 }
 
 static inline int get_cabac_bypass(CABACContext *c){
     c->low += c->low;
 
-    if(--c->bits_left == 0){
-        if(c->bytestream < c->bytestream_end)
-            c->low+= *c->bytestream;
-        c->bytestream++;
-        c->bits_left= 8;
-    }
+    if(!(c->low & CABAC_MASK))
+        refill(c);
     
     if(c->low < c->range){
         return 0;
@@ -320,9 +363,9 @@ static inline int get_cabac_bypass(CABACContext *c){
  * @return the number of bytes read or 0 if no end
  */
 static inline int get_cabac_terminate(CABACContext *c){
-    c->range -= 2<<8;
+    c->range -= 4<<CABAC_BITS;
     if(c->low < c->range){
-        renorm_cabac_decoder(c);    
+        renorm_cabac_decoder_once(c);
         return 0;
     }else{
         return c->bytestream - c->bytestream_start;
diff --git a/src/libffmpeg/libavcodec/cinepak.c b/src/libffmpeg/libavcodec/cinepak.c
index da9a8127f..3c560fdc5 100644
--- a/src/libffmpeg/libavcodec/cinepak.c
+++ b/src/libffmpeg/libavcodec/cinepak.c
@@ -35,7 +35,6 @@
 #include "avcodec.h"
 #include "dsputil.h"
 
-#define PALETTE_COUNT 256
 
 typedef struct {
     uint8_t  y0, y1, y2, y3;
@@ -63,7 +62,6 @@ typedef struct CinepakContext {
 
     int width, height;
 
-    unsigned char palette[PALETTE_COUNT * 4];
     int palette_video;
     cvid_strip_t strips[MAX_STRIPS];
 
@@ -177,28 +175,28 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip_t *strip,
                         s->frame.data[2][iv[0]] = codebook->v;
                     }
 
-                    s->frame.data[0][iy[0] + 2] = codebook->y0;
-                    s->frame.data[0][iy[0] + 3] = codebook->y0;
-                    s->frame.data[0][iy[1] + 2] = codebook->y0;
-                    s->frame.data[0][iy[1] + 3] = codebook->y0;
+                    s->frame.data[0][iy[0] + 2] = codebook->y1;
+                    s->frame.data[0][iy[0] + 3] = codebook->y1;
+                    s->frame.data[0][iy[1] + 2] = codebook->y1;
+                    s->frame.data[0][iy[1] + 3] = codebook->y1;
                     if (!s->palette_video) {
                         s->frame.data[1][iu[0] + 1] = codebook->u;
                         s->frame.data[2][iv[0] + 1] = codebook->v;
                     }
 
-                    s->frame.data[0][iy[2] + 0] = codebook->y0;
-                    s->frame.data[0][iy[2] + 1] = codebook->y0;
-                    s->frame.data[0][iy[3] + 0] = codebook->y0;
-                    s->frame.data[0][iy[3] + 1] = codebook->y0;
+                    s->frame.data[0][iy[2] + 0] = codebook->y2;
+                    s->frame.data[0][iy[2] + 1] = codebook->y2;
+                    s->frame.data[0][iy[3] + 0] = codebook->y2;
+                    s->frame.data[0][iy[3] + 1] = codebook->y2;
                     if (!s->palette_video) {
                         s->frame.data[1][iu[1]] = codebook->u;
                         s->frame.data[2][iv[1]] = codebook->v;
                     }
 
-                    s->frame.data[0][iy[2] + 2] = codebook->y0;
-                    s->frame.data[0][iy[2] + 3] = codebook->y0;
-                    s->frame.data[0][iy[3] + 2] = codebook->y0;
-                    s->frame.data[0][iy[3] + 3] = codebook->y0;
+                    s->frame.data[0][iy[2] + 2] = codebook->y3;
+                    s->frame.data[0][iy[2] + 3] = codebook->y3;
+                    s->frame.data[0][iy[3] + 2] = codebook->y3;
+                    s->frame.data[0][iy[3] + 3] = codebook->y3;
                     if (!s->palette_video) {
                         s->frame.data[1][iu[1] + 1] = codebook->u;
                         s->frame.data[2][iv[1] + 1] = codebook->v;
@@ -361,22 +359,20 @@ static int cinepak_decode (CinepakContext *s)
 static int cinepak_decode_init(AVCodecContext *avctx)
 {
     CinepakContext *s = (CinepakContext *)avctx->priv_data;
-/*
-    int i;
-    unsigned char r, g, b;
-    unsigned char *raw_palette;
-    unsigned int *palette32;
-*/
 
     s->avctx = avctx;
     s->width = (avctx->width + 3) & ~3;
     s->height = (avctx->height + 3) & ~3;
 
-// check for paletted data
-s->palette_video = 0;
-
+    // paletted output only when palctrl is set and bits_per_sample is not 40
+    if ((avctx->palctrl == NULL) || (avctx->bits_per_sample == 40)) {
+        s->palette_video = 0;
+        avctx->pix_fmt = PIX_FMT_YUV420P;
+    } else {
+        s->palette_video = 1;
+        avctx->pix_fmt = PIX_FMT_PAL8;
+    }
 
-    avctx->pix_fmt = PIX_FMT_YUV420P;
     avctx->has_b_frames = 0;
     dsputil_init(&s->dsp, avctx);
 
@@ -404,6 +400,15 @@ static int cinepak_decode_frame(AVCodecContext *avctx,
 
     cinepak_decode(s);
 
+    if (s->palette_video) {
+        memcpy (s->frame.data[1], avctx->palctrl->palette, AVPALETTE_SIZE);
+        if (avctx->palctrl->palette_changed) {
+            s->frame.palette_has_changed = 1;
+            avctx->palctrl->palette_changed = 0;
+        } else
+            s->frame.palette_has_changed = 0;
+    }
+
     *data_size = sizeof(AVFrame);
     *(AVFrame*)data = s->frame;
 
diff --git a/src/libffmpeg/libavcodec/cljr.c b/src/libffmpeg/libavcodec/cljr.c
index df1f79851..8072eee18 100644
--- a/src/libffmpeg/libavcodec/cljr.c
+++ b/src/libffmpeg/libavcodec/cljr.c
@@ -43,11 +43,6 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p= (AVFrame*)&a->picture;
     int x, y;
 
-    /* special case for last picture */
-    if (buf_size == 0) {
-        return 0;
-    }
-
     if(p->data[0])
         avctx->release_buffer(avctx, p);
 
@@ -124,12 +119,14 @@ static int decode_init(AVCodecContext *avctx){
     return 0;
 }
 
+#if 0
 static int encode_init(AVCodecContext *avctx){
 
     common_init(avctx);
     
     return 0;
 }
+#endif
 
 AVCodec cljr_decoder = {
     "cljr",
diff --git a/src/libffmpeg/libavcodec/common.h b/src/libffmpeg/libavcodec/common.h
index 5cf9f2c73..b9e89be46 100644
--- a/src/libffmpeg/libavcodec/common.h
+++ b/src/libffmpeg/libavcodec/common.h
@@ -15,14 +15,6 @@
 #    define CONFIG_WIN32
 #endif
 
-//#define ALT_BITSTREAM_WRITER
-//#define ALIGNED_BITSTREAM_WRITER
-
-#define ALT_BITSTREAM_READER
-//#define LIBMPEG2_BITSTREAM_READER
-//#define A32_BITSTREAM_READER
-#define LIBMPEG2_BITSTREAM_READER_HACK //add BERO
-
 #ifndef M_PI
 #define M_PI    3.14159265358979323846
 #endif
@@ -35,6 +27,7 @@
 #    include <stdio.h>
 #    include <string.h>
 #    include <ctype.h>
+#    include <limits.h>
 #    ifndef __BEOS__
 #        include <errno.h>
 #    else
@@ -66,14 +59,6 @@
 #define AVOPTION_SUB(ptr) { .name = NULL, .help = (const char*)ptr }
 #define AVOPTION_END() AVOPTION_SUB(NULL)
 
-struct AVOption;
-#ifdef HAVE_MMX
-extern const struct AVOption avoptions_common[3 + 5];
-#else
-extern const struct AVOption avoptions_common[3];
-#endif
-extern const struct AVOption avoptions_workaround_bug[11];
-
 #endif /* HAVE_AV_CONFIG_H */
 
 /* Suppress restrict if it was not defined in config.h.  */
@@ -97,6 +82,14 @@ extern const struct AVOption avoptions_workaround_bug[11];
 #endif
 #endif
 
+#ifndef attribute_unused
+#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#    define attribute_unused __attribute__((unused))
+#else
+#    define attribute_unused
+#endif
+#endif
+
 #ifndef EMULATE_INTTYPES
 #   include <inttypes.h>
 #else
@@ -116,6 +109,18 @@ extern const struct AVOption avoptions_workaround_bug[11];
 #   endif /* other OS */
 #endif /* HAVE_INTTYPES_H */
 
+#ifndef INT16_MIN
+#define INT16_MIN       (-0x7fff-1)
+#endif
+
+#ifndef INT16_MAX
+#define INT16_MAX       0x7fff
+#endif
+
+#ifndef INT64_MIN
+#define INT64_MIN       (-0x7fffffffffffffffLL-1)
+#endif
+
 #ifndef INT64_MAX
 #define INT64_MAX int64_t_C(9223372036854775807)
 #endif
@@ -134,6 +139,14 @@ typedef unsigned int  uint_fast16_t;
 typedef unsigned int  uint_fast32_t;
 #endif
 
+#ifndef INT_BIT
+#    if INT_MAX != 2147483647
+#        define INT_BIT 64
+#    else
+#        define INT_BIT 32
+#    endif
+#endif
+
 #if defined(CONFIG_OS2) || defined(CONFIG_SUNOS)
 static inline float floorf(float f) { 
     return floor(f); 
@@ -208,11 +221,20 @@ static inline float floorf(float f) {
 
 #    include "bswap.h"
 
+// Use rip-relative addressing if compiling PIC code on x86-64.
 #    if defined(__MINGW32__) || defined(__CYGWIN__) || \
         defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-#        define MANGLE(a) "_" #a
+#        if defined(ARCH_X86_64) && defined(PIC)
+#            define MANGLE(a) "_" #a"(%%rip)"
+#        else
+#            define MANGLE(a) "_" #a
+#        endif
 #    else
-#        define MANGLE(a) #a
+#        if defined(ARCH_X86_64) && defined(PIC)
+#            define MANGLE(a) #a"(%%rip)"
+#        else
+#            define MANGLE(a) #a
+#        endif
 #    endif
 
 /* debug stuff */
@@ -252,7 +274,7 @@ inline void dprintf(const char* fmt,...) {}
 
 extern const uint32_t inverse[256];
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #    define FASTDIV(a,b) \
     ({\
         int ret,dmy;\
@@ -269,819 +291,6 @@ extern const uint32_t inverse[256];
 #    define FASTDIV(a,b)   ((a)/(b))
 #endif
  
-#ifdef ARCH_X86
-// avoid +32 for shift optimization (gcc should do that ...)
-static inline  int32_t NEG_SSR32( int32_t a, int8_t s){
-    asm ("sarl %1, %0\n\t"
-         : "+r" (a)
-         : "ic" ((uint8_t)(-s))
-    );
-    return a;
-}
-static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
-    asm ("shrl %1, %0\n\t"
-         : "+r" (a)
-         : "ic" ((uint8_t)(-s))
-    );
-    return a;
-}
-#else
-#    define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
-#    define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
-#endif
-
-/* bit output */
-
-struct PutBitContext;
-
-typedef void (*WriteDataFunc)(void *, uint8_t *, int);
-
-/* buf and buf_end must be present and used by every alternative writer. */
-typedef struct PutBitContext {
-#ifdef ALT_BITSTREAM_WRITER
-    uint8_t *buf, *buf_end;
-    int index;
-#else
-    uint32_t bit_buf;
-    int bit_left;
-    uint8_t *buf, *buf_ptr, *buf_end;
-#endif
-} PutBitContext;
-
-static inline void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size)
-{
-    s->buf = buffer;
-    s->buf_end = s->buf + buffer_size;
-#ifdef ALT_BITSTREAM_WRITER
-    s->index=0;
-    ((uint32_t*)(s->buf))[0]=0;
-//    memset(buffer, 0, buffer_size);
-#else
-    s->buf_ptr = s->buf;
-    s->bit_left=32;
-    s->bit_buf=0;
-#endif
-}
-
-/* return the number of bits output */
-static inline int put_bits_count(PutBitContext *s)
-{
-#ifdef ALT_BITSTREAM_WRITER
-    return s->index;
-#else
-    return (s->buf_ptr - s->buf) * 8 + 32 - s->bit_left;
-#endif
-}
-
-/* pad the end of the output stream with zeros */
-static inline void flush_put_bits(PutBitContext *s)
-{
-#ifdef ALT_BITSTREAM_WRITER
-    align_put_bits(s);
-#else
-    s->bit_buf<<= s->bit_left;
-    while (s->bit_left < 32) {
-        /* XXX: should test end of buffer */
-        *s->buf_ptr++=s->bit_buf >> 24;
-        s->bit_buf<<=8;
-        s->bit_left+=8;
-    }
-    s->bit_left=32;
-    s->bit_buf=0;
-#endif
-}
-
-void align_put_bits(PutBitContext *s);
-void put_string(PutBitContext * pbc, char *s, int put_zero);
-
-/* bit input */
-/* buffer, buffer_end and size_in_bits must be present and used by every reader */
-typedef struct GetBitContext {
-    const uint8_t *buffer, *buffer_end;
-#ifdef ALT_BITSTREAM_READER
-    int index;
-#elif defined LIBMPEG2_BITSTREAM_READER
-    uint8_t *buffer_ptr;
-    uint32_t cache;
-    int bit_count;
-#elif defined A32_BITSTREAM_READER
-    uint32_t *buffer_ptr;
-    uint32_t cache0;
-    uint32_t cache1;
-    int bit_count;
-#endif
-    int size_in_bits;
-} GetBitContext;
-
-#define VLC_TYPE int16_t
-
-typedef struct VLC {
-    int bits;
-    VLC_TYPE (*table)[2]; ///< code, bits
-    int table_size, table_allocated;
-} VLC;
-
-typedef struct RL_VLC_ELEM {
-    int16_t level;
-    int8_t len;
-    uint8_t run;
-} RL_VLC_ELEM;
-
-#ifdef ARCH_SPARC
-#define UNALIGNED_STORES_ARE_BAD
-#endif
-
-/* used to avoid missaligned exceptions on some archs (alpha, ...) */
-#ifdef ARCH_X86
-#    define unaligned32(a) (*(uint32_t*)(a))
-#else
-#    ifdef __GNUC__
-static inline uint32_t unaligned32(const void *v) {
-    struct Unaligned {
-	uint32_t i;
-    } __attribute__((packed));
-
-    return ((const struct Unaligned *) v)->i;
-}
-#    elif defined(__DECC)
-static inline uint32_t unaligned32(const void *v) {
-    return *(const __unaligned uint32_t *) v;
-}
-#    else
-static inline uint32_t unaligned32(const void *v) {
-    return *(const uint32_t *) v;
-}
-#    endif
-#endif //!ARCH_X86
-
-#ifndef ALT_BITSTREAM_WRITER
-static inline void put_bits(PutBitContext *s, int n, unsigned int value)
-{
-    unsigned int bit_buf;
-    int bit_left;
-
-#ifdef STATS
-    st_out_bit_counts[st_current_index] += n;
-#endif
-    //    printf("put_bits=%d %x\n", n, value);
-    assert(n == 32 || value < (1U << n));
-    
-    bit_buf = s->bit_buf;
-    bit_left = s->bit_left;
-
-    //    printf("n=%d value=%x cnt=%d buf=%x\n", n, value, bit_cnt, bit_buf);
-    /* XXX: optimize */
-    if (n < bit_left) {
-        bit_buf = (bit_buf<<n) | value;
-        bit_left-=n;
-    } else {
-	bit_buf<<=bit_left;
-        bit_buf |= value >> (n - bit_left);
-#ifdef UNALIGNED_STORES_ARE_BAD
-        if (3 & (intptr_t) s->buf_ptr) {
-            s->buf_ptr[0] = bit_buf >> 24;
-            s->buf_ptr[1] = bit_buf >> 16;
-            s->buf_ptr[2] = bit_buf >>  8;
-            s->buf_ptr[3] = bit_buf      ;
-        } else
-#endif
-        *(uint32_t *)s->buf_ptr = be2me_32(bit_buf);
-        //printf("bitbuf = %08x\n", bit_buf);
-        s->buf_ptr+=4;
-	bit_left+=32 - n;
-        bit_buf = value;
-    }
-
-    s->bit_buf = bit_buf;
-    s->bit_left = bit_left;
-}
-#endif
-
-
-#ifdef ALT_BITSTREAM_WRITER
-static inline void put_bits(PutBitContext *s, int n, unsigned int value)
-{
-#    ifdef ALIGNED_BITSTREAM_WRITER
-#        ifdef ARCH_X86
-    asm volatile(
-	"movl %0, %%ecx			\n\t"
-	"xorl %%eax, %%eax		\n\t"
-	"shrdl %%cl, %1, %%eax		\n\t"
-	"shrl %%cl, %1			\n\t"
-	"movl %0, %%ecx			\n\t"
-	"shrl $3, %%ecx			\n\t"
-	"andl $0xFFFFFFFC, %%ecx	\n\t"
-	"bswapl %1			\n\t"
-	"orl %1, (%2, %%ecx)		\n\t"
-	"bswapl %%eax			\n\t"
-	"addl %3, %0			\n\t"
-	"movl %%eax, 4(%2, %%ecx)	\n\t"
-	: "=&r" (s->index), "=&r" (value)
-	: "r" (s->buf), "r" (n), "0" (s->index), "1" (value<<(-n))
-	: "%eax", "%ecx"
-    );
-#        else
-    int index= s->index;
-    uint32_t *ptr= ((uint32_t *)s->buf)+(index>>5);
-    
-    value<<= 32-n; 
-    
-    ptr[0] |= be2me_32(value>>(index&31));
-    ptr[1]  = be2me_32(value<<(32-(index&31)));
-//if(n>24) printf("%d %d\n", n, value);
-    index+= n;
-    s->index= index;
-#        endif
-#    else //ALIGNED_BITSTREAM_WRITER
-#        ifdef ARCH_X86
-    asm volatile(
-	"movl $7, %%ecx			\n\t"
-	"andl %0, %%ecx			\n\t"
-	"addl %3, %%ecx			\n\t"
-	"negl %%ecx			\n\t"
-	"shll %%cl, %1			\n\t"
-	"bswapl %1			\n\t"
-	"movl %0, %%ecx			\n\t"
-	"shrl $3, %%ecx			\n\t"
-	"orl %1, (%%ecx, %2)		\n\t"
-	"addl %3, %0			\n\t"
-	"movl $0, 4(%%ecx, %2)		\n\t"
-	: "=&r" (s->index), "=&r" (value)
-	: "r" (s->buf), "r" (n), "0" (s->index), "1" (value)
-	: "%ecx"
-    );
-#        else
-    int index= s->index;
-    uint32_t *ptr= (uint32_t*)(((uint8_t *)s->buf)+(index>>3));
-    
-    ptr[0] |= be2me_32(value<<(32-n-(index&7) ));
-    ptr[1] = 0;
-//if(n>24) printf("%d %d\n", n, value);
-    index+= n;
-    s->index= index;
-#        endif
-#    endif //!ALIGNED_BITSTREAM_WRITER
-}
-#endif
-
-
-static inline uint8_t* pbBufPtr(PutBitContext *s)
-{
-#ifdef ALT_BITSTREAM_WRITER
-	return s->buf + (s->index>>3);
-#else
-	return s->buf_ptr;
-#endif
-}
-
-/**
- *
- * PutBitContext must be flushed & aligned to a byte boundary before calling this.
- */
-static inline void skip_put_bytes(PutBitContext *s, int n){
-        assert((put_bits_count(s)&7)==0);
-#ifdef ALT_BITSTREAM_WRITER
-        FIXME may need some cleaning of the buffer
-	s->index += n<<3;
-#else
-        assert(s->bit_left==32);
-	s->buf_ptr += n;
-#endif    
-}
-
-/**
- * Changes the end of the buffer.
- */
-static inline void set_put_bits_buffer_size(PutBitContext *s, int size){
-    s->buf_end= s->buf + size;
-}
-
-/* Bitstream reader API docs:
-name
-    abritary name which is used as prefix for the internal variables
-
-gb
-    getbitcontext
-
-OPEN_READER(name, gb)
-    loads gb into local variables
-
-CLOSE_READER(name, gb)
-    stores local vars in gb
-
-UPDATE_CACHE(name, gb)
-    refills the internal cache from the bitstream
-    after this call at least MIN_CACHE_BITS will be available,
-
-GET_CACHE(name, gb)
-    will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit)
-
-SHOW_UBITS(name, gb, num)
-    will return the nest num bits
-
-SHOW_SBITS(name, gb, num)
-    will return the nest num bits and do sign extension
-
-SKIP_BITS(name, gb, num)
-    will skip over the next num bits
-    note, this is equinvalent to SKIP_CACHE; SKIP_COUNTER
-
-SKIP_CACHE(name, gb, num)
-    will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER)
-
-SKIP_COUNTER(name, gb, num)
-    will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS)
-
-LAST_SKIP_CACHE(name, gb, num)
-    will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing
-
-LAST_SKIP_BITS(name, gb, num)
-    is equinvalent to SKIP_LAST_CACHE; SKIP_COUNTER
-
-for examples see get_bits, show_bits, skip_bits, get_vlc
-*/
-
-static inline int unaligned32_be(const void *v)
-{
-#ifdef CONFIG_ALIGN
-	const uint8_t *p=v;
-	return (((p[0]<<8) | p[1])<<16) | (p[2]<<8) | (p[3]);
-#else
-	return be2me_32( unaligned32(v)); //original
-#endif
-}
-
-#ifdef ALT_BITSTREAM_READER
-#   define MIN_CACHE_BITS 25
-
-#   define OPEN_READER(name, gb)\
-        int name##_index= (gb)->index;\
-        int name##_cache= 0;\
-
-#   define CLOSE_READER(name, gb)\
-        (gb)->index= name##_index;\
-
-#   define UPDATE_CACHE(name, gb)\
-        name##_cache= unaligned32_be( ((uint8_t *)(gb)->buffer)+(name##_index>>3) ) << (name##_index&0x07);\
-
-#   define SKIP_CACHE(name, gb, num)\
-        name##_cache <<= (num);\
-
-// FIXME name?
-#   define SKIP_COUNTER(name, gb, num)\
-        name##_index += (num);\
-
-#   define SKIP_BITS(name, gb, num)\
-        {\
-            SKIP_CACHE(name, gb, num)\
-            SKIP_COUNTER(name, gb, num)\
-        }\
-
-#   define LAST_SKIP_BITS(name, gb, num) SKIP_COUNTER(name, gb, num)
-#   define LAST_SKIP_CACHE(name, gb, num) ;
-
-#   define SHOW_UBITS(name, gb, num)\
-        NEG_USR32(name##_cache, num)
-
-#   define SHOW_SBITS(name, gb, num)\
-        NEG_SSR32(name##_cache, num)
-
-#   define GET_CACHE(name, gb)\
-        ((uint32_t)name##_cache)
-
-static inline int get_bits_count(GetBitContext *s){
-    return s->index;
-}
-#elif defined LIBMPEG2_BITSTREAM_READER
-//libmpeg2 like reader
-
-#   define MIN_CACHE_BITS 17
-
-#   define OPEN_READER(name, gb)\
-        int name##_bit_count=(gb)->bit_count;\
-        int name##_cache= (gb)->cache;\
-        uint8_t * name##_buffer_ptr=(gb)->buffer_ptr;\
-
-#   define CLOSE_READER(name, gb)\
-        (gb)->bit_count= name##_bit_count;\
-        (gb)->cache= name##_cache;\
-        (gb)->buffer_ptr= name##_buffer_ptr;\
-
-#ifdef LIBMPEG2_BITSTREAM_READER_HACK
-
-#   define UPDATE_CACHE(name, gb)\
-    if(name##_bit_count >= 0){\
-        name##_cache+= (int)be2me_16(*(uint16_t*)name##_buffer_ptr) << name##_bit_count;\
-        ((uint16_t*)name##_buffer_ptr)++;\
-        name##_bit_count-= 16;\
-    }\
-
-#else
-
-#   define UPDATE_CACHE(name, gb)\
-    if(name##_bit_count >= 0){\
-        name##_cache+= ((name##_buffer_ptr[0]<<8) + name##_buffer_ptr[1]) << name##_bit_count;\
-        name##_buffer_ptr+=2;\
-        name##_bit_count-= 16;\
-    }\
-
-#endif
-
-#   define SKIP_CACHE(name, gb, num)\
-        name##_cache <<= (num);\
-
-#   define SKIP_COUNTER(name, gb, num)\
-        name##_bit_count += (num);\
-
-#   define SKIP_BITS(name, gb, num)\
-        {\
-            SKIP_CACHE(name, gb, num)\
-            SKIP_COUNTER(name, gb, num)\
-        }\
-
-#   define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num)
-#   define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num)
-
-#   define SHOW_UBITS(name, gb, num)\
-        NEG_USR32(name##_cache, num)
-
-#   define SHOW_SBITS(name, gb, num)\
-        NEG_SSR32(name##_cache, num)
-
-#   define GET_CACHE(name, gb)\
-        ((uint32_t)name##_cache)
-
-static inline int get_bits_count(GetBitContext *s){
-    return (s->buffer_ptr - s->buffer)*8 - 16 + s->bit_count;
-}
-
-#elif defined A32_BITSTREAM_READER
-
-#   define MIN_CACHE_BITS 32
-
-#   define OPEN_READER(name, gb)\
-        int name##_bit_count=(gb)->bit_count;\
-        uint32_t name##_cache0= (gb)->cache0;\
-        uint32_t name##_cache1= (gb)->cache1;\
-        uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\
-
-#   define CLOSE_READER(name, gb)\
-        (gb)->bit_count= name##_bit_count;\
-        (gb)->cache0= name##_cache0;\
-        (gb)->cache1= name##_cache1;\
-        (gb)->buffer_ptr= name##_buffer_ptr;\
-
-#   define UPDATE_CACHE(name, gb)\
-    if(name##_bit_count > 0){\
-        const uint32_t next= be2me_32( *name##_buffer_ptr );\
-        name##_cache0 |= NEG_USR32(next,name##_bit_count);\
-        name##_cache1 |= next<<name##_bit_count;\
-        name##_buffer_ptr++;\
-        name##_bit_count-= 32;\
-    }\
-
-#ifdef ARCH_X86
-#   define SKIP_CACHE(name, gb, num)\
-        asm(\
-            "shldl %2, %1, %0		\n\t"\
-            "shll %2, %1		\n\t"\
-            : "+r" (name##_cache0), "+r" (name##_cache1)\
-            : "Ic" ((uint8_t)num)\
-           );
-#else
-#   define SKIP_CACHE(name, gb, num)\
-        name##_cache0 <<= (num);\
-        name##_cache0 |= NEG_USR32(name##_cache1,num);\
-        name##_cache1 <<= (num);
-#endif
-
-#   define SKIP_COUNTER(name, gb, num)\
-        name##_bit_count += (num);\
-
-#   define SKIP_BITS(name, gb, num)\
-        {\
-            SKIP_CACHE(name, gb, num)\
-            SKIP_COUNTER(name, gb, num)\
-        }\
-
-#   define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num)
-#   define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num)
-
-#   define SHOW_UBITS(name, gb, num)\
-        NEG_USR32(name##_cache0, num)
-
-#   define SHOW_SBITS(name, gb, num)\
-        NEG_SSR32(name##_cache0, num)
-
-#   define GET_CACHE(name, gb)\
-        (name##_cache0)
-
-static inline int get_bits_count(GetBitContext *s){
-    return ((uint8_t*)s->buffer_ptr - s->buffer)*8 - 32 + s->bit_count;
-}
-
-#endif
-
-/**
- * read mpeg1 dc style vlc (sign bit + mantisse with no MSB).
- * if MSB not set it is negative 
- * @param n length in bits
- * @author BERO  
- */
-static inline int get_xbits(GetBitContext *s, int n){
-    register int tmp;
-    register int32_t cache;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    cache = GET_CACHE(re,s);
-    if ((int32_t)cache<0) { //MSB=1
-        tmp = NEG_USR32(cache,n);
-    } else {
-    //   tmp = (-1<<n) | NEG_USR32(cache,n) + 1; mpeg12.c algo
-    //   tmp = - (NEG_USR32(cache,n) ^ ((1 << n) - 1)); h263.c algo
-        tmp = - NEG_USR32(~cache,n);
-    }
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-    return tmp;
-}
-
-static inline int get_sbits(GetBitContext *s, int n){
-    register int tmp;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    tmp= SHOW_SBITS(re, s, n);
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-    return tmp;
-}
-
-/**
- * reads 0-17 bits.
- * Note, the alt bitstream reader can read upto 25 bits, but the libmpeg2 reader cant
- */
-static inline unsigned int get_bits(GetBitContext *s, int n){
-    register int tmp;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    tmp= SHOW_UBITS(re, s, n);
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-    return tmp;
-}
-
-unsigned int get_bits_long(GetBitContext *s, int n);
-
-/**
- * shows 0-17 bits.
- * Note, the alt bitstream reader can read upto 25 bits, but the libmpeg2 reader cant
- */
-static inline unsigned int show_bits(GetBitContext *s, int n){
-    register int tmp;
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    tmp= SHOW_UBITS(re, s, n);
-//    CLOSE_READER(re, s)
-    return tmp;
-}
-
-unsigned int show_bits_long(GetBitContext *s, int n);
-
-static inline void skip_bits(GetBitContext *s, int n){
- //Note gcc seems to optimize this to s->index+=n for the ALT_READER :))
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-    LAST_SKIP_BITS(re, s, n)
-    CLOSE_READER(re, s)
-}
-
-static inline unsigned int get_bits1(GetBitContext *s){
-#ifdef ALT_BITSTREAM_READER
-    int index= s->index;
-    uint8_t result= s->buffer[ index>>3 ];
-    result<<= (index&0x07);
-    result>>= 8 - 1;
-    index++;
-    s->index= index;
-
-    return result;
-#else
-    return get_bits(s, 1);
-#endif
-}
-
-static inline unsigned int show_bits1(GetBitContext *s){
-    return show_bits(s, 1);
-}
-
-static inline void skip_bits1(GetBitContext *s){
-    skip_bits(s, 1);
-}
-
-/**
- * init GetBitContext.
- * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger then the actual read bits
- * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end
- * @param bit_size the size of the buffer in bits
- */
-static inline void init_get_bits(GetBitContext *s,
-                   const uint8_t *buffer, int bit_size)
-{
-    const int buffer_size= (bit_size+7)>>3;
-
-    s->buffer= buffer;
-    s->size_in_bits= bit_size;
-    s->buffer_end= buffer + buffer_size;
-#ifdef ALT_BITSTREAM_READER
-    s->index=0;
-#elif defined LIBMPEG2_BITSTREAM_READER
-#ifdef LIBMPEG2_BITSTREAM_READER_HACK
-  if ((int)buffer&1) {
-     /* word alignment */
-    s->cache = (*buffer++)<<24;
-    s->buffer_ptr = buffer;
-    s->bit_count = 16-8;
-  } else
-#endif
-  {
-    s->buffer_ptr = buffer;
-    s->bit_count = 16;
-    s->cache = 0;
-  }
-#elif defined A32_BITSTREAM_READER
-    s->buffer_ptr = (uint32_t*)buffer;
-    s->bit_count = 32;
-    s->cache0 = 0;
-    s->cache1 = 0;
-#endif
-    {
-        OPEN_READER(re, s)
-        UPDATE_CACHE(re, s)
-        UPDATE_CACHE(re, s)
-        CLOSE_READER(re, s)
-    }
-#ifdef A32_BITSTREAM_READER
-    s->cache1 = 0;
-#endif
-}
-
-int check_marker(GetBitContext *s, const char *msg);
-void align_get_bits(GetBitContext *s);
-int init_vlc(VLC *vlc, int nb_bits, int nb_codes,
-             const void *bits, int bits_wrap, int bits_size,
-             const void *codes, int codes_wrap, int codes_size);
-void free_vlc(VLC *vlc);
-
-/**
- *
- * if the vlc code is invalid and max_depth=1 than no bits will be removed
- * if the vlc code is invalid and max_depth>1 than the number of bits removed
- * is undefined
- */
-#define GET_VLC(code, name, gb, table, bits, max_depth)\
-{\
-    int n, index, nb_bits;\
-\
-    index= SHOW_UBITS(name, gb, bits);\
-    code = table[index][0];\
-    n    = table[index][1];\
-\
-    if(max_depth > 1 && n < 0){\
-        LAST_SKIP_BITS(name, gb, bits)\
-        UPDATE_CACHE(name, gb)\
-\
-        nb_bits = -n;\
-\
-        index= SHOW_UBITS(name, gb, nb_bits) + code;\
-        code = table[index][0];\
-        n    = table[index][1];\
-        if(max_depth > 2 && n < 0){\
-            LAST_SKIP_BITS(name, gb, nb_bits)\
-            UPDATE_CACHE(name, gb)\
-\
-            nb_bits = -n;\
-\
-            index= SHOW_UBITS(name, gb, nb_bits) + code;\
-            code = table[index][0];\
-            n    = table[index][1];\
-        }\
-    }\
-    SKIP_BITS(name, gb, n)\
-}
-
-#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth)\
-{\
-    int n, index, nb_bits;\
-\
-    index= SHOW_UBITS(name, gb, bits);\
-    level = table[index].level;\
-    n     = table[index].len;\
-\
-    if(max_depth > 1 && n < 0){\
-        LAST_SKIP_BITS(name, gb, bits)\
-        UPDATE_CACHE(name, gb)\
-\
-        nb_bits = -n;\
-\
-        index= SHOW_UBITS(name, gb, nb_bits) + level;\
-        level = table[index].level;\
-        n     = table[index].len;\
-    }\
-    run= table[index].run;\
-    SKIP_BITS(name, gb, n)\
-}
-
-// deprecated, dont use get_vlc for new code, use get_vlc2 instead or use GET_VLC directly
-static inline int get_vlc(GetBitContext *s, VLC *vlc)
-{
-    int code;
-    VLC_TYPE (*table)[2]= vlc->table;
-    
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-
-    GET_VLC(code, re, s, table, vlc->bits, 3)    
-
-    CLOSE_READER(re, s)
-    return code;
-}
-
-/**
- * parses a vlc code, faster then get_vlc()
- * @param bits is the number of bits which will be read at once, must be 
- *             identical to nb_bits in init_vlc()
- * @param max_depth is the number of times bits bits must be readed to completly
- *                  read the longest vlc code 
- *                  = (max_vlc_length + bits - 1) / bits
- */
-static always_inline int get_vlc2(GetBitContext *s, VLC_TYPE (*table)[2],
-                                  int bits, int max_depth)
-{
-    int code;
-    
-    OPEN_READER(re, s)
-    UPDATE_CACHE(re, s)
-
-    GET_VLC(code, re, s, table, bits, max_depth)
-
-    CLOSE_READER(re, s)
-    return code;
-}
-
-//#define TRACE
-
-#ifdef TRACE
-
-static inline void print_bin(int bits, int n){
-    int i;
-    
-    for(i=n-1; i>=0; i--){
-        printf("%d", (bits>>i)&1);
-    }
-    for(i=n; i<24; i++)
-        printf(" ");
-}
-
-static inline int get_bits_trace(GetBitContext *s, int n, char *file, char *func, int line){
-    int r= get_bits(s, n);
-    
-    print_bin(r, n);
-    printf("%5d %2d %3d bit @%5d in %s %s:%d\n", r, n, r, get_bits_count(s)-n, file, func, line);
-    return r;
-}
-static inline int get_vlc_trace(GetBitContext *s, VLC_TYPE (*table)[2], int bits, int max_depth, char *file, char *func, int line){
-    int show= show_bits(s, 24);
-    int pos= get_bits_count(s);
-    int r= get_vlc2(s, table, bits, max_depth);
-    int len= get_bits_count(s) - pos;
-    int bits2= show>>(24-len);
-    
-    print_bin(bits2, len);
-    
-    printf("%5d %2d %3d vlc @%5d in %s %s:%d\n", bits2, len, r, pos, file, func, line);
-    return r;
-}
-static inline int get_xbits_trace(GetBitContext *s, int n, char *file, char *func, int line){
-    int show= show_bits(s, n);
-    int r= get_xbits(s, n);
-    
-    print_bin(show, n);
-    printf("%5d %2d %3d xbt @%5d in %s %s:%d\n", show, n, r, get_bits_count(s)-n, file, func, line);
-    return r;
-}
-
-#define get_bits(s, n)  get_bits_trace(s, n, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_bits1(s)    get_bits_trace(s, 1, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_xbits(s, n) get_xbits_trace(s, n, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_vlc(s, vlc)            get_vlc_trace(s, (vlc)->table, (vlc)->bits, 3, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-#define get_vlc2(s, tab, bits, max) get_vlc_trace(s, tab, bits, max, __FILE__, __PRETTY_FUNCTION__, __LINE__)
-
-#define tprintf(...) av_log(NULL, AV_LOG_DEBUG, __VA_ARGS__)
-
-#else //TRACE
-#define tprintf(...) {}
-#endif
-
 /* define it to include statistics code (useful only for optimizing
    codec efficiency */
 //#define STATS
@@ -1220,7 +429,7 @@ static inline int ff_get_fourcc(const char *s){
 #define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24))
 
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #define MASK_ABS(mask, level)\
             asm volatile(\
 		"cdq			\n\t"\
@@ -1254,8 +463,18 @@ if((y)<(x)){\
 }
 #endif
 
-#ifdef ARCH_X86
-static inline long long rdtsc()
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#if defined(ARCH_X86_64)
+static inline uint64_t rdtsc(void)
+{
+	uint64_t a, d;
+	asm volatile(	"rdtsc\n\t"
+		: "=a" (a), "=d" (d)
+	);
+	return (d << 32) | (a & 0xffffffff);
+}
+#else
+static inline long long rdtsc(void)
 {
 	long long l;
 	asm volatile(	"rdtsc\n\t"
@@ -1263,6 +482,7 @@ static inline long long rdtsc()
 	);
 	return l;
 }
+#endif
 
 #define START_TIMER \
 uint64_t tend;\
@@ -1283,6 +503,9 @@ tend= rdtsc();\
       av_log(NULL, AV_LOG_DEBUG, "%Ld dezicycles in %s, %d runs, %d skips\n", tsum*10/tcount, id, tcount, tskip_count);\
   }\
 }
+#else
+#define START_TIMER 
+#define STOP_TIMER(id) {}
 #endif
 
 #define CLAMP_TO_8BIT(d) ((d > 0xff) ? 0xff : (d < 0) ? 0 : d)
@@ -1294,6 +517,8 @@ tend= rdtsc();\
 #define time time_is_forbidden_due_to_security_issues
 #define rand rand_is_forbidden_due_to_state_trashing
 #define srand srand_is_forbidden_due_to_state_trashing
+#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf
+#define strcat strcat_is_forbidden_due_to_security_issues_use_pstrcat
 #if !(defined(LIBAVFORMAT_BUILD) || defined(_FRAMEHOOK_H))
 #define printf please_use_av_log
 #define fprintf please_use_av_log
diff --git a/src/libffmpeg/libavcodec/dpcm.c b/src/libffmpeg/libavcodec/dpcm.c
index b80604e5f..487203ae9 100644
--- a/src/libffmpeg/libavcodec/dpcm.c
+++ b/src/libffmpeg/libavcodec/dpcm.c
@@ -24,6 +24,7 @@
  * Xan DPCM decoder by Mario Brito (mbrito@student.dei.uc.pt)
  * for more information on the specific data formats, visit:
  *   http://www.pcisys.net/~melanson/codecs/simpleaudio.html
+ * SOL DPCMs implemented by Konstantin Shishkov
  *
  * Note about using the Xan DPCM decoder: Xan DPCM is used in AVI files
  * found in the Wing Commander IV computer game. These AVI files contain
@@ -39,6 +40,8 @@
 typedef struct DPCMContext {
     int channels;
     short roq_square_array[256];
+    long sample[2]; // SOL_DPCM: running sample accumulators, indexed by channel
+    int *sol_table; // SOL_DPCM: delta table selected from codec_tag at init
 } DPCMContext;
 
 #define SATURATE_S16(x)  if (x < -32768) x = -32768; \
@@ -81,6 +84,32 @@ static int interplay_delta_table[] = {
 
 };
 
+static int sol_table_old[16] =
+    { 0x0,  0x1,  0x2 , 0x3,  0x6,  0xA,  0xF, 0x15,
+    -0x15, -0xF, -0xA, -0x6, -0x3, -0x2, -0x1, 0x0};
+
+static int sol_table_new[16] =
+    { 0x0,  0x1,  0x2,  0x3,  0x6,  0xA,  0xF,  0x15,
+      0x0, -0x1, -0x2, -0x3, -0x6, -0xA, -0xF, -0x15};
+    
+static int sol_table_16[128] = {
+    0x000, 0x008, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x080,
+    0x090, 0x0A0, 0x0B0, 0x0C0, 0x0D0, 0x0E0, 0x0F0, 0x100, 0x110, 0x120,
+    0x130, 0x140, 0x150, 0x160, 0x170, 0x180, 0x190, 0x1A0, 0x1B0, 0x1C0,
+    0x1D0, 0x1E0, 0x1F0, 0x200, 0x208, 0x210, 0x218, 0x220, 0x228, 0x230,
+    0x238, 0x240, 0x248, 0x250, 0x258, 0x260, 0x268, 0x270, 0x278, 0x280,
+    0x288, 0x290, 0x298, 0x2A0, 0x2A8, 0x2B0, 0x2B8, 0x2C0, 0x2C8, 0x2D0,
+    0x2D8, 0x2E0, 0x2E8, 0x2F0, 0x2F8, 0x300, 0x308, 0x310, 0x318, 0x320,
+    0x328, 0x330, 0x338, 0x340, 0x348, 0x350, 0x358, 0x360, 0x368, 0x370,
+    0x378, 0x380, 0x388, 0x390, 0x398, 0x3A0, 0x3A8, 0x3B0, 0x3B8, 0x3C0,
+    0x3C8, 0x3D0, 0x3D8, 0x3E0, 0x3E8, 0x3F0, 0x3F8, 0x400, 0x440, 0x480,
+    0x4C0, 0x500, 0x540, 0x580, 0x5C0, 0x600, 0x640, 0x680, 0x6C0, 0x700,
+    0x740, 0x780, 0x7C0, 0x800, 0x900, 0xA00, 0xB00, 0xC00, 0xD00, 0xE00,
+    0xF00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x3000, 0x4000
+};
+
+
+
 static int dpcm_decode_init(AVCodecContext *avctx)
 {
     DPCMContext *s = avctx->priv_data;
@@ -88,6 +117,7 @@ static int dpcm_decode_init(AVCodecContext *avctx)
     short square;
 
     s->channels = avctx->channels;
+    s->sample[0] = s->sample[1] = 0;
 
     switch(avctx->codec->id) {
 
@@ -100,6 +130,26 @@ static int dpcm_decode_init(AVCodecContext *avctx)
         }
         break;
 
+    case CODEC_ID_SOL_DPCM:
+        /* sample[] has two slots and the SOL decoder indexes it with channels-1 / channel_number */
+        if (s->channels < 1 || s->channels > 2) { av_log(avctx, AV_LOG_ERROR, "SOL DPCM supports 1 or 2 channels\n"); return -1; }
+        switch(avctx->codec_tag){
+        case 1: /* 4-bit deltas, old table; accumulator starts at 0x80 */
+            s->sol_table=sol_table_old;
+            s->sample[0] = s->sample[1] = 0x80;
+            break;
+        case 2: /* 4-bit deltas, new table; accumulator starts at 0x80 */
+            s->sol_table=sol_table_new;
+            s->sample[0] = s->sample[1] = 0x80;
+            break;
+        case 3: /* byte-coded deltas; accumulator keeps the 0 set above */
+            s->sol_table=sol_table_16;
+            break;
+        default:
+            av_log(avctx, AV_LOG_ERROR, "Unknown SOL subcodec\n");
+            return -1;
+        }
+        break;
     default:
         break;
     }
@@ -203,6 +253,38 @@ static int dpcm_decode_frame(AVCodecContext *avctx,
             channel_number ^= s->channels - 1;
         }
         break;
+    case CODEC_ID_SOL_DPCM:
+        in = 0;
+        if (avctx->codec_tag != 3) { /* two 4-bit deltas per byte, accumulator clamped to 0..255 */
+            while (in < buf_size) {
+                int n1, n2;
+                n1 = (buf[in] >> 4) & 0xF;
+                n2 = buf[in++] & 0xF;
+                s->sample[0] += s->sol_table[n1];
+                if (s->sample[0] < 0) s->sample[0] = 0;
+                if (s->sample[0] > 255) s->sample[0] = 255;
+                output_samples[out++] = (s->sample[0] - 128) * 256; /* multiply: left-shifting a negative value is undefined */
+                s->sample[s->channels - 1] += s->sol_table[n2]; /* low nibble: slot 1 when stereo, slot 0 again when mono */
+                if (s->sample[s->channels - 1] < 0) s->sample[s->channels - 1] = 0;
+                if (s->sample[s->channels - 1] > 255) s->sample[s->channels - 1] = 255;
+                output_samples[out++] = (s->sample[s->channels - 1] - 128) * 256;
+            }
+        } else { /* one byte per sample: bit 7 is the sign, low 7 bits index the table */
+            while (in < buf_size) {
+                int n;
+                n = buf[in++];
+                if (n & 0x80) s->sample[channel_number] -= s->sol_table[n & 0x7F];
+                else s->sample[channel_number] += s->sol_table[n & 0x7F];
+                SATURATE_S16(s->sample[channel_number]);
+                output_samples[out++] = s->sample[channel_number];
+                /* toggle channel */
+                channel_number ^= s->channels - 1;
+            }
+        }
+        break;
+
+    default:
+        break;
     }
 
     *data_size = out * sizeof(short);
@@ -241,3 +323,14 @@ AVCodec xan_dpcm_decoder = {
     NULL,
     dpcm_decode_frame,
 };
+/* SOL DPCM decoder entry; it reuses dpcm_decode_init and dpcm_decode_frame from this file */
+AVCodec sol_dpcm_decoder = {
+    "sol_dpcm",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_SOL_DPCM,
+    sizeof(DPCMContext),
+    dpcm_decode_init,
+    NULL,
+    NULL,
+    dpcm_decode_frame,
+};
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index b1252251a..926832ff1 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -31,8 +31,11 @@
 #include "simple_idct.h"
 #include "faandct.h"
 
-uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
-uint32_t squareTbl[512];
+/* snow.c */
+void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
+
+uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
+uint32_t squareTbl[512] = {0, };
 
 const uint8_t ff_zigzag_direct[64] = {
     0,   1,  8, 16,  9,  2,  3, 10,
@@ -59,7 +62,7 @@ const uint8_t ff_zigzag248_direct[64] = {
 };
 
 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
-uint16_t __align8 inv_zigzag_direct16[64];
+uint16_t __align8 inv_zigzag_direct16[64] = {0, };
 
 const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,   2,  3,  8,  9, 16, 17, 
@@ -219,6 +222,23 @@ static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
     }
 }
 
+static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
+{
+    int s, i;
+    uint32_t *sq = squareTbl + 256;
+    /* sq points at the middle of the 512-entry table so a difference in -255..255 indexes it directly; 4 columns, h rows */
+    s = 0;
+    for (i = 0; i < h; i++) {
+        s += sq[pix1[0] - pix2[0]];
+        s += sq[pix1[1] - pix2[1]];
+        s += sq[pix1[2] - pix2[2]];
+        s += sq[pix1[3] - pix2[3]];
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    return s;
+}
+
 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 {
     int s, i;
@@ -270,6 +290,103 @@ static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
     return s;
 }
 
+/* Compare two w x h blocks: the pixel differences (scaled by 16) are passed through ff_spatial_dwt and the absolute results summed. */
+static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
+    int s, i, j;
+    const int dec_count= w==8 ? 3 : 4;
+    int tmp[16*16];
+#if 0 /* disabled together with the per-subband weighting loop further down */
+    int level, ori;
+    static const int scale[2][2][4][4]={ 
+      {
+        {
+            //8x8 dec=3
+            {268, 239, 239, 213},
+            {  0, 224, 224, 152},
+            {  0, 135, 135, 110},
+        },{
+            //16x16 dec=4
+            {344, 310, 310, 280},
+            {  0, 320, 320, 228},
+            {  0, 175, 175, 136},
+            {  0, 129, 129, 102},
+        }
+      },{
+        {//FIXME 5/3
+            //8x8 dec=3
+            {275, 245, 245, 218},
+            {  0, 230, 230, 156},
+            {  0, 138, 138, 113},
+        },{
+            //16x16 dec=4
+            {352, 317, 317, 286},
+            {  0, 328, 328, 233},
+            {  0, 180, 180, 140},
+            {  0, 132, 132, 105},
+        }
+      }
+    };
+#endif
+    assert(w<=16 && h<=16); /* tmp[] is a 16x16 buffer addressed with a fixed row stride of 16 */
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j+=4) {
+            /* multiply instead of <<4: the difference may be negative and shifting it left is undefined */
+            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])*16;
+            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])*16;
+            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])*16;
+            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])*16;
+        }
+        pix1 += line_size;
+        pix2 += line_size;
+    }
+    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
+    s=0;
+#if 0
+    for(level=0; level<dec_count; level++){
+        for(ori= level ? 1 : 0; ori<4; ori++){
+            int sx= (ori&1) ? 1<<level: 0;
+            int stride= 16<<(dec_count-level);
+            int sy= (ori&2) ? stride>>1 : 0;
+            int size= 1<<level;
+            
+            for(i=0; i<size; i++){
+                for(j=0; j<size; j++){
+                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
+                    s += ABS(v);
+                }
+            }
+        }
+    }
+#endif
+    for (i = 0; i < h; i++) {
+        for (j = 0; j < w; j+=4) {
+            s+= ABS(tmp[16*i+j+0]);
+            s+= ABS(tmp[16*i+j+1]);
+            s+= ABS(tmp[16*i+j+2]);
+            s+= ABS(tmp[16*i+j+3]);
+        }
+    }
+    assert(s>=0); 
+    
+    return s>>2;
+}
+/* Fixed-size wrappers around w_c: width 8 or 16, transform type 1 for the w53 pair and 0 for the w97 pair */
+static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
+    return w_c(v, pix1, pix2, line_size,  8, h, 1);
+}
+
+static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
+    return w_c(v, pix1, pix2, line_size,  8, h, 0);
+}
+
+static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
+    return w_c(v, pix1, pix2, line_size, 16, h, 1);
+}
+
+static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
+    return w_c(v, pix1, pix2, line_size, 16, h, 0);
+}
+
 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 {
     int i;
@@ -332,6 +449,40 @@ static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
     }
 }
 
+static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
+				 int line_size)
+{
+    int i;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    
+    /* store a 4x4 block: each coefficient is mapped through the crop table (presumably saturating to 0..255); block rows are 8 entries apart */
+    for(i=0;i<4;i++) {
+        pixels[0] = cm[block[0]];
+        pixels[1] = cm[block[1]];
+        pixels[2] = cm[block[2]];
+        pixels[3] = cm[block[3]];
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
+				 int line_size)
+{
+    int i;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    
+    /* store a 2x2 block: each coefficient is mapped through the crop table (presumably saturating to 0..255); block rows are 8 entries apart */
+    for(i=0;i<2;i++) {
+        pixels[0] = cm[block[0]];
+        pixels[1] = cm[block[1]];
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
 static void put_signed_pixels_clamped_c(const DCTELEM *block, 
                                         uint8_t *restrict pixels,
                                         int line_size)
@@ -373,6 +524,38 @@ static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
         block += 8;
     }
 }
+
+static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
+                          int line_size)
+{
+    int i;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    
+    /* add a 4x4 block onto the existing pixels, mapping each sum through the crop table; block rows are 8 entries apart */
+    for(i=0;i<4;i++) {
+        pixels[0] = cm[pixels[0] + block[0]];
+        pixels[1] = cm[pixels[1] + block[1]];
+        pixels[2] = cm[pixels[2] + block[2]];
+        pixels[3] = cm[pixels[3] + block[3]];
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
+                          int line_size)
+{
+    int i;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    
+    /* add a 2x2 block onto the existing pixels, mapping each sum through the crop table; block rows are 8 entries apart */
+    for(i=0;i<2;i++) {
+        pixels[0] = cm[pixels[0] + block[0]];
+        pixels[1] = cm[pixels[1] + block[1]];
+        pixels += line_size;
+        block += 8;
+    }
+}
 #if 0
 
 #define PIXOP2(OPNAME, OP) \
@@ -2031,7 +2214,6 @@ static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t
     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
     src += 8*srcStride;\
-    tmp += 8*tmpStride;\
     dst += 8*dstStride;\
     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
@@ -2195,6 +2377,77 @@ H264_MC(avg_, 16)
 #undef op2_put
 #endif
 
+#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
+#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
+#define H264_WEIGHT(W,H) \
+static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
+    int attribute_unused x, y; \
+    offset <<= log2_denom; /* pre-scale so the offset survives the >>log2_denom in op_scale1 */ \
+    if(log2_denom) offset += 1<<(log2_denom-1); /* rounding term for that shift */ \
+    for(y=0; y<H; y++, block += stride){ \
+        op_scale1(0); \
+        op_scale1(1); \
+        if(W==2) continue; /* W is a macro constant: narrower blocks finish the row here */ \
+        op_scale1(2); \
+        op_scale1(3); \
+        if(W==4) continue; \
+        op_scale1(4); \
+        op_scale1(5); \
+        op_scale1(6); \
+        op_scale1(7); \
+        if(W==8) continue; \
+        op_scale1(8); \
+        op_scale1(9); \
+        op_scale1(10); \
+        op_scale1(11); \
+        op_scale1(12); \
+        op_scale1(13); \
+        op_scale1(14); \
+        op_scale1(15); \
+    } \
+} \
+static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
+    int attribute_unused x, y; \
+    int offset = (offsets + offsetd + 1) >> 1; /* rounded mean of the two offsets */ \
+    offset = ((offset << 1) + 1) << log2_denom; /* offset plus one half, pre-scaled for the >>(log2_denom+1) in op_scale2 */ \
+    for(y=0; y<H; y++, dst += stride, src += stride){ \
+        op_scale2(0); \
+        op_scale2(1); \
+        if(W==2) continue; \
+        op_scale2(2); \
+        op_scale2(3); \
+        if(W==4) continue; \
+        op_scale2(4); \
+        op_scale2(5); \
+        op_scale2(6); \
+        op_scale2(7); \
+        if(W==8) continue; \
+        op_scale2(8); \
+        op_scale2(9); \
+        op_scale2(10); \
+        op_scale2(11); \
+        op_scale2(12); \
+        op_scale2(13); \
+        op_scale2(14); \
+        op_scale2(15); \
+    } \
+}
+/* each line below defines one weight_h264_pixelsWxH_c / biweight_h264_pixelsWxH_c pair */
+H264_WEIGHT(16,16)
+H264_WEIGHT(16,8)
+H264_WEIGHT(8,16)
+H264_WEIGHT(8,8)
+H264_WEIGHT(8,4)
+H264_WEIGHT(4,8)
+H264_WEIGHT(4,4)
+H264_WEIGHT(4,2)
+H264_WEIGHT(2,4)
+H264_WEIGHT(2,2)
+
+#undef op_scale1
+#undef op_scale2
+#undef H264_WEIGHT
+
 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
     uint8_t *cm = cropTbl + MAX_NEG_CROP;
     int i;
@@ -2360,6 +2613,33 @@ static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
     }
 }
 
+static void h261_loop_filter_c(uint8_t *src, int stride){
+    int x,y,xy,yz;
+    int temp[64];
+    /* pass 1 (vertical, into temp[]): rows 0 and 7 are copied as 4*src, rows 1..6 get a 1-2-1 sum of the rows above/at/below */
+    for(x=0; x<8; x++){
+        temp[x      ] = 4*src[x           ];
+        temp[x + 7*8] = 4*src[x + 7*stride];
+    }
+    for(y=1; y<7; y++){
+        for(x=0; x<8; x++){
+            xy = y * stride + x;
+            yz = y * 8 + x;
+            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
+        }
+    }
+    /* pass 2 (horizontal, back into src): columns 0 and 7 only undo the x4 scale, columns 1..6 get a 1-2-1 sum and a rounded /16 */
+    for(y=0; y<8; y++){
+        src[  y*stride] = (temp[  y*8] + 2)>>2;
+        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
+        for(x=1; x<7; x++){
+            xy = y * stride + x;
+            yz = y * 8 + x;
+            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
+        }
+    }
+}
+
 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     int s, i;
@@ -2560,6 +2840,56 @@ static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size,
     return s;
 }
 
+static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
+    MpegEncContext * const c= (MpegEncContext *)v; /* void* so the signature matches the other compare functions; may be NULL */
+    int score1=0, score2=0; /* score1: sum of squared differences; score2: difference of the blocks' 2x2 gradient magnitudes */
+    int x,y;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<16; x++){
+            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
+        }
+        if(y+1<h){
+            for(x=0; x<15; x++){
+                score2+= ABS(  s1[x  ] - s1[x  +stride]
+                             - s1[x+1] + s1[x+1+stride])
+                        -ABS(  s2[x  ] - s2[x  +stride]
+                             - s2[x+1] + s2[x+1+stride]);
+            }
+        }
+        s1+= stride;
+        s2+= stride;
+    }
+    /* without a context the gradient term is weighted by a fixed 8 */
+    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
+    else  return score1 + ABS(score2)*8;
+}
+
+static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
+    MpegEncContext * const c= (MpegEncContext *)v; /* void* so the signature matches the other compare functions; may be NULL */
+    int score1=0, score2=0; /* score1: sum of squared differences; score2: difference of the blocks' 2x2 gradient magnitudes */
+    int x,y;
+    
+    for(y=0; y<h; y++){
+        for(x=0; x<8; x++){
+            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
+        }
+        if(y+1<h){
+            for(x=0; x<7; x++){
+                score2+= ABS(  s1[x  ] - s1[x  +stride]
+                             - s1[x+1] + s1[x+1+stride])
+                        -ABS(  s2[x  ] - s2[x  +stride]
+                             - s2[x+1] + s2[x+1+stride]);
+            }
+        }
+        s1+= stride;
+        s2+= stride;
+    }
+    /* without a context the gradient term is weighted by a fixed 8 */
+    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
+    else  return score1 + ABS(score2)*8;
+}
+
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     int i;
     unsigned int sum=0;
@@ -2635,6 +2965,9 @@ void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
         case FF_CMP_DCT:
             cmp[i]= c->dct_sad[i];
             break;
+        case FF_CMP_DCTMAX:
+            cmp[i]= c->dct_max[i];
+            break;
         case FF_CMP_PSNR:
             cmp[i]= c->quant_psnr[i];
             break;
@@ -2653,6 +2986,15 @@ void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
         case FF_CMP_ZERO:
             cmp[i]= zero_cmp;
             break;
+        case FF_CMP_NSSE:
+            cmp[i]= c->nsse[i];
+            break;
+        case FF_CMP_W53:
+            cmp[i]= c->w53[i];
+            break;
+        case FF_CMP_W97:
+            cmp[i]= c->w97[i];
+            break;
         default:
             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
         }
@@ -2849,6 +3191,23 @@ static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2
     return sum;
 }
 
+static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
+    MpegEncContext * const s= (MpegEncContext *)c;
+    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
+    DCTELEM * const temp= (DCTELEM*)aligned_temp;
+    int sum=0, i;
+    /* aligned_temp is sized in uint64_t units to hold 64 DCTELEMs and is accessed through temp */
+    assert(h==8);
+    /* c must be a valid context here: its dsp function pointers are called unconditionally */
+    s->dsp.diff_pixels(temp, src1, src2, stride);
+    s->dsp.fdct(temp);
+    /* result: the largest absolute value among the 64 transformed entries */
+    for(i=0; i<64; i++)
+        sum= FFMAX(sum, ABS(temp[i]));
+        
+    return sum;
+}
+
 void simple_idct(DCTELEM *block); //FIXME
 
 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
@@ -3078,6 +3437,7 @@ static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int st
 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
+WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WARPER8_16_SQ(rd8x8_c, rd16_c)
 WARPER8_16_SQ(bit8x8_c, bit16_c)
@@ -3095,6 +3455,41 @@ static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
     add_pixels_clamped_c(block, dest, line_size);
 }
 
+static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    j_rev_dct4 (block); /* transforms block in place; the 4x4 result is then stored to dest */
+    put_pixels_clamped4_c(block, dest, line_size);
+}
+static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    j_rev_dct4 (block); /* same transform, but the 4x4 result is added onto dest */
+    add_pixels_clamped4_c(block, dest, line_size);
+}
+/* 2x2 variants: j_rev_dct2 on block, then the 2x2 clamped store / add */
+static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    j_rev_dct2 (block);
+    put_pixels_clamped2_c(block, dest, line_size);
+}
+static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    j_rev_dct2 (block);
+    add_pixels_clamped2_c(block, dest, line_size);
+}
+/* 1x1 variants: only block[0] is used, as (block[0] + 4) >> 3 through the crop table; line_size is unused */
+static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+
+    dest[0] = cm[(block[0] + 4)>>3];
+}
+static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+
+    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
+}
+
 /* init static data */
 void dsputil_static_init(void)
 {
@@ -3133,18 +3528,42 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     }
 #endif //CONFIG_ENCODERS
 
-    if(avctx->idct_algo==FF_IDCT_INT){
-        c->idct_put= ff_jref_idct_put;
-        c->idct_add= ff_jref_idct_add;
-        c->idct    = j_rev_dct;
-        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-    }else{ //accurate/default
-        c->idct_put= simple_idct_put;
-        c->idct_add= simple_idct_add;
-        c->idct    = simple_idct;
+    if(avctx->lowres==1){
+        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
+            c->idct_put= ff_jref_idct4_put;
+            c->idct_add= ff_jref_idct4_add;
+        }else{
+            c->idct_put= ff_h264_lowres_idct_put_c;
+            c->idct_add= ff_h264_lowres_idct_add_c;
+        }
+        c->idct    = j_rev_dct4;
+        c->idct_permutation_type= FF_NO_IDCT_PERM;
+    }else if(avctx->lowres==2){
+        c->idct_put= ff_jref_idct2_put;
+        c->idct_add= ff_jref_idct2_add;
+        c->idct    = j_rev_dct2;
+        c->idct_permutation_type= FF_NO_IDCT_PERM;
+    }else if(avctx->lowres==3){
+        c->idct_put= ff_jref_idct1_put;
+        c->idct_add= ff_jref_idct1_add;
+        c->idct    = j_rev_dct1;
         c->idct_permutation_type= FF_NO_IDCT_PERM;
+    }else{
+        if(avctx->idct_algo==FF_IDCT_INT){
+            c->idct_put= ff_jref_idct_put;
+            c->idct_add= ff_jref_idct_add;
+            c->idct    = j_rev_dct;
+            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+        }else{ //accurate/default
+            c->idct_put= simple_idct_put;
+            c->idct_add= simple_idct_add;
+            c->idct    = simple_idct;
+            c->idct_permutation_type= FF_NO_IDCT_PERM;
+        }
     }
 
+    c->h264_idct_add= ff_h264_idct_add_c;
+
     /* VP3 DSP support */
     c->vp3_dsp_init = vp3_dsp_init_c;
     c->vp3_idct = vp3_idct_c;
@@ -3259,6 +3678,27 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
 
+    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
+    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
+    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
+    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
+    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
+    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
+    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
+    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
+    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
+    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
+    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
+    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
+    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
+    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
+    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
+    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
+    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
+    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
+    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
+    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
+
     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
@@ -3275,10 +3715,12 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     SET_CMP_FUNC(hadamard8_diff)
     c->hadamard8_diff[4]= hadamard8_intra16_c;
     SET_CMP_FUNC(dct_sad)
+    SET_CMP_FUNC(dct_max)
     c->sad[0]= pix_abs16_c;
     c->sad[1]= pix_abs8_c;
     c->sse[0]= sse16_c;
     c->sse[1]= sse8_c;
+    c->sse[2]= sse4_c;
     SET_CMP_FUNC(quant_psnr)
     SET_CMP_FUNC(rd)
     SET_CMP_FUNC(bit)
@@ -3286,7 +3728,13 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vsad[4]= vsad_intra16_c;
     c->vsse[0]= vsse16_c;
     c->vsse[4]= vsse_intra16_c;
-        
+    c->nsse[0]= nsse16_c;
+    c->nsse[1]= nsse8_c;
+    c->w53[0]= w53_16_c;
+    c->w53[1]= w53_8_c;
+    c->w97[0]= w97_16_c;
+    c->w97[1]= w97_8_c;
+
     c->add_bytes= add_bytes_c;
     c->diff_bytes= diff_bytes_c;
     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
@@ -3295,6 +3743,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->h263_h_loop_filter= h263_h_loop_filter_c;
     c->h263_v_loop_filter= h263_v_loop_filter_c;
     
+    c->h261_loop_filter= h261_loop_filter_c;
+    
     c->try_8x8basis= try_8x8basis_c;
     c->add_8x8basis= add_8x8basis_c;
 
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index adb4679e0..c728a24d6 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -45,11 +45,18 @@ void ff_jpeg_fdct_islow (DCTELEM *data);
 void ff_fdct248_islow (DCTELEM *data);
 
 void j_rev_dct (DCTELEM *data);
+void j_rev_dct4 (DCTELEM *data);
+void j_rev_dct2 (DCTELEM *data);
+void j_rev_dct1 (DCTELEM *data);
 
 void ff_fdct_mmx(DCTELEM *block);
 void ff_fdct_mmx2(DCTELEM *block);
 void ff_fdct_sse2(DCTELEM *block);
 
+void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
+void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
+
 /* encoding scans */
 extern const uint8_t ff_alternate_horizontal_scan[64];
 extern const uint8_t ff_alternate_vertical_scan[64];
@@ -57,7 +64,7 @@ extern const uint8_t ff_zigzag_direct[64];
 extern const uint8_t ff_zigzag248_direct[64];
 
 /* pixel operations */
-#define MAX_NEG_CROP 384
+#define MAX_NEG_CROP 1024
 
 /* temporary */
 extern uint32_t squareTbl[512];
@@ -101,6 +108,8 @@ typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const ui
 typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
 typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets);
 
 #define DEF_OLD_QPEL(name)\
 void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
@@ -165,12 +174,17 @@ typedef struct DSPContext {
     me_cmp_func rd[5];
     me_cmp_func vsad[5];
     me_cmp_func vsse[5];
+    me_cmp_func nsse[5];
+    me_cmp_func w53[5];
+    me_cmp_func w97[5];
+    me_cmp_func dct_max[5];
 
     me_cmp_func me_pre_cmp[5];
     me_cmp_func me_cmp[5];
     me_cmp_func me_sub_cmp[5];
     me_cmp_func mb_cmp[5];
     me_cmp_func ildct_cmp[5]; //only width 16 used
+    me_cmp_func frame_skip_cmp[5]; //only width 8 used
 
     /**
      * Halfpel motion compensation with rounding (a+b+1)>>1.
@@ -206,7 +220,7 @@ typedef struct DSPContext {
      * @param line_size number of bytes in a horizontal line of block
      * @param h height
      */
-    op_pixels_func put_no_rnd_pixels_tab[2][4];
+    op_pixels_func put_no_rnd_pixels_tab[4][4];
 
     /**
      * Halfpel motion compensation with no rounding (a+b)>>1.
@@ -218,7 +232,7 @@ typedef struct DSPContext {
      * @param line_size number of bytes in a horizontal line of block
      * @param h height
      */
-    op_pixels_func avg_no_rnd_pixels_tab[2][4];
+    op_pixels_func avg_no_rnd_pixels_tab[4][4];
     
     void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
     
@@ -249,6 +263,9 @@ typedef struct DSPContext {
     qpel_mc_func put_h264_qpel_pixels_tab[3][16];
     qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
     
+    h264_weight_func weight_h264_pixels_tab[10];
+    h264_biweight_func biweight_h264_pixels_tab[10];
+    
     me_cmp_func pix_abs[2][4];
     
     /* huffyuv specific */
@@ -264,6 +281,8 @@ typedef struct DSPContext {
     void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
 
+    void (*h261_loop_filter)(uint8_t *src, int stride);
+
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
     void (*fdct248)(DCTELEM *block/* align 16*/);
@@ -325,7 +344,8 @@ typedef struct DSPContext {
      */
     void (*vp3_idct)(int16_t *input_data, int16_t *dequant_matrix,
         int coeff_count, DCTELEM *output_samples);
-
+ 
+    void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
 } DSPContext;
 
 void dsputil_static_init(void);
@@ -351,6 +371,29 @@ static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
     return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
 }
 
+static inline int get_penalty_factor(int lambda, int lambda2, int type){ /* maps a compare-function type to a factor derived from lambda/lambda2 */
+    switch(type&0xFF){ /* only the low byte of type is examined */
+    default: /* unlisted types fall through to the SAD factor */
+    case FF_CMP_SAD:
+        return lambda>>FF_LAMBDA_SHIFT;
+    case FF_CMP_DCT:
+        return (3*lambda)>>(FF_LAMBDA_SHIFT+1); /* 1.5 * lambda */
+    case FF_CMP_W53:
+        return (4*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_W97:
+        return (2*lambda)>>(FF_LAMBDA_SHIFT);
+    case FF_CMP_SATD:
+        return (2*lambda)>>FF_LAMBDA_SHIFT;
+    case FF_CMP_RD:
+    case FF_CMP_PSNR:
+    case FF_CMP_SSE:
+    case FF_CMP_NSSE:
+        return lambda2>>FF_LAMBDA_SHIFT; /* these four use lambda2 instead of lambda */
+    case FF_CMP_BIT:
+        return 1;
+    }
+}
+
 /**
  * Empty mmx state.
  * this must be called between any dsp function and float/double code.
@@ -373,6 +416,7 @@ int mm_support(void);
 #define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */
 #define MM_SSE    0x0008 /* SSE functions */
 #define MM_SSE2   0x0010 /* PIV SSE2 functions */
+#define MM_3DNOWEXT  0x0020 /* AMD 3DNowExt */
 
 extern int mm_flags;
 
@@ -393,6 +437,7 @@ static inline void emms(void)
 }
 
 #define __align8 __attribute__ ((aligned (8)))
+#define STRIDE_ALIGN 8
 
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
@@ -402,6 +447,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
 /* This is to use 4 bytes read to the IDCT pointers for some 'zero'
    line optimizations */
 #define __align8 __attribute__ ((aligned (4)))
+#define STRIDE_ALIGN 4
 
 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
 
@@ -409,6 +455,7 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
 
 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
 #define __align8 __attribute__ ((aligned (8)))
+#define STRIDE_ALIGN 8
 
 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
 
@@ -416,11 +463,13 @@ void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
 
 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
 #define __align8 __attribute__ ((aligned (8)))
+#define STRIDE_ALIGN 8
 void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
 
 #elif defined(ARCH_ALPHA)
 
 #define __align8 __attribute__ ((aligned (8)))
+#define STRIDE_ALIGN 8
 
 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
 
@@ -437,24 +486,28 @@ extern int mm_flags;
 #endif
 
 #define __align8 __attribute__ ((aligned (16)))
+#define STRIDE_ALIGN 16
 
 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
 
 #elif defined(HAVE_MMI)
 
 #define __align8 __attribute__ ((aligned (16)))
+#define STRIDE_ALIGN 16
 
 void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
 
 #elif defined(ARCH_SH4)
 
 #define __align8 __attribute__ ((aligned (8)))
+#define STRIDE_ALIGN 8
 
 void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
 
 #else
 
-#define __align8
+#define __align8 __attribute__ ((aligned (8)))
+#define STRIDE_ALIGN 8
 
 #endif
 
@@ -558,11 +611,20 @@ static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int st
 /* XXX: add ISOC specific test to avoid specific BSD testing. */
 /* better than nothing implementation. */
 /* btw, rintf() is existing on fbsd too -- alex */
-static inline long int lrintf(float x)
+static always_inline long int lrintf(float x)
 {
 #ifdef CONFIG_WIN32
+#  ifdef ARCH_X86
+    int32_t i;
+    asm volatile(
+        "fistpl %0\n\t"
+        : "=m" (i) : "t" (x) : "st"
+    );
+    return i;
+#  else
     /* XXX: incorrect, but make it compile */
-    return (int)(x);
+    return (int)(x + (x < 0 ? -0.5 : 0.5));
+#  endif
 #else
     return (int)(rint(x));
 #endif
diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c
index 8e359e361..94440ed5c 100644
--- a/src/libffmpeg/libavcodec/dv.c
+++ b/src/libffmpeg/libavcodec/dv.c
@@ -144,7 +144,7 @@ static int dvvideo_init(AVCodecContext *avctx)
         /* NOTE: as a trick, we use the fact the no codes are unused
            to accelerate the parsing of partial codes */
         init_vlc(&dv_vlc, TEX_VLC_BITS, j, 
-                 new_dv_vlc_len, 1, 1, new_dv_vlc_bits, 2, 2);
+                 new_dv_vlc_len, 1, 1, new_dv_vlc_bits, 2, 2, 0);
 
         dv_rl_vlc = av_malloc(dv_vlc.table_size * sizeof(RL_VLC_ELEM));
 	if (!dv_rl_vlc) {
@@ -263,6 +263,7 @@ static const int mb_area_start[5] = { 1, 6, 21, 43, 64 };
 
 #ifndef ALT_BITSTREAM_READER
 #warning only works with ALT_BITSTREAM_READER
+static int re_index; //Hack to make it compile
 #endif
 
 static inline int get_bits_left(GetBitContext *s)
@@ -394,8 +395,7 @@ static inline void dv_decode_video_segment(DVVideoContext *s,
 	    init_get_bits(&gb, buf_ptr, last_index);
             
             /* get the dc */
-            dc = get_bits(&gb, 9);
-            dc = (dc << (32 - 9)) >> (32 - 9);
+            dc = get_sbits(&gb, 9);
             dct_mode = get_bits1(&gb);
             mb->dct_mode = dct_mode;
             mb->scan_table = s->dv_zigzag[dct_mode];
@@ -889,10 +889,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
 {
     DVVideoContext *s = avctx->priv_data;
   
-    /* special case for last picture */
-    if(buf_size==0)
-        return 0;
-    
     s->sys = dv_frame_profile(buf);
     if (!s->sys || buf_size < s->sys->frame_size)
         return -1; /* NOTE: we only accept several full frames */
@@ -932,7 +928,9 @@ static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size,
     s->sys = dv_codec_profile(c);
     if (!s->sys)
 	return -1;
-    
+    if(buf_size < s->sys->frame_size)
+        return -1;
+
     c->pix_fmt = s->sys->pix_fmt;
     s->picture = *((AVFrame *)data);
 
diff --git a/src/libffmpeg/libavcodec/dvdata.h b/src/libffmpeg/libavcodec/dvdata.h
index e60d99448..acda751d6 100644
--- a/src/libffmpeg/libavcodec/dvdata.h
+++ b/src/libffmpeg/libavcodec/dvdata.h
@@ -1299,7 +1299,7 @@ static const DVprofile dv_profiles[] = {
       .frame_rate_base = 1001,
       .height = 480,
       .width = 720,
-      .sar = {{72, 79}, {96, 79}},
+      .sar = {{10, 11}, {40, 33}},
       .video_place = dv_place_411,
       .pix_fmt = PIX_FMT_YUV411P,
       .audio_stride = 90,
@@ -1315,7 +1315,7 @@ static const DVprofile dv_profiles[] = {
       .ltc_divisor = 25,
       .height = 576,
       .width = 720,
-      .sar = {{128, 117}, {512, 351}},
+      .sar = {{59, 54}, {118, 81}},
       .video_place = dv_place_420,
       .pix_fmt = PIX_FMT_YUV420P,
       .audio_stride = 108,
@@ -1331,7 +1331,7 @@ static const DVprofile dv_profiles[] = {
       .ltc_divisor = 25,
       .height = 576,
       .width = 720,
-      .sar = {{128, 117}, {512, 351}},
+      .sar = {{59, 54}, {118, 81}},
       .video_place = dv_place_411P,
       .pix_fmt = PIX_FMT_YUV411P,
       .audio_stride = 108,
diff --git a/src/libffmpeg/libavcodec/error_resilience.c b/src/libffmpeg/libavcodec/error_resilience.c
index b7aeebddf..b0d22ddf9 100644
--- a/src/libffmpeg/libavcodec/error_resilience.c
+++ b/src/libffmpeg/libavcodec/error_resilience.c
@@ -652,7 +652,7 @@ void ff_er_add_slice(MpegEncContext *s, int startx, int starty, int endx, int en
  
     s->error_status_table[start_xy] |= VP_START;
 
-    if(start_xy > 0 && s->avctx->thread_count <= 1){
+    if(start_xy > 0 && s->avctx->thread_count <= 1 && s->avctx->skip_top*s->mb_width < start_i){
         int prev_status= s->error_status_table[ s->mb_index2xy[start_i - 1] ];
         
         prev_status &= ~ VP_START;
@@ -661,31 +661,34 @@ void ff_er_add_slice(MpegEncContext *s, int startx, int starty, int endx, int en
 }
 
 void ff_er_frame_end(MpegEncContext *s){
-    int i, mb_x, mb_y, error, error_type;
+    int i, mb_x, mb_y, error, error_type, dc_error, mv_error, ac_error;
     int distance;
     int threshold_part[4]= {100,100,100};
     int threshold= 50;
     int is_intra_likely;
+    int size = s->b8_stride * 2 * s->mb_height;
+    Picture *pic= s->current_picture_ptr;
     
-    if(!s->error_resilience || s->error_count==0) return;
+    if(!s->error_resilience || s->error_count==0 || 
+       s->error_count==3*s->mb_width*(s->avctx->skip_top + s->avctx->skip_bottom)) return;
 
-    av_log(s->avctx, AV_LOG_INFO, "concealing errors\n");
-    
     if(s->current_picture.motion_val[0] == NULL){
-        int size = s->b8_stride * 2 * s->mb_height;
-        Picture *pic= s->current_picture_ptr;
-        
         av_log(s->avctx, AV_LOG_ERROR, "Warning MVs not available\n");
             
         for(i=0; i<2; i++){
             pic->ref_index[i]= av_mallocz(size * sizeof(uint8_t));
-            pic->motion_val_base[i]= av_mallocz((size+2) * 2 * sizeof(uint16_t));
-            pic->motion_val[i]= pic->motion_val_base[i]+2;
+            pic->motion_val_base[i]= av_mallocz((size+4) * 2 * sizeof(uint16_t));
+            pic->motion_val[i]= pic->motion_val_base[i]+4;
         }
         pic->motion_subsample_log2= 3;
         s->current_picture= *s->current_picture_ptr;
     }
     
+    for(i=0; i<2; i++){
+        if(pic->ref_index[i])
+            memset(pic->ref_index[i], 0, size * sizeof(uint8_t));
+    }
+
     if(s->avctx->debug&FF_DEBUG_ER){
         for(mb_y=0; mb_y<s->mb_height; mb_y++){
             for(mb_x=0; mb_x<s->mb_width; mb_x++){
@@ -816,6 +819,17 @@ void ff_er_frame_end(MpegEncContext *s){
         }
     }
 #endif
+
+    dc_error= ac_error= mv_error=0;
+    for(i=0; i<s->mb_num; i++){
+        const int mb_xy= s->mb_index2xy[i];
+        error= s->error_status_table[mb_xy];
+        if(error&DC_ERROR) dc_error ++;
+        if(error&AC_ERROR) ac_error ++;
+        if(error&MV_ERROR) mv_error ++;
+    }
+    av_log(s->avctx, AV_LOG_INFO, "concealing %d DC, %d AC, %d MV errors\n", dc_error, ac_error, mv_error);
+
     is_intra_likely= is_intra_more_likely(s);
 
     /* set unknown mb-type to most likely */
diff --git a/src/libffmpeg/libavcodec/eval.c b/src/libffmpeg/libavcodec/eval.c
index aead600e8..330781581 100644
--- a/src/libffmpeg/libavcodec/eval.c
+++ b/src/libffmpeg/libavcodec/eval.c
@@ -42,10 +42,7 @@
 #define M_PI 3.14159265358979323846
 #endif
 
-#define STACK_SIZE 100
-
 typedef struct Parser{
-    double stack[STACK_SIZE];
     int stack_index;
     char *s;
     double *const_value;
@@ -57,25 +54,7 @@ typedef struct Parser{
     void *opaque;
 } Parser;
 
-static void evalExpression(Parser *p);
-
-static void push(Parser *p, double d){
-    if(p->stack_index+1>= STACK_SIZE){
-        av_log(NULL, AV_LOG_ERROR, "stack overflow in the parser\n");
-        return;
-    }
-    p->stack[ p->stack_index++ ]= d;
-//printf("push %f\n", d); fflush(stdout);
-}
-
-static double pop(Parser *p){
-    if(p->stack_index<=0){
-        av_log(NULL, AV_LOG_ERROR, "stack underflow in the parser\n");
-        return NAN;
-    }
-//printf("pop\n"); fflush(stdout);
-    return p->stack[ --p->stack_index ];
-}
+static double evalExpression(Parser *p);
 
 static int strmatch(const char *s, const char *prefix){
     int i;
@@ -85,7 +64,7 @@ static int strmatch(const char *s, const char *prefix){
     return 1;
 }
 
-static void evalPrimary(Parser *p){
+static double evalPrimary(Parser *p){
     double d, d2=NAN;
     char *next= p->s;
     int i;
@@ -93,36 +72,32 @@ static void evalPrimary(Parser *p){
     /* number */
     d= strtod(p->s, &next);
     if(next != p->s){
-        push(p, d);
         p->s= next;
-        return;
+        return d;
     }
     
     /* named constants */
-    for(i=0; p->const_name[i]; i++){
+    for(i=0; p->const_name && p->const_name[i]; i++){
         if(strmatch(p->s, p->const_name[i])){
-            push(p, p->const_value[i]);
             p->s+= strlen(p->const_name[i]);
-            return;
+            return p->const_value[i];
         }
     }
     
     p->s= strchr(p->s, '(');
     if(p->s==NULL){
         av_log(NULL, AV_LOG_ERROR, "Parser: missing ( in \"%s\"\n", next);
-        return;
+        return NAN;
     }
     p->s++; // "("
-    evalExpression(p);
-    d= pop(p);
+    d= evalExpression(p);
     if(p->s[0]== ','){
         p->s++; // ","
-        evalExpression(p);
-        d2= pop(p);
+        d2= evalExpression(p);
     }
     if(p->s[0] != ')'){
         av_log(NULL, AV_LOG_ERROR, "Parser: missing ) in \"%s\"\n", next);
-        return;
+        return NAN;
     }
     p->s++; // ")"
     
@@ -144,96 +119,67 @@ static void evalPrimary(Parser *p){
     else if( strmatch(next, "lt"    ) ) d= d > d2 ? 0.0 : 1.0;
     else if( strmatch(next, "lte"    ) ) d= d >= d2 ? 0.0 : 1.0;
     else if( strmatch(next, "eq"    ) ) d= d == d2 ? 1.0 : 0.0;
+    else if( strmatch(next, "("     ) ) d= d;
 //    else if( strmatch(next, "l1"    ) ) d= 1 + d2*(d - 1);
 //    else if( strmatch(next, "sq01"  ) ) d= (d >= 0.0 && d <=1.0) ? 1.0 : 0.0;
     else{
-        int error=1;
         for(i=0; p->func1_name && p->func1_name[i]; i++){
             if(strmatch(next, p->func1_name[i])){
-                d= p->func1[i](p->opaque, d);
-                error=0;
-                break;
+                return p->func1[i](p->opaque, d);
             }
         }
 
         for(i=0; p->func2_name && p->func2_name[i]; i++){
             if(strmatch(next, p->func2_name[i])){
-                d= p->func2[i](p->opaque, d, d2);
-                error=0;
-                break;
+                return p->func2[i](p->opaque, d, d2);
             }
         }
 
-        if(error){
-            av_log(NULL, AV_LOG_ERROR, "Parser: unknown function in \"%s\"\n", next);
-            return;
-        }
+        av_log(NULL, AV_LOG_ERROR, "Parser: unknown function in \"%s\"\n", next);
+        return NAN;
     }
-    
-    push(p, d);
+
+    return d;
 }      
-       
-static void evalPow(Parser *p){
-    int neg= 0;
-    if(p->s[0]=='+') p->s++;
-       
-    if(p->s[0]=='-'){ 
-        neg= 1;
-        p->s++;
-    }
-    
-    if(p->s[0]=='('){
-        p->s++;;
-        evalExpression(p);
 
-        if(p->s[0]!=')')
-            av_log(NULL, AV_LOG_ERROR, "Parser: missing )\n");
-        p->s++;
-    }else{
-        evalPrimary(p);
-    }
-    
-    if(neg) push(p, -pop(p));
+static double evalPow(Parser *p){ /* parses an optional unary '+'/'-' and the primary that follows it */
+    int sign= (*p->s == '+') - (*p->s == '-'); /* +1 for '+', -1 for '-', 0 if neither */
+    p->s += sign&1; /* sign&1 is 1 for +1 and for -1: skip the sign character only if there was one */
+    return (sign|1) * evalPrimary(p); /* sign|1 turns 0 into +1 and keeps -1, giving the multiplier */
+}
 
-static void evalFactor(Parser *p){
-    evalPow(p);
+static double evalFactor(Parser *p){
+    double ret= evalPow(p);
     while(p->s[0]=='^'){
-        double d;
-
         p->s++;
-        evalPow(p);
-        d= pop(p);
-        push(p, pow(pop(p), d));
+        ret= pow(ret, evalPow(p));
     }
+    return ret;
 }
 
-static void evalTerm(Parser *p){
-    evalFactor(p);
+static double evalTerm(Parser *p){
+    double ret= evalFactor(p);
     while(p->s[0]=='*' || p->s[0]=='/'){
-        int inv= p->s[0]=='/';
-        double d;
-
-        p->s++;
-        evalFactor(p);
-        d= pop(p);
-        if(inv) d= 1.0/d;
-        push(p, d * pop(p));
+        if(*p->s++ == '*') ret*= evalFactor(p);
+        else               ret/= evalFactor(p);
     }
+    return ret;
 }
 
-static void evalExpression(Parser *p){
-    evalTerm(p);
-    while(p->s[0]=='+' || p->s[0]=='-'){
-        int sign= p->s[0]=='-';
-        double d;
+static double evalExpression(Parser *p){ /* value of term (('+'|'-') term)*; NAN once the nesting budget in stack_index is used up */
+    double ret= 0;
 
-        p->s++;
-        evalTerm(p);
-        d= pop(p);
-        if(sign) d= -d;
-        push(p, d + pop(p));
-    }
+    if(p->stack_index <= 0) //protect against stack overflows
+        return NAN;
+    p->stack_index--; /* one nesting level consumed; given back before the normal return */
+
+    ret += evalTerm(p); /* a sign in front of the first term is left to evalPow as a unary sign */
+    while(*p->s == '+' || *p->s == '-') /* binary operator: consume it here so that a-b^c is a-(b^c), not a+(-b)^c */
+        ret += (*p->s++ == '+') ? evalTerm(p) : -evalTerm(p); /* ?: sequences the increment before evalTerm runs */
+
+    p->stack_index++;
+
+    return ret;
 }
 
 double ff_eval(char *s, double *const_value, const char **const_name,
@@ -242,7 +188,7 @@ double ff_eval(char *s, double *const_value, const char **const_name,
                void *opaque){
     Parser p;
     
-    p.stack_index=0;
+    p.stack_index=100;
     p.s= s;
     p.const_value= const_value;
     p.const_name = const_name;
@@ -252,6 +198,29 @@ double ff_eval(char *s, double *const_value, const char **const_name,
     p.func2_name = func2_name;
     p.opaque     = opaque;
     
-    evalExpression(&p);
-    return pop(&p);
+    return evalExpression(&p);
+}
+
+#ifdef TEST
+#undef printf 
+static double const_values[]={
+    M_PI,
+    M_E,
+    0
+};
+static const char *const_names[]={
+    "PI",
+    "E",
+    0
+};
+int main(void){ /* TEST-only driver: prints one evaluation, then evaluates the same expression 1050 times between the timer macros */
+    int i;
+    printf("%f == 12.7\n", ff_eval("1+(5-2)^(3-1)+1/2+sin(PI)-max(-2.2,-3.1)", const_values, const_names, NULL, NULL, NULL, NULL, NULL));
+    
+    for(i=0; i<1050; i++){
+        START_TIMER
+            ff_eval("1+(5-2)^(3-1)+1/2+sin(PI)-max(-2.2,-3.1)", const_values, const_names, NULL, NULL, NULL, NULL, NULL);
+        STOP_TIMER("ff_eval")
+    }
 }
+#endif
diff --git a/src/libffmpeg/libavcodec/ffv1.c b/src/libffmpeg/libavcodec/ffv1.c
index a85baea4b..6a4c6ed3f 100644
--- a/src/libffmpeg/libavcodec/ffv1.c
+++ b/src/libffmpeg/libavcodec/ffv1.c
@@ -25,9 +25,10 @@
  */
 
 #include "common.h"
+#include "bitstream.h"
 #include "avcodec.h"
 #include "dsputil.h"
-#include "cabac.h"
+#include "rangecoder.h"
 #include "golomb.h"
 
 #define MAX_PLANES 4
@@ -164,7 +165,7 @@ typedef struct PlaneContext{
 
 typedef struct FFV1Context{
     AVCodecContext *avctx;
-    CABACContext c;
+    RangeCoder c;
     GetBitContext gb;
     PutBitContext pb;
     int version;
@@ -218,57 +219,52 @@ static inline int get_context(FFV1Context *f, int_fast16_t *src, int_fast16_t *l
         return f->quant_table[0][(L-LT) & 0xFF] + f->quant_table[1][(LT-T) & 0xFF] + f->quant_table[2][(T-RT) & 0xFF];
 }
 
-/**
- * put 
- */
-static inline void put_symbol(CABACContext *c, uint8_t *state, int v, int is_signed, int max_exp){
+static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
     int i;
 
     if(v){
         const int a= ABS(v);
         const int e= av_log2(a);
-
-        put_cabac(c, state+0, 0);
+        put_rac(c, state+0, 0);
         
+        assert(e<=9);
+
         for(i=0; i<e; i++){
-            put_cabac(c, state+1+i, 1);  //1..8
+            put_rac(c, state+1+i, 1);  //1..10
         }
+        put_rac(c, state+1+i, 0);
 
-        if(e<max_exp){
-            put_cabac(c, state+1+i, 0);      //1..8
-
-            for(i=e-1; i>=0; i--){
-                put_cabac(c, state+16+e+i, (a>>i)&1); //17..29
-            }
-            if(is_signed)
-                put_cabac(c, state+9 + e, v < 0); //9..16
+        for(i=e-1; i>=0; i--){
+            put_rac(c, state+22+i, (a>>i)&1); //22..31
         }
+
+        if(is_signed)
+            put_rac(c, state+11 + e, v < 0); //11..21
     }else{
-        put_cabac(c, state+0, 1);
+        put_rac(c, state+0, 1);
     }
 }
 
-static inline int get_symbol(CABACContext *c, uint8_t *state, int is_signed, int max_exp){
-    if(get_cabac(c, state+0))
+static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    if(get_rac(c, state+0))
         return 0;
     else{
-        int i, e;
- 
-        for(e=0; e<max_exp; e++){ 
-            int a= 1<<e;
-
-            if(get_cabac(c, state + 1 + e)==0){ // 1..8
-                for(i=e-1; i>=0; i--){
-                    a += get_cabac(c, state+16+e+i)<<i; //17..29
-                }
+        int i, e, a;
+        e= 0; /* exponent: number of 1 bits read before the terminating 0 */
+        while(e<10 && get_rac(c, state+1 + e)){ //1..10; the e<10 bound stops a corrupt stream from indexing past state+10
+            e++;
+        }
+        assert(e<=9); /* a valid stream never reaches 10; put_symbol asserts the same limit when writing */
 
-                if(is_signed && get_cabac(c, state+9 + e)) //9..16
-                    return -a;
-                else
-                    return a;
-            }
+        a= 1;
+        for(i=e-1; i>=0; i--){
+            a += a + get_rac(c, state+22 + i); //22..31
         }
-        return -(1<<e);
+
+        if(is_signed && get_rac(c, state+11 + e)) //11..21
+            return -a;
+        else
+            return a;
     }
 }
 
@@ -324,10 +320,8 @@ static inline void put_vlc_symbol(PutBitContext *pb, VlcState * const state, int
      code= v ^ ((2*state->drift + state->count)>>31);
 #endif
     
-    code = -2*code-1;
-    code^= (code>>31);
 //printf("v:%d/%d bias:%d error:%d drift:%d count:%d k:%d\n", v, code, state->bias, state->error_sum, state->drift, state->count, k);
-    set_ur_golomb(pb, code, k, 12, bits);
+    set_sr_golomb(pb, code, k, 12, bits);
 
     update_vlc_state(state, v);
 }
@@ -344,13 +338,9 @@ static inline int get_vlc_symbol(GetBitContext *gb, VlcState * const state, int
 
     assert(k<=8);
 
-    v= get_ur_golomb(gb, k, 12, bits);
+    v= get_sr_golomb(gb, k, 12, bits);
 //printf("v:%d bias:%d error:%d drift:%d count:%d k:%d", v, state->bias, state->error_sum, state->drift, state->count, k);
 
-    v++;
-    if(v&1) v=  (v>>1);
-    else    v= -(v>>1);
-
 #if 0 // JPEG LS
     if(k==0 && 2*state->drift <= - state->count) v ^= (-1);
 #else
@@ -364,14 +354,26 @@ static inline int get_vlc_symbol(GetBitContext *gb, VlcState * const state, int
     return ret;
 }
 
-static inline void encode_line(FFV1Context *s, int w, int_fast16_t *sample[2], int plane_index, int bits){
+static inline int encode_line(FFV1Context *s, int w, int_fast16_t *sample[2], int plane_index, int bits){
     PlaneContext * const p= &s->plane[plane_index];
-    CABACContext * const c= &s->c;
+    RangeCoder * const c= &s->c;
     int x;
     int run_index= s->run_index;
     int run_count=0;
     int run_mode=0;
 
+    if(s->ac){
+        if(c->bytestream_end - c->bytestream < w*20){
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return -1;
+        }
+    }else{
+        if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < w*4){
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return -1;
+        }
+    }
+
     for(x=0; x<w; x++){
         int diff, context;
         
@@ -386,7 +388,7 @@ static inline void encode_line(FFV1Context *s, int w, int_fast16_t *sample[2], i
         diff= fold(diff, bits);
         
         if(s->ac){
-            put_symbol(c, p->state[context], diff, 1, bits-1);
+            put_symbol(c, p->state[context], diff, 1);
         }else{
             if(context == 0) run_mode=1;
             
@@ -426,11 +428,13 @@ static inline void encode_line(FFV1Context *s, int w, int_fast16_t *sample[2], i
             put_bits(&s->pb, 1, 1);
     }
     s->run_index= run_index;
+    
+    return 0;
 }
 
 static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h, int stride, int plane_index){
     int x,y,i;
-    const int ring_size=2;
+    const int ring_size= s->avctx->context_model ? 3 : 2;
     int_fast16_t sample_buffer[ring_size][w+6], *sample[ring_size];
     s->run_index=0;
     
@@ -453,7 +457,7 @@ static void encode_plane(FFV1Context *s, uint8_t *src, int w, int h, int stride,
 
 static void encode_rgb_frame(FFV1Context *s, uint32_t *src, int w, int h, int stride){
     int x, y, p, i;
-    const int ring_size=2;
+    const int ring_size= s->avctx->context_model ? 3 : 2;
     int_fast16_t sample_buffer[3][ring_size][w+6], *sample[3][ring_size];
     s->run_index=0;
     
@@ -490,32 +494,35 @@ static void encode_rgb_frame(FFV1Context *s, uint32_t *src, int w, int h, int st
     }
 }
 
-static void write_quant_table(CABACContext *c, int16_t *quant_table){
+static void write_quant_table(RangeCoder *c, int16_t *quant_table){
     int last=0;
     int i;
-    uint8_t state[CONTEXT_SIZE]={0};
+    uint8_t state[CONTEXT_SIZE];
+    memset(state, 128, sizeof(state));
 
     for(i=1; i<128 ; i++){
         if(quant_table[i] != quant_table[i-1]){
-            put_symbol(c, state, i-last-1, 0, 7);
+            put_symbol(c, state, i-last-1, 0);
             last= i;
         }
     }
-    put_symbol(c, state, i-last-1, 0, 7);
+    put_symbol(c, state, i-last-1, 0);
 }
 
 static void write_header(FFV1Context *f){
-    uint8_t state[CONTEXT_SIZE]={0};
+    uint8_t state[CONTEXT_SIZE];
     int i;
-    CABACContext * const c= &f->c;
+    RangeCoder * const c= &f->c;
 
-    put_symbol(c, state, f->version, 0, 7);
-    put_symbol(c, state, f->avctx->coder_type, 0, 7);
-    put_symbol(c, state, f->colorspace, 0, 7); //YUV cs type 
-    put_cabac(c, state, 1); //chroma planes
-        put_symbol(c, state, f->chroma_h_shift, 0, 7);
-        put_symbol(c, state, f->chroma_v_shift, 0, 7);
-    put_cabac(c, state, 0); //no transparency plane
+    memset(state, 128, sizeof(state));
+    
+    put_symbol(c, state, f->version, 0);
+    put_symbol(c, state, f->avctx->coder_type, 0);
+    put_symbol(c, state, f->colorspace, 0); //YUV cs type 
+    put_rac(c, state, 1); //chroma planes
+        put_symbol(c, state, f->chroma_h_shift, 0);
+        put_symbol(c, state, f->chroma_v_shift, 0);
+    put_rac(c, state, 0); //no transparency plane
 
     for(i=0; i<5; i++)
         write_quant_table(c, f->quant_table[i]);
@@ -543,6 +550,12 @@ static int encode_init(AVCodecContext *avctx)
     FFV1Context *s = avctx->priv_data;
     int i;
 
+    if(avctx->strict_std_compliance >= 0){
+        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it wont be decodeable with future versions!!!\n"
+               "use vstrict=-1 / -strict -1 to use it anyway\n");
+        return -1;
+    }
+        
     common_init(avctx);
  
     s->version=0;
@@ -609,13 +622,12 @@ static void clear_state(FFV1Context *f){
     for(i=0; i<f->plane_count; i++){
         PlaneContext *p= &f->plane[i];
 
-        p->interlace_bit_state[0]= 0;
-        p->interlace_bit_state[1]= 0;
+        p->interlace_bit_state[0]= 128;
+        p->interlace_bit_state[1]= 128;
         
         for(j=0; j<p->context_count; j++){
             if(f->ac){
-                memset(p->state[j], 0, sizeof(uint8_t)*CONTEXT_SIZE);
-                p->state[j][7] = 2*62;
+                memset(p->state[j], 128, sizeof(uint8_t)*CONTEXT_SIZE);
             }else{
                 p->vlc_state[j].drift= 0;
                 p->vlc_state[j].error_sum= 4; //FFMAX((RANGE + 32)/64, 2);
@@ -628,39 +640,33 @@ static void clear_state(FFV1Context *f){
 
 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
     FFV1Context *f = avctx->priv_data;
-    CABACContext * const c= &f->c;
+    RangeCoder * const c= &f->c;
     AVFrame *pict = data;
     const int width= f->width;
     const int height= f->height;
     AVFrame * const p= &f->picture;
     int used_count= 0;
+    uint8_t keystate=128;
+
+    ff_init_range_encoder(c, buf, buf_size);
+//    ff_init_cabac_states(c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
 
-    if(avctx->strict_std_compliance >= 0){
-        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it wont be decodeable with future versions!!!\n"
-               "use vstrict=-1 to use it anyway\n");
-        return -1;
-    }
-        
-    ff_init_cabac_encoder(c, buf, buf_size);
-    ff_init_cabac_states(c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
-    c->lps_state[2] = 1;
-    c->lps_state[3] = 0;
-    
     *p = *pict;
     p->pict_type= FF_I_TYPE;
     
     if(avctx->gop_size==0 || f->picture_number % avctx->gop_size == 0){
-        put_cabac_bypass(c, 1);
+        put_rac(c, &keystate, 1);
         p->key_frame= 1;
         write_header(f);
         clear_state(f);
     }else{
-        put_cabac_bypass(c, 0);
+        put_rac(c, &keystate, 0);
         p->key_frame= 0;
     }
 
     if(!f->ac){
-        used_count += put_cabac_terminate(c, 1);
+        used_count += ff_rac_terminate(c);
 //printf("pos=%d\n", used_count);
         init_put_bits(&f->pb, buf + used_count, buf_size - used_count);
     }
@@ -681,7 +687,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     f->picture_number++;
 
     if(f->ac){
-        return put_cabac_terminate(c, 1);
+        return ff_rac_terminate(c);
     }else{
         flush_put_bits(&f->pb); //nicer padding FIXME
         return used_count + (put_bits_count(&f->pb)+7)/8;
@@ -709,7 +715,7 @@ static int encode_end(AVCodecContext *avctx)
 
 static inline void decode_line(FFV1Context *s, int w, int_fast16_t *sample[2], int plane_index, int bits){
     PlaneContext * const p= &s->plane[plane_index];
-    CABACContext * const c= &s->c;
+    RangeCoder * const c= &s->c;
     int x;
     int run_count=0;
     int run_mode=0;
@@ -726,9 +732,9 @@ static inline void decode_line(FFV1Context *s, int w, int_fast16_t *sample[2], i
             sign=0;
         
 
-        if(s->ac)
-            diff= get_symbol(c, p->state[context], 1, bits-1);
-        else{
+        if(s->ac){
+            diff= get_symbol(c, p->state[context], 1);
+        }else{
             if(context == 0 && run_mode==0) run_mode=1;
             
             if(run_mode){
@@ -833,13 +839,15 @@ static void decode_rgb_frame(FFV1Context *s, uint32_t *src, int w, int h, int st
     }
 }
 
-static int read_quant_table(CABACContext *c, int16_t *quant_table, int scale){
+static int read_quant_table(RangeCoder *c, int16_t *quant_table, int scale){
     int v;
     int i=0;
-    uint8_t state[CONTEXT_SIZE]={0};
+    uint8_t state[CONTEXT_SIZE];
+
+    memset(state, 128, sizeof(state));
 
     for(v=0; i<128 ; v++){
-        int len= get_symbol(c, state, 0, 7) + 1;
+        int len= get_symbol(c, state, 0) + 1;
 
         if(len + i > 128) return -1;
         
@@ -860,17 +868,19 @@ static int read_quant_table(CABACContext *c, int16_t *quant_table, int scale){
 }
 
 static int read_header(FFV1Context *f){
-    uint8_t state[CONTEXT_SIZE]={0};
+    uint8_t state[CONTEXT_SIZE];
     int i, context_count;
-    CABACContext * const c= &f->c;
+    RangeCoder * const c= &f->c;
     
-    f->version= get_symbol(c, state, 0, 7);
-    f->ac= f->avctx->coder_type= get_symbol(c, state, 0, 7);
-    f->colorspace= get_symbol(c, state, 0, 7); //YUV cs type
-    get_cabac(c, state); //no chroma = false
-    f->chroma_h_shift= get_symbol(c, state, 0, 7);
-    f->chroma_v_shift= get_symbol(c, state, 0, 7);
-    get_cabac(c, state); //transparency plane
+    memset(state, 128, sizeof(state));
+
+    f->version= get_symbol(c, state, 0);
+    f->ac= f->avctx->coder_type= get_symbol(c, state, 0);
+    f->colorspace= get_symbol(c, state, 0); //YUV cs type
+    get_rac(c, state); //no chroma = false
+    f->chroma_h_shift= get_symbol(c, state, 0);
+    f->chroma_v_shift= get_symbol(c, state, 0);
+    get_rac(c, state); //transparency plane
     f->plane_count= 2;
 
     if(f->colorspace==0){
@@ -879,7 +889,7 @@ static int read_header(FFV1Context *f){
         case 0x10: f->avctx->pix_fmt= PIX_FMT_YUV422P; break;
         case 0x11: f->avctx->pix_fmt= PIX_FMT_YUV420P; break;
         case 0x20: f->avctx->pix_fmt= PIX_FMT_YUV411P; break;
-        case 0x33: f->avctx->pix_fmt= PIX_FMT_YUV410P; break;
+        case 0x22: f->avctx->pix_fmt= PIX_FMT_YUV410P; break;
         default:
             av_log(f->avctx, AV_LOG_ERROR, "format not supported\n");
             return -1;
@@ -900,7 +910,7 @@ static int read_header(FFV1Context *f){
     context_count=1;
     for(i=0; i<5; i++){
         context_count*= read_quant_table(c, f->quant_table[i], context_count);
-        if(context_count < 0){
+        if(context_count < 0 || context_count > 32768){
             av_log(f->avctx, AV_LOG_ERROR, "read_quant_table error\n");
             return -1;
         }
@@ -933,26 +943,21 @@ static int decode_init(AVCodecContext *avctx)
 
 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
     FFV1Context *f = avctx->priv_data;
-    CABACContext * const c= &f->c;
+    RangeCoder * const c= &f->c;
     const int width= f->width;
     const int height= f->height;
     AVFrame * const p= &f->picture;
     int bytes_read;
+    uint8_t keystate= 128;
 
     AVFrame *picture = data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
-    ff_init_cabac_decoder(c, buf, buf_size);
-    ff_init_cabac_states(c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
-    c->lps_state[2] = 1;
-    c->lps_state[3] = 0;
+    ff_init_range_decoder(c, buf, buf_size);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
 
 
     p->pict_type= FF_I_TYPE; //FIXME I vs. P
-    if(get_cabac_bypass(c)){
+    if(get_rac(c, &keystate)){
         p->key_frame= 1;
         read_header(f);
         clear_state(f);
@@ -970,8 +975,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
         av_log(avctx, AV_LOG_ERROR, "keyframe:%d coder:%d\n", p->key_frame, f->ac);
     
     if(!f->ac){
-        bytes_read = get_cabac_terminate(c);
-        if(bytes_read ==0) av_log(avctx, AV_LOG_ERROR, "error at end of AC stream\n");
+        bytes_read = c->bytestream - c->bytestream_start - 1;
+        if(bytes_read ==0) av_log(avctx, AV_LOG_ERROR, "error at end of AC stream\n"); //FIXME
 //printf("pos=%d\n", bytes_read);
         init_get_bits(&f->gb, buf + bytes_read, buf_size - bytes_read);
     } else {
@@ -1000,7 +1005,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
     *data_size = sizeof(AVFrame);
     
     if(f->ac){
-        bytes_read= get_cabac_terminate(c);
+        bytes_read= c->bytestream - c->bytestream_start - 1;
         if(bytes_read ==0) av_log(f->avctx, AV_LOG_ERROR, "error at end of frame\n");
     }else{
         bytes_read+= (get_bits_count(&f->gb)+7)/8;
diff --git a/src/libffmpeg/libavcodec/flac.c b/src/libffmpeg/libavcodec/flac.c
index 464d7999d..894da9384 100644
--- a/src/libffmpeg/libavcodec/flac.c
+++ b/src/libffmpeg/libavcodec/flac.c
@@ -34,6 +34,7 @@
 #include <limits.h>
  
 #include "avcodec.h"
+#include "bitstream.h"
 #include "golomb.h"
 
 #undef NDEBUG
@@ -142,6 +143,29 @@ static int64_t get_utf8(GetBitContext *gb)
     return val;
 }
 
+#if 0
+static int skip_utf8(GetBitContext *gb)
+{
+    int ones=0, bytes;
+    
+    while(get_bits1(gb))
+        ones++;
+
+    if     (ones==0) bytes=0;
+    else if(ones==1) return -1;
+    else             bytes= ones - 1;
+    
+    skip_bits(gb, 7-ones);
+    while(bytes--){
+        const int tmp = get_bits(gb, 8);
+        
+        if((tmp>>6) != 2)
+            return -1;
+    }
+    return 0;
+}
+#endif
+
 static int get_crc8(const uint8_t *buf, int count){
     int crc=0;
     int i;
@@ -569,12 +593,12 @@ static int flac_decode_frame(AVCodecContext *avctx,
     int16_t *samples = data;
 
     if(s->max_framesize == 0){
-        s->max_framesize= 8192; // should hopefully be enough for the first header
+        s->max_framesize= 65536; // should hopefully be enough for the first header
         s->bitstream= av_fast_realloc(s->bitstream, &s->allocated_bitstream_size, s->max_framesize);
     }
 
     if(1 && s->max_framesize){//FIXME truncated
-            buf_size= FFMIN(buf_size, s->max_framesize - s->bitstream_size);
+            buf_size= FFMAX(FFMIN(buf_size, s->max_framesize - s->bitstream_size), 0);
             input_buf_size= buf_size;
 
             if(s->bitstream_index + s->bitstream_size + buf_size > s->allocated_bitstream_size){
@@ -612,10 +636,20 @@ static int flac_decode_frame(AVCodecContext *avctx,
             if(metadata_size){
                 switch(metadata_type)
                 {
-                case METADATA_TYPE_STREAMINFO:
+                case METADATA_TYPE_STREAMINFO:{
                     metadata_streaminfo(s);
+
+                    /* Buffer might have been reallocated, reinit bitreader */
+                    if(buf != &s->bitstream[s->bitstream_index])
+                    {
+                        int bits_count = get_bits_count(&s->gb);
+                        buf= &s->bitstream[s->bitstream_index];
+                        init_get_bits(&s->gb, buf, buf_size*8);
+                        skip_bits(&s->gb, bits_count);
+                    }
+ 
                     dump_headers(s);
-                    break;
+                    break;}
                 default:
                     for(i=0; i<metadata_size; i++)
                         skip_bits(&s->gb, 8);
diff --git a/src/libffmpeg/libavcodec/flicvideo.c b/src/libffmpeg/libavcodec/flicvideo.c
index 99825cebc..92cb8bd0b 100644
--- a/src/libffmpeg/libavcodec/flicvideo.c
+++ b/src/libffmpeg/libavcodec/flicvideo.c
@@ -176,7 +176,7 @@ static int flic_decode_frame(AVCodecContext *avctx,
                 for (j = 0; j < color_changes; j++) {
 
                     /* wrap around, for good measure */
-                    if (palette_ptr >= 256)
+                    if ((unsigned)palette_ptr >= 256)
                         palette_ptr = 0;
 
                     r = buf[stream_ptr++] << color_shift;
diff --git a/src/libffmpeg/libavcodec/g726.c b/src/libffmpeg/libavcodec/g726.c
index c016f32cf..bc9374d3e 100644
--- a/src/libffmpeg/libavcodec/g726.c
+++ b/src/libffmpeg/libavcodec/g726.c
@@ -22,16 +22,18 @@
 #include <limits.h>
 #include "avcodec.h"
 #include "common.h"
+#include "bitstream.h"
 
-/*
+/**
+ * G.726 11bit float.
  * G.726 Standard uses rather odd 11bit floating point arithmentic for 
  * numerous occasions. It's a mistery to me why they did it this way
  * instead of simply using 32bit integer arithmetic.
  */
 typedef struct Float11 {
-	int sign;   /* 1bit sign */
-	int exp;    /* 4bit exponent */
-	int mant;   /* 6bit mantissa */
+	int sign;   /**< 1bit sign */
+	int exp;    /**< 4bit exponent */
+	int mant;   /**< 6bit mantissa */
 } Float11;
 
 static inline Float11* i2f(int16_t i, Float11* f)
@@ -61,35 +63,35 @@ static inline int sgn(int value)
 }
 
 typedef struct G726Tables {
-	int  bits;            /* bits per sample */
-	int* quant;           /* quantization table */
-	int* iquant;          /* inverse quantization table */
-	int* W;               /* special table #1 ;-) */
-	int* F;               /* special table #2 */
+	int  bits;            /**< bits per sample */
+	int* quant;           /**< quantization table */
+	int* iquant;          /**< inverse quantization table */
+	int* W;               /**< special table #1 ;-) */
+	int* F;               /**< special table #2 */
 } G726Tables;
 
 typedef struct G726Context {
-	 G726Tables* tbls;    /* static tables needed for computation */
+	 G726Tables* tbls;    /**< static tables needed for computation */
 	 
-	 Float11 sr[2];       /* prev. reconstructed samples */
-	 Float11 dq[6];       /* prev. difference */
-	 int a[2];            /* second order predictor coeffs */
-	 int b[6];            /* sixth order predictor coeffs */
-	 int pk[2];           /* signs of prev. 2 sez + dq */
+	 Float11 sr[2];       /**< prev. reconstructed samples */
+	 Float11 dq[6];       /**< prev. difference */
+	 int a[2];            /**< second order predictor coeffs */
+	 int b[6];            /**< sixth order predictor coeffs */
+	 int pk[2];           /**< signs of prev. 2 sez + dq */
 	 
-	 int ap;              /* scale factor control */
-	 int yu;              /* fast scale factor */
-	 int yl;              /* slow scale factor */
-	 int dms;             /* short average magnitude of F[i] */
-	 int dml;             /* long average magnitude of F[i] */
-	 int td;              /* tone detect */
-
-	 int se;              /* estimated signal for the next iteration */
-	 int sez;             /* estimated second order prediction */
-	 int y;               /* quantizer scaling factor for the next iteration */
+	 int ap;              /**< scale factor control */
+	 int yu;              /**< fast scale factor */
+	 int yl;              /**< slow scale factor */
+	 int dms;             /**< short average magnitude of F[i] */
+	 int dml;             /**< long average magnitude of F[i] */
+	 int td;              /**< tone detect */
+
+	 int se;              /**< estimated signal for the next iteration */
+	 int sez;             /**< estimated second order prediction */
+	 int y;               /**< quantizer scaling factor for the next iteration */
 } G726Context;
 
-static int quant_tbl16[] =                       /* 16kbit/s 2bits per sample */
+static int quant_tbl16[] =                       /**< 16kbit/s 2bits per sample */
            { 260, INT_MAX }; 
 static int iquant_tbl16[] =
            { 116, 365, 365, 116 };
@@ -98,7 +100,7 @@ static int W_tbl16[] =
 static int F_tbl16[] =
            { 0, 7, 7, 0 };
 	   
-static int quant_tbl24[] =                       /* 24kbit/s 3bits per sample */
+static int quant_tbl24[] =                       /**< 24kbit/s 3bits per sample */
            {  7, 217, 330, INT_MAX };
 static int iquant_tbl24[] =
            { INT_MIN, 135, 273, 373, 373, 273, 135, INT_MIN };
@@ -107,7 +109,7 @@ static int W_tbl24[] =
 static int F_tbl24[] =
            { 0, 1, 2, 7, 7, 2, 1, 0 };
 	   
-static int quant_tbl32[] =                       /* 32kbit/s 4bits per sample */
+static int quant_tbl32[] =                       /**< 32kbit/s 4bits per sample */
            { -125,  79, 177, 245, 299, 348, 399, INT_MAX };
 static int iquant_tbl32[] =
            { INT_MIN,   4, 135, 213, 273, 323, 373, 425,  
@@ -118,7 +120,7 @@ static int W_tbl32[] =
 static int F_tbl32[] = 
            { 0, 0, 0, 1, 1, 1, 3, 7, 7, 3, 1, 1, 1, 0, 0, 0 };
 	   
-static int quant_tbl40[] =                      /* 40kbit/s 5bits per sample */
+static int quant_tbl40[] =                      /**< 40kbit/s 5bits per sample */
            { -122, -16,  67, 138, 197, 249, 297, 338,
 	      377, 412, 444, 474, 501, 527, 552, INT_MAX };
 static int iquant_tbl40[] =
@@ -142,7 +144,7 @@ static G726Tables G726Tables_pool[] =
             { 5, quant_tbl40, iquant_tbl40, W_tbl40, F_tbl40 }};
 					       
 
-/*
+/**
  * Para 4.2.2 page 18: Adaptive quantizer. 
  */
 static inline uint8_t quant(G726Context* c, int d)
@@ -168,7 +170,7 @@ static inline uint8_t quant(G726Context* c, int d)
    return i; 
 }
 
-/* 
+/**
  * Para 4.2.3 page 22: Inverse adaptive quantizer.
  */
 static inline int16_t inverse_quant(G726Context* c, int i)
diff --git a/src/libffmpeg/libavcodec/golomb.h b/src/libffmpeg/libavcodec/golomb.h
index cd8bdd38d..1204a52e2 100644
--- a/src/libffmpeg/libavcodec/golomb.h
+++ b/src/libffmpeg/libavcodec/golomb.h
@@ -1,6 +1,7 @@
 /*
  * exp golomb vlc stuff
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2004 Alex Beregszaszi
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -22,7 +23,7 @@
  * @file golomb.h
  * @brief 
  *     exp golomb vlc stuff
- * @author Michael Niedermayer <michaelni@gmx.at>
+ * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi
  */
 
 #define INVALID_VLC           0x80000000
@@ -80,7 +81,10 @@ static inline int svq3_get_ue_golomb(GetBitContext *gb){
         
         return ff_interleaved_ue_golomb_vlc_code[buf];
     }else{
-        buf|=1;
+        LAST_SKIP_BITS(re, gb, 8);
+        UPDATE_CACHE(re, gb);
+        buf |= 1 | (GET_CACHE(re, gb) >> 8);
+
         if((buf & 0xAAAAAAAA) == 0)
             return INVALID_VLC;
 
@@ -88,7 +92,7 @@ static inline int svq3_get_ue_golomb(GetBitContext *gb){
             buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
         }
 
-        LAST_SKIP_BITS(re, gb, 63 - 2*log);
+        LAST_SKIP_BITS(re, gb, 63 - 2*log - 8);
         CLOSE_READER(re, gb);
 
         return ((buf << log) >> log) - 1;
@@ -163,7 +167,10 @@ static inline int svq3_get_se_golomb(GetBitContext *gb){
         
         return ff_interleaved_se_golomb_vlc_code[buf];
     }else{
-        buf |=1;
+        LAST_SKIP_BITS(re, gb, 8);
+        UPDATE_CACHE(re, gb);
+        buf |= 1 | (GET_CACHE(re, gb) >> 8);
+
         if((buf & 0xAAAAAAAA) == 0)
             return INVALID_VLC;
 
@@ -171,7 +178,7 @@ static inline int svq3_get_se_golomb(GetBitContext *gb){
             buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
         }
 
-        LAST_SKIP_BITS(re, gb, 63 - 2*log);
+        LAST_SKIP_BITS(re, gb, 63 - 2*log - 8);
         CLOSE_READER(re, gb);
 
         return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1;
@@ -257,16 +264,50 @@ static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit, int
 }
 
 /**
- * read unsigned golomb rice code (flac).
+ * read signed golomb rice code (ffv1).
+ */
+static inline int get_sr_golomb(GetBitContext *gb, int k, int limit, int esc_len){
+    /* unsigned code u is mapped as 0, 1, 2, 3, 4 ... -> 0, -1, 1, -2, 2 ... */
+    int code = get_ur_golomb(gb, k, limit, esc_len);
+    int half;
+
+    code++;
+    half = code>>1;
+    return (code&1) ? half : -half;
+}
+
+/**
+ * read signed golomb rice code (flac).
  */
 static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){
     int v= get_ur_golomb_jpegls(gb, k, limit, esc_len);
     return (v>>1) ^ -(v&1);
 }
 
+/**
+ * read unsigned golomb rice code (shorten): jpegls reader with limit=INT_MAX, esc_len=0.
+ */
+static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){
+	return get_ur_golomb_jpegls(gb, k, INT_MAX, 0);
+}
+
+/**
+ * read signed golomb rice code (shorten).
+ */
+static inline int get_sr_golomb_shorten(GetBitContext* gb, int k)
+{
+    /* read with k + 1; even codes give code/2, odd codes give ~(code/2) */
+    const int code = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
+    const int mask = -(code & 1);   /* all ones for odd codes, zero otherwise */
+
+    return (code >> 1) ^ mask;
+}
+
+
+
 #ifdef TRACE
 
-static inline int get_ue(GetBitContext *s, char *file, char *func, int line){
+static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){
     int show= show_bits(s, 24);
     int pos= get_bits_count(s);
     int i= get_ue_golomb(s);
@@ -275,12 +316,12 @@ static inline int get_ue(GetBitContext *s, char *file, char *func, int line){
     
     print_bin(bits, len);
     
-    printf("%5d %2d %3d ue  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
     
     return i;
 }
 
-static inline int get_se(GetBitContext *s, char *file, char *func, int line){
+static inline int get_se(GetBitContext *s, char *file, const char *func, int line){
     int show= show_bits(s, 24);
     int pos= get_bits_count(s);
     int i= get_se_golomb(s);
@@ -289,12 +330,12 @@ static inline int get_se(GetBitContext *s, char *file, char *func, int line){
     
     print_bin(bits, len);
     
-    printf("%5d %2d %3d se  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
     
     return i;
 }
 
-static inline int get_te(GetBitContext *s, int r, char *file, char *func, int line){
+static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){
     int show= show_bits(s, 24);
     int pos= get_bits_count(s);
     int i= get_te0_golomb(s, r);
@@ -303,7 +344,7 @@ static inline int get_te(GetBitContext *s, int r, char *file, char *func, int li
     
     print_bin(bits, len);
     
-    printf("%5d %2d %3d te  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
     
     return i;
 }
@@ -400,3 +441,27 @@ static inline void set_ur_golomb_jpegls(PutBitContext *pb, int i, int k, int lim
         put_bits(pb, esc_len, i - 1);
     }
 }
+
+/**
+ * write signed golomb rice code (ffv1).
+ */
+static inline void set_sr_golomb(PutBitContext *pb, int i, int k, int limit, int esc_len){
+    /* fold the sign into the low bit: 0, -1, 1, -2, 2 ... -> 0, 1, 2, 3, 4 ... */
+    int folded = -2*i-1;
+
+    folded ^= folded>>31;   /* negative intermediate (i >= 0): complement gives 2*i */
+
+    set_ur_golomb(pb, folded, k, limit, esc_len);
+}
+
+/**
+ * write signed golomb rice code (flac).
+ */
+static inline void set_sr_golomb_flac(PutBitContext *pb, int i, int k, int limit, int esc_len){
+    /* same sign folding as set_sr_golomb, written with the jpegls style coder */
+    int folded = -2*i-1;
+
+    folded ^= folded>>31;   /* i >= 0 becomes 2*i, i < 0 stays -2*i-1 */
+
+    set_ur_golomb_jpegls(pb, folded, k, limit, esc_len);
+}
diff --git a/src/libffmpeg/libavcodec/h261.c b/src/libffmpeg/libavcodec/h261.c
new file mode 100644
index 000000000..aceebaa38
--- /dev/null
+++ b/src/libffmpeg/libavcodec/h261.c
@@ -0,0 +1,1041 @@
+/*
+ * H261 decoder
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2004 Maarten Daniels
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**
+ * @file h261.c
+ * h261codec.
+ */
+
+#include "common.h"
+#include "dsputil.h"
+#include "avcodec.h"
+#include "mpegvideo.h"
+#include "h261data.h"
+
+
+#define H261_MBA_VLC_BITS 9
+#define H261_MTYPE_VLC_BITS 6
+#define H261_MV_VLC_BITS 7
+#define H261_CBP_VLC_BITS 9
+#define TCOEFF_VLC_BITS 9
+
+#define MBA_STUFFING 33
+#define MBA_STARTCODE 34
+#define IS_FIL(a)    ((a)&MB_TYPE_H261_FIL)
+
+/**
+ * H261Context: state used by the H.261 encoder and decoder functions of this file.
+ */
+typedef struct H261Context{
+    MpegEncContext s;           // must stay the first member: MpegEncContext* is cast to H261Context*
+
+    int current_mba;            // macroblock address inside the current GOB, reset to 0 by each GOB header
+    int previous_mba;           // encoder: address of the last macroblock that was coded (not skipped)
+    int mba_diff;               // decoder: address increment read for the current macroblock
+    int mtype;                  // macroblock type: table index while coding, then the h261_mtype_map[] flags
+    int current_mv_x;           // last motion vector, x component (predictor for the next difference)
+    int current_mv_y;           // last motion vector, y component
+    int gob_number;             // GN of the current group of blocks
+    int gob_start_code_skipped; // 1 if gob start code is already read before gob header is read
+}H261Context;
+
+void ff_h261_loop_filter(MpegEncContext *s){
+    /* Runs the loop filter over the six 8x8 blocks of the current macroblock. */
+    H261Context * const ctx = (H261Context*)s;
+    const int y_stride = s->linesize;
+    const int c_stride = s->uvlinesize;
+    uint8_t * const luma = s->dest[0];
+    int blk;
+
+    /* only macroblock types carrying the FIL flag are filtered */
+    if(!IS_FIL(ctx->mtype))
+        return;
+
+    /* four luma blocks in the order (0,0) (8,0) (0,8) (8,8), then Cb and Cr */
+    for(blk=0; blk<4; blk++)
+        s->dsp.h261_loop_filter(luma + (blk>>1)*8*y_stride + (blk&1)*8, y_stride);
+    s->dsp.h261_loop_filter(s->dest[1], c_stride);
+    s->dsp.h261_loop_filter(s->dest[2], c_stride);
+}
+
+static int ff_h261_get_picture_format(int width, int height){
+    /* Maps the picture size to the format bit of the picture header, -1 if unsupported. */
+    int format = -1;                    // ERROR unless one of the sizes below matches
+
+    if (width == 176 && height == 144)
+        format = 0;                     // QCIF
+    if (width == 352 && height == 288)
+        format = 1;                     // CIF
+
+    return format;
+}
+
+static void h261_encode_block(H261Context * h, DCTELEM * block,
+                              int n);
+static int h261_decode_block(H261Context *h, DCTELEM *block,
+                             int n, int coded);
+
+void ff_h261_encode_picture_header(MpegEncContext * s, int picture_number){
+    H261Context * h = (H261Context *) s;
+    int format, temp_ref;
+    /* Writes the picture header; the picture_number argument is not read, s->picture_number is used. */
+    align_put_bits(&s->pb);
+
+    /* Update the pointer to last GOB */
+    s->ptr_lastgob = pbBufPtr(&s->pb);
+
+    put_bits(&s->pb, 20, 0x10); /* PSC */
+    /* picture number rescaled by 30000/1001 and the frame rate; only the low 5 bits are sent */
+    temp_ref= s->picture_number * (int64_t)30000 * s->avctx->frame_rate_base / 
+                         (1001 * (int64_t)s->avctx->frame_rate);
+    put_bits(&s->pb, 5, temp_ref & 0x1f); /* TemporalReference */
+
+    put_bits(&s->pb, 1, 0); /* split screen off */
+    put_bits(&s->pb, 1, 0); /* camera  off */
+    put_bits(&s->pb, 1, 0); /* freeze picture release off */
+    /* NOTE(review): format is -1 for any size other than QCIF/CIF - presumably rejected by the caller, confirm */
+    format = ff_h261_get_picture_format(s->width, s->height);
+    
+    put_bits(&s->pb, 1, format); /* 0 == QCIF, 1 == CIF */
+
+    put_bits(&s->pb, 1, 0); /* still image mode */
+    put_bits(&s->pb, 1, 0); /* reserved */
+
+    put_bits(&s->pb, 1, 0); /* no PEI */    
+    if(format == 0)
+        h->gob_number = -1; /* QCIF: h261_encode_gob_header adds 2, giving GN 1, 3, 5 */
+    else
+        h->gob_number = 0;  /* otherwise it adds 1, giving GN 1, 2, 3 ... */
+    h->current_mba = 0;
+}
+
+/**
+ * Writes a group of blocks header and resets the per-GOB prediction state.
+ */
+static void h261_encode_gob_header(MpegEncContext * s, int mb_line){
+    H261Context * const ctx = (H261Context *)s;
+    const int is_qcif = ff_h261_get_picture_format(s->width, s->height) == 0;
+
+    /* QCIF advances the GOB number by two, every other format by one */
+    ctx->gob_number += is_qcif ? 2 : 1;
+
+    put_bits(&s->pb, 16, 1);               /* GBSC */
+    put_bits(&s->pb,  4, ctx->gob_number); /* GN */
+    put_bits(&s->pb,  5, s->qscale);       /* GQUANT */
+    put_bits(&s->pb,  1, 0);               /* no GEI */
+
+    /* macroblock addressing and motion vector prediction restart in every GOB */
+    ctx->current_mba  = 0;
+    ctx->previous_mba = 0;
+    ctx->current_mv_x = ctx->current_mv_y = 0;
+}
+
+void ff_h261_reorder_mb_index(MpegEncContext* s){
+    int index= s->mb_x + s->mb_y*s->mb_width;
+    /* a GOB header is written before every 33rd macroblock (raster index 0, 33, 66 ...) */
+    if(index % 33 == 0)
+        h261_encode_gob_header(s,0);
+
+    /* for CIF the GOB's are fragmented in the middle of a scanline
+       that's why we need to adjust the x and y index of the macroblocks */
+    if(ff_h261_get_picture_format(s->width,s->height) == 1){ // CIF
+        s->mb_x =     index % 11 ; index /= 11; // column inside the GOB (11 macroblocks wide)
+        s->mb_y =     index %  3 ; index /=  3; // row inside the GOB (3 macroblocks high)
+        s->mb_x+= 11*(index %  2); index /=  2; // GOBs alternate between left and right half
+        s->mb_y+=  3*index;                     // remaining quotient selects the GOB row
+        
+        ff_init_block_index(s);
+        ff_update_block_index(s);
+    }
+}
+
+static void h261_encode_motion(H261Context * h, int val){
+    /* Writes one vector difference: magnitude code from h261_mv_tab, then a sign bit unless it is 0. */
+    MpegEncContext * const enc = &h->s;
+    int magnitude, negative;
+    if(val == 0){
+        put_bits(&enc->pb, h261_mv_tab[0][1], h261_mv_tab[0][0]);
+        return;
+    }
+    /* values outside -16..15 are shifted by 32 once */
+    if(val > 15)
+        val -= 32;
+    if(val < -16)
+        val += 32;
+    negative  = val < 0;
+    magnitude = negative ? -val : val;
+    put_bits(&enc->pb, h261_mv_tab[magnitude][1], h261_mv_tab[magnitude][0]);
+    put_bits(&enc->pb, 1, negative);
+}
+
+static inline int get_cbp(MpegEncContext * s,
+                      DCTELEM block[6][64])
+{
+    /* coded block pattern: bit (5-n) is set when block n holds at least one coefficient */
+    int n, pattern = 0;
+
+    for (n = 5; n >= 0; n--)
+        if (s->block_last_index[n] >= 0)
+            pattern |= 0x20 >> n;
+    return pattern;
+}
+void ff_h261_encode_mb(MpegEncContext * s,
+         DCTELEM block[6][64],
+         int motion_x, int motion_y)
+{
+    H261Context * h = (H261Context *)s;
+    int mvd, mv_diff_x, mv_diff_y, i, cbp;
+    cbp = 63; // avoid warning
+    mvd = 0;
+    /* Encodes one macroblock: MBA, MTYPE, optional MQUANT / MVD / CBP, then the six blocks. */
+    h->current_mba++;
+    h->mtype = 0;
+ 
+    if (!s->mb_intra){
+        /* compute cbp */
+        cbp= get_cbp(s, block);
+   
+        /* mvd indicates if this block is motion compensated */
+        mvd = motion_x | motion_y;
+
+        if((cbp | mvd | s->dquant ) == 0) {
+            /* skip macroblock */
+            s->skip_count++;
+            h->current_mv_x=0;
+            h->current_mv_y=0;
+            return;
+        }
+    }
+
+    /* MB is not skipped, encode MBA */
+    put_bits(&s->pb, h261_mba_bits[(h->current_mba-h->previous_mba)-1], h261_mba_code[(h->current_mba-h->previous_mba)-1]);
+ 
+    /* calculate MTYPE */
+    if(!s->mb_intra){
+        h->mtype++;
+        
+        if(mvd || s->loop_filter)
+            h->mtype+=3;
+        if(s->loop_filter)
+            h->mtype+=3;
+        if(cbp || s->dquant) /* NOTE(review): cbp==0 with dquant!=0 may pick a type with CBP; cbp-1 below would be -1 - confirm */
+            h->mtype++;
+        assert(h->mtype > 1);
+    }
+
+    if(s->dquant) 
+        h->mtype++;
+
+    put_bits(&s->pb, h261_mtype_bits[h->mtype], h261_mtype_code[h->mtype]);
+ 
+    h->mtype = h261_mtype_map[h->mtype]; /* from here on mtype holds the mapped flags, not the table index */
+ 
+    if(IS_QUANT(h->mtype)){
+        ff_set_qscale(s,s->qscale+s->dquant);
+        put_bits(&s->pb, 5, s->qscale);
+    }
+ 
+    if(IS_16X16(h->mtype)){
+        /* vectors are halved before coding; the difference to the previous vector is written */
+        mv_diff_x = (motion_x >> 1) - h->current_mv_x;
+        mv_diff_y = (motion_y >> 1) - h->current_mv_y;
+        h->current_mv_x = (motion_x >> 1);
+        h->current_mv_y = (motion_y >> 1);
+        h261_encode_motion(h,mv_diff_x);
+        h261_encode_motion(h,mv_diff_y);
+    }
+ 
+    h->previous_mba = h->current_mba;
+ 
+    if(HAS_CBP(h->mtype)){
+        put_bits(&s->pb,h261_cbp_tab[cbp-1][1],h261_cbp_tab[cbp-1][0]); 
+    }
+    for(i=0; i<6; i++) {
+        /* encode each block */
+        h261_encode_block(h, block[i], i);
+    }
+
+    /* the vector predictor is cleared after macroblocks 11, 22 and 33 and after any non-MC macroblock */
+    if ( ( h->current_mba == 11 ) || ( h->current_mba == 22 ) || ( h->current_mba == 33 ) || ( !IS_16X16 ( h->mtype ) )){
+        h->current_mv_x=0;
+        h->current_mv_y=0;
+    }
+}
+
+void ff_h261_encode_init(MpegEncContext *s){
+    static int tables_ready = 0;
+
+    /* the static flag makes sure init_rl() runs on the first call only */
+    if (tables_ready == 0) {
+        tables_ready = 1;
+        init_rl(&h261_rl_tcoeff, 1);
+    }
+
+    s->c_dc_scale_table = ff_mpeg1_dc_scale_table;   /* luma and chroma share one table */
+    s->y_dc_scale_table = s->c_dc_scale_table;
+    s->max_qcoeff =  127;
+    s->min_qcoeff = -127;
+}
+
+
+/**
+ * encodes a 8x8 block.
+ * @param block the 8x8 block
+ * @param n block index (0-3 are luma, 4-5 are chroma)
+ */
+static void h261_encode_block(H261Context * h, DCTELEM * block, int n){
+    MpegEncContext * const s = &h->s;
+    int level, run, last, i, j, last_index, last_non_zero, sign, slevel, code;
+    RLTable *rl;
+
+    rl = &h261_rl_tcoeff;
+    if (s->mb_intra) {
+        /* DC coef */
+        level = block[0];
+        /* 255 cannot be represented, so we clamp */
+        if (level > 254) {
+            level = 254;
+            block[0] = 254;
+        }
+        /* 0 cannot be represented also */
+        else if (level < 1) {
+            level = 1;
+            block[0] = 1;
+        }
+        if (level == 128)
+            put_bits(&s->pb, 8, 0xff); /* 128 is written as 0xff instead of 0x80 */
+        else
+            put_bits(&s->pb, 8, level);
+        i = 1;
+    } else if((block[0]==1 || block[0] == -1) && (s->block_last_index[n] > -1)){
+        //special case: a first coefficient of +-1 is written as 2 bits, binary 10 for +1 and 11 for -1
+        put_bits(&s->pb,2,block[0]>0 ? 2 : 3 );
+        i = 1;
+    } else {
+        i = 0;
+    }
+   
+    /* AC coefs */
+    last_index = s->block_last_index[n];
+    last_non_zero = i - 1;
+    for (; i <= last_index; i++) {
+        j = s->intra_scantable.permutated[i];
+        level = block[j];
+        if (level) {
+            run = i - last_non_zero - 1;
+            last = (i == last_index); /* assigned but not read again in this function */
+            sign = 0;
+            slevel = level;
+            if (level < 0) {
+                sign = 1;
+                level = -level;
+            }
+            code = get_rl_index(rl, 0 /*no last in H.261, EOB is used*/, run, level);
+            if(run==0 && level < 16) /* NOTE(review): index shifted by one for run 0 - presumably skips the EOB entry, confirm with h261data.h */
+            code+=1;
+            put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+            if (code == rl->n) {
+                /* escape: 6 bits run followed by the signed level in 8 bits */
+                put_bits(&s->pb, 6, run);
+                assert(slevel != 0);
+                assert(level <= 127);
+                put_bits(&s->pb, 8, slevel & 0xff);
+            } else {
+                put_bits(&s->pb, 1, sign);
+            }
+            last_non_zero = i;
+        }
+    }
+    if(last_index > -1){
+        put_bits(&s->pb, rl->table_vlc[0][1], rl->table_vlc[0][0]);// END OF BLOCK
+    }
+}
+
+/***********************************************/
+/* decoding */
+
+static VLC h261_mba_vlc;
+static VLC h261_mtype_vlc;
+static VLC h261_mv_vlc;
+static VLC h261_cbp_vlc;
+
+void init_vlc_rl(RLTable *rl, int use_static);
+
+static void h261_decode_init_vlc(H261Context *h){
+    static int done = 0;
+    /* Builds the static decoding tables on the first call only; the h argument is not used. */
+    if(!done){
+        done = 1;
+        init_vlc(&h261_mba_vlc, H261_MBA_VLC_BITS, 35,
+                 h261_mba_bits, 1, 1,
+                 h261_mba_code, 1, 1, 1);
+        init_vlc(&h261_mtype_vlc, H261_MTYPE_VLC_BITS, 10,
+                 h261_mtype_bits, 1, 1,
+                 h261_mtype_code, 1, 1, 1);
+        init_vlc(&h261_mv_vlc, H261_MV_VLC_BITS, 17,
+                 &h261_mv_tab[0][1], 2, 1,
+                 &h261_mv_tab[0][0], 2, 1, 1);
+        init_vlc(&h261_cbp_vlc, H261_CBP_VLC_BITS, 63,
+                 &h261_cbp_tab[0][1], 2, 1,
+                 &h261_cbp_tab[0][0], 2, 1, 1);
+        init_rl(&h261_rl_tcoeff, 1);
+        init_vlc_rl(&h261_rl_tcoeff, 1);
+    }
+}
+
+static int h261_decode_init(AVCodecContext *avctx){
+    H261Context *h= avctx->priv_data;
+    MpegEncContext * const s = &h->s;
+    /* Decoder init: fills the embedded MpegEncContext and builds the VLC tables; always returns 0. */
+    // set defaults
+    MPV_decode_defaults(s);
+    s->avctx = avctx;
+
+    s->width  = s->avctx->coded_width;
+    s->height = s->avctx->coded_height;
+    s->codec_id = s->avctx->codec->id;
+
+    s->out_format = FMT_H261;
+    s->low_delay= 1;
+    avctx->pix_fmt= PIX_FMT_YUV420P;
+
+    s->codec_id= avctx->codec->id; // same value as assigned above (s->avctx == avctx)
+
+    h261_decode_init_vlc(h);
+
+    h->gob_start_code_skipped = 0;
+    
+    return 0;
+}
+
+/**
+ * decodes the group of blocks header or slice header.
+ * @return <0 if an error occurred (no start code, invalid GN or GQUANT of 0), 0 otherwise
+ */
+static int h261_decode_gob_header(H261Context *h){
+    unsigned int val;
+    MpegEncContext * const s = &h->s;
+    
+    if ( !h->gob_start_code_skipped ){
+        /* Check for GOB Start Code */
+        val = show_bits(&s->gb, 15);
+        if(val)
+            return -1;
+
+        /* We have a GBSC */
+        skip_bits(&s->gb, 16); /* 15 zero bits were checked, the 16th bit is skipped unchecked */
+    }
+
+    h->gob_start_code_skipped = 0;
+
+    h->gob_number = get_bits(&s->gb, 4); /* GN */
+    s->qscale = get_bits(&s->gb, 5); /* GQUANT */
+
+    /* Check if gob_number is valid */
+    if (s->mb_height==18){ //cif
+        if ((h->gob_number<=0) || (h->gob_number>12))
+            return -1;
+    }
+    else{ //qcif
+        if ((h->gob_number!=1) && (h->gob_number!=3) && (h->gob_number!=5))
+            return -1;
+    }
+
+    /* GEI */
+    while (get_bits1(&s->gb) != 0) {
+        skip_bits(&s->gb, 8); /* each set GEI bit is followed by 8 bits that are discarded */
+    }
+
+    if(s->qscale==0)
+        return -1;
+
+    // For the first transmitted macroblock in a GOB, MBA is the absolute address. For
+    // subsequent macroblocks, MBA is the difference between the absolute addresses of
+    // the macroblock and the last transmitted macroblock.
+    h->current_mba = 0;
+    h->mba_diff = 0;
+
+    return 0;
+}
+
+/**
+ * decodes the group of blocks / video packet header.
+ * @return <0 if no resync found
+ */
+static int ff_h261_resync(H261Context *h){
+    MpegEncContext * const s = &h->s;
+    int left, ret;
+
+    if ( h->gob_start_code_skipped ){
+        /* the start code was already consumed: only the rest of the header is read */
+        ret= h261_decode_gob_header(h);
+        if(ret>=0)
+            return 0;
+    }
+    else{
+        if(show_bits(&s->gb, 15)==0){
+            ret= h261_decode_gob_header(h);
+            if(ret>=0)
+                return 0;
+        }
+        //ok, it's not where it's supposed to be: search bytewise from the last resync point
+        s->gb= s->last_resync_gb;
+        align_get_bits(&s->gb);
+        left= s->gb.size_in_bits - get_bits_count(&s->gb);
+
+        for(;left>15+1+4+5; left-=8){ /* needs more than start code (16) + GN (4) + GQUANT (5) bits */
+            if(show_bits(&s->gb, 15)==0){
+                GetBitContext bak= s->gb;
+
+                ret= h261_decode_gob_header(h);
+                if(ret>=0)
+                    return 0;
+
+                s->gb= bak; /* header was invalid: restore the reader and keep searching */
+            }
+            skip_bits(&s->gb, 8);
+        }
+    }
+
+    return -1;
+}
+
+/**
+ * decodes skipped macroblocks: GOB addresses mba1 .. mba2-1 (0 based) get a zero vector and no coefficients
+ * @return 0
+ */
+static int h261_decode_mb_skipped(H261Context *h, int mba1, int mba2 )
+{
+    MpegEncContext * const s = &h->s;
+    int i;
+    
+    s->mb_intra = 0;
+
+    for(i=mba1; i<mba2; i++){
+        int j, xy;
+
+        s->mb_x= ((h->gob_number-1) % 2) * 11 + i % 11; /* GOBs are 11 macroblocks wide, two per row */
+        s->mb_y= ((h->gob_number-1) / 2) * 3 + i / 11;  /* and 3 macroblocks high */
+        xy = s->mb_x + s->mb_y * s->mb_stride;
+        ff_init_block_index(s);
+        ff_update_block_index(s);
+        s->dsp.clear_blocks(s->block[0]);
+
+        for(j=0;j<6;j++)
+            s->block_last_index[j] = -1; /* no coefficients in any block */
+
+        s->mv_dir = MV_DIR_FORWARD;
+        s->mv_type = MV_TYPE_16X16;
+        s->current_picture.mb_type[xy]= MB_TYPE_SKIP | MB_TYPE_16x16 | MB_TYPE_L0;
+        s->mv[0][0][0] = 0;
+        s->mv[0][0][1] = 0;
+        s->mb_skiped = 1;
+        h->mtype &= ~MB_TYPE_H261_FIL; /* clears the flag tested by ff_h261_loop_filter */
+
+        MPV_decode_mb(s, s->block);
+    }
+
+    return 0;
+}
+
+static int decode_mv_component(GetBitContext *gb, int v){
+    /* Reads one motion vector difference and returns the updated component. */
+    const int vlc_index = get_vlc2(gb, h261_mv_vlc.table, H261_MV_VLC_BITS, 2);
+    int delta;
+
+    if ( vlc_index < 0 )    /* no valid code: the component is returned unchanged */
+        return v;
+
+    delta = mvmap[vlc_index];
+    if ( delta != 0 && get_bits1(gb) == 0 )  /* non-zero values are followed by one bit; 0 negates */
+        delta = -delta;
+
+    v += delta;
+    /* sums at or beyond +-16 are shifted by 32 */
+    if ( v <= -16 ) return v + 32;
+    if ( v >=  16 ) return v - 32;
+    return v;
+}
+
+static int h261_decode_mb(H261Context *h){
+    /* Decodes one macroblock of the current GOB. Returns SLICE_OK, SLICE_END when the
+     * GOB ends (next start code or end of data), or SLICE_ERROR on invalid data. */
+    MpegEncContext * const s = &h->s;
+    int i, xy, cbp = 63; // 63: all six blocks coded, kept when no CBP is transmitted
+
+    // Read mba
+    do{
+        h->mba_diff = get_vlc2(&s->gb, h261_mba_vlc.table, H261_MBA_VLC_BITS, 2);
+
+        /* Check for slice end */
+        /* NOTE: GOB can be empty (no MB data) or exist only of MBA_stuffing */
+        if (h->mba_diff == MBA_STARTCODE){ // start code
+            h->gob_start_code_skipped = 1;
+            return SLICE_END;
+        }
+    }while( h->mba_diff == MBA_STUFFING ); // stuffing
+
+    if ( h->mba_diff < 0 ){
+        if ( get_bits_count(&s->gb) + 7 >= s->gb.size_in_bits )
+            return SLICE_END;
+
+        av_log(s->avctx, AV_LOG_ERROR, "illegal mba at %d %d\n", s->mb_x, s->mb_y);
+        return SLICE_ERROR;
+    }
+
+    h->mba_diff += 1;
+    h->current_mba += h->mba_diff;
+
+    if ( h->current_mba > MBA_STUFFING )
+        return SLICE_ERROR;
+    
+    s->mb_x= ((h->gob_number-1) % 2) * 11 + ((h->current_mba-1) % 11);
+    s->mb_y= ((h->gob_number-1) / 2) * 3 + ((h->current_mba-1) / 11);
+    xy = s->mb_x + s->mb_y * s->mb_stride;
+    ff_init_block_index(s);
+    ff_update_block_index(s);
+    s->dsp.clear_blocks(s->block[0]);
+
+    // Read mtype; a negative value means no code matched and must never index h261_mtype_map
+    h->mtype = get_vlc2(&s->gb, h261_mtype_vlc.table, H261_MTYPE_VLC_BITS, 2);
+    if ( h->mtype < 0 ){
+        av_log(s->avctx, AV_LOG_ERROR, "illegal mtype at %d %d\n", s->mb_x, s->mb_y);
+        return SLICE_ERROR;
+    }
+    h->mtype = h261_mtype_map[h->mtype];
+
+    // Read mquant
+    if ( IS_QUANT ( h->mtype ) ){
+        ff_set_qscale(s, get_bits(&s->gb, 5));
+    }
+
+    s->mb_intra = IS_INTRA4x4(h->mtype);
+    // Read mv
+    if ( IS_16X16 ( h->mtype ) ){
+        // Motion vector data is included for all MC macroblocks. MVD is obtained from the macroblock vector by subtracting the
+        // vector of the preceding macroblock. For this calculation the vector of the preceding macroblock is regarded as zero in the
+        // following three situations:
+        // 1) evaluating MVD for macroblocks 1, 12 and 23;
+        // 2) evaluating MVD for macroblocks in which MBA does not represent a difference of 1;
+        // 3) MTYPE of the previous macroblock was not MC.
+        if ( ( h->current_mba == 1 ) || ( h->current_mba == 12 ) || ( h->current_mba == 23 ) ||
+             ( h->mba_diff != 1)){
+            h->current_mv_x = 0;
+            h->current_mv_y = 0;
+        }
+
+        h->current_mv_x= decode_mv_component(&s->gb, h->current_mv_x);
+        h->current_mv_y= decode_mv_component(&s->gb, h->current_mv_y);
+    }else{
+        h->current_mv_x = 0;
+        h->current_mv_y = 0;
+    }
+
+    // Read cbp
+    if ( HAS_CBP( h->mtype ) ){
+        cbp = get_vlc2(&s->gb, h261_cbp_vlc.table, H261_CBP_VLC_BITS, 2) + 1;
+    }
+
+    if(s->mb_intra){
+        s->current_picture.mb_type[xy]= MB_TYPE_INTRA;
+        goto intra;
+    }
+
+    //set motion vectors
+    s->mv_dir = MV_DIR_FORWARD;
+    s->mv_type = MV_TYPE_16X16;
+    s->current_picture.mb_type[xy]= MB_TYPE_16x16 | MB_TYPE_L0;
+    s->mv[0][0][0] = h->current_mv_x * 2;//gets divided by 2 in motion compensation
+    s->mv[0][0][1] = h->current_mv_y * 2;
+
+intra:
+    /* decode each block; bit 5 of cbp belongs to the current block, cbp is shifted left after each one */
+    if(s->mb_intra || HAS_CBP(h->mtype)){
+        for (i = 0; i < 6; i++) {
+            if (h261_decode_block(h, s->block[i], i, cbp&32) < 0)
+                return SLICE_ERROR;
+            cbp+=cbp;
+        }
+    }
+    MPV_decode_mb(s, s->block);
+
+    return SLICE_OK;
+}
+
+/**
+ * decodes one 8x8 block (n is the block index, coded is non-zero when coefficients follow)
+ * @return <0 if an error occurred
+ */
+static int h261_decode_block(H261Context * h, DCTELEM * block,
+                             int n, int coded)
+{
+    MpegEncContext * const s = &h->s;
+    int code, level, i, j, run;
+    RLTable *rl = &h261_rl_tcoeff;
+    const uint8_t *scan_table;
+    
+    // For the variable length encoding there are two code tables, one being used for
+    // the first transmitted LEVEL in INTER, INTER+MC and INTER+MC+FIL blocks, the second
+    // for all other LEVELs except the first one in INTRA blocks which is fixed length
+    // coded with 8 bits.
+    // NOTE: the two code tables only differ in one VLC so we handle that manually.
+    scan_table = s->intra_scantable.permutated;
+    if (s->mb_intra){
+        /* DC coef */
+        level = get_bits(&s->gb, 8);
+        // 0 (00000000b) and -128 (10000000b) are FORBIDDEN
+        if((level&0x7F) == 0){
+            av_log(s->avctx, AV_LOG_ERROR, "illegal dc %d at %d %d\n", level, s->mb_x, s->mb_y);
+            return -1;
+        }
+        // The code 1000 0000 is not used, the reconstruction level of 1024 being coded as 1111 1111.
+        if (level == 255)
+            level = 128;
+        block[0] = level;
+        i = 1;
+    }else if(coded){
+        // Run  Level   Code
+        // EOB                  Not possible for first level when cbp is available (that's why the table is different)
+        // 0    1               1s
+        // *    *               0*
+        int check = show_bits(&s->gb, 2);
+        i = 0;
+        if ( check & 0x2 ){
+            skip_bits(&s->gb, 2);
+            block[0] = ( check & 0x1 ) ? -1 : 1; // second bit is the sign: 1 means negative
+            i = 1;
+        }
+    }else{
+        i = 0;
+    }
+    if(!coded){
+        s->block_last_index[n] = i - 1; // block without AC data: 0 for intra (DC only), -1 otherwise
+        return 0;
+    }
+    for(;;){
+        code = get_vlc2(&s->gb, rl->vlc.table, TCOEFF_VLC_BITS, 2);
+        if (code < 0){
+            av_log(s->avctx, AV_LOG_ERROR, "illegal ac vlc code at %dx%d\n", s->mb_x, s->mb_y);
+            return -1;
+        }
+        if (code == rl->n) {
+            /* escape */
+            // The remaining combinations of (run, level) are encoded with a 20-bit word consisting of 6 bits escape, 6 bits run and 8 bits level.
+            run = get_bits(&s->gb, 6);
+            level = get_sbits(&s->gb, 8);
+        }else if(code == 0){
+            break; // index 0 is the end of block code
+        }else{
+            run = rl->table_run[code];
+            level = rl->table_level[code];
+            if (get_bits1(&s->gb))
+                level = -level;
+        }
+        i += run;
+        if (i >= 64){
+            av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d\n", s->mb_x, s->mb_y);
+            return -1;
+        }
+        j = scan_table[i];
+        block[j] = level;
+        i++;
+    }
+    s->block_last_index[n] = i-1;
+    return 0;
+}
+
+/**
+ * decodes the H261 picture header and stores picture_number, width/height, mb_width/mb_height, mb_num and pict_type in the context.
+ * @return <0 if no startcode found
+ */
+int h261_decode_picture_header(H261Context *h){
+    MpegEncContext * const s = &h->s;
+    int format, i;
+    uint32_t startcode= 0;
+
+    for(i= s->gb.size_in_bits - get_bits_count(&s->gb); i>24; i-=1){ // scan bit by bit while more than 24 bits remain
+        startcode = ((startcode << 1) | get_bits(&s->gb, 1)) & 0x000FFFFF; // keep only the last 20 bits read
+
+        if(startcode == 0x10)
+            break;
+    }
+
+    if (startcode != 0x10){
+        av_log(s->avctx, AV_LOG_ERROR, "Bad picture start code\n");
+        return -1;
+    }
+
+    /* temporal reference */
+    s->picture_number = get_bits(&s->gb, 5); /* picture timestamp */
+
+    /* PTYPE starts here */
+    skip_bits1(&s->gb); /* split screen off */
+    skip_bits1(&s->gb); /* camera  off */
+    skip_bits1(&s->gb); /* freeze picture release off */
+
+    format = get_bits1(&s->gb);
+
+    //only 2 formats possible
+    if (format == 0){//QCIF
+        s->width = 176;
+        s->height = 144;
+        s->mb_width = 11;
+        s->mb_height = 9;
+    }else{//CIF
+        s->width = 352;
+        s->height = 288;
+        s->mb_width = 22;
+        s->mb_height = 18;
+    }
+
+    s->mb_num = s->mb_width * s->mb_height;
+
+    skip_bits1(&s->gb); /* still image mode off */
+    skip_bits1(&s->gb); /* Reserved */
+
+    /* PEI */
+    while (get_bits1(&s->gb) != 0){ // each set PEI bit is followed by 8 bits that are skipped
+        skip_bits(&s->gb, 8);
+    }
+
+    // h261 has no I-FRAMES, but if we pass I_TYPE for the first frame, the codec crashes if it does 
+    // not contain all I-blocks (e.g. when a packet is lost)
+    s->pict_type = P_TYPE;
+
+    h->gob_number = 0; // reset for the GOB loop of the new picture
+    return 0;
+}
+
+static int h261_decode_gob(H261Context *h){ // decodes the macroblocks of one GOB; returns 0 once SLICE_END is seen, -1 otherwise
+    MpegEncContext * const s = &h->s;
+    
+    ff_set_qscale(s, s->qscale);
+
+    /* decode mb's */
+    while(h->current_mba <= MBA_STUFFING)
+    {
+        int ret;
+        /* decode the next macroblock */
+        ret= h261_decode_mb(h);
+        if(ret<0){
+            if(ret==SLICE_END){
+                h261_decode_mb_skipped(h, h->current_mba, 33); // NOTE(review): presumably treats the rest of the GOB (up to MBA 33) as skipped - confirm
+                return 0;
+            }
+            av_log(s->avctx, AV_LOG_ERROR, "Error at MB: %d\n", s->mb_x + s->mb_y*s->mb_stride);
+            return -1;
+        }
+        
+        h261_decode_mb_skipped(h, h->current_mba-h->mba_diff, h->current_mba-1); // presumably the MBs jumped over by the last address difference - confirm
+    }
+    
+    return -1; // loop left without seeing SLICE_END
+}
+
+static int h261_find_frame_end(ParseContext *pc, AVCodecContext* avctx, const uint8_t *buf, int buf_size){ // returns a frame end position in buf, or END_NOT_FOUND
+    int vop_found, i, j;
+    uint32_t state;
+
+    vop_found= pc->frame_start_found;
+    state= pc->state; // most recent bytes seen, carried over between calls
+   
+    for(i=0; i<buf_size && !vop_found; i++){ // phase 1: look for the start code that opens the frame
+        state= (state<<8) | buf[i];
+        for(j=0; j<8; j++){ // test each of the 8 bit alignments
+            if(((state>>j)&0xFFFFF) == 0x00010){ // 20-bit start code pattern
+                i++; // NOTE(review): with the loop's own i++ this advances i by two, so one byte never enters state - confirm intended
+                vop_found=1;
+                break;
+            }
+        }
+    }
+    if(vop_found){ // phase 2: the next start code marks the end of the current frame
+        for(; i<buf_size; i++){
+            state= (state<<8) | buf[i];
+            for(j=0; j<8; j++){
+                if(((state>>j)&0xFFFFF) == 0x00010){
+                    pc->frame_start_found=0;
+                    pc->state= state>>(2*8); // NOTE(review): drops the two newest bytes from the saved state - presumably those belong to the next frame; confirm
+                    return i-1;
+                }
+            }
+        }
+    }
+
+    pc->frame_start_found= vop_found; // remember progress for the next call
+    pc->state= state;
+    return END_NOT_FOUND;
+}
+
+static int h261_parse(AVCodecParserContext *s,
+                      AVCodecContext *avctx,
+                      uint8_t **poutbuf, int *poutbuf_size, 
+                      const uint8_t *buf, int buf_size)
+{ // parser callback: hands a buffer back through poutbuf/poutbuf_size unless ff_combine_frame reports <0
+    ParseContext *pc = s->priv_data;
+    int next;
+    
+    next= h261_find_frame_end(pc,avctx, buf, buf_size); // frame end position or END_NOT_FOUND
+    if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
+        *poutbuf = NULL; // no output for this call
+        *poutbuf_size = 0;
+        return buf_size; // the whole input is reported as consumed
+    }
+    *poutbuf = (uint8_t *)buf; // buf and buf_size were passed by address and may have been updated by ff_combine_frame
+    *poutbuf_size = buf_size;
+    return next;
+}
+
+/**
+ * returns the number of bytes consumed for building the current frame (at least 1, at most buf_size when called with buf_size >= 1)
+ */
+static int get_consumed_bytes(MpegEncContext *s, int buf_size){
+    int pos= get_bits_count(&s->gb)>>3; // bit position converted to whole bytes
+    if(pos==0) pos=1; //avoid infinite loops (probably not needed, kept as a safeguard)
+    if(pos+10>buf_size) pos=buf_size; // fewer than 10 bytes would be left: report the whole buffer as consumed
+
+    return pos;
+}
+
+static int h261_decode_frame(AVCodecContext *avctx,
+                             void *data, int *data_size,
+                             uint8_t *buf, int buf_size)
+{ // decodes one picture from buf into *data (an AVFrame); returns the bytes consumed, or -1 on error
+    H261Context *h= avctx->priv_data;
+    MpegEncContext *s = &h->s;
+    int ret;
+    AVFrame *pict = data;
+
+#ifdef DEBUG
+    printf("*****frame %d size=%d\n", avctx->frame_number, buf_size);
+    printf("bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]);
+#endif
+    s->flags= avctx->flags;
+    s->flags2= avctx->flags2;
+
+    h->gob_start_code_skipped=0;
+
+retry: // re-entered after the context was torn down because of a size change
+
+    init_get_bits(&s->gb, buf, buf_size*8);
+
+    if(!s->context_initialized){
+        if (MPV_common_init(s) < 0) //we need the idct permutation for reading a custom matrix
+            return -1;
+    }
+
+    //we need to set current_picture_ptr before reading the header, otherwise we can't store anything in there
+    if(s->current_picture_ptr==NULL || s->current_picture_ptr->data[0]){
+        int i= ff_find_unused_picture(s, 0);
+        s->current_picture_ptr= &s->picture[i];
+    }
+
+    ret = h261_decode_picture_header(h);
+
+    /* skip if the header was damaged */
+    if (ret < 0){
+        av_log(s->avctx, AV_LOG_ERROR, "header damaged\n");
+        return -1;
+    }
+
+    if (s->width != avctx->coded_width || s->height != avctx->coded_height){
+        ParseContext pc= s->parse_context; //FIXME move this demuxing hack to avformat
+        s->parse_context.buffer=0; // detach the buffer pointer; the saved copy is restored right after MPV_common_end
+        MPV_common_end(s);
+        s->parse_context= pc;
+    }
+    if (!s->context_initialized) {
+        avcodec_set_dimensions(avctx, s->width, s->height);
+
+        goto retry;
+    }
+
+    // for hurry_up==5
+    s->current_picture.pict_type= s->pict_type;
+    s->current_picture.key_frame= s->pict_type == I_TYPE;
+
+    /* skip everything if we are in a hurry>=5 */
+    if(avctx->hurry_up>=5) return get_consumed_bytes(s, buf_size);
+
+    if(MPV_frame_start(s, avctx) < 0)
+        return -1;
+
+    ff_er_frame_start(s);
+
+    /* decode each macroblock */
+    s->mb_x=0;
+    s->mb_y=0;
+
+    while(h->gob_number < (s->mb_height==18 ? 12 : 5)){ // limit is 12 when mb_height is 18 (CIF), 5 otherwise
+        if(ff_h261_resync(h)<0)
+            break;
+        h261_decode_gob(h); // return value ignored: a failed GOB does not abort the picture
+    }
+    MPV_frame_end(s);
+
+assert(s->current_picture.pict_type == s->current_picture_ptr->pict_type);
+assert(s->current_picture.pict_type == s->pict_type);
+    *pict= *(AVFrame*)s->current_picture_ptr;
+    ff_print_debug_info(s, pict);
+
+    /* Return the Picture timestamp as the frame number */
+    /* we subtract 1 because it is added in utils.c    */
+    avctx->frame_number = s->picture_number - 1;
+
+    *data_size = sizeof(AVFrame);
+
+    return get_consumed_bytes(s, buf_size);
+}
+
+static int h261_decode_end(AVCodecContext *avctx)
+{ // decoder close callback: calls MPV_common_end on the embedded MpegEncContext and always returns 0
+    H261Context *h= avctx->priv_data;
+    MpegEncContext *s = &h->s;
+
+    MPV_common_end(s);
+    return 0;
+}
+
+#ifdef CONFIG_ENCODERS
+AVCodec h261_encoder = { // H.261 encoder entry, only built with CONFIG_ENCODERS; uses the generic MPV_encode_* functions
+    "h261",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_H261,
+    sizeof(H261Context),
+    MPV_encode_init,
+    MPV_encode_picture,
+    MPV_encode_end,
+};
+#endif
+
+AVCodec h261_decoder = { // H.261 decoder entry
+    "h261",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_H261,
+    sizeof(H261Context),
+    h261_decode_init,
+    NULL, // slot filled by MPV_encode_picture in h261_encoder; unused for the decoder
+    h261_decode_end,
+    h261_decode_frame,
+    CODEC_CAP_DR1,
+};
+
+AVCodecParser h261_parser = { // parser entry for CODEC_ID_H261, built on the generic ParseContext
+    { CODEC_ID_H261 },
+    sizeof(ParseContext),
+    NULL, // NOTE(review): presumably the optional init callback - confirm against AVCodecParser
+    h261_parse,
+    ff_parse_close,
+};
diff --git a/src/libffmpeg/libavcodec/h261data.h b/src/libffmpeg/libavcodec/h261data.h
new file mode 100755
index 000000000..9ea991b23
--- /dev/null
+++ b/src/libffmpeg/libavcodec/h261data.h
@@ -0,0 +1,136 @@
+/**
+ * @file h261data.h
+ * H.261 tables.
+ */
+#define MB_TYPE_H261_FIL 0x800000
+
+// H.261 VLC table for macroblock addressing: code words for 33 addresses (indices 0-32), then MBA stuffing and the start code
+const uint8_t h261_mba_code[35] = {
+     1,  3,  2,  3,
+     2,  3,  2,  7,
+     6, 11, 10,  9,
+     8,  7,  6, 23,
+    22, 21, 20, 19,
+    18, 35, 34, 33,
+    32, 31, 30, 29,
+    28, 27, 26, 25,
+    24,
+    15,           //(MBA stuffing)
+    1             //(start code)
+};
+
+const uint8_t h261_mba_bits[35] = { // code lengths in bits, same index order as h261_mba_code
+     1,  3,  3,  4,
+     4,  5,  5,  7,
+     7,  8,  8,  8,
+     8,  8,  8, 10,
+    10, 10, 10, 10,
+    10, 11, 11, 11,
+    11, 11, 11, 11,
+    11, 11, 11, 11,
+    11,
+    11,           //(MBA stuffing)
+    16            //(start code)
+};
+
+//H.261 VLC table for macroblock type: every code word has the value 1, so only its length differs per type
+const uint8_t h261_mtype_code[10] = {
+    1,  1,  1,  1,
+    1,  1,  1,  1,
+    1,  1
+};
+
+const uint8_t h261_mtype_bits[10] = { // code lengths in bits, same index order as h261_mtype_code
+    4,  7,  1,  5,
+    9,  8, 10,  3,
+    2,  6
+};
+
+static const int h261_mtype_map[10]= { // MB_TYPE_* flag combination per macroblock type index (presumably the same index as the two tables above)
+        MB_TYPE_INTRA4x4,
+        MB_TYPE_INTRA4x4  |  MB_TYPE_QUANT,
+                                               MB_TYPE_CBP,
+                             MB_TYPE_QUANT  |  MB_TYPE_CBP,
+                                                               MB_TYPE_16x16,
+                                               MB_TYPE_CBP  |  MB_TYPE_16x16,
+                             MB_TYPE_QUANT  |  MB_TYPE_CBP  |  MB_TYPE_16x16,
+                                                               MB_TYPE_16x16  |  MB_TYPE_H261_FIL,
+                                               MB_TYPE_CBP  |  MB_TYPE_16x16  |  MB_TYPE_H261_FIL,
+                             MB_TYPE_QUANT  |  MB_TYPE_CBP  |  MB_TYPE_16x16  |  MB_TYPE_H261_FIL
+};
+
+//H.261 VLC table for motion vectors: 17 pairs, presumably {code, length in bits} - confirm against the init_vlc call
+const uint8_t h261_mv_tab[17][2] = {
+    {1,1}, {1,2}, {1,3}, {1,4}, {3,6}, {5,7}, {4,7}, {3,7},
+    {11,9}, {10,9}, {9,9}, {17,10}, {16,10}, {15,10}, {14,10}, {13,10}, {12,10}
+};
+
+static const int mvmap[17] = // maps index k to the value -k
+{
+    0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16
+};
+
+//H.261 VLC table for coded block pattern: 63 pairs, presumably {code, length in bits} for cbp values 1..63 - confirm
+const uint8_t h261_cbp_tab[63][2] =
+{
+    {11,5}, {9,5}, {13,6}, {13,4}, {23,7}, {19,7}, {31,8}, {12,4},
+    {22,7}, {18,7}, {30,8}, {19,5}, {27,8}, {23,8}, {19,8}, {11,4},
+    {21,7}, {17,7}, {29,8}, {17,5}, {25,8}, {21,8}, {17,8}, {15,6},
+    {15,8}, {13,8}, {3,9}, {15,5}, {11,8}, {7,8}, {7,9}, {10,4},
+    {20,7}, {16,7}, {28,8}, {14,6}, {14,8}, {12,8}, {2,9}, {16,5},
+    {24,8}, {20,8}, {16,8}, {14,5}, {10,8}, {6,8}, {6,9}, {18,5},
+    {26,8}, {22,8}, {18,8}, {13,5}, {9,8}, {5,8}, {5,9}, {12,5},
+    {8,8}, {4,8}, {4,9}, {7,3}, {10,5}, {8,5}, {12,6}
+};
+
+//H.261 VLC table for transform coefficients: 64 entries plus the escape code at index 64; pairs are presumably {code, length in bits}
+const uint16_t h261_tcoeff_vlc[65][2] = {
+{ 0x2, 2 }, { 0x3, 2 },{ 0x4, 4 },{ 0x5, 5 },
+{ 0x6, 7 },{ 0x26, 8 },{ 0x21, 8 },{ 0xa, 10 },
+{ 0x1d, 12 },{ 0x18, 12 },{ 0x13, 12 },{ 0x10 , 12 },
+{ 0x1a, 13},{ 0x19, 13 }, { 0x18, 13 }, { 0x17, 13 },
+{ 0x3, 3 }, { 0x6, 6 }, { 0x25 , 8 }, { 0xc, 10 },
+{ 0x1b, 12 }, { 0x16, 13 }, { 0x15, 13 }, { 0x5, 4},
+{ 0x4, 7}, { 0xb, 10 }, { 0x14, 12 }, { 0x14, 13 },
+{ 0x7, 5 }, { 0x24, 8 }, { 0x1c, 12 }, { 0x13, 13 },
+{ 0x6, 5 }, { 0xf, 10 }, { 0x12, 12}, { 0x7, 6},
+{ 0x9 , 10 }, { 0x12, 13 }, { 0x5, 6 }, { 0x1e, 12 },
+{ 0x4, 6 }, { 0x15, 12 }, { 0x7, 7 }, { 0x11, 12},
+{ 0x5, 7 }, { 0x11, 13 }, { 0x27, 8 }, { 0x10, 13 },
+{ 0x23, 8 }, { 0x22, 8 }, { 0x20, 8 }, { 0xe , 10 },
+{ 0xd, 10 }, { 0x8, 10 },{ 0x1f, 12 }, { 0x1a, 12 },
+{ 0x19, 12 }, { 0x17, 12 }, { 0x16, 12}, { 0x1f, 13},
+{ 0x1e, 13 }, { 0x1d, 13 }, { 0x1c, 13}, { 0x1b, 13},
+{ 0x1, 6 }                                             //escape
+};
+
+const int8_t h261_tcoeff_level[64] = { // level (magnitude) per coefficient VLC index; index 0 holds level 0
+    0,  1,  2,  3,  4,  5,  6,  7,
+    8,  9, 10, 11, 12, 13, 14, 15,
+    1,  2,  3,  4,  5,  6,  7,  1,
+    2,  3,  4,  5,  1,  2,  3,  4,
+    1,  2,  3,  1,  2,  3,  1,  2,
+    1,  2,  1,  2,  1,  2,  1,  2,
+    1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1
+};
+
+const int8_t h261_tcoeff_run[64] = { // zero-run length per coefficient VLC index, same index order as h261_tcoeff_level
+    0,
+    0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  1,
+    1,  1,  1,  1,  1,  1,  2,  2,
+    2,  2,  2,  3,  3,  3,  3,  4,
+    4,  4,  5,  5,  5,  6,  6,  7,
+    7,  8,  8,  9,  9, 10, 10, 11,
+   12, 13, 14, 15, 16, 17, 18, 19,
+   20, 21, 22, 23, 24, 25, 26
+};
+
+static RLTable h261_rl_tcoeff = { // run/level table bundling the three coefficient tables
+    64, // presumably RLTable.n: the escape entry sits at index 64 of h261_tcoeff_vlc - confirm field order
+    64, // NOTE(review): presumably RLTable.last - confirm field order
+    h261_tcoeff_vlc,
+    h261_tcoeff_run,
+    h261_tcoeff_level,
+};
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index dd14a7bef..81c3648f1 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -57,20 +57,27 @@
 static void h263_encode_block(MpegEncContext * s, DCTELEM * block,
                               int n);
 static void h263p_encode_umotion(MpegEncContext * s, int val);
+static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block,
+                               int n, int dc, uint8_t *scan_table, 
+                               PutBitContext *dc_pb, PutBitContext *ac_pb);
 #endif
 
 static int h263_decode_motion(MpegEncContext * s, int pred, int fcode);
 static int h263p_decode_umotion(MpegEncContext * s, int pred);
 static int h263_decode_block(MpegEncContext * s, DCTELEM * block,
                              int n, int coded);
+static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr);
 static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                               int n, int coded, int intra, int rvlc);
+static int mpeg4_get_block_length(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
+                               uint8_t *scan_table);
 static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr);
 #ifdef CONFIG_ENCODERS
 static void mpeg4_encode_visual_object_header(MpegEncContext * s);
 static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_number);
 #endif //CONFIG_ENCODERS
 static void mpeg4_decode_sprite_trajectory(MpegEncContext * s, GetBitContext *gb);
+static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, int level, int *dir_ptr, int encoding);
 
 #ifdef CONFIG_ENCODERS
 static uint8_t uni_DCtab_lum_len[512];
@@ -86,6 +93,8 @@ static uint32_t uni_mpeg4_intra_rl_bits[64*64*2*2];
 static uint8_t  uni_mpeg4_intra_rl_len [64*64*2*2];
 static uint32_t uni_mpeg4_inter_rl_bits[64*64*2*2];
 static uint8_t  uni_mpeg4_inter_rl_len [64*64*2*2];
+static uint8_t  uni_h263_intra_aic_rl_len [64*64*2*2];
+static uint8_t  uni_h263_inter_rl_len [64*64*2*2];
 //#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128 + (run)*256 + (level))
 //#define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128*64 + (run) + (level)*64)
 #define UNI_MPEG4_ENC_INDEX(last,run,level) ((last)*128*64 + (run)*128 + (level))
@@ -656,87 +665,36 @@ void ff_h263_update_motion_val(MpegEncContext * s){
     }
 }
 
-/**
- * predicts the dc.
- * encoding quantized level -> quantized diff
- * decoding quantized diff -> quantized level  
- * @param n block index (0-3 are luma, 4-5 are chroma)
- * @param dir_ptr pointer to an integer where the prediction direction will be stored
- */
-static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, int level, int *dir_ptr, int encoding)
-{
-    int a, b, c, wrap, pred, scale, ret;
-    uint16_t *dc_val;
-
-    /* find prediction */
-    if (n < 4) {
-	scale = s->y_dc_scale;
-    } else {
-	scale = s->c_dc_scale;
-    }
-    if(IS_3IV1)
-        scale= 8;
-
-    wrap= s->block_wrap[n];
-    dc_val = s->dc_val[0] + s->block_index[n];
+#ifdef CONFIG_ENCODERS
 
-    /* B C
-     * A X 
-     */
-    a = dc_val[ - 1];
-    b = dc_val[ - 1 - wrap];
-    c = dc_val[ - wrap];
+static inline int h263_get_motion_length(MpegEncContext * s, int val, int f_code){
+    int l, bit_size, code;
 
-    /* outside slice handling (we cant do that by memset as we need the dc for error resilience) */
-    if(s->first_slice_line && n!=3){
-        if(n!=2) b=c= 1024;
-        if(n!=1 && s->mb_x == s->resync_mb_x) b=a= 1024;
-    }
-    if(s->mb_x == s->resync_mb_x && s->mb_y == s->resync_mb_y+1){
-        if(n==0 || n==4 || n==5)
-            b=1024;
-    }
-
-    if (abs(a - b) < abs(b - c)) {
-	pred = c;
-        *dir_ptr = 1; /* top */
+    if (val == 0) {
+        return mvtab[0][1];
     } else {
-	pred = a;
-        *dir_ptr = 0; /* left */
+        bit_size = f_code - 1;
+        /* modulo encoding */
+        l= INT_BIT - 6 - bit_size;
+        val = (val<<l)>>l;
+        val--;
+        code = (val >> bit_size) + 1;
+
+        return mvtab[code][1] + 1 + bit_size;
     }
-    /* we assume pred is positive */
-    pred = FASTDIV((pred + (scale >> 1)), scale);
+}
 
-    if(encoding){
-        ret = level - pred;
+static inline void ff_h263_encode_motion_vector(MpegEncContext * s, int x, int y, int f_code){
+    if(s->flags2 & CODEC_FLAG2_NO_OUTPUT){
+        skip_put_bits(&s->pb, 
+            h263_get_motion_length(s, x, f_code)
+           +h263_get_motion_length(s, y, f_code));
     }else{
-        level += pred;
-        ret= level;
-        if(s->error_resilience>=3){
-            if(level<0){
-                av_log(s->avctx, AV_LOG_ERROR, "dc<0 at %dx%d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-            if(level*scale > 2048 + scale){
-                av_log(s->avctx, AV_LOG_ERROR, "dc overflow at %dx%d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-        }
-    }
-    level *=scale;
-    if(level&(~2047)){
-        if(level<0) 
-            level=0;
-        else if(!(s->workaround_bugs&FF_BUG_DC_CLIP))
-            level=2047;
+        ff_h263_encode_motion(s, x, f_code);
+        ff_h263_encode_motion(s, y, f_code);
     }
-    dc_val[0]= level;
-
-    return ret;
 }
 
-#ifdef CONFIG_ENCODERS
-
 static inline int get_p_cbp(MpegEncContext * s,
                       DCTELEM block[6][64],
                       int motion_x, int motion_y){
@@ -836,169 +794,33 @@ static inline int get_b_cbp(MpegEncContext * s, DCTELEM block[6][64],
     return cbp;
 }
 
-/**
- * encodes the dc value.
- * @param n block index (0-3 are luma, 4-5 are chroma)
- */
-static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n)
-{
-#if 1
-//    if(level<-255 || level>255) printf("dc overflow\n");
-    level+=256;
-    if (n < 4) {
-	/* luminance */
-	put_bits(s, uni_DCtab_lum_len[level], uni_DCtab_lum_bits[level]);
-    } else {
-	/* chrominance */
-	put_bits(s, uni_DCtab_chrom_len[level], uni_DCtab_chrom_bits[level]);
-    }
-#else
-    int size, v;
-    /* find number of bits */
-    size = 0;
-    v = abs(level);
-    while (v) {
-	v >>= 1;
-	size++;
-    }
-
-    if (n < 4) {
-	/* luminance */
-	put_bits(&s->pb, DCtab_lum[size][1], DCtab_lum[size][0]);
-    } else {
-	/* chrominance */
-	put_bits(&s->pb, DCtab_chrom[size][1], DCtab_chrom[size][0]);
-    }
-
-    /* encode remaining bits */
-    if (size > 0) {
-	if (level < 0)
-	    level = (-level) ^ ((1 << size) - 1);
-	put_bits(&s->pb, size, level);
-	if (size > 8)
-	    put_bits(&s->pb, 1, 1);
-    }
-#endif
-}
-
-/**
- * encodes a 8x8 block
- * @param n block index (0-3 are luma, 4-5 are chroma)
- */
-static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
-                               uint8_t *scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb)
-{
-    int i, last_non_zero;
-#if 0 //variables for the outcommented version
-    int code, sign, last;
-#endif
-    const RLTable *rl;
-    uint32_t *bits_tab;
-    uint8_t *len_tab;
-    const int last_index = s->block_last_index[n];
-
-    if (s->mb_intra) { //Note gcc (3.2.1 at least) will optimize this away
-	/* mpeg4 based DC predictor */
-	mpeg4_encode_dc(dc_pb, intra_dc, n);
-        if(last_index<1) return;
-	i = 1;
-        rl = &rl_intra;
-        bits_tab= uni_mpeg4_intra_rl_bits;
-        len_tab = uni_mpeg4_intra_rl_len;
-    } else {
-        if(last_index<0) return;
-	i = 0;
-        rl = &rl_inter;
-        bits_tab= uni_mpeg4_inter_rl_bits;
-        len_tab = uni_mpeg4_inter_rl_len;
-    }
-
-    /* AC coefs */
-    last_non_zero = i - 1;
-#if 1
-    for (; i < last_index; i++) {
-	int level = block[ scan_table[i] ];
-	if (level) {
-	    int run = i - last_non_zero - 1;
-            level+=64;
-            if((level&(~127)) == 0){
-                const int index= UNI_MPEG4_ENC_INDEX(0, run, level);
-                put_bits(ac_pb, len_tab[index], bits_tab[index]);
-            }else{ //ESC3
-                put_bits(ac_pb, 7+2+1+6+1+12+1, (3<<23)+(3<<21)+(0<<20)+(run<<14)+(1<<13)+(((level-64)&0xfff)<<1)+1);
+static inline void mpeg4_encode_blocks(MpegEncContext * s, DCTELEM block[6][64], int intra_dc[6], 
+                               uint8_t **scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb){
+    int i;
+    
+    if(scan_table){
+        if(s->flags2 & CODEC_FLAG2_NO_OUTPUT){
+            for (i = 0; i < 6; i++) {
+                skip_put_bits(&s->pb, mpeg4_get_block_length(s, block[i], i, intra_dc[i], scan_table[i]));
+            }
+        }else{
+            /* encode each block */
+            for (i = 0; i < 6; i++) {
+                mpeg4_encode_block(s, block[i], i, intra_dc[i], scan_table[i], dc_pb, ac_pb);
             }
-	    last_non_zero = i;
-	}
-    }
-    /*if(i<=last_index)*/{
-	int level = block[ scan_table[i] ];
-        int run = i - last_non_zero - 1;
-        level+=64;
-        if((level&(~127)) == 0){
-            const int index= UNI_MPEG4_ENC_INDEX(1, run, level);
-            put_bits(ac_pb, len_tab[index], bits_tab[index]);
-        }else{ //ESC3
-            put_bits(ac_pb, 7+2+1+6+1+12+1, (3<<23)+(3<<21)+(1<<20)+(run<<14)+(1<<13)+(((level-64)&0xfff)<<1)+1);
         }
-    }
-#else
-    for (; i <= last_index; i++) {
-	const int slevel = block[ scan_table[i] ];
-	if (slevel) {
-            int level;
-	    int run = i - last_non_zero - 1;
-	    last = (i == last_index);
-	    sign = 0;
-	    level = slevel;
-	    if (level < 0) {
-		sign = 1;
-		level = -level;
-	    }
-            code = get_rl_index(rl, last, run, level);
-            put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-            if (code == rl->n) {
-                int level1, run1;
-                level1 = level - rl->max_level[last][run];
-                if (level1 < 1) 
-                    goto esc2;
-                code = get_rl_index(rl, last, run, level1);
-                if (code == rl->n) {
-                esc2:
-                    put_bits(ac_pb, 1, 1);
-                    if (level > MAX_LEVEL)
-                        goto esc3;
-                    run1 = run - rl->max_run[last][level] - 1;
-                    if (run1 < 0)
-                        goto esc3;
-                    code = get_rl_index(rl, last, run1, level);
-                    if (code == rl->n) {
-                    esc3:
-                        /* third escape */
-                        put_bits(ac_pb, 1, 1);
-                        put_bits(ac_pb, 1, last);
-                        put_bits(ac_pb, 6, run);
-                        put_bits(ac_pb, 1, 1);
-                        put_bits(ac_pb, 12, slevel & 0xfff);
-                        put_bits(ac_pb, 1, 1);
-                    } else {
-                        /* second escape */
-                        put_bits(ac_pb, 1, 0);
-                        put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-                        put_bits(ac_pb, 1, sign);
-                    }
-                } else {
-                    /* first escape */
-                    put_bits(ac_pb, 1, 0);
-                    put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-                    put_bits(ac_pb, 1, sign);
-                }
-            } else {
-                put_bits(ac_pb, 1, sign);
+    }else{
+        if(s->flags2 & CODEC_FLAG2_NO_OUTPUT){
+            for (i = 0; i < 6; i++) {
+                skip_put_bits(&s->pb, mpeg4_get_block_length(s, block[i], i, 0, s->intra_scantable.permutated));
             }
-	    last_non_zero = i;
-	}
+        }else{
+            /* encode each block */
+            for (i = 0; i < 6; i++) {
+                mpeg4_encode_block(s, block[i], i, 0, s->intra_scantable.permutated, dc_pb, ac_pb);
+            }
+        }
     }
-#endif
 }
 
 void mpeg4_encode_mb(MpegEncContext * s,
@@ -1089,23 +911,22 @@ void mpeg4_encode_mb(MpegEncContext * s,
 
             if(mb_type == 0){
                 assert(s->mv_dir & MV_DIRECT);
-                ff_h263_encode_motion(s, motion_x, 1);
-                ff_h263_encode_motion(s, motion_y, 1);                
+                ff_h263_encode_motion_vector(s, motion_x, motion_y, 1);
                 s->b_count++;
                 s->f_count++;
             }else{
                 assert(mb_type > 0 && mb_type < 4);
                 if(s->mv_type != MV_TYPE_FIELD){
                     if(s->mv_dir & MV_DIR_FORWARD){
-                        ff_h263_encode_motion(s, s->mv[0][0][0] - s->last_mv[0][0][0], s->f_code);
-                        ff_h263_encode_motion(s, s->mv[0][0][1] - s->last_mv[0][0][1], s->f_code);
+                        ff_h263_encode_motion_vector(s, s->mv[0][0][0] - s->last_mv[0][0][0],
+                                                        s->mv[0][0][1] - s->last_mv[0][0][1], s->f_code);
                         s->last_mv[0][0][0]= s->last_mv[0][1][0]= s->mv[0][0][0];
                         s->last_mv[0][0][1]= s->last_mv[0][1][1]= s->mv[0][0][1];
                         s->f_count++;
                     }
                     if(s->mv_dir & MV_DIR_BACKWARD){
-                        ff_h263_encode_motion(s, s->mv[1][0][0] - s->last_mv[1][0][0], s->b_code);
-                        ff_h263_encode_motion(s, s->mv[1][0][1] - s->last_mv[1][0][1], s->b_code);
+                        ff_h263_encode_motion_vector(s, s->mv[1][0][0] - s->last_mv[1][0][0],
+                                                        s->mv[1][0][1] - s->last_mv[1][0][1], s->b_code);
                         s->last_mv[1][0][0]= s->last_mv[1][1][0]= s->mv[1][0][0];
                         s->last_mv[1][0][1]= s->last_mv[1][1][1]= s->mv[1][0][1];
                         s->b_count++;
@@ -1121,8 +942,8 @@ void mpeg4_encode_mb(MpegEncContext * s,
                     }
                     if(s->mv_dir & MV_DIR_FORWARD){
                         for(i=0; i<2; i++){
-                            ff_h263_encode_motion(s, s->mv[0][i][0] - s->last_mv[0][i][0]  , s->f_code);
-                            ff_h263_encode_motion(s, s->mv[0][i][1] - s->last_mv[0][i][1]/2, s->f_code);
+                            ff_h263_encode_motion_vector(s, s->mv[0][i][0] - s->last_mv[0][i][0]  ,
+                                                            s->mv[0][i][1] - s->last_mv[0][i][1]/2, s->f_code);
                             s->last_mv[0][i][0]= s->mv[0][i][0];
                             s->last_mv[0][i][1]= s->mv[0][i][1]*2;
                         }
@@ -1130,8 +951,8 @@ void mpeg4_encode_mb(MpegEncContext * s,
                     }
                     if(s->mv_dir & MV_DIR_BACKWARD){
                         for(i=0; i<2; i++){
-                            ff_h263_encode_motion(s, s->mv[1][i][0] - s->last_mv[1][i][0]  , s->b_code);
-                            ff_h263_encode_motion(s, s->mv[1][i][1] - s->last_mv[1][i][1]/2, s->b_code);
+                            ff_h263_encode_motion_vector(s, s->mv[1][i][0] - s->last_mv[1][i][0]  ,
+                                                            s->mv[1][i][1] - s->last_mv[1][i][1]/2, s->b_code);
                             s->last_mv[1][i][0]= s->mv[1][i][0];
                             s->last_mv[1][i][1]= s->mv[1][i][1]*2;
                         }
@@ -1144,10 +965,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 s->mv_bits+= get_bits_diff(s);
             }
 
-            /* encode each block */
-            for (i = 0; i < 6; i++) {
-                mpeg4_encode_block(s, block[i], i, 0, s->intra_scantable.permutated, NULL, &s->pb);
-            }
+            mpeg4_encode_blocks(s, block, NULL, NULL, NULL, &s->pb);
 
             if(interleaved_stats){
                 s->p_tex_bits+= get_bits_diff(s);
@@ -1231,8 +1049,8 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 /* motion vectors: 16x16 mode */
                 h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             
-                ff_h263_encode_motion(s, motion_x - pred_x, s->f_code);
-                ff_h263_encode_motion(s, motion_y - pred_y, s->f_code);
+                ff_h263_encode_motion_vector(s, motion_x - pred_x,
+                                                motion_y - pred_y, s->f_code);
             }else if(s->mv_type==MV_TYPE_FIELD){
                 if(s->dquant) cbpc+= 8;
                 put_bits(&s->pb,
@@ -1259,10 +1077,10 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 put_bits(&s->pb, 1, s->field_select[0][0]);
                 put_bits(&s->pb, 1, s->field_select[0][1]);
             
-                ff_h263_encode_motion(s, s->mv[0][0][0] - pred_x, s->f_code);
-                ff_h263_encode_motion(s, s->mv[0][0][1] - pred_y, s->f_code);
-                ff_h263_encode_motion(s, s->mv[0][1][0] - pred_x, s->f_code);
-                ff_h263_encode_motion(s, s->mv[0][1][1] - pred_y, s->f_code);
+                ff_h263_encode_motion_vector(s, s->mv[0][0][0] - pred_x,
+                                                s->mv[0][0][1] - pred_y, s->f_code);
+                ff_h263_encode_motion_vector(s, s->mv[0][1][0] - pred_x,
+                                                s->mv[0][1][1] - pred_y, s->f_code);
             }else{
                 assert(s->mv_type==MV_TYPE_8X8);
                 put_bits(&s->pb,
@@ -1283,8 +1101,8 @@ void mpeg4_encode_mb(MpegEncContext * s,
                     /* motion vectors: 8x8 mode*/
                     h263_pred_motion(s, i, 0, &pred_x, &pred_y);
 
-                    ff_h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][0] - pred_x, s->f_code);
-                    ff_h263_encode_motion(s, s->current_picture.motion_val[0][ s->block_index[i] ][1] - pred_y, s->f_code);
+                    ff_h263_encode_motion_vector(s, s->current_picture.motion_val[0][ s->block_index[i] ][0] - pred_x,
+                                                    s->current_picture.motion_val[0][ s->block_index[i] ][1] - pred_y, s->f_code);
                 }
             }
 
@@ -1292,10 +1110,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                 s->mv_bits+= get_bits_diff(s);
             }
 
-            /* encode each block */
-            for (i = 0; i < 6; i++) {
-                mpeg4_encode_block(s, block[i], i, 0, s->intra_scantable.permutated, NULL, tex_pb);
-            }
+            mpeg4_encode_blocks(s, block, NULL, NULL, NULL, tex_pb);
 
             if(interleaved_stats){
                 s->p_tex_bits+= get_bits_diff(s);
@@ -1357,10 +1172,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
             s->misc_bits+= get_bits_diff(s);
         }
 
-        /* encode each block */
-        for (i = 0; i < 6; i++) {
-            mpeg4_encode_block(s, block[i], i, dc_diff[i], scan_table[i], dc_pb, tex_pb);
-        }
+        mpeg4_encode_blocks(s, block, dc_diff, scan_table, dc_pb, tex_pb);
 
         if(interleaved_stats){
             s->i_tex_bits+= get_bits_diff(s);
@@ -1373,8 +1185,6 @@ void mpeg4_encode_mb(MpegEncContext * s,
     }
 }
 
-
-
 void h263_encode_mb(MpegEncContext * s,
 		    DCTELEM block[6][64],
 		    int motion_x, int motion_y)
@@ -1426,8 +1236,8 @@ void h263_encode_mb(MpegEncContext * s,
             h263_pred_motion(s, 0, 0, &pred_x, &pred_y);
             
             if (!s->umvplus) {  
-                ff_h263_encode_motion(s, motion_x - pred_x, 1);
-                ff_h263_encode_motion(s, motion_y - pred_y, 1);
+                ff_h263_encode_motion_vector(s, motion_x - pred_x,
+                                                motion_y - pred_y, 1);
             }
             else {
                 h263p_encode_umotion(s, motion_x - pred_x);
@@ -1455,8 +1265,8 @@ void h263_encode_mb(MpegEncContext * s,
                 motion_x= s->current_picture.motion_val[0][ s->block_index[i] ][0];
                 motion_y= s->current_picture.motion_val[0][ s->block_index[i] ][1];
                 if (!s->umvplus) {  
-                    ff_h263_encode_motion(s, motion_x - pred_x, 1);
-                    ff_h263_encode_motion(s, motion_y - pred_y, 1);
+                    ff_h263_encode_motion_vector(s, motion_x - pred_x,
+                                                    motion_y - pred_y, 1);
                 }
                 else {
                     h263p_encode_umotion(s, motion_x - pred_x);
@@ -1636,7 +1446,7 @@ void ff_h263_loop_filter(MpegEncContext * s){
                 const int chroma_qp= s->chroma_qscale_table[qp_dt];
                 s->dsp.h263_h_loop_filter(dest_y -8*linesize  ,   linesize, qp_dt);
                 s->dsp.h263_h_loop_filter(dest_cb-8*uvlinesize, uvlinesize, chroma_qp);
-                s->dsp.h263_h_loop_filter(dest_cb-8*uvlinesize, uvlinesize, chroma_qp);
+                s->dsp.h263_h_loop_filter(dest_cr-8*uvlinesize, uvlinesize, chroma_qp);
             }
         }
     }
@@ -1871,30 +1681,12 @@ void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
         bit_size = f_code - 1;
         range = 1 << bit_size;
         /* modulo encoding */
-        l = range * 32;
-#if 1
-        val+= l;
-        val&= 2*l-1;
-        val-= l;
+        l= INT_BIT - 6 - bit_size;
+        val = (val<<l)>>l;
         sign = val>>31;
         val= (val^sign)-sign;
         sign&=1;
-#else
-        if (val < -l) {
-            val += 2*l;
-        } else if (val >= l) {
-            val -= 2*l;
-        }
 
-        assert(val>=-l && val<l);
-
-        if (val >= 0) {
-            sign = 0;
-        } else {
-            val = -val;
-            sign = 1;
-        }
-#endif
         val--;
         code = (val >> bit_size) + 1;
         bits = val & (range - 1);
@@ -1904,7 +1696,6 @@ void ff_h263_encode_motion(MpegEncContext * s, int val, int f_code)
             put_bits(&s->pb, bit_size, bits);
         }
     }
-
 }
 
 /* Encode MV differences on H.263+ with Unrestricted MV mode */
@@ -2135,6 +1926,49 @@ static void init_uni_mpeg4_rl_tab(RLTable *rl, uint32_t *bits_tab, uint8_t *len_
     }
 }
 
+static void init_uni_h263_rl_tab(RLTable *rl, uint32_t *bits_tab, uint8_t *len_tab){
+    int slevel, run, last;
+    
+    assert(MAX_LEVEL >= 64);
+    assert(MAX_RUN   >= 63);
+
+    for(slevel=-64; slevel<64; slevel++){
+        if(slevel==0) continue;
+        for(run=0; run<64; run++){
+            for(last=0; last<=1; last++){
+                const int index= UNI_MPEG4_ENC_INDEX(last, run, slevel+64);
+                int level= slevel < 0 ? -slevel : slevel;
+                int sign= slevel < 0 ? 1 : 0;
+                int bits, len, code;
+                
+                len_tab[index]= 100;
+                     
+                /* ESC0 */
+                code= get_rl_index(rl, last, run, level);
+                bits= rl->table_vlc[code][0];
+                len=  rl->table_vlc[code][1];
+                bits=bits*2+sign; len++;
+                
+                if(code!=rl->n && len < len_tab[index]){
+                    if(bits_tab) bits_tab[index]= bits;
+                    len_tab [index]= len;
+                }
+                /* ESC */
+                bits= rl->table_vlc[rl->n][0];
+                len = rl->table_vlc[rl->n][1];
+                bits=bits*2+last; len++;
+                bits=bits*64+run; len+=6;
+                bits=bits*256+(level&0xff); len+=8;
+                
+                if(len < len_tab[index]){
+                    if(bits_tab) bits_tab[index]= bits;
+                    len_tab [index]= len;
+                }
+            }
+        }
+    }
+}
+
 void h263_encode_init(MpegEncContext *s)
 {
     static int done = 0;
@@ -2144,17 +1978,28 @@ void h263_encode_init(MpegEncContext *s)
 
         init_uni_dc_tab();
 
-        init_rl(&rl_inter);
-        init_rl(&rl_intra);
-        init_rl(&rl_intra_aic);
+        init_rl(&rl_inter, 1);
+        init_rl(&rl_intra, 1);
+        init_rl(&rl_intra_aic, 1);
         
         init_uni_mpeg4_rl_tab(&rl_intra, uni_mpeg4_intra_rl_bits, uni_mpeg4_intra_rl_len);
         init_uni_mpeg4_rl_tab(&rl_inter, uni_mpeg4_inter_rl_bits, uni_mpeg4_inter_rl_len);
 
+        init_uni_h263_rl_tab(&rl_intra_aic, NULL, uni_h263_intra_aic_rl_len);
+        init_uni_h263_rl_tab(&rl_inter    , NULL, uni_h263_inter_rl_len);
+
         init_mv_penalty_and_fcode(s);
     }
     s->me.mv_penalty= mv_penalty; //FIXME exact table for msmpeg4 & h263p
     
+    s->intra_ac_vlc_length     =s->inter_ac_vlc_length     = uni_h263_inter_rl_len;
+    s->intra_ac_vlc_last_length=s->inter_ac_vlc_last_length= uni_h263_inter_rl_len + 128*64;
+    if(s->h263_aic){
+        s->intra_ac_vlc_length     = uni_h263_intra_aic_rl_len;
+        s->intra_ac_vlc_last_length= uni_h263_intra_aic_rl_len + 128*64;
+    }
+    s->ac_esc_length= 7+1+6+8;
+
     // use fcodes >1 only for mpeg4 & h263 & h263p FIXME
     switch(s->codec_id){
     case CODEC_ID_MPEG4:
@@ -2176,7 +2021,8 @@ void h263_encode_init(MpegEncContext *s)
             s->avctx->extradata= av_malloc(1024);
             init_put_bits(&s->pb, s->avctx->extradata, 1024);
             
-            mpeg4_encode_visual_object_header(s);
+            if(!(s->workaround_bugs & FF_BUG_MS))
+                mpeg4_encode_visual_object_header(s);
             mpeg4_encode_vol_header(s, 0, 0);
 
 //            ff_mpeg4_stuffing(&s->pb); ?
@@ -2371,11 +2217,13 @@ void ff_set_mpeg4_time(MpegEncContext * s, int picture_number){
 
     if(s->pict_type==B_TYPE){
         s->pb_time= s->pp_time - (s->last_non_b_time - s->time);
+        assert(s->pb_time > 0 && s->pb_time < s->pp_time);
     }else{
         s->last_time_base= s->time_base;
         s->time_base= time_div;
         s->pp_time= s->time - s->last_non_b_time;
         s->last_non_b_time= s->time;
+        assert(picture_number==0 || s->pp_time > 0);
     }
 }
 
@@ -2413,13 +2261,26 @@ static void mpeg4_encode_visual_object_header(MpegEncContext * s){
     int profile_and_level_indication;
     int vo_ver_id;
     
-    if(s->max_b_frames || s->quarter_sample){
-        profile_and_level_indication= 0xF1; // adv simple level 1
+    if(s->avctx->profile != FF_PROFILE_UNKNOWN){
+        profile_and_level_indication = s->avctx->profile << 4;
+    }else if(s->max_b_frames || s->quarter_sample){
+        profile_and_level_indication= 0xF0; // adv simple
+    }else{
+        profile_and_level_indication= 0x00; // simple
+    }
+
+    if(s->avctx->level != FF_LEVEL_UNKNOWN){
+        profile_and_level_indication |= s->avctx->level;
+    }else{
+        profile_and_level_indication |= 1; //level 1
+    }
+
+    if(profile_and_level_indication>>4 == 0xF){
         vo_ver_id= 5;
     }else{
-        profile_and_level_indication= 0x01; // simple level 1
         vo_ver_id= 1;
     }
+
     //FIXME levels
 
     put_bits(&s->pb, 16, 0);
@@ -2460,9 +2321,13 @@ static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_n
 
     put_bits(&s->pb, 1, 0);		/* random access vol */
     put_bits(&s->pb, 8, s->vo_type);	/* video obj type indication */
-    put_bits(&s->pb, 1, 1);		/* is obj layer id= yes */
-      put_bits(&s->pb, 4, vo_ver_id);	/* is obj layer ver id */
-      put_bits(&s->pb, 3, 1);		/* is obj layer priority */
+    if(s->workaround_bugs & FF_BUG_MS) {
+        put_bits(&s->pb, 1, 0);        /* is obj layer id= no */
+    } else {
+        put_bits(&s->pb, 1, 1);        /* is obj layer id= yes */
+        put_bits(&s->pb, 4, vo_ver_id);    /* is obj layer ver id */
+        put_bits(&s->pb, 3, 1);        /* is obj layer priority */
+    }
     
     aspect_to_info(s, s->avctx->sample_aspect_ratio);
 
@@ -2472,13 +2337,13 @@ static void mpeg4_encode_vol_header(MpegEncContext * s, int vo_number, int vol_n
         put_bits(&s->pb, 8, s->avctx->sample_aspect_ratio.den);
     }
 
-    if(s->low_delay){
-        put_bits(&s->pb, 1, 1);		/* vol control parameters= yes */
-        put_bits(&s->pb, 2, 1);		/* chroma format YUV 420/YV12 */
+    if(s->workaround_bugs & FF_BUG_MS) { //
+        put_bits(&s->pb, 1, 0);        /* vol control parameters= no @@@ */
+    } else {
+        put_bits(&s->pb, 1, 1);        /* vol control parameters= yes */
+        put_bits(&s->pb, 2, 1);        /* chroma format YUV 420/YV12 */
         put_bits(&s->pb, 1, s->low_delay);
-        put_bits(&s->pb, 1, 0);		/* vbv parameters= no */
-    }else{
-        put_bits(&s->pb, 1, 0);		/* vol control parameters= no */
+        put_bits(&s->pb, 1, 0);        /* vbv parameters= no */
     }
 
     put_bits(&s->pb, 2, RECT_SHAPE);	/* vol shape= rectangle */
@@ -2549,7 +2414,8 @@ void mpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
             if(s->strict_std_compliance < 2 || picture_number==0) //HACK, the reference sw is buggy
                 mpeg4_encode_vol_header(s, 0, 0);
         }
-        mpeg4_encode_gop_header(s);
+        if(!(s->workaround_bugs & FF_BUG_MS))
+            mpeg4_encode_gop_header(s);
     }
     
     s->partitioned_frame= s->data_partitioning && s->pict_type!=B_TYPE;
@@ -2612,6 +2478,84 @@ void ff_set_qscale(MpegEncContext * s, int qscale)
     s->c_dc_scale= s->c_dc_scale_table[ s->chroma_qscale ];
 }
 
+/**
+ * predicts the dc.
+ * encoding quantized level -> quantized diff
+ * decoding quantized diff -> quantized level  
+ * @param n block index (0-3 are luma, 4-5 are chroma)
+ * @param dir_ptr pointer to an integer where the prediction direction will be stored
+ */
+static inline int ff_mpeg4_pred_dc(MpegEncContext * s, int n, int level, int *dir_ptr, int encoding)
+{
+    int a, b, c, wrap, pred, scale, ret;
+    uint16_t *dc_val;
+
+    /* find prediction */
+    if (n < 4) {
+	scale = s->y_dc_scale;
+    } else {
+	scale = s->c_dc_scale;
+    }
+    if(IS_3IV1)
+        scale= 8;
+
+    wrap= s->block_wrap[n];
+    dc_val = s->dc_val[0] + s->block_index[n];
+
+    /* B C
+     * A X 
+     */
+    a = dc_val[ - 1];
+    b = dc_val[ - 1 - wrap];
+    c = dc_val[ - wrap];
+
+    /* outside slice handling (we cant do that by memset as we need the dc for error resilience) */
+    if(s->first_slice_line && n!=3){
+        if(n!=2) b=c= 1024;
+        if(n!=1 && s->mb_x == s->resync_mb_x) b=a= 1024;
+    }
+    if(s->mb_x == s->resync_mb_x && s->mb_y == s->resync_mb_y+1){
+        if(n==0 || n==4 || n==5)
+            b=1024;
+    }
+
+    if (abs(a - b) < abs(b - c)) {
+	pred = c;
+        *dir_ptr = 1; /* top */
+    } else {
+	pred = a;
+        *dir_ptr = 0; /* left */
+    }
+    /* we assume pred is positive */
+    pred = FASTDIV((pred + (scale >> 1)), scale);
+
+    if(encoding){
+        ret = level - pred;
+    }else{
+        level += pred;
+        ret= level;
+        if(s->error_resilience>=3){
+            if(level<0){
+                av_log(s->avctx, AV_LOG_ERROR, "dc<0 at %dx%d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+            if(level*scale > 2048 + scale){
+                av_log(s->avctx, AV_LOG_ERROR, "dc overflow at %dx%d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+        }
+    }
+    level *=scale;
+    if(level&(~2047)){
+        if(level<0) 
+            level=0;
+        else if(!(s->workaround_bugs&FF_BUG_DC_CLIP))
+            level=2047;
+    }
+    dc_val[0]= level;
+
+    return ret;
+}
 
 /**
  * predicts the ac.
@@ -2675,11 +2619,180 @@ void mpeg4_pred_ac(MpegEncContext * s, DCTELEM *block, int n,
 
 #ifdef CONFIG_ENCODERS
 
+/**
+ * encodes the dc value.
+ * @param n block index (0-3 are luma, 4-5 are chroma)
+ */
+static inline void mpeg4_encode_dc(PutBitContext * s, int level, int n)
+{
+#if 1
+//    if(level<-255 || level>255) printf("dc overflow\n");
+    level+=256;
+    if (n < 4) {
+	/* luminance */
+	put_bits(s, uni_DCtab_lum_len[level], uni_DCtab_lum_bits[level]);
+    } else {
+	/* chrominance */
+	put_bits(s, uni_DCtab_chrom_len[level], uni_DCtab_chrom_bits[level]);
+    }
+#else
+    int size, v;
+    /* find number of bits */
+    size = 0;
+    v = abs(level);
+    while (v) {
+	v >>= 1;
+	size++;
+    }
+
+    if (n < 4) {
+	/* luminance */
+	put_bits(&s->pb, DCtab_lum[size][1], DCtab_lum[size][0]);
+    } else {
+	/* chrominance */
+	put_bits(&s->pb, DCtab_chrom[size][1], DCtab_chrom[size][0]);
+    }
+
+    /* encode remaining bits */
+    if (size > 0) {
+	if (level < 0)
+	    level = (-level) ^ ((1 << size) - 1);
+	put_bits(&s->pb, size, level);
+	if (size > 8)
+	    put_bits(&s->pb, 1, 1);
+    }
+#endif
+}
 
+static inline int mpeg4_get_dc_length(int level, int n){
+    if (n < 4) {
+        return uni_DCtab_lum_len[level + 256];
+    } else {
+        return uni_DCtab_chrom_len[level + 256];
+    }
+}
 
+/**
+ * encodes a 8x8 block
+ * @param n block index (0-3 are luma, 4-5 are chroma)
+ */
+static inline void mpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
+                               uint8_t *scan_table, PutBitContext *dc_pb, PutBitContext *ac_pb)
+{
+    int i, last_non_zero;
+#if 0 //variables for the outcommented version
+    int code, sign, last;
+#endif
+    const RLTable *rl;
+    uint32_t *bits_tab;
+    uint8_t *len_tab;
+    const int last_index = s->block_last_index[n];
 
+    if (s->mb_intra) { //Note gcc (3.2.1 at least) will optimize this away
+	/* mpeg4 based DC predictor */
+	mpeg4_encode_dc(dc_pb, intra_dc, n);
+        if(last_index<1) return;
+	i = 1;
+        rl = &rl_intra;
+        bits_tab= uni_mpeg4_intra_rl_bits;
+        len_tab = uni_mpeg4_intra_rl_len;
+    } else {
+        if(last_index<0) return;
+	i = 0;
+        rl = &rl_inter;
+        bits_tab= uni_mpeg4_inter_rl_bits;
+        len_tab = uni_mpeg4_inter_rl_len;
+    }
+
+    /* AC coefs */
+    last_non_zero = i - 1;
+#if 1
+    for (; i < last_index; i++) {
+	int level = block[ scan_table[i] ];
+	if (level) {
+	    int run = i - last_non_zero - 1;
+            level+=64;
+            if((level&(~127)) == 0){
+                const int index= UNI_MPEG4_ENC_INDEX(0, run, level);
+                put_bits(ac_pb, len_tab[index], bits_tab[index]);
+            }else{ //ESC3
+                put_bits(ac_pb, 7+2+1+6+1+12+1, (3<<23)+(3<<21)+(0<<20)+(run<<14)+(1<<13)+(((level-64)&0xfff)<<1)+1);
+            }
+	    last_non_zero = i;
+	}
+    }
+    /*if(i<=last_index)*/{
+	int level = block[ scan_table[i] ];
+        int run = i - last_non_zero - 1;
+        level+=64;
+        if((level&(~127)) == 0){
+            const int index= UNI_MPEG4_ENC_INDEX(1, run, level);
+            put_bits(ac_pb, len_tab[index], bits_tab[index]);
+        }else{ //ESC3
+            put_bits(ac_pb, 7+2+1+6+1+12+1, (3<<23)+(3<<21)+(1<<20)+(run<<14)+(1<<13)+(((level-64)&0xfff)<<1)+1);
+        }
+    }
+#else
+    for (; i <= last_index; i++) {
+	const int slevel = block[ scan_table[i] ];
+	if (slevel) {
+            int level;
+	    int run = i - last_non_zero - 1;
+	    last = (i == last_index);
+	    sign = 0;
+	    level = slevel;
+	    if (level < 0) {
+		sign = 1;
+		level = -level;
+	    }
+            code = get_rl_index(rl, last, run, level);
+            put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+            if (code == rl->n) {
+                int level1, run1;
+                level1 = level - rl->max_level[last][run];
+                if (level1 < 1) 
+                    goto esc2;
+                code = get_rl_index(rl, last, run, level1);
+                if (code == rl->n) {
+                esc2:
+                    put_bits(ac_pb, 1, 1);
+                    if (level > MAX_LEVEL)
+                        goto esc3;
+                    run1 = run - rl->max_run[last][level] - 1;
+                    if (run1 < 0)
+                        goto esc3;
+                    code = get_rl_index(rl, last, run1, level);
+                    if (code == rl->n) {
+                    esc3:
+                        /* third escape */
+                        put_bits(ac_pb, 1, 1);
+                        put_bits(ac_pb, 1, last);
+                        put_bits(ac_pb, 6, run);
+                        put_bits(ac_pb, 1, 1);
+                        put_bits(ac_pb, 12, slevel & 0xfff);
+                        put_bits(ac_pb, 1, 1);
+                    } else {
+                        /* second escape */
+                        put_bits(ac_pb, 1, 0);
+                        put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+                        put_bits(ac_pb, 1, sign);
+                    }
+                } else {
+                    /* first escape */
+                    put_bits(ac_pb, 1, 0);
+                    put_bits(ac_pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+                    put_bits(ac_pb, 1, sign);
+                }
+            } else {
+                put_bits(ac_pb, 1, sign);
+            }
+	    last_non_zero = i;
+	}
+    }
+#endif
+}
 
-static inline int mpeg4_get_block_length(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
+static int mpeg4_get_block_length(MpegEncContext * s, DCTELEM * block, int n, int intra_dc, 
                                uint8_t *scan_table)
 {
     int i, last_non_zero;
@@ -2690,7 +2803,7 @@ static inline int mpeg4_get_block_length(MpegEncContext * s, DCTELEM * block, in
 
     if (s->mb_intra) { //Note gcc (3.2.1 at least) will optimize this away
 	/* mpeg4 based DC predictor */
-	//mpeg4_encode_dc(dc_pb, intra_dc, n); //FIXME
+	len += mpeg4_get_dc_length(intra_dc, n);
         if(last_index<1) return len;
 	i = 1;
         rl = &rl_intra;
@@ -2749,13 +2862,17 @@ static VLC mb_type_b_vlc;
 static VLC h263_mbtype_b_vlc;
 static VLC cbpc_b_vlc;
 
-void init_vlc_rl(RLTable *rl)
+void init_vlc_rl(RLTable *rl, int use_static)
 {
     int i, q;
-    
+ 
+    /* Return if static table is already initialized */
+    if(use_static && rl->rl_vlc[0])
+        return;    
+
     init_vlc(&rl->vlc, 9, rl->n + 1, 
              &rl->table_vlc[0][1], 4, 2,
-             &rl->table_vlc[0][0], 4, 2);
+             &rl->table_vlc[0][0], 4, 2, use_static);
 
     
     for(q=0; q<32; q++){
@@ -2766,8 +2883,10 @@ void init_vlc_rl(RLTable *rl)
             qmul=1;
             qadd=0;
         }
-        
-        rl->rl_vlc[q]= av_malloc(rl->vlc.table_size*sizeof(RL_VLC_ELEM));
+        if(use_static)        
+            rl->rl_vlc[q]= av_mallocz_static(rl->vlc.table_size*sizeof(RL_VLC_ELEM));
+        else
+            rl->rl_vlc[q]= av_malloc(rl->vlc.table_size*sizeof(RL_VLC_ELEM));
         for(i=0; i<rl->vlc.table_size; i++){
             int code= rl->vlc.table[i][0];
             int len = rl->vlc.table[i][1];
@@ -2808,44 +2927,44 @@ void h263_decode_init_vlc(MpegEncContext *s)
 
         init_vlc(&intra_MCBPC_vlc, INTRA_MCBPC_VLC_BITS, 9, 
                  intra_MCBPC_bits, 1, 1,
-                 intra_MCBPC_code, 1, 1);
+                 intra_MCBPC_code, 1, 1, 1);
         init_vlc(&inter_MCBPC_vlc, INTER_MCBPC_VLC_BITS, 28, 
                  inter_MCBPC_bits, 1, 1,
-                 inter_MCBPC_code, 1, 1);
+                 inter_MCBPC_code, 1, 1, 1);
         init_vlc(&cbpy_vlc, CBPY_VLC_BITS, 16,
                  &cbpy_tab[0][1], 2, 1,
-                 &cbpy_tab[0][0], 2, 1);
+                 &cbpy_tab[0][0], 2, 1, 1);
         init_vlc(&mv_vlc, MV_VLC_BITS, 33,
                  &mvtab[0][1], 2, 1,
-                 &mvtab[0][0], 2, 1);
-        init_rl(&rl_inter);
-        init_rl(&rl_intra);
-        init_rl(&rvlc_rl_inter);
-        init_rl(&rvlc_rl_intra);
-        init_rl(&rl_intra_aic);
-        init_vlc_rl(&rl_inter);
-        init_vlc_rl(&rl_intra);
-        init_vlc_rl(&rvlc_rl_inter);
-        init_vlc_rl(&rvlc_rl_intra);
-        init_vlc_rl(&rl_intra_aic);
+                 &mvtab[0][0], 2, 1, 1);
+        init_rl(&rl_inter, 1);
+        init_rl(&rl_intra, 1);
+        init_rl(&rvlc_rl_inter, 1);
+        init_rl(&rvlc_rl_intra, 1);
+        init_rl(&rl_intra_aic, 1);
+        init_vlc_rl(&rl_inter, 1);
+        init_vlc_rl(&rl_intra, 1);
+        init_vlc_rl(&rvlc_rl_inter, 1);
+        init_vlc_rl(&rvlc_rl_intra, 1);
+        init_vlc_rl(&rl_intra_aic, 1);
         init_vlc(&dc_lum, DC_VLC_BITS, 10 /* 13 */,
                  &DCtab_lum[0][1], 2, 1,
-                 &DCtab_lum[0][0], 2, 1);
+                 &DCtab_lum[0][0], 2, 1, 1);
         init_vlc(&dc_chrom, DC_VLC_BITS, 10 /* 13 */,
                  &DCtab_chrom[0][1], 2, 1,
-                 &DCtab_chrom[0][0], 2, 1);
+                 &DCtab_chrom[0][0], 2, 1, 1);
         init_vlc(&sprite_trajectory, SPRITE_TRAJ_VLC_BITS, 15,
                  &sprite_trajectory_tab[0][1], 4, 2,
-                 &sprite_trajectory_tab[0][0], 4, 2);
+                 &sprite_trajectory_tab[0][0], 4, 2, 1);
         init_vlc(&mb_type_b_vlc, MB_TYPE_B_VLC_BITS, 4,
                  &mb_type_b_tab[0][1], 2, 1,
-                 &mb_type_b_tab[0][0], 2, 1);
+                 &mb_type_b_tab[0][0], 2, 1, 1);
         init_vlc(&h263_mbtype_b_vlc, H263_MBTYPE_B_VLC_BITS, 15,
                  &h263_mbtype_b_tab[0][1], 2, 1,
-                 &h263_mbtype_b_tab[0][0], 2, 1);
+                 &h263_mbtype_b_tab[0][0], 2, 1, 1);
         init_vlc(&cbpc_b_vlc, CBPC_B_VLC_BITS, 4,
                  &cbpc_b_tab[0][1], 2, 1,
-                 &cbpc_b_tab[0][0], 2, 1);
+                 &cbpc_b_tab[0][0], 2, 1, 1);
     }
 }
 
@@ -2955,7 +3074,7 @@ void ff_mpeg4_init_partitions(MpegEncContext *s)
     uint8_t *start= pbBufPtr(&s->pb);
     uint8_t *end= s->pb.buf_end;
     int size= end - start;
-    int pb_size = (((int)start + size/3)&(~3)) - (int)start;
+    int pb_size = (((long)start + size/3)&(~3)) - (long)start;
     int tex_size= (size - 2*pb_size)&(~3);
     
     set_put_bits_buffer_size(&s->pb, pb_size);
@@ -3274,53 +3393,6 @@ static inline int get_amv(MpegEncContext *s, int n){
 }
 
 /**
- * decodes the dc value.
- * @param n block index (0-3 are luma, 4-5 are chroma)
- * @param dir_ptr the prediction direction will be stored here
- * @return the quantized dc
- */
-static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
-{
-    int level, code;
-
-    if (n < 4) 
-        code = get_vlc2(&s->gb, dc_lum.table, DC_VLC_BITS, 1);
-    else 
-        code = get_vlc2(&s->gb, dc_chrom.table, DC_VLC_BITS, 1);
-    if (code < 0 || code > 9 /* && s->nbit<9 */){
-        av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
-        return -1;
-    }
-    if (code == 0) {
-        level = 0;
-    } else {
-        if(IS_3IV1){
-            if(code==1)
-                level= 2*get_bits1(&s->gb)-1;
-            else{
-                if(get_bits1(&s->gb))
-                    level = get_bits(&s->gb, code-1) + (1<<(code-1));
-                else
-                    level = -get_bits(&s->gb, code-1) - (1<<(code-1));
-            }
-        }else{
-            level = get_xbits(&s->gb, code);
-        }
-
-        if (code > 8){
-            if(get_bits1(&s->gb)==0){ /* marker */
-                if(s->error_resilience>=2){
-                    av_log(s->avctx, AV_LOG_ERROR, "dc marker bit missing\n");
-                    return -1;
-                }
-            }
-        }
-    }
-
-    return ff_mpeg4_pred_dc(s, n, level, dir_ptr, 0);
-}
-
-/**
  * decodes first partition.
  * @return number of MBs decoded or <0 if an error occured
  */
@@ -3626,263 +3698,6 @@ int ff_mpeg4_decode_partitions(MpegEncContext *s)
 }
 
 /**
- * decodes a block.
- * @return <0 if an error occured
- */
-static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
-                              int n, int coded, int intra, int rvlc)
-{
-    int level, i, last, run;
-    int dc_pred_dir;
-    RLTable * rl;
-    RL_VLC_ELEM * rl_vlc;
-    const uint8_t * scan_table;
-    int qmul, qadd;
-
-    //Note intra & rvlc should be optimized away if this is inlined
-    
-    if(intra) {
-      if(s->qscale < s->intra_dc_threshold){
-	/* DC coef */
-        if(s->partitioned_frame){
-            level = s->dc_val[0][ s->block_index[n] ];
-            if(n<4) level= FASTDIV((level + (s->y_dc_scale>>1)), s->y_dc_scale);
-            else    level= FASTDIV((level + (s->c_dc_scale>>1)), s->c_dc_scale);
-            dc_pred_dir= (s->pred_dir_table[s->mb_x + s->mb_y*s->mb_stride]<<n)&32;
-        }else{
-            level = mpeg4_decode_dc(s, n, &dc_pred_dir);
-            if (level < 0)
-                return -1;
-        }
-        block[0] = level;
-        i = 0;
-      }else{
-            i = -1;
-      }  
-        if (!coded) 
-            goto not_coded;
-        
-        if(rvlc){        
-            rl = &rvlc_rl_intra;
-            rl_vlc = rvlc_rl_intra.rl_vlc[0];
-        }else{
-            rl = &rl_intra;
-            rl_vlc = rl_intra.rl_vlc[0];
-        }
-        if (s->ac_pred) {
-            if (dc_pred_dir == 0) 
-                scan_table = s->intra_v_scantable.permutated; /* left */
-            else
-                scan_table = s->intra_h_scantable.permutated; /* top */
-        } else {
-            scan_table = s->intra_scantable.permutated;
-        }
-        qmul=1;
-        qadd=0;
-    } else {
-        i = -1;
-        if (!coded) {
-            s->block_last_index[n] = i;
-            return 0;
-        }
-        if(rvlc) rl = &rvlc_rl_inter;
-        else     rl = &rl_inter;
-   
-        scan_table = s->intra_scantable.permutated;
-
-        if(s->mpeg_quant){
-            qmul=1;
-            qadd=0;
-            if(rvlc){        
-                rl_vlc = rvlc_rl_inter.rl_vlc[0];        
-            }else{
-                rl_vlc = rl_inter.rl_vlc[0];        
-            }
-        }else{
-            qmul = s->qscale << 1;
-            qadd = (s->qscale - 1) | 1;
-            if(rvlc){        
-                rl_vlc = rvlc_rl_inter.rl_vlc[s->qscale];        
-            }else{
-                rl_vlc = rl_inter.rl_vlc[s->qscale];        
-            }
-        }
-    }
-  {
-    OPEN_READER(re, &s->gb);
-    for(;;) {
-        UPDATE_CACHE(re, &s->gb);
-        GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2);
-        if (level==0) {
-          /* escape */                
-          if(rvlc){
-                if(SHOW_UBITS(re, &s->gb, 1)==0){
-                    av_log(s->avctx, AV_LOG_ERROR, "1. marker bit missing in rvlc esc\n");
-                    return -1;
-                }; SKIP_CACHE(re, &s->gb, 1);
- 
-                last=  SHOW_UBITS(re, &s->gb, 1); SKIP_CACHE(re, &s->gb, 1);
-                run=   SHOW_UBITS(re, &s->gb, 6); LAST_SKIP_CACHE(re, &s->gb, 6);
-                SKIP_COUNTER(re, &s->gb, 1+1+6);
-                UPDATE_CACHE(re, &s->gb);
-              
-                if(SHOW_UBITS(re, &s->gb, 1)==0){
-                    av_log(s->avctx, AV_LOG_ERROR, "2. marker bit missing in rvlc esc\n");
-                    return -1;
-                }; SKIP_CACHE(re, &s->gb, 1);
- 
-                level= SHOW_UBITS(re, &s->gb, 11); SKIP_CACHE(re, &s->gb, 11);
- 
-                if(SHOW_UBITS(re, &s->gb, 5)!=0x10){
-                    av_log(s->avctx, AV_LOG_ERROR, "reverse esc missing\n");
-                    return -1;
-                }; SKIP_CACHE(re, &s->gb, 5);
-
-                level=  level * qmul + qadd;
-                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); LAST_SKIP_CACHE(re, &s->gb, 1);
-                SKIP_COUNTER(re, &s->gb, 1+11+5+1);
-
-                i+= run + 1;
-                if(last) i+=192;
-          }else{
-            int cache;
-            cache= GET_CACHE(re, &s->gb);
-
-            if(IS_3IV1) 
-                cache ^= 0xC0000000;
-
-            if (cache&0x80000000) {
-                if (cache&0x40000000) {
-                    int ulevel;
-
-                    /* third escape */
-                    SKIP_CACHE(re, &s->gb, 2);
-                    last=  SHOW_UBITS(re, &s->gb, 1); SKIP_CACHE(re, &s->gb, 1);
-                    run=   SHOW_UBITS(re, &s->gb, 6); LAST_SKIP_CACHE(re, &s->gb, 6);
-                    SKIP_COUNTER(re, &s->gb, 2+1+6);
-                    UPDATE_CACHE(re, &s->gb);
-
-                    if(IS_3IV1){
-                        level= SHOW_SBITS(re, &s->gb, 12); LAST_SKIP_BITS(re, &s->gb, 12);
-                    }else{
-                        if(SHOW_UBITS(re, &s->gb, 1)==0){
-                            av_log(s->avctx, AV_LOG_ERROR, "1. marker bit missing in 3. esc\n");
-                            return -1;
-                        }; SKIP_CACHE(re, &s->gb, 1);
-
-                        level= SHOW_SBITS(re, &s->gb, 12); SKIP_CACHE(re, &s->gb, 12);
-
-                        if(SHOW_UBITS(re, &s->gb, 1)==0){
-                            av_log(s->avctx, AV_LOG_ERROR, "2. marker bit missing in 3. esc\n");
-                            return -1;
-                        }; LAST_SKIP_CACHE(re, &s->gb, 1);
-
-                        SKIP_COUNTER(re, &s->gb, 1+12+1);
-                    }
- 
-                    if(s->mpeg_quant){
-                        if(intra) ulevel= level*s->qscale*s->intra_matrix[scan_table[1]];
-                        else      ulevel= level*s->qscale*s->inter_matrix[scan_table[0]];
-                    }else
-                        ulevel= level*s->qscale*16;
-                    if(ulevel>1030*16 || ulevel<-1030*16){
-                        av_log(s->avctx, AV_LOG_ERROR, "|level| overflow in 3. esc, qp=%d\n", s->qscale);
-                        return -1;
-                    }
-
-#if 0
-                    if(s->error_resilience >= FF_ER_COMPLIANT){
-                        const int abs_level= ABS(level);
-                        if(abs_level<=MAX_LEVEL && run<=MAX_RUN){
-                            const int run1= run - rl->max_run[last][abs_level] - 1;
-                            if(abs_level <= rl->max_level[last][run]){
-                                av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, vlc encoding possible\n");
-                                return -1;
-                            }
-                            if(s->error_resilience > FF_ER_COMPLIANT){
-                                if(abs_level <= rl->max_level[last][run]*2){
-                                    fprintf(stderr, "illegal 3. esc, esc 1 encoding possible\n");
-                                    return -1;
-                                }
-                                if(run1 >= 0 && abs_level <= rl->max_level[last][run1]){
-                                    fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n");
-                                    return -1;
-                                }
-                            }
-                        }
-                    }
-#endif
-		    if (level>0) level= level * qmul + qadd;
-                    else         level= level * qmul - qadd;
-
-                    i+= run + 1;
-                    if(last) i+=192;
-                } else {
-                    /* second escape */
-#if MIN_CACHE_BITS < 20
-                    LAST_SKIP_BITS(re, &s->gb, 2);
-                    UPDATE_CACHE(re, &s->gb);
-#else
-                    SKIP_BITS(re, &s->gb, 2);
-#endif
-                    GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2);
-                    i+= run + rl->max_run[run>>7][level/qmul] +1; //FIXME opt indexing
-                    level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                    LAST_SKIP_BITS(re, &s->gb, 1);
-                }
-            } else {
-                /* first escape */
-#if MIN_CACHE_BITS < 19
-                LAST_SKIP_BITS(re, &s->gb, 1);
-                UPDATE_CACHE(re, &s->gb);
-#else
-                SKIP_BITS(re, &s->gb, 1);
-#endif
-                GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2);
-                i+= run;
-                level = level + rl->max_level[run>>7][(run-1)&63] * qmul;//FIXME opt indexing
-                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                LAST_SKIP_BITS(re, &s->gb, 1);
-            }
-          }
-        } else {
-            i+= run;
-            level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-            LAST_SKIP_BITS(re, &s->gb, 1);
-        }
-        if (i > 62){
-            i-= 192;
-            if(i&(~63)){
-                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-
-            block[scan_table[i]] = level;
-            break;
-        }
-
-        block[scan_table[i]] = level;
-    }
-    CLOSE_READER(re, &s->gb);
-  }
- not_coded:
-    if (intra) {
-        if(s->qscale >= s->intra_dc_threshold){
-            block[0] = ff_mpeg4_pred_dc(s, n, block[0], &dc_pred_dir, 0);
-            
-            if(i == -1) i=0;
-        }
-
-        mpeg4_pred_ac(s, block, n, dc_pred_dir);
-        if (s->ac_pred) {
-            i = 63; /* XXX: not optimal */
-        }
-    }
-    s->block_last_index[n] = i;
-    return 0;
-}
-
-/**
  * decode partition C of one MB.
  * @return <0 if an error occured
  */
@@ -4681,8 +4496,8 @@ static int h263_decode_motion(MpegEncContext * s, int pred, int f_code)
 
     /* modulo decoding */
     if (!s->h263_long_vectors) {
-        l = 1 << (f_code + 4);
-        val = ((val + l)&(l*2-1)) - l;
+        l = INT_BIT - 5 - f_code;
+        val = (val<<l)>>l;
     } else {
         /* horrible h263 long vector mode */
         if (pred < -31 && val < -63)
@@ -4830,7 +4645,7 @@ retry:
                 memset(block, 0, sizeof(DCTELEM)*64);
                 goto retry;
             }
-            av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d\n", s->mb_x, s->mb_y);
+            av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d i:%d\n", s->mb_x, s->mb_y, s->mb_intra);
             return -1;
         }
         j = scan_table[i];
@@ -4848,9 +4663,307 @@ not_coded:
     return 0;
 }
 
+/**
+ * decodes the dc value.
+ * @param n block index (0-3 are luma, 4-5 are chroma)
+ * @param dir_ptr the prediction direction will be stored here
+ * @return the quantized dc
+ */
+static inline int mpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
+{
+    int level, code;
+
+    if (n < 4) 
+        code = get_vlc2(&s->gb, dc_lum.table, DC_VLC_BITS, 1);
+    else 
+        code = get_vlc2(&s->gb, dc_chrom.table, DC_VLC_BITS, 1);
+    if (code < 0 || code > 9 /* && s->nbit<9 */){
+        av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
+        return -1;
+    }
+    if (code == 0) {
+        level = 0;
+    } else {
+        if(IS_3IV1){
+            if(code==1)
+                level= 2*get_bits1(&s->gb)-1;
+            else{
+                if(get_bits1(&s->gb))
+                    level = get_bits(&s->gb, code-1) + (1<<(code-1));
+                else
+                    level = -get_bits(&s->gb, code-1) - (1<<(code-1));
+            }
+        }else{
+            level = get_xbits(&s->gb, code);
+        }
+
+        if (code > 8){
+            if(get_bits1(&s->gb)==0){ /* marker */
+                if(s->error_resilience>=2){
+                    av_log(s->avctx, AV_LOG_ERROR, "dc marker bit missing\n");
+                    return -1;
+                }
+            }
+        }
+    }
+
+    return ff_mpeg4_pred_dc(s, n, level, dir_ptr, 0);
+}
+
+/**
+ * decodes a block.
+ * @return <0 if an error occurred
+ */
+static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
+                              int n, int coded, int intra, int rvlc)
+{
+    int level, i, last, run;
+    int dc_pred_dir;
+    RLTable * rl;
+    RL_VLC_ELEM * rl_vlc;
+    const uint8_t * scan_table;
+    int qmul, qadd;
+
+    //Note: the tests on intra & rvlc should be optimized away if this function is inlined
+    
+    if(intra) {
+      if(s->qscale < s->intra_dc_threshold){
+	/* DC coef */
+        if(s->partitioned_frame){
+            level = s->dc_val[0][ s->block_index[n] ];
+            if(n<4) level= FASTDIV((level + (s->y_dc_scale>>1)), s->y_dc_scale);
+            else    level= FASTDIV((level + (s->c_dc_scale>>1)), s->c_dc_scale);
+            dc_pred_dir= (s->pred_dir_table[s->mb_x + s->mb_y*s->mb_stride]<<n)&32;
+        }else{
+            level = mpeg4_decode_dc(s, n, &dc_pred_dir);
+            if (level < 0)
+                return -1;
+        }
+        block[0] = level;
+        i = 0;
+      }else{
+            i = -1;
+      }  
+        if (!coded) 
+            goto not_coded;
+        
+        if(rvlc){        
+            rl = &rvlc_rl_intra;
+            rl_vlc = rvlc_rl_intra.rl_vlc[0];
+        }else{
+            rl = &rl_intra;
+            rl_vlc = rl_intra.rl_vlc[0];
+        }
+        if (s->ac_pred) {
+            if (dc_pred_dir == 0) 
+                scan_table = s->intra_v_scantable.permutated; /* left */
+            else
+                scan_table = s->intra_h_scantable.permutated; /* top */
+        } else {
+            scan_table = s->intra_scantable.permutated;
+        }
+        qmul=1;
+        qadd=0;
+    } else {
+        i = -1;
+        if (!coded) {
+            s->block_last_index[n] = i;
+            return 0;
+        }
+        if(rvlc) rl = &rvlc_rl_inter;
+        else     rl = &rl_inter;
+   
+        scan_table = s->intra_scantable.permutated;
+
+        if(s->mpeg_quant){
+            qmul=1;
+            qadd=0;
+            if(rvlc){        
+                rl_vlc = rvlc_rl_inter.rl_vlc[0];        
+            }else{
+                rl_vlc = rl_inter.rl_vlc[0];        
+            }
+        }else{
+            qmul = s->qscale << 1;
+            qadd = (s->qscale - 1) | 1;
+            if(rvlc){        
+                rl_vlc = rvlc_rl_inter.rl_vlc[s->qscale];        
+            }else{
+                rl_vlc = rl_inter.rl_vlc[s->qscale];        
+            }
+        }
+    }
+  {
+    OPEN_READER(re, &s->gb);
+    for(;;) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2, 0);
+        if (level==0) {
+          /* escape */                
+          if(rvlc){
+                if(SHOW_UBITS(re, &s->gb, 1)==0){
+                    av_log(s->avctx, AV_LOG_ERROR, "1. marker bit missing in rvlc esc\n");
+                    return -1;
+                }; SKIP_CACHE(re, &s->gb, 1);
+ 
+                last=  SHOW_UBITS(re, &s->gb, 1); SKIP_CACHE(re, &s->gb, 1);
+                run=   SHOW_UBITS(re, &s->gb, 6); LAST_SKIP_CACHE(re, &s->gb, 6);
+                SKIP_COUNTER(re, &s->gb, 1+1+6);
+                UPDATE_CACHE(re, &s->gb);
+              
+                if(SHOW_UBITS(re, &s->gb, 1)==0){
+                    av_log(s->avctx, AV_LOG_ERROR, "2. marker bit missing in rvlc esc\n");
+                    return -1;
+                }; SKIP_CACHE(re, &s->gb, 1);
+ 
+                level= SHOW_UBITS(re, &s->gb, 11); SKIP_CACHE(re, &s->gb, 11);
+ 
+                if(SHOW_UBITS(re, &s->gb, 5)!=0x10){
+                    av_log(s->avctx, AV_LOG_ERROR, "reverse esc missing\n");
+                    return -1;
+                }; SKIP_CACHE(re, &s->gb, 5);
+
+                level=  level * qmul + qadd;
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); LAST_SKIP_CACHE(re, &s->gb, 1);
+                SKIP_COUNTER(re, &s->gb, 1+11+5+1);
+
+                i+= run + 1;
+                if(last) i+=192;
+          }else{
+            int cache;
+            cache= GET_CACHE(re, &s->gb);
+
+            if(IS_3IV1) 
+                cache ^= 0xC0000000;
+
+            if (cache&0x80000000) {
+                if (cache&0x40000000) {
+                    /* third escape */
+                    SKIP_CACHE(re, &s->gb, 2);
+                    last=  SHOW_UBITS(re, &s->gb, 1); SKIP_CACHE(re, &s->gb, 1);
+                    run=   SHOW_UBITS(re, &s->gb, 6); LAST_SKIP_CACHE(re, &s->gb, 6);
+                    SKIP_COUNTER(re, &s->gb, 2+1+6);
+                    UPDATE_CACHE(re, &s->gb);
+
+                    if(IS_3IV1){
+                        level= SHOW_SBITS(re, &s->gb, 12); LAST_SKIP_BITS(re, &s->gb, 12);
+                    }else{
+                        if(SHOW_UBITS(re, &s->gb, 1)==0){
+                            av_log(s->avctx, AV_LOG_ERROR, "1. marker bit missing in 3. esc\n");
+                            return -1;
+                        }; SKIP_CACHE(re, &s->gb, 1);
+
+                        level= SHOW_SBITS(re, &s->gb, 12); SKIP_CACHE(re, &s->gb, 12);
+
+                        if(SHOW_UBITS(re, &s->gb, 1)==0){
+                            av_log(s->avctx, AV_LOG_ERROR, "2. marker bit missing in 3. esc\n");
+                            return -1;
+                        }; LAST_SKIP_CACHE(re, &s->gb, 1);
+
+                        SKIP_COUNTER(re, &s->gb, 1+12+1);
+                    }
+ 
+#if 0
+                    if(s->error_resilience >= FF_ER_COMPLIANT){
+                        const int abs_level= ABS(level);
+                        if(abs_level<=MAX_LEVEL && run<=MAX_RUN){
+                            const int run1= run - rl->max_run[last][abs_level] - 1;
+                            if(abs_level <= rl->max_level[last][run]){
+                                av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, vlc encoding possible\n");
+                                return -1;
+                            }
+                            if(s->error_resilience > FF_ER_COMPLIANT){
+                                if(abs_level <= rl->max_level[last][run]*2){
+                                    fprintf(stderr, "illegal 3. esc, esc 1 encoding possible\n");
+                                    return -1;
+                                }
+                                if(run1 >= 0 && abs_level <= rl->max_level[last][run1]){
+                                    fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n");
+                                    return -1;
+                                }
+                            }
+                        }
+                    }
+#endif
+		    if (level>0) level= level * qmul + qadd;
+                    else         level= level * qmul - qadd;
+
+                    if((unsigned)(level + 2048) > 4095){
+                        if(s->error_resilience > FF_ER_COMPLIANT){
+                            if(level > 2560 || level<-2560){
+                                av_log(s->avctx, AV_LOG_ERROR, "|level| overflow in 3. esc, qp=%d\n", s->qscale);
+                                return -1;
+                            }
+                        }
+                        level= level<0 ? -2048 : 2047;
+                    }
 
+                    i+= run + 1;
+                    if(last) i+=192;
+                } else {
+                    /* second escape */
+#if MIN_CACHE_BITS < 20
+                    LAST_SKIP_BITS(re, &s->gb, 2);
+                    UPDATE_CACHE(re, &s->gb);
+#else
+                    SKIP_BITS(re, &s->gb, 2);
+#endif
+                    GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2, 1);
+                    i+= run + rl->max_run[run>>7][level/qmul] +1; //FIXME opt indexing
+                    level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
+                    LAST_SKIP_BITS(re, &s->gb, 1);
+                }
+            } else {
+                /* first escape */
+#if MIN_CACHE_BITS < 19
+                LAST_SKIP_BITS(re, &s->gb, 1);
+                UPDATE_CACHE(re, &s->gb);
+#else
+                SKIP_BITS(re, &s->gb, 1);
+#endif
+                GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2, 1);
+                i+= run;
+                level = level + rl->max_level[run>>7][(run-1)&63] * qmul;//FIXME opt indexing
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
+                LAST_SKIP_BITS(re, &s->gb, 1);
+            }
+          }
+        } else {
+            i+= run;
+            level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
+            LAST_SKIP_BITS(re, &s->gb, 1);
+        }
+        if (i > 62){
+            i-= 192;
+            if(i&(~63)){
+                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+
+            block[scan_table[i]] = level;
+            break;
+        }
 
+        block[scan_table[i]] = level;
+    }
+    CLOSE_READER(re, &s->gb);
+  }
+ not_coded:
+    if (intra) {
+        if(s->qscale >= s->intra_dc_threshold){
+            block[0] = ff_mpeg4_pred_dc(s, n, block[0], &dc_pred_dir, 0);
+            
+            if(i == -1) i=0;
+        }
 
+        mpeg4_pred_ac(s, block, n, dc_pred_dir);
+        if (s->ac_pred) {
+            i = 63; /* XXX: not optimal */
+        }
+    }
+    s->block_last_index[n] = i;
+    return 0;
+}
 
 /* most is hardcoded. should extend to handle all h263 streams */
 int h263_decode_picture_header(MpegEncContext *s)
@@ -5434,7 +5547,7 @@ static int decode_vol_header(MpegEncContext *s, GetBitContext *gb){
             skip_bits1(gb);   /* marker */
             height = get_bits(gb, 13);
             skip_bits1(gb);   /* marker */
-            if(width && height){ /* they should be non zero but who knows ... */
+            if(width && height && !(s->width && s->avctx->codec_tag == ff_get_fourcc("MP4S"))){ /* they should be non zero but who knows ... */
                 s->width = width;
                 s->height = height;
 //                printf("width/height: %d %d\n", width, height);
@@ -5738,7 +5851,8 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
     
     /* vop coded */
     if (get_bits1(gb) != 1){
-        av_log(s->avctx, AV_LOG_ERROR, "vop not coded\n");
+        if(s->avctx->debug&FF_DEBUG_PICT_INFO)
+            av_log(s->avctx, AV_LOG_ERROR, "vop not coded\n");
         return FRAME_SKIPED;
     }
 //printf("time %d %d %d || %Ld %Ld %Ld\n", s->time_increment_bits, s->time_increment_resolution, s->time_base,
@@ -5874,13 +5988,20 @@ int ff_mpeg4_decode_picture_header(MpegEncContext * s, GetBitContext *gb)
 
     /* search next start code */
     align_get_bits(gb);
+
+    if(s->avctx->codec_tag == ff_get_fourcc("WV1F") && show_bits(gb, 24) == 0x575630){
+        skip_bits(gb, 24);
+        if(get_bits(gb, 8) == 0xF0)
+            return decode_vop_header(s, gb);
+    }
+
     startcode = 0xff;
     for(;;) {
         v = get_bits(gb, 8);
         startcode = ((startcode << 8) | v) & 0xffffffff;
         
         if(get_bits_count(gb) >= gb->size_in_bits){
-            if(gb->size_in_bits==8 && s->divx_version){
+            if(gb->size_in_bits==8 && (s->divx_version || s->xvid_build)){
                 av_log(s->avctx, AV_LOG_ERROR, "frame skip %d\n", gb->size_in_bits);
                 return FRAME_SKIPED; //divx bug
             }else
@@ -6059,7 +6180,7 @@ int flv_h263_decode_picture_header(MpegEncContext *s)
         width = height = 0;
         break;
     }
-    if ((width == 0) || (height == 0))
+    if(avcodec_check_dimensions(s->avctx, width, height))
         return -1;
     s->width = width;
     s->height = height;
@@ -6068,7 +6189,7 @@ int flv_h263_decode_picture_header(MpegEncContext *s)
     s->dropable= s->pict_type > P_TYPE;
     if (s->dropable)
         s->pict_type = P_TYPE;
-
+    
     skip_bits1(&s->gb);	/* deblocking flag */
     s->chroma_qscale= s->qscale = get_bits(&s->gb, 5);
 
@@ -6085,7 +6206,7 @@ int flv_h263_decode_picture_header(MpegEncContext *s)
 
     if(s->avctx->debug & FF_DEBUG_PICT_INFO){
         av_log(s->avctx, AV_LOG_DEBUG, "%c esc_type:%d, qp:%d num:%d\n",
-               av_get_pict_type_char(s->pict_type), s->h263_flv-1, s->qscale, s->picture_number);
+               s->dropable ? 'D' : av_get_pict_type_char(s->pict_type), s->h263_flv-1, s->qscale, s->picture_number);
     }
     
     s->y_dc_scale_table=
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 1ffefa1b2..da2bd54a2 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -37,8 +37,8 @@ int ff_h263_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     s->out_format = FMT_H263;
 
-    s->width = avctx->width;
-    s->height = avctx->height;
+    s->width  = avctx->coded_width;
+    s->height = avctx->coded_height;
     s->workaround_bugs= avctx->workaround_bugs;
 
     // set defaults
@@ -85,6 +85,11 @@ int ff_h263_decode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->msmpeg4_version=5;
         break;
+    case CODEC_ID_WMV3:
+        s->h263_msmpeg4 = 1;
+        s->h263_pred = 1;
+        s->msmpeg4_version=6;
+        break;
     case CODEC_ID_H263I:
         break;
     case CODEC_ID_FLV1:
@@ -139,6 +144,7 @@ static int get_consumed_bytes(MpegEncContext *s, int buf_size){
 
 static int decode_slice(MpegEncContext *s){
     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
+    const int mb_size= 16>>s->avctx->lowres;
     s->last_resync_gb= s->gb;
     s->first_slice_line= 1;
         
@@ -214,7 +220,7 @@ static int decode_slice(MpegEncContext *s){
                         
                     if(++s->mb_x >= s->mb_width){
                         s->mb_x=0;
-                        ff_draw_horiz_band(s, s->mb_y*16, 16);
+                        ff_draw_horiz_band(s, s->mb_y*mb_size, mb_size);
                         s->mb_y++;
                     }
                     return 0; 
@@ -234,7 +240,7 @@ static int decode_slice(MpegEncContext *s){
                 ff_h263_loop_filter(s);
         }
         
-        ff_draw_horiz_band(s, s->mb_y*16, 16);
+        ff_draw_horiz_band(s, s->mb_y*mb_size, mb_size);
         
         s->mb_x= 0;
     }
@@ -254,18 +260,25 @@ static int decode_slice(MpegEncContext *s){
         
         if(bits_left==0){
             s->padding_bug_score+=16;
-        }else if(bits_left>8){
-            s->padding_bug_score++;
         } else if(bits_left != 1){
             int v= show_bits(&s->gb, 8);
             v|= 0x7F >> (7-(bits_count&7));
 
-            if(v==0x7F)
+            if(v==0x7F && bits_left<=8)
                 s->padding_bug_score--;
+            else if(v==0x7F && ((get_bits_count(&s->gb)+8)&8) && bits_left<=16)
+                s->padding_bug_score+= 4;
             else
                 s->padding_bug_score++;            
         }                          
     }
+    
+    if(s->workaround_bugs&FF_BUG_AUTODETECT){
+        if(s->padding_bug_score > -2 && !s->data_partitioning /*&& (s->divx_version || !s->resync_marker)*/)
+            s->workaround_bugs |=  FF_BUG_NO_PADDING;
+        else
+            s->workaround_bugs &= ~FF_BUG_NO_PADDING;
+    }
 
     // handle formats which dont have unique end markers
     if(s->msmpeg4_version || (s->workaround_bugs&FF_BUG_NO_PADDING)){ //FIXME perhaps solve this more cleanly
@@ -515,13 +528,13 @@ retry:
         if(s->avctx->codec_tag == ff_get_fourcc("DIVX") && s->vo_type==0 && s->vol_control_parameters==0)
             s->divx_version= 400; //divx 4
     }
+    
+    if(s->xvid_build && s->divx_version){
+        s->divx_version=
+        s->divx_build= 0;
+    }
 
     if(s->workaround_bugs&FF_BUG_AUTODETECT){
-        s->workaround_bugs &= ~FF_BUG_NO_PADDING;
-        
-        if(s->padding_bug_score > -2 && !s->data_partitioning && (s->divx_version || !s->resync_marker))
-            s->workaround_bugs |=  FF_BUG_NO_PADDING;
-
         if(s->avctx->codec_tag == ff_get_fourcc("XVIX")) 
             s->workaround_bugs|= FF_BUG_XVID_ILACE;
 
@@ -623,13 +636,21 @@ retry:
     fprintf(f, "%d %d %f\n", buf_size, s->qscale, buf_size*(double)s->qscale);
 }
 #endif
-       
+
+#ifdef HAVE_MMX
+    if(s->codec_id == CODEC_ID_MPEG4 && s->xvid_build && avctx->idct_algo == FF_IDCT_AUTO && (mm_flags & MM_MMX) && !(s->flags&CODEC_FLAG_BITEXACT)){
+        avctx->idct_algo= FF_IDCT_LIBMPEG2MMX;
+        avctx->coded_width= 0; // force reinit
+    }
+#endif
+
         /* After H263 & mpeg4 header decode we have the height, width,*/
         /* and other parameters. So then we could init the picture   */
         /* FIXME: By the way H263 decoder is evolving it should have */
         /* an H263EncContext                                         */
     
-    if (   s->width != avctx->width || s->height != avctx->height) {
+    if (   s->width  != avctx->coded_width 
+        || s->height != avctx->coded_height) {
         /* H.263 could change picture size any time */
         ParseContext pc= s->parse_context; //FIXME move these demuxng hack to avformat
         s->parse_context.buffer=0;
@@ -637,8 +658,7 @@ retry:
         s->parse_context= pc;
     }
     if (!s->context_initialized) {
-        avctx->width = s->width;
-        avctx->height = s->height;
+        avcodec_set_dimensions(avctx, s->width, s->height);
 
         goto retry;
     }
@@ -651,7 +671,7 @@ retry:
     s->current_picture.key_frame= s->pict_type == I_TYPE;
 
     /* skip b frames if we dont have reference frames */
-    if(s->last_picture_ptr==NULL && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
+    if(s->last_picture_ptr==NULL && (s->pict_type==B_TYPE || s->dropable)) return get_consumed_bytes(s, buf_size);
     /* skip b frames if we are in a hurry */
     if(avctx->hurry_up && s->pict_type==B_TYPE) return get_consumed_bytes(s, buf_size);
     /* skip everything if we are in a hurry>=5 */
@@ -709,9 +729,8 @@ retry:
     if(s->codec_id==CODEC_ID_MPEG4 && s->bitstream_buffer_size==0 && s->divx_packed){
         int current_pos= get_bits_count(&s->gb)>>3;
         int startcode_found=0;
-
-        if(   buf_size - current_pos > 5 
-           && buf_size - current_pos < BITSTREAM_BUFFER_SIZE){
+        
+        if(buf_size - current_pos > 5){
             int i;
             for(i=current_pos; i<buf_size-3; i++){
                 if(buf[i]==0 && buf[i+1]==0 && buf[i+2]==1 && buf[i+3]==0xB6){
@@ -726,6 +745,10 @@ retry:
         }
 
         if(startcode_found){
+            s->bitstream_buffer= av_fast_realloc(
+                s->bitstream_buffer, 
+                &s->allocated_bitstream_buffer_size, 
+                buf_size - current_pos + FF_INPUT_BUFFER_PADDING_SIZE);
             memcpy(s->bitstream_buffer, buf + current_pos, buf_size - current_pos);
             s->bitstream_buffer_size= buf_size - current_pos;
         }
@@ -760,12 +783,6 @@ printf("%Ld\n", rdtsc()-time);
     return get_consumed_bytes(s, buf_size);
 }
 
-static const AVOption mpeg4_decoptions[] =
-{
-    AVOPTION_SUB(avoptions_workaround_bug),
-    AVOPTION_END()
-};
-
 AVCodec mpeg4_decoder = {
     "mpeg4",
     CODEC_TYPE_VIDEO,
@@ -775,8 +792,7 @@ AVCodec mpeg4_decoder = {
     NULL,
     ff_h263_decode_end,
     ff_h263_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
-    .options = mpeg4_decoptions,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
     .flush= ff_mpeg_flush,
 };
 
@@ -789,7 +805,7 @@ AVCodec h263_decoder = {
     NULL,
     ff_h263_decode_end,
     ff_h263_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
     .flush= ff_mpeg_flush,
 };
 
@@ -803,7 +819,6 @@ AVCodec msmpeg4v1_decoder = {
     ff_h263_decode_end,
     ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
-    mpeg4_decoptions,
 };
 
 AVCodec msmpeg4v2_decoder = {
@@ -816,7 +831,6 @@ AVCodec msmpeg4v2_decoder = {
     ff_h263_decode_end,
     ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
-    mpeg4_decoptions,
 };
 
 AVCodec msmpeg4v3_decoder = {
@@ -829,7 +843,6 @@ AVCodec msmpeg4v3_decoder = {
     ff_h263_decode_end,
     ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
-    .options = mpeg4_decoptions,
 };
 
 AVCodec wmv1_decoder = {
@@ -842,7 +855,6 @@ AVCodec wmv1_decoder = {
     ff_h263_decode_end,
     ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
-    mpeg4_decoptions,
 };
 
 AVCodec h263i_decoder = {
@@ -855,7 +867,6 @@ AVCodec h263i_decoder = {
     ff_h263_decode_end,
     ff_h263_decode_frame,
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
-    mpeg4_decoptions,
 };
 
 AVCodec flv_decoder = {
diff --git a/src/libffmpeg/libavcodec/h264.c b/src/libffmpeg/libavcodec/h264.c
index 889900362..39ee4c6a2 100644
--- a/src/libffmpeg/libavcodec/h264.c
+++ b/src/libffmpeg/libavcodec/h264.c
@@ -87,6 +87,8 @@ typedef struct SPS{
     uint32_t time_scale;
     int fixed_frame_rate_flag;
     short offset_for_ref_frame[256]; //FIXME dyn aloc?
+    int bitstream_restriction_flag;
+    int num_reorder_frames;
 }SPS;
 
 /**
@@ -151,10 +153,12 @@ typedef struct H264Context{
     uint8_t *rbsp_buffer;
     int rbsp_buffer_size;
 
-    // AVC
-    int is_avc; // != 0 if data is avc variant of h264
-    int got_avcC; // flag to parse avcC data only once
-    int nal_length_size; // Number of bytes used for nal length (1, 2 or 4)
+    /**
+      * Used to parse AVC variant of h264
+      */
+    int is_avc; ///< this flag is != 0 if codec is avc1
+    int got_avcC; ///< flag used to parse avcC data only once
+    int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 
     int chroma_qp; //QPc
 
@@ -163,6 +167,9 @@ typedef struct H264Context{
     //prediction stuff
     int chroma_pred_mode;
     int intra16x16_pred_mode;
+
+    int top_mb_xy;
+    int left_mb_xy[2];
     
     int8_t intra4x4_pred_mode_cache[5*8];
     int8_t (*intra4x4_pred_mode)[8];
@@ -173,21 +180,21 @@ typedef struct H264Context{
     unsigned int top_samples_available;
     unsigned int topright_samples_available;
     unsigned int left_samples_available;
-    uint8_t (*top_border)[16+2*8];
-    uint8_t left_border[17+2*9];
+    uint8_t (*top_borders[2])[16+2*8];
+    uint8_t left_border[2*(17+2*9)];
 
     /**
      * non zero coeff count cache.
      * is 64 if not available.
      */
-    uint8_t non_zero_count_cache[6*8];
+    uint8_t non_zero_count_cache[6*8] __align8;
     uint8_t (*non_zero_count)[16];
 
     /**
      * Motion vector cache.
      */
-    int16_t mv_cache[2][5*8][2];
-    int8_t ref_cache[2][5*8];
+    int16_t mv_cache[2][5*8][2] __align8;
+    int8_t ref_cache[2][5*8] __align8;
 #define LIST_NOT_USED -1 //FIXME rename?
 #define PART_NOT_AVAILABLE -2
     
@@ -196,12 +203,15 @@ typedef struct H264Context{
      */
     int mv_cache_clean[2];
 
-    int block_offset[16+8];
-    int chroma_subblock_offset[16]; //FIXME remove
+    /**
+     * block_offset[ 0..23] for frame macroblocks
+     * block_offset[24..47] for field macroblocks
+     */
+    int block_offset[2*(16+8)];
     
     uint16_t *mb2b_xy; //FIXME are these 4 a good idea?
     uint16_t *mb2b8_xy;
-    int b_stride;
+    int b_stride; //FIXME use s->b4_stride
     int b8_stride;
 
     int halfpel_flag;
@@ -226,6 +236,7 @@ typedef struct H264Context{
     int slice_type_fixed;
     
     //interlacing specific flags
+    int mb_aff_frame;
     int mb_field_decoding_flag;
     
     int sub_mb_type[4];
@@ -253,12 +264,15 @@ typedef struct H264Context{
     int max_pic_num;
 
     //Weighted pred stuff
+    int use_weight;
+    int use_weight_chroma;
     int luma_log2_weight_denom;
     int chroma_log2_weight_denom;
     int luma_weight[2][16];
     int luma_offset[2][16];
     int chroma_weight[2][16][2];
     int chroma_offset[2][16][2];
+    int implicit_weight[16][16];
    
     //deblock
     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0 
@@ -268,16 +282,20 @@ typedef struct H264Context{
     int redundant_pic_count;
     
     int direct_spatial_mv_pred;
+    int dist_scale_factor[16];
+    int map_col_to_list0[2][16];
 
     /**
      * num_ref_idx_l0/1_active_minus1 + 1
      */
     int ref_count[2];// FIXME split for AFF
-    Picture *short_ref[16];
-    Picture *long_ref[16];
+    Picture *short_ref[32];
+    Picture *long_ref[32];
     Picture default_ref_list[2][32];
     Picture ref_list[2][32]; //FIXME size?
     Picture field_ref_list[2][32]; //FIXME size?
+    Picture *delayed_pic[16]; //FIXME size?
+    Picture *delayed_output_pic;
     
     /**
      * memory management control operations buffer.
@@ -305,11 +323,15 @@ typedef struct H264Context{
 
     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
     uint16_t     *cbp_table;
+    int top_cbp;
+    int left_cbp;
     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
     uint8_t     *chroma_pred_mode_table;
     int         last_qscale_diff;
     int16_t     (*mvd_table[2])[2];
-    int16_t     mvd_cache[2][5*8][2];
+    int16_t     mvd_cache[2][5*8][2] __align8;
+    uint8_t     *direct_table;
+    uint8_t     direct_cache[5*8];
 
 }H264Context;
 
@@ -324,7 +346,7 @@ static VLC run7_vlc;
 
 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
-static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr);
+static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 
 static inline uint32_t pack16to32(int a, int b){
 #ifdef WORDS_BIGENDIAN
@@ -336,8 +358,8 @@ static inline uint32_t pack16to32(int a, int b){
 
 /**
  * fill a rectangle.
- * @param h height of the recatangle, should be a constant
- * @param w width of the recatangle, should be a constant
+ * @param h height of the rectangle, should be a constant
+ * @param w width of the rectangle, should be a constant
  * @param size the size of val (1 or 4), should be a constant
  */
 static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
@@ -347,6 +369,7 @@ static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t v
     w      *= size;
     stride *= size;
     
+    assert((((int)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 //FIXME check what gcc generates for 64 bit on x86 and possible write a 32 bit ver of it
     if(w==2 && h==2){
         *(uint16_t*)(p + 0)=
@@ -397,38 +420,110 @@ static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t v
         assert(0);
 }
 
-static inline void fill_caches(H264Context *h, int mb_type){
+static inline void fill_caches(H264Context *h, int mb_type, int for_deblock){
     MpegEncContext * const s = &h->s;
     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
     int topleft_xy, top_xy, topright_xy, left_xy[2];
     int topleft_type, top_type, topright_type, left_type[2];
-    int left_block[4];
+    int left_block[8];
     int i;
 
     //wow what a mess, why didnt they simplify the interlacing&intra stuff, i cant imagine that these complex rules are worth it 
     
-    if(h->sps.mb_aff){
-    //FIXME
-        topleft_xy = 0; /* avoid warning */
-        top_xy = 0; /* avoid warning */
-        topright_xy = 0; /* avoid warning */
+    top_xy     = mb_xy  - s->mb_stride;
+    topleft_xy = top_xy - 1;
+    topright_xy= top_xy + 1;
+    left_xy[1] = left_xy[0] = mb_xy-1;
+    left_block[0]= 0;
+    left_block[1]= 1;
+    left_block[2]= 2;
+    left_block[3]= 3;
+    left_block[4]= 7;
+    left_block[5]= 10;
+    left_block[6]= 8;
+    left_block[7]= 11;
+    if(h->mb_aff_frame){
+        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
+        const int top_pair_xy      = pair_xy     - s->mb_stride;
+        const int topleft_pair_xy  = top_pair_xy - 1;
+        const int topright_pair_xy = top_pair_xy + 1;
+        const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
+        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
+        const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
+        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
+        const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
+        const int bottom = (s->mb_y & 1);
+        tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
+        if (bottom
+                ? !curr_mb_frame_flag // bottom macroblock
+                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
+                ) {
+            top_xy -= s->mb_stride;
+        }
+        if (bottom
+                ? !curr_mb_frame_flag // bottom macroblock
+                : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
+                ) {
+            topleft_xy -= s->mb_stride;
+        }
+        if (bottom
+                ? !curr_mb_frame_flag // bottom macroblock
+                : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
+                ) {
+            topright_xy -= s->mb_stride;
+        }
+        if (left_mb_frame_flag != curr_mb_frame_flag) {
+            left_xy[1] = left_xy[0] = pair_xy - 1;
+            if (curr_mb_frame_flag) {
+                if (bottom) {
+                    left_block[0]= 2;
+                    left_block[1]= 2;
+                    left_block[2]= 3;
+                    left_block[3]= 3;
+                    left_block[4]= 8;
+                    left_block[5]= 11;
+                    left_block[6]= 8;
+                    left_block[7]= 11;
+                } else {
+                    left_block[0]= 0;
+                    left_block[1]= 0;
+                    left_block[2]= 1;
+                    left_block[3]= 1;
+                    left_block[4]= 7;
+                    left_block[5]= 10;
+                    left_block[6]= 7;
+                    left_block[7]= 10;
+                }
+            } else {
+                left_xy[1] += s->mb_stride;
+                //left_block[0]= 0;
+                left_block[1]= 2;
+                left_block[2]= 0;
+                left_block[3]= 2;
+                //left_block[4]= 7;
+                left_block[5]= 10;
+                left_block[6]= 7;
+                left_block[7]= 10;
+            }
+        }
+    }
+
+    h->top_mb_xy = top_xy;
+    h->left_mb_xy[0] = left_xy[0];
+    h->left_mb_xy[1] = left_xy[1];
+    if(for_deblock){
+        topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
+        top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
+        topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
+        left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
+        left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
     }else{
-        topleft_xy = mb_xy-1 - s->mb_stride;
-        top_xy     = mb_xy   - s->mb_stride;
-        topright_xy= mb_xy+1 - s->mb_stride;
-        left_xy[0]   = mb_xy-1;
-        left_xy[1]   = mb_xy-1;
-        left_block[0]= 0;
-        left_block[1]= 1;
-        left_block[2]= 2;
-        left_block[3]= 3;
-    }
-
-    topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
-    top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
-    topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
-    left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
-    left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
+        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
+        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
+        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
+        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
+        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
+    }
 
     if(IS_INTRA(mb_type)){
         h->topleft_samples_available= 
@@ -462,10 +557,10 @@ static inline void fill_caches(H264Context *h, int mb_type){
                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
             }else{
                 int pred;
-                if(IS_INTRA16x16(top_type) || (IS_INTER(top_type) && !h->pps.constrained_intra_pred))
-                    pred= 2;
-                else{
+                if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
                     pred= -1;
+                else{
+                    pred= 2;
                 }
                 h->intra4x4_pred_mode_cache[4+8*0]=
                 h->intra4x4_pred_mode_cache[5+8*0]=
@@ -478,10 +573,10 @@ static inline void fill_caches(H264Context *h, int mb_type){
                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
                 }else{
                     int pred;
-                    if(IS_INTRA16x16(left_type[i]) || (IS_INTER(left_type[i]) && !h->pps.constrained_intra_pred))
-                        pred= 2;
-                    else{
+                    if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
                         pred= -1;
+                    else{
+                        pred= 2;
                     }
                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
@@ -501,16 +596,17 @@ static inline void fill_caches(H264Context *h, int mb_type){
 */
 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
     if(top_type){
-        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][0];
-        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][1];
-        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][2];
+        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
+        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
+        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
     
-        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][7];
+        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
     
-        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][10];
+        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
+        
     }else{
         h->non_zero_count_cache[4+8*0]=      
         h->non_zero_count_cache[5+8*0]=
@@ -521,44 +617,61 @@ static inline void fill_caches(H264Context *h, int mb_type){
         h->non_zero_count_cache[2+8*0]=
     
         h->non_zero_count_cache[1+8*3]=
-        h->non_zero_count_cache[2+8*3]= 64;
+        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
+        
     }
-    
-    if(left_type[0]){
-        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][6];
-        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][5];
-        h->non_zero_count_cache[0+8*1]= h->non_zero_count[left_xy[0]][9]; //FIXME left_block
-        h->non_zero_count_cache[0+8*4]= h->non_zero_count[left_xy[0]][12];
-    }else{
-        h->non_zero_count_cache[3+8*1]= 
-        h->non_zero_count_cache[3+8*2]= 
-        h->non_zero_count_cache[0+8*1]= 
-        h->non_zero_count_cache[0+8*4]= 64;
+
+    for (i=0; i<2; i++) {
+        if(left_type[i]){
+            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
+            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
+            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
+            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
+        }else{
+            h->non_zero_count_cache[3+8*1 + 2*8*i]= 
+            h->non_zero_count_cache[3+8*2 + 2*8*i]= 
+            h->non_zero_count_cache[0+8*1 +   8*i]= 
+            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
+        }
     }
-    
-    if(left_type[1]){
-        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[1]][4];
-        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[1]][3];
-        h->non_zero_count_cache[0+8*2]= h->non_zero_count[left_xy[1]][8];
-        h->non_zero_count_cache[0+8*5]= h->non_zero_count[left_xy[1]][11];
-    }else{
-        h->non_zero_count_cache[3+8*3]= 
-        h->non_zero_count_cache[3+8*4]= 
-        h->non_zero_count_cache[0+8*2]= 
-        h->non_zero_count_cache[0+8*5]= 64;
+
+    if( h->pps.cabac ) {
+        // top_cbp
+        if(top_type) {
+            h->top_cbp = h->cbp_table[top_xy];
+        } else if(IS_INTRA(mb_type)) {
+            h->top_cbp = 0x1C0;
+        } else {
+            h->top_cbp = 0;
+        }
+        // left_cbp
+        if (left_type[0]) {
+            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
+        } else if(IS_INTRA(mb_type)) {
+            h->left_cbp = 0x1C0;
+        } else {
+            h->left_cbp = 0;
+        }
+        if (left_type[0]) {
+            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
+        }
+        if (left_type[1]) {
+            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
+        }
     }
-    
+
 #if 1
-    if(IS_INTER(mb_type)){
+    //FIXME direct mb can skip much of this
+    if(IS_INTER(mb_type) || (IS_DIRECT(mb_type) && h->direct_spatial_mv_pred)){
         int list;
         for(list=0; list<2; list++){
-            if((!IS_8X8(mb_type)) && !USES_LIST(mb_type, list)){
+            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !for_deblock){
                 /*if(!h->mv_cache_clean[list]){
                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
                     h->mv_cache_clean[list]= 1;
                 }*/
-                continue; //FIXME direct mode ...
+                continue;
             }
             h->mv_cache_clean[list]= 0;
             
@@ -630,6 +743,9 @@ static inline void fill_caches(H264Context *h, int mb_type){
                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
+            if(for_deblock)
+                continue;
+
             h->ref_cache[list][scan8[5 ]+1] = 
             h->ref_cache[list][scan8[7 ]+1] = 
             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewher else)
@@ -683,9 +799,35 @@ static inline void fill_caches(H264Context *h, int mb_type){
                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
+
+                if(h->slice_type == B_TYPE){
+                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
+
+                    if(IS_DIRECT(top_type)){
+                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
+                    }else if(IS_8X8(top_type)){
+                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
+                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
+                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
+                    }else{
+                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
+                    }
+                    
+                    //FIXME interlacing
+                    if(IS_DIRECT(left_type[0])){
+                        h->direct_cache[scan8[0] - 1 + 0*8]=
+                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
+                    }else if(IS_8X8(left_type[0])){
+                        int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
+                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
+                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
+                    }else{
+                        h->direct_cache[scan8[0] - 1 + 0*8]=
+                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
+                    }
+                }
             }
         }
-//FIXME
     }
 #endif
 }
@@ -747,8 +889,10 @@ static inline int check_intra_pred_mode(H264Context *h, int mode){
     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
     
-    if(mode < 0 || mode > 6)
+    if(mode < 0 || mode > 6) {
+        av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
         return -1;
+    }
     
     if(!(h->top_samples_available&0x8000)){
         mode= top[ mode ];
@@ -788,21 +932,21 @@ static inline void write_back_non_zero_count(H264Context *h){
     MpegEncContext * const s = &h->s;
     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 
-    h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[4+8*4];
-    h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[5+8*4];
-    h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[6+8*4];
+    h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
+    h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
+    h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
-    h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[7+8*3];
-    h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[7+8*2];
-    h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[7+8*1];
+    h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
+    h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
+    h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
     
-    h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[1+8*2];
+    h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
-    h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[2+8*1];
+    h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 
-    h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[1+8*5];
+    h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
-    h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[2+8*4];
+    h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 }
 
 /**
@@ -864,6 +1008,7 @@ static inline void pred_motion(H264Context * const h, int n, int part_width, int
 
     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
+    tprintf("pred_motion match_count=%d\n", match_count);
     if(match_count > 1){ //most common
         *mx= mid_pred(A[0], B[0], C[0]);
         *my= mid_pred(A[1], B[1], C[1]);
@@ -902,7 +1047,7 @@ static inline void pred_16x8_motion(H264Context * const h, int n, int list, int
         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 
-        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
+        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
         
         if(top_ref == ref){
             *mx= B[0];
@@ -913,7 +1058,7 @@ static inline void pred_16x8_motion(H264Context * const h, int n, int list, int
         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
         
-        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
+        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 
         if(left_ref == ref){
             *mx= A[0];
@@ -937,7 +1082,7 @@ static inline void pred_8x16_motion(H264Context * const h, int n, int list, int
         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
         
-        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
+        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 
         if(left_ref == ref){
             *mx= A[0];
@@ -950,7 +1095,7 @@ static inline void pred_8x16_motion(H264Context * const h, int n, int list, int
 
         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
         
-        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
+        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 
         if(diagonal_ref == ref){ 
             *mx= C[0];
@@ -967,7 +1112,7 @@ static inline void pred_pskip_motion(H264Context * const h, int * const mx, int
     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 
-    tprintf("pred_pskip: (%d) (%d) at %2d %2d", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
+    tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 
     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
@@ -982,6 +1127,226 @@ static inline void pred_pskip_motion(H264Context * const h, int * const mx, int
     return;
 }
 
+static inline void direct_dist_scale_factor(H264Context * const h){ // fills h->dist_scale_factor[i] for every active list-0 reference i
+    const int poc = h->s.current_picture_ptr->poc; // POC of the picture being decoded
+    const int poc1 = h->ref_list[1][0].poc; // POC of the first list-1 reference
+    int i;
+    for(i=0; i<h->ref_count[0]; i++){
+        int poc0 = h->ref_list[0][i].poc;
+        int td = clip(poc1 - poc0, -128, 127); // POC distance between the two references, clipped to [-128,127]
+        if(td == 0 /* FIXME || pic0 is a long-term ref */){
+            h->dist_scale_factor[i] = 256; // 256 is unity for users that compute (factor*mv + 128) >> 8; this branch also keeps td == 0 away from the division below
+        }else{
+            int tb = clip(poc - poc0, -128, 127); // POC distance from the list-0 reference to the current picture
+            int tx = (16384 + (ABS(td) >> 1)) / td; // roughly 16384/td; td is non-zero in this branch
+            h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023); // roughly 256*tb/td, clipped to [-1024,1023]
+        }
+    }
+}
+static inline void direct_ref_list_init(H264Context * const h){ // stores the current reference counts/POCs in the current picture and, for non-spatial direct B pictures, builds h->map_col_to_list0
+    MpegEncContext * const s = &h->s;
+    Picture * const ref1 = &h->ref_list[1][0]; // first list-1 reference
+    Picture * const cur = s->current_picture_ptr;
+    int list, i, j;
+    if(cur->pict_type == I_TYPE) // NOTE(review): this store and the next one are overwritten unconditionally by the loop below - confirm the intended order
+        cur->ref_count[0] = 0;
+    if(cur->pict_type != B_TYPE)
+        cur->ref_count[1] = 0;
+    for(list=0; list<2; list++){
+        cur->ref_count[list] = h->ref_count[list]; // remember how many references this list currently has
+        for(j=0; j<h->ref_count[list]; j++)
+            cur->ref_poc[list][j] = h->ref_list[list][j].poc; // and the POC of each of them
+    }
+    if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred) // the map below is only built for B pictures that do not use spatial direct prediction
+        return;
+    for(list=0; list<2; list++){
+        for(i=0; i<ref1->ref_count[list]; i++){ // walk the references that ref1 recorded for itself
+            const int poc = ref1->ref_poc[list][i];
+            h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE; // default when no entry with the same POC is found
+            for(j=0; j<h->ref_count[list]; j++) // note: searches our list with the same index 'list', not always list 0
+                if(h->ref_list[list][j].poc == poc){
+                    h->map_col_to_list0[list][i] = j; // first matching index wins
+                    break;
+                }
+        }
+    }
+}
+
+static inline void pred_direct_motion(H264Context * const h, int *mb_type){ // fills h->mv_cache / h->ref_cache for a direct macroblock or its direct 8x8 sub-blocks; rewrites *mb_type and h->sub_mb_type[]
+    MpegEncContext * const s = &h->s;
+    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
+    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
+    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy]; // type of the macroblock at the same address in the first list-1 reference (the "col" block)
+    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy]; // list-0 motion vectors of that picture, starting at this macroblock
+    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy]; // its list-0 reference indices
+    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy]; // its list-1 reference indices
+    const int is_b8x8 = IS_8X8(*mb_type); // when set, only sub-blocks whose sub_mb_type is direct are processed below
+    int sub_mb_type;
+    int i8, i4;
+
+    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
+        /* FIXME save sub mb types from previous frames (or derive from MVs)
+         * so we know exactly what block size to use */
+        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
+        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
+    }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
+        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
+    }else{
+        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
+    }
+    if(!is_b8x8)
+        *mb_type |= MB_TYPE_DIRECT2; // a whole-macroblock direct type keeps the direct flag
+
+    tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
+    
+    if(h->direct_spatial_mv_pred){ // spatial mode: references and vectors come from the neighbour entries of the caches
+        int ref[2];
+        int mv[2][2];
+        int list;
+
+        /* ref = min(neighbors) */
+        for(list=0; list<2; list++){
+            int refa = h->ref_cache[list][scan8[0] - 1]; // left neighbour
+            int refb = h->ref_cache[list][scan8[0] - 8]; // top neighbour
+            int refc = h->ref_cache[list][scan8[0] - 8 + 4]; // presumably the top-right neighbour - confirm against the scan8 layout
+            if(refc == -2) // NOTE(review): -2 is presumably PART_NOT_AVAILABLE - confirm
+                refc = h->ref_cache[list][scan8[0] - 8 - 1]; // fall back to the entry one left of the top neighbour
+            ref[list] = refa;
+            if(ref[list] < 0 || (refb < ref[list] && refb >= 0)) // keep the smallest non-negative index seen so far
+                ref[list] = refb;
+            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
+                ref[list] = refc;
+            if(ref[list] < 0)
+                ref[list] = -1; // collapse every negative value to -1
+        }
+
+        if(ref[0] < 0 && ref[1] < 0){ // no usable neighbour in either list: use index 0 with zero vectors for both
+            ref[0] = ref[1] = 0;
+            mv[0][0] = mv[0][1] =
+            mv[1][0] = mv[1][1] = 0;
+        }else{
+            for(list=0; list<2; list++){
+                if(ref[list] >= 0)
+                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
+                else
+                    mv[list][0] = mv[list][1] = 0;
+            }
+        }
+
+        if(ref[1] < 0){ // drop the prediction flag of a list that ended up without a reference
+            *mb_type &= ~MB_TYPE_P0L1;
+            sub_mb_type &= ~MB_TYPE_P0L1;
+        }else if(ref[0] < 0){
+            *mb_type &= ~MB_TYPE_P0L0;
+            sub_mb_type &= ~MB_TYPE_P0L0;
+        }
+
+        if(IS_16X16(*mb_type)){
+            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref[0], 1);
+            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, ref[1], 1);
+            if(!IS_INTRA(mb_type_col) && l1ref0[0] == 0 &&
+                ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1){ // col block uses list-0 index 0 with both vector components within +-1
+                if(ref[0] > 0) // only a reference index above 0 keeps its predicted vector; otherwise the vector is zeroed
+                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
+                else
+                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+                if(ref[1] > 0)
+                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
+                else
+                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+            }else{
+                fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
+                fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
+            }
+        }else{
+            for(i8=0; i8<4; i8++){ // one iteration per 8x8 quadrant
+                const int x8 = i8&1;
+                const int y8 = i8>>1;
+    
+                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8])) // leave sub-blocks that are not direct untouched
+                    continue;
+                h->sub_mb_type[i8] = sub_mb_type;
+    
+                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
+                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
+                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref[0], 1);
+                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, ref[1], 1);
+    
+                /* col_zero_flag */
+                if(!IS_INTRA(mb_type_col) && l1ref0[x8 + y8*h->b8_stride] == 0){
+                    for(i4=0; i4<4; i4++){ // checked per 4x4 block of the quadrant
+                        const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
+                        if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
+                            if(ref[0] == 0) // only vectors that point to reference index 0 are zeroed
+                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
+                            if(ref[1] == 0)
+                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }else{ /* direct temporal mv pred */
+        if(IS_16X16(*mb_type)){
+            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); // the list-1 reference index is always 0 in this mode
+            if(IS_INTRA(mb_type_col)){ // intra col block: reference index 0 and zero vectors for both lists
+                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+            }else{
+                const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]] // prefer the col block's list-0 index, else its list-1 index, translated through the map
+                                                : h->map_col_to_list0[1][l1ref1[0]];
+                const int dist_scale_factor = h->dist_scale_factor[ref0]; // NOTE(review): map entries can be PART_NOT_AVAILABLE (see direct_ref_list_init); confirm ref0 is always a valid index here
+                const int16_t *mv_col = l1mv0[0];
+                int mv_l0[2];
+                mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8; // scale the col vector; the factor is fixed point with 256 as unity
+                mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
+                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
+                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
+                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4); // list-1 vector = list-0 vector minus the col vector
+            }
+        }else{
+            for(i8=0; i8<4; i8++){ // one iteration per 8x8 quadrant
+                const int x8 = i8&1;
+                const int y8 = i8>>1;
+                int ref0, dist_scale_factor;
+    
+                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8])) // leave sub-blocks that are not direct untouched
+                    continue;
+                h->sub_mb_type[i8] = sub_mb_type;
+                if(IS_INTRA(mb_type_col)){ // intra col block: reference index 0 and zero vectors for both lists
+                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
+                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
+                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
+                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
+                    continue;
+                }
+    
+                ref0 = l1ref0[x8 + y8*h->b8_stride];
+                if(ref0 >= 0)
+                    ref0 = h->map_col_to_list0[0][ref0];
+                else
+                    ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
+                dist_scale_factor = h->dist_scale_factor[ref0]; // NOTE(review): same concern as the 16x16 case - ref0 may be PART_NOT_AVAILABLE; confirm
+    
+                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
+                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
+                for(i4=0; i4<4; i4++){ // scale each 4x4 block's col vector separately
+                    const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
+                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
+                    mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
+                    mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
+                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
+                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); // list-1 vector = list-0 vector minus the col vector
+                }
+            }
+        }
+    }
+}
+
 static inline void write_back_motion(H264Context *h, int mb_type){
     MpegEncContext * const s = &h->s;
     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
@@ -990,7 +1355,7 @@ static inline void write_back_motion(H264Context *h, int mb_type){
 
     for(list=0; list<2; list++){
         int y;
-        if((!IS_8X8(mb_type)) && !USES_LIST(mb_type, list)){
+        if(!USES_LIST(mb_type, list)){
             if(1){ //FIXME skip or never read if mb_type doesnt use it
                 for(y=0; y<4; y++){
                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
@@ -1004,10 +1369,10 @@ static inline void write_back_motion(H264Context *h, int mb_type){
                     }
                 }
                 for(y=0; y<2; y++){
-                    *(uint16_t*)s->current_picture.motion_val[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
+                    *(uint16_t*)&s->current_picture.ref_index[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
                 }
             }
-            continue; //FIXME direct mode ...
+            continue;
         }
         
         for(y=0; y<4; y++){
@@ -1025,6 +1390,14 @@ static inline void write_back_motion(H264Context *h, int mb_type){
             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
         }
     }
+    
+    if(h->slice_type == B_TYPE && h->pps.cabac){
+        if(IS_8X8(mb_type)){
+            h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
+            h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
+            h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
+        }
+    }
 }
 
 /**
@@ -1315,46 +1688,12 @@ static void chroma_dc_dct_c(DCTELEM *block){
 /**
  * gets the chroma qp.
  */
-static inline int get_chroma_qp(H264Context *h, int qscale){
+static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
     
-    return chroma_qp[clip(qscale + h->pps.chroma_qp_index_offset, 0, 51)];
+    return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
 }
 
 
-/**
- *
- */
-static void h264_add_idct_c(uint8_t *dst, DCTELEM *block, int stride){
-    int i;
-    uint8_t *cm = cropTbl + MAX_NEG_CROP;
-
-    block[0] += 32;
-
-    for(i=0; i<4; i++){
-        const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
-        const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
-        const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
-        const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
-
-        block[0 + 4*i]= z0 + z3;
-        block[1 + 4*i]= z1 + z2;
-        block[2 + 4*i]= z1 - z2;
-        block[3 + 4*i]= z0 - z3;
-    }
-
-    for(i=0; i<4; i++){
-        const int z0=  block[i + 4*0]     +  block[i + 4*2];
-        const int z1=  block[i + 4*0]     -  block[i + 4*2];
-        const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
-        const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
-
-        dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ];
-        dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ];
-        dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
-        dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
-    }
-}
-
 #if 0
 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
     int i;
@@ -1998,7 +2337,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
 }
 
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
+static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                            int x_offset, int y_offset,
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -2032,9 +2371,95 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
     }
 }
 
+static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
+                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                           int x_offset, int y_offset,
+                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
+                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
+                           int list0, int list1){
+    MpegEncContext * const s = &h->s;
+    /* Predicts partition n with mc_dir_part() and then applies the supplied weight functions; list0/list1 select the reference lists. */
+    dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
+    dest_cb +=   x_offset +   y_offset*s->uvlinesize;
+    dest_cr +=   x_offset +   y_offset*s->uvlinesize;
+    x_offset += 8*s->mb_x;
+    y_offset += 8*s->mb_y;
+    
+    if(list0 && list1){
+        /* don't optimize for luma-only case, since B-frames usually
+         * use implicit weights => chroma too. */
+        uint8_t *tmp_cb = s->obmc_scratchpad;       /* scratch layout: cb first, ... */
+        uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize; /* ... cr after 8 chroma lines, ... */
+        uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize; /* ... luma after another 8 chroma lines */
+        int refn0 = h->ref_cache[0][ scan8[n] ];
+        int refn1 = h->ref_cache[1][ scan8[n] ];
+        /* list 0 prediction is written straight to dest, list 1 prediction to the scratch buffers */
+        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+                    dest_y, dest_cb, dest_cr,
+                    x_offset, y_offset, qpix_put, chroma_put);
+        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+                    tmp_y, tmp_cb, tmp_cr,
+                    x_offset, y_offset, qpix_put, chroma_put);
+        /* use_weight==2: weight pair from implicit_weight[][] (summing to 64), zero offsets; otherwise per-reference weights/offsets */
+        if(h->use_weight == 2){
+            int weight0 = h->implicit_weight[refn0][refn1];
+            int weight1 = 64 - weight0;
+            luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0, 0);
+            chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0, 0);
+        }else{
+            luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
+                            h->luma_weight[0][refn0], h->luma_weight[1][refn1], 
+                            h->luma_offset[0][refn0], h->luma_offset[1][refn1]);
+            chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
+                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0], 
+                            h->chroma_offset[0][refn0][0], h->chroma_offset[1][refn1][0]);
+            chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
+                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1], 
+                            h->chroma_offset[0][refn0][1], h->chroma_offset[1][refn1][1]);
+        }
+    }else{
+        int list = list1 ? 1 : 0;
+        int refn = h->ref_cache[list][ scan8[n] ];
+        Picture *ref= &h->ref_list[list][refn];
+        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
+                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                    qpix_put, chroma_put);
+        /* single list: luma is always weighted in place, chroma only when use_weight_chroma is set */
+        luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
+                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
+        if(h->use_weight_chroma){
+            chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
+                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
+            chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
+                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
+        }
+    }
+}
+
+static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
+                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                           int x_offset, int y_offset,
+                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+                           h264_weight_func *weight_op, h264_biweight_func *weight_avg, 
+                           int list0, int list1){ /* dispatches one partition to mc_part_weighted() or mc_part_std() */
+    if((h->use_weight==2 && list0 && list1
+        && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
+       || h->use_weight==1) /* use_weight==2 with an implicit weight of 32 (equal halves of 64) falls through to the standard path */
+        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+                         x_offset, y_offset, qpix_put, chroma_put,
+                         weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1); /* [0] is passed as the luma op, [3] as the chroma op */
+    else
+        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
+}
+
 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
-                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg)){
+                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
     MpegEncContext * const s = &h->s;
     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
     const int mb_type= s->current_picture.mb_type[mb_xy];
@@ -2044,20 +2469,25 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
+                &weight_op[0], &weight_avg[0],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
     }else if(IS_16X8(mb_type)){
         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
     }else if(IS_8X16(mb_type)){
         mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+                &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
         mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+                &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
     }else{
         int i;
@@ -2073,20 +2503,25 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
             if(IS_SUB_8X8(sub_mb_type)){
                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+                    &weight_op[3], &weight_avg[3],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             }else if(IS_SUB_8X4(sub_mb_type)){
                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+                    &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+                    &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             }else if(IS_SUB_4X8(sub_mb_type)){
                 mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+                    &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                 mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+                    &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             }else{
                 int j;
@@ -2096,6 +2531,7 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
                     int sub_y_offset= y_offset +   (j&2);
                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+                        &weight_op[6], &weight_avg[6],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                 }
             }
@@ -2112,33 +2548,33 @@ static void decode_init_vlc(H264Context *h){
 
         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5, 
                  &chroma_dc_coeff_token_len [0], 1, 1,
-                 &chroma_dc_coeff_token_bits[0], 1, 1);
+                 &chroma_dc_coeff_token_bits[0], 1, 1, 1);
 
         for(i=0; i<4; i++){
             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17, 
                      &coeff_token_len [i][0], 1, 1,
-                     &coeff_token_bits[i][0], 1, 1);
+                     &coeff_token_bits[i][0], 1, 1, 1);
         }
 
         for(i=0; i<3; i++){
             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
                      &chroma_dc_total_zeros_len [i][0], 1, 1,
-                     &chroma_dc_total_zeros_bits[i][0], 1, 1);
+                     &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
         }
         for(i=0; i<15; i++){
             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16, 
                      &total_zeros_len [i][0], 1, 1,
-                     &total_zeros_bits[i][0], 1, 1);
+                     &total_zeros_bits[i][0], 1, 1, 1);
         }
 
         for(i=0; i<6; i++){
             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7, 
                      &run_len [i][0], 1, 1,
-                     &run_bits[i][0], 1, 1);
+                     &run_bits[i][0], 1, 1, 1);
         }
         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16, 
                  &run_len [6][0], 1, 1,
-                 &run_bits[6][0], 1, 1);
+                 &run_bits[6][0], 1, 1, 1);
     }
 }
 
@@ -2184,13 +2620,17 @@ static void free_tables(H264Context *h){
     av_freep(&h->cbp_table);
     av_freep(&h->mvd_table[0]);
     av_freep(&h->mvd_table[1]);
+    av_freep(&h->direct_table);
     av_freep(&h->non_zero_count);
     av_freep(&h->slice_table_base);
-    av_freep(&h->top_border);
+    av_freep(&h->top_borders[1]);
+    av_freep(&h->top_borders[0]);
     h->slice_table= NULL;
 
     av_freep(&h->mb2b_xy);
     av_freep(&h->mb2b8_xy);
+
+    av_freep(&h->s.obmc_scratchpad);
 }
 
 /**
@@ -2206,13 +2646,15 @@ static int alloc_tables(H264Context *h){
 
     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
-    CHECKED_ALLOCZ(h->top_border       , s->mb_width * (16+8+8) * sizeof(uint8_t))
+    CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
+    CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
+    CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
 
     if( h->pps.cabac ) {
         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
-        CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
+        CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
     }
 
     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
@@ -2230,7 +2672,9 @@ static int alloc_tables(H264Context *h){
             h->mb2b8_xy[mb_xy]= b8_xy;
         }
     }
-    
+
+    s->obmc_scratchpad = NULL;
+
     return 0;
 fail:
     free_tables(h);
@@ -2269,15 +2713,12 @@ static int decode_init(AVCodecContext *avctx){
 
     decode_init_vlc(h);
     
-    if(avctx->codec_tag != 0x31637661) // avc1
-        h->is_avc = 0;
-    else {
-        if((avctx->extradata_size == 0) || (avctx->extradata == NULL)) {
-            av_log(avctx, AV_LOG_ERROR, "AVC codec requires avcC data\n");
-            return -1;
-        }
+    if(avctx->extradata_size > 0 && avctx->extradata &&
+       *(char *)avctx->extradata == 1){
         h->is_avc = 1;
         h->got_avcC = 0;
+    } else {
+        h->is_avc = 0;
     }
 
     return 0;
@@ -2289,19 +2730,25 @@ static void frame_start(H264Context *h){
 
     MPV_frame_start(s, s->avctx);
     ff_er_frame_start(s);
-    h->mmco_index=0;
 
     assert(s->linesize && s->uvlinesize);
 
     for(i=0; i<16; i++){
         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
-        h->chroma_subblock_offset[i]= 2*((scan8[i] - scan8[0])&7) + 2*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
     }
     for(i=0; i<4; i++){
         h->block_offset[16+i]=
         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
+        h->block_offset[24+16+i]=
+        h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
     }
 
+    /* can't be in alloc_tables because linesize isn't known there.
+     * FIXME: redo bipred weight to not require extra buffer? */
+    if(!s->obmc_scratchpad)
+        s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
+
 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
 }
 
@@ -2313,23 +2760,25 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
     src_cb -= uvlinesize;
     src_cr -= uvlinesize;
 
-    h->left_border[0]= h->top_border[s->mb_x][15];
+    // Two lines are saved: the line above the top macroblock of a pair,
+    // and the line above the bottom macroblock
+    h->left_border[0]= h->top_borders[0][s->mb_x][15];
     for(i=1; i<17; i++){
         h->left_border[i]= src_y[15+i*  linesize];
     }
     
-    *(uint64_t*)(h->top_border[s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
-    *(uint64_t*)(h->top_border[s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
+    *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
+    *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
 
     if(!(s->flags&CODEC_FLAG_GRAY)){
-        h->left_border[17  ]= h->top_border[s->mb_x][16+7];
-        h->left_border[17+9]= h->top_border[s->mb_x][24+7];
+        h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
+        h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
         for(i=1; i<9; i++){
             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
         }
-        *(uint64_t*)(h->top_border[s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
-        *(uint64_t*)(h->top_border[s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
+        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
+        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
     }
 }
 
@@ -2357,8 +2806,8 @@ b= t;
     }
 
     if(deblock_top){
-        XCHG(*(uint64_t*)(h->top_border[s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
-        XCHG(*(uint64_t*)(h->top_border[s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
+        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
+        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
     }
 
     if(!(s->flags&CODEC_FLAG_GRAY)){
@@ -2369,8 +2818,93 @@ b= t;
             }
         }
         if(deblock_top){
-            XCHG(*(uint64_t*)(h->top_border[s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
-            XCHG(*(uint64_t*)(h->top_border[s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
+            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
+            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
+        }
+    }
+}
+
+static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
+    MpegEncContext * const s = &h->s;
+    int i;
+    /* Saves the rightmost column and the last two rows of a 32-row luma / 16-row chroma area into h->left_border and h->top_borders[0..1][mb_x]. */
+    src_y  -= 2 *   linesize;
+    src_cb -= 2 * uvlinesize;
+    src_cr -= 2 * uvlinesize;
+    /* After stepping back two rows, index i below addresses row i-2 of the area passed in. */
+    // Two lines are saved: the line above the top macroblock of a pair,
+    // and the line above the bottom macroblock
+    h->left_border[0]= h->top_borders[0][s->mb_x][15]; // corner samples come from the previously saved top rows, read before they are overwritten
+    h->left_border[1]= h->top_borders[1][s->mb_x][15];
+    for(i=2; i<34; i++){
+        h->left_border[i]= src_y[15+i*  linesize];
+    }
+    
+    *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize); // 16 luma bytes per row, copied as two 64-bit words
+    *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
+    *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
+    *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
+    /* chroma: cb at left_border[34..] / top bytes 16..23, cr at left_border[34+18..] / top bytes 24..31; skipped when CODEC_FLAG_GRAY is set */
+    if(!(s->flags&CODEC_FLAG_GRAY)){
+        h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
+        h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
+        h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
+        h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
+        for(i=2; i<18; i++){
+            h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
+            h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
+        }
+        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
+        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
+        *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
+        *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
+    }
+}
+
+static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
+    MpegEncContext * const s = &h->s;
+    int temp8, i;
+    uint64_t temp64;
+    int deblock_left = (s->mb_x > 0);
+    int deblock_top  = (s->mb_y > 0);
+    /* Exchanges the saved left/top border samples (h->left_border, h->top_borders[0..1]) with the picture samples around the area at src_*. */
+    tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
+
+    src_y  -= 2 *   linesize + 1; // step two rows up and one column left, onto the border samples
+    src_cb -= 2 * uvlinesize + 1;
+    src_cr -= 2 * uvlinesize + 1;
+    /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives b only when xchg is non-zero. */
+#define XCHG(a,b,t,xchg)\
+t= a;\
+if(xchg)\
+    a= b;\
+b= t;
+    /* left column: the first two entries are skipped when there is no row above (mb_y == 0) */
+    if(deblock_left){
+        for(i = (!deblock_top)<<1; i<34; i++){
+            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
+        }
+    }
+    /* top rows: the second 8 bytes of each luma row are exchanged unconditionally (xchg argument fixed to 1) */
+    if(deblock_top){
+        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
+        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
+        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
+        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
+    }
+
+    if(!(s->flags&CODEC_FLAG_GRAY)){ // chroma borders are left untouched when CODEC_FLAG_GRAY is set
+        if(deblock_left){
+            for(i = (!deblock_top) << 1; i<18; i++){
+                XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
+                XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
+            }
+        }
+        if(deblock_top){
+            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
+            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
+            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
+            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
         }
     }
 }
@@ -2384,13 +2918,12 @@ static void hl_decode_mb(H264Context *h){
     uint8_t  *dest_y, *dest_cb, *dest_cr;
     int linesize, uvlinesize /*dct_offset*/;
     int i;
+    int *block_offset = &h->block_offset[0];
+    const unsigned int bottom = mb_y & 1;
 
     if(!s->decode)
         return;
 
-    if(s->mb_skiped){
-    }
-
     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
@@ -2398,10 +2931,11 @@ static void hl_decode_mb(H264Context *h){
     if (h->mb_field_decoding_flag) {
         linesize = s->linesize * 2;
         uvlinesize = s->uvlinesize * 2;
+        block_offset = &h->block_offset[24];
         if(mb_y&1){ //FIXME move out of this func?
             dest_y -= s->linesize*15;
-            dest_cb-= s->linesize*7;
-            dest_cr-= s->linesize*7;
+            dest_cb-= s->uvlinesize*7;
+            dest_cr-= s->uvlinesize*7;
         }
     } else {
         linesize = s->linesize;
@@ -2409,112 +2943,195 @@ static void hl_decode_mb(H264Context *h){
 //        dct_offset = s->linesize * 16;
     }
 
-    if(IS_INTRA(mb_type)){
-        if(h->deblocking_filter)
-            xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+    if (IS_INTRA_PCM(mb_type)) {
+        unsigned int x, y;
 
-        if(!(s->flags&CODEC_FLAG_GRAY)){
-            h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
-            h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
+        // The pixels are stored in h->mb array in the same order as levels,
+        // copy them in output in the correct order.
+        for(i=0; i<16; i++) {
+            for (y=0; y<4; y++) {
+                for (x=0; x<4; x++) {
+                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
+                }
+            }
+        }
+        for(i=16; i<16+4; i++) {
+            for (y=0; y<4; y++) {
+                for (x=0; x<4; x++) {
+                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
+                }
+            }
+        }
+        for(i=20; i<20+4; i++) {
+            for (y=0; y<4; y++) {
+                for (x=0; x<4; x++) {
+                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
+                }
+            }
         }
+    } else {
+        if(IS_INTRA(mb_type)){
+            if(h->deblocking_filter) {
+                if (h->mb_aff_frame) {
+                    if (!bottom)
+                        xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
+                } else {
+                    xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
+                }
+            }
 
-        if(IS_INTRA4x4(mb_type)){
-            if(!s->encoding){
-                for(i=0; i<16; i++){
-                    uint8_t * const ptr= dest_y + h->block_offset[i];
-                    uint8_t *topright= ptr + 4 - linesize;
-                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
-                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-                    int tr;
-
-                    if(!topright_avail){
-                        /* xine: avoid (negative) buffer overflow */
-                        tr= (!mb_y && linesize > h->block_offset[i]) ?
-                             ptr[3]*0x01010101 :
-                             ptr[3 - linesize]*0x01010101;
-                        topright= (uint8_t*) &tr;
-                    }else if(i==5 && h->deblocking_filter){
-                        tr= *(uint32_t*)h->top_border[mb_x+1];
-                        topright= (uint8_t*) &tr;
-                    }
+            if(!(s->flags&CODEC_FLAG_GRAY)){
+                h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
+                h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
+            }
 
-                    h->pred4x4[ dir ](ptr, topright, linesize);
-                    if(h->non_zero_count_cache[ scan8[i] ]){
-                        if(s->codec_id == CODEC_ID_H264)
-                            h264_add_idct_c(ptr, h->mb + i*16, linesize);
-                        else
-                            svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
+            if(IS_INTRA4x4(mb_type)){
+                if(!s->encoding){
+                    for(i=0; i<16; i++){
+                        uint8_t * const ptr= dest_y + block_offset[i];
+                        uint8_t *topright;
+                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                        int tr;
+
+                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
+                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
+                            assert(mb_y || linesize <= block_offset[i]);
+                            if(!topright_avail){
+                                tr= ptr[3 - linesize]*0x01010101;
+                                topright= (uint8_t*) &tr;
+                            }else if(i==5 && h->deblocking_filter){
+                                tr= *(uint32_t*)h->top_borders[h->mb_aff_frame ? IS_INTERLACED(mb_type) ? bottom : 1 : 0][mb_x+1];
+                                topright= (uint8_t*) &tr;
+                            }else
+                                topright= ptr + 4 - linesize;
+                        }else
+                            topright= NULL;
+
+                        h->pred4x4[ dir ](ptr, topright, linesize);
+                        if(h->non_zero_count_cache[ scan8[i] ]){
+                            if(s->codec_id == CODEC_ID_H264)
+                                s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
+                            else
+                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
+                        }
                     }
                 }
+            }else{
+                h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
+                if(s->codec_id == CODEC_ID_H264)
+                    h264_luma_dc_dequant_idct_c(h->mb, s->qscale);
+                else
+                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
             }
-        }else{
-            h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
-            if(s->codec_id == CODEC_ID_H264)
-                h264_luma_dc_dequant_idct_c(h->mb, s->qscale);
-            else
-                svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
+            if(h->deblocking_filter) {
+                if (h->mb_aff_frame) {
+                    if (bottom) {
+                        uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
+                        uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
+                        uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
+                        s->mb_y--;
+                        xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
+                        s->mb_y++;
+                    }
+                } else {
+                    xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
+                }
+            }
+        }else if(s->codec_id == CODEC_ID_H264){
+            hl_motion(h, dest_y, dest_cb, dest_cr,
+                      s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab, 
+                      s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
+                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
         }
-        if(h->deblocking_filter)
-            xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
-    }else if(s->codec_id == CODEC_ID_H264){
-        hl_motion(h, dest_y, dest_cb, dest_cr,
-                  s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab, 
-                  s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab);
-    }
 
 
-    if(!IS_INTRA4x4(mb_type)){
-        if(s->codec_id == CODEC_ID_H264){
-            for(i=0; i<16; i++){
-                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
-                    uint8_t * const ptr= dest_y + h->block_offset[i];
-                    h264_add_idct_c(ptr, h->mb + i*16, linesize);
+        if(!IS_INTRA4x4(mb_type)){
+            if(s->codec_id == CODEC_ID_H264){
+                for(i=0; i<16; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
+                        uint8_t * const ptr= dest_y + block_offset[i];
+                        s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
+                    }
                 }
-            }
-        }else{
-            for(i=0; i<16; i++){
-                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
-                    uint8_t * const ptr= dest_y + h->block_offset[i];
-                    svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
+            }else{
+                for(i=0; i<16; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
+                        uint8_t * const ptr= dest_y + block_offset[i];
+                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
+                    }
                 }
             }
         }
-    }
 
-    if(!(s->flags&CODEC_FLAG_GRAY)){
-        chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp);
-        chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp);
-        if(s->codec_id == CODEC_ID_H264){
-            for(i=16; i<16+4; i++){
-                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                    uint8_t * const ptr= dest_cb + h->block_offset[i];
-                    h264_add_idct_c(ptr, h->mb + i*16, uvlinesize);
+        if(!(s->flags&CODEC_FLAG_GRAY)){
+            chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp);
+            chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp);
+            if(s->codec_id == CODEC_ID_H264){
+                for(i=16; i<16+4; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+                        uint8_t * const ptr= dest_cb + block_offset[i];
+                        s->dsp.h264_idct_add(ptr, h->mb + i*16, uvlinesize);
+                    }
                 }
-            }
-            for(i=20; i<20+4; i++){
-                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                    uint8_t * const ptr= dest_cr + h->block_offset[i];
-                    h264_add_idct_c(ptr, h->mb + i*16, uvlinesize);
+                for(i=20; i<20+4; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+                        uint8_t * const ptr= dest_cr + block_offset[i];
+                        s->dsp.h264_idct_add(ptr, h->mb + i*16, uvlinesize);
+                    }
                 }
-            }
-        }else{
-            for(i=16; i<16+4; i++){
-                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                    uint8_t * const ptr= dest_cb + h->block_offset[i];
-                    svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
+            }else{
+                for(i=16; i<16+4; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+                        uint8_t * const ptr= dest_cb + block_offset[i];
+                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
+                    }
                 }
-            }
-            for(i=20; i<20+4; i++){
-                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                    uint8_t * const ptr= dest_cr + h->block_offset[i];
-                    svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
+                for(i=20; i<20+4; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+                        uint8_t * const ptr= dest_cr + block_offset[i];
+                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
+                    }
                 }
             }
         }
     }
     if(h->deblocking_filter) {
-        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
-        filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr);
+        if (h->mb_aff_frame) {
+            const int mb_y = s->mb_y - 1;
+            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
+            const int mb_xy= mb_x + mb_y*s->mb_stride;
+            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
+            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
+            uint8_t tmp = s->current_picture.data[1][384];
+            if (!bottom) return;
+            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
+            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
+            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
+
+            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
+            // TODO deblock a pair
+            // top 
+            s->mb_y--;
+            tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
+            fill_caches(h, mb_type_top, 1); //FIXME dont fill stuff which isnt used by filter_mb
+            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
+            if (tmp != s->current_picture.data[1][384]) {
+                tprintf("modified pixel 8,1 (1)\n");
+            }
+            // bottom
+            s->mb_y++;
+            tprintf("call mbaff filter_mb\n");
+            fill_caches(h, mb_type_bottom, 1); //FIXME dont fill stuff which isnt used by filter_mb
+            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+            if (tmp != s->current_picture.data[1][384]) {
+                tprintf("modified pixel 8,1 (2)\n");
+            }
+        } else {
+            tprintf("call filter_mb\n");
+            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+            fill_caches(h, mb_type, 1); //FIXME dont fill stuff which isnt used by filter_mb
+            filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
+        }
     }
 }
 
@@ -2524,15 +3141,17 @@ static void hl_decode_mb(H264Context *h){
 static int fill_default_ref_list(H264Context *h){
     MpegEncContext * const s = &h->s;
     int i;
-    Picture sorted_short_ref[16];
+    int smallest_poc_greater_than_current = -1;
+    Picture sorted_short_ref[32];
     
     if(h->slice_type==B_TYPE){
         int out_i;
         int limit= -1;
 
+        /* sort frame according to poc in B slice */
         for(out_i=0; out_i<h->short_ref_count; out_i++){
             int best_i=-1;
-            int best_poc=-1;
+            int best_poc=INT_MAX;
 
             for(i=0; i<h->short_ref_count; i++){
                 const int poc= h->short_ref[i]->poc;
@@ -2546,37 +3165,47 @@ static int fill_default_ref_list(H264Context *h){
             
             limit= best_poc;
             sorted_short_ref[out_i]= *h->short_ref[best_i];
+            tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
+            if (-1 == smallest_poc_greater_than_current) {
+                if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
+                    smallest_poc_greater_than_current = out_i;
+                }
+            }
         }
     }
 
     if(s->picture_structure == PICT_FRAME){
         if(h->slice_type==B_TYPE){
-            const int current_poc= s->current_picture_ptr->poc;
             int list;
+            tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
 
+            // find the largest poc
             for(list=0; list<2; list++){
-                int index=0;
-
-                for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++){
-                    const int i2= list ? h->short_ref_count - i - 1 : i;
-                    const int poc= sorted_short_ref[i2].poc;
-                    
-                    if(sorted_short_ref[i2].reference != 3) continue; //FIXME refernce field shit
-
-                    if((list==1 && poc > current_poc) || (list==0 && poc < current_poc)){
-                        h->default_ref_list[list][index  ]= sorted_short_ref[i2];
-                        h->default_ref_list[list][index++].pic_id= sorted_short_ref[i2].frame_num;
+                int index = 0;
+                int j= -99;
+                int step= list ? -1 : 1;
+
+                for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
+                    while(j<0 || j>= h->short_ref_count){
+                        step = -step;
+                        j= smallest_poc_greater_than_current + (step>>1);
                     }
+                    if(sorted_short_ref[j].reference != 3) continue;
+                    h->default_ref_list[list][index  ]= sorted_short_ref[j];
+                    h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
                 }
 
-                for(i=0; i<h->long_ref_count && index < h->ref_count[ list ]; i++){
+                for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
+                    if(h->long_ref[i] == NULL) continue;
                     if(h->long_ref[i]->reference != 3) continue;
 
                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
                     h->default_ref_list[ list ][index++].pic_id= i;;
                 }
                 
-                if(h->long_ref_count > 1 && h->short_ref_count==0){
+                if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
+                    // swap the two first elements of L1 when
+                    // L0 and L1 are identical
                     Picture temp= h->default_ref_list[1][0];
                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
                     h->default_ref_list[1][0] = temp;
@@ -2587,12 +3216,13 @@ static int fill_default_ref_list(H264Context *h){
             }
         }else{
             int index=0;
-            for(i=0; i<h->short_ref_count && index < h->ref_count[0]; i++){
+            for(i=0; i<h->short_ref_count; i++){
                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
                 h->default_ref_list[0][index  ]= *h->short_ref[i];
                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
             }
-            for(i=0; i<h->long_ref_count && index < h->ref_count[0]; i++){
+            for(i = 0; i < 16; i++){
+                if(h->long_ref[i] == NULL) continue;
                 if(h->long_ref[i]->reference != 3) continue;
                 h->default_ref_list[0][index  ]= *h->long_ref[i];
                 h->default_ref_list[0][index++].pic_id= i;;
@@ -2606,13 +3236,28 @@ static int fill_default_ref_list(H264Context *h){
             //FIXME second field balh
         }
     }
+#ifdef TRACE
+    for (i=0; i<h->ref_count[0]; i++) { // trace-only dump of default list 0
+        tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
+    }
+    if(h->slice_type==B_TYPE){ // list 1 is only dumped for B slices
+        for (i=0; i<h->ref_count[1]; i++) {
+            tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
+        }
+    }
+#endif
     return 0;
 }
 
+static void print_short_term(H264Context *h);
+static void print_long_term(H264Context *h);
+
 static int decode_ref_pic_list_reordering(H264Context *h){
     MpegEncContext * const s = &h->s;
     int list;
     
+    print_short_term(h);
+    print_long_term(h);
     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move beofre func
     
     for(list=0; list<2; list++){
@@ -2626,7 +3271,10 @@ static int decode_ref_pic_list_reordering(H264Context *h){
                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
                 int pic_id;
                 int i;
+                Picture *ref = NULL;
                 
+                if(reordering_of_pic_nums_idc==3) 
+                    break;
                 
                 if(index >= h->ref_count[list]){
                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
@@ -2646,32 +3294,23 @@ static int decode_ref_pic_list_reordering(H264Context *h){
                         else                                pred+= abs_diff_pic_num;
                         pred &= h->max_pic_num - 1;
                     
-                        for(i= h->ref_count[list]-1; i>=index; i--){
-                            if(h->ref_list[list][i].pic_id == pred && h->ref_list[list][i].long_ref==0)
+                        for(i= h->short_ref_count-1; i>=0; i--){
+                            ref = h->short_ref[i];
+                            if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
                                 break;
                         }
                     }else{
                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
-
-                        for(i= h->ref_count[list]-1; i>=index; i--){
-                            if(h->ref_list[list][i].pic_id == pic_id && h->ref_list[list][i].long_ref==1)
-                                break;
-                        }
+                        ref = h->long_ref[pic_id];
                     }
 
-                    if(i < index){
+                    if (i < 0) {
                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
-                    }else if(i > index){
-                        Picture tmp= h->ref_list[list][i];
-                        for(; i>index; i--){
-                            h->ref_list[list][i]= h->ref_list[list][i-1];
-                        }
-                        h->ref_list[list][index]= tmp;
+                    } else {
+                        h->ref_list[list][index]= *ref;
                     }
-                }else if(reordering_of_pic_nums_idc==3) 
-                    break;
-                else{
+                }else{
                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
                     return -1;
                 }
@@ -2680,15 +3319,24 @@ static int decode_ref_pic_list_reordering(H264Context *h){
 
         if(h->slice_type!=B_TYPE) break;
     }
+    
+    if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
+        direct_dist_scale_factor(h);
+    direct_ref_list_init(h);
     return 0;    
 }
 
 static int pred_weight_table(H264Context *h){
     MpegEncContext * const s = &h->s;
     int list, i;
+    int luma_def, chroma_def;
     
+    h->use_weight= 0;
+    h->use_weight_chroma= 0;
     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
+    luma_def = 1<<h->luma_log2_weight_denom;
+    chroma_def = 1<<h->chroma_log2_weight_denom;
 
     for(list=0; list<2; list++){
         for(i=0; i<h->ref_count[list]; i++){
@@ -2698,6 +3346,12 @@ static int pred_weight_table(H264Context *h){
             if(luma_weight_flag){
                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
+                if(   h->luma_weight[list][i] != luma_def
+                   || h->luma_offset[list][i] != 0)
+                    h->use_weight= 1;
+            }else{
+                h->luma_weight[list][i]= luma_def;
+                h->luma_offset[list][i]= 0;
             }
 
             chroma_weight_flag= get_bits1(&s->gb);
@@ -2706,28 +3360,91 @@ static int pred_weight_table(H264Context *h){
                 for(j=0; j<2; j++){
                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
+                    if(   h->chroma_weight[list][i][j] != chroma_def
+                       || h->chroma_offset[list][i][j] != 0)
+                        h->use_weight_chroma= 1;
+                }
+            }else{
+                int j;
+                for(j=0; j<2; j++){
+                    h->chroma_weight[list][i][j]= chroma_def;
+                    h->chroma_offset[list][i][j]= 0;
                 }
             }
         }
         if(h->slice_type != B_TYPE) break;
     }
+    h->use_weight= h->use_weight || h->use_weight_chroma;
     return 0;
 }
 
+static void implicit_weight_table(H264Context *h){ // fills h->implicit_weight[ref0][ref1] from POC distances and sets the use_weight flags
+    MpegEncContext * const s = &h->s;
+    int ref0, ref1;
+    int cur_poc = s->current_picture_ptr->poc;
+
+    if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
+       && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){ // one ref per list and current POC exactly midway: weighting is switched off
+        h->use_weight= 0;
+        h->use_weight_chroma= 0;
+        return;
+    }
+
+    h->use_weight= 2; // NOTE(review): 2 presumably distinguishes implicit from explicit (1) weighting -- confirm against the readers of use_weight
+    h->use_weight_chroma= 2;
+    h->luma_log2_weight_denom= 5;
+    h->chroma_log2_weight_denom= 5;
+
+    /* FIXME: MBAFF */
+    for(ref0=0; ref0 < h->ref_count[0]; ref0++){ // one table entry per (list0 ref, list1 ref) pair
+        int poc0 = h->ref_list[0][ref0].poc;
+        for(ref1=0; ref1 < h->ref_count[1]; ref1++){
+            int poc1 = h->ref_list[1][ref1].poc;
+            int td = clip(poc1 - poc0, -128, 127); // clipped POC distance between the two references
+            if(td){
+                int tb = clip(cur_poc - poc0, -128, 127); // clipped POC distance from the list0 reference to the current picture
+                int tx = (16384 + (ABS(td) >> 1)) / td;
+                int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
+                if(dist_scale_factor < -64 || dist_scale_factor > 128)
+                    h->implicit_weight[ref0][ref1] = 32; // scale factor out of range: fall back to 32 (= 1<<5, the denominator set above)
+                else
+                    h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
+            }else
+                h->implicit_weight[ref0][ref1] = 32; // td == 0 would divide by zero above; use the same fallback
+        }
+    }
+}
+
+static inline void unreference_pic(H264Context *h, Picture *pic){ // sets pic->reference to 0, or to 1 if pic is h->delayed_output_pic or listed in h->delayed_pic
+    int i;
+    pic->reference=0;
+    if(pic == h->delayed_output_pic)
+        pic->reference=1; // still held as the delayed output picture
+    else{
+        for(i = 0; h->delayed_pic[i]; i++) // scan stops at the first NULL entry of delayed_pic
+            if(pic == h->delayed_pic[i]){
+                pic->reference=1; // still present in the delayed picture list
+                break;
+            }
+    }
+}
+
 /**
- * instantaneos decoder refresh.
+ * instantaneous decoder refresh.
  */
 static void idr(H264Context *h){
     int i;
 
-    for(i=0; i<h->long_ref_count; i++){
-        h->long_ref[i]->reference=0;
-        h->long_ref[i]= NULL;
+    for(i=0; i<16; i++){
+        if (h->long_ref[i] != NULL) {
+            unreference_pic(h, h->long_ref[i]);
+            h->long_ref[i]= NULL;
+        }
     }
     h->long_ref_count=0;
 
     for(i=0; i<h->short_ref_count; i++){
-        h->short_ref[i]->reference=0;
+        unreference_pic(h, h->short_ref[i]);
         h->short_ref[i]= NULL;
     }
     h->short_ref_count=0;
@@ -2765,23 +3482,49 @@ static Picture * remove_short(H264Context *h, int frame_num){
 static Picture * remove_long(H264Context *h, int i){
     Picture *pic;
 
-    if(i >= h->long_ref_count) return NULL;
     pic= h->long_ref[i];
-    if(pic==NULL) return NULL;
-    
     h->long_ref[i]= NULL;
-    memmove(&h->long_ref[i], &h->long_ref[i+1], (h->long_ref_count - i - 1)*sizeof(Picture*));
-    h->long_ref_count--;
+    if(pic) h->long_ref_count--; // slots are no longer compacted: the count drops only when slot i was occupied; i itself is not range-checked here
 
     return pic;
 }
 
 /**
+ * print short term list
+ */
+static void print_short_term(H264Context *h) {
+    int i; // int so it matches the %d conversion used for the index below
+    if(h->s.avctx->debug&FF_DEBUG_MMCO) { // output only when MMCO debugging is enabled
+        av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
+        for(i=0; i<h->short_ref_count; i++){
+            Picture *pic= h->short_ref[i];
+            av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
+        }
+    }
+}
+
+/**
+ * print long term list
+ */
+static void print_long_term(H264Context *h) {
+    int i; // int so it matches the %d conversion used for the index below
+    if(h->s.avctx->debug&FF_DEBUG_MMCO) { // output only when MMCO debugging is enabled
+        av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
+        for(i = 0; i < 16; i++){ // walks all 16 long-term slots
+            Picture *pic= h->long_ref[i];
+            if (pic) { // empty slots are skipped
+                av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
+            }
+        }
+    }
+}
+
+/**
  * Executes the reference picture marking (memory management control operations).
  */
 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
     MpegEncContext * const s = &h->s;
-    int i;
+    int i, j;
     int current_is_long=0;
     Picture *pic;
     
@@ -2796,23 +3539,24 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
         case MMCO_SHORT2UNUSED:
             pic= remove_short(h, mmco[i].short_frame_num);
             if(pic==NULL) return -1;
-            pic->reference= 0;
+            unreference_pic(h, pic);
             break;
         case MMCO_SHORT2LONG:
             pic= remove_long(h, mmco[i].long_index);
-            if(pic) pic->reference=0;
+            if(pic) unreference_pic(h, pic);
             
             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
             h->long_ref[ mmco[i].long_index ]->long_ref=1;
+            h->long_ref_count++;
             break;
         case MMCO_LONG2UNUSED:
             pic= remove_long(h, mmco[i].long_index);
             if(pic==NULL) return -1;
-            pic->reference= 0;
+            unreference_pic(h, pic);
             break;
         case MMCO_LONG:
             pic= remove_long(h, mmco[i].long_index);
-            if(pic) pic->reference=0;
+            if(pic) unreference_pic(h, pic);
             
             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
             h->long_ref[ mmco[i].long_index ]->long_ref=1;
@@ -2822,22 +3566,20 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
             break;
         case MMCO_SET_MAX_LONG:
             assert(mmco[i].long_index <= 16);
-            while(mmco[i].long_index < h->long_ref_count){
-                pic= remove_long(h, mmco[i].long_index);
-                pic->reference=0;
-            }
-            while(mmco[i].long_index > h->long_ref_count){
-                h->long_ref[ h->long_ref_count++ ]= NULL;
+            // just remove the long term which index is greater than new max
+            for(j = mmco[i].long_index; j<16; j++){
+                pic = remove_long(h, j);
+                if (pic) unreference_pic(h, pic);
             }
             break;
         case MMCO_RESET:
             while(h->short_ref_count){
                 pic= remove_short(h, h->short_ref[0]->frame_num);
-                pic->reference=0;
+                unreference_pic(h, pic);
             }
-            while(h->long_ref_count){
-                pic= remove_long(h, h->long_ref_count-1);
-                pic->reference=0;
+            for(j = 0; j < 16; j++) {
+                pic= remove_long(h, j);
+                if(pic) unreference_pic(h, pic);
             }
             break;
         default: assert(0);
@@ -2847,7 +3589,7 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
     if(!current_is_long){
         pic= remove_short(h, s->current_picture_ptr->frame_num);
         if(pic){
-            pic->reference=0;
+            unreference_pic(h, pic);
             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
         }
         
@@ -2859,6 +3601,8 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
         h->short_ref_count++;
     }
     
+    print_short_term(h);
+    print_long_term(h);
     return 0; 
 }
 
@@ -2877,7 +3621,7 @@ static int decode_ref_pic_marking(H264Context *h){
         } 
     }else{
         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
-            for(i= h->mmco_index; i<MAX_MMCO_COUNT; i++) { 
+            for(i= 0; i<MAX_MMCO_COUNT; i++) { 
                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
 
                 h->mmco[i].opcode= opcode;
@@ -2900,6 +3644,8 @@ static int decode_ref_pic_marking(H264Context *h){
                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
                     return -1;
                 }
+                if(opcode == MMCO_END)
+                    break;
             }
             h->mmco_index= i;
         }else{
@@ -3010,23 +3756,32 @@ static int decode_slice_header(H264Context *h){
     int first_mb_in_slice, pps_id;
     int num_ref_idx_active_override_flag;
     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
+    int slice_type;
+    int default_ref_list_done = 0;
 
     s->current_picture.reference= h->nal_ref_idc != 0;
+    s->dropable= h->nal_ref_idc == 0;
 
     first_mb_in_slice= get_ue_golomb(&s->gb);
 
-    h->slice_type= get_ue_golomb(&s->gb);
-    if(h->slice_type > 9){
+    slice_type= get_ue_golomb(&s->gb);
+    if(slice_type > 9){
         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
+        return -1;
     }
-    if(h->slice_type > 4){
-        h->slice_type -= 5;
+    if(slice_type > 4){
+        slice_type -= 5;
         h->slice_type_fixed=1;
     }else
         h->slice_type_fixed=0;
     
-    h->slice_type= slice_type_map[ h->slice_type ];
-    
+    slice_type= slice_type_map[ slice_type ];
+    if (slice_type == I_TYPE
+        || (h->slice_num != 0 && slice_type == h->slice_type) ) {
+        default_ref_list_done = 1;
+    }
+    h->slice_type= slice_type;
+
     s->pict_type= h->slice_type; // to make a few old func happy, its wrong though
         
     pps_id= get_ue_golomb(&s->gb);
@@ -3047,14 +3802,11 @@ static int decode_slice_header(H264Context *h){
     }
     
     s->mb_width= h->sps.mb_width;
-    s->mb_height= h->sps.mb_height;
+    s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
     
-    h->b_stride=  s->mb_width*4;
-    h->b8_stride= s->mb_width*2;
+    h->b_stride=  s->mb_width*4 + 1;
+    h->b8_stride= s->mb_width*2 + 1;
 
-    s->mb_x = first_mb_in_slice % s->mb_width;
-    s->mb_y = first_mb_in_slice / s->mb_width; //FIXME AFFW
-    
     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
     if(h->sps.frame_mbs_only_flag)
         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
@@ -3075,6 +3827,8 @@ static int decode_slice_header(H264Context *h){
         s->avctx->width = s->width;
         s->avctx->height = s->height;
         s->avctx->sample_aspect_ratio= h->sps.sar;
+        if(!s->avctx->sample_aspect_ratio.den)
+            s->avctx->sample_aspect_ratio.den = 1;
 
         if(h->sps.timing_info_present_flag && h->sps.fixed_frame_rate_flag){
             s->avctx->frame_rate = h->sps.time_scale;
@@ -3082,22 +3836,29 @@ static int decode_slice_header(H264Context *h){
         }
     }
 
-    if(first_mb_in_slice == 0){
+    if(h->slice_num == 0){
         frame_start(h);
     }
 
     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
 
+    h->mb_aff_frame = 0;
     if(h->sps.frame_mbs_only_flag){
         s->picture_structure= PICT_FRAME;
     }else{
-        if(get_bits1(&s->gb)) //field_pic_flag
+        if(get_bits1(&s->gb)) { //field_pic_flag
             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
-        else
+        } else {
             s->picture_structure= PICT_FRAME;
+            first_mb_in_slice <<= 1;
+            h->mb_aff_frame = h->sps.mb_aff;
+        }
     }
 
+    s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
+    s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
+    
     if(s->picture_structure==PICT_FRAME){
         h->curr_pic_num=   h->frame_num;
         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
@@ -3153,7 +3914,7 @@ static int decode_slice_header(H264Context *h){
         }
     }
 
-    if(first_mb_in_slice == 0){
+    if(!default_ref_list_done){
         fill_default_ref_list(h);
     }
 
@@ -3162,6 +3923,10 @@ static int decode_slice_header(H264Context *h){
     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE )) 
        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
         pred_weight_table(h);
+    else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
+        implicit_weight_table(h);
+    else
+        h->use_weight = 0;
     
     if(s->current_picture.reference)
         decode_ref_pic_marking(h);
@@ -3175,6 +3940,7 @@ static int decode_slice_header(H264Context *h){
         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
         return -1;
     }
+    h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
     //FIXME qscale / qp ... stuff
     if(h->slice_type == SP_TYPE){
         get_bits1(&s->gb); /* sp_for_switch_flag */
@@ -3202,15 +3968,21 @@ static int decode_slice_header(H264Context *h){
         slice_group_change_cycle= get_bits(&s->gb, ?);
 #endif
 
+    h->slice_num++;
+
     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
-        av_log(h->s.avctx, AV_LOG_DEBUG, "mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d\n", 
+        av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n", 
+               h->slice_num,
+               (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
                first_mb_in_slice, 
                av_get_pict_type_char(h->slice_type),
                pps_id, h->frame_num,
                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
                h->ref_count[0], h->ref_count[1],
                s->qscale,
-               h->deblocking_filter
+               h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
+               h->use_weight,
+               h->use_weight==1 && h->use_weight_chroma ? "c" : ""
                );
     }
 
@@ -3231,7 +4003,7 @@ static inline int get_level_prefix(GetBitContext *gb){
     log= 32 - av_log2(buf);
 #ifdef TRACE
     print_bin(buf>>(32-log), log);
-    printf("%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
+    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
 #endif
 
     LAST_SKIP_BITS(re, gb, log);
@@ -3381,6 +4153,55 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
 }
 
 /**
+ * decodes a P_SKIP or B_SKIP macroblock
+ */
+static void decode_mb_skip(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
+    int mb_type= 0; // must start at 0: flag bits are OR-ed into it below
+    // no residual is decoded for this macroblock: clear its coefficient counts
+    memset(h->non_zero_count[mb_xy], 0, 16);
+    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
+    // MBAFF frame: read the field decoding flag when mb_skip_run is 0 on an even macroblock row
+    if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
+        h->mb_field_decoding_flag= get_bits1(&s->gb);
+    }
+    if(h->mb_field_decoding_flag)
+        mb_type|= MB_TYPE_INTERLACED;
+    // the partition flags are OR-ed in so that MB_TYPE_INTERLACED set above is not overwritten
+    if( h->slice_type == B_TYPE )
+    {
+        // just for fill_caches. pred_direct_motion will set the real mb_type
+        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
+
+        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
+        pred_direct_motion(h, &mb_type);
+        if(h->pps.cabac){
+            fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+            fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+        }
+    }
+    else
+    {
+        int mx, my;
+        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
+
+        fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
+        pred_pskip_motion(h, &mx, &my);
+        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
+        fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
+        if(h->pps.cabac)
+            fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+    }
+    // store the motion data and per-macroblock bookkeeping for the skipped macroblock
+    write_back_motion(h, mb_type);
+    s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
+    s->current_picture.qscale_table[mb_xy]= s->qscale;
+    h->slice_table[ mb_xy ]= h->slice_num;
+    h->prev_mb_skiped= 1;
+}
+
+/**
  * decodes a macroblock
  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
  */
@@ -3399,40 +4220,15 @@ static int decode_mb_cavlc(H264Context *h){
             s->mb_skip_run= get_ue_golomb(&s->gb);
         
         if (s->mb_skip_run--) {
-            int mx, my;
-            /* skip mb */
-//FIXME b frame
-            mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0;
-
-            memset(h->non_zero_count[mb_xy], 0, 16);
-            memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
-
-            if(h->sps.mb_aff && s->mb_skip_run==0 && (s->mb_y&1)==0){
-                h->mb_field_decoding_flag= get_bits1(&s->gb);
-            }
-
-            if(h->mb_field_decoding_flag)
-                mb_type|= MB_TYPE_INTERLACED;
-            
-            fill_caches(h, mb_type); //FIXME check what is needed and what not ...
-            pred_pskip_motion(h, &mx, &my);
-            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
-            fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
-            write_back_motion(h, mb_type);
-
-            s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
-            s->current_picture.qscale_table[mb_xy]= s->qscale;
-            h->slice_table[ mb_xy ]= h->slice_num;
-
-            h->prev_mb_skiped= 1;
+            decode_mb_skip(h);
             return 0;
         }
     }
-    if(h->sps.mb_aff /* && !field pic FIXME needed? */){
-        if((s->mb_y&1)==0)
+    if(h->mb_aff_frame){
+        if ( ((s->mb_y&1) == 0) || h->prev_mb_skiped)
             h->mb_field_decoding_flag = get_bits1(&s->gb);
     }else
-        h->mb_field_decoding_flag=0; //FIXME som ed note ?!
+        h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
     
     h->prev_mb_skiped= 0;
     
@@ -3473,43 +4269,44 @@ decode_intra_mb:
     h->slice_table[ mb_xy ]= h->slice_num;
     
     if(IS_INTRA_PCM(mb_type)){
-        const uint8_t *ptr;
-        int x, y;
+        unsigned int x, y;
         
         // we assume these blocks are very rare so we dont optimize it
         align_get_bits(&s->gb);
         
-        ptr= s->gb.buffer + get_bits_count(&s->gb);
-    
+        // The pixels are stored in the same order as levels in h->mb array.
         for(y=0; y<16; y++){
-            const int index= 4*(y&3) + 64*(y>>2);
+            const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
             for(x=0; x<16; x++){
-                h->mb[index + (x&3) + 16*(x>>2)]= *(ptr++);
+                tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
+                h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
             }
         }
         for(y=0; y<8; y++){
             const int index= 256 + 4*(y&3) + 32*(y>>2);
             for(x=0; x<8; x++){
-                h->mb[index + (x&3) + 16*(x>>2)]= *(ptr++);
+                tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
+                h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
             }
         }
         for(y=0; y<8; y++){
             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
             for(x=0; x<8; x++){
-                h->mb[index + (x&3) + 16*(x>>2)]= *(ptr++);
+                tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
+                h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
             }
         }
     
-        skip_bits(&s->gb, 384); //FIXME check /fix the bitstream readers
-        
-        //FIXME deblock filter, non_zero_count_cache init ...
+        // In deblocking, the quantiser is 0
+        s->current_picture.qscale_table[mb_xy]= 0;
+        h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
+        // All coeffs are present
         memset(h->non_zero_count[mb_xy], 16, 16);
-        s->current_picture.qscale_table[mb_xy]= s->qscale;
         
         return 0;
     }
         
-    fill_caches(h, mb_type);
+    fill_caches(h, mb_type, 0);
 
     //mb_pred
     if(IS_INTRA(mb_type)){
@@ -3561,6 +4358,9 @@ decode_intra_mb:
                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
             }
+            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
+               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3]))
+                pred_direct_motion(h, &mb_type);
         }else{
             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
             for(i=0; i<4; i++){
@@ -3575,10 +4375,14 @@ decode_intra_mb:
         }
         
         for(list=0; list<2; list++){
-            const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
+            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
             if(ref_count == 0) continue;
+            if (h->mb_aff_frame && h->mb_field_decoding_flag) {
+                ref_count <<= 1;
+            }
             for(i=0; i<4; i++){
-                if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+                if(IS_DIRECT(h->sub_mb_type[i])) continue;
+                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
                 }else{
                  //FIXME
@@ -3592,10 +4396,11 @@ decode_intra_mb:
             if(ref_count == 0) continue;
 
             for(i=0; i<4; i++){
+                if(IS_DIRECT(h->sub_mb_type[i])) continue;
                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
 
-                if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                     const int sub_mb_type= h->sub_mb_type[i];
                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
                     for(j=0; j<sub_partition_count[i]; j++){
@@ -3631,16 +4436,20 @@ decode_intra_mb:
                 }
             }
         }
-    }else if(!IS_DIRECT(mb_type)){
+    }else if(IS_DIRECT(mb_type)){
+        pred_direct_motion(h, &mb_type);
+        s->current_picture.mb_type[mb_xy]= mb_type;
+    }else{
         int list, mx, my, i;
          //FIXME we should set ref_idx_l? to 0 if we use that later ...
         if(IS_16X16(mb_type)){
             for(list=0; list<2; list++){
-                if(h->ref_count[0]>0){
+                if(h->ref_count[list]>0){
                     if(IS_DIR(mb_type, 0, list)){
                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
-                    }
+                    }else
+                        fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
                 }
             }
             for(list=0; list<2; list++){
@@ -3651,7 +4460,8 @@ decode_intra_mb:
                     tprintf("final mv:%d %d\n", mx, my);
 
                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
-                }
+                }else
+                    fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
             }
         }
         else if(IS_16X8(mb_type)){
@@ -3661,7 +4471,8 @@ decode_intra_mb:
                         if(IS_DIR(mb_type, i, list)){
                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
-                        }
+                        }else
+                            fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
                     }
                 }
             }
@@ -3674,7 +4485,8 @@ decode_intra_mb:
                         tprintf("final mv:%d %d\n", mx, my);
 
                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
-                    }
+                    }else
+                        fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
                 }
             }
         }else{
@@ -3685,7 +4497,8 @@ decode_intra_mb:
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
-                        }
+                        }else
+                            fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
                     }
                 }
             }
@@ -3698,7 +4511,8 @@ decode_intra_mb:
                         tprintf("final mv:%d %d\n", mx, my);
 
                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
-                    }
+                    }else
+                        fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
                 }
             }
         }
@@ -3749,7 +4563,7 @@ decode_intra_mb:
             else            s->qscale-= 52;
         }
         
-        h->chroma_qp= chroma_qp= get_chroma_qp(h, s->qscale);
+        h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
         if(IS_INTRA16x16(mb_type)){
             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, s->qscale, 16) < 0){
                 return -1; //FIXME continue if partotioned and other retirn -1 too
@@ -3819,41 +4633,71 @@ decode_intra_mb:
     return 0;
 }
 
-static int decode_cabac_mb_type( H264Context *h ) {
+static int decode_cabac_field_decoding_flag(H264Context *h) { /* decodes one CABAC bin; the context is chosen from the left and above neighbours */
     MpegEncContext * const s = &h->s;
+    const int mb_x = s->mb_x;
+    const int mb_y = s->mb_y & ~1; /* low bit cleared: always an even row */
+    const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride; /* one column to the left, on row mb_y */
+    const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride; /* two rows above mb_y; NOTE(review): negative when mb_y==0 -- confirm slice_table tolerates that index */
 
-    if( h->slice_type == I_TYPE ) {
-        const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
-        int ctx = 0;
-        int mb_type;
+    unsigned int ctx = 0; /* counts neighbours (0..2) that are in this slice and marked interlaced */
+    
+    if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
+        ctx += 1;
+    }
+    if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
+        ctx += 1;
+    }
 
-        if( s->mb_x > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-1] ) )
+    return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] ); /* uses one of the contexts 70..72 */
+}
+
+static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) { /* returns 0 (I4x4), 25 (PCM) or 1..24 (I16x16 variants) */
+    uint8_t *state= &h->cabac_state[ctx_base]; /* first CABAC context used by this call */
+    int mb_type;
+    
+    if(intra_slice){ /* callers in this file pass 1 for I slices and 0 otherwise */
+        MpegEncContext * const s = &h->s;
+        const int mba_xy = h->left_mb_xy[0];
+        const int mbb_xy = h->top_mb_xy;
+        int ctx=0; /* counts same-slice neighbours (left, top) that are not I4x4 */
+        if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
             ctx++;
-        if( s->mb_y > 0 && !IS_INTRA4x4( s->current_picture.mb_type[mb_xy-s->mb_stride] ) )
+        if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
             ctx++;
-
-        if( get_cabac( &h->cabac, &h->cabac_state[3+ctx] ) == 0 )
+        if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
+            return 0;   /* I4x4 */
+        state += 2; /* the first bin may use state[0..2]; after this, state[1] below is ctx_base+3 */
+    }else{
+        if( get_cabac( &h->cabac, &state[0] ) == 0 ) /* single context for the first bin */
             return 0;   /* I4x4 */
+    }
 
-        if( get_cabac_terminate( &h->cabac ) )
-            return 25;  /* PCM */
+    if( get_cabac_terminate( &h->cabac ) )
+        return 25;  /* PCM */
 
-        mb_type = 1;    /* I16x16 */
-        if( get_cabac( &h->cabac, &h->cabac_state[3+3] ) )
-            mb_type += 12;  /* cbp_luma != 0 */
+    mb_type = 1; /* I16x16 */
+    if( get_cabac( &h->cabac, &state[1] ) )
+        mb_type += 12;  /* cbp_luma != 0 */
 
-        if( get_cabac( &h->cabac, &h->cabac_state[3+4] ) ) {
-            if( get_cabac( &h->cabac, &h->cabac_state[3+5] ) )
-                mb_type += 4 * 2;   /* cbp_chroma == 2 */
-            else
-                mb_type += 4 * 1;   /* cbp_chroma == 1 */
-        }
-        if( get_cabac( &h->cabac, &h->cabac_state[3+6] ) )
-            mb_type += 2;
-        if( get_cabac( &h->cabac, &h->cabac_state[3+7] ) )
-            mb_type += 1;
-        return mb_type;
+    if( get_cabac( &h->cabac, &state[2] ) ) {
+        if( get_cabac( &h->cabac, &state[2+intra_slice] ) ) /* with intra_slice==0 this reuses state[2] */
+            mb_type += 4 * 2;   /* cbp_chroma == 2 */
+        else
+            mb_type += 4 * 1;   /* cbp_chroma == 1 */
+    }
+    if( get_cabac( &h->cabac, &state[3+intra_slice] ) )
+        mb_type += 2;
+    if( get_cabac( &h->cabac, &state[3+2*intra_slice] ) ) /* with intra_slice==0 this reuses state[3] */
+        mb_type += 1;
+    return mb_type;
+}
+
+static int decode_cabac_mb_type( H264Context *h ) {
+    MpegEncContext * const s = &h->s;
 
+    if( h->slice_type == I_TYPE ) {
+        return decode_cabac_intra_mb_type(h, 3, 1);
     } else if( h->slice_type == P_TYPE ) {
         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
             /* P-type */
@@ -3869,31 +4713,45 @@ static int decode_cabac_mb_type( H264Context *h ) {
                     return 1; /* P_L0_D16x8; */
             }
         } else {
-            int mb_type;
-            /* I-type */
-            if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
-                return 5+0; /* I_4x4 */
-            if( get_cabac_terminate( &h->cabac ) )
-                return 5+25; /*I_PCM */
-            mb_type = 5+1;    /* I16x16 */
-            if( get_cabac( &h->cabac, &h->cabac_state[17+1] ) )
-                mb_type += 12;  /* cbp_luma != 0 */
-
-            if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) ) {
-                if( get_cabac( &h->cabac, &h->cabac_state[17+2] ) )
-                    mb_type += 4 * 2;   /* cbp_chroma == 2 */
-                else
-                    mb_type += 4 * 1;   /* cbp_chroma == 1 */
-            }
-            if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) )
-                mb_type += 2;
-            if( get_cabac( &h->cabac, &h->cabac_state[17+3] ) )
-                mb_type += 1;
+            return decode_cabac_intra_mb_type(h, 17, 0) + 5;
+        }
+    } else if( h->slice_type == B_TYPE ) {
+        const int mba_xy = h->left_mb_xy[0];
+        const int mbb_xy = h->top_mb_xy;
+        int ctx = 0;
+        int bits;
+
+        if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )
+                      && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
+            ctx++;
+        if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] )
+                      && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
+            ctx++;
+
+        if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
+            return 0; /* B_Direct_16x16 */
 
-            return mb_type;
+        if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
+            return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
         }
+
+        bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
+        bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
+        bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
+        bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
+        if( bits < 8 )
+            return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
+        else if( bits == 13 ) {
+            return decode_cabac_intra_mb_type(h, 32, 0) + 23;
+        } else if( bits == 14 )
+            return 11; /* B_L1_L0_8x16 */
+        else if( bits == 15 )
+            return 22; /* B_8x8 */
+
+        bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
+        return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
     } else {
-        /* TODO do others frames types */
+        /* TODO SI/SP frames? */
         return -1;
     }
 }
@@ -3905,9 +4763,9 @@ static int decode_cabac_mb_skip( H264Context *h) {
     const int mbb_xy = mb_xy - s->mb_stride;
     int ctx = 0;
 
-    if( s->mb_x > 0 && !IS_SKIP( s->current_picture.mb_type[mba_xy] ) )
+    if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
         ctx++;
-    if( s->mb_y > 0 && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ) )
+    if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
         ctx++;
 
     if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
@@ -3935,18 +4793,16 @@ static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
 }
 
 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
-    MpegEncContext * const s = &h->s;
-    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
-    const int mba_xy = mb_xy - 1;
-    const int mbb_xy = mb_xy - s->mb_stride;
+    const int mba_xy = h->left_mb_xy[0];
+    const int mbb_xy = h->top_mb_xy;
 
     int ctx = 0;
 
     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
-    if( s->mb_x > 0 && h->chroma_pred_mode_table[mba_xy] != 0 )
+    if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
         ctx++;
 
-    if( s->mb_y > 0 && h->chroma_pred_mode_table[mbb_xy] != 0 )
+    if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
         ctx++;
 
     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
@@ -3975,16 +4831,13 @@ static const uint8_t block_idx_xy[4][4] = {
 
 static int decode_cabac_mb_cbp_luma( H264Context *h) {
     MpegEncContext * const s = &h->s;
-    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
 
     int cbp = 0;
     int i8x8;
 
-    h->cbp_table[mb_xy] = 0;  /* FIXME aaahahahah beurk */
-
     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
-        int mba_xy = -1;
-        int mbb_xy = -1;
+        int cbp_a = -1;
+        int cbp_b = -1;
         int x, y;
         int ctx = 0;
 
@@ -3992,51 +4845,45 @@ static int decode_cabac_mb_cbp_luma( H264Context *h) {
         y = block_idx_y[4*i8x8];
 
         if( x > 0 )
-            mba_xy = mb_xy;
-        else if( s->mb_x > 0 )
-            mba_xy = mb_xy - 1;
+            cbp_a = cbp;
+        else if( s->mb_x > 0 && (h->slice_table[h->left_mb_xy[0]] == h->slice_num)) {
+            cbp_a = h->left_cbp;
+            tprintf("cbp_a = left_cbp = %x\n", cbp_a);
+        }
 
         if( y > 0 )
-            mbb_xy = mb_xy;
-        else if( s->mb_y > 0 )
-            mbb_xy = mb_xy - s->mb_stride;
+            cbp_b = cbp;
+        else if( s->mb_y > 0 && (h->slice_table[h->top_mb_xy] == h->slice_num)) {
+            cbp_b = h->top_cbp;
+            tprintf("cbp_b = top_cbp = %x\n", cbp_b);
+        }
 
         /* No need to test for skip as we put 0 for skip block */
-        if( mba_xy >= 0 ) {
+        /* No need to test for IPCM as we put 1 for IPCM block */
+        if( cbp_a >= 0 ) {
             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
-            if( ((h->cbp_table[mba_xy] >> i8x8a)&0x01) == 0 )
+            if( ((cbp_a >> i8x8a)&0x01) == 0 )
                 ctx++;
         }
 
-        if( mbb_xy >= 0 ) {
+        if( cbp_b >= 0 ) {
             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
-            if( ((h->cbp_table[mbb_xy] >> i8x8b)&0x01) == 0 )
+            if( ((cbp_b >> i8x8b)&0x01) == 0 )
                 ctx += 2;
         }
 
         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
             cbp |= 1 << i8x8;
-            h->cbp_table[mb_xy] = cbp;  /* FIXME aaahahahah beurk */
         }
     }
     return cbp;
 }
 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
-    MpegEncContext * const s = &h->s;
-    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
     int ctx;
     int cbp_a, cbp_b;
 
-    /* No need to test for skip */
-    if( s->mb_x > 0 )
-        cbp_a = (h->cbp_table[mb_xy-1]>>4)&0x03;
-    else
-        cbp_a = -1;
-
-    if( s->mb_y > 0 )
-        cbp_b = (h->cbp_table[mb_xy-s->mb_stride]>>4)&0x03;
-    else
-        cbp_b = -1;
+    cbp_a = (h->left_cbp>>4)&0x03;
+    cbp_b = (h-> top_cbp>>4)&0x03;
 
     ctx = 0;
     if( cbp_a > 0 ) ctx++;
@@ -4047,10 +4894,7 @@ static int decode_cabac_mb_cbp_chroma( H264Context *h) {
     ctx = 4;
     if( cbp_a == 2 ) ctx++;
     if( cbp_b == 2 ) ctx += 2;
-    if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) )
-        return 2;
-    else
-        return 1;
+    return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
 }
 static int decode_cabac_mb_dqp( H264Context *h) {
     MpegEncContext * const s = &h->s;
@@ -4063,7 +4907,7 @@ static int decode_cabac_mb_dqp( H264Context *h) {
     else
         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
 
-    if( mbn_xy >= 0 && h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
+    if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
         ctx++;
 
     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
@@ -4079,7 +4923,7 @@ static int decode_cabac_mb_dqp( H264Context *h) {
     else
         return -(val + 1)/2;
 }
-static int decode_cabac_mb_sub_type( H264Context *h ) {
+static int decode_cabac_p_mb_sub_type( H264Context *h ) {
     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
         return 0;   /* 8x8 */
     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
@@ -4088,6 +4932,22 @@ static int decode_cabac_mb_sub_type( H264Context *h ) {
         return 2;   /* 4x8 */
     return 3;       /* 4x4 */
 }
+static int decode_cabac_b_mb_sub_type( H264Context *h ) { /* decodes an index in 0..12 from CABAC bins using contexts 36..39 */
+    int type;
+    if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
+        return 0;   /* B_Direct_8x8 */
+    if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
+        return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
+    type = 3; /* base of the first 2-bit group: results 3..6 */
+    if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
+        if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
+            return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
+        type += 4; /* second 2-bit group: results 7..10 */
+    }
+    type += 2*get_cabac( &h->cabac, &h->cabac_state[39] ); /* high bit of the 2-bit suffix */
+    type +=   get_cabac( &h->cabac, &h->cabac_state[39] ); /* low bit of the 2-bit suffix */
+    return type;
+}
 
 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
     int refa = h->ref_cache[list][scan8[n] - 1];
@@ -4095,10 +4955,17 @@ static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
     int ref  = 0;
     int ctx  = 0;
 
-    if( refa > 0 )
-        ctx++;
-    if( refb > 0 )
-        ctx += 2;
+    if( h->slice_type == B_TYPE) {
+        if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
+            ctx++;
+        if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
+            ctx += 2;
+    } else {
+        if( refa > 0 )
+            ctx++;
+        if( refb > 0 )
+            ctx += 2;
+    }
 
     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
         ref++;
@@ -4114,8 +4981,7 @@ static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
                abs( h->mvd_cache[list][scan8[n] - 8][l] );
     int ctxbase = (l == 0) ? 40 : 47;
-    int ctx;
-    int mvd = 0;
+    int ctx, mvd;
 
     if( amvd < 3 )
         ctx = 0;
@@ -4124,11 +4990,14 @@ static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
     else
         ctx = 1;
 
+    if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
+        return 0;
+
+    mvd= 1;
+    ctx= 3;
     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
         mvd++;
-        if( ctx < 3 )
-            ctx = 3;
-        else if( ctx < 6 )
+        if( ctx < 6 )
             ctx++;
     }
 
@@ -4143,132 +5012,52 @@ static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
                 mvd += 1 << k;
         }
     }
-    if( mvd != 0 && get_cabac_bypass( &h->cabac ) )
-        return -mvd;
-    return mvd;
+    if( get_cabac_bypass( &h->cabac ) )  return -mvd;
+    else                                 return  mvd;
 }
 
-
-static int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
-    MpegEncContext * const s = &h->s;
-    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
-    int mba_xy = -1;
-    int mbb_xy = -1;
-
-    int nza = -1;
-    int nzb = -1;
+static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
+    int nza, nzb;
     int ctx = 0;
 
     if( cat == 0 ) {
-        if( s->mb_x > 0 ) {
-            mba_xy = mb_xy - 1;
-            if( IS_INTRA16x16(s->current_picture.mb_type[mba_xy] ) )
-                    nza = h->cbp_table[mba_xy]&0x100;
-        }
-        if( s->mb_y > 0 ) {
-            mbb_xy = mb_xy - s->mb_stride;
-            if( IS_INTRA16x16(s->current_picture.mb_type[mbb_xy] ) )
-                    nzb = h->cbp_table[mbb_xy]&0x100;
-        }
+        nza = h->left_cbp&0x100;
+        nzb = h-> top_cbp&0x100;
     } else if( cat == 1 || cat == 2 ) {
-        int i8x8a, i8x8b;
-        int x, y;
-
-        x = block_idx_x[idx];
-        y = block_idx_y[idx];
-
-        if( x > 0 )
-            mba_xy = mb_xy;
-        else if( s->mb_x > 0 )
-            mba_xy = mb_xy - 1;
-
-        if( y > 0 )
-            mbb_xy = mb_xy;
-        else if( s->mb_y > 0 )
-            mbb_xy = mb_xy - s->mb_stride;
-
-        /* No need to test for skip */
-        if( mba_xy >= 0 ) {
-            i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
-
-            if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
-                ((h->cbp_table[mba_xy]&0x0f)>>i8x8a))
-                nza = h->non_zero_count_cache[scan8[idx] - 1];
-        }
-
-        if( mbb_xy >= 0 ) {
-            i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
-
-            if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
-                ((h->cbp_table[mbb_xy]&0x0f)>>i8x8b))
-                nzb = h->non_zero_count_cache[scan8[idx] - 8];
-        }
+        nza = h->non_zero_count_cache[scan8[idx] - 1];
+        nzb = h->non_zero_count_cache[scan8[idx] - 8];
     } else if( cat == 3 ) {
-        if( s->mb_x > 0 ) {
-            mba_xy = mb_xy - 1;
-
-            if( !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
-                (h->cbp_table[mba_xy]&0x30) )
-                nza = (h->cbp_table[mba_xy]>>(6+idx))&0x01;
-        }
-        if( s->mb_y > 0 ) {
-            mbb_xy = mb_xy - s->mb_stride;
-
-            if( !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
-                (h->cbp_table[mbb_xy]&0x30) )
-                nzb = (h->cbp_table[mbb_xy]>>(6+idx))&0x01;
-        }
-    } else if( cat == 4 ) {
-        int idxc = idx % 4 ;
-        if( idxc == 1 || idxc == 3 )
-            mba_xy = mb_xy;
-        else if( s->mb_x > 0 )
-            mba_xy = mb_xy -1;
-
-        if( idxc == 2 || idxc == 3 )
-            mbb_xy = mb_xy;
-        else if( s->mb_y > 0 )
-            mbb_xy = mb_xy - s->mb_stride;
-
-        if( mba_xy >= 0 &&
-            !IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) &&
-            (h->cbp_table[mba_xy]&0x30) == 0x20 )
-            nza = h->non_zero_count_cache[scan8[16+idx] - 1];
-
-        if( mbb_xy >= 0 &&
-            !IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) &&
-            (h->cbp_table[mbb_xy]&0x30) == 0x20 )
-            nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
-    }
-
-    if( ( mba_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) ||
-        ( mba_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mba_xy] ) ) ||
-          nza > 0 )
+        nza = (h->left_cbp>>(6+idx))&0x01;
+        nzb = (h-> top_cbp>>(6+idx))&0x01;
+    } else {
+        assert(cat == 4);
+        nza = h->non_zero_count_cache[scan8[16+idx] - 1];
+        nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
+    }
+
+    if( nza > 0 )
         ctx++;
 
-    if( ( mbb_xy < 0 && IS_INTRA( s->current_picture.mb_type[mb_xy] ) ) ||
-        ( mbb_xy >= 0 && IS_INTRA_PCM(s->current_picture.mb_type[mbb_xy] ) ) ||
-          nzb > 0 )
+    if( nzb > 0 )
         ctx += 2;
 
     return ctx + 4 * cat;
 }
 
-static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) {
+static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) {
     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
     const uint16_t *qmul= dequant_coeff[qp];
+    static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
+    static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
     static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
-    static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
-    static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 };
+    static const int coeff_abs_level_m1_offset[5] = {227+ 0, 227+10, 227+20, 227+30, 227+39 };
 
-    int coeff[16];
+    int index[16];
 
-    int last = 0;
+    int i, last;
     int coeff_count = 0;
-    int nz[16] = {0};
-    int i;
 
-    int abslevel1 = 0;
+    int abslevel1 = 1;
     int abslevelgt1 = 0;
 
     /* cat: 0-> DC 16x16  n = 0
@@ -4288,96 +5077,103 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
         return 0;
     }
 
-    while( last < max_coeff - 1 ) {
-        int ctx = FFMIN( last, max_coeff - 2 );
-
-        if( get_cabac( &h->cabac, &h->cabac_state[105+significant_coeff_flag_offset[cat]+ctx] ) == 0 ) {
-            nz[last++] = 0;
-        }
-        else {
-            nz[last++] = 1;
-            coeff_count++;
-            if( get_cabac( &h->cabac, &h->cabac_state[166+last_significant_coeff_flag_offset[cat]+ctx] ) ) {
-                while( last < max_coeff ) {
-                    nz[last++] = 0;
-                }
+    for(last= 0; last < max_coeff - 1; last++) {
+        if( get_cabac( &h->cabac, &h->cabac_state[significant_coeff_flag_field_offset[h->mb_field_decoding_flag]+significant_coeff_flag_offset[cat]+last] )) {
+            index[coeff_count++] = last;
+            if( get_cabac( &h->cabac, &h->cabac_state[last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag]+significant_coeff_flag_offset[cat]+last] ) ) {
+                last= max_coeff;
                 break;
             }
         }
     }
     if( last == max_coeff -1 ) {
-        nz[last++] = 1;
-        coeff_count++;
+        index[coeff_count++] = last;
     }
+    assert(coeff_count > 0);
 
-    if( cat == 0 && coeff_count > 0 )
+    if( cat == 0 )
         h->cbp_table[mb_xy] |= 0x100;
     else if( cat == 1 || cat == 2 )
         h->non_zero_count_cache[scan8[n]] = coeff_count;
-    else if( cat == 3 && coeff_count > 0 )
+    else if( cat == 3 )
         h->cbp_table[mb_xy] |= 0x40 << n;
-    else if( cat == 4 )
+    else {
+        assert( cat == 4 );
         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
+    }
 
     for( i = coeff_count - 1; i >= 0; i-- ) {
-        int coeff_abs_m1;
-
-        int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 + 1 )) + coeff_abs_level_m1_offset[cat];
+        int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + coeff_abs_level_m1_offset[cat];
+        int j= scantable[index[i]];
 
-        if( get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) == 0 ) {
-            coeff_abs_m1 = 0;
+        if( get_cabac( &h->cabac, &h->cabac_state[ctx] ) == 0 ) {
+            if( cat == 0 || cat == 3 ) {
+                if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
+                else                                block[j] =  1;
+            }else{
+                if( get_cabac_bypass( &h->cabac ) ) block[j] = -qmul[j];
+                else                                block[j] =  qmul[j];
+            }
+    
+            abslevel1++;
         } else {
-            coeff_abs_m1 = 1;
+            int coeff_abs = 2;
             ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat];
-            while( coeff_abs_m1 < 14 && get_cabac( &h->cabac, &h->cabac_state[227+ctx] ) ) {
-                coeff_abs_m1++;
+            while( coeff_abs < 15 && get_cabac( &h->cabac, &h->cabac_state[ctx] ) ) {
+                coeff_abs++;
             }
-        }
 
-        if( coeff_abs_m1 >= 14 ) {
-            int j = 0;
-            while( get_cabac_bypass( &h->cabac ) ) {
-                coeff_abs_m1 += 1 << j;
-                j++;
+            if( coeff_abs >= 15 ) {
+                int j = 0;
+                while( get_cabac_bypass( &h->cabac ) ) {
+                    coeff_abs += 1 << j;
+                    j++;
+                }
+    
+                while( j-- ) {
+                    if( get_cabac_bypass( &h->cabac ) )
+                        coeff_abs += 1 << j ;
+                }
             }
 
-            while( j-- ) {
-                if( get_cabac_bypass( &h->cabac ) )
-                    coeff_abs_m1 += 1 << j ;
+            if( cat == 0 || cat == 3 ) {
+                if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
+                else                                block[j] =  coeff_abs;
+            }else{
+                if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs * qmul[j];
+                else                                block[j] =  coeff_abs * qmul[j];
             }
-        }
-        if( get_cabac_bypass( &h->cabac ) )
-            coeff[i] = -1 *( coeff_abs_m1 + 1 );
-        else
-            coeff[i] = coeff_abs_m1 + 1;
-
-        if( coeff_abs_m1 == 0 )
-            abslevel1++;
-        else
+    
             abslevelgt1++;
-    }
-
-    if( cat == 0 || cat == 3 ) { /* DC */
-        int j;
-        for( i = 0, j = 0; j < coeff_count; i++ ) {
-            if( nz[i] ) {
-                block[scantable[i]] = coeff[j];
-
-                j++;
-            }
         }
+    }
+    return 0;
+}
 
-    } else { /* AC */
-        int j;
-        for( i = 0, j = 0; j < coeff_count; i++ ) {
-            if( nz[i] ) {
-                block[scantable[i]] = coeff[j] * qmul[scantable[i]];
-
-                j++;
-            }
+void inline compute_mb_neighboors(H264Context *h)
+{
+    MpegEncContext * const s = &h->s;
+    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
+    h->top_mb_xy     = mb_xy - s->mb_stride;
+    h->left_mb_xy[0] = mb_xy - 1;
+    if(h->mb_aff_frame){
+        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
+        const int top_pair_xy      = pair_xy     - s->mb_stride;
+        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
+        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
+        const int curr_mb_frame_flag = !h->mb_field_decoding_flag;
+        const int bottom = (s->mb_y & 1);
+        if (bottom
+                ? !curr_mb_frame_flag // bottom macroblock
+                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
+                ) {
+            h->top_mb_xy -= s->mb_stride;
+        }
+        if (left_mb_frame_flag != curr_mb_frame_flag) {
+            h->left_mb_xy[0] = pair_xy - 1;
         }
     }
-    return 0;
+    return;
 }
 
 /**
@@ -4391,61 +5187,43 @@ static int decode_mb_cabac(H264Context *h) {
 
     s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?)
 
-    if( h->slice_type == B_TYPE ) {
-        av_log( h->s.avctx, AV_LOG_ERROR, "B-frame not supported with CABAC\n" );
-        return -1;
-    }
-    if( h->sps.mb_aff ) {
-        av_log( h->s.avctx, AV_LOG_ERROR, "Fields not supported with CABAC\n" );
-        return -1;
-    }
-
+    tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
         /* read skip flags */
         if( decode_cabac_mb_skip( h ) ) {
-            int mx, my;
-
-            /* skip mb */
-            mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
+            decode_mb_skip(h);
 
-            memset(h->non_zero_count[mb_xy], 0, 16);
-            memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
-#if 0
-            if(h->sps.mb_aff && s->mb_skip_run==0 && (s->mb_y&1)==0){
-                h->mb_field_decoding_flag= get_bits1(&s->gb);
-            }
-            if(h->mb_field_decoding_flag)
-                mb_type|= MB_TYPE_INTERLACED;
-#endif
-
-            fill_caches(h, mb_type); //FIXME check what is needed and what not ...
-            pred_pskip_motion(h, &mx, &my);
-            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
-            fill_rectangle(  h->mvd_cache[0][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
-            fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
-            write_back_motion(h, mb_type);
-
-            s->current_picture.mb_type[mb_xy]= mb_type; //FIXME SKIP type
-            s->current_picture.qscale_table[mb_xy]= s->qscale;
-            h->slice_table[ mb_xy ]= h->slice_num;
             h->cbp_table[mb_xy] = 0;
             h->chroma_pred_mode_table[mb_xy] = 0;
             h->last_qscale_diff = 0;
 
-            h->prev_mb_skiped= 1;
-
             return 0;
 
         }
     }
+    if(h->mb_aff_frame){
+        if ( ((s->mb_y&1) == 0) || h->prev_mb_skiped)
+            h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
+    }else
+        h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
+
     h->prev_mb_skiped = 0;
 
+    compute_mb_neighboors(h);
     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
         return -1;
     }
 
-    if( h->slice_type == P_TYPE ) {
+    if( h->slice_type == B_TYPE ) {
+        if( mb_type < 23 ){
+            partition_count= b_mb_type_info[mb_type].partition_count;
+            mb_type=         b_mb_type_info[mb_type].type;
+        }else{
+            mb_type -= 23;
+            goto decode_intra_mb;
+        }
+    } else if( h->slice_type == P_TYPE ) {
         if( mb_type < 5) {
             partition_count= p_mb_type_info[mb_type].partition_count;
             mb_type=         p_mb_type_info[mb_type].type;
@@ -4461,23 +5239,59 @@ decode_intra_mb:
         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
         mb_type= i_mb_type_info[mb_type].type;
     }
-#if 0
     if(h->mb_field_decoding_flag)
         mb_type |= MB_TYPE_INTERLACED;
-#endif
 
     s->current_picture.mb_type[mb_xy]= mb_type;
     h->slice_table[ mb_xy ]= h->slice_num;
 
     if(IS_INTRA_PCM(mb_type)) {
-        /* TODO */
-        h->cbp_table[mb_xy] = 0xf +4*2;
+        const uint8_t *ptr;
+        unsigned int x, y;
+        
+        // We assume these blocks are very rare, so we don't optimize them.
+        // FIXME The two following lines get the bitstream position in the CABAC
+        // decoder; I think this should be done by a function in cabac.h (or cabac.c).
+        ptr= h->cabac.bytestream;
+        if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
+
+        // The pixels are stored in the same order as levels in h->mb array.
+        for(y=0; y<16; y++){
+            const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
+            for(x=0; x<16; x++){
+                tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
+                h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
+            }
+        }
+        for(y=0; y<8; y++){
+            const int index= 256 + 4*(y&3) + 32*(y>>2);
+            for(x=0; x<8; x++){
+                tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
+                h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
+            }
+        }
+        for(y=0; y<8; y++){
+            const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
+            for(x=0; x<8; x++){
+                tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
+                h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
+            }
+        }
+
+        ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
+
+        // All blocks are present
+        h->cbp_table[mb_xy] = 0x1ef;
         h->chroma_pred_mode_table[mb_xy] = 0;
-        s->current_picture.qscale_table[mb_xy]= s->qscale;
-        return -1;
+        // In deblocking, the quantiser is 0
+        s->current_picture.qscale_table[mb_xy]= 0;
+        h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
+        // All coeffs are present
+        memset(h->non_zero_count[mb_xy], 16, 16);
+        return 0;
     }
 
-    fill_caches(h, mb_type);
+    fill_caches(h, mb_type, 0);
 
     if( IS_INTRA( mb_type ) ) {
         if( IS_INTRA4x4( mb_type ) ) {
@@ -4502,17 +5316,34 @@ decode_intra_mb:
     } else if( partition_count == 4 ) {
         int i, j, sub_partition_count[4], list, ref[2][4];
 
-        /* Only P-frame */
-        for( i = 0; i < 4; i++ ) {
-            h->sub_mb_type[i] = decode_cabac_mb_sub_type( h );
-            sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
-            h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+        if( h->slice_type == B_TYPE ) {
+            for( i = 0; i < 4; i++ ) {
+                h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
+                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
+                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+            }
+            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
+               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
+                pred_direct_motion(h, &mb_type);
+                if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
+                    for( i = 0; i < 4; i++ )
+                        if( IS_DIRECT(h->sub_mb_type[i]) )
+                            fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
+                }
+            }
+        } else {
+            for( i = 0; i < 4; i++ ) {
+                h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
+                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
+                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
+            }
         }
 
         for( list = 0; list < 2; list++ ) {
             if( h->ref_count[list] > 0 ) {
                 for( i = 0; i < 4; i++ ) {
-                    if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
+                    if(IS_DIRECT(h->sub_mb_type[i])) continue;
+                    if(IS_DIR(h->sub_mb_type[i], 0, list)){
                         if( h->ref_count[list] > 1 )
                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
                         else
@@ -4528,6 +5359,10 @@ decode_intra_mb:
 
         for(list=0; list<2; list++){
             for(i=0; i<4; i++){
+                if(IS_DIRECT(h->sub_mb_type[i])){
+                    fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
+                    continue;
+                }
                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
 
                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
@@ -4584,7 +5419,12 @@ decode_intra_mb:
                 }
             }
         }
-    } else if( !IS_DIRECT(mb_type) ) {
+    } else if( IS_DIRECT(mb_type) ) {
+        pred_direct_motion(h, &mb_type);
+        s->current_picture.mb_type[mb_xy]= mb_type;
+        fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+        fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+    } else {
         int list, mx, my, i, mpx, mpy;
         if(IS_16X16(mb_type)){
             for(list=0; list<2; list++){
@@ -4593,7 +5433,8 @@ decode_intra_mb:
                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
                     }
-                }
+                }else
+                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
             }
             for(list=0; list<2; list++){
                 if(IS_DIR(mb_type, 0, list)){
@@ -4605,7 +5446,8 @@ decode_intra_mb:
 
                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
-                }
+                }else
+                    fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
             }
         }
         else if(IS_16X8(mb_type)){
@@ -4615,7 +5457,8 @@ decode_intra_mb:
                         if(IS_DIR(mb_type, i, list)){
                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
-                        }
+                        }else
+                            fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
                     }
                 }
             }
@@ -4629,6 +5472,9 @@ decode_intra_mb:
 
                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
+                    }else{
+                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
+                        fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
                     }
                 }
             }
@@ -4640,7 +5486,8 @@ decode_intra_mb:
                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
-                        }
+                        }else
+                            fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
                     }
                 }
             }
@@ -4654,6 +5501,9 @@ decode_intra_mb:
                         tprintf("final mv:%d %d\n", mx, my);
                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
+                    }else{
+                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
+                        fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
                     }
                 }
             }
@@ -4690,7 +5540,7 @@ decode_intra_mb:
             if(s->qscale<0) s->qscale+= 52;
             else            s->qscale-= 52;
         }
-        h->chroma_qp = get_chroma_qp(h, s->qscale);
+        h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
 
         if( IS_INTRA16x16( mb_type ) ) {
             int i;
@@ -4748,7 +5598,10 @@ decode_intra_mb:
             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
         }
     } else {
-        memset( &h->non_zero_count_cache[8], 0, 8*5 );
+        uint8_t * const nnz= &h->non_zero_count_cache[0];
+        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
+        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
+        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
     }
 
     s->current_picture.qscale_table[mb_xy]= s->qscale;
@@ -4799,6 +5652,7 @@ static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4]
                     i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                     pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
                     pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                    tprintf("filter_mb_edgev i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], pix[-3], p1, p0, q0, q1, pix[2], pix[-2], pix[-1], pix[0], pix[1]);
                 }
                 pix += stride;
             }
@@ -4845,6 +5699,7 @@ static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4]
                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                     }
+                    tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
                 }
                 pix += stride;
             }
@@ -4879,6 +5734,7 @@ static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4
 
                     pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
                     pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                    tprintf("filter_mb_edgecv i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
                 }
                 pix += stride;
             }
@@ -4896,6 +5752,7 @@ static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4
 
                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
                     pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                    tprintf("filter_mb_edgecv i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
                 }
                 pix += stride;
             }
@@ -4903,6 +5760,160 @@ static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4
     }
 }
 
+static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
+    int i;
+    for( i = 0; i < 16; i++, pix += stride) {
+        int index_a;
+        int alpha;
+        int beta;
+    
+        int qp_index;
+        int bS_index = (i >> 1);
+        if (h->mb_field_decoding_flag) {
+            bS_index &= ~1;
+            bS_index |= (i & 1);
+        }
+
+        if( bS[bS_index] == 0 ) {
+            continue;
+        }
+
+        qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
+        index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
+        alpha = alpha_table[index_a];
+        beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
+
+
+        if( bS[bS_index] < 4 ) {
+            const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
+            /* 4px edge length */
+            const int p0 = pix[-1];
+            const int p1 = pix[-2];
+            const int p2 = pix[-3];
+            const int q0 = pix[0];
+            const int q1 = pix[1];
+            const int q2 = pix[2];
+
+            if( ABS( p0 - q0 ) < alpha &&
+                ABS( p1 - p0 ) < beta &&
+                ABS( q1 - q0 ) < beta ) {
+                int tc = tc0;
+                int i_delta;
+
+                if( ABS( p2 - p0 ) < beta ) {
+                    pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
+                    tc++;
+                }
+                if( ABS( q2 - q0 ) < beta ) {
+                    pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
+                    tc++;
+                }
+
+                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+                pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
+            }
+        }else{
+            /* 4px edge length */
+            const int p0 = pix[-1];
+            const int p1 = pix[-2];
+            const int p2 = pix[-3];
+
+            const int q0 = pix[0];
+            const int q1 = pix[1];
+            const int q2 = pix[2];
+
+            if( ABS( p0 - q0 ) < alpha &&
+                ABS( p1 - p0 ) < beta &&
+                ABS( q1 - q0 ) < beta ) {
+
+                if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
+                    if( ABS( p2 - p0 ) < beta)
+                    {
+                        const int p3 = pix[-4];
+                        /* p0', p1', p2' */
+                        pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+                        pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+                        pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+                    } else {
+                        /* p0' */
+                        pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                    }
+                    if( ABS( q2 - q0 ) < beta)
+                    {
+                        const int q3 = pix[3];
+                        /* q0', q1', q2' */
+                        pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+                        pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+                        pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+                    } else {
+                        /* q0' */
+                        pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                    }
+                }else{
+                    /* p0', q0' */
+                    pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+                    pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+                }
+                tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
+            }
+        }
+    }
+}
+static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
+    int i;
+    for( i = 0; i < 8; i++, pix += stride) {
+        int index_a;
+        int alpha;
+        int beta;
+
+        int qp_index;
+        int bS_index = i;
+
+        if( bS[bS_index] == 0 ) {
+            continue;
+        }
+
+        qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
+        index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
+        alpha = alpha_table[index_a];
+        beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
+        if( bS[bS_index] < 4 ) {
+            const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
+            /* 2px edge length (because we use the same bS as the one for luma) */
+            const int p0 = pix[-1];
+            const int p1 = pix[-2];
+            const int q0 = pix[0];
+            const int q1 = pix[1];
+
+            if( ABS( p0 - q0 ) < alpha &&
+                ABS( p1 - p0 ) < beta &&
+                ABS( q1 - q0 ) < beta ) {
+                const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
+
+                pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
+                pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
+                tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
+            }
+        }else{
+            const int p0 = pix[-1];
+            const int p1 = pix[-2];
+            const int q0 = pix[0];
+            const int q1 = pix[1];
+
+            if( ABS( p0 - q0 ) < alpha &&
+                ABS( p1 - p0 ) < beta &&
+                ABS( q1 - q0 ) < beta ) {
+
+                pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
+                pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
+            }
+        }
+    }
+}
+
 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
     int i, d;
     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
@@ -4946,6 +5957,7 @@ static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4]
                     i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                     pix[-pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
                     pix[0]         = clip_uint8( q0 - i_delta );    /* q0' */
+                    tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
                 }
                 pix++;
             }
@@ -4990,6 +6002,7 @@ static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4]
                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                     }
+                    tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
                 }
                 pix++;
             }
@@ -5028,6 +6041,7 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
 
                     pix[-pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
                     pix[0]         = clip_uint8( q0 - i_delta );    /* q0' */
+                    tprintf("filter_mb_edgech i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], pix[-3*pix_next], p1, p0, q0, q1, pix[2*pix_next], pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
                 }
                 pix++;
             }
@@ -5045,6 +6059,7 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
 
                     pix[-pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
                     pix[0]         = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
+                    tprintf("filter_mb_edgech i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], pix[-3*pix_next], p1, p0, q0, q1, pix[2*pix_next], pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
                 }
                 pix++;
             }
@@ -5052,43 +6067,167 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
     }
 }
 
-static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) {
+static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
     MpegEncContext * const s = &h->s;
     const int mb_xy= mb_x + mb_y*s->mb_stride;
-    int linesize, uvlinesize;
+    int first_vertical_edge_done = 0;
     int dir;
 
-    /* FIXME Implement deblocking filter for field MB */
-    if( h->sps.mb_aff ) {
-        return;
-    }
-    linesize = s->linesize;
-    uvlinesize = s->uvlinesize;
+    if (h->mb_aff_frame
+            // left mb is in picture
+            && h->slice_table[mb_xy-1] != 255
+            // and current and left pair do not have the same interlaced type
+            && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
+            // and left mb is in the same slice if deblocking_filter == 2
+            && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
+        /* The first vertical edge is handled differently in MBAFF frames:
+         * there are 8 different bS values to compute and 2 different QPs.
+         */
+        int bS[8];
+        int qp[2];
+        int chroma_qp[2];
+
+        int i;
+        first_vertical_edge_done = 1;
+        for( i = 0; i < 8; i++ ) {
+            int y = i>>1;
+            int b_idx= 8 + 4 + 8*y;
+            int bn_idx= b_idx - 1;
+
+            int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
+
+            if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
+                IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
+                bS[i] = 4;
+            } else if( h->non_zero_count_cache[b_idx] != 0 ||
+                h->non_zero_count_cache[bn_idx] != 0 ) {
+                bS[i] = 2;
+            } else {
+                /* FIXME: A given frame may occupy more than one position in
+                 * the reference list. So we should compare the frame numbers,
+                 * not the indices in the ref list. */
+                int l;
+                bS[i] = 0;
+                for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
+                    if( h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] ||
+                        ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
+                        ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
+                        bS[i] = 1;
+                        break;
+                    }
+                }
+            }
+        }
+        if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
+            // Do not use s->qscale as luma quantiser because it does not have
+            // the same value in IPCM macroblocks.
+            qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
+            chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
+                             get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
+            qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
+            chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
+                             get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
 
+            /* Filter edge */
+            tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
+            { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
+            filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
+            filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
+            filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
+        }
+    }
     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
     for( dir = 0; dir < 2; dir++ )
     {
-        int start = 0;
         int edge;
+        const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
+        int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
 
-        /* test picture boundary */
-        if( ( dir == 0 && mb_x == 0 ) || ( dir == 1 && mb_y == 0 ) ) {
+        if (first_vertical_edge_done) {
             start = 1;
+            first_vertical_edge_done = 0;
         }
-        /* FIXME test slice boundary */
-        if( h->deblocking_filter == 2 ) {
-        }
+
+        if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
+            start = 1;
 
         /* Calculate bS */
         for( edge = start; edge < 4; edge++ ) {
-            /* mbn_xy: neighbour macroblock (how that works for field ?) */
-            int mbn_xy = edge > 0 ? mb_xy : ( dir == 0 ? mb_xy -1 : mb_xy - s->mb_stride );
+            /* mbn_xy: neighbour macroblock */
+            int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
             int bS[4];
             int qp;
 
+            if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
+                && !IS_INTERLACED(s->current_picture.mb_type[mb_xy])
+                && IS_INTERLACED(s->current_picture.mb_type[mbn_xy])
+                ) {
+                // This is a special case in the standard where the filtering must
+                // be done twice (once for each field) even if we are in a
+                // frame macroblock.
+                //
+                unsigned int tmp_linesize   = 2 *   linesize;
+                unsigned int tmp_uvlinesize = 2 * uvlinesize;
+                int mbn_xy = mb_xy - 2 * s->mb_stride;
+                int qp, chroma_qp;
+
+                // first filtering
+                if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
+                    IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
+                    bS[0] = bS[1] = bS[2] = bS[3] = 3;
+                } else {
+                    // TODO
+                    assert(0);
+                }
+                /* Filter edge */
+                // Do not use s->qscale as luma quantiser because it does not have
+                // the same value in IPCM macroblocks.
+                qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+                tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
+                { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
+                filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
+                chroma_qp = ( h->chroma_qp +
+                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
+                filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
+
+                // second filtering
+                mbn_xy += s->mb_stride;
+                if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
+                    IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
+                    bS[0] = bS[1] = bS[2] = bS[3] = 3;
+                } else {
+                    // TODO
+                    assert(0);
+                }
+                /* Filter edge */
+                // Do not use s->qscale as luma quantiser because it does not have
+                // the same value in IPCM macroblocks.
+                qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+                tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
+                { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
+                filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
+                chroma_qp = ( h->chroma_qp +
+                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+                filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+                continue;
+            }
             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
-                bS[0] = bS[1] = bS[2] = bS[3] = ( edge == 0 ? 4 : 3 );
+                int value;
+                if (edge == 0) {
+                    if (   (!IS_INTERLACED(s->current_picture.mb_type[mb_xy]) && !IS_INTERLACED(s->current_picture.mb_type[mbm_xy]))
+                        || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
+                    ) {
+                        value = 4;
+                    } else {
+                        value = 3;
+                    }
+                } else {
+                    value = 3;
+                }
+                bS[0] = bS[1] = bS[2] = bS[3] = value;
             } else {
                 int i;
                 for( i = 0; i < 4; i++ ) {
@@ -5101,17 +6240,21 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
                         h->non_zero_count_cache[bn_idx] != 0 ) {
                         bS[i] = 2;
                     }
-                    else if( h->slice_type == P_TYPE ) {
-                        if( h->ref_cache[0][b_idx] != h->ref_cache[0][bn_idx] ||
-                            ABS( h->mv_cache[0][b_idx][0] - h->mv_cache[0][bn_idx][0] ) >= 4 ||
-                            ABS( h->mv_cache[0][b_idx][1] - h->mv_cache[0][bn_idx][1] ) >= 4 )
-                            bS[i] = 1;
-                        else
-                            bS[i] = 0;
-                    }
-                    else {
-                        /* FIXME Add support for B frame */
-                        return;
+                    else
+                    {
+                        /* FIXME: A given frame may occupy more than one position in
+                         * the reference list. So we should compare the frame numbers,
+                         * not the indices in the ref list. */
+                        int l;
+                        bS[i] = 0;
+                        for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
+                            if( h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] ||
+                                ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
+                                ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
+                                bS[i] = 1;
+                                break;
+                            }
+                        }
                     }
                 }
 
@@ -5120,12 +6263,17 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
             }
 
             /* Filter edge */
-            qp = ( s->qscale + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+            // Do not use s->qscale as luma quantiser because it does not have
+            // the same value in IPCM macroblocks.
+            qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+            //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
+            tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+            { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
             if( dir == 0 ) {
                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
                 if( (edge&1) == 0 ) {
                     int chroma_qp = ( h->chroma_qp +
-                                      get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                                      get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
                 }
@@ -5133,7 +6281,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
                 if( (edge&1) == 0 ) {
                     int chroma_qp = ( h->chroma_qp +
-                                      get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
+                                      get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
                 }
@@ -5175,20 +6323,20 @@ static int decode_slice(H264Context *h){
 
         for(;;){
             int ret = decode_mb_cabac(h);
-            int eos = get_cabac_terminate( &h->cabac ); /* End of Slice flag */
+            int eos;
 
             if(ret>=0) hl_decode_mb(h);
 
             /* XXX: useless as decode_mb_cabac it doesn't support that ... */
-            if( ret >= 0 && h->sps.mb_aff ) { //FIXME optimal? or let mb_decode decode 16x32 ?
+            if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
                 s->mb_y++;
 
                 if(ret>=0) ret = decode_mb_cabac(h);
-                eos = get_cabac_terminate( &h->cabac );
 
                 hl_decode_mb(h);
                 s->mb_y--;
             }
+            eos = get_cabac_terminate( &h->cabac );
 
             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
@@ -5199,22 +6347,17 @@ static int decode_slice(H264Context *h){
             if( ++s->mb_x >= s->mb_width ) {
                 s->mb_x = 0;
                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
-                if( ++s->mb_y >= s->mb_height ) {
-                    tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
+                ++s->mb_y;
+                if(h->mb_aff_frame) {
+                    ++s->mb_y;
                 }
             }
 
             if( eos || s->mb_y >= s->mb_height ) {
+                tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
                 return 0;
             }
-#if 0
-            /* TODO test over-reading in cabac code */
-            else if( read too much in h->cabac ) {
-                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
-                return -1;
-            }
-#endif
         }
 
     } else {
@@ -5223,7 +6366,7 @@ static int decode_slice(H264Context *h){
 
             if(ret>=0) hl_decode_mb(h);
 
-            if(ret>=0 && h->sps.mb_aff){ //FIXME optimal? or let mb_decode decode 16x32 ?
+            if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
                 s->mb_y++;
                 ret = decode_mb_cavlc(h);
 
@@ -5241,7 +6384,11 @@ static int decode_slice(H264Context *h){
             if(++s->mb_x >= s->mb_width){
                 s->mb_x=0;
                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
-                if(++s->mb_y >= s->mb_height){
+                ++s->mb_y;
+                if(h->mb_aff_frame) {
+                    ++s->mb_y;
+                }
+                if(s->mb_y >= s->mb_height){
                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
 
                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
@@ -5257,6 +6404,7 @@ static int decode_slice(H264Context *h){
             }
 
             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
+                tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
 
@@ -5318,9 +6466,27 @@ static int decode_slice(H264Context *h){
     return -1; //not reached
 }
 
+static inline void decode_hrd_parameters(H264Context *h, SPS *sps){ /* Skips one hrd_parameters() structure in h->s.gb: every field is read and discarded, sps is not modified. */
+    MpegEncContext * const s = &h->s;
+    int i, cpb_count = get_ue_golomb(&s->gb) + 1; /* cpb_cnt_minus1 + 1 */
+    if(cpb_count > 32U){ av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count); return; } /* H.264 limits cpb_cnt_minus1 to 0..31; the unsigned compare also rejects negative values, so a corrupt count cannot drive the loop past the buffer */
+    get_bits(&s->gb, 4); /* bit_rate_scale */
+    get_bits(&s->gb, 4); /* cpb_size_scale */
+    for(i=0; i<cpb_count; i++){ /* one (bit rate, CPB size, CBR flag) triple per CPB */
+        get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
+        get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
+        get_bits1(&s->gb);     /* cbr_flag */
+    }
+    get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
+    get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
+    get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
+    get_bits(&s->gb, 5); /* time_offset_length */
+}
+
 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
     MpegEncContext * const s = &h->s;
     int aspect_ratio_info_present_flag, aspect_ratio_idc;
+    int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
 
     aspect_ratio_info_present_flag= get_bits1(&s->gb);
     
@@ -5367,29 +6533,27 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps){
         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
     }
 
-#if 0
-| nal_hrd_parameters_present_flag                   |0  |u(1)    |
-| if( nal_hrd_parameters_present_flag  = =  1)      |   |        |
-|  hrd_parameters( )                                |   |        |
-| vcl_hrd_parameters_present_flag                   |0  |u(1)    |
-| if( vcl_hrd_parameters_present_flag  = =  1)      |   |        |
-|  hrd_parameters( )                                |   |        |
-| if( ( nal_hrd_parameters_present_flag  = =  1  | ||   |        |
-|                                                   |   |        |
-|( vcl_hrd_parameters_present_flag  = =  1 ) )      |   |        |
-|  low_delay_hrd_flag                               |0  |u(1)    |
-| bitstream_restriction_flag                        |0  |u(1)    |
-| if( bitstream_restriction_flag ) {                |0  |u(1)    |
-|  motion_vectors_over_pic_boundaries_flag          |0  |u(1)    |
-|  max_bytes_per_pic_denom                          |0  |ue(v)   |
-|  max_bits_per_mb_denom                            |0  |ue(v)   |
-|  log2_max_mv_length_horizontal                    |0  |ue(v)   |
-|  log2_max_mv_length_vertical                      |0  |ue(v)   |
-|  num_reorder_frames                               |0  |ue(v)   |
-|  max_dec_frame_buffering                          |0  |ue(v)   |
-| }                                                 |   |        |
-|}                                                  |   |        |
-#endif
+    nal_hrd_parameters_present_flag = get_bits1(&s->gb);
+    if(nal_hrd_parameters_present_flag)
+        decode_hrd_parameters(h, sps);
+    vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
+    if(vcl_hrd_parameters_present_flag)
+        decode_hrd_parameters(h, sps);
+    if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
+        get_bits1(&s->gb);     /* low_delay_hrd_flag */
+    get_bits1(&s->gb);         /* pic_struct_present_flag */
+
+    sps->bitstream_restriction_flag = get_bits1(&s->gb);
+    if(sps->bitstream_restriction_flag){
+        get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
+        get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
+        get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
+        get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
+        get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
+        sps->num_reorder_frames = get_ue_golomb(&s->gb);
+        get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
+    }
+
     return 0;
 }
 
@@ -5403,14 +6567,15 @@ static inline int decode_seq_parameter_set(H264Context *h){
     get_bits1(&s->gb);   //constraint_set0_flag
     get_bits1(&s->gb);   //constraint_set1_flag
     get_bits1(&s->gb);   //constraint_set2_flag
-    get_bits(&s->gb, 5); // reserved
+    get_bits1(&s->gb);   //constraint_set3_flag
+    get_bits(&s->gb, 4); // reserved
     level_idc= get_bits(&s->gb, 8);
     sps_id= get_ue_golomb(&s->gb);
     
     sps= &h->sps_buffer[ sps_id ];
     sps->profile_idc= profile_idc;
     sps->level_idc= level_idc;
-    
+
     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
     sps->poc_type= get_ue_golomb(&s->gb);
     
@@ -5431,9 +6596,16 @@ static inline int decode_seq_parameter_set(H264Context *h){
     }
 
     sps->ref_frame_count= get_ue_golomb(&s->gb);
+    if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
+        av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
+    }
     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
     sps->mb_width= get_ue_golomb(&s->gb) + 1;
     sps->mb_height= get_ue_golomb(&s->gb) + 1;
+    if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 || 
+       avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
+        return -1;
+
     sps->frame_mbs_only_flag= get_bits1(&s->gb);
     if(!sps->frame_mbs_only_flag)
         sps->mb_aff= get_bits1(&s->gb);
@@ -5561,23 +6733,34 @@ static inline int decode_picture_parameter_set(H264Context *h){
  * finds the end of the current frame in the bitstream.
  * @return the position of the first byte of the next frame, or -1
  */
-static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
+static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
     int i;
     uint32_t state;
+    ParseContext *pc = &(h->s.parse_context);
 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
 //    mb_addr= pc->mb_addr - 1;
     state= pc->state;
-    //FIXME this will fail with slices
-    for(i=0; i<buf_size; i++){
-        state= (state<<8) | buf[i];
+    for(i=0; i<=buf_size; i++){
         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
+            tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
             if(pc->frame_start_found){
-                pc->state=-1; 
-                pc->frame_start_found= 0;
-                return i-3;
+                // If there isn't one more byte in the buffer,
+                // the test on first_mb_in_slice cannot be done yet;
+                // do it at the next call.
+                if (i >= buf_size) break;
+                if (buf[i] & 0x80) {
+                    // first_mb_in_slice is 0, probably the first nal of a new
+                    // slice
+                    tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
+                    pc->state=-1; 
+                    pc->frame_start_found= 0;
+                    return i-4;
+                }
             }
-            pc->frame_start_found= 1;
+            pc->frame_start_found = 1;
         }
+        if (i<buf_size)
+            state= (state<<8) | buf[i];
     }
     
     pc->state= state;
@@ -5589,10 +6772,11 @@ static int h264_parse(AVCodecParserContext *s,
                       uint8_t **poutbuf, int *poutbuf_size, 
                       const uint8_t *buf, int buf_size)
 {
-    ParseContext *pc = s->priv_data;
+    H264Context *h = s->priv_data;
+    ParseContext *pc = &h->s.parse_context;
     int next;
     
-    next= find_frame_end(pc, buf, buf_size);
+    next= find_frame_end(h, buf, buf_size);
 
     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
         *poutbuf = NULL;
@@ -5615,6 +6799,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
         printf("%X ", buf[i]);
     }
 #endif
+    h->slice_num = 0;
     for(;;){
         int consumed;
         int dst_length;
@@ -5626,8 +6811,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
         if(buf_index >= buf_size) break;
         nalsize = 0;
         for(i = 0; i < h->nal_length_size; i++)
-            nalsize = (nalsize << 8) | buf[buf_index + i];
-        buf_index += h->nal_length_size;
+            nalsize = (nalsize << 8) | buf[buf_index++];
       } else {
         // start code prefix search
         for(; buf_index + 3 < buf_size; buf_index++){
@@ -5641,12 +6825,12 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
         buf_index+=3;
       }  
         
-        ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, buf_size - buf_index);
+        ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
         if(ptr[dst_length - 1] == 0) dst_length--;
         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
 
         if(s->avctx->debug&FF_DEBUG_STARTCODE){
-            av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d length %d\n", h->nal_unit_type, buf_index, dst_length);
+            av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
         }
         
         if (h->is_avc && (nalsize != consumed))
@@ -5698,7 +6882,8 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
             if(s->flags& CODEC_FLAG_LOW_DELAY)
                 s->low_delay=1;
       
-            avctx->has_b_frames= !s->low_delay;
+            if(avctx->has_b_frames < 2)
+                avctx->has_b_frames= !s->low_delay;
             break;
         case NAL_PPS:
             init_get_bits(&s->gb, ptr, bit_length);
@@ -5713,13 +6898,12 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
 	default:
 	    av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
         }        
-
-        //FIXME move after where irt is set
-        s->current_picture.pict_type= s->pict_type;
-        s->current_picture.key_frame= s->pict_type == I_TYPE;
     }
     
     if(!s->current_picture_ptr) return buf_index; //no frame
+
+    s->current_picture_ptr->pict_type= s->pict_type;
+    s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
     
     h->prev_frame_num_offset= h->frame_num_offset;
     h->prev_frame_num= h->frame_num;
@@ -5729,8 +6913,6 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
     }
     if(s->current_picture_ptr->reference)
         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
-    else
-        assert(h->mmco_index==0);
 
     ff_er_frame_end(s);
 
@@ -5774,7 +6956,7 @@ static int decode_frame(AVCodecContext *avctx,
     }
     
     if(s->flags&CODEC_FLAG_TRUNCATED){
-        int next= find_frame_end(&s->parse_context, buf, buf_size);
+        int next= find_frame_end(h, buf, buf_size);
         
         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
             return buf_size;
@@ -5782,7 +6964,7 @@ static int decode_frame(AVCodecContext *avctx,
     }
 
     if(h->is_avc && !h->got_avcC) {
-        int i, cnt, poffs;
+        int i, cnt, nalsize;
         unsigned char *p = avctx->extradata;
         if(avctx->extradata_size < 7) {
             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
@@ -5795,27 +6977,29 @@ static int decode_frame(AVCodecContext *avctx,
         /* sps and pps in the avcC always have length coded with 2 bytes,
            so put a fake nal_length_size = 2 while parsing them */
         h->nal_length_size = 2;
-        poffs = 6;
         // Decode sps from avcC
         cnt = *(p+5) & 0x1f; // Number of sps
+        p += 6;
         for (i = 0; i < cnt; i++) {
-            if(decode_nal_units(h, p + poffs, BE_16(p + poffs) + 2) != BE_16(p + poffs) + 2) {
+            nalsize = BE_16(p) + 2;
+            if(decode_nal_units(h, p, nalsize) != nalsize) {
                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
                 return -1;
             }
-            poffs += BE_16(p + poffs) + 2;
+            p += nalsize;
         }        
         // Decode pps from avcC
-        cnt = *(p + poffs++); // Number of pps
+        cnt = *(p++); // Number of pps
         for (i = 0; i < cnt; i++) {
-            if(decode_nal_units(h, p + poffs, BE_16(p + poffs) + 2)  != BE_16(p + poffs) + 2) {
+            nalsize = BE_16(p) + 2;
+            if(decode_nal_units(h, p, nalsize)  != nalsize) {
                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
                 return -1;
             }
-            poffs += BE_16(p + poffs) + 2;
+            p += nalsize;
         }        
         // Now store right nal length size, that will be use to parse all other nals
-        h->nal_length_size = ((*(p+4))&0x03)+1;
+        h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
         // Do not reparse avcC
         h->got_avcC = 1;
     }
@@ -5832,21 +7016,83 @@ static int decode_frame(AVCodecContext *avctx,
     //FIXME do something with unavailable reference frames    
  
 //    if(ret==FRAME_SKIPED) return get_consumed_bytes(s, buf_index, buf_size);
-#if 0
-    if(s->pict_type==B_TYPE || s->low_delay){
-        *pict= *(AVFrame*)&s->current_picture;
-    } else {
-        *pict= *(AVFrame*)&s->last_picture;
-    }
-#endif
     if(!s->current_picture_ptr){
         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
         return -1;
     }
 
-    *pict= *(AVFrame*)&s->current_picture; //FIXME 
-    ff_print_debug_info(s, pict);
+    {
+        Picture *out = s->current_picture_ptr;
+#if 0 //decode order
+        *data_size = sizeof(AVFrame);
+#else
+        /* Sort B-frames into display order */
+        Picture *cur = s->current_picture_ptr;
+        Picture *prev = h->delayed_output_pic;
+        int out_idx = 0;
+        int pics = 0;
+        int out_of_order;
+        int cross_idr = 0;
+        int dropped_frame = 0;
+        int i;
+
+        if(h->sps.bitstream_restriction_flag
+           && s->avctx->has_b_frames < h->sps.num_reorder_frames){
+            s->avctx->has_b_frames = h->sps.num_reorder_frames;
+            s->low_delay = 0;
+        }
+
+        while(h->delayed_pic[pics]) pics++;
+        h->delayed_pic[pics++] = cur;
+        if(cur->reference == 0)
+            cur->reference = 1;
+
+        for(i=0; h->delayed_pic[i]; i++)
+            if(h->delayed_pic[i]->key_frame)
+                cross_idr = 1;
+
+        out = h->delayed_pic[0];
+        for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
+            if(h->delayed_pic[i]->poc < out->poc){
+                out = h->delayed_pic[i];
+                out_idx = i;
+            }
+
+        out_of_order = !cross_idr && prev && out->poc < prev->poc;
+        if(prev && pics <= s->avctx->has_b_frames)
+            out = prev;
+        else if((out_of_order && pics-1 == s->avctx->has_b_frames)
+           || (s->low_delay && 
+            ((!cross_idr && prev && out->poc > prev->poc + 2)
+             || cur->pict_type == B_TYPE)))
+        {
+            s->low_delay = 0;
+            s->avctx->has_b_frames++;
+            out = prev;
+        }
+        else if(out_of_order)
+            out = prev;
+
+        if(out_of_order || pics > s->avctx->has_b_frames){
+            dropped_frame = (out != h->delayed_pic[out_idx]);
+            for(i=out_idx; h->delayed_pic[i]; i++)
+                h->delayed_pic[i] = h->delayed_pic[i+1];
+        }
+
+        if(prev == out && !dropped_frame)
+            *data_size = 0;
+        else
+            *data_size = sizeof(AVFrame);
+        if(prev && prev != out && prev->reference == 1)
+            prev->reference = 0;
+        h->delayed_output_pic = out;
+#endif
+
+        *pict= *(AVFrame*)out;
+    }
+
     assert(pict->data[0]);
+    ff_print_debug_info(s, pict);
 //printf("out %d\n", (int)pict->data[0]);
 #if 0 //?
 
@@ -5854,12 +7100,6 @@ static int decode_frame(AVCodecContext *avctx,
     /* we substract 1 because it is added on utils.c    */
     avctx->frame_number = s->picture_number - 1;
 #endif
-#if 0
-    /* dont output the last pic after seeking */
-    if(s->last_picture_ptr || s->low_delay)
-    //Note this isnt a issue as a IDR pic should flush teh buffers
-#endif
-        *data_size = sizeof(AVFrame);
     return get_consumed_bytes(s, buf_index, buf_size);
 }
 #if 0
@@ -5970,7 +7210,7 @@ int main(){
         }
 //        printf("\n");
         
-        h264_add_idct_c(ref, block, 4);
+        s->dsp.h264_idct_add(ref, block, 4);
 /*        for(j=0; j<16; j++){
             printf("%d ", ref[j]);
         }
@@ -6079,12 +7319,12 @@ AVCodec h264_decoder = {
     NULL,
     decode_end,
     decode_frame,
-    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
+    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
 };
 
 AVCodecParser h264_parser = {
     { CODEC_ID_H264 },
-    sizeof(ParseContext),
+    sizeof(H264Context),
     NULL,
     h264_parse,
     ff_parse_close,
diff --git a/src/libffmpeg/libavcodec/h264data.h b/src/libffmpeg/libavcodec/h264data.h
index 5480becd4..21d2260e8 100644
--- a/src/libffmpeg/libavcodec/h264data.h
+++ b/src/libffmpeg/libavcodec/h264data.h
@@ -353,8 +353,8 @@ static const PMbInfo p_mb_type_info[5]={
 {MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
 {MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
 {MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
-{MB_TYPE_8x8                            , 4},
-{MB_TYPE_8x8  |MB_TYPE_REF0             , 4},
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4},
 };
 
 static const PMbInfo p_sub_mb_type_info[4]={
@@ -387,7 +387,7 @@ static const PMbInfo b_mb_type_info[23]={
 {MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
 {MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
 {MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
-{MB_TYPE_8x8                                                      , 4, },
+{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
 };
 
 static const PMbInfo b_sub_mb_type_info[13]={
diff --git a/src/libffmpeg/libavcodec/h264idct.c b/src/libffmpeg/libavcodec/h264idct.c
new file mode 100755
index 000000000..551197d37
--- /dev/null
+++ b/src/libffmpeg/libavcodec/h264idct.c
@@ -0,0 +1,70 @@
+/*
+ * H.264 IDCT
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+ 
+/**
+ * @file h264idct.c
+ * H.264 IDCT.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+ 
+#include "dsputil.h"
+
+static always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){ /* 4x4 inverse transform: reads coefficients from block (row pitch block_stride, overwritten in place), scales by >>shift and stores through cm[] into dst (row pitch stride); add multiplies the existing dst pixel, so 1 accumulates and 0 overwrites */
+    int i;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* table indexed by the unclamped sum; presumably saturates to 0..255 -- cropTbl is defined outside this file, confirm */
+
+    block[0] += 1<<(shift-1); /* block[0] feeds every output sample, so this adds half of 1<<shift to each result before the final >>shift (rounding) */
+
+    for(i=0; i<4; i++){ /* first pass: butterfly along row i of block, results written back in place */
+        const int z0=  block[0 + block_stride*i]     +  block[2 + block_stride*i];
+        const int z1=  block[0 + block_stride*i]     -  block[2 + block_stride*i];
+        const int z2= (block[1 + block_stride*i]>>1) -  block[3 + block_stride*i];
+        const int z3=  block[1 + block_stride*i]     + (block[3 + block_stride*i]>>1);
+
+        block[0 + block_stride*i]= z0 + z3;
+        block[1 + block_stride*i]= z1 + z2;
+        block[2 + block_stride*i]= z1 - z2;
+        block[3 + block_stride*i]= z0 - z3;
+    }
+
+    for(i=0; i<4; i++){ /* second pass: same butterfly down column i, then scale and store the four pixels of dst column i */
+        const int z0=  block[i + block_stride*0]     +  block[i + block_stride*2];
+        const int z1=  block[i + block_stride*0]     -  block[i + block_stride*2];
+        const int z2= (block[i + block_stride*1]>>1) -  block[i + block_stride*3];
+        const int z3=  block[i + block_stride*1]     + (block[i + block_stride*3]>>1);
+
+        dst[i + 0*stride]= cm[ add*dst[i + 0*stride] + ((z0 + z3) >> shift) ]; /* add==0 drops the old pixel, add==1 keeps it as the base value */
+        dst[i + 1*stride]= cm[ add*dst[i + 1*stride] + ((z1 + z2) >> shift) ];
+        dst[i + 2*stride]= cm[ add*dst[i + 2*stride] + ((z1 - z2) >> shift) ];
+        dst[i + 3*stride]= cm[ add*dst[i + 3*stride] + ((z0 - z3) >> shift) ];
+    }
+}
+
+void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){ /* 4x4 inverse transform of block, result added onto the pixels at dst (row pitch stride); block is modified */
+    idct_internal(dst, block, stride, 4, 6, 1); /* coefficient rows 4 apart, result >>6, add=1 */
+}
+
+void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){ /* NOTE(review): argument order is (dst, stride, block), unlike ff_h264_idct_add_c -- presumably to match a different function-pointer type, confirm */
+    idct_internal(dst, block, stride, 8, 3, 1); /* coefficient rows 8 apart, result >>3, add=1 (accumulate into dst) */
+}
+
+void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){ /* same transform parameters as ff_h264_lowres_idct_add_c, but the result replaces dst instead of being added to it */
+    idct_internal(dst, block, stride, 8, 3, 0); /* coefficient rows 8 apart, result >>3, add=0 (overwrite dst) */
+}
diff --git a/src/libffmpeg/libavcodec/huffyuv.c b/src/libffmpeg/libavcodec/huffyuv.c
index ecc6a5fa2..5b496c512 100644
--- a/src/libffmpeg/libavcodec/huffyuv.c
+++ b/src/libffmpeg/libavcodec/huffyuv.c
@@ -27,11 +27,22 @@
  */
 
 #include "common.h"
+#include "bitstream.h"
 #include "avcodec.h"
 #include "dsputil.h"
 
 #define VLC_BITS 11
 
+#ifdef WORDS_BIGENDIAN /* byte offsets of the blue/green/red samples inside a 4-byte pixel; used as 4*i+B, 4*i+G, 4*i+R by the bgr32 code in this file */
+#define B 3
+#define G 2
+#define R 1
+#else /* little-endian layout: blue first */
+#define B 0
+#define G 1
+#define R 2
+#endif
+
 typedef enum Predictor{
     LEFT= 0,
     PLANE,
@@ -51,15 +62,17 @@ typedef struct HYuvContext{
     int bgr32;                              //use bgr32 instead of bgr24
     int width, height;
     int flags;
+    int context;
     int picture_number;
     int last_slice_end;
-    uint8_t __align8 temp[3][2560];
+    uint8_t *temp[3];
     uint64_t stats[3][256];
     uint8_t len[3][256];
     uint32_t bits[3][256];
     VLC vlc[3];
     AVFrame picture;
-    uint8_t __align8 bitstream_buffer[1024*1024*3]; //FIXME dynamic alloc or some other solution
+    uint8_t *bitstream_buffer;
+    int bitstream_buffer_size;
     DSPContext dsp; 
 }HYuvContext;
 
@@ -157,13 +170,13 @@ static inline void add_left_prediction_bgr32(uint8_t *dst, uint8_t *src, int w,
     b= *blue;
 
     for(i=0; i<w; i++){
-        b+= src[4*i+0];
-        g+= src[4*i+1];
-        r+= src[4*i+2];
+        b+= src[4*i+B];
+        g+= src[4*i+G];
+        r+= src[4*i+R];
         
-        dst[4*i+0]= b;
-        dst[4*i+1]= g;
-        dst[4*i+2]= r;
+        dst[4*i+B]= b;
+        dst[4*i+G]= g;
+        dst[4*i+R]= r;
     }
 
     *red= r;
@@ -271,7 +284,7 @@ static void generate_len_table(uint8_t *dst, uint64_t *stats, int size){
             for(len=0; up[index] != -1; len++)
                 index= up[index];
                 
-            if(len > 32) break;
+            if(len >= 32) break;
             
             dst[i]= len;
         }
@@ -296,10 +309,11 @@ for(j=0; j<256; j++){
 printf("%6X, %2d,  %3d\n", s->bits[i][j], s->len[i][j], j);
 }
 #endif
-        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1, s->bits[i], 4, 4);
+        free_vlc(&s->vlc[i]);
+        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1, s->bits[i], 4, 4, 0);
     }
     
-    return 0;
+    return (get_bits_count(&gb)+7)/8;
 }
 
 static int read_old_huffman_tables(HYuvContext *s){
@@ -322,8 +336,10 @@ static int read_old_huffman_tables(HYuvContext *s){
     memcpy(s->bits[2], s->bits[1], 256*sizeof(uint32_t));
     memcpy(s->len[2] , s->len [1], 256*sizeof(uint8_t));
     
-    for(i=0; i<3; i++)
-        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1, s->bits[i], 4, 4);
+    for(i=0; i<3; i++){
+        free_vlc(&s->vlc[i]);
+        init_vlc(&s->vlc[i], VLC_BITS, 256, s->len[i], 1, 1, s->bits[i], 4, 4, 0);
+    }
     
     return 0;
 #else
@@ -332,22 +348,44 @@ static int read_old_huffman_tables(HYuvContext *s){
 #endif
 }
 
-static int decode_init(AVCodecContext *avctx)
-{
+static void alloc_temp(HYuvContext *s){ /* allocate the s->temp[] scratch buffers, sized from s->width and s->bitstream_bpp */
+    int i;
+    
+    if(s->bitstream_bpp<24){ /* below 24 bpp: three buffers of width+16 bytes each */
+        for(i=0; i<3; i++){
+            s->temp[i]= av_malloc(s->width + 16); /* NOTE(review): the av_malloc result is not checked here */
+        }
+    }else{ /* 24 bpp and above: one buffer of 4 bytes per pixel plus 16; temp[1] and temp[2] are not assigned in this branch */
+        s->temp[0]= av_malloc(4*s->width + 16);
+    }
+}
+
+static int common_init(AVCodecContext *avctx){ /* setup shared by decode_init and encode_init: stores avctx, its flags and its dimensions in the context and initialises s->dsp; always returns 0 */
     HYuvContext *s = avctx->priv_data;
-    int width, height;
 
     s->avctx= avctx;
     s->flags= avctx->flags;
         
     dsputil_init(&s->dsp, avctx);
     
-    width= s->width= avctx->width;
-    height= s->height= avctx->height;
+    s->width= avctx->width;
+    s->height= avctx->height;
+    assert(s->width>0 && s->height>0); /* both dimensions must be positive; this is only checked when assert() is compiled in */
+        
+    return 0;
+}
+
+static int decode_init(AVCodecContext *avctx)
+{
+    HYuvContext *s = avctx->priv_data;
+
+    common_init(avctx);
+    memset(s->vlc, 0, 3*sizeof(VLC));
+    
     avctx->coded_frame= &s->picture;
+    s->interlaced= s->height > 288;
 
 s->bgr32=1;
-    assert(width && height);
 //if(avctx->extradata)
 //  printf("extradata:%X, extradata_size:%d\n", *(uint32_t*)avctx->extradata, avctx->extradata_size);
     if(avctx->extradata_size){
@@ -359,7 +397,7 @@ s->bgr32=1;
         s->version=0;
     
     if(s->version==2){
-        int method;
+        int method, interlace;
 
         method= ((uint8_t*)avctx->extradata)[0];
         s->decorrelate= method&64 ? 1 : 0;
@@ -367,6 +405,9 @@ s->bgr32=1;
         s->bitstream_bpp= ((uint8_t*)avctx->extradata)[1];
         if(s->bitstream_bpp==0) 
             s->bitstream_bpp= avctx->bits_per_sample&~7;
+        interlace= (((uint8_t*)avctx->extradata)[2] & 0x30) >> 4;
+        s->interlaced= (interlace==1) ? 1 : (interlace==2) ? 0 : s->interlaced;
+        s->context= ((uint8_t*)avctx->extradata)[2] & 0x40 ? 1 : 0;
             
         if(read_huffman_tables(s, ((uint8_t*)avctx->extradata)+4, avctx->extradata_size) < 0)
             return -1;
@@ -394,13 +435,12 @@ s->bgr32=1;
             break;
         }
         s->bitstream_bpp= avctx->bits_per_sample & ~7;
+        s->context= 0;
         
         if(read_old_huffman_tables(s) < 0)
             return -1;
     }
     
-    s->interlaced= height > 288;
-    
     switch(s->bitstream_bpp){
     case 12:
         avctx->pix_fmt = PIX_FMT_YUV420P;
@@ -424,14 +464,16 @@ s->bgr32=1;
         assert(0);
     }
     
-//    printf("pred:%d bpp:%d hbpp:%d il:%d\n", s->predictor, s->bitstream_bpp, avctx->bits_per_sample, s->interlaced);
+    alloc_temp(s);
     
+//    av_log(NULL, AV_LOG_DEBUG, "pred:%d bpp:%d hbpp:%d il:%d\n", s->predictor, s->bitstream_bpp, avctx->bits_per_sample, s->interlaced);
+
     return 0;
 }
 
-static void store_table(HYuvContext *s, uint8_t *len){
+static int store_table(HYuvContext *s, uint8_t *len, uint8_t *buf){
     int i;
-    int index= s->avctx->extradata_size;
+    int index= 0;
 
     for(i=0; i<256;){
         int val= len[i];
@@ -442,43 +484,31 @@ static void store_table(HYuvContext *s, uint8_t *len){
         
         assert(val < 32 && val >0 && repeat<256 && repeat>0);
         if(repeat>7){
-            ((uint8_t*)s->avctx->extradata)[index++]= val;
-            ((uint8_t*)s->avctx->extradata)[index++]= repeat;
+            buf[index++]= val;
+            buf[index++]= repeat;
         }else{
-            ((uint8_t*)s->avctx->extradata)[index++]= val | (repeat<<5);
+            buf[index++]= val | (repeat<<5);
         }
     }
     
-    s->avctx->extradata_size= index;
+    return index;
 }
 
 static int encode_init(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
-    int i, j, width, height;
+    int i, j;
 
-    s->avctx= avctx;
-    s->flags= avctx->flags;
-        
-    dsputil_init(&s->dsp, avctx);
-    
-    width= s->width= avctx->width;
-    height= s->height= avctx->height;
-    
-    assert(width && height);
+    common_init(avctx);
     
-    avctx->extradata= av_mallocz(1024*30);
-    avctx->stats_out= av_mallocz(1024*30);
+    avctx->extradata= av_mallocz(1024*30); // 256*3+4 == 772
+    avctx->stats_out= av_mallocz(1024*30); // 21*256*3(%llu ) + 3(\n) + 1(0) = 16132
     s->version=2;
     
     avctx->coded_frame= &s->picture;
     
     switch(avctx->pix_fmt){
     case PIX_FMT_YUV420P:
-        if(avctx->strict_std_compliance>=0){
-            av_log(avctx, AV_LOG_ERROR, "YV12-huffyuv is experimental, there WILL be no compatbility! (use (v)strict=-1)\n");
-            return -1;
-        }
         s->bitstream_bpp= 12;
         break;
     case PIX_FMT_YUV422P:
@@ -491,10 +521,36 @@ static int encode_init(AVCodecContext *avctx)
     avctx->bits_per_sample= s->bitstream_bpp;
     s->decorrelate= s->bitstream_bpp >= 24;
     s->predictor= avctx->prediction_method;
+    s->interlaced= avctx->flags&CODEC_FLAG_INTERLACED_ME ? 1 : 0;
+    if(avctx->context_model==1){
+        s->context= avctx->context_model;
+        if(s->flags & (CODEC_FLAG_PASS1|CODEC_FLAG_PASS2)){
+            av_log(avctx, AV_LOG_ERROR, "context=1 is not compatible with 2 pass huffyuv encoding\n");
+            return -1;
+        }
+    }else s->context= 0;
+    
+    if(avctx->codec->id==CODEC_ID_HUFFYUV){
+        if(avctx->pix_fmt==PIX_FMT_YUV420P){
+            av_log(avctx, AV_LOG_ERROR, "Error: YV12 is not supported by huffyuv; use vcodec=ffvhuff or format=422p\n");
+            return -1;
+        }
+        if(avctx->context_model){
+            av_log(avctx, AV_LOG_ERROR, "Error: per-frame huffman tables are not supported by huffyuv; use vcodec=ffvhuff\n");
+            return -1;
+        }
+        if(s->interlaced != ( s->height > 288 ))
+            av_log(avctx, AV_LOG_INFO, "using huffyuv 2.2.0 or newer interlacing flag\n");
+    }else if(avctx->strict_std_compliance>=0){
+        av_log(avctx, AV_LOG_ERROR, "This codec is under development; files encoded with it may not be decodeable with future versions!!! Set vstrict=-1 / -strict -1 to use it anyway.\n");
+        return -1;
+    }
     
     ((uint8_t*)avctx->extradata)[0]= s->predictor;
     ((uint8_t*)avctx->extradata)[1]= s->bitstream_bpp;
-    ((uint8_t*)avctx->extradata)[2]=
+    ((uint8_t*)avctx->extradata)[2]= s->interlaced ? 0x10 : 0x20;
+    if(s->context)
+        ((uint8_t*)avctx->extradata)[2]|= 0x40;
     ((uint8_t*)avctx->extradata)[3]= 0;
     s->avctx->extradata_size= 4;
     
@@ -533,17 +589,28 @@ static int encode_init(AVCodecContext *avctx)
             return -1;
         }
         
-        store_table(s, s->len[i]);
+        s->avctx->extradata_size+=
+        store_table(s, s->len[i], &((uint8_t*)s->avctx->extradata)[s->avctx->extradata_size]);
     }
 
-    for(i=0; i<3; i++)
-        for(j=0; j<256; j++)
-            s->stats[i][j]= 0;
+    if(s->context){
+        for(i=0; i<3; i++){
+            int pels = s->width*s->height / (i?40:10);
+            for(j=0; j<256; j++){
+                int d= FFMIN(j, 256-j);
+                s->stats[i][j]= pels/(d+1);
+            }
+        }
+    }else{
+        for(i=0; i<3; i++)
+            for(j=0; j<256; j++)
+                s->stats[i][j]= 0;
+    }
     
-    s->interlaced= height > 288;
-
 //    printf("pred:%d bpp:%d hbpp:%d il:%d\n", s->predictor, s->bitstream_bpp, avctx->bits_per_sample, s->interlaced);
 
+    alloc_temp(s);
+
     s->picture_number=0;
 
     return 0;
@@ -573,9 +640,14 @@ static void decode_gray_bitstream(HYuvContext *s, int count){
     }
 }
 
-static void encode_422_bitstream(HYuvContext *s, int count){
+static int encode_422_bitstream(HYuvContext *s, int count){
     int i;
     
+    if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < 2*4*count){
+        av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+        return -1;
+    }
+    
     count/=2;
     if(s->flags&CODEC_FLAG_PASS1){
         for(i=0; i<count; i++){
@@ -584,6 +656,20 @@ static void encode_422_bitstream(HYuvContext *s, int count){
             s->stats[0][ s->temp[0][2*i+1] ]++;
             s->stats[2][ s->temp[2][  i  ] ]++;
         }
+    }
+    if(s->avctx->flags2&CODEC_FLAG2_NO_OUTPUT)
+        return 0;
+    if(s->context){
+        for(i=0; i<count; i++){
+            s->stats[0][ s->temp[0][2*i  ] ]++;
+            put_bits(&s->pb, s->len[0][ s->temp[0][2*i  ] ], s->bits[0][ s->temp[0][2*i  ] ]);
+            s->stats[1][ s->temp[1][  i  ] ]++;
+            put_bits(&s->pb, s->len[1][ s->temp[1][  i  ] ], s->bits[1][ s->temp[1][  i  ] ]);
+            s->stats[0][ s->temp[0][2*i+1] ]++;
+            put_bits(&s->pb, s->len[0][ s->temp[0][2*i+1] ], s->bits[0][ s->temp[0][2*i+1] ]);
+            s->stats[2][ s->temp[2][  i  ] ]++;
+            put_bits(&s->pb, s->len[2][ s->temp[2][  i  ] ], s->bits[2][ s->temp[2][  i  ] ]);
+        }
     }else{
         for(i=0; i<count; i++){
             put_bits(&s->pb, s->len[0][ s->temp[0][2*i  ] ], s->bits[0][ s->temp[0][2*i  ] ]);
@@ -592,23 +678,41 @@ static void encode_422_bitstream(HYuvContext *s, int count){
             put_bits(&s->pb, s->len[2][ s->temp[2][  i  ] ], s->bits[2][ s->temp[2][  i  ] ]);
         }
     }
+    return 0;
 }
 
-static void encode_gray_bitstream(HYuvContext *s, int count){
+static int encode_gray_bitstream(HYuvContext *s, int count){ /* codes count samples from s->temp[0] with table 0; returns 0 on success (or when output is suppressed), -1 if the output buffer has too little room */
     int i;
     
+    if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < 4*count){ /* remaining bytes = buffer size minus whole bytes already written; requires 4 bytes per sample (presumably a worst-case code length bound -- confirm) */
+        av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+        return -1;
+    }
+
     count/=2;
     if(s->flags&CODEC_FLAG_PASS1){
         for(i=0; i<count; i++){
             s->stats[0][ s->temp[0][2*i  ] ]++;
             s->stats[0][ s->temp[0][2*i+1] ]++;
         }
+    }
+    if(s->avctx->flags2&CODEC_FLAG2_NO_OUTPUT) /* with NO_OUTPUT set nothing is written; only the pass-1 statistics above (if enabled) were updated */
+        return 0;
+    
+    if(s->context){ /* context mode: count every symbol in s->stats[0] while emitting its code */
+        for(i=0; i<count; i++){
+            s->stats[0][ s->temp[0][2*i  ] ]++;
+            put_bits(&s->pb, s->len[0][ s->temp[0][2*i  ] ], s->bits[0][ s->temp[0][2*i  ] ]);
+            s->stats[0][ s->temp[0][2*i+1] ]++;
+            put_bits(&s->pb, s->len[0][ s->temp[0][2*i+1] ], s->bits[0][ s->temp[0][2*i+1] ]);
+        }
     }else{
         for(i=0; i<count; i++){
             put_bits(&s->pb, s->len[0][ s->temp[0][2*i  ] ], s->bits[0][ s->temp[0][2*i  ] ]);
             put_bits(&s->pb, s->len[0][ s->temp[0][2*i+1] ], s->bits[0][ s->temp[0][2*i+1] ]);
         }
     }
+    return 0;
 }
 
 static void decode_bgr_bitstream(HYuvContext *s, int count){
@@ -617,30 +721,30 @@ static void decode_bgr_bitstream(HYuvContext *s, int count){
     if(s->decorrelate){
         if(s->bitstream_bpp==24){
             for(i=0; i<count; i++){
-                s->temp[0][4*i+1]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
-                s->temp[0][4*i  ]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) + s->temp[0][4*i+1];
-                s->temp[0][4*i+2]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) + s->temp[0][4*i+1];
+                s->temp[0][4*i+G]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
+                s->temp[0][4*i+B]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) + s->temp[0][4*i+G];
+                s->temp[0][4*i+R]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) + s->temp[0][4*i+G];
             }
         }else{
             for(i=0; i<count; i++){
-                s->temp[0][4*i+1]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
-                s->temp[0][4*i  ]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) + s->temp[0][4*i+1];
-                s->temp[0][4*i+2]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) + s->temp[0][4*i+1]; 
+                s->temp[0][4*i+G]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
+                s->temp[0][4*i+B]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3) + s->temp[0][4*i+G];
+                s->temp[0][4*i+R]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3) + s->temp[0][4*i+G]; 
                                    get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); //?!
             }
         }
     }else{
         if(s->bitstream_bpp==24){
             for(i=0; i<count; i++){
-                s->temp[0][4*i  ]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
-                s->temp[0][4*i+1]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
-                s->temp[0][4*i+2]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); 
+                s->temp[0][4*i+B]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+                s->temp[0][4*i+G]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
+                s->temp[0][4*i+R]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); 
             }
         }else{
             for(i=0; i<count; i++){
-                s->temp[0][4*i  ]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
-                s->temp[0][4*i+1]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
-                s->temp[0][4*i+2]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); 
+                s->temp[0][4*i+B]= get_vlc2(&s->gb, s->vlc[0].table, VLC_BITS, 3);
+                s->temp[0][4*i+G]= get_vlc2(&s->gb, s->vlc[1].table, VLC_BITS, 3); 
+                s->temp[0][4*i+R]= get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); 
                                    get_vlc2(&s->gb, s->vlc[2].table, VLC_BITS, 3); //?!
             }
         }
@@ -681,17 +785,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
     const int height= s->height;
     int fake_ystride, fake_ustride, fake_vstride;
     AVFrame * const p= &s->picture;
+    int table_size= 0;
 
     AVFrame *picture = data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
+    s->bitstream_buffer= av_fast_realloc(s->bitstream_buffer, &s->bitstream_buffer_size, buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
 
     s->dsp.bswap_buf((uint32_t*)s->bitstream_buffer, (uint32_t*)buf, buf_size/4);
     
-    init_get_bits(&s->gb, s->bitstream_buffer, buf_size*8);
-
     if(p->data[0])
         avctx->release_buffer(avctx, p);
 
@@ -700,6 +801,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         return -1;
     }
+    
+    if(s->context){
+        table_size = read_huffman_tables(s, s->bitstream_buffer, buf_size);
+        if(table_size < 0)
+            return -1;
+    }
+
+    init_get_bits(&s->gb, s->bitstream_buffer+table_size, (buf_size-table_size)*8);
 
     fake_ystride= s->interlaced ? p->linesize[0]*2  : p->linesize[0];
     fake_ustride= s->interlaced ? p->linesize[1]*2  : p->linesize[1];
@@ -858,14 +967,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
         const int last_line= (height-1)*p->linesize[0];
         
         if(s->bitstream_bpp==32){
-                   p->data[0][last_line+3]= get_bits(&s->gb, 8);
-            leftr= p->data[0][last_line+2]= get_bits(&s->gb, 8);
-            leftg= p->data[0][last_line+1]= get_bits(&s->gb, 8);
-            leftb= p->data[0][last_line+0]= get_bits(&s->gb, 8);
+            skip_bits(&s->gb, 8);
+            leftr= p->data[0][last_line+R]= get_bits(&s->gb, 8);
+            leftg= p->data[0][last_line+G]= get_bits(&s->gb, 8);
+            leftb= p->data[0][last_line+B]= get_bits(&s->gb, 8);
         }else{
-            leftr= p->data[0][last_line+2]= get_bits(&s->gb, 8);
-            leftg= p->data[0][last_line+1]= get_bits(&s->gb, 8);
-            leftb= p->data[0][last_line+0]= get_bits(&s->gb, 8);
+            leftr= p->data[0][last_line+R]= get_bits(&s->gb, 8);
+            leftg= p->data[0][last_line+G]= get_bits(&s->gb, 8);
+            leftb= p->data[0][last_line+B]= get_bits(&s->gb, 8);
             skip_bits(&s->gb, 8);
         }
         
@@ -881,7 +990,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
                     
                     add_left_prediction_bgr32(p->data[0] + p->linesize[0]*y, s->temp[0], width, &leftr, &leftg, &leftb);
                     if(s->predictor == PLANE){
-                        if((y&s->interlaced)==0){
+                        if((y&s->interlaced)==0 && y<s->height-1-s->interlaced){
                             s->dsp.add_bytes(p->data[0] + p->linesize[0]*y, 
                                              p->data[0] + p->linesize[0]*y + fake_ystride, fake_ystride);
                         }
@@ -906,11 +1015,23 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
     return (get_bits_count(&s->gb)+31)/32*4;
 }
 
+static int common_end(HYuvContext *s){ /* teardown shared by decode_end and encode_end: releases the s->temp[] scratch buffers; always returns 0 */
+    int i;
+    
+    for(i=0; i<3; i++){ /* all three slots are passed to av_freep, including any that alloc_temp did not assign */
+        av_freep(&s->temp[i]);
+    }
+    return 0;
+}
+
 static int decode_end(AVCodecContext *avctx)
 {
     HYuvContext *s = avctx->priv_data;
     int i;
     
+    common_end(s);
+    av_freep(&s->bitstream_buffer);
+    
     for(i=0; i<3; i++){
         free_vlc(&s->vlc[i]);
     }
@@ -928,14 +1049,27 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     const int fake_ustride= s->interlaced ? pict->linesize[1]*2  : pict->linesize[1];
     const int fake_vstride= s->interlaced ? pict->linesize[2]*2  : pict->linesize[2];
     AVFrame * const p= &s->picture;
-    int i, size;
+    int i, j, size=0;
 
-    init_put_bits(&s->pb, buf, buf_size);
-    
     *p = *pict;
     p->pict_type= FF_I_TYPE;
     p->key_frame= 1;
     
+    if(s->context){
+        for(i=0; i<3; i++){
+            generate_len_table(s->len[i], s->stats[i], 256);
+            if(generate_bits_table(s->bits[i], s->len[i])<0)
+                return -1;
+            size+= store_table(s, s->len[i], &buf[size]);
+        }
+
+        for(i=0; i<3; i++)
+            for(j=0; j<256; j++)
+                s->stats[i][j] >>= 1;
+    }
+
+    init_put_bits(&s->pb, buf+size, buf_size-size);
+
     if(avctx->pix_fmt == PIX_FMT_YUV422P || avctx->pix_fmt == PIX_FMT_YUV420P){
         int lefty, leftu, leftv, y, cy;
 
@@ -963,8 +1097,8 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
             }
             
             lefty= sub_left_prediction(s, s->temp[0], p->data[0]+fake_ystride, 4, lefty);
-            leftu= sub_left_prediction(s, s->temp[1], p->data[1]+fake_ystride, 2, leftu);
-            leftv= sub_left_prediction(s, s->temp[2], p->data[2]+fake_ystride, 2, leftv);
+            leftu= sub_left_prediction(s, s->temp[1], p->data[1]+fake_ustride, 2, leftu);
+            leftv= sub_left_prediction(s, s->temp[2], p->data[2]+fake_vstride, 2, leftv);
         
             encode_422_bitstream(s, 4);
 
@@ -1026,11 +1160,11 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
                 if(s->predictor == PLANE && s->interlaced < cy){
                     s->dsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
                     s->dsp.diff_bytes(s->temp[2], udst, udst - fake_ustride, width2);
-                    s->dsp.diff_bytes(s->temp[2] + 1250, vdst, vdst - fake_vstride, width2);
+                    s->dsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
 
                     lefty= sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
                     leftu= sub_left_prediction(s, s->temp[1], s->temp[2], width2, leftu);
-                    leftv= sub_left_prediction(s, s->temp[2], s->temp[2] + 1250, width2, leftv);
+                    leftv= sub_left_prediction(s, s->temp[2], s->temp[2] + width2, width2, leftv);
                 }else{
                     lefty= sub_left_prediction(s, s->temp[0], ydst, width , lefty);
                     leftu= sub_left_prediction(s, s->temp[1], udst, width2, leftu);
@@ -1045,23 +1179,27 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     }
     emms_c();
     
-    size= (put_bits_count(&s->pb)+31)/32;
+    size+= (put_bits_count(&s->pb)+31)/8;
+    size/= 4;
     
     if((s->flags&CODEC_FLAG_PASS1) && (s->picture_number&31)==0){
         int j;
         char *p= avctx->stats_out;
+        char *end= p + 1024*30;
         for(i=0; i<3; i++){
             for(j=0; j<256; j++){
-                sprintf(p, "%llu ", s->stats[i][j]);
+                snprintf(p, end-p, "%llu ", s->stats[i][j]);
                 p+= strlen(p);
                 s->stats[i][j]= 0;
             }
-            sprintf(p, "\n");
+            snprintf(p, end-p, "\n");
             p++;
         }
-    }else{
+    }
+    if(!(s->avctx->flags2 & CODEC_FLAG2_NO_OUTPUT)){
         flush_put_bits(&s->pb);
         s->dsp.bswap_buf((uint32_t*)buf, (uint32_t*)buf, size);
+        avctx->stats_out[0] = '\0';
     }
     
     s->picture_number++;
@@ -1071,7 +1209,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 
 static int encode_end(AVCodecContext *avctx)
 {
-//    HYuvContext *s = avctx->priv_data;
+    HYuvContext *s = avctx->priv_data;
+    
+    common_end(s);
 
     av_freep(&avctx->extradata);
     av_freep(&avctx->stats_out);
@@ -1079,12 +1219,6 @@ static int encode_end(AVCodecContext *avctx)
     return 0;
 }
 
-static const AVOption huffyuv_options[] =
-{
-    AVOPTION_CODEC_INT("prediction_method", "prediction_method", prediction_method, 0, 2, 0),
-    AVOPTION_END()
-};
-
 AVCodec huffyuv_decoder = {
     "huffyuv",
     CODEC_TYPE_VIDEO,
@@ -1098,6 +1232,19 @@ AVCodec huffyuv_decoder = {
     NULL
 };
 
+AVCodec ffvhuff_decoder = { /* second decoder entry reusing the huffyuv decoder callbacks under a different name and codec id; positional initializer, slot meanings follow the AVCodec declaration outside this patch */
+    "ffvhuff",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_FFVHUFF,
+    sizeof(HYuvContext), /* same private context type as the huffyuv entries */
+    decode_init,
+    NULL, /* presumably the encode slot, unused for a decoder -- confirm against AVCodec */
+    decode_end,
+    decode_frame,
+    CODEC_CAP_DR1 | CODEC_CAP_DRAW_HORIZ_BAND,
+    NULL
+};
+
 #ifdef CONFIG_ENCODERS
 
 AVCodec huffyuv_encoder = {
@@ -1108,7 +1255,16 @@ AVCodec huffyuv_encoder = {
     encode_init,
     encode_frame,
     encode_end,
-    .options = huffyuv_options,
+};
+
+AVCodec ffvhuff_encoder = { /* encoder entry reusing the huffyuv encoder callbacks under the ffvhuff name and codec id; encode_init tells the two apart via avctx->codec->id */
+    "ffvhuff",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_FFVHUFF,
+    sizeof(HYuvContext), /* same private context type as the huffyuv entries */
+    encode_init,
+    encode_frame,
+    encode_end,
 };
 
 #endif //CONFIG_ENCODERS
diff --git a/src/libffmpeg/libavcodec/i386/cputest.c b/src/libffmpeg/libavcodec/i386/cputest.c
index b50d653c4..593e0550d 100644
--- a/src/libffmpeg/libavcodec/i386/cputest.c
+++ b/src/libffmpeg/libavcodec/i386/cputest.c
@@ -4,12 +4,20 @@
 #include <stdlib.h>
 #include "../dsputil.h"
 
+#ifdef ARCH_X86_64 /* register names pasted into the cpuid() asm string below, so the bx save/restore uses the 64-bit registers on x86-64 */
+#  define REG_b "rbx"
+#  define REG_S "rsi"
+#else /* 32-bit x86 */
+#  define REG_b "ebx"
+#  define REG_S "esi"
+#endif
+
 /* ebx saving is necessary for PIC. gcc seems unable to see it alone */
 #define cpuid(index,eax,ebx,ecx,edx)\
     __asm __volatile\
-	("movl %%ebx, %%esi\n\t"\
+	("mov %%"REG_b", %%"REG_S"\n\t"\
          "cpuid\n\t"\
-         "xchgl %%ebx, %%esi"\
+         "xchg %%"REG_b", %%"REG_S\
          : "=a" (eax), "=S" (ebx),\
            "=c" (ecx), "=d" (edx)\
          : "0" (index));
@@ -17,82 +25,72 @@
 /* Function to test if multimedia instructions are supported...  */
 int mm_support(void)
 {
-    int rval;
+    int rval = 0;
     int eax, ebx, ecx, edx;
+    int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+    long a, c;
     
     __asm__ __volatile__ (
                           /* See if CPUID instruction is supported ... */
                           /* ... Get copies of EFLAGS into eax and ecx */
                           "pushf\n\t"
-                          "popl %0\n\t"
-                          "movl %0, %1\n\t"
+                          "pop %0\n\t"
+                          "mov %0, %1\n\t"
                           
                           /* ... Toggle the ID bit in one copy and store */
                           /*     to the EFLAGS reg */
-                          "xorl $0x200000, %0\n\t"
+                          "xor $0x200000, %0\n\t"
                           "push %0\n\t"
                           "popf\n\t"
                           
                           /* ... Get the (hopefully modified) EFLAGS */
                           "pushf\n\t"
-                          "popl %0\n\t"
-                          : "=a" (eax), "=c" (ecx)
+                          "pop %0\n\t"
+                          : "=a" (a), "=c" (c)
                           :
                           : "cc" 
                           );
     
-    if (eax == ecx)
+    if (a == c)
         return 0; /* CPUID not supported */
-    
-    cpuid(0, eax, ebx, ecx, edx);
 
-    if (ebx == 0x756e6547 &&
-        edx == 0x49656e69 &&
-        ecx == 0x6c65746e) {
-        
-        /* intel */
-    inteltest:
-        cpuid(1, eax, ebx, ecx, edx);
-        if ((edx & 0x00800000) == 0)
-            return 0;
-        rval = MM_MMX;
-        if (edx & 0x02000000) 
+    cpuid(0, max_std_level, ebx, ecx, edx);
+
+    if(max_std_level >= 1){
+        cpuid(1, eax, ebx, ecx, std_caps);
+        if (std_caps & (1<<23))
+            rval |= MM_MMX;
+        if (std_caps & (1<<25)) 
             rval |= MM_MMXEXT | MM_SSE;
-        if (edx & 0x04000000) 
+        if (std_caps & (1<<26)) 
             rval |= MM_SSE2;
-        return rval;
-    } else if (ebx == 0x68747541 &&
+    }
+
+    cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
+
+    if(max_ext_level >= 0x80000001){ /* only query leaf 0x80000001 when the CPU reports it */
+        cpuid(0x80000001, eax, ebx, ecx, ext_caps); /* ext_caps receives the edx output */
+        if (ext_caps & (1U<<31)) /* 1U: 1<<31 is not representable in a signed 32-bit int */
+            rval |= MM_3DNOW;
+        if (ext_caps & (1<<30))
+            rval |= MM_3DNOWEXT;
+        if (ext_caps & (1<<23))
+            rval |= MM_MMX;
+    }
+
+    cpuid(0, eax, ebx, ecx, edx);
+    if (       ebx == 0x68747541 &&
                edx == 0x69746e65 &&
                ecx == 0x444d4163) {
         /* AMD */
-        cpuid(0x80000000, eax, ebx, ecx, edx);
-        if ((unsigned)eax < 0x80000001)
-            goto inteltest;
-        cpuid(0x80000001, eax, ebx, ecx, edx);
-        if ((edx & 0x00800000) == 0)
-            return 0;
-        rval = MM_MMX;
-        if (edx & 0x80000000)
-            rval |= MM_3DNOW;
-        if (edx & 0x00400000)
+        if(ext_caps & (1<<22))
             rval |= MM_MMXEXT;
-        return rval;
     } else if (ebx == 0x746e6543 &&
                edx == 0x48727561 &&
                ecx == 0x736c7561) {  /*  "CentaurHauls" */
         /* VIA C3 */
-        cpuid(0x80000000, eax, ebx, ecx, edx);
-        if ((unsigned)eax < 0x80000001)
-            goto inteltest;	
-	cpuid(0x80000001, eax, ebx, ecx, edx);
-	rval = 0;      
-	if( edx & ( 1 << 31) )
-	  rval |= MM_3DNOW;
-	if( edx & ( 1 << 23) )
-	  rval |= MM_MMX;
-	if( edx & ( 1 << 24) )
+	if(ext_caps & (1<<24))
 	  rval |= MM_MMXEXT;
-	return rval;
     } else if (ebx == 0x69727943 &&
                edx == 0x736e4978 &&
                ecx == 0x64616574) {
@@ -105,18 +103,21 @@ int mm_support(void)
            According to the table, the only CPU which supports level
            2 is also the only one which supports extended CPUID levels.
         */
-        if (eax != 2) 
-            goto inteltest;
-        cpuid(0x80000001, eax, ebx, ecx, edx);
-        if ((eax & 0x00800000) == 0)
-            return 0;
-        rval = MM_MMX;
-        if (eax & 0x01000000)
+        if (eax < 2) 
+            return rval;
+        if (ext_caps & (1<<24))
             rval |= MM_MMXEXT;
-        return rval;
-    } else {
-        return 0;
     }
+#if 0
+    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s\n", 
+        (rval&MM_MMX) ? "MMX ":"", 
+        (rval&MM_MMXEXT) ? "MMX2 ":"", 
+        (rval&MM_SSE) ? "SSE ":"", 
+        (rval&MM_SSE2) ? "SSE2 ":"", 
+        (rval&MM_3DNOW) ? "3DNow ":"", 
+        (rval&MM_3DNOWEXT) ? "3DNowExt ":"");
+#endif
+    return rval;
 }
 
 #ifdef __TEST__
@@ -124,7 +125,7 @@ int main ( void )
 {
   int mm_flags;
   mm_flags = mm_support();
-  printf("mm_support = 0x%08u\n",mm_flags);
+  printf("mm_support = 0x%08X\n",mm_flags);
   return 0;
 }
 #endif
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 11504e225..550122673 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -22,6 +22,7 @@
 
 #include "../dsputil.h"
 #include "../simple_idct.h"
+#include "../mpegvideo.h"
 #include "mmx.h"
 
 //#undef NDEBUG
@@ -38,7 +39,9 @@ static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x00
 
 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
 static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
+static const uint64_t ff_pw_5  attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
+static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
 
 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
@@ -184,7 +187,7 @@ static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xF
 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
 {
     asm volatile(
-        "movl $-128, %%eax	\n\t"
+        "mov $-128, %%"REG_a"	\n\t"
         "pxor %%mm7, %%mm7	\n\t"
         ".balign 16		\n\t"
         "1:			\n\t"
@@ -196,16 +199,16 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
         "punpckhbw %%mm7, %%mm1	\n\t"
         "punpcklbw %%mm7, %%mm2	\n\t"
         "punpckhbw %%mm7, %%mm3	\n\t"
-        "movq %%mm0, (%1, %%eax)\n\t"
-        "movq %%mm1, 8(%1, %%eax)\n\t"
-        "movq %%mm2, 16(%1, %%eax)\n\t"
-        "movq %%mm3, 24(%1, %%eax)\n\t"
-        "addl %3, %0		\n\t"
-        "addl $32, %%eax	\n\t"
+        "movq %%mm0, (%1, %%"REG_a")\n\t"
+        "movq %%mm1, 8(%1, %%"REG_a")\n\t"
+        "movq %%mm2, 16(%1, %%"REG_a")\n\t"
+        "movq %%mm3, 24(%1, %%"REG_a")\n\t"
+        "add %3, %0		\n\t"
+        "add $32, %%"REG_a"	\n\t"
         "js 1b			\n\t"
         : "+r" (pixels)
-        : "r" (block+64), "r" (line_size), "r" (line_size*2)
-        : "%eax"
+        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
+        : "%"REG_a
     );
 }
 
@@ -213,7 +216,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
 {
     asm volatile(
         "pxor %%mm7, %%mm7	\n\t"
-        "movl $-128, %%eax	\n\t"
+        "mov $-128, %%"REG_a"	\n\t"
         ".balign 16		\n\t"
         "1:			\n\t"
         "movq (%0), %%mm0	\n\t"
@@ -226,15 +229,15 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
         "punpckhbw %%mm7, %%mm3	\n\t"
         "psubw %%mm2, %%mm0	\n\t"
         "psubw %%mm3, %%mm1	\n\t"
-        "movq %%mm0, (%2, %%eax)\n\t"
-        "movq %%mm1, 8(%2, %%eax)\n\t"
-        "addl %3, %0		\n\t"
-        "addl %3, %1		\n\t"
-        "addl $16, %%eax	\n\t"
+        "movq %%mm0, (%2, %%"REG_a")\n\t"
+        "movq %%mm1, 8(%2, %%"REG_a")\n\t"
+        "add %3, %0		\n\t"
+        "add %3, %1		\n\t"
+        "add $16, %%"REG_a"	\n\t"
         "jnz 1b			\n\t"
         : "+r" (s1), "+r" (s2)
-        : "r" (block+64), "r" (stride)
-        : "%eax"
+        : "r" (block+64), "r" ((long)stride)
+        : "%"REG_a
     );
 }
 #endif //CONFIG_ENCODERS
@@ -265,7 +268,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
 		"movq	%%mm2, (%0, %1)\n\t"
 		"movq	%%mm4, (%0, %1, 2)\n\t"
 		"movq	%%mm6, (%0, %2)\n\t"
-		::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
+		::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
 		:"memory");
         pix += line_size*4;
         p += 32;
@@ -290,7 +293,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
 	    "movq	%%mm2, (%0, %1)\n\t"
 	    "movq	%%mm4, (%0, %1, 2)\n\t"
 	    "movq	%%mm6, (%0, %2)\n\t"
-	    ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
+	    ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
 	    :"memory");
 }
 
@@ -353,36 +356,62 @@ void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
     } while (--i);
 }
 
+static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) /* copy h rows of 4 bytes from pixels to block; both advance by line_size per row */
+{
+    __asm __volatile(
+	 "lea (%3, %3), %%"REG_a"	\n\t" /* REG_a (defined outside this hunk) = 2*line_size */
+	 ".balign 8			\n\t"
+	 "1:				\n\t" /* each iteration copies four rows */
+	 "movd (%1), %%mm0		\n\t" /* movd moves 32 bits: 4 source bytes of row n */
+	 "movd (%1, %3), %%mm1		\n\t" /* 4 source bytes of row n+1 */
+	 "movd %%mm0, (%2)		\n\t"
+	 "movd %%mm1, (%2, %3)		\n\t"
+	 "add %%"REG_a", %1		\n\t" /* advance source and destination by two rows */
+	 "add %%"REG_a", %2		\n\t"
+	 "movd (%1), %%mm0		\n\t" /* same again for rows n+2 and n+3 */
+	 "movd (%1, %3), %%mm1		\n\t"
+	 "movd %%mm0, (%2)		\n\t"
+	 "movd %%mm1, (%2, %3)		\n\t"
+	 "add %%"REG_a", %1		\n\t"
+	 "add %%"REG_a", %2		\n\t"
+	 "subl $4, %0			\n\t" /* the loop ends only when h reaches exactly 0, so h is expected to be a positive multiple of 4 */
+	 "jnz 1b			\n\t"
+	 : "+g"(h), "+r" (pixels),  "+r" (block)
+	 : "r"((long)line_size) /* widened to long so the register can be used in address expressions */
+	 : "%"REG_a, "memory"
+	);
+}
+
 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	 "lea (%3, %3), %%eax		\n\t"
+	 "lea (%3, %3), %%"REG_a"	\n\t"
 	 ".balign 8			\n\t"
 	 "1:				\n\t"
 	 "movq (%1), %%mm0		\n\t"
 	 "movq (%1, %3), %%mm1		\n\t"
      	 "movq %%mm0, (%2)		\n\t"
 	 "movq %%mm1, (%2, %3)		\n\t"
-	 "addl %%eax, %1		\n\t"
-         "addl %%eax, %2       		\n\t"
+	 "add %%"REG_a", %1		\n\t"
+	 "add %%"REG_a", %2		\n\t"
 	 "movq (%1), %%mm0		\n\t"
 	 "movq (%1, %3), %%mm1		\n\t"
 	 "movq %%mm0, (%2)		\n\t"
 	 "movq %%mm1, (%2, %3)		\n\t"
-	 "addl %%eax, %1		\n\t"
-	 "addl %%eax, %2       		\n\t"
+	 "add %%"REG_a", %1		\n\t"
+	 "add %%"REG_a", %2		\n\t"
 	 "subl $4, %0			\n\t"
 	 "jnz 1b			\n\t"
 	 : "+g"(h), "+r" (pixels),  "+r" (block)
-	 : "r"(line_size)
-	 : "%eax", "memory"
+	 : "r"((long)line_size)
+	 : "%"REG_a, "memory"
 	);
 }
 
 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	 "lea (%3, %3), %%eax		\n\t"
+	 "lea (%3, %3), %%"REG_a"	\n\t"
 	 ".balign 8			\n\t"
 	 "1:				\n\t"
 	 "movq (%1), %%mm0		\n\t"
@@ -393,8 +422,8 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
      	 "movq %%mm4, 8(%2)		\n\t"
 	 "movq %%mm1, (%2, %3)		\n\t"
 	 "movq %%mm5, 8(%2, %3)		\n\t"
-	 "addl %%eax, %1		\n\t"
-         "addl %%eax, %2       		\n\t"
+	 "add %%"REG_a", %1		\n\t"
+	 "add %%"REG_a", %2       	\n\t"
 	 "movq (%1), %%mm0		\n\t"
 	 "movq 8(%1), %%mm4		\n\t"
 	 "movq (%1, %3), %%mm1		\n\t"
@@ -403,13 +432,13 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
 	 "movq %%mm4, 8(%2)		\n\t"
 	 "movq %%mm1, (%2, %3)		\n\t"
 	 "movq %%mm5, 8(%2, %3)		\n\t"
-	 "addl %%eax, %1		\n\t"
-	 "addl %%eax, %2       		\n\t"
+	 "add %%"REG_a", %1		\n\t"
+	 "add %%"REG_a", %2       	\n\t"
 	 "subl $4, %0			\n\t"
 	 "jnz 1b			\n\t"
 	 : "+g"(h), "+r" (pixels),  "+r" (block)
-	 : "r"(line_size)
-	 : "%eax", "memory"
+	 : "r"((long)line_size)
+	 : "%"REG_a, "memory"
 	);
 }
 
@@ -417,16 +446,16 @@ static void clear_blocks_mmx(DCTELEM *blocks)
 {
     __asm __volatile(
                 "pxor %%mm7, %%mm7		\n\t"
-                "movl $-128*6, %%eax		\n\t"
+                "mov $-128*6, %%"REG_a"	\n\t"
                 "1:				\n\t"
-                "movq %%mm7, (%0, %%eax)	\n\t"
-                "movq %%mm7, 8(%0, %%eax)	\n\t"
-                "movq %%mm7, 16(%0, %%eax)	\n\t"
-                "movq %%mm7, 24(%0, %%eax)	\n\t"
-                "addl $32, %%eax		\n\t"
+                "movq %%mm7, (%0, %%"REG_a")	\n\t"
+                "movq %%mm7, 8(%0, %%"REG_a")	\n\t"
+                "movq %%mm7, 16(%0, %%"REG_a")	\n\t"
+                "movq %%mm7, 24(%0, %%"REG_a")	\n\t"
+                "add $32, %%"REG_a"		\n\t"
                 " js 1b				\n\t"
-                : : "r" (((int)blocks)+128*6)
-                : "%eax"
+                : : "r" (((uint8_t *)blocks)+128*6)
+                : "%"REG_a
         );
 }
 
@@ -434,7 +463,7 @@ static void clear_blocks_mmx(DCTELEM *blocks)
 static int pix_sum16_mmx(uint8_t * pix, int line_size){
     const int h=16;
     int sum;
-    int index= -line_size*h;
+    long index= -line_size*h;
 
     __asm __volatile(
                 "pxor %%mm7, %%mm7		\n\t"
@@ -452,7 +481,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
                 "paddw %%mm2, %%mm3		\n\t"
                 "paddw %%mm1, %%mm3		\n\t"
                 "paddw %%mm3, %%mm6		\n\t"
-                "addl %3, %1			\n\t"
+                "add %3, %1			\n\t"
                 " js 1b				\n\t"
                 "movq %%mm6, %%mm5		\n\t"
                 "psrlq $32, %%mm6		\n\t"
@@ -463,7 +492,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
                 "movd %%mm6, %0			\n\t"
                 "andl $0xFFFF, %0		\n\t"
                 : "=&r" (sum), "+r" (index)
-                : "r" (pix - index), "r" (line_size)
+                : "r" (pix - index), "r" ((long)line_size)
         );
 
         return sum;
@@ -471,7 +500,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
 #endif //CONFIG_ENCODERS
 
 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
-    int i=0;
+    long i=0;
     asm volatile(
         "1:				\n\t"
         "movq  (%1, %0), %%mm0		\n\t"
@@ -482,11 +511,11 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
         "movq 8(%2, %0), %%mm1		\n\t"
         "paddb %%mm0, %%mm1		\n\t"
         "movq %%mm1, 8(%2, %0)		\n\t"
-        "addl $16, %0			\n\t"
-        "cmpl %3, %0			\n\t"
+        "add $16, %0			\n\t"
+        "cmp %3, %0			\n\t"
         " jb 1b				\n\t"
         : "+r" (i)
-        : "r"(src), "r"(dst), "r"(w-15)
+        : "r"(src), "r"(dst), "r"((long)w-15)
     );
     for(; i<w; i++)
         dst[i+0] += src[i+0];
@@ -643,26 +672,22 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
         "punpcklwd %%mm0, %%mm1		\n\t"
         "punpckhwd %%mm4, %%mm3		\n\t"
         "punpckhwd %%mm0, %%mm6		\n\t"
-        "movd %%mm5, %0			\n\t"
+        "movd %%mm5, (%0)		\n\t"
         "punpckhdq %%mm5, %%mm5		\n\t"
-        "movd %%mm5, %1			\n\t"
-        "movd %%mm3, %2			\n\t"
+        "movd %%mm5, (%0,%2)		\n\t"
+        "movd %%mm3, (%0,%2,2)		\n\t"
         "punpckhdq %%mm3, %%mm3		\n\t"
-        "movd %%mm3, %3			\n\t"
-        "movd %%mm1, %4			\n\t"
+        "movd %%mm3, (%0,%3)		\n\t"
+        "movd %%mm1, (%1)		\n\t"
         "punpckhdq %%mm1, %%mm1		\n\t"
-        "movd %%mm1, %5			\n\t"
-        "movd %%mm6, %6			\n\t"
+        "movd %%mm1, (%1,%2)		\n\t"
+        "movd %%mm6, (%1,%2,2)		\n\t"
         "punpckhdq %%mm6, %%mm6		\n\t"
-        "movd %%mm6, %7			\n\t"
-        : "=m" (*(uint32_t*)(src + 0*stride)),
-          "=m" (*(uint32_t*)(src + 1*stride)),
-          "=m" (*(uint32_t*)(src + 2*stride)),
-          "=m" (*(uint32_t*)(src + 3*stride)),
-          "=m" (*(uint32_t*)(src + 4*stride)),
-          "=m" (*(uint32_t*)(src + 5*stride)),
-          "=m" (*(uint32_t*)(src + 6*stride)),
-          "=m" (*(uint32_t*)(src + 7*stride))
+        "movd %%mm6, (%1,%3)		\n\t"
+        :: "r" (src),
+           "r" (src + 4*stride),
+           "r" ((long)   stride ),
+           "r" ((long)(3*stride))
     );
 }
 
@@ -697,7 +722,7 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
       "paddd %%mm3,%%mm4\n"
       "paddd %%mm2,%%mm7\n"
 
-      "addl %2, %0\n"
+      "add %2, %0\n"
       "paddd %%mm4,%%mm7\n"
       "dec %%ecx\n"
       "jnz 1b\n"
@@ -706,7 +731,50 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
       "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
       "paddd %%mm7,%%mm1\n"
       "movd %%mm1,%1\n"
-      : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
+      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
+    return tmp;
+}
+
+static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { /* sum of squared byte differences between pix1 and pix2 over h rows of 8 bytes; v is not used */
+    int tmp;
+  asm volatile (
+      "movl %4,%%ecx\n" /* ecx = row counter, starts at h */
+      "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
+      "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
+      "1:\n"
+      "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
+      "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
+
+      "movq %%mm1,%%mm5\n"
+      "psubusb %%mm2,%%mm1\n" /* saturating pix1-pix2 (0 where pix2 is larger) */
+      "psubusb %%mm5,%%mm2\n" /* saturating pix2-pix1 (0 where pix1 is larger) */
+
+      "por %%mm1,%%mm2\n" /* OR of both = per-byte absolute difference */
+
+      "movq %%mm2,%%mm1\n"
+
+      "punpckhbw %%mm0,%%mm2\n"
+      "punpcklbw %%mm0,%%mm1\n"	/* mm1 now spread over (mm1,mm2) */
+
+      "pmaddwd %%mm2,%%mm2\n" /* square each word and add adjacent pairs into dwords */
+      "pmaddwd %%mm1,%%mm1\n"
+
+      "add %3,%0\n" /* step both pointers to the next row */
+      "add %3,%1\n"
+
+      "paddd %%mm2,%%mm1\n"
+      "paddd %%mm1,%%mm7\n"
+
+      "decl %%ecx\n" /* loop ends when the counter reaches 0, so h is expected to be positive */
+      "jnz 1b\n"
+
+      "movq %%mm7,%%mm1\n"
+      "psrlq $32, %%mm7\n"	/* shift hi dword to lo */
+      "paddd %%mm7,%%mm1\n" /* low dword of mm1 = total of both dword accumulators */
+      "movd %%mm1,%2\n"
+      : "+r" (pix1), "+r" (pix2), "=r"(tmp) 
+      : "r" ((long)line_size) , "m" (h)
+      : "%ecx");
     return tmp;
 }
 
@@ -749,8 +817,8 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
       "pmaddwd %%mm1,%%mm1\n"
       "pmaddwd %%mm3,%%mm3\n"
 
-      "addl %3,%0\n"
-      "addl %3,%1\n"
+      "add %3,%0\n"
+      "add %3,%1\n"
 
       "paddd %%mm2,%%mm1\n"
       "paddd %%mm4,%%mm3\n"
@@ -765,11 +833,266 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
       "paddd %%mm7,%%mm1\n"
       "movd %%mm1,%2\n"
       : "+r" (pix1), "+r" (pix2), "=r"(tmp) 
-      : "r" (line_size) , "m" (h)
+      : "r" ((long)line_size) , "m" (h)
       : "%ecx");
     return tmp;
 }
 
+static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { /* sums |d(y) - d(y+1)| over rows, where d(y) holds the differences between horizontally adjacent bytes of the 8 bytes loaded from row y */
+    int tmp;
+  asm volatile (
+      "movl %3,%%ecx\n" /* ecx = h-2 (operand 3): rows left after the first pair handled before the loop */
+      "pxor %%mm7,%%mm7\n" /* mm7 = 0, used for byte->word unpacking */
+      "pxor %%mm6,%%mm6\n" /* mm6 = running sum (four 16-bit lanes) */
+      
+      "movq (%0),%%mm0\n" /* first row */
+      "movq %%mm0, %%mm1\n"
+      "psllq $8, %%mm0\n"
+      "psrlq $8, %%mm1\n" /* mm1 = bytes 1..7, top byte 0 */
+      "psrlq $8, %%mm0\n" /* shift left then right: mm0 = bytes 0..6, top byte 0 */
+      "movq %%mm0, %%mm2\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm0\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm2\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm0\n" /* (mm0,mm2) = p[x] - p[x+1] as words for this row */
+      "psubw %%mm3, %%mm2\n"
+      
+      "add %2,%0\n" /* next row */
+      
+      "movq (%0),%%mm4\n" /* second row, same horizontal differences into (mm4,mm5) */
+      "movq %%mm4, %%mm1\n"
+      "psllq $8, %%mm4\n"
+      "psrlq $8, %%mm1\n"
+      "psrlq $8, %%mm4\n"
+      "movq %%mm4, %%mm5\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm4\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm5\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm4\n"
+      "psubw %%mm3, %%mm5\n"
+      "psubw %%mm4, %%mm0\n" /* difference between the two rows' horizontal differences */
+      "psubw %%mm5, %%mm2\n"
+      "pxor %%mm3, %%mm3\n"
+      "pxor %%mm1, %%mm1\n"
+      "pcmpgtw %%mm0, %%mm3\n\t" /* mask of negative words */
+      "pcmpgtw %%mm2, %%mm1\n\t"
+      "pxor %%mm3, %%mm0\n" /* xor with the mask then subtract the mask = absolute value */
+      "pxor %%mm1, %%mm2\n"
+      "psubw %%mm3, %%mm0\n" 
+      "psubw %%mm1, %%mm2\n"
+      "paddw %%mm0, %%mm2\n"
+      "paddw %%mm2, %%mm6\n" /* accumulate */
+
+      "add %2,%0\n"
+      "1:\n" /* each iteration consumes two more rows */
+  
+      "movq (%0),%%mm0\n" /* new row into (mm0,mm2); previous row is in (mm4,mm5) */
+      "movq %%mm0, %%mm1\n"
+      "psllq $8, %%mm0\n"
+      "psrlq $8, %%mm1\n"
+      "psrlq $8, %%mm0\n"
+      "movq %%mm0, %%mm2\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm0\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm2\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm0\n"
+      "psubw %%mm3, %%mm2\n"
+      "psubw %%mm0, %%mm4\n" /* previous row minus new row */
+      "psubw %%mm2, %%mm5\n"
+      "pxor %%mm3, %%mm3\n"
+      "pxor %%mm1, %%mm1\n"
+      "pcmpgtw %%mm4, %%mm3\n\t"
+      "pcmpgtw %%mm5, %%mm1\n\t"
+      "pxor %%mm3, %%mm4\n"
+      "pxor %%mm1, %%mm5\n"
+      "psubw %%mm3, %%mm4\n" 
+      "psubw %%mm1, %%mm5\n"
+      "paddw %%mm4, %%mm5\n"
+      "paddw %%mm5, %%mm6\n"
+      
+      "add %2,%0\n"
+      
+      "movq (%0),%%mm4\n" /* next row into (mm4,mm5); previous row is in (mm0,mm2) */
+      "movq %%mm4, %%mm1\n"
+      "psllq $8, %%mm4\n"
+      "psrlq $8, %%mm1\n"
+      "psrlq $8, %%mm4\n"
+      "movq %%mm4, %%mm5\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm4\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm5\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm4\n"
+      "psubw %%mm3, %%mm5\n"
+      "psubw %%mm4, %%mm0\n"
+      "psubw %%mm5, %%mm2\n"
+      "pxor %%mm3, %%mm3\n"
+      "pxor %%mm1, %%mm1\n"
+      "pcmpgtw %%mm0, %%mm3\n\t"
+      "pcmpgtw %%mm2, %%mm1\n\t"
+      "pxor %%mm3, %%mm0\n"
+      "pxor %%mm1, %%mm2\n"
+      "psubw %%mm3, %%mm0\n" 
+      "psubw %%mm1, %%mm2\n"
+      "paddw %%mm0, %%mm2\n"
+      "paddw %%mm2, %%mm6\n"
+
+      "add %2,%0\n"
+      "subl $2, %%ecx\n" /* the loop ends only when ecx reaches exactly 0, so h-2 is expected to be a positive even number */
+      " jnz 1b\n"
+
+      "movq %%mm6, %%mm0\n"
+      "punpcklwd %%mm7,%%mm0\n" /* widen the four word sums to dwords ... */
+      "punpckhwd %%mm7,%%mm6\n"
+      "paddd %%mm0, %%mm6\n"
+      
+      "movq %%mm6,%%mm0\n"
+      "psrlq $32, %%mm6\n"
+      "paddd %%mm6,%%mm0\n" /* ... and fold them into the low dword */
+      "movd %%mm0,%1\n"
+      : "+r" (pix1), "=r"(tmp) 
+      : "r" ((long)line_size) , "g" (h-2)
+      : "%ecx");
+      return tmp;
+}
+
+static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { /* same measure as hf_noise8_mmx for a 16-wide block: asm covers the left part, hf_noise8_mmx(pix+8) the right part */
+    int tmp;
+    uint8_t * pix= pix1; /* keep the start address: the asm below advances pix1 */
+  asm volatile (
+      "movl %3,%%ecx\n" /* ecx = h-2 (operand 3): rows left after the first pair handled before the loop */
+      "pxor %%mm7,%%mm7\n" /* mm7 = 0, used for byte->word unpacking */
+      "pxor %%mm6,%%mm6\n" /* mm6 = running sum (four 16-bit lanes) */
+      
+      "movq (%0),%%mm0\n" /* bytes 0..7 of the first row */
+      "movq 1(%0),%%mm1\n" /* bytes 1..8 of the first row */
+      "movq %%mm0, %%mm2\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm0\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm2\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm0\n" /* (mm0,mm2) = p[x] - p[x+1] as words for this row */
+      "psubw %%mm3, %%mm2\n"
+      
+      "add %2,%0\n" /* next row */
+      
+      "movq (%0),%%mm4\n" /* second row, same horizontal differences into (mm4,mm5) */
+      "movq 1(%0),%%mm1\n"
+      "movq %%mm4, %%mm5\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm4\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm5\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm4\n"
+      "psubw %%mm3, %%mm5\n"
+      "psubw %%mm4, %%mm0\n" /* difference between the two rows' horizontal differences */
+      "psubw %%mm5, %%mm2\n"
+      "pxor %%mm3, %%mm3\n"
+      "pxor %%mm1, %%mm1\n"
+      "pcmpgtw %%mm0, %%mm3\n\t" /* mask of negative words */
+      "pcmpgtw %%mm2, %%mm1\n\t"
+      "pxor %%mm3, %%mm0\n" /* xor with the mask then subtract the mask = absolute value */
+      "pxor %%mm1, %%mm2\n"
+      "psubw %%mm3, %%mm0\n" 
+      "psubw %%mm1, %%mm2\n"
+      "paddw %%mm0, %%mm2\n"
+      "paddw %%mm2, %%mm6\n" /* accumulate */
+
+      "add %2,%0\n"
+      "1:\n" /* each iteration consumes two more rows */
+  
+      "movq (%0),%%mm0\n" /* new row into (mm0,mm2); previous row is in (mm4,mm5) */
+      "movq 1(%0),%%mm1\n"
+      "movq %%mm0, %%mm2\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm0\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm2\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm0\n"
+      "psubw %%mm3, %%mm2\n"
+      "psubw %%mm0, %%mm4\n" /* previous row minus new row */
+      "psubw %%mm2, %%mm5\n"
+      "pxor %%mm3, %%mm3\n"
+      "pxor %%mm1, %%mm1\n"
+      "pcmpgtw %%mm4, %%mm3\n\t"
+      "pcmpgtw %%mm5, %%mm1\n\t"
+      "pxor %%mm3, %%mm4\n"
+      "pxor %%mm1, %%mm5\n"
+      "psubw %%mm3, %%mm4\n"
+      "psubw %%mm1, %%mm5\n"
+      "paddw %%mm4, %%mm5\n"
+      "paddw %%mm5, %%mm6\n"
+      
+      "add %2,%0\n"
+      
+      "movq (%0),%%mm4\n" /* next row into (mm4,mm5); previous row is in (mm0,mm2) */
+      "movq 1(%0),%%mm1\n"
+      "movq %%mm4, %%mm5\n"
+      "movq %%mm1, %%mm3\n"
+      "punpcklbw %%mm7,%%mm4\n"
+      "punpcklbw %%mm7,%%mm1\n"
+      "punpckhbw %%mm7,%%mm5\n"
+      "punpckhbw %%mm7,%%mm3\n"
+      "psubw %%mm1, %%mm4\n"
+      "psubw %%mm3, %%mm5\n"
+      "psubw %%mm4, %%mm0\n"
+      "psubw %%mm5, %%mm2\n"
+      "pxor %%mm3, %%mm3\n"
+      "pxor %%mm1, %%mm1\n"
+      "pcmpgtw %%mm0, %%mm3\n\t"
+      "pcmpgtw %%mm2, %%mm1\n\t"
+      "pxor %%mm3, %%mm0\n"
+      "pxor %%mm1, %%mm2\n"
+      "psubw %%mm3, %%mm0\n" 
+      "psubw %%mm1, %%mm2\n"
+      "paddw %%mm0, %%mm2\n"
+      "paddw %%mm2, %%mm6\n"
+
+      "add %2,%0\n"
+      "subl $2, %%ecx\n" /* the loop ends only when ecx reaches exactly 0, so h-2 is expected to be a positive even number */
+      " jnz 1b\n"
+
+      "movq %%mm6, %%mm0\n"
+      "punpcklwd %%mm7,%%mm0\n" /* widen the four word sums to dwords ... */
+      "punpckhwd %%mm7,%%mm6\n"
+      "paddd %%mm0, %%mm6\n"
+      
+      "movq %%mm6,%%mm0\n"
+      "psrlq $32, %%mm6\n"
+      "paddd %%mm6,%%mm0\n" /* ... and fold them into the low dword */
+      "movd %%mm0,%1\n"
+      : "+r" (pix1), "=r"(tmp) 
+      : "r" ((long)line_size) , "g" (h-2)
+      : "%ecx");
+      return tmp + hf_noise8_mmx(pix+8, line_size, h); /* add the score of the columns starting at byte 8 */
+}
+
+static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+    /* sse16_mmx() score plus the weighted absolute difference of the two hf_noise16_mmx() scores */
+    const int sq_err= sse16_mmx(c, pix1, pix2, line_size, h);
+    const int noise_delta= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
+    /* weight comes from the codec context when one is supplied, otherwise 8 */
+    return sq_err + ABS(noise_delta)*(c ? c->avctx->nsse_weight : 8);
+}
+
+static int nsse8_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+    /* sse8_mmx() score plus the weighted absolute difference of the two hf_noise8_mmx() scores */
+    const int sq_err= sse8_mmx(c, pix1, pix2, line_size, h);
+    const int noise_delta= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
+    /* weight comes from the codec context when one is supplied, otherwise 8 */
+    return sq_err + ABS(noise_delta)*(c ? c->avctx->nsse_weight : 8);
+}
+
 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
     int tmp;
     
@@ -779,7 +1102,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
 #define SUM(in0, in1, out0, out1) \
       "movq (%0), %%mm2\n"\
       "movq 8(%0), %%mm3\n"\
-      "addl %2,%0\n"\
+      "add %2,%0\n"\
       "movq %%mm2, " #out0 "\n"\
       "movq %%mm3, " #out1 "\n"\
       "psubusb " #in0 ", %%mm2\n"\
@@ -806,7 +1129,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
       "pxor %%mm7,%%mm7\n"
       "movq (%0),%%mm0\n"
       "movq 8(%0),%%mm1\n"
-      "addl %2,%0\n"
+      "add %2,%0\n"
       "subl $2, %%ecx\n"
       SUM(%%mm0, %%mm1, %%mm4, %%mm5)
       "1:\n"
@@ -826,7 +1149,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
       "paddw %%mm6,%%mm0\n"
       "movd %%mm0,%1\n"
       : "+r" (pix), "=r"(tmp) 
-      : "r" (line_size) , "m" (h)
+      : "r" ((long)line_size) , "m" (h)
       : "%ecx");
     return tmp & 0xFFFF;
 }
@@ -841,7 +1164,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
 #define SUM(in0, in1, out0, out1) \
       "movq (%0), " #out0 "\n"\
       "movq 8(%0), " #out1 "\n"\
-      "addl %2,%0\n"\
+      "add %2,%0\n"\
       "psadbw " #out0 ", " #in0 "\n"\
       "psadbw " #out1 ", " #in1 "\n"\
       "paddw " #in1 ", " #in0 "\n"\
@@ -853,7 +1176,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
       "pxor %%mm7,%%mm7\n"
       "movq (%0),%%mm0\n"
       "movq 8(%0),%%mm1\n"
-      "addl %2,%0\n"
+      "add %2,%0\n"
       "subl $2, %%ecx\n"
       SUM(%%mm0, %%mm1, %%mm4, %%mm5)
       "1:\n"
@@ -867,7 +1190,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
 
       "movd %%mm6,%1\n"
       : "+r" (pix), "=r"(tmp) 
-      : "r" (line_size) , "m" (h)
+      : "r" ((long)line_size) , "m" (h)
       : "%ecx");
     return tmp;
 }
@@ -885,8 +1208,8 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
       "movq (%1)," #out0 "\n"\
       "movq 8(%0),%%mm3\n"\
       "movq 8(%1)," #out1 "\n"\
-      "addl %3,%0\n"\
-      "addl %3,%1\n"\
+      "add %3,%0\n"\
+      "add %3,%1\n"\
       "psubb " #out0 ", %%mm2\n"\
       "psubb " #out1 ", %%mm3\n"\
       "pxor %%mm7, %%mm2\n"\
@@ -921,8 +1244,8 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
       "movq (%1),%%mm2\n"
       "movq 8(%0),%%mm1\n"
       "movq 8(%1),%%mm3\n"
-      "addl %3,%0\n"
-      "addl %3,%1\n"
+      "add %3,%0\n"
+      "add %3,%1\n"
       "subl $2, %%ecx\n"
       "psubb %%mm2, %%mm0\n"
       "psubb %%mm3, %%mm1\n"
@@ -946,7 +1269,7 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
       "paddw %%mm6,%%mm0\n"
       "movd %%mm0,%2\n"
       : "+r" (pix1), "+r" (pix2), "=r"(tmp) 
-      : "r" (line_size) , "m" (h)
+      : "r" ((long)line_size) , "m" (h)
       : "%ecx");
     return tmp & 0x7FFF;
 }
@@ -964,8 +1287,8 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
       "movq (%1),%%mm2\n"\
       "movq 8(%0)," #out1 "\n"\
       "movq 8(%1),%%mm3\n"\
-      "addl %3,%0\n"\
-      "addl %3,%1\n"\
+      "add %3,%0\n"\
+      "add %3,%1\n"\
       "psubb %%mm2, " #out0 "\n"\
       "psubb %%mm3, " #out1 "\n"\
       "pxor %%mm7, " #out0 "\n"\
@@ -985,8 +1308,8 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
       "movq (%1),%%mm2\n"
       "movq 8(%0),%%mm1\n"
       "movq 8(%1),%%mm3\n"
-      "addl %3,%0\n"
-      "addl %3,%1\n"
+      "add %3,%0\n"
+      "add %3,%1\n"
       "subl $2, %%ecx\n"
       "psubb %%mm2, %%mm0\n"
       "psubb %%mm3, %%mm1\n"
@@ -1004,14 +1327,14 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
 
       "movd %%mm6,%2\n"
       : "+r" (pix1), "+r" (pix2), "=r"(tmp) 
-      : "r" (line_size) , "m" (h)
+      : "r" ((long)line_size) , "m" (h)
       : "%ecx");
     return tmp;
 }
 #undef SUM
 
 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
-    int i=0;
+    long i=0;
     asm volatile(
         "1:				\n\t"
         "movq  (%2, %0), %%mm0		\n\t"
@@ -1022,20 +1345,20 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
         "movq 8(%1, %0), %%mm1		\n\t"
         "psubb %%mm0, %%mm1		\n\t"
         "movq %%mm1, 8(%3, %0)		\n\t"
-        "addl $16, %0			\n\t"
-        "cmpl %4, %0			\n\t"
+        "add $16, %0			\n\t"
+        "cmp %4, %0			\n\t"
         " jb 1b				\n\t"
         : "+r" (i)
-        : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
+        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
     );
     for(; i<w; i++)
         dst[i+0] = src1[i+0]-src2[i+0];
 }
 
 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
-    int i=0;
+    long i=0;
     uint8_t l, lt;
-
+    
     asm volatile(
         "1:				\n\t"
         "movq  -1(%1, %0), %%mm0	\n\t" // LT
@@ -1052,11 +1375,11 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
         "pmaxub %%mm1, %%mm4		\n\t"
         "psubb %%mm4, %%mm3		\n\t" // dst - pred
         "movq %%mm3, (%3, %0)		\n\t"
-        "addl $8, %0			\n\t"
-        "cmpl %4, %0			\n\t"
+        "add $8, %0			\n\t"
+        "cmp %4, %0			\n\t"
         " jb 1b				\n\t"
         : "+r" (i)
-        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
+        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
     );
 
     l= *left;
@@ -1445,12 +1768,12 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, in
         "packuswb %%mm4, %%mm0		\n\t"\
         OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
         \
-        "addl %3, %0			\n\t"\
-        "addl %4, %1			\n\t"\
+        "add %3, %0			\n\t"\
+        "add %4, %1			\n\t"\
         "decl %2			\n\t"\
         " jnz 1b				\n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
-        : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
+        : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
         : "memory"\
     );\
 }\
@@ -1558,12 +1881,12 @@ static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int
         "packuswb %%mm3, %%mm0		\n\t"\
         OP_MMX2(%%mm0, (%1), %%mm4, q)\
         \
-        "addl %3, %0			\n\t"\
-        "addl %4, %1			\n\t"\
+        "add %3, %0			\n\t"\
+        "add %4, %1			\n\t"\
         "decl %2			\n\t"\
         " jnz 1b			\n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
-        : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
+        : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
         : "memory"\
     );\
 }\
@@ -1622,12 +1945,12 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
         "movq %%mm1, 17*8(%1)		\n\t"\
         "movq %%mm2, 2*17*8(%1)		\n\t"\
         "movq %%mm3, 3*17*8(%1)		\n\t"\
-        "addl $8, %1			\n\t"\
-        "addl %3, %0			\n\t"\
+        "add $8, %1			\n\t"\
+        "add %3, %0			\n\t"\
         "decl %2			\n\t"\
         " jnz 1b			\n\t"\
         : "+r" (src), "+r" (temp_ptr), "+r"(count)\
-        : "r" (srcStride)\
+        : "r" ((long)srcStride)\
         : "memory"\
     );\
     \
@@ -1644,43 +1967,43 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
         "movq 24(%0), %%mm3		\n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"  \
+        "add %4, %1			\n\t"  \
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
         \
-        "addl $136, %0			\n\t"\
-        "addl %6, %1			\n\t"\
+        "add $136, %0			\n\t"\
+        "add %6, %1			\n\t"\
         "decl %2			\n\t"\
         " jnz 1b			\n\t"\
         \
         : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
-        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
+        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
         :"memory"\
     );\
 }\
 \
 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    uint64_t temp[9*4];\
+    uint64_t temp[9*2];\
     uint64_t *temp_ptr= temp;\
     int count= 9;\
 \
@@ -1694,12 +2017,12 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
         "punpckhbw %%mm7, %%mm1		\n\t"\
         "movq %%mm0, (%1)		\n\t"\
         "movq %%mm1, 9*8(%1)		\n\t"\
-        "addl $8, %1			\n\t"\
-        "addl %3, %0			\n\t"\
+        "add $8, %1			\n\t"\
+        "add %3, %0			\n\t"\
         "decl %2			\n\t"\
         " jnz 1b			\n\t"\
         : "+r" (src), "+r" (temp_ptr), "+r"(count)\
-        : "r" (srcStride)\
+        : "r" ((long)srcStride)\
         : "memory"\
     );\
     \
@@ -1716,25 +2039,25 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
         "movq 24(%0), %%mm3		\n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
         \
         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
-        "addl %4, %1			\n\t"\
+        "add %4, %1			\n\t"\
         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
                 \
-        "addl $72, %0			\n\t"\
-        "addl %6, %1			\n\t"\
+        "add $72, %0			\n\t"\
+        "add %6, %1			\n\t"\
         "decl %2			\n\t"\
         " jnz 1b			\n\t"\
          \
         : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
-        : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
+        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
         : "memory"\
    );\
 }\
@@ -1747,7 +2070,7 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
 }\
 \
 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1758,14 +2081,14 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
 }\
 \
 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
-    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
 }\
 \
 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1776,43 +2099,43 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
-    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
 }\
 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
 }\
 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
 }\
 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
 }\
 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
 }\
 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
@@ -1820,7 +2143,7 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
 }\
 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
@@ -1828,20 +2151,20 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride)
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
-    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
 }\
 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 }\
 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
-    put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 }\
 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1858,7 +2181,7 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
 }\
 \
 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1869,14 +2192,14 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
 }\
 \
 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
-    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
 }\
 \
 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1887,43 +2210,43 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
-    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
 }\
 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
@@ -1931,7 +2254,7 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
@@ -1939,20 +2262,20 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride
     uint8_t * const halfHV= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
-    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
 }\
 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[17*2];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 }\
 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[17*2];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
-    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 }\
 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1962,6 +2285,499 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 }
 
+#define QPEL_H264V(A,B,C,D,E,F,OP) /* emits asm for one output row of a vertical 6-tap filter over rows A..F: ((C+D)*4 - B - E) * operand %4 + A + F + operand %5, shifted right by 5 and packed to bytes; the callers in this file bind %4/%5 to ff_pw_5/ff_pw_16 (presumably packed words of 5 and 16 -- confirm). Clobbers A and mm6. */\
+        "movd (%0), "#F"		\n\t"/* F = next 4 source bytes at (%0) */\
+        "movq "#C", %%mm6		\n\t"\
+        "paddw "#D", %%mm6		\n\t"/* mm6 = C + D */\
+        "psllw $2, %%mm6		\n\t"/* mm6 = 4*(C+D) */\
+        "psubw "#B", %%mm6		\n\t"\
+        "psubw "#E", %%mm6		\n\t"/* mm6 = 4*(C+D) - B - E */\
+        "pmullw %4, %%mm6		\n\t"/* scale by operand %4 */\
+        "add %2, %0			\n\t"/* advance the source pointer by operand %2 */\
+        "punpcklbw %%mm7, "#F"		\n\t"/* widen the low 4 bytes of F to words; the callers zero mm7 beforehand */\
+        "paddw %5, "#A"			\n\t"/* A += operand %5 (rounding term) */\
+        "paddw "#F", "#A"		\n\t"/* A += F */\
+        "paddw "#A", %%mm6		\n\t"/* mm6 = full filter sum */\
+        "psraw $5, %%mm6		\n\t"\
+        "packuswb %%mm6, %%mm6		\n\t"/* saturate words to unsigned bytes */\
+        OP(%%mm6, (%1), A, d)/* OP is supplied by the instantiation; presumably writes or merges mm6 into (%1) using A as scratch -- confirm against the OP definitions */\
+        "add %3, %1			\n\t"     /* advance the destination pointer by operand %3 */
+
+#define QPEL_H264HV(A,B,C,D,E,F,OF) /* emits asm for one row of the vertical stage of the 2-D filter: ((C+D)*4 - B - E) * operand %3 + A + F, kept as unrounded 16-bit words and stored at byte offset OF from (%1); the callers in this file bind %3 to ff_pw_5. Clobbers A and mm6. */\
+        "movd (%0), "#F"		\n\t"/* F = next 4 source bytes at (%0) */\
+        "movq "#C", %%mm6		\n\t"\
+        "paddw "#D", %%mm6		\n\t"/* mm6 = C + D */\
+        "psllw $2, %%mm6		\n\t"/* mm6 = 4*(C+D) */\
+        "psubw "#B", %%mm6		\n\t"\
+        "psubw "#E", %%mm6		\n\t"/* mm6 = 4*(C+D) - B - E */\
+        "pmullw %3, %%mm6		\n\t"/* scale by operand %3 */\
+        "add %2, %0			\n\t"/* advance the source pointer by operand %2 */\
+        "punpcklbw %%mm7, "#F"		\n\t"/* widen the low 4 bytes of F to words; the callers zero mm7 beforehand */\
+        "paddw "#F", "#A"		\n\t"/* A += F */\
+        "paddw "#A", %%mm6		\n\t"/* mm6 = full filter sum, no rounding or shift here */\
+        "movq %%mm6, "#OF"(%1)		\n\t"/* store 4 words into the intermediate buffer */
+        
+#define QPEL_H264(OPNAME, OP, MMX)\
+static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    int h=4;/* row counter, decremented inside the asm loop */\
+/* Horizontal 6-tap filter over 4 rows of 4 pixels. Per pixel: (((src[0]+src[1])*4 - (src[-1]+src[2])) * mm4 + src[-2] + src[3] + mm5) >> 5, packed to a byte; mm4/mm5 are loaded from ff_pw_5/ff_pw_16. */\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t"/* mm7 = 0, used to widen bytes to words */\
+        "movq %5, %%mm4			\n\t"/* mm4 = ff_pw_5 */\
+        "movq %6, %%mm5			\n\t"/* mm5 = ff_pw_16 */\
+        "1:				\n\t"\
+        "movd  -1(%0), %%mm1		\n\t"\
+        "movd    (%0), %%mm2		\n\t"\
+        "movd   1(%0), %%mm3		\n\t"\
+        "movd   2(%0), %%mm0		\n\t"\
+        "punpcklbw %%mm7, %%mm1		\n\t"\
+        "punpcklbw %%mm7, %%mm2		\n\t"\
+        "punpcklbw %%mm7, %%mm3		\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t"\
+        "paddw %%mm0, %%mm1		\n\t"/* mm1 = src[-1] + src[2] */\
+        "paddw %%mm3, %%mm2		\n\t"/* mm2 = src[0] + src[1] */\
+        "movd  -2(%0), %%mm0		\n\t"\
+        "movd   3(%0), %%mm3		\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t"\
+        "punpcklbw %%mm7, %%mm3		\n\t"\
+        "paddw %%mm3, %%mm0		\n\t"/* mm0 = src[-2] + src[3] */\
+        "psllw $2, %%mm2		\n\t"\
+        "psubw %%mm1, %%mm2		\n\t"/* mm2 = 4*(src[0]+src[1]) - (src[-1]+src[2]) */\
+        "pmullw %%mm4, %%mm2		\n\t"\
+        "paddw %%mm5, %%mm0		\n\t"/* add the rounding term */\
+        "paddw %%mm2, %%mm0		\n\t"\
+        "psraw $5, %%mm0		\n\t"\
+        "packuswb %%mm0, %%mm0		\n\t"/* saturate words to unsigned bytes */\
+        OP(%%mm0, (%1),%%mm6, d)/* OP comes from the instantiation; presumably writes or merges mm0 into dst with mm6 as scratch -- confirm */\
+        "add %3, %0			\n\t"/* src += srcStride */\
+        "add %4, %1			\n\t"/* dst += dstStride */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"/* loop until h reaches 0 */\
+        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}\
+static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap filter producing 4 rows of 4 pixels; fully unrolled via QPEL_H264V */\
+    src -= 2*srcStride;/* the filter window starts 2 rows above the first output row */\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t"/* mm7 = 0, used to widen bytes to words */\
+        "movd (%0), %%mm0		\n\t"/* preload 5 source rows into mm0..mm4 */\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm1		\n\t"\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm2		\n\t"\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm3		\n\t"\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm4		\n\t"\
+        "add %2, %0			\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t"\
+        "punpcklbw %%mm7, %%mm1		\n\t"\
+        "punpcklbw %%mm7, %%mm2		\n\t"\
+        "punpcklbw %%mm7, %%mm3		\n\t"\
+        "punpcklbw %%mm7, %%mm4		\n\t"\
+        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)/* each step loads one new row and rotates the register roles by one */\
+        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+         \
+        : "+a"(src), "+c"(dst)\
+        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}\
+static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, uint8_t *src_unused_marker_removed
+\
+static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap filter over 8 rows of 8 pixels; same per-pixel formula as the 4-wide version, computed as two 4-word halves */\
+    int h=8;/* row counter, decremented inside the asm loop */\
+    asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t"/* mm7 = 0, used to widen bytes to words */\
+        "movq %5, %%mm6			\n\t"/* mm6 = ff_pw_5 */\
+        "1:				\n\t"\
+        "movq    (%0), %%mm0		\n\t"\
+        "movq   1(%0), %%mm2		\n\t"\
+        "movq %%mm0, %%mm1		\n\t"\
+        "movq %%mm2, %%mm3		\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t"\
+        "punpckhbw %%mm7, %%mm1		\n\t"\
+        "punpcklbw %%mm7, %%mm2		\n\t"\
+        "punpckhbw %%mm7, %%mm3		\n\t"\
+        "paddw %%mm2, %%mm0		\n\t"\
+        "paddw %%mm3, %%mm1		\n\t"\
+        "psllw $2, %%mm0		\n\t"\
+        "psllw $2, %%mm1		\n\t"/* mm0/mm1 = 4*(src[0]+src[1]) for the low/high 4 pixels */\
+        "movq   -1(%0), %%mm2		\n\t"\
+        "movq    2(%0), %%mm4		\n\t"\
+        "movq %%mm2, %%mm3		\n\t"\
+        "movq %%mm4, %%mm5		\n\t"\
+        "punpcklbw %%mm7, %%mm2		\n\t"\
+        "punpckhbw %%mm7, %%mm3		\n\t"/* mm3 = bytes at src+3..src+6, reused below as the +3 tap of the low half */\
+        "punpcklbw %%mm7, %%mm4		\n\t"/* mm4 = bytes at src+2..src+5, reused below as the -2 tap of the high half */\
+        "punpckhbw %%mm7, %%mm5		\n\t"\
+        "paddw %%mm4, %%mm2		\n\t"\
+        "paddw %%mm3, %%mm5		\n\t"/* mm2/mm5 = src[-1]+src[2] for the low/high 4 pixels */\
+        "psubw %%mm2, %%mm0		\n\t"\
+        "psubw %%mm5, %%mm1		\n\t"\
+        "pmullw %%mm6, %%mm0		\n\t"\
+        "pmullw %%mm6, %%mm1		\n\t"\
+        "movd   -2(%0), %%mm2		\n\t"\
+        "movd    7(%0), %%mm5		\n\t"\
+        "punpcklbw %%mm7, %%mm2		\n\t"\
+        "punpcklbw %%mm7, %%mm5		\n\t"\
+        "paddw %%mm3, %%mm2		\n\t"/* low half: src[-2] + src[3] */\
+        "paddw %%mm5, %%mm4		\n\t"/* high half: src[-2] + src[3] */\
+        "movq %6, %%mm5			\n\t"/* mm5 = ff_pw_16 (rounding term) */\
+        "paddw %%mm5, %%mm2		\n\t"\
+        "paddw %%mm5, %%mm4		\n\t"\
+        "paddw %%mm2, %%mm0		\n\t"\
+        "paddw %%mm4, %%mm1		\n\t"\
+        "psraw $5, %%mm0		\n\t"\
+        "psraw $5, %%mm1		\n\t"\
+        "packuswb %%mm1, %%mm0		\n\t"/* pack both halves into 8 unsigned bytes */\
+        OP(%%mm0, (%1),%%mm5, q)/* OP comes from the instantiation; presumably writes or merges mm0 into dst with mm5 as scratch -- confirm */\
+        "add %3, %0			\n\t"/* src += srcStride */\
+        "add %4, %1			\n\t"/* dst += dstStride */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"/* loop until h reaches 0 */\
+        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}\
+\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap filter producing 8 rows of 8 pixels, done as two passes over 4-pixel-wide columns */\
+    int h= 2;/* number of 4-pixel-wide column passes */\
+    src -= 2*srcStride;/* the filter window starts 2 rows above the first output row */\
+    \
+    while(h--){\
+      asm volatile(\
+        "pxor %%mm7, %%mm7		\n\t"/* mm7 = 0, used to widen bytes to words */\
+        "movd (%0), %%mm0		\n\t"/* preload 5 source rows into mm0..mm4 */\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm1		\n\t"\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm2		\n\t"\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm3		\n\t"\
+        "add %2, %0			\n\t"\
+        "movd (%0), %%mm4		\n\t"\
+        "add %2, %0			\n\t"\
+        "punpcklbw %%mm7, %%mm0		\n\t"\
+        "punpcklbw %%mm7, %%mm1		\n\t"\
+        "punpcklbw %%mm7, %%mm2		\n\t"\
+        "punpcklbw %%mm7, %%mm3		\n\t"\
+        "punpcklbw %%mm7, %%mm4		\n\t"\
+        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)/* 8 output rows; register roles rotate by one per row */\
+        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+        QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+        QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+         \
+        : "+a"(src), "+c"(dst)\
+        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+     );\
+     src += 4-13*srcStride;/* the asm advanced src by 13 rows (5 preloaded + 8 in QPEL_H264V); rewind and move 4 pixels right */\
+     dst +=  4-8*dstStride;/* the asm advanced dst by 8 rows; rewind and move 4 pixels right */\
+   }\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 2-D 6-tap filter for an 8x8 block: vertical stage into 16-bit tmp, then horizontal stage from tmp into dst; tmpStride is not used in this body, the tmp row pitch is hard-coded to 32 bytes */\
+    int h=8;/* row counter for the horizontal stage, decremented inside the asm loop */\
+    int w=4;/* number of 4-pixel-wide column passes in the vertical stage */\
+    src -= 2*srcStride+2;/* start 2 rows above and 2 pixels left of the block */\
+    while(w--){\
+        asm volatile(\
+            "pxor %%mm7, %%mm7			\n\t"/* mm7 = 0, used to widen bytes to words */\
+            "movd (%0), %%mm0			\n\t"/* preload 5 source rows into mm0..mm4 */\
+            "add %2, %0				\n\t"\
+            "movd (%0), %%mm1			\n\t"\
+            "add %2, %0				\n\t"\
+            "movd (%0), %%mm2			\n\t"\
+            "add %2, %0				\n\t"\
+            "movd (%0), %%mm3			\n\t"\
+            "add %2, %0				\n\t"\
+            "movd (%0), %%mm4			\n\t"\
+            "add %2, %0				\n\t"\
+            "punpcklbw %%mm7, %%mm0		\n\t"\
+            "punpcklbw %%mm7, %%mm1		\n\t"\
+            "punpcklbw %%mm7, %%mm2		\n\t"\
+            "punpcklbw %%mm7, %%mm3		\n\t"\
+            "punpcklbw %%mm7, %%mm4		\n\t"\
+            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)/* last argument is the byte offset of the output row in tmp: row * 32 */\
+            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
+            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
+            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
+            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
+            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
+            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
+            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
+             \
+            : "+a"(src)\
+            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+            : "memory"\
+        );\
+        tmp += 4;/* next group of 4 int16_t columns */\
+        src += 4 - 13*srcStride;/* the asm advanced src by 13 rows (5 preloaded + 8 in QPEL_H264HV); rewind and move 4 pixels right */\
+    }\
+    tmp -= 4*4;/* back to the first column of tmp */\
+    asm volatile(\
+        "movq %4, %%mm6			\n\t"/* mm6 = ff_pw_32 (rounding term) */\
+        "1:				\n\t"\
+        "movq     (%0), %%mm0		\n\t"\
+        "movq    8(%0), %%mm3		\n\t"\
+        "movq    2(%0), %%mm1		\n\t"\
+        "movq   10(%0), %%mm4		\n\t"\
+        "paddw   %%mm4, %%mm0		\n\t"/* low half a = t[0] + t[5] */\
+        "paddw   %%mm3, %%mm1		\n\t"/* low half b = t[1] + t[4] */\
+        "paddw  18(%0), %%mm3		\n\t"/* high half a = t[4] + t[9] */\
+        "paddw  16(%0), %%mm4		\n\t"/* high half b = t[5] + t[8] */\
+        "movq    4(%0), %%mm2		\n\t"\
+        "movq   12(%0), %%mm5		\n\t"\
+        "paddw   6(%0), %%mm2		\n\t"/* low half c = t[2] + t[3] */\
+        "paddw  14(%0), %%mm5		\n\t"/* high half c = t[6] + t[7] */\
+        "psubw %%mm1, %%mm0		\n\t"/* a-b */\
+        "psubw %%mm4, %%mm3		\n\t"\
+        "psraw $2, %%mm0		\n\t"/* (a-b)/4 */\
+        "psraw $2, %%mm3		\n\t"\
+        "psubw %%mm1, %%mm0		\n\t"/* (a-b)/4-b */\
+        "psubw %%mm4, %%mm3		\n\t"\
+        "paddsw %%mm2, %%mm0		\n\t"/* (a-b)/4-b+c, saturating add */\
+        "paddsw %%mm5, %%mm3		\n\t"\
+        "psraw $2, %%mm0		\n\t"/* ((a-b)/4-b+c)/4 */\
+        "psraw $2, %%mm3		\n\t"\
+        "paddw %%mm6, %%mm2		\n\t"/* c + rounding term */\
+        "paddw %%mm6, %%mm5		\n\t"\
+        "paddw %%mm2, %%mm0		\n\t"/* roughly (a - 5*b + 20*c)/16 + rounding term */\
+        "paddw %%mm5, %%mm3		\n\t"\
+        "psraw $6, %%mm0		\n\t"\
+        "psraw $6, %%mm3		\n\t"\
+        "packuswb %%mm3, %%mm0		\n\t"/* pack both halves into 8 unsigned bytes */\
+        OP(%%mm0, (%1),%%mm7, q)/* OP comes from the instantiation; presumably writes or merges mm0 into dst with mm7 as scratch -- confirm */\
+        "add $32, %0			\n\t"/* next tmp row (32 bytes) */\
+        "add %3, %1			\n\t"/* dst += dstStride */\
+        "decl %2			\n\t"\
+        " jnz 1b			\n\t"/* loop until h reaches 0 */\
+        : "+a"(tmp), "+c"(dst), "+m"(h)\
+        : "S"((long)dstStride), "m"(ff_pw_32)\
+        : "memory"\
+    );\
+}\
+static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
+    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
+    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
+}\
+
+#define H264_MC(OPNAME, SIZE, MMX) /* Emits the 16 OPNAME h264_qpel SIZE _mcXY_ MMX entry points; X and Y look like the horizontal/vertical quarter-sample offsets (inferred from the filters each variant uses) -- TODO confirm */ \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){ /* no filtering: forwards to the plain OPNAME pixels routine */\
+    OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass into scratch, then _l2-merged with src */\
+    uint64_t temp[SIZE*SIZE/8]; /* SIZE*SIZE bytes of scratch */\
+    uint8_t * const half= (uint8_t*)temp;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride); /* always the put_ variant; OPNAME is applied only by the final merge */\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass written straight to dst */\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* like mc10, but the lowpass result is merged with src+1 */\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass into scratch, then _l2-merged with src */\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass written straight to dst */\
+    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* like mc01, but the lowpass result is merged with src+stride */\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const half= (uint8_t*)temp;\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges horizontal lowpass of src with vertical lowpass of src */\
+    uint64_t temp[SIZE*SIZE/4]; /* two SIZE*SIZE-byte scratch blocks: halfH then halfV */\
+    uint8_t * const halfH= (uint8_t*)temp;\
+    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges horizontal lowpass of src with vertical lowpass of src+1 */\
+    uint64_t temp[SIZE*SIZE/4];\
+    uint8_t * const halfH= (uint8_t*)temp;\
+    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges horizontal lowpass of src+stride with vertical lowpass of src */\
+    uint64_t temp[SIZE*SIZE/4];\
+    uint8_t * const halfH= (uint8_t*)temp;\
+    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges horizontal lowpass of src+stride with vertical lowpass of src+1 */\
+    uint64_t temp[SIZE*SIZE/4];\
+    uint8_t * const halfH= (uint8_t*)temp;\
+    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* combined hv lowpass written straight to dst */\
+    uint64_t temp[SIZE*(SIZE+8)/4]; /* SIZE*(SIZE+8) int16_t entries of workspace for the hv filter */\
+    int16_t * const tmp= (int16_t*)temp;\
+    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges horizontal lowpass of src with hv lowpass of src */\
+    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4]; /* layout: halfH | halfHV | tmp */\
+    uint8_t * const halfH= (uint8_t*)temp;\
+    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
+    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE; /* int16_t offset SIZE*SIZE == byte offset 2*SIZE*SIZE, i.e. directly after halfHV */\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges horizontal lowpass of src+stride with hv lowpass of src */\
+    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4]; /* layout: halfH | halfHV | tmp */\
+    uint8_t * const halfH= (uint8_t*)temp;\
+    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
+    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges vertical lowpass of src with hv lowpass of src */\
+    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4]; /* layout: halfV | halfHV | tmp */\
+    uint8_t * const halfV= (uint8_t*)temp;\
+    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
+    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* merges vertical lowpass of src+1 with hv lowpass of src */\
+    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4]; /* layout: halfV | halfHV | tmp */\
+    uint8_t * const halfV= (uint8_t*)temp;\
+    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
+    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
+}\
+
 
 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "	\n\t"
 #define AVG_3DNOW_OP(a,b,temp, size) \
@@ -1983,6 +2799,24 @@ QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
 QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
 
+QPEL_H264(put_       ,       PUT_OP, 3dnow) /* lowpass primitives, one expansion per (put/avg, 3dnow/mmx2) pair */
+QPEL_H264(avg_       , AVG_3DNOW_OP, 3dnow)
+QPEL_H264(put_       ,       PUT_OP, mmx2)
+QPEL_H264(avg_       ,  AVG_MMX2_OP, mmx2)
+
+H264_MC(put_, 4, 3dnow) /* mcXY entry points for block sizes 4, 8 and 16, for each (put/avg, 3dnow/mmx2) pair */
+H264_MC(put_, 8, 3dnow)
+H264_MC(put_, 16,3dnow)
+H264_MC(avg_, 4, 3dnow)
+H264_MC(avg_, 8, 3dnow)
+H264_MC(avg_, 16,3dnow)
+H264_MC(put_, 4, mmx2)
+H264_MC(put_, 8, mmx2)
+H264_MC(put_, 16,mmx2)
+H264_MC(avg_, 4, mmx2)
+H264_MC(avg_, 8, mmx2)
+H264_MC(avg_, 16,mmx2)
+
 #if 0
 static void just_return() { return; }
 #endif
@@ -1993,7 +2827,7 @@ static void just_return() { return; }
     c->avg_ ## postfix1 = avg_ ## postfix2;
 
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
-    int i=0;
+    long i=0;
     
     assert(ABS(scale) < 256);
     scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
@@ -2025,8 +2859,8 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6
         "paddd %%mm1, %%mm0		\n\t"
         "psrld $4, %%mm0		\n\t"
         "paddd %%mm0, %%mm7		\n\t"
-        "addl $16, %0			\n\t"
-        "cmpl $128, %0			\n\t" //FIXME optimize & bench
+        "add $16, %0			\n\t"
+        "cmp $128, %0			\n\t" //FIXME optimize & bench
         " jb 1b				\n\t"
         "movq %%mm7, %%mm6		\n\t"
         "psrlq $32, %%mm7		\n\t"
@@ -2041,7 +2875,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6
 }
 
 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
-    int i=0;
+    long i=0;
     
     if(ABS(scale) < 256){
         scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
@@ -2064,8 +2898,8 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
                 "paddw 8(%2, %0), %%mm1		\n\t"
                 "movq %%mm0, (%2, %0)		\n\t"
                 "movq %%mm1, 8(%2, %0)		\n\t"
-                "addl $16, %0			\n\t"
-                "cmpl $128, %0			\n\t" //FIXME optimize & bench
+                "add $16, %0			\n\t"
+                "cmp $128, %0			\n\t" //FIXME optimize & bench
                 " jb 1b				\n\t"
                 
                 : "+r" (i)
@@ -2146,23 +2980,24 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             }
         }
 #endif //CONFIG_ENCODERS
-
-        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
-            c->idct_put= ff_simple_idct_put_mmx;
-            c->idct_add= ff_simple_idct_add_mmx;
-            c->idct    = ff_simple_idct_mmx;
-            c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
-        }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
-            if(mm_flags & MM_MMXEXT){
-                c->idct_put= ff_libmpeg2mmx2_idct_put;
-                c->idct_add= ff_libmpeg2mmx2_idct_add;
-                c->idct    = ff_mmxext_idct;
-            }else{
-                c->idct_put= ff_libmpeg2mmx_idct_put;
-                c->idct_add= ff_libmpeg2mmx_idct_add;
-                c->idct    = ff_mmx_idct;
+        if(avctx->lowres==0){
+            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
+                c->idct_put= ff_simple_idct_put_mmx;
+                c->idct_add= ff_simple_idct_add_mmx;
+                c->idct    = ff_simple_idct_mmx;
+                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
+                if(mm_flags & MM_MMXEXT){
+                    c->idct_put= ff_libmpeg2mmx2_idct_put;
+                    c->idct_add= ff_libmpeg2mmx2_idct_add;
+                    c->idct    = ff_mmxext_idct;
+                }else{
+                    c->idct_put= ff_libmpeg2mmx_idct_put;
+                    c->idct_add= ff_libmpeg2mmx_idct_add;
+                    c->idct    = ff_mmx_idct;
+                }
+                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
             }
-            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
         }
 
         /* VP3 optimized DSP functions */
@@ -2235,8 +3070,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         
 	c->pix_norm1 = pix_norm1_mmx;
 	c->sse[0] = sse16_mmx;
+  	c->sse[1] = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
 
+	c->nsse[0] = nsse16_mmx;
+	c->nsse[1] = nsse8_mmx;
         if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
             c->vsad[0] = vsad16_mmx;
         }
@@ -2319,6 +3157,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
 #endif
 
+//FIXME 3dnow too
+#define dspfunc(PFX, IDX, NUM) /* fills c->PFX_pixels_tab[IDX][0..15] with the mmx2 functions; slot index is X + 4*Y for _mcXY_ */ \
+    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
+    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
+    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
+    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
+    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
+    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
+    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
+    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
+
+            dspfunc(put_h264_qpel, 0, 16);
+            dspfunc(put_h264_qpel, 1, 8);
+            dspfunc(put_h264_qpel, 2, 4);
+            dspfunc(avg_h264_qpel, 0, 16);
+            dspfunc(avg_h264_qpel, 1, 8);
+            dspfunc(avg_h264_qpel, 2, 4);
+#undef dspfunc
+
 #ifdef CONFIG_ENCODERS
             c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
 #endif //CONFIG_ENCODERS
@@ -2378,6 +3243,31 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
             SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
             SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
+
+#define dspfunc(PFX, IDX, NUM) /* 3dnow counterpart: fills c->PFX_pixels_tab[IDX][0..15]; slot index is X + 4*Y for _mcXY_. NOTE(review): no matching #undef follows the uses below */ \
+    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
+    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
+    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
+    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
+    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
+    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
+    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
+    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
+
+            dspfunc(put_h264_qpel, 0, 16);
+            dspfunc(put_h264_qpel, 1, 8);
+            dspfunc(put_h264_qpel, 2, 4);
+            dspfunc(avg_h264_qpel, 0, 16);
+            dspfunc(avg_h264_qpel, 1, 8);
+            dspfunc(avg_h264_qpel, 2, 4);
         }
     }
         
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
index 052aad75c..c70891304 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
@@ -28,7 +28,7 @@
 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"1:				\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
@@ -36,59 +36,305 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
 	PAVGB" 1(%1, %3), %%mm1		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
-	"addl %%eax, %1			\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %1		\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
 	PAVGB" 1(%1), %%mm0		\n\t"
 	PAVGB" 1(%1, %3), %%mm1		\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
-static __attribute__((unused)) void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 4 bytes: dst = PAVGB(src1, src2). src1 steps by src1Stride, dst by dstStride, src2 by 4 bytes per row. PAVGB comes from the including file (presumably a packed byte average).
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the loop
+	    " jz 1f				\n\t"
+	"movd	(%1), %%mm0		\n\t"
+	"movd	(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$4, %2			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	"movd	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
+	"1:				\n\t" // main loop: 4 rows per pass
+	"movd	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movd	(%1), %%mm1		\n\t"
+	"movd	(%2), %%mm2		\n\t"
+	"movd	4(%2), %%mm3		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"movd	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movd	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movd	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movd	(%1), %%mm1		\n\t"
+	"movd	8(%2), %%mm2		\n\t"
+	"movd	12(%2), %%mm3		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"movd	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movd	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"add	$16, %2			\n\t" // four 4-byte src2 rows consumed
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t" // NOTE(review): exits only when h reaches exactly 0, so what is left after the odd row must be a non-zero multiple of 4
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+}
+
+
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 8 bytes: dst = PAVGB(src1, src2). src1 steps by src1Stride, dst by dstStride, src2 by 8 bytes per row. PAVGB comes from the including file (presumably a packed byte average).
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the loop
+	    " jz 1f				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$8, %2			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
+	"1:				\n\t" // main loop: 4 rows per pass
+	"movq	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" 16(%2), %%mm0		\n\t"
+	PAVGB" 24(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t" // four 8-byte src2 rows consumed
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t" // NOTE(review): exits only when h reaches exactly 0, so what is left after the odd row must be a non-zero multiple of 4
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+// The generic constraints below should be used, though better not with gcc ...
+/*	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");*/
+}
+
+static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 8 bytes: dst = ~PAVGB(~src1, ~src2); src2 is read as packed 8-byte rows. Complementing around PAVGB presumably flips its rounding direction (the "no_rnd" in the name) -- TODO confirm.
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"pcmpeqb %%mm6, %%mm6	\n\t" // mm6 = all ones; pxor with it complements every byte
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the loop
+	    " jz 1f				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$8, %2			\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
+	"1:				\n\t" // main loop: 4 rows per pass
+	"movq	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	(%2), %%mm2		\n\t"
+	"movq	8(%2), %%mm3		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"pxor %%mm6, %%mm2		\n\t"
+	"pxor %%mm6, %%mm3		\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	16(%2), %%mm2		\n\t"
+	"movq	24(%2), %%mm3		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"pxor %%mm6, %%mm2		\n\t"
+	"pxor %%mm6, %%mm3		\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t" // four 8-byte src2 rows consumed
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t" // NOTE(review): exits only when h reaches exactly 0, so what is left after the odd row must be a non-zero multiple of 4
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+// The generic constraints below should be used, though better not with gcc ...
+/*	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");*/
+}
+
+static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 4 bytes: dst = PAVGB(PAVGB(src1, src2), dst). src1 steps by src1Stride, dst by dstStride, src2 by 4 bytes per row. PAVGB comes from the including file (presumably a packed byte average).
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the loop
+	    " jz 1f				\n\t"
+	"movd	(%1), %%mm0		\n\t"
+	"movd	(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$4, %2			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" (%3), %%mm0		\n\t" // NOTE(review): memory-operand PAVGB reads 8 bytes at dst although only 4 are stored back
+	"movd	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
+	"1:				\n\t" // main loop: 4 rows per pass
+	"movd	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movd	(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" 4(%2), %%mm1		\n\t"
+	PAVGB" (%3), %%mm0	 	\n\t"
+	"movd	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	PAVGB" (%3), %%mm1	 	\n\t"
+	"movd	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movd	(%1), %%mm0		\n\t"
+	"add	%4, %1			\n\t"
+	"movd	(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" 8(%2), %%mm0		\n\t"
+	PAVGB" 12(%2), %%mm1		\n\t"
+	PAVGB" (%3), %%mm0	 	\n\t"
+	"movd	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	PAVGB" (%3), %%mm1	 	\n\t"
+	"movd	%%mm1, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"add	$16, %2			\n\t" // four 4-byte src2 rows consumed
+	"subl	$4, %0			\n\t"
+	"jnz	1b			\n\t" // NOTE(review): exits only when h reaches exactly 0, so what is left after the odd row must be a non-zero multiple of 4
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+}
+
+
+static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 8 bytes: dst = PAVGB(PAVGB(src1, src2), dst). src1 steps by src1Stride, dst by dstStride, src2 by 8 bytes per row. PAVGB comes from the including file (presumably a packed byte average).
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the 4-rows-per-pass loop at label 1
+	    " jz 1f				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$8, %2			\n\t"
+	PAVGB" %%mm1, %%mm0		\n\t"
+	PAVGB" (%3), %%mm0		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
 	"1:				\n\t"
 	"movq	(%1), %%mm0		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	"movq	(%1), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGB" (%2), %%mm0		\n\t"
 	PAVGB" 8(%2), %%mm1		\n\t"
+	PAVGB" (%3), %%mm0	 	\n\t"
 	"movq	%%mm0, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
+	PAVGB" (%3), %%mm1	 	\n\t"
 	"movq	%%mm1, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"movq	(%1), %%mm0		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	"movq	(%1), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGB" 16(%2), %%mm0		\n\t"
 	PAVGB" 24(%2), %%mm1		\n\t"
+	PAVGB" (%3), %%mm0	 	\n\t"
 	"movq	%%mm0, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
+	PAVGB" (%3), %%mm1	 	\n\t"
 	"movq	%%mm1, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
-        "addl	$32, %2			\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t" // four 8-byte src2 rows consumed; NOTE(review): the subl/jnz below exits only when h reaches exactly 0
 	"subl	$4, %0			\n\t"
 	"jnz	1b			\n\t"
-	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+// The generic constraints below should be used, though better not with gcc ...
+/*	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
 	:"r"(src1Stride), "r"(dstStride)
-	:"memory");
+	:"memory");*/
 }
 
 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"1:				\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
@@ -102,8 +348,8 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line
 	"movq %%mm1, (%2, %3)		\n\t"
 	"movq %%mm2, 8(%2)		\n\t"
 	"movq %%mm3, 8(%2, %3)		\n\t"
-	"addl %%eax, %1			\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %1		\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
 	"movq 8(%1), %%mm2		\n\t"
@@ -112,45 +358,192 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line
 	PAVGB" 1(%1, %3), %%mm1		\n\t"
 	PAVGB" 9(%1), %%mm2		\n\t"
 	PAVGB" 9(%1, %3), %%mm3		\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
 	"movq %%mm2, 8(%2)		\n\t"
 	"movq %%mm3, 8(%2, %3)		\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 16 bytes: dst = PAVGB(src1, src2). src1 steps by src1Stride, dst by dstStride, src2 by 16 bytes per row. PAVGB comes from the including file (presumably a packed byte average).
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the loop
+	    " jz 1f				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$16, %2			\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
+	"1:				\n\t" // main loop: 2 rows per pass
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	PAVGB" 16(%2), %%mm0		\n\t"
+	PAVGB" 24(%2), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t" // two 16-byte src2 rows consumed
+	"subl	$2, %0			\n\t"
+	"jnz	1b			\n\t" // NOTE(review): exits only when h reaches exactly 0; h == 1 would enter this loop with h == 0
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+// The generic constraints below should be used, though better not with gcc ...
+/*	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");*/
 }
 
-static __attribute__((unused)) void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) // For h rows of 16 bytes: dst = PAVGB(PAVGB(src1, src2), dst); src2 is read as packed 16-byte rows
 {
     __asm __volatile(
+	"testl $1, %0			\n\t" // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride; odd h gets one row processed before the 2-rows-per-pass loop
+	    " jz 1f				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	PAVGB" (%2), %%mm0		\n\t"
+	PAVGB" 8(%2), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$16, %2			\n\t"
+	PAVGB" (%3), %%mm0		\n\t"
+	PAVGB" 8(%3), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
 	"1:				\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	8(%1), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGB" (%2), %%mm0		\n\t"
 	PAVGB" 8(%2), %%mm1		\n\t"
+	PAVGB" (%3), %%mm0		\n\t"
+	PAVGB" 8(%3), %%mm1		\n\t"
 	"movq	%%mm0, (%3)		\n\t"
 	"movq	%%mm1, 8(%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	8(%1), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGB" 16(%2), %%mm0		\n\t"
 	PAVGB" 24(%2), %%mm1		\n\t"
+	PAVGB" (%3), %%mm0		\n\t"
+	PAVGB" 8(%3), %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t" // two 16-byte src2 rows consumed
+	"subl	$2, %0			\n\t"
+	"jnz	1b			\n\t" // NOTE(review): exits only when h reaches exactly 0; h == 1 would enter this loop with h == 0
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+// The generic constraints below should be used, though better not with gcc ...
+/*	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+	:"r"(src1Stride), "r"(dstStride)
+	:"memory");*/
+}
+
+static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{ // For h rows of 16 bytes: dst = ~PAVGB(~src1, ~src2); src2 is read as packed 16-byte rows. Complementing around PAVGB presumably flips its rounding direction (the "no_rnd" in the name) -- TODO confirm.
+    __asm __volatile( // operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride
+	"pcmpeqb %%mm6, %%mm6\n\t" // mm6 = all ones; pxor with it complements every byte
+	"testl $1, %0			\n\t" // odd h: process a single row before entering the loop
+	    " jz 1f				\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	"movq	(%2), %%mm2		\n\t"
+	"movq	8(%2), %%mm3		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"pxor %%mm6, %%mm2		\n\t"
+	"pxor %%mm6, %%mm3		\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"add	$16, %2			\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"decl	%0			\n\t"
+	"1:				\n\t" // main loop: 2 rows per pass
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	(%2), %%mm2		\n\t"
+	"movq	8(%2), %%mm3		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"pxor %%mm6, %%mm2		\n\t"
+	"pxor %%mm6, %%mm3		\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"movq	%%mm0, (%3)		\n\t"
+	"movq	%%mm1, 8(%3)		\n\t"
+	"add	%5, %3			\n\t"
+	"movq	(%1), %%mm0		\n\t"
+	"movq	8(%1), %%mm1		\n\t"
+	"add	%4, %1			\n\t"
+	"movq	16(%2), %%mm2		\n\t"
+	"movq	24(%2), %%mm3		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
+	"pxor %%mm6, %%mm2		\n\t"
+	"pxor %%mm6, %%mm3		\n\t"
+	PAVGB" %%mm2, %%mm0		\n\t"
+	PAVGB" %%mm3, %%mm1		\n\t"
+	"pxor %%mm6, %%mm0		\n\t"
+	"pxor %%mm6, %%mm1		\n\t"
 	"movq	%%mm0, (%3)		\n\t"
 	"movq	%%mm1, 8(%3)		\n\t"
-	"addl	%5, %3			\n\t"
-        "addl	$32, %2			\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t" // two 16-byte src2 rows consumed; NOTE(review): the subl/jnz below exits only when h reaches exactly 0
 	"subl	$2, %0			\n\t"
 	"jnz	1b			\n\t"
-	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+#ifdef PIC // Note: "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used
+	:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+	:"S"((long)src1Stride), "D"((long)dstStride)
+	:"memory"); 
+// The generic constraints below should be used, though better not with gcc ...
+/*	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
 	:"r"(src1Stride), "r"(dstStride)
-	:"memory");
+	:"memory");*/
 }
  
 /* GL: this function does incorrect rounding if overflow */
@@ -158,13 +551,13 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"1:				\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm2		\n\t"
 	"movq 1(%1), %%mm1		\n\t"
 	"movq 1(%1, %3), %%mm3		\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"psubusb %%mm6, %%mm0		\n\t"
 	"psubusb %%mm6, %%mm2		\n\t"
 	PAVGB" %%mm1, %%mm0		\n\t"
@@ -175,50 +568,50 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in
 	"movq 1(%1), %%mm1		\n\t"
 	"movq (%1, %3), %%mm2		\n\t"
 	"movq 1(%1, %3), %%mm3		\n\t"
-	"addl %%eax, %2			\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %2		\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"psubusb %%mm6, %%mm0		\n\t"
 	"psubusb %%mm6, %%mm2		\n\t"
 	PAVGB" %%mm1, %%mm0		\n\t"
 	PAVGB" %%mm3, %%mm2		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm2, (%2, %3)		\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"movq (%1), %%mm0		\n\t"
-	"subl %3, %2			\n\t"
+	"sub %3, %2			\n\t"
 	"1:				\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm2	\n\t"
-	"addl %%eax, %1			\n\t"
+	"movq (%1, %%"REG_a"), %%mm2	\n\t"
+	"add %%"REG_a", %1		\n\t"
 	PAVGB" %%mm1, %%mm0		\n\t"
 	PAVGB" %%mm2, %%mm1		\n\t"
 	"movq %%mm0, (%2, %3)		\n\t"
-	"movq %%mm1, (%2, %%eax)	\n\t"
+	"movq %%mm1, (%2, %%"REG_a")	\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm0	\n\t"
-	"addl %%eax, %2			\n\t"
-	"addl %%eax, %1			\n\t"
+	"movq (%1, %%"REG_a"), %%mm0	\n\t"
+	"add %%"REG_a", %2		\n\t"
+	"add %%"REG_a", %1		\n\t"
 	PAVGB" %%mm1, %%mm2		\n\t"
 	PAVGB" %%mm0, %%mm1		\n\t"
 	"movq %%mm2, (%2, %3)		\n\t"
-	"movq %%mm1, (%2, %%eax)	\n\t"
-	"addl %%eax, %2			\n\t"
+	"movq %%mm1, (%2, %%"REG_a")	\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D" (block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
 /* GL: this function does incorrect rounding if overflow */
@@ -226,39 +619,39 @@ static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, in
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"movq (%1), %%mm0		\n\t"
-	"subl %3, %2			\n\t"
+	"sub %3, %2			\n\t"
 	"1:				\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm2	\n\t"
-	"addl %%eax, %1			\n\t"
+	"movq (%1, %%"REG_a"), %%mm2	\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"psubusb %%mm6, %%mm1		\n\t"
 	PAVGB" %%mm1, %%mm0		\n\t"
 	PAVGB" %%mm2, %%mm1		\n\t"
 	"movq %%mm0, (%2, %3)		\n\t"
-	"movq %%mm1, (%2, %%eax)	\n\t"
+	"movq %%mm1, (%2, %%"REG_a")	\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm0	\n\t"
-	"addl %%eax, %2			\n\t"
-	"addl %%eax, %1			\n\t"
+	"movq (%1, %%"REG_a"), %%mm0	\n\t"
+	"add %%"REG_a", %2		\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"psubusb %%mm6, %%mm1		\n\t"
 	PAVGB" %%mm1, %%mm2		\n\t"
 	PAVGB" %%mm0, %%mm1		\n\t"
 	"movq %%mm2, (%2, %3)		\n\t"
-	"movq %%mm1, (%2, %%eax)	\n\t"
-	"addl %%eax, %2			\n\t"
+	"movq %%mm1, (%2, %%"REG_a")	\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D" (block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"1:				\n\t"
 	"movq (%2), %%mm0		\n\t"
 	"movq (%2, %3), %%mm1		\n\t"
@@ -266,27 +659,27 @@ static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_siz
 	PAVGB" (%1, %3), %%mm1		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
-	"addl %%eax, %1			\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %1		\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"movq (%2), %%mm0		\n\t"
 	"movq (%2, %3), %%mm1		\n\t"
 	PAVGB" (%1), %%mm0		\n\t"
 	PAVGB" (%1, %3), %%mm1		\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"1:				\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm2		\n\t"
@@ -294,63 +687,63 @@ static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
 	PAVGB" 1(%1, %3), %%mm2		\n\t"
 	PAVGB" (%2), %%mm0		\n\t"
 	PAVGB" (%2, %3), %%mm2		\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm2, (%2, %3)		\n\t"
 	"movq (%1), %%mm0		\n\t"
 	"movq (%1, %3), %%mm2		\n\t"
 	PAVGB" 1(%1), %%mm0		\n\t"
 	PAVGB" 1(%1, %3), %%mm2		\n\t"
-	"addl %%eax, %2			\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %2		\n\t"
+	"add %%"REG_a", %1		\n\t"
 	PAVGB" (%2), %%mm0		\n\t"
 	PAVGB" (%2, %3), %%mm2		\n\t"
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm2, (%2, %3)		\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"movq (%1), %%mm0		\n\t"
-	"subl %3, %2			\n\t"
+	"sub %3, %2			\n\t"
 	"1:				\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm2	\n\t"
-	"addl %%eax, %1			\n\t"
+	"movq (%1, %%"REG_a"), %%mm2	\n\t"
+	"add %%"REG_a", %1		\n\t"
 	PAVGB" %%mm1, %%mm0		\n\t"
 	PAVGB" %%mm2, %%mm1		\n\t"
 	"movq (%2, %3), %%mm3		\n\t"
-	"movq (%2, %%eax), %%mm4	\n\t"
+	"movq (%2, %%"REG_a"), %%mm4	\n\t"
 	PAVGB" %%mm3, %%mm0		\n\t"
 	PAVGB" %%mm4, %%mm1		\n\t"
 	"movq %%mm0, (%2, %3)		\n\t"
-	"movq %%mm1, (%2, %%eax)	\n\t"
+	"movq %%mm1, (%2, %%"REG_a")	\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq (%1, %%"REG_a"), %%mm0	\n\t"
 	PAVGB" %%mm1, %%mm2		\n\t"
 	PAVGB" %%mm0, %%mm1		\n\t"
-	"addl %%eax, %2			\n\t"
-	"addl %%eax, %1			\n\t"
+	"add %%"REG_a", %2		\n\t"
+	"add %%"REG_a", %1		\n\t"
 	"movq (%2, %3), %%mm3		\n\t"
-	"movq (%2, %%eax), %%mm4	\n\t"
+	"movq (%2, %%"REG_a"), %%mm4	\n\t"
 	PAVGB" %%mm3, %%mm2		\n\t"
 	PAVGB" %%mm4, %%mm1		\n\t"
 	"movq %%mm2, (%2, %3)		\n\t"
-	"movq %%mm1, (%2, %%eax)	\n\t"
-	"addl %%eax, %2			\n\t"
+	"movq %%mm1, (%2, %%"REG_a")	\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax", "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a, "memory");
 }
 
 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter 
@@ -358,17 +751,17 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"movq (%1), %%mm0		\n\t"
 	PAVGB" 1(%1), %%mm0		\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
-	"movq (%1, %%eax), %%mm2	\n\t"
+	"movq (%1, %%"REG_a"), %%mm2	\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
 	"psubusb %%mm6, %%mm2		\n\t"
 	PAVGB" 1(%1, %3), %%mm1		\n\t"
-	PAVGB" 1(%1, %%eax), %%mm2	\n\t"
-	"addl %%eax, %1			\n\t"
+	PAVGB" 1(%1, %%"REG_a"), %%mm2	\n\t"
+	"add %%"REG_a", %1		\n\t"
 	PAVGB" %%mm1, %%mm0		\n\t"
 	PAVGB" %%mm2, %%mm1		\n\t"
 	PAVGB" (%2), %%mm0		\n\t"
@@ -376,23 +769,23 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
 	"movq %%mm0, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
 	"movq (%1, %3), %%mm1		\n\t"
-	"movq (%1, %%eax), %%mm0	\n\t"
+	"movq (%1, %%"REG_a"), %%mm0	\n\t"
 	PAVGB" 1(%1, %3), %%mm1		\n\t"
-	PAVGB" 1(%1, %%eax), %%mm0	\n\t"
-	"addl %%eax, %2			\n\t"
-	"addl %%eax, %1			\n\t"
+	PAVGB" 1(%1, %%"REG_a"), %%mm0	\n\t"
+	"add %%"REG_a", %2		\n\t"
+	"add %%"REG_a", %1		\n\t"
 	PAVGB" %%mm1, %%mm2		\n\t"
 	PAVGB" %%mm0, %%mm1		\n\t"
 	PAVGB" (%2), %%mm2		\n\t"
 	PAVGB" (%2, %3), %%mm1		\n\t"
 	"movq %%mm2, (%2)		\n\t"
 	"movq %%mm1, (%2, %3)		\n\t"
-	"addl %%eax, %2			\n\t"
+	"add %%"REG_a", %2		\n\t"
 	"subl $4, %0			\n\t"
 	"jnz 1b				\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r" (line_size)
-	:"%eax",  "memory");
+	:"r" ((long)line_size)
+	:"%"REG_a,  "memory");
 }
 
 //FIXME the following could be optimized too ...
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
index 1b79aa56a..20ea1b59e 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
@@ -27,7 +27,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
-	"lea	(%3, %3), %%eax		\n\t"
+	"lea	(%3, %3), %%"REG_a"	\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
 	"movq	(%1), %%mm0		\n\t"
@@ -37,8 +37,8 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%2)		\n\t"
 	"movq	%%mm5, (%2, %3)		\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	1(%1), %%mm1		\n\t"
 	"movq	(%1, %3), %%mm2		\n\t"
@@ -46,13 +46,13 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%2)		\n\t"
 	"movq	%%mm5, (%2, %3)		\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 	"subl	$4, %0			\n\t"
 	"jnz	1b			\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r"(line_size)
-	:"eax", "memory");
+	:"r"((long)line_size)
+	:REG_a, "memory");
 }
 
 static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
@@ -63,37 +63,37 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
         " jz 1f				\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	(%2), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
-        "addl	$8, %2			\n\t"
+	"add	%4, %1			\n\t"
+	"add	$8, %2			\n\t"
 	PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
 	"movq	%%mm4, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
         "decl	%0			\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	(%2), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	"movq	(%1), %%mm2		\n\t"
 	"movq	8(%2), %%mm3		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"movq	%%mm5, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	16(%2), %%mm1		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	"movq	(%1), %%mm2		\n\t"
 	"movq	24(%2), %%mm3		\n\t"
-	"addl	%4, %1			\n\t"
-	"addl	$32, %2			\n\t"
+	"add	%4, %1			\n\t"
+	"add	$32, %2			\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"movq	%%mm5, (%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"subl	$4, %0			\n\t"
 	"jnz	1b			\n\t"
 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@@ -101,7 +101,7 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
 #else
         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
 #endif
-	:"S"(src1Stride), "D"(dstStride)
+	:"S"((long)src1Stride), "D"((long)dstStride)
 	:"memory");
 }
 
@@ -109,7 +109,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
-	"lea	(%3, %3), %%eax		\n\t"
+	"lea	(%3, %3), %%"REG_a"	\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
 	"movq	(%1), %%mm0		\n\t"
@@ -126,8 +126,8 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, 8(%2)		\n\t"
 	"movq	%%mm5, 8(%2, %3)	\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	1(%1), %%mm1		\n\t"
 	"movq	(%1, %3), %%mm2		\n\t"
@@ -142,13 +142,13 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, 8(%2)		\n\t"
 	"movq	%%mm5, 8(%2, %3)	\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 	"subl	$4, %0			\n\t"
 	"jnz	1b			\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r"(line_size)
-	:"eax", "memory");
+	:"r"((long)line_size)
+	:REG_a, "memory");
 }
 
 static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
@@ -161,12 +161,12 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
 	"movq	(%2), %%mm1		\n\t"
 	"movq	8(%1), %%mm2		\n\t"
 	"movq	8(%2), %%mm3		\n\t"
-	"addl	%4, %1			\n\t"
-	"addl	$16, %2			\n\t"
+	"add	%4, %1			\n\t"
+	"add	$16, %2			\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%3)		\n\t"
 	"movq	%%mm5, 8(%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"decl	%0			\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
@@ -174,21 +174,21 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
 	"movq	(%2), %%mm1		\n\t"
 	"movq	8(%1), %%mm2		\n\t"
 	"movq	8(%2), %%mm3		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%3)		\n\t"
 	"movq	%%mm5, 8(%3)		\n\t"
-	"addl	%5, %3			\n\t"
+	"add	%5, %3			\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	"movq	16(%2), %%mm1		\n\t"
 	"movq	8(%1), %%mm2		\n\t"
 	"movq	24(%2), %%mm3		\n\t"
-	"addl	%4, %1			\n\t"
+	"add	%4, %1			\n\t"
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
 	"movq	%%mm4, (%3)		\n\t"
 	"movq	%%mm5, 8(%3)		\n\t"
-	"addl	%5, %3			\n\t"
-	"addl	$32, %2			\n\t"
+	"add	%5, %3			\n\t"
+	"add	$32, %2			\n\t"
 	"subl	$2, %0			\n\t"
 	"jnz	1b			\n\t"
 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@@ -196,7 +196,7 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
 #else
 	:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
 #endif
-	:"S"(src1Stride), "D"(dstStride)
+	:"S"((long)src1Stride), "D"((long)dstStride)
 	:"memory"); 
 }
 
@@ -204,29 +204,29 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
-	"lea (%3, %3), %%eax		\n\t"
+	"lea (%3, %3), %%"REG_a"	\n\t"
 	"movq (%1), %%mm0		\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
 	"movq	(%1, %3), %%mm1		\n\t"
-	"movq	(%1, %%eax),%%mm2	\n\t"
+	"movq	(%1, %%"REG_a"),%%mm2	\n\t"
 	PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
 	"movq	%%mm4, (%2)		\n\t"
 	"movq	%%mm5, (%2, %3)		\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 	"movq	(%1, %3), %%mm1		\n\t"
-	"movq	(%1, %%eax),%%mm0	\n\t"
+	"movq	(%1, %%"REG_a"),%%mm0	\n\t"
 	PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
 	"movq	%%mm4, (%2)		\n\t"
 	"movq	%%mm5, (%2, %3)		\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 	"subl	$4, %0			\n\t"
 	"jnz	1b			\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r"(line_size)
-	:"eax", "memory");
+	:"r"((long)line_size)
+	:REG_a, "memory");
 }
 
 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@@ -244,12 +244,12 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 	"punpckhbw %%mm7, %%mm5		\n\t"
 	"paddusw %%mm0, %%mm4		\n\t"
 	"paddusw %%mm1, %%mm5		\n\t"
-	"xorl	%%eax, %%eax		\n\t"
-	"addl	%3, %1			\n\t"
+	"xor	%%"REG_a", %%"REG_a"	\n\t"
+	"add	%3, %1			\n\t"
 	".balign 8      		\n\t"
 	"1:				\n\t"
-	"movq	(%1, %%eax), %%mm0	\n\t"
-	"movq	1(%1, %%eax), %%mm2	\n\t"
+	"movq	(%1, %%"REG_a"), %%mm0	\n\t"
+	"movq	1(%1, %%"REG_a"), %%mm2	\n\t"
 	"movq	%%mm0, %%mm1		\n\t"
 	"movq	%%mm2, %%mm3		\n\t"
 	"punpcklbw %%mm7, %%mm0		\n\t"
@@ -265,11 +265,11 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 	"psrlw	$2, %%mm4		\n\t"
 	"psrlw	$2, %%mm5		\n\t"
 	"packuswb  %%mm5, %%mm4		\n\t"
-	"movq	%%mm4, (%2, %%eax)	\n\t"
-	"addl	%3, %%eax		\n\t"
+	"movq	%%mm4, (%2, %%"REG_a")	\n\t"
+	"add	%3, %%"REG_a"		\n\t"
 
-	"movq	(%1, %%eax), %%mm2	\n\t" // 0 <-> 2   1 <-> 3
-	"movq	1(%1, %%eax), %%mm4	\n\t"
+	"movq	(%1, %%"REG_a"), %%mm2	\n\t" // 0 <-> 2   1 <-> 3
+	"movq	1(%1, %%"REG_a"), %%mm4	\n\t"
 	"movq	%%mm2, %%mm3		\n\t"
 	"movq	%%mm4, %%mm5		\n\t"
 	"punpcklbw %%mm7, %%mm2		\n\t"
@@ -285,17 +285,36 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 	"psrlw	$2, %%mm0		\n\t"
 	"psrlw	$2, %%mm1		\n\t"
 	"packuswb  %%mm1, %%mm0		\n\t"
-	"movq	%%mm0, (%2, %%eax)	\n\t"
-	"addl	%3, %%eax		\n\t"
+	"movq	%%mm0, (%2, %%"REG_a")	\n\t"
+	"add	%3, %%"REG_a"		\n\t"
 
 	"subl	$2, %0			\n\t"
 	"jnz	1b			\n\t"
 	:"+g"(h), "+S"(pixels)
-	:"D"(block), "r"(line_size)
-	:"eax", "memory");
+	:"D"(block), "r"((long)line_size)
+	:REG_a, "memory");
 }
 
 // avg_pixels
+static void DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{ // For each of h rows: read 4 bytes from block and from pixels, combine them with PAVGB, store 4 bytes back to block.
+    MOVQ_BFE(mm6); // macro defined outside this chunk; presumably loads the constant PAVGB needs into mm6 - TODO confirm
+    JUMPALIGN(); // macro defined outside this chunk; NOTE(review): looks like loop-entry alignment - confirm
+    do { // the exit test runs after the body, so h == 0 would not stop the loop; callers must pass h >= 1
+	__asm __volatile(
+	     "movd  %0, %%mm0		\n\t" // mm0 = 32 bits currently stored at block
+	     "movd  %1, %%mm1		\n\t" // mm1 = 32 bits read from pixels
+	     PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) // macro from outside this chunk; presumably mm2 = average of mm0 and mm1 - TODO confirm
+	     "movd  %%mm2, %0		\n\t" // write the 32-bit value in mm2 back to block
+	     :"+m"(*block) // read-write memory operand; typed as a single uint8_t although movd transfers 4 bytes
+	     :"m"(*pixels) // read-only memory operand
+	     :"memory");
+	pixels += line_size; // source and destination advance by the same stride
+	block += line_size;
+    }
+    while (--h);
+}
+
 // in case more speed is needed - unroling would certainly help
 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
@@ -437,12 +456,12 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
-	"lea	(%3, %3), %%eax		\n\t"
+	"lea	(%3, %3), %%"REG_a"	\n\t"
 	"movq	(%1), %%mm0		\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
 	"movq	(%1, %3), %%mm1		\n\t"
-	"movq	(%1, %%eax), %%mm2	\n\t"
+	"movq	(%1, %%"REG_a"), %%mm2	\n\t"
 	PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
 	"movq	(%2), %%mm3		\n\t"
 	PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
@@ -450,11 +469,11 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
 	PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
 	"movq	%%mm0, (%2)		\n\t"
 	"movq	%%mm1, (%2, %3)		\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 
 	"movq	(%1, %3), %%mm1		\n\t"
-	"movq	(%1, %%eax), %%mm0	\n\t"
+	"movq	(%1, %%"REG_a"), %%mm0	\n\t"
 	PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
 	"movq	(%2), %%mm3		\n\t"
 	PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
@@ -462,14 +481,14 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
 	PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
 	"movq	%%mm2, (%2)		\n\t"
 	"movq	%%mm1, (%2, %3)		\n\t"
-	"addl	%%eax, %1		\n\t"
-	"addl	%%eax, %2		\n\t"
+	"add	%%"REG_a", %1		\n\t"
+	"add	%%"REG_a", %2		\n\t"
 
 	"subl	$4, %0			\n\t"
 	"jnz	1b			\n\t"
 	:"+g"(h), "+S"(pixels), "+D"(block)
-	:"r"(line_size)
-	:"eax", "memory");
+	:"r"((long)line_size)
+	:REG_a, "memory");
 }
 
 // this routine is 'slightly' suboptimal but mostly unused
@@ -488,12 +507,12 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 	"punpckhbw %%mm7, %%mm5		\n\t"
 	"paddusw %%mm0, %%mm4		\n\t"
 	"paddusw %%mm1, %%mm5		\n\t"
-	"xorl	%%eax, %%eax		\n\t"
-	"addl	%3, %1			\n\t"
+	"xor	%%"REG_a", %%"REG_a"	\n\t"
+	"add	%3, %1			\n\t"
 	".balign 8			\n\t"
 	"1:				\n\t"
-	"movq	(%1, %%eax), %%mm0	\n\t"
-	"movq	1(%1, %%eax), %%mm2	\n\t"
+	"movq	(%1, %%"REG_a"), %%mm0	\n\t"
+	"movq	1(%1, %%"REG_a"), %%mm2	\n\t"
 	"movq	%%mm0, %%mm1		\n\t"
 	"movq	%%mm2, %%mm3		\n\t"
 	"punpcklbw %%mm7, %%mm0		\n\t"
@@ -508,16 +527,16 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 	"paddusw %%mm1, %%mm5		\n\t"
 	"psrlw	$2, %%mm4		\n\t"
 	"psrlw	$2, %%mm5		\n\t"
-		"movq	(%2, %%eax), %%mm3	\n\t"
+		"movq	(%2, %%"REG_a"), %%mm3	\n\t"
 	"packuswb  %%mm5, %%mm4		\n\t"
 		"pcmpeqd %%mm2, %%mm2	\n\t"
 		"paddb %%mm2, %%mm2	\n\t"
 		PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
-		"movq	%%mm5, (%2, %%eax)	\n\t"
-	"addl	%3, %%eax		\n\t"
+		"movq	%%mm5, (%2, %%"REG_a")	\n\t"
+	"add	%3, %%"REG_a"		\n\t"
 
-	"movq	(%1, %%eax), %%mm2	\n\t" // 0 <-> 2   1 <-> 3
-	"movq	1(%1, %%eax), %%mm4	\n\t"
+	"movq	(%1, %%"REG_a"), %%mm2	\n\t" // 0 <-> 2   1 <-> 3
+	"movq	1(%1, %%"REG_a"), %%mm4	\n\t"
 	"movq	%%mm2, %%mm3		\n\t"
 	"movq	%%mm4, %%mm5		\n\t"
 	"punpcklbw %%mm7, %%mm2		\n\t"
@@ -532,19 +551,19 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 	"paddusw %%mm5, %%mm1		\n\t"
 	"psrlw	$2, %%mm0		\n\t"
 	"psrlw	$2, %%mm1		\n\t"
-		"movq	(%2, %%eax), %%mm3	\n\t"
+		"movq	(%2, %%"REG_a"), %%mm3	\n\t"
 	"packuswb  %%mm1, %%mm0		\n\t"
 		"pcmpeqd %%mm2, %%mm2	\n\t"
 		"paddb %%mm2, %%mm2	\n\t"
 		PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
-		"movq	%%mm1, (%2, %%eax)	\n\t"
-	"addl	%3, %%eax		\n\t"
+		"movq	%%mm1, (%2, %%"REG_a")	\n\t"
+	"add	%3, %%"REG_a"		\n\t"
 
 	"subl	$2, %0			\n\t"
 	"jnz	1b			\n\t"
 	:"+g"(h), "+S"(pixels)
-	:"D"(block), "r"(line_size)
-	:"eax", "memory");
+	:"D"(block), "r"((long)line_size)
+	:REG_a, "memory");
 }
 
 //FIXME optimize
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index 7f348329a..aacbe5743 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -47,13 +47,13 @@ static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
     23170, 23170, 23170, 23170,	//cos * (2<<15) + 0.5
 };
 
-static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
 
-static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
+static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
 
 struct 
 {
- const long fdct_r_row_sse2[4] ATTR_ALIGN(16);
+ const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
 } fdct_r_row_sse2 ATTR_ALIGN(16)=
 {{
  RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h
index ad684bc5a..f0ef1b79e 100644
--- a/src/libffmpeg/libavcodec/i386/mmx.h
+++ b/src/libffmpeg/libavcodec/i386/mmx.h
@@ -5,6 +5,12 @@
 #ifndef AVCODEC_I386MMX_H
 #define AVCODEC_I386MMX_H
 
+#ifdef ARCH_X86_64
+#  define REG_a "rax"
+#else
+#  define REG_a "eax"
+#endif
+
 /*
  * The type of an value that fits in an MMX register (note that long
  * long constant values MUST be suffixed by LL and unsigned long long
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index 39246d905..1b90f8e40 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -20,6 +20,7 @@
  * mostly by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "../dsputil.h"
+#include "mmx.h"
 
 static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
 0x0000000000000000ULL,
@@ -31,19 +32,19 @@ static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x010101010101
 
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-    int len= -(stride*h);
+    long len= -(stride*h);
     asm volatile(
         ".balign 16			\n\t"
         "1:				\n\t"
-        "movq (%1, %%eax), %%mm0	\n\t"
-        "movq (%2, %%eax), %%mm2	\n\t"
-        "movq (%2, %%eax), %%mm4	\n\t"
-        "addl %3, %%eax			\n\t"
+        "movq (%1, %%"REG_a"), %%mm0	\n\t"
+        "movq (%2, %%"REG_a"), %%mm2	\n\t"
+        "movq (%2, %%"REG_a"), %%mm4	\n\t"
+        "add %3, %%"REG_a"		\n\t"
         "psubusb %%mm0, %%mm2		\n\t"
         "psubusb %%mm4, %%mm0		\n\t"
-        "movq (%1, %%eax), %%mm1	\n\t"
-        "movq (%2, %%eax), %%mm3	\n\t"
-        "movq (%2, %%eax), %%mm5	\n\t"
+        "movq (%1, %%"REG_a"), %%mm1	\n\t"
+        "movq (%2, %%"REG_a"), %%mm3	\n\t"
+        "movq (%2, %%"REG_a"), %%mm5	\n\t"
         "psubusb %%mm1, %%mm3		\n\t"
         "psubusb %%mm5, %%mm1		\n\t"
         "por %%mm2, %%mm0		\n\t"
@@ -58,116 +59,116 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         "paddw %%mm3, %%mm2		\n\t"
         "paddw %%mm2, %%mm0		\n\t"
         "paddw %%mm0, %%mm6		\n\t"
-        "addl %3, %%eax			\n\t"
+        "add %3, %%"REG_a"		\n\t"
         " js 1b				\n\t"
         : "+a" (len)
-        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
     );
 }
 
 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-    int len= -(stride*h);
+    long len= -(stride*h);
     asm volatile(
         ".balign 16			\n\t"
         "1:				\n\t"
-        "movq (%1, %%eax), %%mm0	\n\t"
-        "movq (%2, %%eax), %%mm2	\n\t"
+        "movq (%1, %%"REG_a"), %%mm0	\n\t"
+        "movq (%2, %%"REG_a"), %%mm2	\n\t"
         "psadbw %%mm2, %%mm0		\n\t"
-        "addl %3, %%eax			\n\t"
-        "movq (%1, %%eax), %%mm1	\n\t"
-        "movq (%2, %%eax), %%mm3	\n\t"
+        "add %3, %%"REG_a"		\n\t"
+        "movq (%1, %%"REG_a"), %%mm1	\n\t"
+        "movq (%2, %%"REG_a"), %%mm3	\n\t"
         "psadbw %%mm1, %%mm3		\n\t"
         "paddw %%mm3, %%mm0		\n\t"
         "paddw %%mm0, %%mm6		\n\t"
-        "addl %3, %%eax			\n\t"
+        "add %3, %%"REG_a"		\n\t"
         " js 1b				\n\t"
         : "+a" (len)
-        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
+        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
     );
 }
 
 static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
-    int len= -(stride*h);
+    long len= -(stride*h);
     asm volatile(
         ".balign 16			\n\t"
         "1:				\n\t"
-        "movq (%1, %%eax), %%mm0	\n\t"
-        "movq (%2, %%eax), %%mm2	\n\t"
+        "movq (%1, %%"REG_a"), %%mm0	\n\t"
+        "movq (%2, %%"REG_a"), %%mm2	\n\t"
         "pavgb %%mm2, %%mm0		\n\t"
-        "movq (%3, %%eax), %%mm2	\n\t"
+        "movq (%3, %%"REG_a"), %%mm2	\n\t"
         "psadbw %%mm2, %%mm0		\n\t"
-        "addl %4, %%eax			\n\t"
-        "movq (%1, %%eax), %%mm1	\n\t"
-        "movq (%2, %%eax), %%mm3	\n\t"
+        "add %4, %%"REG_a"		\n\t"
+        "movq (%1, %%"REG_a"), %%mm1	\n\t"
+        "movq (%2, %%"REG_a"), %%mm3	\n\t"
         "pavgb %%mm1, %%mm3		\n\t"
-        "movq (%3, %%eax), %%mm1	\n\t"
+        "movq (%3, %%"REG_a"), %%mm1	\n\t"
         "psadbw %%mm1, %%mm3		\n\t"
         "paddw %%mm3, %%mm0		\n\t"
         "paddw %%mm0, %%mm6		\n\t"
-        "addl %4, %%eax			\n\t"
+        "add %4, %%"REG_a"		\n\t"
         " js 1b				\n\t"
         : "+a" (len)
-        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
     );
 }
 
 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 { //FIXME reuse src
-    int len= -(stride*h);
+    long len= -(stride*h);
     asm volatile(
         ".balign 16			\n\t"
         "movq "MANGLE(bone)", %%mm5	\n\t"
         "1:				\n\t"
-        "movq (%1, %%eax), %%mm0	\n\t"
-        "movq (%2, %%eax), %%mm2	\n\t"
-        "movq 1(%1, %%eax), %%mm1	\n\t"
-        "movq 1(%2, %%eax), %%mm3	\n\t"
+        "movq (%1, %%"REG_a"), %%mm0	\n\t"
+        "movq (%2, %%"REG_a"), %%mm2	\n\t"
+        "movq 1(%1, %%"REG_a"), %%mm1	\n\t"
+        "movq 1(%2, %%"REG_a"), %%mm3	\n\t"
         "pavgb %%mm2, %%mm0		\n\t"
         "pavgb %%mm1, %%mm3		\n\t"
         "psubusb %%mm5, %%mm3		\n\t"
         "pavgb %%mm3, %%mm0		\n\t"
-        "movq (%3, %%eax), %%mm2	\n\t"
+        "movq (%3, %%"REG_a"), %%mm2	\n\t"
         "psadbw %%mm2, %%mm0		\n\t"
-        "addl %4, %%eax			\n\t"
-        "movq (%1, %%eax), %%mm1	\n\t"
-        "movq (%2, %%eax), %%mm3	\n\t"
-        "movq 1(%1, %%eax), %%mm2	\n\t"
-        "movq 1(%2, %%eax), %%mm4	\n\t"
+        "add %4, %%"REG_a"		\n\t"
+        "movq (%1, %%"REG_a"), %%mm1	\n\t"
+        "movq (%2, %%"REG_a"), %%mm3	\n\t"
+        "movq 1(%1, %%"REG_a"), %%mm2	\n\t"
+        "movq 1(%2, %%"REG_a"), %%mm4	\n\t"
         "pavgb %%mm3, %%mm1		\n\t"
         "pavgb %%mm4, %%mm2		\n\t"
         "psubusb %%mm5, %%mm2		\n\t"
         "pavgb %%mm1, %%mm2		\n\t"
-        "movq (%3, %%eax), %%mm1	\n\t"
+        "movq (%3, %%"REG_a"), %%mm1	\n\t"
         "psadbw %%mm1, %%mm2		\n\t"
         "paddw %%mm2, %%mm0		\n\t"
         "paddw %%mm0, %%mm6		\n\t"
-        "addl %4, %%eax			\n\t"
+        "add %4, %%"REG_a"		\n\t"
         " js 1b				\n\t"
         : "+a" (len)
-        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
+        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
     );
 }
 
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
-    int len= -(stride*h);
+    long len= -(stride*h);
     asm volatile(
         ".balign 16			\n\t"
         "1:				\n\t"
-        "movq (%1, %%eax), %%mm0	\n\t"
-        "movq (%2, %%eax), %%mm1	\n\t"
-        "movq (%1, %%eax), %%mm2	\n\t"
-        "movq (%2, %%eax), %%mm3	\n\t"
+        "movq (%1, %%"REG_a"), %%mm0	\n\t"
+        "movq (%2, %%"REG_a"), %%mm1	\n\t"
+        "movq (%1, %%"REG_a"), %%mm2	\n\t"
+        "movq (%2, %%"REG_a"), %%mm3	\n\t"
         "punpcklbw %%mm7, %%mm0		\n\t"
         "punpcklbw %%mm7, %%mm1		\n\t"
         "punpckhbw %%mm7, %%mm2		\n\t"
         "punpckhbw %%mm7, %%mm3		\n\t"
         "paddw %%mm0, %%mm1		\n\t"
         "paddw %%mm2, %%mm3		\n\t"
-        "movq (%3, %%eax), %%mm4	\n\t"
-        "movq (%3, %%eax), %%mm2	\n\t"
+        "movq (%3, %%"REG_a"), %%mm4	\n\t"
+        "movq (%3, %%"REG_a"), %%mm2	\n\t"
         "paddw %%mm5, %%mm1		\n\t"
         "paddw %%mm5, %%mm3		\n\t"
         "psrlw $1, %%mm1		\n\t"
@@ -181,21 +182,21 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
         "punpckhbw %%mm7, %%mm1		\n\t"
         "paddw %%mm1, %%mm0		\n\t"
         "paddw %%mm0, %%mm6		\n\t"
-        "addl %4, %%eax			\n\t"
+        "add %4, %%"REG_a"		\n\t"
         " js 1b				\n\t"
         : "+a" (len)
-        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
+        : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
     );
 }
 
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-    int len= -(stride*h);
+    long len= -(stride*h);
     asm volatile(
         ".balign 16			\n\t"
         "1:				\n\t"
-        "movq (%1, %%eax), %%mm0	\n\t"
-        "movq (%2, %%eax), %%mm1	\n\t"
+        "movq (%1, %%"REG_a"), %%mm0	\n\t"
+        "movq (%2, %%"REG_a"), %%mm1	\n\t"
         "movq %%mm0, %%mm4		\n\t"
         "movq %%mm1, %%mm2		\n\t"
         "punpcklbw %%mm7, %%mm0		\n\t"
@@ -204,8 +205,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         "punpckhbw %%mm7, %%mm2		\n\t"
         "paddw %%mm1, %%mm0		\n\t"
         "paddw %%mm2, %%mm4		\n\t"
-        "movq 1(%1, %%eax), %%mm2	\n\t"
-        "movq 1(%2, %%eax), %%mm3	\n\t"
+        "movq 1(%1, %%"REG_a"), %%mm2	\n\t"
+        "movq 1(%2, %%"REG_a"), %%mm3	\n\t"
         "movq %%mm2, %%mm1		\n\t"
         "punpcklbw %%mm7, %%mm2		\n\t"
         "punpckhbw %%mm7, %%mm1		\n\t"
@@ -216,8 +217,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         "punpckhbw %%mm7, %%mm4		\n\t"
         "paddw %%mm3, %%mm2		\n\t"
         "paddw %%mm4, %%mm1		\n\t"
-        "movq (%3, %%eax), %%mm3	\n\t"
-        "movq (%3, %%eax), %%mm4	\n\t"
+        "movq (%3, %%"REG_a"), %%mm3	\n\t"
+        "movq (%3, %%"REG_a"), %%mm4	\n\t"
         "paddw %%mm5, %%mm2		\n\t"
         "paddw %%mm5, %%mm1		\n\t"
         "psrlw $2, %%mm2		\n\t"
@@ -231,10 +232,10 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         "punpckhbw %%mm7, %%mm2		\n\t"
         "paddw %%mm2, %%mm0		\n\t"
         "paddw %%mm0, %%mm6		\n\t"
-        "addl %4, %%eax			\n\t"
+        "add %4, %%"REG_a"		\n\t"
         " js 1b				\n\t"
         : "+a" (len)
-        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
+        : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
     );
 }
 
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index f19de73d6..70c81f675 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -23,6 +23,7 @@
 #include "../dsputil.h"
 #include "../mpegvideo.h"
 #include "../avcodec.h"
+#include "mmx.h"
 
 extern uint8_t zigzag_direct_noperm[64];
 extern uint16_t inv_zigzag_direct16[64];
@@ -34,7 +35,7 @@ static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x000
 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                   DCTELEM *block, int n, int qscale)
 {
-    int level, qmul, qadd, nCoeffs;
+    long level, qmul, qadd, nCoeffs;
 
     qmul = qscale << 1;
 
@@ -97,7 +98,7 @@ asm volatile(
 		"movq %%mm0, (%0, %3)		\n\t"
 		"movq %%mm1, 8(%0, %3)		\n\t"
 
-		"addl $16, %3			\n\t"
+		"add $16, %3			\n\t"
 		"jng 1b				\n\t"
 		::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
 		: "memory"
@@ -109,7 +110,7 @@ asm volatile(
 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
                                   DCTELEM *block, int n, int qscale)
 {
-    int qmul, qadd, nCoeffs;
+    long qmul, qadd, nCoeffs;
 
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
@@ -160,7 +161,7 @@ asm volatile(
 		"movq %%mm0, (%0, %3)		\n\t"
 		"movq %%mm1, 8(%0, %3)		\n\t"
 
-		"addl $16, %3			\n\t"
+		"add $16, %3			\n\t"
 		"jng 1b				\n\t"
 		::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
 		: "memory"
@@ -200,7 +201,7 @@ asm volatile(
 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
-    int nCoeffs;
+    long nCoeffs;
     const uint16_t *quant_matrix;
     int block0;
 
@@ -220,13 +221,13 @@ asm volatile(
 		"movd %2, %%mm6			\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
-                "movl %3, %%eax			\n\t"
+		"mov %3, %%"REG_a"		\n\t"
 		".balign 16\n\t"
 		"1:				\n\t"
-		"movq (%0, %%eax), %%mm0	\n\t"
-		"movq 8(%0, %%eax), %%mm1	\n\t"
-		"movq (%1, %%eax), %%mm4	\n\t"
-		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"movq (%0, %%"REG_a"), %%mm0	\n\t"
+		"movq 8(%0, %%"REG_a"), %%mm1	\n\t"
+		"movq (%1, %%"REG_a"), %%mm4	\n\t"
+		"movq 8(%1, %%"REG_a"), %%mm5	\n\t"
 		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
 		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
 		"pxor %%mm2, %%mm2		\n\t"
@@ -241,8 +242,8 @@ asm volatile(
 		"pmullw %%mm5, %%mm1		\n\t" // abs(block[i])*q
 		"pxor %%mm4, %%mm4		\n\t"
 		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
-		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
-		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw (%0, %%"REG_a"), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
 		"psraw $3, %%mm0		\n\t"
 		"psraw $3, %%mm1		\n\t"
 		"psubw %%mm7, %%mm0		\n\t"
@@ -255,13 +256,13 @@ asm volatile(
 		"psubw %%mm3, %%mm1		\n\t"
 		"pandn %%mm0, %%mm4		\n\t"
 		"pandn %%mm1, %%mm5		\n\t"
-		"movq %%mm4, (%0, %%eax)	\n\t"
-		"movq %%mm5, 8(%0, %%eax)	\n\t"
+		"movq %%mm4, (%0, %%"REG_a")	\n\t"
+		"movq %%mm5, 8(%0, %%"REG_a")	\n\t"
 
-		"addl $16, %%eax		\n\t"
+		"add $16, %%"REG_a"		\n\t"
 		"js 1b				\n\t"
 		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
-		: "%eax", "memory"
+		: "%"REG_a, "memory"
 	);    
     block[0]= block0;
 }
@@ -269,7 +270,7 @@ asm volatile(
 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
-    int nCoeffs;
+    long nCoeffs;
     const uint16_t *quant_matrix;
 
     assert(s->block_last_index[n]>=0);
@@ -283,13 +284,13 @@ asm volatile(
 		"movd %2, %%mm6			\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
-                "movl %3, %%eax			\n\t"
+		"mov %3, %%"REG_a"		\n\t"
 		".balign 16\n\t"
 		"1:				\n\t"
-		"movq (%0, %%eax), %%mm0	\n\t"
-		"movq 8(%0, %%eax), %%mm1	\n\t"
-		"movq (%1, %%eax), %%mm4	\n\t"
-		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"movq (%0, %%"REG_a"), %%mm0	\n\t"
+		"movq 8(%0, %%"REG_a"), %%mm1	\n\t"
+		"movq (%1, %%"REG_a"), %%mm4	\n\t"
+		"movq 8(%1, %%"REG_a"), %%mm5	\n\t"
 		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
 		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
 		"pxor %%mm2, %%mm2		\n\t"
@@ -308,8 +309,8 @@ asm volatile(
 		"pmullw %%mm5, %%mm1		\n\t" // (abs(block[i])*2 + 1)*q
 		"pxor %%mm4, %%mm4		\n\t"
 		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
-		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
-		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw (%0, %%"REG_a"), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
 		"psraw $4, %%mm0		\n\t"
 		"psraw $4, %%mm1		\n\t"
 		"psubw %%mm7, %%mm0		\n\t"
@@ -322,20 +323,20 @@ asm volatile(
 		"psubw %%mm3, %%mm1		\n\t"
 		"pandn %%mm0, %%mm4		\n\t"
 		"pandn %%mm1, %%mm5		\n\t"
-		"movq %%mm4, (%0, %%eax)	\n\t"
-		"movq %%mm5, 8(%0, %%eax)	\n\t"
+		"movq %%mm4, (%0, %%"REG_a")	\n\t"
+		"movq %%mm5, 8(%0, %%"REG_a")	\n\t"
 
-		"addl $16, %%eax		\n\t"
+		"add $16, %%"REG_a"		\n\t"
 		"js 1b				\n\t"
 		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
-		: "%eax", "memory"
+		: "%"REG_a, "memory"
 	);
 }
 
 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
-    int nCoeffs;
+    long nCoeffs;
     const uint16_t *quant_matrix;
     int block0;
     
@@ -355,13 +356,13 @@ asm volatile(
 		"movd %2, %%mm6			\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
-                "movl %3, %%eax			\n\t"
+		"mov %3, %%"REG_a"		\n\t"
 		".balign 16\n\t"
 		"1:				\n\t"
-		"movq (%0, %%eax), %%mm0	\n\t"
-		"movq 8(%0, %%eax), %%mm1	\n\t"
-		"movq (%1, %%eax), %%mm4	\n\t"
-		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"movq (%0, %%"REG_a"), %%mm0	\n\t"
+		"movq 8(%0, %%"REG_a"), %%mm1	\n\t"
+		"movq (%1, %%"REG_a"), %%mm4	\n\t"
+		"movq 8(%1, %%"REG_a"), %%mm5	\n\t"
 		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
 		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
 		"pxor %%mm2, %%mm2		\n\t"
@@ -376,8 +377,8 @@ asm volatile(
 		"pmullw %%mm5, %%mm1		\n\t" // abs(block[i])*q
 		"pxor %%mm4, %%mm4		\n\t"
 		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
-		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
-		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw (%0, %%"REG_a"), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
 		"psraw $3, %%mm0		\n\t"
 		"psraw $3, %%mm1		\n\t"
 		"pxor %%mm2, %%mm0		\n\t"
@@ -386,13 +387,13 @@ asm volatile(
 		"psubw %%mm3, %%mm1		\n\t"
 		"pandn %%mm0, %%mm4		\n\t"
 		"pandn %%mm1, %%mm5		\n\t"
-		"movq %%mm4, (%0, %%eax)	\n\t"
-		"movq %%mm5, 8(%0, %%eax)	\n\t"
+		"movq %%mm4, (%0, %%"REG_a")	\n\t"
+		"movq %%mm5, 8(%0, %%"REG_a")	\n\t"
 
-		"addl $16, %%eax		\n\t"
+		"add $16, %%"REG_a"		\n\t"
 		"jng 1b				\n\t"
 		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
-		: "%eax", "memory"
+		: "%"REG_a, "memory"
 	);    
     block[0]= block0;
         //Note, we dont do mismatch control for intra as errors cannot accumulate
@@ -401,7 +402,7 @@ asm volatile(
 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
-    int nCoeffs;
+    long nCoeffs;
     const uint16_t *quant_matrix;
     
     assert(s->block_last_index[n]>=0);
@@ -416,13 +417,13 @@ asm volatile(
 		"movd %2, %%mm6			\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
 		"packssdw %%mm6, %%mm6		\n\t"
-                "movl %3, %%eax			\n\t"
+		"mov %3, %%"REG_a"		\n\t"
 		".balign 16\n\t"
 		"1:				\n\t"
-		"movq (%0, %%eax), %%mm0	\n\t"
-		"movq 8(%0, %%eax), %%mm1	\n\t"
-		"movq (%1, %%eax), %%mm4	\n\t"
-		"movq 8(%1, %%eax), %%mm5	\n\t"
+		"movq (%0, %%"REG_a"), %%mm0	\n\t"
+		"movq 8(%0, %%"REG_a"), %%mm1	\n\t"
+		"movq (%1, %%"REG_a"), %%mm4	\n\t"
+		"movq 8(%1, %%"REG_a"), %%mm5	\n\t"
 		"pmullw %%mm6, %%mm4		\n\t" // q=qscale*quant_matrix[i]
 		"pmullw %%mm6, %%mm5		\n\t" // q=qscale*quant_matrix[i]
 		"pxor %%mm2, %%mm2		\n\t"
@@ -441,8 +442,8 @@ asm volatile(
 		"paddw %%mm5, %%mm1		\n\t" // (abs(block[i])*2 + 1)*q
 		"pxor %%mm4, %%mm4		\n\t"
 		"pxor %%mm5, %%mm5		\n\t" // FIXME slow
-		"pcmpeqw (%0, %%eax), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
-		"pcmpeqw 8(%0, %%eax), %%mm5	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw (%0, %%"REG_a"), %%mm4	\n\t" // block[i] == 0 ? -1 : 0
+		"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
 		"psrlw $4, %%mm0		\n\t"
 		"psrlw $4, %%mm1		\n\t"
 		"pxor %%mm2, %%mm0		\n\t"
@@ -453,10 +454,10 @@ asm volatile(
 		"pandn %%mm1, %%mm5		\n\t"
                 "pxor %%mm4, %%mm7		\n\t"
                 "pxor %%mm5, %%mm7		\n\t"
-		"movq %%mm4, (%0, %%eax)	\n\t"
-		"movq %%mm5, 8(%0, %%eax)	\n\t"
+		"movq %%mm4, (%0, %%"REG_a")	\n\t"
+		"movq %%mm5, 8(%0, %%"REG_a")	\n\t"
 
-		"addl $16, %%eax		\n\t"
+		"add $16, %%"REG_a"		\n\t"
 		"jng 1b				\n\t"
                 "movd 124(%0, %3), %%mm0	\n\t"
                 "movq %%mm7, %%mm6		\n\t"
@@ -471,7 +472,7 @@ asm volatile(
                 "movd %%mm0, 124(%0, %3)	\n\t"
                 
 		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
-		: "%eax", "memory"
+		: "%"REG_a, "memory"
 	);
 }
 
@@ -499,11 +500,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
 		"punpckhwd %%mm1, %%mm1		\n\t"
 		"punpckhdq %%mm1, %%mm1		\n\t"
 		"movq %%mm1, (%0, %2)		\n\t"
-		"addl %1, %0			\n\t"
-		"cmpl %3, %0			\n\t"
+		"add %1, %0			\n\t"
+		"cmp %3, %0			\n\t"
 		" jb 1b				\n\t"
 		: "+r" (ptr)
-		: "r" (wrap), "r" (width), "r" (ptr + wrap*height)
+		: "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
 	);
     }
     else
@@ -522,11 +523,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
 		"punpckhdq %%mm1, %%mm1		\n\t"
 		"movq %%mm1, (%0, %2)		\n\t"
 		"movq %%mm1, 8(%0, %2)		\n\t"
-		"addl %1, %0			\n\t"
-		"cmpl %3, %0			\n\t"
+		"add %1, %0			\n\t"
+		"cmp %3, %0			\n\t"
 		" jb 1b				\n\t"		
 		: "+r" (ptr)
-		: "r" (wrap), "r" (width), "r" (ptr + wrap*height)
+		: "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
 	);
     }
     
@@ -540,11 +541,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
 		"movq %%mm0, (%0, %2)		\n\t"
 		"movq %%mm0, (%0, %2, 2)	\n\t"
 		"movq %%mm0, (%0, %3)		\n\t"
-		"addl $8, %0			\n\t"
-		"cmpl %4, %0			\n\t"
+		"add $8, %0			\n\t"
+		"cmp %4, %0			\n\t"
 		" jb 1b				\n\t"
 		: "+r" (ptr)
-		: "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w)
+		: "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
 	);
 	ptr= last_line + (i + 1) * wrap - w;
 	asm volatile(
@@ -554,11 +555,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
 		"movq %%mm0, (%0, %2)		\n\t"
 		"movq %%mm0, (%0, %2, 2)	\n\t"
 		"movq %%mm0, (%0, %3)		\n\t"
-		"addl $8, %0			\n\t"
-		"cmpl %4, %0			\n\t"
+		"add $8, %0			\n\t"
+		"cmp %4, %0			\n\t"
 		" jb 1b				\n\t"
 		: "+r" (ptr)
-		: "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w)
+		: "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
 	);
     }
 }
@@ -607,10 +608,10 @@ static void  denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
         "movq %%mm2, 8(%1)		\n\t"
         "movq %%mm5, 16(%1)		\n\t"
         "movq %%mm3, 24(%1)		\n\t"
-        "addl $16, %0			\n\t"
-        "addl $32, %1			\n\t"
-        "addl $16, %2			\n\t"
-        "cmpl %3, %0			\n\t"
+        "add $16, %0			\n\t"
+        "add $32, %1			\n\t"
+        "add $16, %2			\n\t"
+        "cmp %3, %0			\n\t"
             " jb 1b			\n\t"
         : "+r" (block), "+r" (sum), "+r" (offset)
         : "r"(block+64)
@@ -661,10 +662,10 @@ static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
         "movdqa %%xmm6, 16(%1)		\n\t"
         "movdqa %%xmm5, 32(%1)		\n\t"
         "movdqa %%xmm0, 48(%1)		\n\t"
-        "addl $32, %0			\n\t"
-        "addl $64, %1			\n\t"
-        "addl $32, %2			\n\t"
-        "cmpl %3, %0			\n\t"
+        "add $32, %0			\n\t"
+        "add $64, %1			\n\t"
+        "add $32, %2			\n\t"
+        "cmp %3, %0			\n\t"
             " jb 1b			\n\t"
         : "+r" (block), "+r" (sum), "+r" (offset)
         : "r"(block+64)
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index d4ed61ecb..c9354dc1b 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -36,7 +36,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
                             DCTELEM *block, int n,
                             int qscale, int *overflow)
 {
-    int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ...
+    long last_non_zero_p1;
+    int level=0, q; //=0 is cuz gcc says uninitalized ...
     const uint16_t *qmat, *bias;
     __align8 int16_t temp_block[64];
     
@@ -58,7 +59,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         if (!s->h263_aic) {
 #if 1
         asm volatile (
-        	"imul %%ecx		\n\t"
+        	"mul %%ecx		\n\t"
         	: "=d" (level), "=a"(dummy)
         	: "a" ((block[0]>>2) + q), "c" (inverse[q<<1])
         );
@@ -87,21 +88,21 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         qmat = s->q_inter_matrix16[qscale][0];
     }
 
-    if(s->out_format == FMT_H263 && s->mpeg_quant==0){
+    if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
     
         asm volatile(
-            "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
+            "movd %%"REG_a", %%mm3		\n\t" // last_non_zero_p1
             SPREADW(%%mm3)
             "pxor %%mm7, %%mm7			\n\t" // 0
             "pxor %%mm4, %%mm4			\n\t" // 0
             "movq (%2), %%mm5			\n\t" // qmat[0]
             "pxor %%mm6, %%mm6			\n\t"
             "psubw (%3), %%mm6			\n\t" // -bias[0]
-            "movl $-128, %%eax			\n\t"
+            "mov $-128, %%"REG_a"		\n\t"
             ".balign 16				\n\t"
             "1:					\n\t"
             "pxor %%mm1, %%mm1			\n\t" // 0
-            "movq (%1, %%eax), %%mm0		\n\t" // block[i]
+            "movq (%1, %%"REG_a"), %%mm0	\n\t" // block[i]
             "pcmpgtw %%mm0, %%mm1		\n\t" // block[i] <= 0 ? 0xFF : 0x00
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
@@ -110,13 +111,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             "por %%mm0, %%mm4			\n\t" 
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movq %%mm0, (%5, %%eax)		\n\t"
+            "movq %%mm0, (%5, %%"REG_a")	\n\t"
             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
-            "movq (%4, %%eax), %%mm1		\n\t" 
-            "movq %%mm7, (%1, %%eax)		\n\t" // 0
+            "movq (%4, %%"REG_a"), %%mm1	\n\t" 
+            "movq %%mm7, (%1, %%"REG_a")	\n\t" // 0
             "pandn %%mm1, %%mm0			\n\t"
 	    PMAXW(%%mm0, %%mm3)
-            "addl $8, %%eax			\n\t"
+            "add $8, %%"REG_a"			\n\t"
             " js 1b				\n\t"
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $32, %%mm3			\n\t"
@@ -124,8 +125,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $16, %%mm3			\n\t"
 	    PMAXW(%%mm0, %%mm3)
-            "movd %%mm3, %%eax			\n\t"
-            "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
+            "movd %%mm3, %%"REG_a"		\n\t"
+            "movzb %%al, %%"REG_a"		\n\t" // last_non_zero_p1
 	    : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
@@ -142,32 +143,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         );
     }else{ // FMT_H263
         asm volatile(
-            "movd %%eax, %%mm3			\n\t" // last_non_zero_p1
+            "movd %%"REG_a", %%mm3		\n\t" // last_non_zero_p1
             SPREADW(%%mm3)
             "pxor %%mm7, %%mm7			\n\t" // 0
             "pxor %%mm4, %%mm4			\n\t" // 0
-            "movl $-128, %%eax			\n\t"
+            "mov $-128, %%"REG_a"		\n\t"
             ".balign 16				\n\t"
             "1:					\n\t"
             "pxor %%mm1, %%mm1			\n\t" // 0
-            "movq (%1, %%eax), %%mm0		\n\t" // block[i]
+            "movq (%1, %%"REG_a"), %%mm0	\n\t" // block[i]
             "pcmpgtw %%mm0, %%mm1		\n\t" // block[i] <= 0 ? 0xFF : 0x00
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
-            "movq (%3, %%eax), %%mm6		\n\t" // bias[0]
+            "movq (%3, %%"REG_a"), %%mm6	\n\t" // bias[0]
             "paddusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[0]
-            "movq (%2, %%eax), %%mm5		\n\t" // qmat[i]
+            "movq (%2, %%"REG_a"), %%mm5		\n\t" // qmat[i]
             "pmulhw %%mm5, %%mm0		\n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
             "por %%mm0, %%mm4			\n\t" 
             "pxor %%mm1, %%mm0			\n\t" 
             "psubw %%mm1, %%mm0			\n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movq %%mm0, (%5, %%eax)		\n\t"
+            "movq %%mm0, (%5, %%"REG_a")	\n\t"
             "pcmpeqw %%mm7, %%mm0		\n\t" // out==0 ? 0xFF : 0x00
-            "movq (%4, %%eax), %%mm1		\n\t" 
-            "movq %%mm7, (%1, %%eax)		\n\t" // 0
+            "movq (%4, %%"REG_a"), %%mm1		\n\t" 
+            "movq %%mm7, (%1, %%"REG_a")		\n\t" // 0
             "pandn %%mm1, %%mm0			\n\t"
 	    PMAXW(%%mm0, %%mm3)
-            "addl $8, %%eax			\n\t"
+            "add $8, %%"REG_a"			\n\t"
             " js 1b				\n\t"
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $32, %%mm3			\n\t"
@@ -175,8 +176,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             "movq %%mm3, %%mm0			\n\t"
             "psrlq $16, %%mm3			\n\t"
 	    PMAXW(%%mm0, %%mm3)
-            "movd %%mm3, %%eax			\n\t"
-            "movzbl %%al, %%eax			\n\t" // last_non_zero_p1
+            "movd %%mm3, %%"REG_a"		\n\t"
+            "movzb %%al, %%"REG_a"		\n\t" // last_non_zero_p1
 	    : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
diff --git a/src/libffmpeg/libavcodec/idcinvideo.c b/src/libffmpeg/libavcodec/idcinvideo.c
index e53246bd1..f5df5a49d 100644
--- a/src/libffmpeg/libavcodec/idcinvideo.c
+++ b/src/libffmpeg/libavcodec/idcinvideo.c
@@ -192,7 +192,7 @@ static void idcin_decode_vlcs(IdcinContext *s)
 
             while(node_num >= HUF_TOKENS) {
                 if(!bit_pos) {
-                    if(dat_pos > s->size) {
+                    if(dat_pos >= s->size) {
                         av_log(s->avctx, AV_LOG_ERROR, "Huffman decode error.\n");
                         return;
                     }
diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c
index b351d2219..1ba723a95 100644
--- a/src/libffmpeg/libavcodec/imgconvert.c
+++ b/src/libffmpeg/libavcodec/imgconvert.c
@@ -97,6 +97,14 @@ static PixFmtInfo pix_fmt_info[PIX_FMT_NB] = {
         .depth = 8,
         .x_chroma_shift = 1, .y_chroma_shift = 0,
     },
+    [PIX_FMT_UYVY422] = {
+        .name = "uyvy422",
+        .nb_channels = 1,
+        .color_type = FF_COLOR_YUV,
+        .pixel_type = FF_PIXEL_PACKED,
+        .depth = 8,
+        .x_chroma_shift = 1, .y_chroma_shift = 0,
+    },
     [PIX_FMT_YUV410P] = {
         .name = "yuv410p",
         .nb_channels = 3,
@@ -213,6 +221,20 @@ static PixFmtInfo pix_fmt_info[PIX_FMT_NB] = {
         .pixel_type = FF_PIXEL_PALETTE,
         .depth = 8,
     },
+    [PIX_FMT_XVMC_MPEG2_MC] = {
+        .name = "xvmcmc",
+    },
+    [PIX_FMT_XVMC_MPEG2_IDCT] = {
+        .name = "xvmcidct",
+    },
+    [PIX_FMT_UYVY411] = {
+        .name = "uyvy411",
+        .nb_channels = 1,
+        .color_type = FF_COLOR_YUV,
+        .pixel_type = FF_PIXEL_PACKED,
+        .depth = 8,
+        .x_chroma_shift = 2, .y_chroma_shift = 0,
+    },
 };
 
 void avcodec_get_chroma_sub_sample(int pix_fmt, int *h_shift, int *v_shift)
@@ -246,6 +268,9 @@ int avpicture_fill(AVPicture *picture, uint8_t *ptr,
     int size, w2, h2, size2;
     PixFmtInfo *pinfo;
     
+    if(avcodec_check_dimensions(NULL, width, height))
+        goto fail;
+
     pinfo = &pix_fmt_info[pix_fmt];
     size = width * height;
     switch(pix_fmt) {
@@ -288,6 +313,18 @@ int avpicture_fill(AVPicture *picture, uint8_t *ptr,
         picture->data[2] = NULL;
         picture->linesize[0] = width * 2;
         return size * 2;
+    case PIX_FMT_UYVY422:
+        picture->data[0] = ptr;
+        picture->data[1] = NULL;
+        picture->data[2] = NULL;
+        picture->linesize[0] = width * 2;
+        return size * 2;
+    case PIX_FMT_UYVY411:
+        picture->data[0] = ptr;
+        picture->data[1] = NULL;
+        picture->data[2] = NULL;
+        picture->linesize[0] = width + width/2;
+        return size + size/2;
     case PIX_FMT_GRAY8:
         picture->data[0] = ptr;
         picture->data[1] = NULL;
@@ -310,6 +347,7 @@ int avpicture_fill(AVPicture *picture, uint8_t *ptr,
         picture->linesize[1] = 4;
         return size2 + 256 * 4;
     default:
+fail:
         picture->data[0] = NULL;
         picture->data[1] = NULL;
         picture->data[2] = NULL;
@@ -326,13 +364,17 @@ int avpicture_layout(const AVPicture* src, int pix_fmt, int width, int height,
     const unsigned char* s; 
     int size = avpicture_get_size(pix_fmt, width, height);
 
-    if (size > dest_size)
+    if (size > dest_size || size < 0)
         return -1;
 
     if (pf->pixel_type == FF_PIXEL_PACKED || pf->pixel_type == FF_PIXEL_PALETTE) {
-        if (pix_fmt == PIX_FMT_YUV422 || pix_fmt == PIX_FMT_RGB565 ||
-	    pix_fmt == PIX_FMT_RGB555)
-	  w = width * 2;
+        if (pix_fmt == PIX_FMT_YUV422 || 
+            pix_fmt == PIX_FMT_UYVY422 || 
+            pix_fmt == PIX_FMT_RGB565 ||
+            pix_fmt == PIX_FMT_RGB555)
+            w = width * 2;
+	else if (pix_fmt == PIX_FMT_UYVY411)
+	  w = width + width/2;
 	else if (pix_fmt == PIX_FMT_PAL8)
 	  w = width;
 	else
@@ -342,7 +384,7 @@ int avpicture_layout(const AVPicture* src, int pix_fmt, int width, int height,
 	h = height;
     } else {
         data_planes = pf->nb_channels;
-	w = width;
+	w = (width*pf->depth + 7)/8;
 	h = height;
     }
     
@@ -439,10 +481,14 @@ static int avg_bits_per_pixel(int pix_fmt)
     case FF_PIXEL_PACKED:
         switch(pix_fmt) {
         case PIX_FMT_YUV422:
+        case PIX_FMT_UYVY422:
         case PIX_FMT_RGB565:
         case PIX_FMT_RGB555:
             bits = 16;
             break;
+	case PIX_FMT_UYVY411:
+	    bits = 12;
+	    break;
         default:
             bits = pf->depth * pf->nb_channels;
             break;
@@ -551,10 +597,14 @@ void img_copy(AVPicture *dst, const AVPicture *src,
     case FF_PIXEL_PACKED:
         switch(pix_fmt) {
         case PIX_FMT_YUV422:
+        case PIX_FMT_UYVY422:
         case PIX_FMT_RGB565:
         case PIX_FMT_RGB555:
             bits = 16;
             break;
+	case PIX_FMT_UYVY411:
+	    bits = 12;
+	    break;
         default:
             bits = pf->depth * pf->nb_channels;
             break;
@@ -649,6 +699,98 @@ static void yuv422_to_yuv420p(AVPicture *dst, const AVPicture *src,
     }
 }
 
+static void uyvy422_to_yuv420p(AVPicture *dst, const AVPicture *src,
+                              int width, int height)
+{ /* Unpack src->data[0] (bytes read as cb, luma, cr, luma per pixel pair) into dst->data[0]=luma, [1]=cb, [2]=cr; chroma is kept from every other row only. */
+    const uint8_t *p, *p1; /* p1: start of the current source row, p: read cursor inside it */
+    uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1; /* *1 names: start of the current output row per plane; others: write cursors */
+    int w;
+ 
+    p1 = src->data[0];
+    
+    lum1 = dst->data[0];
+    cb1 = dst->data[1];
+    cr1 = dst->data[2];
+
+    for(;height >= 1; height -= 2) { /* source rows are consumed two at a time */
+        p = p1;
+        lum = lum1;
+        cb = cb1;
+        cr = cr1;
+        for(w = width; w >= 2; w -= 2) { /* first row of the pair: copy luma and chroma */
+            lum[0] = p[1];
+            cb[0] = p[0];
+            lum[1] = p[3];
+            cr[0] = p[2];
+            p += 4; /* four source bytes carry two pixels */
+            lum += 2;
+            cb++;
+            cr++;
+        }
+        if (w) { /* odd width: one pixel left; takes its luma from p[1] and chroma from p[0]/p[2] */
+            lum[0] = p[1];
+            cb[0] = p[0];
+            cr[0] = p[2];
+            cb++;
+            cr++;
+        }
+        p1 += src->linesize[0];
+        lum1 += dst->linesize[0];
+        if (height>1) { /* second row of the pair (absent when height is odd on the last pass): luma only */
+            p = p1;
+            lum = lum1;
+            for(w = width; w >= 2; w -= 2) { /* chroma bytes p[0]/p[2] of this row are skipped, not averaged */
+                lum[0] = p[1];
+                lum[1] = p[3];
+                p += 4;
+                lum += 2;
+            }
+            if (w) { /* odd width: trailing luma sample */
+                lum[0] = p[1];
+            }
+            p1 += src->linesize[0];
+            lum1 += dst->linesize[0];
+        }
+        cb1 += dst->linesize[1]; /* chroma planes advance one row per pair of source rows */
+        cr1 += dst->linesize[2];
+    }
+}
+
+/* Unpack src->data[0] (bytes read as cb, luma, cr, luma per pixel pair) into dst->data[0]=luma, [1]=cb, [2]=cr, one chroma row per source row. */
+static void uyvy422_to_yuv422p(AVPicture *dst, const AVPicture *src,
+                              int width, int height)
+{
+    const uint8_t *p, *p1; /* p1: start of the current source row, p: read cursor inside it */
+    uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1; /* *1 names: start of the current output row per plane; others: write cursors */
+    int w;
+
+    p1 = src->data[0];
+    lum1 = dst->data[0];
+    cb1 = dst->data[1];
+    cr1 = dst->data[2];
+    for(;height > 0; height--) { /* every source row yields one luma row and one row in each chroma plane */
+        p = p1;
+        lum = lum1;
+        cb = cb1;
+        cr = cr1;
+        for(w = width; w >= 2; w -= 2) { /* NOTE(review): no tail case after this loop; with an odd width the last column is never written */
+            lum[0] = p[1];
+            cb[0] = p[0];
+            lum[1] = p[3];
+            cr[0] = p[2];
+            p += 4; /* four source bytes carry two pixels */
+            lum += 2;
+            cb++;
+            cr++;
+        }
+        p1 += src->linesize[0];
+        lum1 += dst->linesize[0];
+        cb1 += dst->linesize[1];
+        cr1 += dst->linesize[2];
+    }
+}
+
+
 static void yuv422_to_yuv422p(AVPicture *dst, const AVPicture *src,
                               int width, int height)
 {
@@ -715,6 +857,141 @@ static void yuv422p_to_yuv422(AVPicture *dst, const AVPicture *src,
     }
 }
 
+static void yuv422p_to_uyvy422(AVPicture *dst, const AVPicture *src,
+                              int width, int height)
+{ /* Pack src->data[0]=luma, [1]=cb, [2]=cr into dst->data[0], written as cb, luma, cr, luma per pixel pair. */
+    uint8_t *p, *p1; /* p1: start of the current output row, p: write cursor inside it */
+    const uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1; /* *1 names: start of the current source row per plane; others: read cursors */
+    int w;
+
+    p1 = dst->data[0];
+    lum1 = src->data[0];
+    cb1 = src->data[1];
+    cr1 = src->data[2];
+    for(;height > 0; height--) { /* one output row per source row; all three source planes advance every row */
+        p = p1;
+        lum = lum1;
+        cb = cb1;
+        cr = cr1;
+        for(w = width; w >= 2; w -= 2) { /* NOTE(review): no tail case after this loop; with an odd width the last column is never written */
+            p[1] = lum[0];
+            p[0] = cb[0];
+            p[3] = lum[1];
+            p[2] = cr[0];
+            p += 4; /* four output bytes carry two pixels */
+            lum += 2;
+            cb++;
+            cr++;
+        }
+        p1 += dst->linesize[0];
+        lum1 += src->linesize[0];
+        cb1 += src->linesize[1];
+        cr1 += src->linesize[2];
+    }
+}
+
+static void uyvy411_to_yuv411p(AVPicture *dst, const AVPicture *src,
+                              int width, int height)
+{ /* Unpack src->data[0] (six bytes per four pixels: cb, luma, luma, cr, luma, luma) into dst->data[0]=luma, [1]=cb, [2]=cr. */
+    const uint8_t *p, *p1; /* p1: start of the current source row, p: read cursor inside it */
+    uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1; /* *1 names: start of the current output row per plane; others: write cursors */
+    int w;
+
+    p1 = src->data[0];
+    lum1 = dst->data[0];
+    cb1 = dst->data[1];
+    cr1 = dst->data[2];
+    for(;height > 0; height--) { /* every source row yields one luma row and one row in each chroma plane */
+        p = p1;
+        lum = lum1;
+        cb = cb1;
+        cr = cr1;
+        for(w = width; w >= 4; w -= 4) { /* NOTE(review): no tail case after this loop; up to three trailing columns are never written when width % 4 != 0 */
+            cb[0] = p[0];
+	    lum[0] = p[1];
+            lum[1] = p[2];
+            cr[0] = p[3];
+	    lum[2] = p[4];
+	    lum[3] = p[5];
+            p += 6; /* six source bytes carry four pixels */
+            lum += 4;
+            cb++;
+            cr++;
+        }
+        p1 += src->linesize[0];
+        lum1 += dst->linesize[0];
+        cb1 += dst->linesize[1];
+        cr1 += dst->linesize[2];
+    }
+}
+
+/* Pack src->data[0]=luma, [1]=cb, [2]=cr into dst->data[0] as luma, cb, luma, cr per pixel pair; each chroma row is reused for two output rows. */
+static void yuv420p_to_yuv422(AVPicture *dst, const AVPicture *src,
+                              int width, int height)
+{
+    int w, h;
+    uint8_t *line1, *line2, *linesrc = dst->data[0]; /* despite its name, linesrc walks the destination buffer */
+    uint8_t *lum1, *lum2, *lumsrc = src->data[0];
+    uint8_t *cb1, *cb2 = src->data[1]; /* cb2/cr2: start of the current chroma row; cb1/cr1: read cursors */
+    uint8_t *cr1, *cr2 = src->data[2];
+    
+    for(h = height / 2; h--;) { /* two output rows per pass; a trailing odd row is not processed. NOTE(review): the h--/w-- countdowns rely on width and height being >= 0 */
+        line1 = linesrc;
+        line2 = linesrc + dst->linesize[0];
+        
+        lum1 = lumsrc;
+        lum2 = lumsrc + src->linesize[0];
+        
+        cb1 = cb2;
+        cr1 = cr2;
+        
+        for(w = width / 2; w--;) { /* two pixels per pass; a trailing odd column is not processed */
+                *line1++ = *lum1++; *line2++ = *lum2++;                     /* luma of the upper and lower row */
+                *line1++ =          *line2++ = *cb1++;                      /* the same cb sample is stored in both rows */
+                *line1++ = *lum1++; *line2++ = *lum2++;                     /* second luma of each row */
+                *line1++ =          *line2++ = *cr1++; /* the same cr sample is stored in both rows */
+        }
+        
+        linesrc += dst->linesize[0] * 2;
+        lumsrc += src->linesize[0] * 2;
+        cb2 += src->linesize[1]; /* chroma advances one row per two luma rows */
+        cr2 += src->linesize[2];
+    }
+}
+
+static void yuv420p_to_uyvy422(AVPicture *dst, const AVPicture *src,
+                              int width, int height)
+{ /* Pack src->data[0]=luma, [1]=cb, [2]=cr into dst->data[0] as cb, luma, cr, luma per pixel pair; each chroma row is reused for two output rows. */
+    int w, h;
+    uint8_t *line1, *line2, *linesrc = dst->data[0]; /* despite its name, linesrc walks the destination buffer */
+    uint8_t *lum1, *lum2, *lumsrc = src->data[0];
+    uint8_t *cb1, *cb2 = src->data[1]; /* cb2/cr2: start of the current chroma row; cb1/cr1: read cursors */
+    uint8_t *cr1, *cr2 = src->data[2];
+    
+    for(h = height / 2; h--;) { /* two output rows per pass; a trailing odd row is not processed. NOTE(review): the h--/w-- countdowns rely on width and height being >= 0 */
+        line1 = linesrc;
+        line2 = linesrc + dst->linesize[0];
+        
+        lum1 = lumsrc;
+        lum2 = lumsrc + src->linesize[0];
+        
+        cb1 = cb2;
+        cr1 = cr2;
+        
+        for(w = width / 2; w--;) { /* two pixels per pass; a trailing odd column is not processed */
+                *line1++ =          *line2++ = *cb1++;                      /* the same cb sample is stored in both rows */
+                *line1++ = *lum1++; *line2++ = *lum2++;                     /* luma of the upper and lower row */
+                *line1++ =          *line2++ = *cr1++; /* the same cr sample is stored in both rows */
+                *line1++ = *lum1++; *line2++ = *lum2++;                     /* second luma of each row */
+        }
+        
+        linesrc += dst->linesize[0] * 2;
+        lumsrc += src->linesize[0] * 2;
+        cb2 += src->linesize[1]; /* chroma advances one row per two luma rows */
+        cr2 += src->linesize[2];
+    }
+}
+
 #define SCALEBITS 10
 #define ONE_HALF  (1 << (SCALEBITS - 1))
 #define FIX(x)	  ((int) ((x) * (1<<SCALEBITS) + 0.5))
@@ -1424,6 +1701,9 @@ typedef struct ConvertEntry {
 */
 static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
     [PIX_FMT_YUV420P] = {
+        [PIX_FMT_YUV422] = {
+            .convert = yuv420p_to_yuv422,
+        },
         [PIX_FMT_RGB555] = { 
             .convert = yuv420p_to_rgb555
         },
@@ -1439,11 +1719,17 @@ static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
         [PIX_FMT_RGBA32] = { 
             .convert = yuv420p_to_rgba32
         },
+	[PIX_FMT_UYVY422] = { 
+            .convert = yuv420p_to_uyvy422,
+        },
     },
     [PIX_FMT_YUV422P] = { 
         [PIX_FMT_YUV422] = { 
             .convert = yuv422p_to_yuv422,
         },
+        [PIX_FMT_UYVY422] = { 
+            .convert = yuv422p_to_uyvy422,
+        },
     },
     [PIX_FMT_YUV444P] = { 
         [PIX_FMT_RGB24] = { 
@@ -1480,7 +1766,14 @@ static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
             .convert = yuv422_to_yuv422p,
         },
     },
-
+    [PIX_FMT_UYVY422] = { 
+        [PIX_FMT_YUV420P] = { 
+            .convert = uyvy422_to_yuv420p,
+        },
+        [PIX_FMT_YUV422P] = { 
+            .convert = uyvy422_to_yuv422p,
+        },
+    },
     [PIX_FMT_RGB24] = {
         [PIX_FMT_YUV420P] = { 
             .convert = rgb24_to_yuv420p
@@ -1616,6 +1909,12 @@ static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
             .convert = pal8_to_rgba32
         },
     },
+    [PIX_FMT_UYVY411] = { 
+        [PIX_FMT_YUV411P] = { 
+            .convert = uyvy411_to_yuv411p,
+        },
+    },
+
 };
 
 int avpicture_alloc(AVPicture *picture,
@@ -1625,6 +1924,8 @@ int avpicture_alloc(AVPicture *picture,
     void *ptr;
 
     size = avpicture_get_size(pix_fmt, width, height);
+    if(size<0)
+        goto fail;
     ptr = av_malloc(size);
     if (!ptr)
         goto fail;
@@ -1683,7 +1984,7 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
 
     ce = &convert_table[src_pix_fmt][dst_pix_fmt];
     if (ce->convert) {
-        /* specific convertion routine */
+        /* specific conversion routine */
         ce->convert(dst, src, dst_width, dst_height);
         return 0;
     }
@@ -1838,6 +2139,14 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
         dst_pix_fmt == PIX_FMT_YUV422) {
         /* specific case: convert to YUV422P first */
         int_pix_fmt = PIX_FMT_YUV422P;
+    } else if (src_pix_fmt == PIX_FMT_UYVY422 ||
+        dst_pix_fmt == PIX_FMT_UYVY422) {
+        /* specific case: convert to YUV422P first */
+        int_pix_fmt = PIX_FMT_YUV422P;
+    } else if (src_pix_fmt == PIX_FMT_UYVY411 ||
+        dst_pix_fmt == PIX_FMT_UYVY411) {
+        /* specific case: convert to YUV411P first */
+        int_pix_fmt = PIX_FMT_YUV411P;
     } else if ((src_pix->color_type == FF_COLOR_GRAY &&
                 src_pix_fmt != PIX_FMT_GRAY8) || 
                (dst_pix->color_type == FF_COLOR_GRAY &&
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index 14fdb1059..2c7e1120a 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -55,6 +55,8 @@ struct ImgReSampleContext {
     uint8_t *line_buf;
 };
 
+void av_build_filter(int16_t *filter, double factor, int tap_count, int phase_count, int scale, int type);
+
 static inline int get_phase(int pos)
 {
     return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
@@ -540,39 +542,6 @@ static void component_resample(ImgReSampleContext *s,
     }
 }
 
-/* XXX: the following filter is quite naive, but it seems to suffice
-   for 4 taps */
-static void build_filter(int16_t *filter, float factor)
-{
-    int ph, i, v;
-    float x, y, tab[NB_TAPS], norm, mult;
-
-    /* if upsampling, only need to interpolate, no filter */
-    if (factor > 1.0)
-        factor = 1.0;
-
-    for(ph=0;ph<NB_PHASES;ph++) {
-        norm = 0;
-        for(i=0;i<NB_TAPS;i++) {
-            
-            x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
-            if (x == 0)
-                y = 1.0;
-            else
-                y = sin(x) / x;
-            tab[i] = y;
-            norm += y;
-        }
-
-        /* normalize so that an uniform color remains the same */
-        mult = (float)(1 << FILTER_BITS) / norm;
-        for(i=0;i<NB_TAPS;i++) {
-            v = (int)(tab[i] * mult);
-            filter[ph * NB_TAPS + i] = v;
-        }
-    }
-}
-
 ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                       int iwidth, int iheight)
 {
@@ -592,6 +561,8 @@ ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
     s = av_mallocz(sizeof(ImgReSampleContext));
     if (!s)
         return NULL;
+    if((unsigned)owidth >= UINT_MAX / (LINE_BUF_HEIGHT + NB_TAPS))
+        return NULL;
     s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
     if (!s->line_buf) 
         goto fail;
@@ -617,10 +588,10 @@ ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
     s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth;
     s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / s->pad_oheight; 
 
-    build_filter(&s->h_filters[0][0], (float) s->pad_owidth  / 
-            (float) (iwidth - leftBand - rightBand));
-    build_filter(&s->v_filters[0][0], (float) s->pad_oheight / 
-            (float) (iheight - topBand - bottomBand));
+    av_build_filter(&s->h_filters[0][0], (float) s->pad_owidth  / 
+            (float) (iwidth - leftBand - rightBand), NB_TAPS, NB_PHASES, 1<<FILTER_BITS, 0);
+    av_build_filter(&s->v_filters[0][0], (float) s->pad_oheight / 
+            (float) (iheight - topBand - bottomBand), NB_TAPS, NB_PHASES, 1<<FILTER_BITS, 0);
 
     return s;
 fail:
@@ -657,21 +628,7 @@ void img_resample_close(ImgReSampleContext *s)
 }
 
 #ifdef TEST
-
-void *av_mallocz(int size)
-{
-    void *ptr;
-    ptr = malloc(size);
-    memset(ptr, 0, size);
-    return ptr;
-}
-
-void av_free(void *ptr)
-{
-    /* XXX: this test should not be needed on most libcs */
-    if (ptr)
-        free(ptr);
-}
+#include <stdio.h>
 
 /* input */
 #define XSIZE 256
@@ -698,11 +655,11 @@ static void dump_filter(int16_t *filter)
     int i, ph;
 
     for(ph=0;ph<NB_PHASES;ph++) {
-        printf("%2d: ", ph);
+        av_log(NULL, AV_LOG_INFO, "%2d: ", ph);
         for(i=0;i<NB_TAPS;i++) {
-            printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
+            av_log(NULL, AV_LOG_INFO, " %5.2f", filter[ph * NB_TAPS + i] / 256.0);
         }
-        printf("\n");
+        av_log(NULL, AV_LOG_INFO, "\n");
     }
 }
 
@@ -766,20 +723,20 @@ int main(int argc, char **argv)
         fact = factors[i];
         xsize = (int)(XSIZE * fact);
         ysize = (int)((YSIZE - 100) * fact);
-        s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50 ,50, 0, 0);
-        printf("Factor=%0.2f\n", fact);
+        s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50 ,50, 0, 0, 0, 0, 0, 0);
+        av_log(NULL, AV_LOG_INFO, "Factor=%0.2f\n", fact);
         dump_filter(&s->h_filters[0][0]);
         component_resample(s, img1, xsize, xsize, ysize,
                            img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
         img_resample_close(s);
 
-        sprintf(buf, "/tmp/out%d.pgm", i);
+        snprintf(buf, sizeof(buf), "/tmp/out%d.pgm", i);
         save_pgm(buf, img1, xsize, ysize);
     }
 
     /* mmx test */
 #ifdef HAVE_MMX
-    printf("MMX test\n");
+    av_log(NULL, AV_LOG_INFO, "MMX test\n");
     fact = 0.72;
     xsize = (int)(XSIZE * fact);
     ysize = (int)(YSIZE * fact);
@@ -793,10 +750,10 @@ int main(int argc, char **argv)
     component_resample(s, img2, xsize, xsize, ysize,
                        img, XSIZE, XSIZE, YSIZE);
     if (memcmp(img1, img2, xsize * ysize) != 0) {
-        fprintf(stderr, "mmx error\n");
+        av_log(NULL, AV_LOG_ERROR, "mmx error\n");
         exit(1);
     }
-    printf("MMX OK\n");
+    av_log(NULL, AV_LOG_INFO, "MMX OK\n");
 #endif
     return 0;
 }
diff --git a/src/libffmpeg/libavcodec/indeo3.c b/src/libffmpeg/libavcodec/indeo3.c
index 14ff02858..351af2191 100644
--- a/src/libffmpeg/libavcodec/indeo3.c
+++ b/src/libffmpeg/libavcodec/indeo3.c
@@ -95,13 +95,16 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s, unsigned char *cur,
   long fflags2, unsigned char *hdr,
   unsigned char *buf2, int min_width_160);
 
+#ifndef min
 #define min(a,b) ((a) < (b) ? (a) : (b))
+#endif
 
 /* ---------------------------------------------------------------------- */
 static void iv_alloc_frames(Indeo3DecodeContext *s) 
 {
   int luma_width, luma_height, luma_pixels, chroma_width, chroma_height,
-    chroma_pixels, bufsize, i;
+      chroma_pixels, i;
+  unsigned int bufsize;
 
   luma_width   = (s->width  + 3) & (~3);
   luma_height  = (s->height + 3) & (~3);
@@ -195,6 +198,10 @@ static unsigned long iv_decode_frame(Indeo3DecodeContext *s,
   hdr_height = le2me_16(*(uint16_t *)buf_pos);
   buf_pos += 2;
   hdr_width = le2me_16(*(uint16_t *)buf_pos);
+  
+  if(avcodec_check_dimensions(NULL, hdr_width, hdr_height))
+      return -1;
+  
   buf_pos += 2;
   chroma_height = ((hdr_height >> 2) + 3) & 0x7ffc;
   chroma_width = ((hdr_width >> 2) + 3) & 0x7ffc;
@@ -223,6 +230,9 @@ static unsigned long iv_decode_frame(Indeo3DecodeContext *s,
     hdr_height, buf_pos + offs * 2, fflags2, hdr_pos, buf_pos, 
     min(hdr_width, 160));
 
+  if (!(s->avctx->flags & CODEC_FLAG_GRAY))
+  {
+
   buf_pos = buf + 16 + offs2;
   offs = le2me_32(*(uint32_t *)buf_pos);
   buf_pos += 4;
@@ -239,6 +249,8 @@ static unsigned long iv_decode_frame(Indeo3DecodeContext *s,
     chroma_height, buf_pos + offs * 2, fflags2, hdr_pos, buf_pos, 
     min(chroma_width, 40));
 
+  }
+
   return 8;
 }
 
@@ -304,7 +316,7 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
   unsigned char bit_buf;
   unsigned long bit_pos, lv, lv1, lv2;
   long *width_tbl, width_tbl_arr[10];
-  char *ref_vectors;
+  signed char *ref_vectors;
   unsigned char *cur_frm_pos, *ref_frm_pos, *cp, *cp2;
   uint32_t *cur_lp, *ref_lp;
   const uint32_t *correction_lp[2], *correctionloworder_lp[2], *correctionhighorder_lp[2];
@@ -312,6 +324,7 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
   ustr_t strip_tbl[20], *strip;
   int i, j, k, lp1, lp2, flag1, cmd, blks_width, blks_height, region_160_width,
     rle_v1, rle_v2, rle_v3;
+  unsigned short res;
 
   bit_buf = 0;
   ref_vectors = NULL;
@@ -446,14 +459,15 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
 
                 switch(correction_type_sp[0][k]) {
                   case 0:
-                    *cur_lp = ((*ref_lp >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                    *cur_lp = le2me_32(((le2me_32(*ref_lp) >> 1) + correction_lp[lp2 & 0x01][k]) << 1);
                     lp2++;
                     break;
                   case 1:
-                    ((unsigned short *)cur_lp)[0] = ((((unsigned short *)(ref_lp))[0] >> 1)
-                      + correction_lp[lp2 & 0x01][*buf1++]) << 1;
-                    ((unsigned short *)cur_lp)[1] = ((((unsigned short *)(ref_lp))[1] >> 1)
-                      + correction_lp[lp2 & 0x01][k]) << 1;
+                    res = ((le2me_16(((unsigned short *)(ref_lp))[0]) >> 1) + correction_lp[lp2 & 0x01][*buf1]) << 1;
+                    ((unsigned short *)cur_lp)[0] = le2me_16(res);
+                    res = ((le2me_16(((unsigned short *)(ref_lp))[1]) >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                    ((unsigned short *)cur_lp)[1] = le2me_16(res);
+                    buf1++;
                     lp2++;
                     break;
                   case 2:
@@ -548,23 +562,25 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
 
                 switch(correction_type_sp[lp2 & 0x01][k]) {
                   case 0:
-                    cur_lp[width_tbl[1]] = ((*ref_lp >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                    cur_lp[width_tbl[1]] = le2me_32(((le2me_32(*ref_lp) >> 1) + correction_lp[lp2 & 0x01][k]) << 1);
                     if(lp2 > 0 || flag1 == 0 || strip->ypos != 0)
                       cur_lp[0] = ((cur_lp[-width_tbl[1]] >> 1) + (cur_lp[width_tbl[1]] >> 1)) & 0xFEFEFEFE;
                     else
-                      cur_lp[0] = ((*ref_lp >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                      cur_lp[0] = le2me_32(((le2me_32(*ref_lp) >> 1) + correction_lp[lp2 & 0x01][k]) << 1);
                     lp2++;
                     break;
 
                   case 1:
-                    ((unsigned short *)cur_lp)[width_tbl[2]] =
-                      ((((unsigned short *)ref_lp)[0] >> 1) + correction_lp[lp2 & 0x01][*buf1++]) << 1;
-                    ((unsigned short *)cur_lp)[width_tbl[2]+1] =
-                      ((((unsigned short *)ref_lp)[1] >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                    res = ((le2me_16(((unsigned short *)ref_lp)[0]) >> 1) + correction_lp[lp2 & 0x01][*buf1]) << 1;
+                    ((unsigned short *)cur_lp)[width_tbl[2]] = le2me_16(res);
+                    res = ((le2me_16(((unsigned short *)ref_lp)[1]) >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                    ((unsigned short *)cur_lp)[width_tbl[2]+1] = le2me_16(res);
+
                     if(lp2 > 0 || flag1 == 0 || strip->ypos != 0)
                       cur_lp[0] = ((cur_lp[-width_tbl[1]] >> 1) + (cur_lp[width_tbl[1]] >> 1)) & 0xFEFEFEFE;
                     else
                       cur_lp[0] = cur_lp[width_tbl[1]];
+                    buf1++;
                     lp2++;
                     break;
 
@@ -660,16 +676,23 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
                   lv1 = ref_lp[0];
                   lv2 = ref_lp[1];
                   if(lp2 == 0 && flag1 != 0) {
+#ifdef WORDS_BIGENDIAN
+                    lv1 = lv1 & 0xFF00FF00;
+                    lv1 = (lv1 >> 8) | lv1;
+                    lv2 = lv2 & 0xFF00FF00;
+                    lv2 = (lv2 >> 8) | lv2;
+#else
                     lv1 = lv1 & 0x00FF00FF;
                     lv1 = (lv1 << 8) | lv1;
                     lv2 = lv2 & 0x00FF00FF;
                     lv2 = (lv2 << 8) | lv2;
+#endif
                   }
 
                   switch(correction_type_sp[lp2 & 0x01][k]) {
                     case 0:
-                      cur_lp[width_tbl[1]] = ((lv1 >> 1) + correctionloworder_lp[lp2 & 0x01][k]) << 1;
-                      cur_lp[width_tbl[1]+1] = ((lv2 >> 1) + correctionhighorder_lp[lp2 & 0x01][k]) << 1;
+                      cur_lp[width_tbl[1]] = le2me_32(((le2me_32(lv1) >> 1) + correctionloworder_lp[lp2 & 0x01][k]) << 1);
+                      cur_lp[width_tbl[1]+1] = le2me_32(((le2me_32(lv2) >> 1) + correctionhighorder_lp[lp2 & 0x01][k]) << 1);
                       if(lp2 > 0 || strip->ypos != 0 || flag1 == 0) {
                         cur_lp[0] = ((cur_lp[-width_tbl[1]] >> 1) + (cur_lp[width_tbl[1]] >> 1)) & 0xFEFEFEFE;
                         cur_lp[1] = ((cur_lp[-width_tbl[1]+1] >> 1) + (cur_lp[width_tbl[1]+1] >> 1)) & 0xFEFEFEFE;
@@ -681,8 +704,8 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
                       break;
 
                     case 1:
-                      cur_lp[width_tbl[1]] = ((lv1 >> 1) + correctionloworder_lp[lp2 & 0x01][*buf1++]) << 1;
-                      cur_lp[width_tbl[1]+1] = ((lv2 >> 1) + correctionloworder_lp[lp2 & 0x01][k]) << 1;
+                      cur_lp[width_tbl[1]] = le2me_32(((le2me_32(lv1) >> 1) + correctionloworder_lp[lp2 & 0x01][*buf1]) << 1);
+                      cur_lp[width_tbl[1]+1] = le2me_32(((le2me_32(lv2) >> 1) + correctionloworder_lp[lp2 & 0x01][k]) << 1);
                       if(lp2 > 0 || strip->ypos != 0 || flag1 == 0) {
                         cur_lp[0] = ((cur_lp[-width_tbl[1]] >> 1) + (cur_lp[width_tbl[1]] >> 1)) & 0xFEFEFEFE;
                         cur_lp[1] = ((cur_lp[-width_tbl[1]+1] >> 1) + (cur_lp[width_tbl[1]+1] >> 1)) & 0xFEFEFEFE;
@@ -690,6 +713,7 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
                         cur_lp[0] = cur_lp[width_tbl[1]];
                         cur_lp[1] = cur_lp[width_tbl[1]+1];
                       }
+                      buf1++;
                       lp2++;
                       break;
 
@@ -824,20 +848,20 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
                     case 0:
                       lv1 = correctionloworder_lp[lp2 & 0x01][k];
                       lv2 = correctionhighorder_lp[lp2 & 0x01][k];
-                      cur_lp[0] = ((ref_lp[0] >> 1) + lv1) << 1;
-                      cur_lp[1] = ((ref_lp[1] >> 1) + lv2) << 1;
-                      cur_lp[width_tbl[1]] = ((ref_lp[width_tbl[1]] >> 1) + lv1) << 1;
-                      cur_lp[width_tbl[1]+1] = ((ref_lp[width_tbl[1]+1] >> 1) + lv2) << 1;
+                      cur_lp[0] = le2me_32(((le2me_32(ref_lp[0]) >> 1) + lv1) << 1);
+                      cur_lp[1] = le2me_32(((le2me_32(ref_lp[1]) >> 1) + lv2) << 1);
+                      cur_lp[width_tbl[1]] = le2me_32(((le2me_32(ref_lp[width_tbl[1]]) >> 1) + lv1) << 1);
+                      cur_lp[width_tbl[1]+1] = le2me_32(((le2me_32(ref_lp[width_tbl[1]+1]) >> 1) + lv2) << 1);
                       lp2++;
                       break;
 
                     case 1:
                       lv1 = correctionloworder_lp[lp2 & 0x01][*buf1++];
                       lv2 = correctionloworder_lp[lp2 & 0x01][k];
-                      cur_lp[0] = ((ref_lp[0] >> 1) + lv1) << 1;
-                      cur_lp[1] = ((ref_lp[1] >> 1) + lv2) << 1;
-                      cur_lp[width_tbl[1]] = ((ref_lp[width_tbl[1]] >> 1) + lv1) << 1;
-                      cur_lp[width_tbl[1]+1] = ((ref_lp[width_tbl[1]+1] >> 1) + lv2) << 1;
+                      cur_lp[0] = le2me_32(((le2me_32(ref_lp[0]) >> 1) + lv1) << 1);
+                      cur_lp[1] = le2me_32(((le2me_32(ref_lp[1]) >> 1) + lv2) << 1);
+                      cur_lp[width_tbl[1]] = le2me_32(((le2me_32(ref_lp[width_tbl[1]]) >> 1) + lv1) << 1);
+                      cur_lp[width_tbl[1]+1] = le2me_32(((le2me_32(ref_lp[width_tbl[1]+1]) >> 1) + lv2) << 1);
                       lp2++;
                       break;
 
@@ -925,18 +949,22 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
 
                 switch(correction_type_sp[lp2 & 0x01][k]) {
                   case 0:
-                    cur_lp[0] = ((*ref_lp >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
-                    cur_lp[width_tbl[1]] = ((ref_lp[width_tbl[1]] >> 1) + correction_lp[lp2 & 0x01][k]) << 1;
+                    cur_lp[0] = le2me_32(((le2me_32(*ref_lp) >> 1) + correction_lp[lp2 & 0x01][k]) << 1);
+                    cur_lp[width_tbl[1]] = le2me_32(((le2me_32(ref_lp[width_tbl[1]]) >> 1) + correction_lp[lp2 & 0x01][k]) << 1);
                     lp2++;
                     break;
 
                   case 1:
                     lv1 = (unsigned short)(correction_lp[lp2 & 0x01][*buf1++]);
                     lv2 = (unsigned short)(correction_lp[lp2 & 0x01][k]);
-                    ((unsigned short *)cur_lp)[0] = ((((unsigned short *)ref_lp)[0] >> 1) + lv1) << 1;
-                    ((unsigned short *)cur_lp)[1] = ((((unsigned short *)ref_lp)[1] >> 1) + lv2) << 1;
-                    ((unsigned short *)cur_lp)[width_tbl[2]] = ((((unsigned short *)ref_lp)[width_tbl[2]] >> 1) + lv1) << 1;
-                    ((unsigned short *)cur_lp)[width_tbl[2]+1] = ((((unsigned short *)ref_lp)[width_tbl[2]+1] >> 1) + lv2) << 1;
+                    res = (unsigned short)(((le2me_16(((unsigned short *)ref_lp)[0]) >> 1) + lv1) << 1);
+                    ((unsigned short *)cur_lp)[0] = le2me_16(res);
+                    res = (unsigned short)(((le2me_16(((unsigned short *)ref_lp)[1]) >> 1) + lv2) << 1);
+                    ((unsigned short *)cur_lp)[1] = le2me_16(res);
+                    res = (unsigned short)(((le2me_16(((unsigned short *)ref_lp)[width_tbl[2]]) >> 1) + lv1) << 1);
+                    ((unsigned short *)cur_lp)[width_tbl[2]] = le2me_16(res);
+                    res = (unsigned short)(((le2me_16(((unsigned short *)ref_lp)[width_tbl[2]+1]) >> 1) + lv2) << 1);
+                    ((unsigned short *)cur_lp)[width_tbl[2]+1] = le2me_16(res);
                     lp2++;
                     break;
 
@@ -1056,11 +1084,6 @@ static int indeo3_decode_frame(AVCodecContext *avctx,
     unsigned char *src, *dest;
     int y;
 
-    /* no supplementary picture */
-    if (buf_size == 0) {
-        return 0;
-    }
-
     iv_decode_frame(s, buf, buf_size);
 
     if(s->frame.data[0])
@@ -1080,6 +1103,8 @@ static int indeo3_decode_frame(AVCodecContext *avctx,
       dest += s->frame.linesize[0];
     }
 
+    if (!(s->avctx->flags & CODEC_FLAG_GRAY))
+    {
     src = s->cur_frame->Ubuf;
     dest = s->frame.data[1];
     for (y = 0; y < s->height / 4; y++) {
@@ -1095,6 +1120,7 @@ static int indeo3_decode_frame(AVCodecContext *avctx,
       src += s->cur_frame->uv_w;
       dest += s->frame.linesize[2];
     }
+    }
 
     *data_size=sizeof(AVFrame);
     *(AVFrame*)data= s->frame;
diff --git a/src/libffmpeg/libavcodec/integer.c b/src/libffmpeg/libavcodec/integer.c
index 025560f9e..38a826f86 100644
--- a/src/libffmpeg/libavcodec/integer.c
+++ b/src/libffmpeg/libavcodec/integer.c
@@ -47,6 +47,10 @@ AVInteger av_sub_i(AVInteger a, AVInteger b){
     return a;
 }
 
+/**
+ * returns the rounded down value of the logarithm of base 2 of the given AVInteger.
+ * this is simply the index of the most significant bit which is 1. Or 0 if all bits are 0
+ */
 int av_log2_i(AVInteger a){
     int i;
 
@@ -78,6 +82,9 @@ AVInteger av_mul_i(AVInteger a, AVInteger b){
     return out;
 }
 
+/**
+ * returns 0 if a==b, 1 if a>b and -1 if a<b.
+ */
 int av_cmp_i(AVInteger a, AVInteger b){
     int i; 
     int v= (int16_t)a.v[AV_INTEGER_SIZE-1] - (int16_t)b.v[AV_INTEGER_SIZE-1];
@@ -90,6 +97,10 @@ int av_cmp_i(AVInteger a, AVInteger b){
     return 0;
 }
 
+/**
+ * bitwise shift.
+ * @param s the number of bits by which the value should be shifted right, may be negative for shifting left
+ */
 AVInteger av_shr_i(AVInteger a, int s){
     AVInteger out;
     int i;
@@ -104,6 +115,10 @@ AVInteger av_shr_i(AVInteger a, int s){
     return out;
 }
 
+/**
+ * returns a % b.
+ * @param quot a/b will be stored here
+ */
 AVInteger av_mod_i(AVInteger *quot, AVInteger a, AVInteger b){
     int i= av_log2_i(a) - av_log2_i(b);
     AVInteger quot_temp;
@@ -128,12 +143,18 @@ AVInteger av_mod_i(AVInteger *quot, AVInteger a, AVInteger b){
     return a;
 }
 
+/**
+ * returns a/b.
+ */
 AVInteger av_div_i(AVInteger a, AVInteger b){
     AVInteger quot;
     av_mod_i(&quot, a, b);
     return quot;
 }
 
+/**
+ * converts the given int64_t to an AVInteger.
+ */
 AVInteger av_int2i(int64_t a){
     AVInteger out;
     int i;
@@ -145,6 +166,11 @@ AVInteger av_int2i(int64_t a){
     return out;
 }
 
+/**
+ * converts the given AVInteger to an int64_t.
+ * if the AVInteger is too large to fit into an int64_t, 
+ * then only the least significant 64 bits will be used
+ */
 int64_t av_i2int(AVInteger a){
     int i;
     int64_t out=(int8_t)a.v[AV_INTEGER_SIZE-1];
diff --git a/src/libffmpeg/libavcodec/interplayvideo.c b/src/libffmpeg/libavcodec/interplayvideo.c
index 06816ba3e..f4add08c0 100644
--- a/src/libffmpeg/libavcodec/interplayvideo.c
+++ b/src/libffmpeg/libavcodec/interplayvideo.c
@@ -47,7 +47,7 @@
 /* debugging support */
 #define DEBUG_INTERPLAY 0
 #if DEBUG_INTERPLAY
-#define debug_interplay printf
+#define debug_interplay(x,...) av_log(NULL, AV_LOG_DEBUG, x, __VA_ARGS__)
 #else
 static inline void debug_interplay(const char *format, ...) { }
 #endif
diff --git a/src/libffmpeg/libavcodec/jrevdct.c b/src/libffmpeg/libavcodec/jrevdct.c
index 3bd78c192..c08d1241f 100644
--- a/src/libffmpeg/libavcodec/jrevdct.c
+++ b/src/libffmpeg/libavcodec/jrevdct.c
@@ -235,9 +235,7 @@ void j_rev_dct(DCTBLOCK data)
     /* The rotator is sqrt(2)*c(-6). */
 {
     if (d6) {
-	if (d4) {
 	    if (d2) {
-		if (d0) {
 		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
@@ -250,21 +248,7 @@ void j_rev_dct(DCTBLOCK data)
 		    tmp13 = tmp0 - tmp3;
 		    tmp11 = tmp1 + tmp2;
 		    tmp12 = tmp1 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
-		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
-		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
-		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
-
-		    tmp0 = d4 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp2 - tmp0;
-		    tmp12 = -(tmp0 + tmp2);
-		}
 	    } else {
-		if (d0) {
 		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
@@ -276,72 +260,9 @@ void j_rev_dct(DCTBLOCK data)
 		    tmp13 = tmp0 - tmp3;
 		    tmp11 = tmp1 + tmp2;
 		    tmp12 = tmp1 - tmp2;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
-		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
-		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
-
-		    tmp0 = d4 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp2 - tmp0;
-		    tmp12 = -(tmp0 + tmp2);
-		}
 	    }
-	} else {
-	    if (d2) {
-		if (d0) {
-		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
-		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
-		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
-		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
-
-		    tmp0 = d0 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp0 + tmp2;
-		    tmp12 = tmp0 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
-		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
-		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
-		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
-
-		    tmp10 = tmp3;
-		    tmp13 = -tmp3;
-		    tmp11 = tmp2;
-		    tmp12 = -tmp2;
-		}
-	    } else {
-		if (d0) {
-		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
-		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
-		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
-
-		    tmp0 = d0 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp0 + tmp2;
-		    tmp12 = tmp0 - tmp2;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
-		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
-		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
-
-		    tmp10 = tmp3;
-		    tmp13 = -tmp3;
-		    tmp11 = tmp2;
-		    tmp12 = -tmp2;
-		}
-	    }
-	}
     } else {
-	if (d4) {
 	    if (d2) {
-		if (d0) {
 		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
@@ -353,62 +274,11 @@ void j_rev_dct(DCTBLOCK data)
 		    tmp13 = tmp0 - tmp3;
 		    tmp11 = tmp1 + tmp2;
 		    tmp12 = tmp1 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
-		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
-		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
-
-		    tmp0 = d4 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp2 - tmp0;
-		    tmp12 = -(tmp0 + tmp2);
-		}
 	    } else {
-		if (d0) {
 		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
 		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
 		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
-		    tmp10 = tmp13 = d4 << CONST_BITS;
-		    tmp11 = tmp12 = -tmp10;
-		}
-	    }
-	} else {
-	    if (d2) {
-		if (d0) {
-		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
-		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
-		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
-
-		    tmp0 = d0 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp0 + tmp2;
-		    tmp12 = tmp0 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
-		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
-		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
-
-		    tmp10 = tmp3;
-		    tmp13 = -tmp3;
-		    tmp11 = tmp2;
-		    tmp12 = -tmp2;
-		}
-	    } else {
-		if (d0) {
-		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
-		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
-		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
-		}
 	    }
-	}
       }
 
     /* Odd part per figure 8; the matrix is unitary and hence its
@@ -711,9 +581,7 @@ void j_rev_dct(DCTBLOCK data)
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
     if (d6) {
-	if (d4) {
 	    if (d2) {
-		if (d0) {
 		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
 		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
 		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
@@ -726,21 +594,7 @@ void j_rev_dct(DCTBLOCK data)
 		    tmp13 = tmp0 - tmp3;
 		    tmp11 = tmp1 + tmp2;
 		    tmp12 = tmp1 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
-		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
-		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
-		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
-
-		    tmp0 = d4 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp2 - tmp0;
-		    tmp12 = -(tmp0 + tmp2);
-		}
 	    } else {
-		if (d0) {
 		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
 		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
 		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
@@ -752,72 +606,9 @@ void j_rev_dct(DCTBLOCK data)
 		    tmp13 = tmp0 - tmp3;
 		    tmp11 = tmp1 + tmp2;
 		    tmp12 = tmp1 - tmp2;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
-		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
-		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
-
-		    tmp0 = d4 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp2 - tmp0;
-		    tmp12 = -(tmp0 + tmp2);
-		}
-	    }
-	} else {
-	    if (d2) {
-		if (d0) {
-		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
-		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
-		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
-		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
-
-		    tmp0 = d0 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp0 + tmp2;
-		    tmp12 = tmp0 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
-		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
-		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
-		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
-
-		    tmp10 = tmp3;
-		    tmp13 = -tmp3;
-		    tmp11 = tmp2;
-		    tmp12 = -tmp2;
-		}
-	    } else {
-		if (d0) {
-		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
-		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
-		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
-
-		    tmp0 = d0 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp0 + tmp2;
-		    tmp12 = tmp0 - tmp2;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
-		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
-		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
-
-		    tmp10 = tmp3;
-		    tmp13 = -tmp3;
-		    tmp11 = tmp2;
-		    tmp12 = -tmp2;
-		}
 	    }
-	}
     } else {
-	if (d4) {
 	    if (d2) {
-		if (d0) {
 		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
 		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
 		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
@@ -829,62 +620,11 @@ void j_rev_dct(DCTBLOCK data)
 		    tmp13 = tmp0 - tmp3;
 		    tmp11 = tmp1 + tmp2;
 		    tmp12 = tmp1 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
-		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
-		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
-
-		    tmp0 = d4 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp2 - tmp0;
-		    tmp12 = -(tmp0 + tmp2);
-		}
 	    } else {
-		if (d0) {
 		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
 		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
 		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
-		    tmp10 = tmp13 = d4 << CONST_BITS;
-		    tmp11 = tmp12 = -tmp10;
-		}
-	    }
-	} else {
-	    if (d2) {
-		if (d0) {
-		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
-		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
-		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
-
-		    tmp0 = d0 << CONST_BITS;
-
-		    tmp10 = tmp0 + tmp3;
-		    tmp13 = tmp0 - tmp3;
-		    tmp11 = tmp0 + tmp2;
-		    tmp12 = tmp0 - tmp2;
-		} else {
-		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
-		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
-		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
-
-		    tmp10 = tmp3;
-		    tmp13 = -tmp3;
-		    tmp11 = tmp2;
-		    tmp12 = -tmp2;
-		}
-	    } else {
-		if (d0) {
-		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
-		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
-		} else {
-		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
-		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
-		}
 	    }
-	}
     }
 
     /* Odd part per figure 8; the matrix is unitary and hence its
@@ -1172,5 +912,215 @@ void j_rev_dct(DCTBLOCK data)
   }
 }
 
+#undef DCTSIZE
+#define DCTSIZE 4
+#define DCTSTRIDE 8
+
+void j_rev_dct4(DCTBLOCK data) /* in-place 4x4 inverse DCT on the top-left corner of data; rows are DCTSTRIDE elements apart */
+{
+  int32_t tmp0, tmp1, tmp2, tmp3;
+  int32_t tmp10, tmp11, tmp12, tmp13;
+  int32_t z1;
+  int32_t d0, d2, d4, d6;
+  register DCTELEM *dataptr;
+  int rowctr;
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  data[0] += 4; /* acts as the rounding bias: pass 2 descales with a plain >> and adds no rounding term itself */
+  
+  dataptr = data;
+
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Due to quantization, we will usually find that many of the input
+     * coefficients are zero, especially the AC terms.  We can exploit this
+     * by short-circuiting the IDCT calculation for any row in which all
+     * the AC terms are zero.  In that case each output is equal to the
+     * DC coefficient (with scale factor as needed).
+     * With typical images and quantization tables, half or more of the
+     * row DCT calculations can be simplified this way.
+     */
+
+    register int *idataptr = (int*)dataptr; /* int view of the row for the DC-only stores below; assumes DCTELEM is 16 bits and int 32 bits -- TODO confirm */
+
+    d0 = dataptr[0]; /* the four coefficients of the row are fed into the even-part formulas */
+    d2 = dataptr[1]; /* under the names d0, d2, d4, d6 */
+    d4 = dataptr[2];
+    d6 = dataptr[3];
+
+    if ((d2 | d4 | d6) == 0) {
+      /* AC terms all zero */
+      if (d0) { /* when d0 is 0 as well, the four row entries are already zero and nothing is stored */
+	  /* Compute a 32 bit value to assign. */
+	  DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
+	  register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000); /* dcval replicated into both 16-bit halves */
+	  
+	  idataptr[0] = v;
+	  idataptr[1] = v;
+      }
+      
+      dataptr += DCTSTRIDE;	/* advance pointer to next row */
+      continue;
+    }
+    
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+	    if (d2) {
+		    /* d2 != 0, d6 != 0 (d0 and d4 are not tested in this function) */
+		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+		    tmp0 = (d0 + d4) << CONST_BITS;
+		    tmp1 = (d0 - d4) << CONST_BITS;
+
+		    tmp10 = tmp0 + tmp3;
+		    tmp13 = tmp0 - tmp3;
+		    tmp11 = tmp1 + tmp2;
+		    tmp12 = tmp1 - tmp2;
+	    } else {
+		    /* d2 == 0, d6 != 0 (d0 and d4 are not tested in this function) */
+		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+		    tmp0 = (d0 + d4) << CONST_BITS;
+		    tmp1 = (d0 - d4) << CONST_BITS;
+
+		    tmp10 = tmp0 + tmp3;
+		    tmp13 = tmp0 - tmp3;
+		    tmp11 = tmp1 + tmp2;
+		    tmp12 = tmp1 - tmp2;
+	    }
+    } else {
+	    if (d2) {
+		    /* d2 != 0, d6 == 0 (d0 and d4 are not tested in this function) */
+		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+		    tmp0 = (d0 + d4) << CONST_BITS;
+		    tmp1 = (d0 - d4) << CONST_BITS;
+
+		    tmp10 = tmp0 + tmp3;
+		    tmp13 = tmp0 - tmp3;
+		    tmp11 = tmp1 + tmp2;
+		    tmp12 = tmp1 - tmp2;
+	    } else {
+		    /* d2 == 0, d6 == 0 (d0 and d4 are not tested in this function) */
+		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
+		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+	    }
+      }
+
+    /* Final output stage: inputs are tmp10..tmp13 */
+
+    dataptr[0] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
+    dataptr[2] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
+
+    dataptr += DCTSTRIDE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  dataptr = data;
+  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+    /* Each of the DCTSIZE columns is transformed in place.  Unlike pass 1,
+     * there is no shortcut here for columns whose AC terms are all zero:
+     * every column runs through the even-part computation below, which
+     * only branches on whether d2 and d6 are zero.
+     * The loop counter is still named rowctr, but in this pass it counts
+     * columns (dataptr advances by one element per iteration).
+     */
+
+    d0 = dataptr[DCTSTRIDE*0];
+    d2 = dataptr[DCTSTRIDE*1];
+    d4 = dataptr[DCTSTRIDE*2];
+    d6 = dataptr[DCTSTRIDE*3];
+
+    /* Even part: reverse the even part of the forward DCT. */
+    /* The rotator is sqrt(2)*c(-6). */
+    if (d6) {
+	    if (d2) {
+		    /* d2 != 0, d6 != 0 (d0 and d4 are not tested in this function) */
+		    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
+		    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
+		    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
+
+		    tmp0 = (d0 + d4) << CONST_BITS;
+		    tmp1 = (d0 - d4) << CONST_BITS;
+
+		    tmp10 = tmp0 + tmp3;
+		    tmp13 = tmp0 - tmp3;
+		    tmp11 = tmp1 + tmp2;
+		    tmp12 = tmp1 - tmp2;
+	    } else {
+		    /* d2 == 0, d6 != 0 (d0 and d4 are not tested in this function) */
+		    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
+		    tmp3 = MULTIPLY(d6, FIX_0_541196100);
+
+		    tmp0 = (d0 + d4) << CONST_BITS;
+		    tmp1 = (d0 - d4) << CONST_BITS;
+
+		    tmp10 = tmp0 + tmp3;
+		    tmp13 = tmp0 - tmp3;
+		    tmp11 = tmp1 + tmp2;
+		    tmp12 = tmp1 - tmp2;
+	    }
+    } else {
+	    if (d2) {
+		    /* d2 != 0, d6 == 0 (d0 and d4 are not tested in this function) */
+		    tmp2 = MULTIPLY(d2, FIX_0_541196100);
+		    tmp3 = MULTIPLY(d2, FIX_1_306562965);
+
+		    tmp0 = (d0 + d4) << CONST_BITS;
+		    tmp1 = (d0 - d4) << CONST_BITS;
+
+		    tmp10 = tmp0 + tmp3;
+		    tmp13 = tmp0 - tmp3;
+		    tmp11 = tmp1 + tmp2;
+		    tmp12 = tmp1 - tmp2;
+	    } else {
+		    /* d2 == 0, d6 == 0 (d0 and d4 are not tested in this function) */
+		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
+		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
+	    }
+    }
+
+    /* Final output stage: inputs are tmp10..tmp13; plain shift, rounding comes from the bias added to data[0] */
+
+    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
+    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
+    
+    dataptr++;			/* advance pointer to next column */
+  }
+}
+
+void j_rev_dct2(DCTBLOCK data){ /* in-place 2x2 sum/difference transform on the top-left corner of data, descaled by 8 */
+  int d00, d01, d10, d11;
+
+  data[0] += 4; /* rounding bias: data[0] feeds d00 and d01, so every output below gets +4 before its >>3 */
+  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; /* row 0: sum */
+  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE]; /* row 0: difference */
+  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; /* row 1: sum */
+  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE]; /* row 1: difference */
+ 
+  data[0+0*DCTSTRIDE]= (d00 + d10)>>3; /* then the same sum/difference step across the two rows */
+  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
+  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
+  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
+}
+
+void j_rev_dct1(DCTBLOCK data){ /* DC-only case: only data[0] is read and written */
+  data[0] = (data[0] + 4)>>3; /* divide by 8 with rounding (+4 is half of 8) */
+}
+
 #undef FIX
 #undef CONST_BITS
diff --git a/src/libffmpeg/libavcodec/lcl.c b/src/libffmpeg/libavcodec/lcl.c
index 9a8591a89..a15a10769 100644
--- a/src/libffmpeg/libavcodec/lcl.c
+++ b/src/libffmpeg/libavcodec/lcl.c
@@ -41,6 +41,7 @@
 #include <stdlib.h>
 
 #include "common.h"
+#include "bitstream.h"
 #include "avcodec.h"
 
 #ifdef CONFIG_ZLIB
@@ -144,14 +145,15 @@ static inline unsigned char get_r (unsigned char yq, signed char rq)
 
 
 
-static int mszh_decomp(unsigned char * srcptr, int srclen, unsigned char * destptr)
+static unsigned int mszh_decomp(unsigned char * srcptr, int srclen, unsigned char * destptr, unsigned int destsize)
 {
     unsigned char *destptr_bak = destptr;
+    unsigned char *destptr_end = destptr + destsize;
     unsigned char mask = 0;
     unsigned char maskbit = 0;
     unsigned int ofs, cnt;
   
-    while (srclen > 0) {
+    while ((srclen > 0) && (destptr < destptr_end)) {
         if (maskbit == 0) {
             mask = *(srcptr++);
             maskbit = 8;
@@ -159,6 +161,8 @@ static int mszh_decomp(unsigned char * srcptr, int srclen, unsigned char * destp
             continue;
         }
         if ((mask & (1 << (--maskbit))) == 0) {
+            if (destptr + 4 > destptr_end)
+                break;
             *(int*)destptr = *(int*)srcptr;
             srclen -= 4;
             destptr += 4;
@@ -171,6 +175,9 @@ static int mszh_decomp(unsigned char * srcptr, int srclen, unsigned char * destp
             ofs &= 0x7ff;
             srclen -= 2;
             cnt *= 4;
+            if (destptr + cnt > destptr_end) {
+                cnt =  destptr_end - destptr;
+            }
             for (; cnt > 0; cnt--) {
                 *(destptr) = *(destptr - ofs);
                 destptr++;
@@ -193,7 +200,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
 {
 	LclContext * const c = (LclContext *)avctx->priv_data;
 	unsigned char *encoded = (unsigned char *)buf;
-    int pixel_ptr;
+    unsigned int pixel_ptr;
     int row, col;
     unsigned char *outptr;
     unsigned int width = avctx->width; // Real image width
@@ -205,11 +212,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
 #ifdef CONFIG_ZLIB
     int zret; // Zlib return code
 #endif
-    int len = buf_size;
-
-	/* no supplementary picture */
-	if (buf_size == 0)
-		return 0;
+    unsigned int len = buf_size;
 
 	if(c->pic.data[0])
 		avctx->release_buffer(avctx, &c->pic);
@@ -231,24 +234,29 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
                     if (c->flags & FLAG_MULTITHREAD) {
                         mthread_inlen = *((unsigned int*)encoded);
                         mthread_outlen = *((unsigned int*)(encoded+4));
-                        mszh_dlen = mszh_decomp(encoded + 8, mthread_inlen, c->decomp_buf);
+                        if (mthread_outlen > c->decomp_size) // this should not happen
+                            mthread_outlen = c->decomp_size;
+                        mszh_dlen = mszh_decomp(encoded + 8, mthread_inlen, c->decomp_buf, c->decomp_size);
                         if (mthread_outlen != mszh_dlen) {
                             av_log(avctx, AV_LOG_ERROR, "Mthread1 decoded size differs (%d != %d)\n",
                                    mthread_outlen, mszh_dlen);
+                            return -1;
                         }
                         mszh_dlen = mszh_decomp(encoded + 8 + mthread_inlen, len - mthread_inlen,
-                                                c->decomp_buf + mthread_outlen);
-                        if ((c->decomp_size - mthread_outlen) != mszh_dlen) {
+                                                c->decomp_buf + mthread_outlen, c->decomp_size - mthread_outlen);
+                        if (mthread_outlen != mszh_dlen) {
                             av_log(avctx, AV_LOG_ERROR, "Mthread2 decoded size differs (%d != %d)\n",
-                                   c->decomp_size - mthread_outlen, mszh_dlen);
+                                   mthread_outlen, mszh_dlen);
+                            return -1;
                         }
                         encoded = c->decomp_buf;
                         len = c->decomp_size;
                     } else {
-                        mszh_dlen = mszh_decomp(encoded, len, c->decomp_buf);
+                        mszh_dlen = mszh_decomp(encoded, len, c->decomp_buf, c->decomp_size);
                         if (c->decomp_size != mszh_dlen) {
                             av_log(avctx, AV_LOG_ERROR, "Decoded size differs (%d != %d)\n",
                                    c->decomp_size, mszh_dlen);
+                            return -1;
                         }
                         encoded = c->decomp_buf;
                         len = mszh_dlen;
@@ -277,10 +285,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
             if (c->flags & FLAG_MULTITHREAD) {
                 mthread_inlen = *((unsigned int*)encoded);
                 mthread_outlen = *((unsigned int*)(encoded+4));
+                if (mthread_outlen > c->decomp_size)
+                    mthread_outlen = c->decomp_size;
                 c->zstream.next_in = encoded + 8;
                 c->zstream.avail_in = mthread_inlen;
                 c->zstream.next_out = c->decomp_buf;
-                c->zstream.avail_out = mthread_outlen;    
+                c->zstream.avail_out = c->decomp_size;    
                 zret = inflate(&(c->zstream), Z_FINISH);
                 if ((zret != Z_OK) && (zret != Z_STREAM_END)) {
                     av_log(avctx, AV_LOG_ERROR, "Mthread1 inflate error: %d\n", zret);
@@ -289,6 +299,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
                 if (mthread_outlen != (unsigned int)(c->zstream.total_out)) {
                     av_log(avctx, AV_LOG_ERROR, "Mthread1 decoded size differs (%u != %lu)\n",
                            mthread_outlen, c->zstream.total_out);
+                    return -1;
                 }
                 zret = inflateReset(&(c->zstream));
                 if (zret != Z_OK) {
@@ -298,15 +309,16 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
                 c->zstream.next_in = encoded + 8 + mthread_inlen;
                 c->zstream.avail_in = len - mthread_inlen;
                 c->zstream.next_out = c->decomp_buf + mthread_outlen;
-                c->zstream.avail_out = mthread_outlen;    
+                c->zstream.avail_out = c->decomp_size - mthread_outlen;    
                 zret = inflate(&(c->zstream), Z_FINISH);
                 if ((zret != Z_OK) && (zret != Z_STREAM_END)) {
                     av_log(avctx, AV_LOG_ERROR, "Mthread2 inflate error: %d\n", zret);
                     return -1;
                 }
-                if ((c->decomp_size - mthread_outlen) != (unsigned int)(c->zstream.total_out)) {
+                if (mthread_outlen != (unsigned int)(c->zstream.total_out)) {
                     av_log(avctx, AV_LOG_ERROR, "Mthread2 decoded size differs (%d != %lu)\n",
-                           c->decomp_size - mthread_outlen, c->zstream.total_out);
+                           mthread_outlen, c->zstream.total_out);
+                    return -1;
                 }
             } else {
                 c->zstream.next_in = encoded;
@@ -321,6 +333,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
                 if (c->decomp_size != (unsigned int)(c->zstream.total_out)) {
                     av_log(avctx, AV_LOG_ERROR, "Decoded size differs (%d != %lu)\n",
                            c->decomp_size, c->zstream.total_out);
+                    return -1;
                 }
             }
             encoded = c->decomp_buf;
@@ -567,13 +580,20 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         av_log(avctx, AV_LOG_ERROR, "Deflate reset error: %d\n", zret);
         return -1;
     }
-    c->zstream.next_in = p->data[0];
-    c->zstream.avail_in = c->decomp_size;
     c->zstream.next_out = c->comp_buf;
     c->zstream.avail_out = c->max_comp_size;
 
+    for(i = avctx->height - 1; i >= 0; i--) {
+        c->zstream.next_in = p->data[0]+p->linesize[0]*i;
+        c->zstream.avail_in = avctx->width*3;
+        zret = deflate(&(c->zstream), Z_NO_FLUSH);
+        if (zret != Z_OK) {
+    	    av_log(avctx, AV_LOG_ERROR, "Deflate error: %d\n", zret);
+    	    return -1;
+        }
+    }
     zret = deflate(&(c->zstream), Z_FINISH);
-    if ((zret != Z_OK) && (zret != Z_STREAM_END)) {
+    if (zret != Z_STREAM_END) {
         av_log(avctx, AV_LOG_ERROR, "Deflate error: %d\n", zret);
         return -1;
     }
@@ -596,7 +616,9 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 static int decode_init(AVCodecContext *avctx)
 {
     LclContext * const c = (LclContext *)avctx->priv_data;
-    int basesize = avctx->width * avctx->height;
+    unsigned int basesize = avctx->width * avctx->height;
+    unsigned int max_basesize = ((avctx->width + 3) & ~3) * ((avctx->height + 3) & ~3);
+    unsigned int max_decomp_size;
     int zret; // Zlib return code
 
     c->avctx = avctx;
@@ -614,6 +636,10 @@ static int decode_init(AVCodecContext *avctx)
         return 1;
     }
 
+    if (avcodec_check_dimensions(avctx, avctx->width, avctx->height) < 0) {
+        return 1;
+    }
+
     /* Check codec type */ 
     if (((avctx->codec_id == CODEC_ID_MSZH)  && (*((char *)avctx->extradata + 7) != CODEC_MSZH)) ||
         ((avctx->codec_id == CODEC_ID_ZLIB)  && (*((char *)avctx->extradata + 7) != CODEC_ZLIB))) {
@@ -624,26 +650,32 @@ static int decode_init(AVCodecContext *avctx)
     switch (c->imgtype = *((char *)avctx->extradata + 4)) {
         case IMGTYPE_YUV111:
             c->decomp_size = basesize * 3;
+            max_decomp_size = max_basesize * 3;
             av_log(avctx, AV_LOG_INFO, "Image type is YUV 1:1:1.\n");
             break;
         case IMGTYPE_YUV422:
             c->decomp_size = basesize * 2;
+            max_decomp_size = max_basesize * 2;
             av_log(avctx, AV_LOG_INFO, "Image type is YUV 4:2:2.\n");
             break;
         case IMGTYPE_RGB24:
             c->decomp_size = basesize * 3;
+            max_decomp_size = max_basesize * 3;
             av_log(avctx, AV_LOG_INFO, "Image type is RGB 24.\n");
             break;
         case IMGTYPE_YUV411:
             c->decomp_size = basesize / 2 * 3;
+            max_decomp_size = max_basesize / 2 * 3;
             av_log(avctx, AV_LOG_INFO, "Image type is YUV 4:1:1.\n");
             break;
         case IMGTYPE_YUV211:
             c->decomp_size = basesize * 2;
+            max_decomp_size = max_basesize * 2;
             av_log(avctx, AV_LOG_INFO, "Image type is YUV 2:1:1.\n");
             break;
         case IMGTYPE_YUV420:
             c->decomp_size = basesize / 2 * 3;
+            max_decomp_size = max_basesize / 2 * 3;
             av_log(avctx, AV_LOG_INFO, "Image type is YUV 4:2:0.\n");
             break;
         default:
@@ -698,9 +730,8 @@ static int decode_init(AVCodecContext *avctx)
     }
 
     /* Allocate decompression buffer */
-    /* 4*8 max overflow space for mszh decomp algorithm */
     if (c->decomp_size) {
-        if ((c->decomp_buf = av_malloc(c->decomp_size+4*8)) == NULL) {
+        if ((c->decomp_buf = av_malloc(max_decomp_size)) == NULL) {
             av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
             return 1;
         }
@@ -785,7 +816,7 @@ static int encode_init(AVCodecContext *avctx)
     ((uint8_t*)avctx->extradata)[4]= c->imgtype;
     ((uint8_t*)avctx->extradata)[5]= c->compression;
     ((uint8_t*)avctx->extradata)[6]= c->flags;
-    ((uint8_t*)avctx->extradata)[7]= 0;
+    ((uint8_t*)avctx->extradata)[7]= CODEC_ZLIB;
     c->avctx->extradata_size= 8;
     
     c->zstream.zalloc = Z_NULL;
@@ -843,7 +874,7 @@ static int encode_end(AVCodecContext *avctx)
     LclContext *c = avctx->priv_data;
 
     av_freep(&avctx->extradata);
-    av_freep(c->comp_buf);
+    av_freep(&c->comp_buf);
 #ifdef CONFIG_ZLIB
     deflateEnd(&(c->zstream));
 #endif
@@ -886,7 +917,6 @@ AVCodec zlib_encoder = {
     encode_init,
     encode_frame,
     encode_end,
-//    .options = lcl_options,
 };
 
 #endif //CONFIG_ENCODERS
diff --git a/src/libffmpeg/libavcodec/libpostproc/mangle.h b/src/libffmpeg/libavcodec/libpostproc/mangle.h
index f3894cc33..aa09cd6bf 100644
--- a/src/libffmpeg/libavcodec/libpostproc/mangle.h
+++ b/src/libffmpeg/libavcodec/libpostproc/mangle.h
@@ -8,12 +8,21 @@
 #define __MANGLE_H
 
 /* Feel free to add more to the list, eg. a.out IMO */
+/* Use rip-relative addressing if compiling PIC code on x86-64. */
 #if defined(__CYGWIN__) || defined(__MINGW32__) || defined(__OS2__) || \
    (defined(__OpenBSD__) && !defined(__ELF__))
+#if defined(ARCH_X86_64) && defined(PIC)
+#define MANGLE(a) "_" #a"(%%rip)"
+#else
 #define MANGLE(a) "_" #a
+#endif
+#else
+#if defined(ARCH_X86_64) && defined(PIC)
+#define MANGLE(a) #a"(%%rip)"
 #else
 #define MANGLE(a) #a
 #endif
+#endif
 
 #endif /* !__MANGLE_H */
 
diff --git a/src/libffmpeg/libavcodec/libpostproc/postprocess.c b/src/libffmpeg/libavcodec/libpostproc/postprocess.c
index a03ff133d..e7ca0191d 100644
--- a/src/libffmpeg/libavcodec/libpostproc/postprocess.c
+++ b/src/libffmpeg/libavcodec/libpostproc/postprocess.c
@@ -29,10 +29,11 @@ isVertDC		Ec	Ec			Ec
 isVertMinMaxOk		Ec	Ec			Ec
 doVertLowPass		E		e	e	Ec
 doVertDefFilter		Ec	Ec	e	e	Ec
-isHorizDC		Ec	Ec
-isHorizMinMaxOk		a	E
-doHorizLowPass		E		e	e
-doHorizDefFilter	Ec	Ec	e	e
+isHorizDC		Ec	Ec			Ec
+isHorizMinMaxOk		a	E			Ec
+doHorizLowPass		E		e	e	Ec
+doHorizDefFilter	Ec	Ec	e	e	Ec
+do_a_deblock		Ec	E	Ec	E
 deRing			E		e	e*	Ecp
 Vertical RKAlgo1	E		a	a
 Horizontal RKAlgo1			a	a
@@ -42,7 +43,7 @@ LinIpolDeinterlace	e		E	E*
 CubicIpolDeinterlace	a		e	e*
 LinBlendDeinterlace	e		E	E*
 MedianDeinterlace#	E	Ec	Ec
-TempDeNoiser#		E		e	e
+TempDeNoiser#		E		e	e	Ec
 
 * i dont have a 3dnow CPU -> its untested, but noone said it doesnt work so it seems to work
 # more or less selfinvented filters so the exactness isnt too meaningfull
@@ -91,6 +92,10 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 
 #include "mangle.h" //FIXME should be supressed
 
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
 #ifndef HAVE_MEMALIGN
 #define memalign(a,b) malloc(b)
 #endif
@@ -108,12 +113,15 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
 
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
 #    define attribute_used __attribute__((used))
+#    define always_inline __attribute__((always_inline)) inline
 #else
 #    define attribute_used
+#    define always_inline inline
 #endif
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static uint64_t __attribute__((aligned(8))) attribute_used w05=		0x0005000500050005LL;
+static uint64_t __attribute__((aligned(8))) attribute_used w04=		0x0004000400040004LL;
 static uint64_t __attribute__((aligned(8))) attribute_used w20=		0x0020002000200020LL;
 static uint64_t __attribute__((aligned(8))) attribute_used b00= 		0x0000000000000000LL;
 static uint64_t __attribute__((aligned(8))) attribute_used b01= 		0x0101010101010101LL;
@@ -122,7 +130,6 @@ static uint64_t __attribute__((aligned(8))) attribute_used b08= 		0x080808080808
 static uint64_t __attribute__((aligned(8))) attribute_used b80= 		0x8080808080808080LL;
 #endif
 
-
 static uint8_t clip_table[3*256];
 static uint8_t * const clip_tab= clip_table + 256;
 
@@ -139,6 +146,8 @@ static struct PPFilter filters[]=
 	{"vr", "rkvdeblock", 		1, 2, 4, V_RK1_FILTER},*/
 	{"h1", "x1hdeblock", 		1, 1, 3, H_X1_FILTER},
 	{"v1", "x1vdeblock", 		1, 2, 4, V_X1_FILTER},
+	{"ha", "ahdeblock", 		1, 1, 3, H_A_DEBLOCK},
+	{"va", "avdeblock", 		1, 2, 4, V_A_DEBLOCK},
 	{"dr", "dering", 		1, 5, 6, DERING},
 	{"al", "autolevels", 		0, 1, 2, LEVEL_FIX},
 	{"lb", "linblenddeint", 	1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
@@ -154,15 +163,16 @@ static struct PPFilter filters[]=
 
 static char *replaceTable[]=
 {
-	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
-	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
-	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
-	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
+	"default", 	"hdeblock:a,vdeblock:a,dering:a",
+	"de", 		"hdeblock:a,vdeblock:a,dering:a",
+	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a",
+	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a",
+	"ac", 		"ha:a:128:7,va:a,dering:a",
 	NULL //End Marker
 };
 
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline void prefetchnta(void *p)
 {
 	asm volatile(	"prefetchnta (%0)\n\t"
@@ -372,32 +382,32 @@ static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
  */
 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
 {
-
 	int y;
 	for(y=0; y<BLOCK_SIZE; y++)
 	{
 		const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 		const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 
-		int sums[9];
-		sums[0] = first + dst[0];
-		sums[1] = dst[0] + dst[1];
-		sums[2] = dst[1] + dst[2];
-		sums[3] = dst[2] + dst[3];
-		sums[4] = dst[3] + dst[4];
-		sums[5] = dst[4] + dst[5];
-		sums[6] = dst[5] + dst[6];
-		sums[7] = dst[6] + dst[7];
-		sums[8] = dst[7] + last;
-
-		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
-		dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
-		dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
-		dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
-		dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
-		dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
-		dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
-		dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
+		int sums[10];
+		sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
+		sums[1] = sums[0] - first  + dst[3];
+		sums[2] = sums[1] - first  + dst[4];
+		sums[3] = sums[2] - first  + dst[5];
+		sums[4] = sums[3] - first  + dst[6];
+		sums[5] = sums[4] - dst[0] + dst[7];
+		sums[6] = sums[5] - dst[1] + last;
+		sums[7] = sums[6] - dst[2] + last;
+		sums[8] = sums[7] - dst[3] + last;
+		sums[9] = sums[8] - dst[4] + last;
+
+		dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
+		dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
+		dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
+		dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
+		dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
+		dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
+		dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
+		dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 
 		dst+= stride;
 	}
@@ -469,6 +479,111 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 	}
 }
 
+/**
+ * accurate deblock filter: processes 8 lines (stride apart) of samples spaced step apart, starting 4*step past src;
+ * each line gets either a low-pass over 8 samples or a correction of its two middle samples, or is left untouched */
+static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
+	int y;
+	const int QP= c->QP;
+	const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; // tolerance used when counting "equal" neighbours below
+	const int dcThreshold= dcOffset*2 + 1; // (unsigned)(a-b+dcOffset) < dcThreshold  <=>  -dcOffset <= a-b <= dcOffset
+//START_TIMER
+	src+= step*4; // src points to begin of the 8x8 Block
+	for(y=0; y<8; y++){
+		int numEq= 0; // how many of the 9 neighbouring pairs src[-1]..src[8] differ by at most dcOffset
+
+		if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
+		if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
+		if(numEq > c->ppMode.flatnessThreshold){ // enough near-equal pairs: candidate for the low-pass
+			int min, max, x; // min/max of src[0]..src[7*step], gathered two samples at a time
+			
+			if(src[0] > src[step]){
+			    max= src[0];
+			    min= src[step];
+			}else{
+			    max= src[step];
+			    min= src[0];
+			}
+			for(x=2; x<8; x+=2){
+				if(src[x*step] > src[(x+1)*step]){
+					if(src[x    *step] > max) max= src[ x   *step];
+					if(src[(x+1)*step] < min) min= src[(x+1)*step];
+				}else{
+					if(src[(x+1)*step] > max) max= src[(x+1)*step];
+					if(src[ x   *step] < min) min= src[ x   *step];
+				}
+			}
+			if(max-min < 2*QP){ // low-pass only when the 8 samples span less than 2*QP; otherwise the line is left unchanged
+				const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; // outer neighbour if within QP of the edge sample, else the edge sample
+				const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; // same rule at the far end
+				
+				int sums[10]; // sliding sums, each derived from the previous by dropping one term and adding the next
+				sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; // the +4 is carried into every sums[i]; two sums per output give +8 before the >>4
+				sums[1] = sums[0] - first       + src[3*step];
+				sums[2] = sums[1] - first       + src[4*step];
+				sums[3] = sums[2] - first       + src[5*step];
+				sums[4] = sums[3] - first       + src[6*step];
+				sums[5] = sums[4] - src[0*step] + src[7*step];
+				sums[6] = sums[5] - src[1*step] + last;
+				sums[7] = sums[6] - src[2*step] + last;
+				sums[8] = sums[7] - src[3*step] + last;
+				sums[9] = sums[8] - src[4*step] + last;
+
+				src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; // all sums were computed above, so each line still reads the unfiltered src value
+				src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
+				src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
+				src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
+				src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
+				src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
+				src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
+				src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
+			}
+		}else{ // not flat: only src[3*step] and src[4*step] may be modified
+			const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
+
+			if(ABS(middleEnergy) < 8*QP) // larger values are left untouched
+			{
+				const int q=(src[3*step] - src[4*step])/2; // half the step across the two middle samples; bounds the correction below
+				const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
+				const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
+
+				int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
+				d= MAX(d, 0); // no correction if the middle measure does not exceed the smaller neighbouring one
+	
+				d= (5*d + 32) >> 6; // scale by 5/64 with rounding
+				d*= SIGN(-middleEnergy);
+	
+				if(q>0) // clamp d to the range between 0 and q
+				{
+					d= d<0 ? 0 : d;
+					d= d>q ? q : d;
+				}
+				else
+				{
+					d= d>0 ? 0 : d;
+					d= d<q ? q : d;
+				}
+	
+				src[3*step]-= d;
+				src[4*step]+= d;
+			}
+		}
+
+		src += stride; // next line
+	}
+/*if(step==16){
+    STOP_TIMER("step16")
+}else{
+    STOP_TIMER("stepX")
+}*/
+}
 
 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
 //Plain C versions
@@ -479,15 +594,10 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 #ifdef ARCH_POWERPC
 #ifdef HAVE_ALTIVEC
 #define COMPILE_ALTIVEC
-#ifndef CONFIG_DARWIN
-#warning "################################################################################"
-#warning  "WARNING: No gcc available as of today (2004-05-25) seems to be able to compile properly some of the code under non-Darwin PPC OSes. Some functions result in wrong results, while others simply won't compile (gcc explodes after allocating 1GiB+)."
-#warning "################################################################################"
-#endif //CONFIG_DARWIN
 #endif //HAVE_ALTIVEC
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_MMX
@@ -506,13 +616,11 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
 #undef HAVE_ALTIVEC
-#undef ARCH_X86
 
 #ifdef COMPILE_C
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
-#undef ARCH_X86
 #define RENAME(a) a ## _C
 #include "postprocess_template.c"
 #endif
@@ -533,7 +641,6 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 #define HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _MMX
 #include "postprocess_template.c"
 #endif
@@ -544,7 +651,6 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 #define HAVE_MMX
 #define HAVE_MMX2
 #undef HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _MMX2
 #include "postprocess_template.c"
 #endif
@@ -555,7 +661,6 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 #define HAVE_MMX
 #undef HAVE_MMX2
 #define HAVE_3DNOW
-#define ARCH_X86
 #define RENAME(a) a ## _3DNow
 #include "postprocess_template.c"
 #endif
@@ -573,7 +678,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
 	// difference wouldnt be messureable here but its much better because
 	// someone might exchange the cpu whithout restarting mplayer ;)
 #ifdef RUNTIME_CPUDETECT
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	// ordered per speed fasterst first
 	if(c->cpuCaps & PP_CPU_CAPS_MMX2)
 		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
@@ -586,7 +691,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
 #else
 #ifdef ARCH_POWERPC
 #ifdef HAVE_ALTIVEC
-        else if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
+        if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
 		postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
         else
 #endif
@@ -614,24 +719,21 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
 /* -pp Command line Help
 */
 char *pp_help=
-"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
-"long form example:\n"
-"vdeblock:autoq/hdeblock:autoq/linblenddeint	default,-vdeblock\n"
-"short form example:\n"
-"vb:a/hb:a/lb					de,-vb\n"
-"more examples:\n"
-"tn:64:128:256\n"
+"Available postprocessing filters:\n"
 "Filters			Options\n"
 "short	long name	short	long option	Description\n"
 "*	*		a	autoq		CPU power dependent enabler\n"
 "			c	chrom		chrominance filtering enabled\n"
 "			y	nochrom		chrominance filtering disabled\n"
+"			n	noluma		luma filtering disabled\n"
 "hb	hdeblock	(2 threshold)		horizontal deblocking filter\n"
 "	1. difference factor: default=32, higher -> more deblocking\n"
 "	2. flatness threshold: default=39, lower -> more deblocking\n"
 "			the h & v deblocking filters share these\n"
 "			so you can't set different thresholds for h / v\n"
 "vb	vdeblock	(2 threshold)		vertical deblocking filter\n"
+"ha	hadeblock	(2 threshold)		horizontal deblocking filter\n"
+"va	vadeblock	(2 threshold)		vertical deblocking filter\n"
 "h1	x1hdeblock				experimental h deblock filter 1\n"
 "v1	x1vdeblock				experimental v deblock filter 1\n"
 "dr	dering					deringing filter\n"
@@ -642,11 +744,20 @@ char *pp_help=
 "ci	cubicipoldeint				cubic interpolating deinterlacer\n"
 "md	mediandeint				median deinterlacer\n"
 "fd	ffmpegdeint				ffmpeg deinterlacer\n"
-"de	default					hb:a,vb:a,dr:a,al\n"
-"fa	fast					h1:a,v1:a,dr:a,al\n"
+"l5	lowpass5				FIR lowpass deinterlacer\n"
+"de	default					hb:a,vb:a,dr:a\n"
+"fa	fast					h1:a,v1:a,dr:a\n"
 "tn	tmpnoise	(3 threshold)		temporal noise reducer\n"
 "			1. <= 2. <= 3.		larger -> stronger filtering\n"
 "fq	forceQuant	<quantizer>		force quantizer\n"
+"Usage:\n"
+"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
+"long form example:\n"
+"vdeblock:autoq/hdeblock:autoq/linblenddeint	default,-vdeblock\n"
+"short form example:\n"
+"vb:a/hb:a/lb					de,-vb\n"
+"more examples:\n"
+"tn:64:128:256\n"
 ;
 
 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
@@ -680,6 +791,7 @@ pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
 		char *filterName;
 		int q= 1000000; //PP_QUALITY_MAX;
 		int chrom=-1;
+		int luma=-1;
 		char *option;
 		char *options[OPTIONS_ARRAY_SIZE];
 		int i;
@@ -707,6 +819,7 @@ pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
 			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
+			else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 			else
 			{
 				options[numOfUnknownOptions] = option;
@@ -753,7 +866,7 @@ pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
 				filterNameOk=1;
 				if(!enable) break; // user wants to disable it
 
-				if(q >= filters[i].minLumQuality)
+				if(q >= filters[i].minLumQuality && luma)
 					ppMode->lumMode|= filters[i].mask;
 				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 					if(q >= filters[i].minChromQuality)
@@ -793,7 +906,8 @@ pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
 						}
 					}
 				}
-				else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
+				else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK 
+				     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
 				{
 					int o;
 
@@ -940,18 +1054,20 @@ void  pp_postprocess(uint8_t * src[3], int srcStride[3],
 	int mbHeight= (height+15)>>4;
 	PPMode *mode = (PPMode*)vm;
 	PPContext *c = (PPContext*)vc;
-        int minStride= MAX(srcStride[0], dstStride[0]);
+	int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
+	int absQPStride = ABS(QPStride);
 
-	if(c->stride < minStride || c->qpStride < QPStride)
+	// c->stride and c->qpStride are always positive
+	if(c->stride < minStride || c->qpStride < absQPStride)
 		reallocBuffers(c, width, height, 
 				MAX(minStride, c->stride), 
-				MAX(c->qpStride, QPStride));
+				MAX(c->qpStride, absQPStride));
 
 	if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) 
 	{
 		int i;
 		QP_store= c->forcedQPTable;
-		QPStride= 0;
+		absQPStride = QPStride = 0;
 		if(mode->lumMode & FORCE_QUANT)
 			for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
 		else
@@ -961,7 +1077,7 @@ void  pp_postprocess(uint8_t * src[3], int srcStride[3],
 
 	if(pict_type & PP_PICT_TYPE_QP2){
 		int i;
-		const int count= mbHeight * QPStride;
+		const int count= mbHeight * absQPStride;
 		for(i=0; i<(count>>2); i++){
 			((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
 		}
@@ -969,6 +1085,7 @@ void  pp_postprocess(uint8_t * src[3], int srcStride[3],
 			c->stdQPTable[i] = QP_store[i]>>1;
 		}
                 QP_store= c->stdQPTable;
+		QPStride= absQPStride;		
 	}
 
 if(0){
@@ -984,13 +1101,22 @@ for(y=0; y<mbHeight; y++){
 
 	if((pict_type&7)!=3)
 	{
-		int i;
-		const int count= mbHeight * QPStride;
-		for(i=0; i<(count>>2); i++){
-			((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
-		}
-		for(i<<=2; i<count; i++){
-			c->nonBQPTable[i] = QP_store[i] & 0x3F;
+		if (QPStride >= 0) {
+			int i;
+			const int count= mbHeight * QPStride;
+			for(i=0; i<(count>>2); i++){
+				((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
+			}
+			for(i<<=2; i<count; i++){
+				c->nonBQPTable[i] = QP_store[i] & 0x3F;
+			}
+		} else {
+			int i,j;
+			for(i=0; i<mbHeight; i++) {
+		    		for(j=0; j<absQPStride; j++) {
+					c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
+				}
+			}
 		}
 	}
 
@@ -1014,8 +1140,8 @@ for(y=0; y<mbHeight; y++){
 	}
 	else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
 	{
-		memcpy(dst[1], src[1], srcStride[1]*height);
-		memcpy(dst[2], src[2], srcStride[2]*height);
+		linecpy(dst[1], src[1], height, srcStride[1]);
+		linecpy(dst[2], src[2], height, srcStride[2]);
 	}
 	else
 	{
diff --git a/src/libffmpeg/libavcodec/libpostproc/postprocess_altivec_template.c b/src/libffmpeg/libavcodec/libpostproc/postprocess_altivec_template.c
index 0c84873cc..1c59b9465 100644
--- a/src/libffmpeg/libavcodec/libpostproc/postprocess_altivec_template.c
+++ b/src/libffmpeg/libavcodec/libpostproc/postprocess_altivec_template.c
@@ -25,6 +25,39 @@
 #define AVV(x...) {x}
 #endif
 
+#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
+  do {									\
+    __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;			\
+    __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;			\
+    __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;			\
+    __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;			\
+    tempA1 = vec_mergeh (src_a, src_e);					\
+    tempB1 = vec_mergel (src_a, src_e);					\
+    tempC1 = vec_mergeh (src_b, src_f);					\
+    tempD1 = vec_mergel (src_b, src_f);					\
+    tempE1 = vec_mergeh (src_c, src_g);					\
+    tempF1 = vec_mergel (src_c, src_g);					\
+    tempG1 = vec_mergeh (src_d, src_h);					\
+    tempH1 = vec_mergel (src_d, src_h);					\
+    tempA2 = vec_mergeh (tempA1, tempE1);				\
+    tempB2 = vec_mergel (tempA1, tempE1);				\
+    tempC2 = vec_mergeh (tempB1, tempF1);				\
+    tempD2 = vec_mergel (tempB1, tempF1);				\
+    tempE2 = vec_mergeh (tempC1, tempG1);				\
+    tempF2 = vec_mergel (tempC1, tempG1);				\
+    tempG2 = vec_mergeh (tempD1, tempH1);				\
+    tempH2 = vec_mergel (tempD1, tempH1);				\
+    src_a = vec_mergeh (tempA2, tempE2);				\
+    src_b = vec_mergel (tempA2, tempE2);				\
+    src_c = vec_mergeh (tempB2, tempF2);				\
+    src_d = vec_mergel (tempB2, tempF2);				\
+    src_e = vec_mergeh (tempC2, tempG2);				\
+    src_f = vec_mergel (tempC2, tempG2);				\
+    src_g = vec_mergeh (tempD2, tempH2);				\
+    src_h = vec_mergel (tempD2, tempH2);				\
+  } while (0)
+
+
 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
   /*
     this code makes no assumption on src or stride.
@@ -40,7 +73,9 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
   vector signed short v2QP;
   vector unsigned short v4QP;
   vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
+  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
   const vector signed int zero = vec_splat_s32(0);
   const vector signed short mask = vec_splat_s16(1);
   vector signed int v_numEq = vec_splat_s32(0);
@@ -57,6 +92,8 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
 
   src2 += stride * 4;
 
+  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
+
 #define LOAD_LINE(i)							\
   register int j##i = i * stride;					\
   vector unsigned char perm##i = vec_lvsl(j##i, src2);			\
@@ -66,19 +103,41 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
     v_srcA2##i = vec_ld(j##i + 16, src2);				\
   const vector unsigned char v_srcA##i =				\
     vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)
 
-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
+  v_srcAss##i =                                                         \
+    (vector signed short)vec_mergeh((vector signed char)zero,		\
+				    (vector signed char)v_srcA##i)
+
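+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks. NOTE(review): properStride and srcAlign are "% 16" remainders (non-zero when NOT aligned); confirm the test below.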
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+    }
 #undef LOAD_LINE
+#undef LOAD_LINE_ALIGNED
 
 #define ITER(i, j)							\
   const vector signed short v_diff##i =					\
@@ -133,7 +192,6 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
   else return 2; 
 }
 
-
 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
   /*
     this code makes no assumption on src or stride.
@@ -145,112 +203,130 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   */
   uint8_t *src2 = src;
   const vector signed int zero = vec_splat_s32(0);
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
   short __attribute__ ((aligned(16))) qp[8];
   qp[0] = c->QP;
   vector signed short vqp = vec_ld(0, qp);
   vqp = vec_splat(vqp, 0);
 	
+  src2 += stride*3;
+
+  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
+  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
+  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
+  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
+	
 #define LOAD_LINE(i)                                                    \
   const vector unsigned char perml##i =					\
     vec_lvsl(i * stride, src2);						\
-  const vector unsigned char vbA##i =					\
-    vec_ld(i * stride, src2);						\
-  const vector unsigned char vbB##i =					\
-    vec_ld(i * stride + 16, src2);					\
-  const vector unsigned char vbT##i =					\
-    vec_perm(vbA##i, vbB##i, perml##i);					\
-  const vector signed short vb##i =					\
+  vbA##i = vec_ld(i * stride, src2);                                    \
+  vbB##i = vec_ld(i * stride + 16, src2);                               \
+  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
+  vb##i =                                                               \
     (vector signed short)vec_mergeh((vector unsigned char)zero,		\
 				    (vector unsigned char)vbT##i)
-	
-  src2 += stride*3;
 
-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-  LOAD_LINE(8);
-  LOAD_LINE(9);
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  vbT##i = vec_ld(j##i, src2);                                          \
+  vb##i =                                                               \
+    (vector signed short)vec_mergeh((vector signed char)zero,		\
+				    (vector signed char)vbT##i)
+
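+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks. NOTE(review): properStride and srcAlign are "% 16" remainders (non-zero when NOT aligned); confirm the test below.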
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+      LOAD_LINE_ALIGNED(8);
+      LOAD_LINE_ALIGNED(9);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+      LOAD_LINE(8);
+      LOAD_LINE(9);
+    }
 #undef LOAD_LINE
+#undef LOAD_LINE_ALIGNED
 
   const vector unsigned short v_1 = vec_splat_u16(1);
   const vector unsigned short v_2 = vec_splat_u16(2);
   const vector unsigned short v_4 = vec_splat_u16(4);
-  const vector signed short v_8 = vec_splat_s16(8);
-
-  const vector signed short v_first = vec_sel(vb1, vb0,
-                                              vec_cmplt(vec_abs(vec_sub(vb0, vb1)),
-                                                        vqp));
-  const vector signed short v_last = vec_sel(vb8, vb9,
-                                             vec_cmplt(vec_abs(vec_sub(vb8, vb9)),
-                                                       vqp));
-
-  const vector signed short v_sums0 = vec_add(v_first, vb1);
-  const vector signed short v_sums1 = vec_add(vb1, vb2);
-  const vector signed short v_sums2 = vec_add(vb2, vb3);
-  const vector signed short v_sums3 = vec_add(vb3, vb4);
-  const vector signed short v_sums4 = vec_add(vb4, vb5);
-  const vector signed short v_sums5 = vec_add(vb5, vb6);
-  const vector signed short v_sums6 = vec_add(vb6, vb7);
-  const vector signed short v_sums7 = vec_add(vb7, vb8);
-  const vector signed short v_sums8 = vec_add(vb8, v_last);
-
-  const vector signed short vr1 = vec_sra(vec_add(vec_add(vec_sl(v_sums0, v_2),
-                                                          vec_sl(vec_add(v_first, v_sums2), v_1)),
-                                                  vec_add(v_sums4, v_8)),
-                                          v_4);
-  const vector signed short vr2 = vec_sra(vec_add(vec_add(vec_sl(vb2, v_2),
-                                                          v_sums5),
-                                                  vec_add(v_8,
-                                                          vec_sl(vec_add(v_first,
-                                                                         vec_add(v_sums0, v_sums3)),
-                                                                 v_1))),
-                                          v_4);
-  const vector signed short vr3 = vec_sra(vec_add(vec_add(vec_sl(vb3, v_2),
-                                                          v_sums6),
-                                                  vec_add(v_8,
-                                                          vec_sl(vec_add(v_first,
-                                                                         vec_add(v_sums1, v_sums4)),
-                                                                 v_1))),
-                                          v_4);
-  const vector signed short vr4 = vec_sra(vec_add(vec_add(vec_sl(vb4, v_2),
-                                                          v_sums7),
-                                                  vec_add(v_8,
-                                                          vec_add(v_sums0,
-                                                                  vec_sl(vec_add(v_sums2, v_sums5),
-                                                                         v_1)))),
-                                          v_4);
-  const vector signed short vr5 = vec_sra(vec_add(vec_add(vec_sl(vb5, v_2),
-                                                          v_sums8),
-                                                  vec_add(v_8,
-                                                          vec_add(v_sums1,
-                                                                  vec_sl(vec_add(v_sums3, v_sums6),
-                                                                         v_1)))),
-                                          v_4);
-  const vector signed short vr6 = vec_sra(vec_add(vec_add(vec_sl(vb6, v_2),
-                                                          v_sums2),
-                                                  vec_add(v_8,
-                                                          vec_sl(vec_add(v_last,
-                                                                         vec_add(v_sums7, v_sums4)),
-                                                                 v_1))),
-                                          v_4);
-  const vector signed short vr7 = vec_sra(vec_add(vec_add(vec_sl(vec_add(v_last, vb7), v_2),
-                                                          vec_sl(vec_add(vb8, v_sums5), v_1)),
-                                                  vec_add(v_8, v_sums3)),
-                                          v_4);
-  const vector signed short vr8 = vec_sra(vec_add(vec_add(vec_sl(v_sums8, v_2),
-                                                          vec_sl(vec_add(v_last, v_sums6), v_1)),
-                                                  vec_add(v_sums4, v_8)),
-                                          v_4);
-
-  const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1,
-							      -1, -1, -1, -1, -1, -1, -1, -1);
-  const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-								0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
+  const vector signed short v_diff01 = vec_sub(vb0, vb1);
+  const vector unsigned short v_cmp01 =
+    (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
+  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
+  const vector signed short v_diff89 = vec_sub(vb8, vb9);
+  const vector unsigned short v_cmp89 =
+    (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
+  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
+  
+  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
+  const vector signed short temp02 = vec_add(vb2, vb3);
+  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
+  const vector signed short v_sumsB0 = vec_add(temp02, temp03);
+
+  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
+  const vector signed short v_sumsB1 = vec_add(temp11, vb4);
+
+  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
+  const vector signed short v_sumsB2 = vec_add(temp21, vb5);
+
+  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
+  const vector signed short v_sumsB3 = vec_add(temp31, vb6);
+
+  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
+  const vector signed short v_sumsB4 = vec_add(temp41, vb7);
+
+  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
+  const vector signed short v_sumsB5 = vec_add(temp51, vb8);
+
+  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
+  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
+
+  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
+  const vector signed short v_sumsB7 = vec_add(temp71, v_last);
+
+  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
+  const vector signed short v_sumsB8 = vec_add(temp81, v_last);
+
+  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
+  const vector signed short v_sumsB9 = vec_add(temp91, v_last);
+
+#define COMPUTE_VR(i, j, k)						\
+  const vector signed short temps1##i =					\
+    vec_add(v_sumsB##i, v_sumsB##k);					\
+  const vector signed short temps2##i =					\
+    vec_mladd(vb##j, (vector signed short)v_2, temps1##i);		\
+  const vector signed short  vr##j = vec_sra(temps2##i, v_4)
+
+  COMPUTE_VR(0, 1, 2);
+  COMPUTE_VR(1, 2, 3);
+  COMPUTE_VR(2, 3, 4);
+  COMPUTE_VR(3, 4, 5);
+  COMPUTE_VR(4, 5, 6);
+  COMPUTE_VR(5, 6, 7);
+  COMPUTE_VR(6, 7, 8);
+  COMPUTE_VR(7, 8, 9);
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
 
 #define PACK_AND_STORE(i)					\
   const vector unsigned char perms##i =				\
@@ -260,7 +336,7 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   const vector unsigned char vg##i =				\
     vec_perm(vf##i, vbT##i, permHH);				\
   const vector unsigned char mask##i =				\
-    vec_perm((vector unsigned char)zero, neg1, perms##i);	\
+    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i);	\
   const vector unsigned char vg2##i =				\
     vec_perm(vg##i, vg##i, perms##i);				\
   const vector unsigned char svA##i =				\
@@ -270,16 +346,37 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   vec_st(svA##i, i * stride, src2);				\
   vec_st(svB##i, i * stride + 16, src2)
 
-  PACK_AND_STORE(1);
-  PACK_AND_STORE(2);
-  PACK_AND_STORE(3);
-  PACK_AND_STORE(4);
-  PACK_AND_STORE(5);
-  PACK_AND_STORE(6);
-  PACK_AND_STORE(7);
-  PACK_AND_STORE(8);
+#define PACK_AND_STORE_ALIGNED(i)				\
+  const vector unsigned char vf##i =				\
+    vec_packsu(vr##i, (vector signed short)zero);		\
+  const vector unsigned char vg##i =				\
+    vec_perm(vf##i, vbT##i, permHH);				\
+  vec_st(vg##i, i * stride, src2)
 
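+  // special casing the aligned case is worthwhile, as all calls from
+  // the (transposed) horizontal deblocks will be aligned, in addition
+  // to the naturally aligned vertical deblocks. NOTE(review): properStride and srcAlign are "% 16" remainders (non-zero when NOT aligned); confirm the test below.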
+  if (properStride && srcAlign) {
+    PACK_AND_STORE_ALIGNED(1);
+    PACK_AND_STORE_ALIGNED(2);
+    PACK_AND_STORE_ALIGNED(3);
+    PACK_AND_STORE_ALIGNED(4);
+    PACK_AND_STORE_ALIGNED(5);
+    PACK_AND_STORE_ALIGNED(6);
+    PACK_AND_STORE_ALIGNED(7);
+    PACK_AND_STORE_ALIGNED(8);
+  } else {
+    PACK_AND_STORE(1);
+    PACK_AND_STORE(2);
+    PACK_AND_STORE(3);
+    PACK_AND_STORE(4);
+    PACK_AND_STORE(5);
+    PACK_AND_STORE(6);
+    PACK_AND_STORE(7);
+    PACK_AND_STORE(8);
+  }
 #undef PACK_AND_STORE
+#undef PACK_AND_STORE_ALIGNED
 }
 
 
@@ -383,12 +480,10 @@ static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext
   /* finally, stores */
   const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
   const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
-	
-  const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1,
-							      -1, -1, -1, -1, -1, -1, -1, -1);
-	
-  const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-								0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+  
+  const vector signed char neg1 = vec_splat_s8(-1);
+  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
 	
 #define STORE(i)						\
   const vector unsigned char perms##i =				\
@@ -396,7 +491,7 @@ static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext
   const vector unsigned char vg##i =				\
     vec_perm(st##i, vbT##i, permHH);				\
   const vector unsigned char mask##i =				\
-    vec_perm((vector unsigned char)zero, neg1, perms##i);	\
+    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i);	\
   const vector unsigned char vg2##i =				\
     vec_perm(vg##i, vg##i, perms##i);				\
   const vector unsigned char svA##i =				\
@@ -680,7 +775,7 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
   ITER(6, 7, 8);
   ITER(7, 8, 9);
 
-  const vector signed char neg1 = vec_splat_s8( -1 );
+  const vector signed char neg1 = vec_splat_s8(-1);
 	
 #define STORE_LINE(i)					\
   const vector unsigned char permST##i =		\
@@ -708,6 +803,394 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
 #undef F2
 }
 
-#define horizClassify_altivec(a...) horizClassify_C(a)
 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
+#define do_a_deblock_altivec(a...) do_a_deblock_C(a)
+
+static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
+				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
+{ /* AltiVec temporal noise reducer: filters 8 lines x 8 pixels of src against tempBlured */
+  const vector signed int zero = vec_splat_s32(0);
+  const vector signed short vsint16_1 = vec_splat_s16(1);
+  vector signed int v_dp = zero;
+  vector signed int v_sysdp = zero;
+  int d, sysd, i;
+  /* stash the three maxNoise thresholds in tempBluredPast[127..129] */
+  tempBluredPast[127]= maxNoise[0];
+  tempBluredPast[128]= maxNoise[1];
+  tempBluredPast[129]= maxNoise[2];
+  /* LOAD_LINE: unaligned load of line i (two vec_ld + vec_perm); its first 8 bytes are interleaved with zero bytes into shorts */
+#define LOAD_LINE(src, i)						\
+  register int j##src##i = i * stride;					\
+  vector unsigned char perm##src##i = vec_lvsl(j##src##i, src);		\
+  const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src);	\
+  const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
+  const vector unsigned char v_##src##A##i =				\
+    vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i);		\
+  vector signed short v_##src##Ass##i =					\
+    (vector signed short)vec_mergeh((vector signed char)zero,		\
+				    (vector signed char)v_##src##A##i)
+  
+  LOAD_LINE(src, 0);
+  LOAD_LINE(src, 1);
+  LOAD_LINE(src, 2);
+  LOAD_LINE(src, 3);
+  LOAD_LINE(src, 4);
+  LOAD_LINE(src, 5);
+  LOAD_LINE(src, 6);
+  LOAD_LINE(src, 7);
+
+  LOAD_LINE(tempBlured, 0);
+  LOAD_LINE(tempBlured, 1);
+  LOAD_LINE(tempBlured, 2);
+  LOAD_LINE(tempBlured, 3);
+  LOAD_LINE(tempBlured, 4);
+  LOAD_LINE(tempBlured, 5);
+  LOAD_LINE(tempBlured, 6);
+  LOAD_LINE(tempBlured, 7);
+#undef LOAD_LINE
+  /* ACCUMULATE_DIFFS: d = tempBlured - src per pixel; v_dp accumulates d*d, v_sysdp accumulates d */
+#define ACCUMULATE_DIFFS(i)					\
+  vector signed short v_d##i = vec_sub(v_tempBluredAss##i,	\
+				       v_srcAss##i);		\
+  v_dp = vec_msums(v_d##i, v_d##i, v_dp);			\
+  v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
+
+  ACCUMULATE_DIFFS(0);
+  ACCUMULATE_DIFFS(1);
+  ACCUMULATE_DIFFS(2);
+  ACCUMULATE_DIFFS(3);
+  ACCUMULATE_DIFFS(4);
+  ACCUMULATE_DIFFS(5);
+  ACCUMULATE_DIFFS(6);
+  ACCUMULATE_DIFFS(7);
+#undef ACCUMULATE_DIFFS
+  /* fold each accumulator into a single total, broadcast it and copy it out to a scalar */
+  v_dp = vec_sums(v_dp, zero);
+  v_sysdp = vec_sums(v_sysdp, zero);
+
+  v_dp = vec_splat(v_dp, 3);
+  v_sysdp = vec_splat(v_sysdp, 3);
+  
+  vec_ste(v_dp, 0, &d);
+  vec_ste(v_sysdp, 0, &sysd);
+  /* NOTE(review): sysd is written here but never read later in this function */
+  i = d;
+  d = (4*d
+       +(*(tempBluredPast-256))
+       +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
+       +(*(tempBluredPast+256))
+       +4)>>3;
+  /* d is now averaged with four neighbouring tempBluredPast entries; the unaveraged sum (i) is what gets stored */
+  *tempBluredPast=i;
+  /* choose how to blend tempBlured with src by comparing d against the maxNoise thresholds */
+  if (d > maxNoise[1]) {
+    if (d < maxNoise[2]) {
+#define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i);
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    } else {
+#define OP(i) v_tempBluredAss##i = v_srcAss##i;
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    }
+  } else {
+    if (d < maxNoise[0]) {
+      const vector signed short vsint16_7 = vec_splat_s16(7);
+      const vector signed short vsint16_4 = vec_splat_s16(4);
+      const vector unsigned short vuint16_3 = vec_splat_u16(3);
+      /* tempBlured = (7*tempBlured + src + 4) >> 3 */
+#define OP(i)								\
+      const vector signed short v_temp##i =				\
+	vec_mladd(v_tempBluredAss##i,					\
+		  vsint16_7, v_srcAss##i);				\
+      const vector signed short v_temp2##i =				\
+	vec_add(v_temp##i, vsint16_4);					\
+      v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    } else {
+      const vector signed short vsint16_3 = vec_splat_s16(3);
+      const vector signed short vsint16_2 = vec_splat_s16(2);
+      /* tempBlured = (3*tempBlured + src + 2) >> 2 */
+#define OP(i)								\
+      const vector signed short v_temp##i =				\
+	vec_mladd(v_tempBluredAss##i,					\
+		  vsint16_3, v_srcAss##i);				\
+      const vector signed short v_temp2##i =				\
+	vec_add(v_temp##i, vsint16_2);					\
+      v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    }
+  }
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+  /* PACK_AND_STORE: pack the shorts back to bytes, keep the line's original bytes 8..15 (permHH), store unaligned */
+#define PACK_AND_STORE(src, i)						\
+  const vector unsigned char perms##src##i =				\
+    vec_lvsr(i * stride, src);						\
+  const vector unsigned char vf##src##i =				\
+    vec_packsu(v_tempBluredAss##i, (vector signed short)zero);		\
+  const vector unsigned char vg##src##i =				\
+    vec_perm(vf##src##i, v_##src##A##i, permHH);			\
+  const vector unsigned char mask##src##i =				\
+    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
+  const vector unsigned char vg2##src##i =				\
+    vec_perm(vg##src##i, vg##src##i, perms##src##i);			\
+  const vector unsigned char svA##src##i =				\
+    vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i);			\
+  const vector unsigned char svB##src##i =				\
+    vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i);			\
+  vec_st(svA##src##i, i * stride, src);					\
+  vec_st(svB##src##i, i * stride + 16, src)
+  /* the same filtered lines (v_tempBluredAss*) are written to both src and tempBlured */
+  PACK_AND_STORE(src, 0);
+  PACK_AND_STORE(src, 1);
+  PACK_AND_STORE(src, 2);
+  PACK_AND_STORE(src, 3);
+  PACK_AND_STORE(src, 4);
+  PACK_AND_STORE(src, 5);
+  PACK_AND_STORE(src, 6);
+  PACK_AND_STORE(src, 7);
+  PACK_AND_STORE(tempBlured, 0);
+  PACK_AND_STORE(tempBlured, 1);
+  PACK_AND_STORE(tempBlured, 2);
+  PACK_AND_STORE(tempBlured, 3);
+  PACK_AND_STORE(tempBlured, 4);
+  PACK_AND_STORE(tempBlured, 5);
+  PACK_AND_STORE(tempBlured, 6);
+  PACK_AND_STORE(tempBlured, 7);
+#undef PACK_AND_STORE
+}
+
+static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+  /* Transposes 8 lines of 16 bytes read from src (any alignment, given stride) into 16 vectors stored at dst. */
+#define LOAD_DOUBLE_LINE(i, j)						\
+  vector unsigned char perm1##i = vec_lvsl(i * stride, src);		\
+  vector unsigned char perm2##i = vec_lvsl(j * stride, src);		\
+  vector unsigned char srcA##i = vec_ld(i * stride, src);		\
+  vector unsigned char srcB##i = vec_ld(i * stride + 16, src);          \
+  vector unsigned char srcC##i = vec_ld(j * stride, src);		\
+  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);           \
+  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i);	\
+  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
+  /* each LOAD_DOUBLE_LINE does unaligned 16-byte loads of lines i and j (two vec_ld + vec_perm per line) */
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+#undef LOAD_DOUBLE_LINE
+  /* four rounds of vec_mergeh/vec_mergel; the first round pairs every line with the zero vector */
+  vector unsigned char tempA = vec_mergeh(src0, zero);
+  vector unsigned char tempB = vec_mergel(src0, zero);
+  vector unsigned char tempC = vec_mergeh(src1, zero);
+  vector unsigned char tempD = vec_mergel(src1, zero);
+  vector unsigned char tempE = vec_mergeh(src2, zero);
+  vector unsigned char tempF = vec_mergel(src2, zero);
+  vector unsigned char tempG = vec_mergeh(src3, zero);
+  vector unsigned char tempH = vec_mergel(src3, zero);
+  vector unsigned char tempI = vec_mergeh(src4, zero);
+  vector unsigned char tempJ = vec_mergel(src4, zero);
+  vector unsigned char tempK = vec_mergeh(src5, zero);
+  vector unsigned char tempL = vec_mergel(src5, zero);
+  vector unsigned char tempM = vec_mergeh(src6, zero);
+  vector unsigned char tempN = vec_mergel(src6, zero);
+  vector unsigned char tempO = vec_mergeh(src7, zero);
+  vector unsigned char tempP = vec_mergel(src7, zero);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
+  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
+  vector unsigned char temp7 = vec_mergel(tempD, tempL);
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
+  vector unsigned char temp11 = vec_mergel(tempF, tempN);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
+  vector unsigned char temp15 = vec_mergel(tempH, tempP);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempE = vec_mergeh(temp2, temp10);
+  tempF = vec_mergel(temp2, temp10);
+  tempG = vec_mergeh(temp3, temp11);
+  tempH = vec_mergel(temp3, temp11);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+  tempM = vec_mergeh(temp6, temp14);
+  tempN = vec_mergel(temp6, temp14);
+  tempO = vec_mergeh(temp7, temp15);
+  tempP = vec_mergel(temp7, temp15);
+  /* last round: afterwards temp<k> holds byte k of lines 0..7 followed by eight zero bytes */
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+  temp8 = vec_mergeh(tempE, tempM);
+  temp9 = vec_mergel(tempE, tempM);
+  temp10 = vec_mergeh(tempF, tempN);
+  temp11 = vec_mergel(tempF, tempN);
+  temp12 = vec_mergeh(tempG, tempO);
+  temp13 = vec_mergel(tempG, tempO);
+  temp14 = vec_mergeh(tempH, tempP);
+  temp15 = vec_mergel(tempH, tempP);
+  /* temp<k> is stored at byte offset 16*k; plain vec_st is used, so dst is presumably 16-byte aligned -- TODO confirm */
+  vec_st(temp0, 0, dst);
+  vec_st(temp1, 16, dst);
+  vec_st(temp2, 32, dst);
+  vec_st(temp3, 48, dst);
+  vec_st(temp4, 64, dst);
+  vec_st(temp5, 80, dst);
+  vec_st(temp6, 96, dst);
+  vec_st(temp7, 112, dst);
+  vec_st(temp8, 128, dst);
+  vec_st(temp9, 144, dst);
+  vec_st(temp10, 160, dst);
+  vec_st(temp11, 176, dst);
+  vec_st(temp12, 192, dst);
+  vec_st(temp13, 208, dst);
+  vec_st(temp14, 224, dst);
+  vec_st(temp15, 240, dst);
+}
+
+static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+  const vector unsigned char magic_perm = (const vector unsigned char)
+    AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+  /* NOTE(review): magic_perm is declared here but not referenced anywhere else in this function */
+#define LOAD_DOUBLE_LINE(i, j)			    		\
+  vector unsigned char src##i = vec_ld(i * 16, src);		\
+  vector unsigned char src##j = vec_ld(j * 16, src)
+  /* read 16 vectors from src (byte offsets 0..240); output line k (k = 0..7) collects byte k of each of them */
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+  LOAD_DOUBLE_LINE(8, 9);
+  LOAD_DOUBLE_LINE(10, 11);
+  LOAD_DOUBLE_LINE(12, 13);
+  LOAD_DOUBLE_LINE(14, 15);
+#undef LOAD_DOUBLE_LINE
+  /* only vec_mergeh is used in the first round, so bytes 8..15 of every input vector are dropped */
+  vector unsigned char tempA = vec_mergeh(src0, src8);
+  vector unsigned char tempB;
+  vector unsigned char tempC = vec_mergeh(src1, src9);
+  vector unsigned char tempD;
+  vector unsigned char tempE = vec_mergeh(src2, src10);
+  vector unsigned char tempG = vec_mergeh(src3, src11);
+  vector unsigned char tempI = vec_mergeh(src4, src12);
+  vector unsigned char tempJ;
+  vector unsigned char tempK = vec_mergeh(src5, src13);
+  vector unsigned char tempL;
+  vector unsigned char tempM = vec_mergeh(src6, src14);
+  vector unsigned char tempO = vec_mergeh(src7, src15);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2;
+  vector unsigned char temp3;
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6;
+  vector unsigned char temp7;
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+  /* temp0..temp7 now hold the 8 output lines; STORE_DOUBLE_LINE writes line k to dst + k*stride, */
+  /* coping with any alignment by merging into the 32 bytes already there (vec_lvsr mask + vec_sel) */
+  const vector signed char neg1 = vec_splat_s8(-1);
+#define STORE_DOUBLE_LINE(i, j)						\
+  vector unsigned char dstA##i = vec_ld(i * stride, dst);		\
+  vector unsigned char dstB##i = vec_ld(i * stride + 16, dst);		\
+  vector unsigned char dstA##j = vec_ld(j * stride, dst);		\
+  vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst);		\
+  vector unsigned char align##i = vec_lvsr(i * stride, dst);		\
+  vector unsigned char align##j = vec_lvsr(j * stride, dst);		\
+  vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i);	\
+  vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j);	\
+  vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);	\
+  vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);	\
+  vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i);	\
+  vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i);	\
+  vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j);	\
+  vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j);	\
+  vec_st(dstAF##i, i * stride, dst);					\
+  vec_st(dstBF##i, i * stride + 16, dst);				\
+  vec_st(dstAF##j, j * stride, dst);					\
+  vec_st(dstBF##j, j * stride + 16, dst)
+
+  STORE_DOUBLE_LINE(0,1);
+  STORE_DOUBLE_LINE(2,3);
+  STORE_DOUBLE_LINE(4,5);
+  STORE_DOUBLE_LINE(6,7);
+}
diff --git a/src/libffmpeg/libavcodec/libpostproc/postprocess_internal.h b/src/libffmpeg/libavcodec/libpostproc/postprocess_internal.h
index db50fa3b5..01d4679ad 100644
--- a/src/libffmpeg/libavcodec/libpostproc/postprocess_internal.h
+++ b/src/libffmpeg/libavcodec/libpostproc/postprocess_internal.h
@@ -37,9 +37,11 @@
 
 // Experimental vertical filters
 #define V_X1_FILTER	0x0200			// 512
+#define V_A_DEBLOCK	0x0400
 
 // Experimental horizontal filters
 #define H_X1_FILTER	0x2000			// 8192
+#define H_A_DEBLOCK	0x4000
 
 /// select between full y range (255-0) or standart one (234-16)
 #define FULL_Y_RANGE	0x8000			// 32768
@@ -158,3 +160,24 @@ typedef struct PPContext{
 } PPContext;
 
 
+/**
+ * Copy a block of image lines whose stride may be negative.
+ *
+ * dest and src address the first line. With a positive stride the block
+ * occupies lines*stride bytes starting at that address; otherwise the
+ * last line has the lowest address, so the copy starts there instead.
+ * The bytes are moved with a single memcpy(), so the two regions must
+ * not overlap.
+ */
+static inline void linecpy(void *dest, const void *src, int lines, int stride)
+{
+	if (stride > 0) {
+		memcpy(dest, src, lines*stride);
+	} else {
+		/* Arithmetic on void* is a GNU extension; go through byte
+		 * pointers so this header also compiles as ISO C. */
+		const int first = (lines-1)*stride; /* offset of the lowest-addressed line */
+		memcpy((unsigned char *)dest + first,
+		       (const unsigned char *)src + first, -lines*stride);
+	}
+}
diff --git a/src/libffmpeg/libavcodec/libpostproc/postprocess_template.c b/src/libffmpeg/libavcodec/libpostproc/postprocess_template.c
index 4e81bd556..d1307caca 100644
--- a/src/libffmpeg/libavcodec/libpostproc/postprocess_template.c
+++ b/src/libffmpeg/libavcodec/libpostproc/postprocess_template.c
@@ -22,15 +22,37 @@
  */
 
 
+#ifdef ARCH_X86_64
+#  define REGa  rax
+#  define REGc  rcx
+#  define REGd  rdx
+#  define REG_a  "rax"
+#  define REG_c  "rcx"
+#  define REG_d  "rdx"
+#  define REG_SP "rsp"
+#  define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8"
+#else
+#  define REGa  eax
+#  define REGc  ecx
+#  define REGd  edx
+#  define REG_a  "eax"
+#  define REG_c  "ecx"
+#  define REG_d  "edx"
+#  define REG_SP "esp"
+#  define ALIGN_MASK "$0xFFFFFFF8"
+#endif
+
+
 #undef PAVGB
 #undef PMINUB
 #undef PMAXUB
 
 #ifdef HAVE_MMX2
-#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
+#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
 #elif defined (HAVE_3DNOW)
-#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
+#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
 #endif
+#define PAVGB(a,b)  REAL_PAVGB(a,b)
 
 #ifdef HAVE_MMX2
 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
@@ -64,12 +86,12 @@ asm volatile(
                 );
                 
 asm volatile(
-		"leal (%2, %3), %%eax				\n\t"
+		"lea (%2, %3), %%"REG_a"			\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%1	eax	eax+%2	eax+2%2	%1+4%2	ecx	ecx+%2	ecx+2%2	%1+8%2	ecx+4%2
 
 		"movq (%2), %%mm0				\n\t"
-		"movq (%%eax), %%mm1				\n\t"
+		"movq (%%"REG_a"), %%mm1			\n\t"
                 "movq %%mm0, %%mm3				\n\t"
                 "movq %%mm0, %%mm4				\n\t"
                 PMAXUB(%%mm1, %%mm4)
@@ -78,7 +100,7 @@ asm volatile(
 		"paddb %%mm7, %%mm0				\n\t"
 		"pcmpgtb %%mm6, %%mm0				\n\t"
 
-		"movq (%%eax,%3), %%mm2				\n\t"
+		"movq (%%"REG_a",%3), %%mm2			\n\t"
                 PMAXUB(%%mm2, %%mm4)
                 PMINUB(%%mm2, %%mm3, %%mm5)
 		"psubb %%mm2, %%mm1				\n\t"
@@ -86,7 +108,7 @@ asm volatile(
 		"pcmpgtb %%mm6, %%mm1				\n\t"
 		"paddb %%mm1, %%mm0				\n\t"
 
-		"movq (%%eax, %3, 2), %%mm1			\n\t"
+		"movq (%%"REG_a", %3, 2), %%mm1			\n\t"
                 PMAXUB(%%mm1, %%mm4)
                 PMINUB(%%mm1, %%mm3, %%mm5)
 		"psubb %%mm1, %%mm2				\n\t"
@@ -94,7 +116,7 @@ asm volatile(
 		"pcmpgtb %%mm6, %%mm2				\n\t"
 		"paddb %%mm2, %%mm0				\n\t"
 		
-		"leal (%%eax, %3, 4), %%eax			\n\t"
+		"lea (%%"REG_a", %3, 4), %%"REG_a"		\n\t"
 
 		"movq (%2, %3, 4), %%mm2			\n\t"
                 PMAXUB(%%mm2, %%mm4)
@@ -104,7 +126,7 @@ asm volatile(
 		"pcmpgtb %%mm6, %%mm1				\n\t"
 		"paddb %%mm1, %%mm0				\n\t"
 
-		"movq (%%eax), %%mm1				\n\t"
+		"movq (%%"REG_a"), %%mm1			\n\t"
                 PMAXUB(%%mm1, %%mm4)
                 PMINUB(%%mm1, %%mm3, %%mm5)
 		"psubb %%mm1, %%mm2				\n\t"
@@ -112,7 +134,7 @@ asm volatile(
 		"pcmpgtb %%mm6, %%mm2				\n\t"
 		"paddb %%mm2, %%mm0				\n\t"
 
-		"movq (%%eax, %3), %%mm2			\n\t"
+		"movq (%%"REG_a", %3), %%mm2			\n\t"
                 PMAXUB(%%mm2, %%mm4)
                 PMINUB(%%mm2, %%mm3, %%mm5)
 		"psubb %%mm2, %%mm1				\n\t"
@@ -120,7 +142,7 @@ asm volatile(
 		"pcmpgtb %%mm6, %%mm1				\n\t"
 		"paddb %%mm1, %%mm0				\n\t"
 
-		"movq (%%eax, %3, 2), %%mm1			\n\t"
+		"movq (%%"REG_a", %3, 2), %%mm1			\n\t"
                 PMAXUB(%%mm1, %%mm4)
                 PMINUB(%%mm1, %%mm3, %%mm5)
 		"psubb %%mm1, %%mm2				\n\t"
@@ -152,8 +174,8 @@ asm volatile(
 		"movd %%mm4, %1					\n\t"
 
 		: "=r" (numEq), "=r" (dcOk)
-		: "r" (src), "r" (stride), "m" (c->pQPb)
-		: "%eax"
+		: "r" (src), "r" ((long)stride), "m" (c->pQPb)
+		: "%"REG_a
 		);
 
 	numEq= (-numEq) &0xFF;
@@ -194,10 +216,10 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
 		"por %%mm2, %%mm6				\n\t"// First Line to Filter
 
 		"movq (%0, %1, 8), %%mm5			\n\t"
-		"leal (%0, %1, 4), %%eax			\n\t"
-		"leal (%0, %1, 8), %%ecx			\n\t"
-		"subl %1, %%ecx					\n\t"
-		"addl %1, %0					\n\t" // %0 points to line 1 not 0
+		"lea (%0, %1, 4), %%"REG_a"			\n\t"
+		"lea (%0, %1, 8), %%"REG_c"			\n\t"
+		"sub %1, %%"REG_c"				\n\t"
+		"add %1, %0					\n\t" // %0 points to line 1 not 0
 		"movq (%0, %1, 8), %%mm7			\n\t"
 		"movq %%mm5, %%mm1				\n\t"
 		"movq %%mm7, %%mm2				\n\t"
@@ -225,7 +247,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
 
 		"movq (%0, %1, 4), %%mm2			\n\t" //     1
 		"movq %%mm2, %%mm5				\n\t" //     1
-		PAVGB((%%eax), %%mm2)				      //    11	/2
+		PAVGB((%%REGa), %%mm2)				      //    11	/2
 		PAVGB((%0, %1, 2), %%mm2)			      //   211	/4
 		"movq %%mm2, %%mm3				\n\t" //   211	/4
 		"movq (%0), %%mm4				\n\t" // 1
@@ -237,15 +259,15 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
 		PAVGB(%%mm6, %%mm0)				      //1 1	/2
 		"movq %%mm4, %%mm3				\n\t" // 1
 		PAVGB((%0,%1,2), %%mm3)				      // 1 1	/2
-		PAVGB((%%eax,%1,2), %%mm5)			      //     11	/2
-		PAVGB((%%eax), %%mm5)				      //    211 /4
+		PAVGB((%%REGa,%1,2), %%mm5)			      //     11	/2
+		PAVGB((%%REGa), %%mm5)				      //    211 /4
 		PAVGB(%%mm5, %%mm3)				      // 2 2211 /8
 		PAVGB(%%mm0, %%mm3)				      //4242211 /16
 		"movq %%mm3, (%0,%1)				\n\t" //  X
 		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
 		PAVGB(%%mm4, %%mm6)				      //11	/2
-		"movq (%%ecx), %%mm0				\n\t" //       1
-		PAVGB((%%eax, %1, 2), %%mm0)			      //      11/2
+		"movq (%%"REG_c"), %%mm0			\n\t" //       1
+		PAVGB((%%REGa, %1, 2), %%mm0)			      //      11/2
 		"movq %%mm0, %%mm3				\n\t" //      11/2
 		PAVGB(%%mm1, %%mm0)				      //  2   11/4
 		PAVGB(%%mm6, %%mm0)				      //222   11/8
@@ -253,17 +275,17 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
 		"movq (%0, %1, 2), %%mm2			\n\t" //   1
 		"movq %%mm0, (%0, %1, 2)			\n\t" //   X
 		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
-		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
-		PAVGB((%%ecx), %%mm0)				      //       11	/2
+		"movq (%%"REG_a", %1, 4), %%mm0			\n\t" //        1
+		PAVGB((%%REGc), %%mm0)				      //       11	/2
 		PAVGB(%%mm0, %%mm6)				      //11     11	/4
 		PAVGB(%%mm1, %%mm4)				      // 11		/2
 		PAVGB(%%mm2, %%mm1)				      //  11		/2
 		PAVGB(%%mm1, %%mm6)				      //1122   11	/8
 		PAVGB(%%mm5, %%mm6)				      //112242211	/16
-		"movq (%%eax), %%mm5				\n\t" //    1
-		"movq %%mm6, (%%eax)				\n\t" //    X
+		"movq (%%"REG_a"), %%mm5			\n\t" //    1
+		"movq %%mm6, (%%"REG_a")			\n\t" //    X
 		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
-		"movq (%%eax, %1, 4), %%mm6			\n\t" //        1
+		"movq (%%"REG_a", %1, 4), %%mm6			\n\t" //        1
 		PAVGB(%%mm7, %%mm6)				      //        11	/2
 		PAVGB(%%mm4, %%mm6)				      // 11     11	/4
 		PAVGB(%%mm3, %%mm6)				      // 11   2211	/8
@@ -276,29 +298,29 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
 		PAVGB(%%mm7, %%mm1)				      //  11     2	/4
 		PAVGB(%%mm4, %%mm5)				      //    11		/2
 		PAVGB(%%mm5, %%mm0)				      //    11 11	/4
-		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
+		"movq (%%"REG_a", %1, 2), %%mm6			\n\t" //      1
 		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
 		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
-		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
+		"movq %%mm1, (%%"REG_a", %1, 2)			\n\t" //      X
 		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
-		PAVGB((%%ecx), %%mm2)				      //   112 4	/8
-		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
+		PAVGB((%%REGc), %%mm2)				      //   112 4	/8
+		"movq (%%"REG_a", %1, 4), %%mm0			\n\t" //        1
 		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
 		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
 		PAVGB(%%mm2, %%mm6)				      //   1122424	/4
-		"movq %%mm6, (%%ecx)				\n\t" //       X
+		"movq %%mm6, (%%"REG_c")			\n\t" //       X
 		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
 		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
 		PAVGB(%%mm7, %%mm5)				      //    11   6	/8
 
 		PAVGB(%%mm3, %%mm0)				      //      112	/4
 		PAVGB(%%mm0, %%mm5)				      //    112246	/16
-		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
-		"subl %1, %0					\n\t"
+		"movq %%mm5, (%%"REG_a", %1, 4)			\n\t" //        X
+		"sub %1, %0					\n\t"
 
 		:
-		: "r" (src), "r" (stride), "m" (c->pQPb)
-		: "%eax", "%ecx"
+		: "r" (src), "r" ((long)stride), "m" (c->pQPb)
+		: "%"REG_a, "%"REG_c
 	);
 #else
 	const int l1= stride;
@@ -317,25 +339,26 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
 		const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
 		const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
 
-		int sums[9];
-		sums[0] = first + src[l1];
-		sums[1] = src[l1] + src[l2];
-		sums[2] = src[l2] + src[l3];
-		sums[3] = src[l3] + src[l4];
-		sums[4] = src[l4] + src[l5];
-		sums[5] = src[l5] + src[l6];
-		sums[6] = src[l6] + src[l7];
-		sums[7] = src[l7] + src[l8];
-		sums[8] = src[l8] + last;
-
-		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
-		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
-		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
-		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
-		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
-		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
-		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
-		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
+		int sums[10];
+		sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
+		sums[1] = sums[0] - first  + src[l4];
+		sums[2] = sums[1] - first  + src[l5];
+		sums[3] = sums[2] - first  + src[l6];
+		sums[4] = sums[3] - first  + src[l7];
+		sums[5] = sums[4] - src[l1] + src[l8];
+		sums[6] = sums[5] - src[l2] + last;
+		sums[7] = sums[6] - src[l3] + last;
+		sums[8] = sums[7] - src[l4] + last;
+		sums[9] = sums[8] - src[l5] + last;
+
+		src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
+		src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
+		src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
+		src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
+		src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
+		src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
+		src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
+		src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
 
 		src++;
 	}
@@ -363,8 +386,8 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
 		"movq "MANGLE(b80)", %%mm6			\n\t" // MIN_SIGNED_BYTE
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t" // no 'l' suffix: REG_a/REG_c are 64 bit on x86_64
+		"lea (%%"REG_a", %1, 4), %%"REG_c"		\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
 		"movq "MANGLE(pQPb)", %%mm0			\n\t" // QP,..., QP
@@ -374,7 +397,7 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
 		"pand "MANGLE(b3F)", %%mm0			\n\t" // QP/4,..., QP/4
 		"paddusb %%mm1, %%mm0				\n\t" // QP*1.25 ...
 		"movq (%0, %1, 4), %%mm2			\n\t" // line 4
-		"movq (%%ecx), %%mm3				\n\t" // line 5
+		"movq (%%"REG_c"), %%mm3				\n\t" // line 5
 		"movq %%mm2, %%mm4				\n\t" // line 4
 		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
 		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
@@ -392,32 +415,32 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
 //		"psubb %%mm6, %%mm2				\n\t"
 		"movq %%mm2, (%0,%1, 4)				\n\t"
 
-		"movq (%%ecx), %%mm2				\n\t"
+		"movq (%%"REG_c"), %%mm2				\n\t"
 //		"paddb %%mm6, %%mm2				\n\t" // line 5 + 0x80
 		"psubb %%mm5, %%mm2				\n\t"
 //		"psubb %%mm6, %%mm2				\n\t"
-		"movq %%mm2, (%%ecx)				\n\t"
+		"movq %%mm2, (%%"REG_c")				\n\t"
 
 		"paddb %%mm6, %%mm5				\n\t"
 		"psrlw $2, %%mm5				\n\t"
 		"pand "MANGLE(b3F)", %%mm5			\n\t"
 		"psubb "MANGLE(b20)", %%mm5			\n\t" // (l5-l4)/8
 
-		"movq (%%eax, %1, 2), %%mm2			\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm2			\n\t"
 		"paddb %%mm6, %%mm2				\n\t" // line 3 + 0x80
 		"paddsb %%mm5, %%mm2				\n\t"
 		"psubb %%mm6, %%mm2				\n\t"
-		"movq %%mm2, (%%eax, %1, 2)			\n\t"
+		"movq %%mm2, (%%"REG_a", %1, 2)			\n\t"
 
-		"movq (%%ecx, %1), %%mm2			\n\t"
+		"movq (%%"REG_c", %1), %%mm2			\n\t"
 		"paddb %%mm6, %%mm2				\n\t" // line 6 + 0x80
 		"psubsb %%mm5, %%mm2				\n\t"
 		"psubb %%mm6, %%mm2				\n\t"
-		"movq %%mm2, (%%ecx, %1)			\n\t"
+		"movq %%mm2, (%%"REG_c", %1)			\n\t"
 
 		:
-		: "r" (src), "r" (stride)
-		: "%eax", "%ecx"
+		: "r" (src), "r" ((long)stride)
+		: "%"REG_a, "%"REG_c
 	);
 #else
  	const int l1= stride;
@@ -463,18 +486,18 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
 
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t" // 0
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_c"		\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
+		"movq (%%"REG_a", %1, 2), %%mm0			\n\t" // line 3
 		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
 		"movq %%mm1, %%mm2				\n\t" // line 4
 		"psubusb %%mm0, %%mm1				\n\t"
 		"psubusb %%mm2, %%mm0				\n\t"
 		"por %%mm1, %%mm0				\n\t" // |l2 - l3|
-		"movq (%%ecx), %%mm3				\n\t" // line 5
-		"movq (%%ecx, %1), %%mm4			\n\t" // line 6
+		"movq (%%"REG_c"), %%mm3				\n\t" // line 5
+		"movq (%%"REG_c", %1), %%mm4			\n\t" // line 6
 		"movq %%mm3, %%mm5				\n\t" // line 5
 		"psubusb %%mm4, %%mm3				\n\t"
 		"psubusb %%mm5, %%mm4				\n\t"
@@ -506,43 +529,43 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
 		"pxor %%mm2, %%mm0				\n\t"
 		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
 
-		"movq (%%ecx), %%mm0				\n\t" // line 5
+		"movq (%%"REG_c"), %%mm0			\n\t" // line 5
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
 		"paddusb %%mm3, %%mm0				\n\t"
 		"pxor %%mm2, %%mm0				\n\t"
-		"movq %%mm0, (%%ecx)				\n\t" // line 5
+		"movq %%mm0, (%%"REG_c")			\n\t" // line 5
 
 		PAVGB(%%mm7, %%mm1)				      // d/4
 
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
+		"movq (%%"REG_a", %1, 2), %%mm0			\n\t" // line 3
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
 		"psubusb %%mm1, %%mm0				\n\t"
 		"pxor %%mm2, %%mm0				\n\t"
-		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
+		"movq %%mm0, (%%"REG_a", %1, 2)			\n\t" // line 3
 
-		"movq (%%ecx, %1), %%mm0			\n\t" // line 6
+		"movq (%%"REG_c", %1), %%mm0			\n\t" // line 6
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
 		"paddusb %%mm1, %%mm0				\n\t"
 		"pxor %%mm2, %%mm0				\n\t"
-		"movq %%mm0, (%%ecx, %1)			\n\t" // line 6
+		"movq %%mm0, (%%"REG_c", %1)			\n\t" // line 6
 
 		PAVGB(%%mm7, %%mm1)				      // d/8
 
-		"movq (%%eax, %1), %%mm0			\n\t" // line 2
+		"movq (%%"REG_a", %1), %%mm0			\n\t" // line 2
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
 		"psubusb %%mm1, %%mm0				\n\t"
 		"pxor %%mm2, %%mm0				\n\t"
-		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
+		"movq %%mm0, (%%"REG_a", %1)			\n\t" // line 2
 
-		"movq (%%ecx, %1, 2), %%mm0			\n\t" // line 7
+		"movq (%%"REG_c", %1, 2), %%mm0			\n\t" // line 7
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
 		"paddusb %%mm1, %%mm0				\n\t"
 		"pxor %%mm2, %%mm0				\n\t"
-		"movq %%mm0, (%%ecx, %1, 2)			\n\t" // line 7
+		"movq %%mm0, (%%"REG_c", %1, 2)			\n\t" // line 7
 
 		:
-		: "r" (src), "r" (stride), "m" (co->pQPb)
-		: "%eax", "%ecx"
+		: "r" (src), "r" ((long)stride), "m" (co->pQPb)
+		: "%"REG_a, "%"REG_c
 	);
 #else
 
@@ -607,8 +630,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 
 #if 0 //sligtly more accurate and slightly slower
 		"pxor %%mm7, %%mm7				\n\t" // 0
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_c"		\n\t"
 //	0	1	2	3	4	5	6	7
 //	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
@@ -621,8 +644,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
 		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8
 
-		"movq (%%eax), %%mm1				\n\t" // l1
-		"movq (%%eax, %1, 2), %%mm3			\n\t" // l3
+		"movq (%%"REG_a"), %%mm1			\n\t" // l1
+		"movq (%%"REG_a", %1, 2), %%mm3			\n\t" // l3
 		"movq %%mm1, %%mm4				\n\t" // l1
 		PAVGB(%%mm7, %%mm1)				      // ~l1/2
 		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
@@ -640,7 +663,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
 		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8
 
-		"movq (%%ecx), %%mm2				\n\t" // l5
+		"movq (%%"REG_c"), %%mm2			\n\t" // l5
 		"movq %%mm3, %%mm5				\n\t" // l3
 		PAVGB(%%mm7, %%mm3)				      // ~l3/2
 		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
@@ -653,13 +676,13 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		"pcmpeqb %%mm7, %%mm0				\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
 
-		"movq (%%ecx, %1), %%mm6			\n\t" // l6
+		"movq (%%"REG_c", %1), %%mm6			\n\t" // l6
 		"movq %%mm6, %%mm5				\n\t" // l6
 		PAVGB(%%mm7, %%mm6)				      // ~l6/2
 		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
 		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8
 
-		"movq (%%ecx, %1, 2), %%mm5			\n\t" // l7
+		"movq (%%"REG_c", %1, 2), %%mm5			\n\t" // l7
 		"movq %%mm2, %%mm4				\n\t" // l5
 		PAVGB(%%mm7, %%mm2)				      // ~l5/2
 		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
@@ -686,7 +709,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		"paddusb %%mm1, %%mm3				\n\t"
 //		"paddusb "MANGLE(b01)", %%mm3			\n\t"
 
-		"movq (%%eax, %1, 2), %%mm6			\n\t" //l3
+		"movq (%%"REG_a", %1, 2), %%mm6			\n\t" //l3
 		"movq (%0, %1, 4), %%mm5			\n\t" //l4
 		"movq (%0, %1, 4), %%mm4			\n\t" //l4
 		"psubusb %%mm6, %%mm5				\n\t"
@@ -700,7 +723,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		"psubusb "MANGLE(b01)", %%mm3			\n\t"
 		PAVGB(%%mm7, %%mm3)
 
-		"movq (%%eax, %1, 2), %%mm0			\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm0			\n\t"
 		"movq (%0, %1, 4), %%mm2			\n\t"
 		"pxor %%mm6, %%mm0				\n\t"
 		"pxor %%mm6, %%mm2				\n\t"
@@ -708,36 +731,36 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		"paddb %%mm3, %%mm2				\n\t"
 		"pxor %%mm6, %%mm0				\n\t"
 		"pxor %%mm6, %%mm2				\n\t"
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
+		"movq %%mm0, (%%"REG_a", %1, 2)			\n\t"
 		"movq %%mm2, (%0, %1, 4)			\n\t"
 #endif
 
-		"leal (%0, %1), %%eax				\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
 		"pcmpeqb %%mm6, %%mm6				\n\t" // -1
 //	0	1	2	3	4	5	6	7
 //	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
 
 
-		"movq (%%eax, %1, 2), %%mm1			\n\t" // l3
+		"movq (%%"REG_a", %1, 2), %%mm1			\n\t" // l3
 		"movq (%0, %1, 4), %%mm0			\n\t" // l4
 		"pxor %%mm6, %%mm1				\n\t" // -l3-1
 		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
 // mm1=-l3-1, mm0=128-q
 
-		"movq (%%eax, %1, 4), %%mm2			\n\t" // l5
-		"movq (%%eax, %1), %%mm3			\n\t" // l2
+		"movq (%%"REG_a", %1, 4), %%mm2			\n\t" // l5
+		"movq (%%"REG_a", %1), %%mm3			\n\t" // l2
 		"pxor %%mm6, %%mm2				\n\t" // -l5-1
 		"movq %%mm2, %%mm5				\n\t" // -l5-1
 		"movq "MANGLE(b80)", %%mm4			\n\t" // 128
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_c"		\n\t"
 		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
 		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
 		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
 		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
 
-		"movq (%%eax), %%mm2				\n\t" // l1
+		"movq (%%"REG_a"), %%mm2			\n\t" // l1
 		"pxor %%mm6, %%mm2				\n\t" // -l1-1
 		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
 		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
@@ -747,8 +770,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
 
-		PAVGB((%%ecx, %1), %%mm5)			      // (l6-l5+256)/2
-		"movq (%%ecx, %1, 2), %%mm1			\n\t" // l7
+		PAVGB((%%REGc, %1), %%mm5)			      // (l6-l5+256)/2
+		"movq (%%"REG_c", %1, 2), %%mm1			\n\t" // l7
 		"pxor %%mm6, %%mm1				\n\t" // -l7-1
 		PAVGB((%0, %1, 4), %%mm1)			      // (l4-l7+256)/2
 		"movq "MANGLE(b80)", %%mm2			\n\t" // 128
@@ -797,7 +820,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)
 
 		"pand %%mm7, %%mm4				\n\t"
-		"movq (%%eax, %1, 2), %%mm0			\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm0			\n\t"
 		"movq (%0, %1, 4), %%mm2			\n\t"
 		"pxor %%mm1, %%mm0				\n\t"
 		"pxor %%mm1, %%mm2				\n\t"
@@ -805,12 +828,12 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext
 		"psubb %%mm4, %%mm2				\n\t"
 		"pxor %%mm1, %%mm0				\n\t"
 		"pxor %%mm1, %%mm2				\n\t"
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
+		"movq %%mm0, (%%"REG_a", %1, 2)			\n\t"
 		"movq %%mm2, (%0, %1, 4)			\n\t"
 
 		:
-		: "r" (src), "r" (stride), "m" (c->pQPb)
-		: "%eax", "%ecx"
+		: "r" (src), "r" ((long)stride), "m" (c->pQPb)
+		: "%"REG_a, "%"REG_c
 	);
 
 /*
@@ -881,8 +904,8 @@ src-=8;
 	src+= stride*4;
 	asm volatile(
 		"pxor %%mm7, %%mm7				\n\t"
-		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars
-		"andl $0xFFFFFFF8, %%ecx			\n\t" // align
+		"lea -40(%%"REG_SP"), %%"REG_c"			\n\t" // make space for 4 8-byte vars
+		"and "ALIGN_MASK", %%"REG_c"			\n\t" // align
 //	0	1	2	3	4	5	6	7
 //	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	edx+%1	edx+2%1
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1
@@ -893,12 +916,12 @@ src-=8;
 		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
 
 		"movq (%0, %1), %%mm2				\n\t"
-		"leal (%0, %1, 2), %%eax			\n\t"
+		"lea (%0, %1, 2), %%"REG_a"			\n\t"
 		"movq %%mm2, %%mm3				\n\t"
 		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
 		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
 
-		"movq (%%eax), %%mm4				\n\t"
+		"movq (%%"REG_a"), %%mm4			\n\t"
 		"movq %%mm4, %%mm5				\n\t"
 		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
 		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
@@ -915,7 +938,7 @@ src-=8;
 		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
 		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
 
-		"movq (%%eax, %1), %%mm2			\n\t"
+		"movq (%%"REG_a", %1), %%mm2			\n\t"
 		"movq %%mm2, %%mm3				\n\t"
 		"punpcklbw %%mm7, %%mm2				\n\t" // L3
 		"punpckhbw %%mm7, %%mm3				\n\t" // H3
@@ -924,24 +947,24 @@ src-=8;
 		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
 		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
 		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
-		"movq %%mm0, (%%ecx)				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
-		"movq %%mm1, 8(%%ecx)				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+		"movq %%mm0, (%%"REG_c")			\n\t" // 2L0 - 5L1 + 5L2 - 2L3
+		"movq %%mm1, 8(%%"REG_c")			\n\t" // 2H0 - 5H1 + 5H2 - 2H3
 
-		"movq (%%eax, %1, 2), %%mm0			\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm0			\n\t"
 		"movq %%mm0, %%mm1				\n\t"
 		"punpcklbw %%mm7, %%mm0				\n\t" // L4
 		"punpckhbw %%mm7, %%mm1				\n\t" // H4
 
 		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
 		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
-		"movq %%mm2, 16(%%ecx)				\n\t" // L3 - L4
-		"movq %%mm3, 24(%%ecx)				\n\t" // H3 - H4
+		"movq %%mm2, 16(%%"REG_c")			\n\t" // L3 - L4
+		"movq %%mm3, 24(%%"REG_c")			\n\t" // H3 - H4
 		"paddw %%mm4, %%mm4				\n\t" // 2L2
 		"paddw %%mm5, %%mm5				\n\t" // 2H2
 		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
 		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
 
-		"leal (%%eax, %1), %0				\n\t"
+		"lea (%%"REG_a", %1), %0			\n\t"
 		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
 		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
 		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
@@ -956,10 +979,10 @@ src-=8;
 		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
 		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
 
-		"movq (%%eax, %1, 4), %%mm6			\n\t"
+		"movq (%%"REG_a", %1, 4), %%mm6			\n\t"
 		"punpcklbw %%mm7, %%mm6				\n\t" // L6
 		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
-		"movq (%%eax, %1, 4), %%mm6			\n\t"
+		"movq (%%"REG_a", %1, 4), %%mm6			\n\t"
 		"punpckhbw %%mm7, %%mm6				\n\t" // H6
 		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
 
@@ -983,8 +1006,8 @@ src-=8;
 		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
 		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
 
-		"movq (%%ecx), %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
-		"movq 8(%%ecx), %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+		"movq (%%"REG_c"), %%mm2			\n\t" // 2L0 - 5L1 + 5L2 - 2L3
+		"movq 8(%%"REG_c"), %%mm3			\n\t" // 2H0 - 5H1 + 5H2 - 2H3
 
 #ifdef HAVE_MMX2
 		"movq %%mm7, %%mm6				\n\t" // 0
@@ -1030,6 +1053,9 @@ src-=8;
 		"psubw %%mm6, %%mm1				\n\t"
 #endif
 
+		"movd %2, %%mm2					\n\t" // QP
+		"punpcklbw %%mm7, %%mm2				\n\t"
+
 		"movq %%mm7, %%mm6				\n\t" // 0
 		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
 		"pxor %%mm6, %%mm4				\n\t"
@@ -1038,7 +1064,6 @@ src-=8;
 		"pxor %%mm7, %%mm5				\n\t"
 		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
 // 100 opcodes
-		"movd %2, %%mm2					\n\t" // QP
 		"psllw $3, %%mm2				\n\t" // 8QP
 		"movq %%mm2, %%mm3				\n\t" // 8QP
 		"pcmpgtw %%mm4, %%mm2				\n\t"
@@ -1060,8 +1085,8 @@ src-=8;
 		"psrlw $6, %%mm4				\n\t"
 		"psrlw $6, %%mm5				\n\t"
 
-		"movq 16(%%ecx), %%mm0				\n\t" // L3 - L4
-		"movq 24(%%ecx), %%mm1				\n\t" // H3 - H4
+		"movq 16(%%"REG_c"), %%mm0			\n\t" // L3 - L4
+		"movq 24(%%"REG_c"), %%mm1			\n\t" // H3 - H4
 
 		"pxor %%mm2, %%mm2				\n\t"
 		"pxor %%mm3, %%mm3				\n\t"
@@ -1104,8 +1129,8 @@ src-=8;
 		"movq %%mm0, (%0, %1)				\n\t"
 
 		: "+r" (src)
-		: "r" (stride), "m" (c->pQPb)
-		: "%eax", "%ecx"
+		: "r" ((long)stride), "m" (c->pQPb)
+		: "%"REG_a, "%"REG_c
 	);
 #else
 	const int l1= stride;
@@ -1168,20 +1193,20 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
 		"packuswb %%mm0, %%mm0				\n\t"
 		"movq %%mm0, %3					\n\t"
 
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
 		
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
 
 #undef FIND_MIN_MAX
 #ifdef HAVE_MMX2
-#define FIND_MIN_MAX(addr)\
+#define REAL_FIND_MIN_MAX(addr)\
 		"movq " #addr ", %%mm0				\n\t"\
 		"pminub %%mm0, %%mm7				\n\t"\
 		"pmaxub %%mm0, %%mm6				\n\t"
 #else
-#define FIND_MIN_MAX(addr)\
+#define REAL_FIND_MIN_MAX(addr)\
 		"movq " #addr ", %%mm0				\n\t"\
 		"movq %%mm7, %%mm1				\n\t"\
 		"psubusb %%mm0, %%mm6				\n\t"\
@@ -1189,14 +1214,15 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
 		"psubusb %%mm0, %%mm1				\n\t"\
 		"psubb %%mm1, %%mm7				\n\t"
 #endif
+#define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr) // extra macro level: addr is macro-expanded here (REGa/REGd are presumably macros defined elsewhere) before REAL_FIND_MIN_MAX stringifies it with #addr
 
-FIND_MIN_MAX((%%eax))
-FIND_MIN_MAX((%%eax, %1))
-FIND_MIN_MAX((%%eax, %1, 2))
+FIND_MIN_MAX((%%REGa))
+FIND_MIN_MAX((%%REGa, %1))
+FIND_MIN_MAX((%%REGa, %1, 2))
 FIND_MIN_MAX((%0, %1, 4))
-FIND_MIN_MAX((%%edx))
-FIND_MIN_MAX((%%edx, %1))
-FIND_MIN_MAX((%%edx, %1, 2))
+FIND_MIN_MAX((%%REGd))
+FIND_MIN_MAX((%%REGd, %1))
+FIND_MIN_MAX((%%REGd, %1, 2))
 FIND_MIN_MAX((%0, %1, 8))
 
 		"movq %%mm7, %%mm4				\n\t"
@@ -1249,13 +1275,13 @@ FIND_MIN_MAX((%0, %1, 8))
 		"movd %%mm6, %%ecx				\n\t"
 		"cmpb "MANGLE(deringThreshold)", %%cl		\n\t"
 		" jb 1f						\n\t"
-		"leal -24(%%esp), %%ecx				\n\t"
-		"andl $0xFFFFFFF8, %%ecx			\n\t" 
+		"lea -24(%%"REG_SP"), %%"REG_c"			\n\t"
+		"and "ALIGN_MASK", %%"REG_c"			\n\t" 
 		PAVGB(%%mm0, %%mm7)				      // a=(max + min)/2
 		"punpcklbw %%mm7, %%mm7				\n\t"
 		"punpcklbw %%mm7, %%mm7				\n\t"
 		"punpcklbw %%mm7, %%mm7				\n\t"
-		"movq %%mm7, (%%ecx)				\n\t"
+		"movq %%mm7, (%%"REG_c")			\n\t"
 
 		"movq (%0), %%mm0				\n\t" // L10
 		"movq %%mm0, %%mm1				\n\t" // L10
@@ -1280,13 +1306,13 @@ FIND_MIN_MAX((%0, %1, 8))
 		"paddb %%mm2, %%mm0				\n\t"
 		"paddb %%mm3, %%mm0				\n\t"
 
-		"movq (%%eax), %%mm2				\n\t" // L11
+		"movq (%%"REG_a"), %%mm2			\n\t" // L11
 		"movq %%mm2, %%mm3				\n\t" // L11
 		"movq %%mm2, %%mm4				\n\t" // L11
 		"psllq $8, %%mm3				\n\t"
 		"psrlq $8, %%mm4				\n\t"
-		"movd -4(%%eax), %%mm5				\n\t"
-		"movd 8(%%eax), %%mm6				\n\t"
+		"movd -4(%%"REG_a"), %%mm5			\n\t"
+		"movd 8(%%"REG_a"), %%mm6			\n\t"
 		"psrlq $24, %%mm5				\n\t"
 		"psllq $56, %%mm6				\n\t"
 		"por %%mm5, %%mm3				\n\t" // L01
@@ -1303,7 +1329,7 @@ FIND_MIN_MAX((%0, %1, 8))
 		"paddb %%mm4, %%mm2				\n\t"
 		"paddb %%mm5, %%mm2				\n\t"
 // 0, 2, 3, 1
-#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
+#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
 		"movq " #src ", " #sx "				\n\t" /* src[0] */\
 		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
 		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
@@ -1319,8 +1345,8 @@ FIND_MIN_MAX((%0, %1, 8))
 		PAVGB(t0, lx)				              /* (src[-1] + src[+1])/2 */\
 		PAVGB(sx, lx)				      /* (src[-1] + 2src[0] + src[+1])/4 */\
 		PAVGB(lx, pplx)					     \
-		"movq " #lx ", 8(%%ecx)				\n\t"\
-		"movq (%%ecx), " #lx "				\n\t"\
+		"movq " #lx ", 8(%%"REG_c")			\n\t"\
+		"movq (%%"REG_c"), " #lx "			\n\t"\
 		"psubusb " #lx ", " #t1 "			\n\t"\
 		"psubusb " #lx ", " #t0 "			\n\t"\
 		"psubusb " #lx ", " #sx "			\n\t"\
@@ -1347,8 +1373,10 @@ FIND_MIN_MAX((%0, %1, 8))
 		"pandn " #dst ", " #ppsx "			\n\t"\
 		"por " #pplx ", " #ppsx "			\n\t"\
 		"movq " #ppsx ", " #dst "			\n\t"\
-		"movq 8(%%ecx), " #lx "				\n\t"
+		"movq 8(%%"REG_c"), " #lx "			\n\t"
 
+#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
+   REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) // extra macro level: arguments are macro-expanded here (REGa/REGd are presumably macros defined elsewhere) before REAL_DERING_CORE stringifies them with #
 /*
 0000000
 1111111
@@ -1365,18 +1393,18 @@ FIND_MIN_MAX((%0, %1, 8))
 
 */
 //DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
-DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
-DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
-DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
-DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
-DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
-DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
-DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
-DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
+DERING_CORE((%%REGa),(%%REGa, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
+DERING_CORE((%%REGa, %1),(%%REGa, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
+DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
+DERING_CORE((%0, %1, 4),(%%REGd)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
+DERING_CORE((%%REGd),(%%REGd, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
+DERING_CORE((%%REGd, %1), (%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
+DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
+DERING_CORE((%0, %1, 8),(%%REGd, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
 
 		"1:			\n\t"
-		: : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
-		: "%eax", "%edx", "%ecx"
+		: : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
+		: "%"REG_a, "%"REG_d, "%"REG_c
 	);
 #else
 	int y;
@@ -1523,27 +1551,27 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	src+= 4*stride;
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_c"		\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
 
 		"movq (%0), %%mm0				\n\t"
-		"movq (%%eax, %1), %%mm1			\n\t"
+		"movq (%%"REG_a", %1), %%mm1			\n\t"
 		PAVGB(%%mm1, %%mm0)
-		"movq %%mm0, (%%eax)				\n\t"
+		"movq %%mm0, (%%"REG_a")			\n\t"
 		"movq (%0, %1, 4), %%mm0			\n\t"
 		PAVGB(%%mm0, %%mm1)
-		"movq %%mm1, (%%eax, %1, 2)			\n\t"
-		"movq (%%ecx, %1), %%mm1			\n\t"
+		"movq %%mm1, (%%"REG_a", %1, 2)			\n\t"
+		"movq (%%"REG_c", %1), %%mm1			\n\t"
 		PAVGB(%%mm1, %%mm0)
-		"movq %%mm0, (%%ecx)				\n\t"
+		"movq %%mm0, (%%"REG_c")			\n\t"
 		"movq (%0, %1, 8), %%mm0			\n\t"
 		PAVGB(%%mm0, %%mm1)
-		"movq %%mm1, (%%ecx, %1, 2)			\n\t"
+		"movq %%mm1, (%%"REG_c", %1, 2)			\n\t"
 
-		: : "r" (src), "r" (stride)
-		: "%eax", "%ecx"
+		: : "r" (src), "r" ((long)stride)
+		: "%"REG_a, "%"REG_c
 	);
 #else
 	int a, b, x;
@@ -1576,15 +1604,15 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	src+= stride*3;
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
-		"leal (%%edx, %1, 4), %%ecx			\n\t"
-		"addl %1, %%ecx					\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
+		"lea (%%"REG_d", %1, 4), %%"REG_c"		\n\t"
+		"add %1, %%"REG_c"				\n\t"
 		"pxor %%mm7, %%mm7				\n\t"
 //	0	1	2	3	4	5	6	7	8	9	10
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
 
-#define DEINT_CUBIC(a,b,c,d,e)\
+#define REAL_DEINT_CUBIC(a,b,c,d,e)\
 		"movq " #a ", %%mm0				\n\t"\
 		"movq " #b ", %%mm1				\n\t"\
 		"movq " #d ", %%mm2				\n\t"\
@@ -1605,14 +1633,15 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride
 		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
 		"packuswb %%mm3, %%mm1				\n\t"\
 		"movq %%mm1, " #c "				\n\t"
+#define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e) // extra macro level: arguments are macro-expanded here (REGa/REGd/REGc are presumably macros defined elsewhere) before REAL_DEINT_CUBIC stringifies them with #
 
-DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
-DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
-DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
-DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
+DEINT_CUBIC((%0), (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd, %1))
+DEINT_CUBIC((%%REGa, %1), (%0, %1, 4), (%%REGd), (%%REGd, %1), (%0, %1, 8))
+DEINT_CUBIC((%0, %1, 4), (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGc))
+DEINT_CUBIC((%%REGd, %1), (%0, %1, 8), (%%REGd, %1, 4), (%%REGc), (%%REGc, %1, 2))
 
-		: : "r" (src), "r" (stride)
-		: "%eax", "%edx", "ecx"
+		: : "r" (src), "r" ((long)stride)
+		: "%"REG_a, "%"REG_d, "%"REG_c
 	);
 #else
 	int x;
@@ -1640,14 +1669,14 @@ static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	src+= stride*4;
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
 		"pxor %%mm7, %%mm7				\n\t"
 		"movq (%2), %%mm0				\n\t"
 //	0	1	2	3	4	5	6	7	8	9	10
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
 
-#define DEINT_FF(a,b,c,d)\
+#define REAL_DEINT_FF(a,b,c,d)\
 		"movq " #a ", %%mm1				\n\t"\
 		"movq " #b ", %%mm2				\n\t"\
 		"movq " #c ", %%mm3				\n\t"\
@@ -1675,14 +1704,16 @@ static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp
 		"packuswb %%mm4, %%mm1				\n\t"\
 		"movq %%mm1, " #b "				\n\t"\
 
-DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
-DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
-DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
-DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
+#define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d) // extra macro level: arguments are macro-expanded here (REGa/REGd are presumably macros defined elsewhere) before REAL_DEINT_FF stringifies them with #
+
+DEINT_FF((%0)       ,  (%%REGa)       , (%%REGa, %1), (%%REGa, %1, 2))
+DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4),  (%%REGd)       )
+DEINT_FF((%0, %1, 4),  (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
+DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8),  (%%REGd, %1, 4))
 
 		"movq %%mm0, (%2)				\n\t"
-		: : "r" (src), "r" (stride), "r"(tmp)
-		: "%eax", "%edx"
+		: : "r" (src), "r" ((long)stride), "r"(tmp)
+		: "%"REG_a, "%"REG_d
 	);
 #else
 	int x;
@@ -1718,15 +1749,15 @@ static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	src+= stride*4;
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
 		"pxor %%mm7, %%mm7				\n\t"
 		"movq (%2), %%mm0				\n\t"
 		"movq (%3), %%mm1				\n\t"
 //	0	1	2	3	4	5	6	7	8	9	10
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
 
-#define DEINT_L5(t1,t2,a,b,c)\
+#define REAL_DEINT_L5(t1,t2,a,b,c)\
 		"movq " #a ", %%mm2				\n\t"\
 		"movq " #b ", %%mm3				\n\t"\
 		"movq " #c ", %%mm4				\n\t"\
@@ -1759,19 +1790,21 @@ static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp
 		"packuswb %%mm5, %%mm2				\n\t"\
 		"movq %%mm2, " #a "				\n\t"\
 
-DEINT_L5(%%mm0, %%mm1, (%0)          , (%%eax)       , (%%eax, %1)   )
-DEINT_L5(%%mm1, %%mm0, (%%eax)       , (%%eax, %1)   , (%%eax, %1, 2))
-DEINT_L5(%%mm0, %%mm1, (%%eax, %1)   , (%%eax, %1, 2), (%0, %1, 4)   )
-DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4)   , (%%edx)       )
-DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)   , (%%edx)       , (%%edx, %1)   )  
-DEINT_L5(%%mm1, %%mm0, (%%edx)       , (%%edx, %1)   , (%%edx, %1, 2))
-DEINT_L5(%%mm0, %%mm1, (%%edx, %1)   , (%%edx, %1, 2), (%0, %1, 8)   )
-DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8)   , (%%edx, %1, 4))
+#define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c) // extra macro level: arguments are macro-expanded here (REGa/REGd are presumably macros defined elsewhere) before REAL_DEINT_L5 stringifies them with #
+
+DEINT_L5(%%mm0, %%mm1, (%0)           , (%%REGa)       , (%%REGa, %1)   )
+DEINT_L5(%%mm1, %%mm0, (%%REGa)       , (%%REGa, %1)   , (%%REGa, %1, 2))
+DEINT_L5(%%mm0, %%mm1, (%%REGa, %1)   , (%%REGa, %1, 2), (%0, %1, 4)   )
+DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4)    , (%%REGd)       )
+DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)    , (%%REGd)       , (%%REGd, %1)   )  
+DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
+DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
+DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))
 
 		"movq %%mm0, (%2)				\n\t"
 		"movq %%mm1, (%3)				\n\t"
-		: : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2)
-		: "%eax", "%edx"
+		: : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
+		: "%"REG_a, "%"REG_d
 	);
 #else
 	int x;
@@ -1818,49 +1851,49 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	src+= 4*stride;
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
 
 		"movq (%2), %%mm0				\n\t" // L0
-		"movq (%%eax), %%mm1				\n\t" // L2
+		"movq (%%"REG_a"), %%mm1			\n\t" // L2
 		PAVGB(%%mm1, %%mm0)				      // L0+L2
 		"movq (%0), %%mm2				\n\t" // L1
 		PAVGB(%%mm2, %%mm0)
 		"movq %%mm0, (%0)				\n\t"
-		"movq (%%eax, %1), %%mm0			\n\t" // L3
+		"movq (%%"REG_a", %1), %%mm0			\n\t" // L3
 		PAVGB(%%mm0, %%mm2)				      // L1+L3
 		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
-		"movq %%mm2, (%%eax)				\n\t"
-		"movq (%%eax, %1, 2), %%mm2			\n\t" // L4
+		"movq %%mm2, (%%"REG_a")			\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm2			\n\t" // L4
 		PAVGB(%%mm2, %%mm1)				      // L2+L4
 		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
-		"movq %%mm1, (%%eax, %1)			\n\t"
+		"movq %%mm1, (%%"REG_a", %1)			\n\t"
 		"movq (%0, %1, 4), %%mm1			\n\t" // L5
 		PAVGB(%%mm1, %%mm0)				      // L3+L5
 		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
-		"movq (%%edx), %%mm0				\n\t" // L6
+		"movq %%mm0, (%%"REG_a", %1, 2)			\n\t"
+		"movq (%%"REG_d"), %%mm0			\n\t" // L6
 		PAVGB(%%mm0, %%mm2)				      // L4+L6
 		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
 		"movq %%mm2, (%0, %1, 4)			\n\t"
-		"movq (%%edx, %1), %%mm2			\n\t" // L7
+		"movq (%%"REG_d", %1), %%mm2			\n\t" // L7
 		PAVGB(%%mm2, %%mm1)				      // L5+L7
 		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
-		"movq %%mm1, (%%edx)				\n\t"
-		"movq (%%edx, %1, 2), %%mm1			\n\t" // L8
+		"movq %%mm1, (%%"REG_d")			\n\t"
+		"movq (%%"REG_d", %1, 2), %%mm1			\n\t" // L8
 		PAVGB(%%mm1, %%mm0)				      // L6+L8
 		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
-		"movq %%mm0, (%%edx, %1)			\n\t"
+		"movq %%mm0, (%%"REG_d", %1)			\n\t"
 		"movq (%0, %1, 8), %%mm0			\n\t" // L9
 		PAVGB(%%mm0, %%mm2)				      // L7+L9
 		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
-		"movq %%mm2, (%%edx, %1, 2)			\n\t"
+		"movq %%mm2, (%%"REG_d", %1, 2)			\n\t"
 		"movq %%mm1, (%2)				\n\t"
 
-		: : "r" (src), "r" (stride), "r" (tmp)
-		: "%eax", "%edx"
+		: : "r" (src), "r" ((long)stride), "r" (tmp)
+		: "%"REG_a, "%"REG_d
 	);
 #else
 	int a, b, c, x;
@@ -1920,62 +1953,62 @@ static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
 	src+= 4*stride;
 #ifdef HAVE_MMX2
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
 
 		"movq (%0), %%mm0				\n\t" //
-		"movq (%%eax, %1), %%mm2			\n\t" //
-		"movq (%%eax), %%mm1				\n\t" //
+		"movq (%%"REG_a", %1), %%mm2			\n\t" //
+		"movq (%%"REG_a"), %%mm1			\n\t" //
 		"movq %%mm0, %%mm3				\n\t"
 		"pmaxub %%mm1, %%mm0				\n\t" //
 		"pminub %%mm3, %%mm1				\n\t" //
 		"pmaxub %%mm2, %%mm1				\n\t" //
 		"pminub %%mm1, %%mm0				\n\t"
-		"movq %%mm0, (%%eax)				\n\t"
+		"movq %%mm0, (%%"REG_a")			\n\t"
 
 		"movq (%0, %1, 4), %%mm0			\n\t" //
-		"movq (%%eax, %1, 2), %%mm1			\n\t" //
+		"movq (%%"REG_a", %1, 2), %%mm1			\n\t" //
 		"movq %%mm2, %%mm3				\n\t"
 		"pmaxub %%mm1, %%mm2				\n\t" //
 		"pminub %%mm3, %%mm1				\n\t" //
 		"pmaxub %%mm0, %%mm1				\n\t" //
 		"pminub %%mm1, %%mm2				\n\t"
-		"movq %%mm2, (%%eax, %1, 2)			\n\t"
+		"movq %%mm2, (%%"REG_a", %1, 2)			\n\t"
 
-		"movq (%%edx), %%mm2				\n\t" //
-		"movq (%%edx, %1), %%mm1			\n\t" //
+		"movq (%%"REG_d"), %%mm2			\n\t" //
+		"movq (%%"REG_d", %1), %%mm1			\n\t" //
 		"movq %%mm2, %%mm3				\n\t"
 		"pmaxub %%mm0, %%mm2				\n\t" //
 		"pminub %%mm3, %%mm0				\n\t" //
 		"pmaxub %%mm1, %%mm0				\n\t" //
 		"pminub %%mm0, %%mm2				\n\t"
-		"movq %%mm2, (%%edx)				\n\t"
+		"movq %%mm2, (%%"REG_d")			\n\t"
 
-		"movq (%%edx, %1, 2), %%mm2			\n\t" //
+		"movq (%%"REG_d", %1, 2), %%mm2			\n\t" //
 		"movq (%0, %1, 8), %%mm0			\n\t" //
 		"movq %%mm2, %%mm3				\n\t"
 		"pmaxub %%mm0, %%mm2				\n\t" //
 		"pminub %%mm3, %%mm0				\n\t" //
 		"pmaxub %%mm1, %%mm0				\n\t" //
 		"pminub %%mm0, %%mm2				\n\t"
-		"movq %%mm2, (%%edx, %1, 2)			\n\t"
+		"movq %%mm2, (%%"REG_d", %1, 2)			\n\t"
 
 
-		: : "r" (src), "r" (stride)
-		: "%eax", "%edx"
+		: : "r" (src), "r" ((long)stride)
+		: "%"REG_a, "%"REG_d
 	);
 
 #else // MMX without MMX2
 	asm volatile(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"			\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_d"		\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
 		"pxor %%mm7, %%mm7				\n\t"
 
-#define MEDIAN(a,b,c)\
+#define REAL_MEDIAN(a,b,c)\
 		"movq " #a ", %%mm0				\n\t"\
 		"movq " #b ", %%mm2				\n\t"\
 		"movq " #c ", %%mm1				\n\t"\
@@ -1998,14 +2031,15 @@ static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
 		"pand %%mm2, %%mm0				\n\t"\
 		"pand %%mm1, %%mm0				\n\t"\
 		"movq %%mm0, " #b "				\n\t"
+#define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c) // extra macro level: arguments are macro-expanded here (REGa/REGd are presumably macros defined elsewhere) before REAL_MEDIAN stringifies them with #
 
-MEDIAN((%0), (%%eax), (%%eax, %1))
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
-MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
-MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
+MEDIAN((%0), (%%REGa), (%%REGa, %1))
+MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
+MEDIAN((%0, %1, 4), (%%REGd), (%%REGd, %1))
+MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
 
-		: : "r" (src), "r" (stride)
-		: "%eax", "%edx"
+		: : "r" (src), "r" ((long)stride)
+		: "%"REG_a, "%"REG_d
 	);
 #endif // MMX
 #else
@@ -2039,17 +2073,17 @@ MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
 {
 	asm(
-		"leal (%0, %1), %%eax				\n\t"
+		"lea (%0, %1), %%"REG_a"	\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
 		"movq (%0), %%mm0		\n\t" // 12345678
-		"movq (%%eax), %%mm1		\n\t" // abcdefgh
+		"movq (%%"REG_a"), %%mm1	\n\t" // abcdefgh
 		"movq %%mm0, %%mm2		\n\t" // 12345678
 		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
 		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
 
-		"movq (%%eax, %1), %%mm1	\n\t"
-		"movq (%%eax, %1, 2), %%mm3	\n\t"
+		"movq (%%"REG_a", %1), %%mm1	\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm3	\n\t"
 		"movq %%mm1, %%mm4		\n\t"
 		"punpcklbw %%mm3, %%mm1		\n\t"
 		"punpckhbw %%mm3, %%mm4		\n\t"
@@ -2076,16 +2110,16 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src
 		"psrlq $32, %%mm1		\n\t"
 		"movd %%mm1, 112(%3)		\n\t"
 
-		"leal (%%eax, %1, 4), %%eax	\n\t"
+		"lea (%%"REG_a", %1, 4), %%"REG_a"	\n\t"
 		
 		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
-		"movq (%%eax), %%mm1		\n\t" // abcdefgh
+		"movq (%%"REG_a"), %%mm1	\n\t" // abcdefgh
 		"movq %%mm0, %%mm2		\n\t" // 12345678
 		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
 		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
 
-		"movq (%%eax, %1), %%mm1	\n\t"
-		"movq (%%eax, %1, 2), %%mm3	\n\t"
+		"movq (%%"REG_a", %1), %%mm1	\n\t"
+		"movq (%%"REG_a", %1, 2), %%mm3	\n\t"
 		"movq %%mm1, %%mm4		\n\t"
 		"punpcklbw %%mm3, %%mm1		\n\t"
 		"punpckhbw %%mm3, %%mm4		\n\t"
@@ -2113,8 +2147,8 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src
 		"movd %%mm1, 116(%3)		\n\t"
 
 
-	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
-	: "%eax"
+	:: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
+	: "%"REG_a
 	);
 }
 
@@ -2124,8 +2158,8 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src
 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
 {
 	asm(
-		"leal (%0, %1), %%eax				\n\t"
-		"leal (%%eax, %1, 4), %%edx			\n\t"
+		"lea (%0, %1), %%"REG_a"	\n\t"
+		"lea (%%"REG_a",%1,4), %%"REG_d"\n\t"
 //	0	1	2	3	4	5	6	7	8	9
 //	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
 		"movq (%2), %%mm0		\n\t" // 12345678
@@ -2149,16 +2183,16 @@ static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
 
 		"movd %%mm0, (%0)		\n\t"
 		"psrlq $32, %%mm0		\n\t"
-		"movd %%mm0, (%%eax)		\n\t"
-		"movd %%mm3, (%%eax, %1)	\n\t"
+		"movd %%mm0, (%%"REG_a")	\n\t"
+		"movd %%mm3, (%%"REG_a", %1)	\n\t"
 		"psrlq $32, %%mm3		\n\t"
-		"movd %%mm3, (%%eax, %1, 2)	\n\t"
+		"movd %%mm3, (%%"REG_a", %1, 2)	\n\t"
 		"movd %%mm2, (%0, %1, 4)	\n\t"
 		"psrlq $32, %%mm2		\n\t"
-		"movd %%mm2, (%%edx)		\n\t"
-		"movd %%mm1, (%%edx, %1)	\n\t"
+		"movd %%mm2, (%%"REG_d")	\n\t"
+		"movd %%mm1, (%%"REG_d", %1)	\n\t"
 		"psrlq $32, %%mm1		\n\t"
-		"movd %%mm1, (%%edx, %1, 2)	\n\t"
+		"movd %%mm1, (%%"REG_d", %1, 2)	\n\t"
 
 
 		"movq 64(%2), %%mm0		\n\t" // 12345678
@@ -2182,24 +2216,25 @@ static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
 
 		"movd %%mm0, 4(%0)		\n\t"
 		"psrlq $32, %%mm0		\n\t"
-		"movd %%mm0, 4(%%eax)		\n\t"
-		"movd %%mm3, 4(%%eax, %1)	\n\t"
+		"movd %%mm0, 4(%%"REG_a")		\n\t"
+		"movd %%mm3, 4(%%"REG_a", %1)	\n\t"
 		"psrlq $32, %%mm3		\n\t"
-		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
+		"movd %%mm3, 4(%%"REG_a", %1, 2)	\n\t"
 		"movd %%mm2, 4(%0, %1, 4)	\n\t"
 		"psrlq $32, %%mm2		\n\t"
-		"movd %%mm2, 4(%%edx)		\n\t"
-		"movd %%mm1, 4(%%edx, %1)	\n\t"
+		"movd %%mm2, 4(%%"REG_d")		\n\t"
+		"movd %%mm1, 4(%%"REG_d", %1)	\n\t"
 		"psrlq $32, %%mm1		\n\t"
-		"movd %%mm1, 4(%%edx, %1, 2)	\n\t"
+		"movd %%mm1, 4(%%"REG_d", %1, 2)	\n\t"
 
-	:: "r" (dst), "r" (dstStride), "r" (src)
-	: "%eax", "%edx"
+	:: "r" (dst), "r" ((long)dstStride), "r" (src)
+	: "%"REG_a, "%"REG_d
 	);
 }
 #endif
-//static int test=0;
+//static long test=0;
 
+#ifndef HAVE_ALTIVEC
 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
 {
@@ -2212,9 +2247,9 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 //#define L1_DIFF //u should change the thresholds too if u try that one
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
 	asm volatile(
-		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
-		"leal (%2, %2, 4), %%edx			\n\t" // 5*stride
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
+		"lea (%2, %2, 2), %%"REG_a"			\n\t" // 3*stride
+		"lea (%2, %2, 4), %%"REG_d"			\n\t" // 5*stride
+		"lea (%%"REG_d", %2, 2), %%"REG_c"		\n\t" // 7*stride
 //	0	1	2	3	4	5	6	7	8	9
 //	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+edx	%x+2eax	%x+ecx	%x+8%2
 //FIXME reorder?
@@ -2225,29 +2260,30 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
 		"movq (%0, %2, 2), %%mm2			\n\t" // L2
 		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
-		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|
+		"movq (%0, %%"REG_a"), %%mm3			\n\t" // L3
+		"psadbw (%1, %%"REG_a"), %%mm3			\n\t" // |L3-R3|
 
 		"movq (%0, %2, 4), %%mm4			\n\t" // L4
 		"paddw %%mm1, %%mm0				\n\t"
 		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
-		"movq (%0, %%edx), %%mm5			\n\t" // L5
+		"movq (%0, %%"REG_d"), %%mm5			\n\t" // L5
 		"paddw %%mm2, %%mm0				\n\t"
-		"psadbw (%1, %%edx), %%mm5			\n\t" // |L5-R5|
-		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
+		"psadbw (%1, %%"REG_d"), %%mm5			\n\t" // |L5-R5|
+		"movq (%0, %%"REG_a", 2), %%mm6			\n\t" // L6
 		"paddw %%mm3, %%mm0				\n\t"
-		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
-		"movq (%0, %%ecx), %%mm7			\n\t" // L7
+		"psadbw (%1, %%"REG_a", 2), %%mm6		\n\t" // |L6-R6|
+		"movq (%0, %%"REG_c"), %%mm7			\n\t" // L7
 		"paddw %%mm4, %%mm0				\n\t"
-		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
+		"psadbw (%1, %%"REG_c"), %%mm7			\n\t" // |L7-R7|
 		"paddw %%mm5, %%mm6				\n\t"
 		"paddw %%mm7, %%mm6				\n\t"
 		"paddw %%mm6, %%mm0				\n\t"
-#elif defined (FAST_L2_DIFF)
+#else
+#if defined (FAST_L2_DIFF)
 		"pcmpeqb %%mm7, %%mm7				\n\t"
 		"movq "MANGLE(b80)", %%mm6			\n\t"
 		"pxor %%mm0, %%mm0				\n\t"
-#define L2_DIFF_CORE(a, b)\
+#define REAL_L2_DIFF_CORE(a, b)\
 		"movq " #a ", %%mm5				\n\t"\
 		"movq " #b ", %%mm2				\n\t"\
 		"pxor %%mm7, %%mm2				\n\t"\
@@ -2261,19 +2297,10 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 		"psrld $14, %%mm5				\n\t"\
 		"paddd %%mm5, %%mm0				\n\t"
 
-L2_DIFF_CORE((%0), (%1))
-L2_DIFF_CORE((%0, %2), (%1, %2))
-L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
-L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
-L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
-L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
-L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
-L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
-
 #else
 		"pxor %%mm7, %%mm7				\n\t"
 		"pxor %%mm0, %%mm0				\n\t"
-#define L2_DIFF_CORE(a, b)\
+#define REAL_L2_DIFF_CORE(a, b)\
 		"movq " #a ", %%mm5				\n\t"\
 		"movq " #b ", %%mm2				\n\t"\
 		"movq %%mm5, %%mm1				\n\t"\
@@ -2289,14 +2316,18 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 		"paddd %%mm1, %%mm5				\n\t"\
 		"paddd %%mm5, %%mm0				\n\t"
 
+#endif
+
+#define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)
+
 L2_DIFF_CORE((%0), (%1))
 L2_DIFF_CORE((%0, %2), (%1, %2))
 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
-L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
+L2_DIFF_CORE((%0, %%REGa), (%1, %%REGa))
 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
-L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
-L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
-L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
+L2_DIFF_CORE((%0, %%REGd), (%1, %%REGd))
+L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
+L2_DIFF_CORE((%0, %%REGc), (%1, %%REGc))
 
 #endif
 
@@ -2305,94 +2336,94 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 		"paddd %%mm0, %%mm4				\n\t"
 		"movd %%mm4, %%ecx				\n\t"
 		"shll $2, %%ecx					\n\t"
-		"movl %3, %%edx					\n\t"
-		"addl -4(%%edx), %%ecx				\n\t"
-		"addl 4(%%edx), %%ecx				\n\t"
-		"addl -1024(%%edx), %%ecx			\n\t"
+		"mov %3, %%"REG_d"				\n\t"
+		"addl -4(%%"REG_d"), %%ecx			\n\t"
+		"addl 4(%%"REG_d"), %%ecx			\n\t"
+		"addl -1024(%%"REG_d"), %%ecx			\n\t"
 		"addl $4, %%ecx					\n\t"
-		"addl 1024(%%edx), %%ecx			\n\t"
+		"addl 1024(%%"REG_d"), %%ecx			\n\t"
 		"shrl $3, %%ecx					\n\t"
-		"movl %%ecx, (%%edx)				\n\t"
+		"movl %%ecx, (%%"REG_d")			\n\t"
 
-//		"movl %3, %%ecx					\n\t"
-//		"movl %%ecx, test				\n\t"
+//		"mov %3, %%"REG_c"				\n\t"
+//		"mov %%"REG_c", test				\n\t"
 //		"jmp 4f \n\t"
-		"cmpl 512(%%edx), %%ecx				\n\t"
+		"cmpl 512(%%"REG_d"), %%ecx			\n\t"
 		" jb 2f						\n\t"
-		"cmpl 516(%%edx), %%ecx				\n\t"
+		"cmpl 516(%%"REG_d"), %%ecx			\n\t"
 		" jb 1f						\n\t"
 
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
+		"lea (%%"REG_a", %2, 2), %%"REG_d"		\n\t" // 5*stride
+		"lea (%%"REG_d", %2, 2), %%"REG_c"		\n\t" // 7*stride
 		"movq (%0), %%mm0				\n\t" // L0
 		"movq (%0, %2), %%mm1				\n\t" // L1
 		"movq (%0, %2, 2), %%mm2			\n\t" // L2
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
+		"movq (%0, %%"REG_a"), %%mm3			\n\t" // L3
 		"movq (%0, %2, 4), %%mm4			\n\t" // L4
-		"movq (%0, %%edx), %%mm5			\n\t" // L5
-		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
-		"movq (%0, %%ecx), %%mm7			\n\t" // L7
+		"movq (%0, %%"REG_d"), %%mm5			\n\t" // L5
+		"movq (%0, %%"REG_a", 2), %%mm6			\n\t" // L6
+		"movq (%0, %%"REG_c"), %%mm7			\n\t" // L7
 		"movq %%mm0, (%1)				\n\t" // L0
 		"movq %%mm1, (%1, %2)				\n\t" // L1
 		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
-		"movq %%mm3, (%1, %%eax)			\n\t" // L3
+		"movq %%mm3, (%1, %%"REG_a")			\n\t" // L3
 		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
-		"movq %%mm5, (%1, %%edx)			\n\t" // L5
-		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
-		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
+		"movq %%mm5, (%1, %%"REG_d")			\n\t" // L5
+		"movq %%mm6, (%1, %%"REG_a", 2)			\n\t" // L6
+		"movq %%mm7, (%1, %%"REG_c")			\n\t" // L7
 		"jmp 4f						\n\t"
 
 		"1:						\n\t"
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
+		"lea (%%"REG_a", %2, 2), %%"REG_d"		\n\t" // 5*stride
+		"lea (%%"REG_d", %2, 2), %%"REG_c"		\n\t" // 7*stride
 		"movq (%0), %%mm0				\n\t" // L0
 		PAVGB((%1), %%mm0)				      // L0
 		"movq (%0, %2), %%mm1				\n\t" // L1
 		PAVGB((%1, %2), %%mm1)				      // L1
 		"movq (%0, %2, 2), %%mm2			\n\t" // L2
 		PAVGB((%1, %2, 2), %%mm2)			      // L2
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
-		PAVGB((%1, %%eax), %%mm3)			      // L3
+		"movq (%0, %%"REG_a"), %%mm3			\n\t" // L3
+		PAVGB((%1, %%REGa), %%mm3)			      // L3
 		"movq (%0, %2, 4), %%mm4			\n\t" // L4
 		PAVGB((%1, %2, 4), %%mm4)			      // L4
-		"movq (%0, %%edx), %%mm5			\n\t" // L5
-		PAVGB((%1, %%edx), %%mm5)			      // L5
-		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
-		PAVGB((%1, %%eax, 2), %%mm6)			      // L6
-		"movq (%0, %%ecx), %%mm7			\n\t" // L7
-		PAVGB((%1, %%ecx), %%mm7)			      // L7
+		"movq (%0, %%"REG_d"), %%mm5			\n\t" // L5
+		PAVGB((%1, %%REGd), %%mm5)			      // L5
+		"movq (%0, %%"REG_a", 2), %%mm6			\n\t" // L6
+		PAVGB((%1, %%REGa, 2), %%mm6)			      // L6
+		"movq (%0, %%"REG_c"), %%mm7			\n\t" // L7
+		PAVGB((%1, %%REGc), %%mm7)			      // L7
 		"movq %%mm0, (%1)				\n\t" // R0
 		"movq %%mm1, (%1, %2)				\n\t" // R1
 		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
-		"movq %%mm3, (%1, %%eax)			\n\t" // R3
+		"movq %%mm3, (%1, %%"REG_a")			\n\t" // R3
 		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
-		"movq %%mm5, (%1, %%edx)			\n\t" // R5
-		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
-		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
+		"movq %%mm5, (%1, %%"REG_d")			\n\t" // R5
+		"movq %%mm6, (%1, %%"REG_a", 2)			\n\t" // R6
+		"movq %%mm7, (%1, %%"REG_c")			\n\t" // R7
 		"movq %%mm0, (%0)				\n\t" // L0
 		"movq %%mm1, (%0, %2)				\n\t" // L1
 		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
-		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+		"movq %%mm3, (%0, %%"REG_a")			\n\t" // L3
 		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
-		"movq %%mm5, (%0, %%edx)			\n\t" // L5
-		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
-		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
+		"movq %%mm5, (%0, %%"REG_d")			\n\t" // L5
+		"movq %%mm6, (%0, %%"REG_a", 2)			\n\t" // L6
+		"movq %%mm7, (%0, %%"REG_c")			\n\t" // L7
 		"jmp 4f						\n\t"
 
 		"2:						\n\t"
-		"cmpl 508(%%edx), %%ecx				\n\t"
+		"cmpl 508(%%"REG_d"), %%ecx			\n\t"
 		" jb 3f						\n\t"
 
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
+		"lea (%%"REG_a", %2, 2), %%"REG_d"		\n\t" // 5*stride
+		"lea (%%"REG_d", %2, 2), %%"REG_c"		\n\t" // 7*stride
 		"movq (%0), %%mm0				\n\t" // L0
 		"movq (%0, %2), %%mm1				\n\t" // L1
 		"movq (%0, %2, 2), %%mm2			\n\t" // L2
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
+		"movq (%0, %%"REG_a"), %%mm3			\n\t" // L3
 		"movq (%1), %%mm4				\n\t" // R0
 		"movq (%1, %2), %%mm5				\n\t" // R1
 		"movq (%1, %2, 2), %%mm6			\n\t" // R2
-		"movq (%1, %%eax), %%mm7			\n\t" // R3
+		"movq (%1, %%"REG_a"), %%mm7			\n\t" // R3
 		PAVGB(%%mm4, %%mm0)
 		PAVGB(%%mm5, %%mm1)
 		PAVGB(%%mm6, %%mm2)
@@ -2404,20 +2435,20 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 		"movq %%mm0, (%1)				\n\t" // R0
 		"movq %%mm1, (%1, %2)				\n\t" // R1
 		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
-		"movq %%mm3, (%1, %%eax)			\n\t" // R3
+		"movq %%mm3, (%1, %%"REG_a")			\n\t" // R3
 		"movq %%mm0, (%0)				\n\t" // L0
 		"movq %%mm1, (%0, %2)				\n\t" // L1
 		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
-		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+		"movq %%mm3, (%0, %%"REG_a")			\n\t" // L3
 
 		"movq (%0, %2, 4), %%mm0			\n\t" // L4
-		"movq (%0, %%edx), %%mm1			\n\t" // L5
-		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
-		"movq (%0, %%ecx), %%mm3			\n\t" // L7
+		"movq (%0, %%"REG_d"), %%mm1			\n\t" // L5
+		"movq (%0, %%"REG_a", 2), %%mm2			\n\t" // L6
+		"movq (%0, %%"REG_c"), %%mm3			\n\t" // L7
 		"movq (%1, %2, 4), %%mm4			\n\t" // R4
-		"movq (%1, %%edx), %%mm5			\n\t" // R5
-		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
-		"movq (%1, %%ecx), %%mm7			\n\t" // R7
+		"movq (%1, %%"REG_d"), %%mm5			\n\t" // R5
+		"movq (%1, %%"REG_a", 2), %%mm6			\n\t" // R6
+		"movq (%1, %%"REG_c"), %%mm7			\n\t" // R7
 		PAVGB(%%mm4, %%mm0)
 		PAVGB(%%mm5, %%mm1)
 		PAVGB(%%mm6, %%mm2)
@@ -2427,26 +2458,26 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 		PAVGB(%%mm6, %%mm2)
 		PAVGB(%%mm7, %%mm3)
 		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
-		"movq %%mm1, (%1, %%edx)			\n\t" // R5
-		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
-		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
+		"movq %%mm1, (%1, %%"REG_d")			\n\t" // R5
+		"movq %%mm2, (%1, %%"REG_a", 2)			\n\t" // R6
+		"movq %%mm3, (%1, %%"REG_c")			\n\t" // R7
 		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
-		"movq %%mm1, (%0, %%edx)			\n\t" // L5
-		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
-		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
+		"movq %%mm1, (%0, %%"REG_d")			\n\t" // L5
+		"movq %%mm2, (%0, %%"REG_a", 2)			\n\t" // L6
+		"movq %%mm3, (%0, %%"REG_c")			\n\t" // L7
 		"jmp 4f						\n\t"
 
 		"3:						\n\t"
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
+		"lea (%%"REG_a", %2, 2), %%"REG_d"		\n\t" // 5*stride
+		"lea (%%"REG_d", %2, 2), %%"REG_c"		\n\t" // 7*stride
 		"movq (%0), %%mm0				\n\t" // L0
 		"movq (%0, %2), %%mm1				\n\t" // L1
 		"movq (%0, %2, 2), %%mm2			\n\t" // L2
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
+		"movq (%0, %%"REG_a"), %%mm3			\n\t" // L3
 		"movq (%1), %%mm4				\n\t" // R0
 		"movq (%1, %2), %%mm5				\n\t" // R1
 		"movq (%1, %2, 2), %%mm6			\n\t" // R2
-		"movq (%1, %%eax), %%mm7			\n\t" // R3
+		"movq (%1, %%"REG_a"), %%mm7			\n\t" // R3
 		PAVGB(%%mm4, %%mm0)
 		PAVGB(%%mm5, %%mm1)
 		PAVGB(%%mm6, %%mm2)
@@ -2462,20 +2493,20 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 		"movq %%mm0, (%1)				\n\t" // R0
 		"movq %%mm1, (%1, %2)				\n\t" // R1
 		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
-		"movq %%mm3, (%1, %%eax)			\n\t" // R3
+		"movq %%mm3, (%1, %%"REG_a")			\n\t" // R3
 		"movq %%mm0, (%0)				\n\t" // L0
 		"movq %%mm1, (%0, %2)				\n\t" // L1
 		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
-		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+		"movq %%mm3, (%0, %%"REG_a")			\n\t" // L3
 
 		"movq (%0, %2, 4), %%mm0			\n\t" // L4
-		"movq (%0, %%edx), %%mm1			\n\t" // L5
-		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
-		"movq (%0, %%ecx), %%mm3			\n\t" // L7
+		"movq (%0, %%"REG_d"), %%mm1			\n\t" // L5
+		"movq (%0, %%"REG_a", 2), %%mm2			\n\t" // L6
+		"movq (%0, %%"REG_c"), %%mm3			\n\t" // L7
 		"movq (%1, %2, 4), %%mm4			\n\t" // R4
-		"movq (%1, %%edx), %%mm5			\n\t" // R5
-		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
-		"movq (%1, %%ecx), %%mm7			\n\t" // R7
+		"movq (%1, %%"REG_d"), %%mm5			\n\t" // R5
+		"movq (%1, %%"REG_a", 2), %%mm6			\n\t" // R6
+		"movq (%1, %%"REG_c"), %%mm7			\n\t" // R7
 		PAVGB(%%mm4, %%mm0)
 		PAVGB(%%mm5, %%mm1)
 		PAVGB(%%mm6, %%mm2)
@@ -2489,25 +2520,25 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 		PAVGB(%%mm6, %%mm2)
 		PAVGB(%%mm7, %%mm3)
 		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
-		"movq %%mm1, (%1, %%edx)			\n\t" // R5
-		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
-		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
+		"movq %%mm1, (%1, %%"REG_d")			\n\t" // R5
+		"movq %%mm2, (%1, %%"REG_a", 2)			\n\t" // R6
+		"movq %%mm3, (%1, %%"REG_c")			\n\t" // R7
 		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
-		"movq %%mm1, (%0, %%edx)			\n\t" // L5
-		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
-		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
+		"movq %%mm1, (%0, %%"REG_d")			\n\t" // L5
+		"movq %%mm2, (%0, %%"REG_a", 2)			\n\t" // L6
+		"movq %%mm3, (%0, %%"REG_c")			\n\t" // L7
 
 		"4:						\n\t"
 
-		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
-		: "%eax", "%edx", "%ecx", "memory"
+		:: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
+		: "%"REG_a, "%"REG_d, "%"REG_c, "memory"
 		);
 //printf("%d\n", test);
 #else
 {
 	int y;
 	int d=0;
-	int sysd=0;
+//	int sysd=0;
 	int i;
 
 	for(y=0; y<8; y++)
@@ -2522,7 +2553,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 //			if(y==0 || y==7) d1+= d1>>1;
 //			d+= ABS(d1);
 			d+= d1*d1;
-			sysd+= d1;
+//			sysd+= d1;
 		}
 	}
 	i=d;
@@ -2608,6 +2639,543 @@ Switch between
 }
 #endif
 }
+#endif //HAVE_ALTIVEC
+
+#ifdef HAVE_MMX
+/**
+ * accurate deblock filter
+ */
+static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
+	int64_t dc_mask, eq_mask;
+	int64_t sums[10*8*2];
+	src+= step*3; // src points to begin of the 8x8 Block
+//START_TIMER
+asm volatile(
+		"movq %0, %%mm7					\n\t" 
+		"movq %1, %%mm6					\n\t" 
+                : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
+                );
+                
+asm volatile(
+		"lea (%2, %3), %%"REG_a"			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%2	eax	eax+%3	eax+2%3	%2+4%3	eax	eax+%3	eax+2%3	%2+8%3	eax+4%3	(eax is advanced by 4*%3 after line 3)
+
+		"movq (%2), %%mm0				\n\t"
+		"movq (%%"REG_a"), %%mm1			\n\t"
+                "movq %%mm1, %%mm3				\n\t"
+                "movq %%mm1, %%mm4				\n\t"
+		"psubb %%mm1, %%mm0				\n\t" // mm0 = difference
+		"paddb %%mm7, %%mm0				\n\t"
+		"pcmpgtb %%mm6, %%mm0				\n\t"
+
+		"movq (%%"REG_a",%3), %%mm2			\n\t"
+                PMAXUB(%%mm2, %%mm4)
+                PMINUB(%%mm2, %%mm3, %%mm5)
+		"psubb %%mm2, %%mm1				\n\t"
+		"paddb %%mm7, %%mm1				\n\t"
+		"pcmpgtb %%mm6, %%mm1				\n\t"
+		"paddb %%mm1, %%mm0				\n\t"
+
+		"movq (%%"REG_a", %3, 2), %%mm1			\n\t"
+                PMAXUB(%%mm1, %%mm4)
+                PMINUB(%%mm1, %%mm3, %%mm5)
+		"psubb %%mm1, %%mm2				\n\t"
+		"paddb %%mm7, %%mm2				\n\t"
+		"pcmpgtb %%mm6, %%mm2				\n\t"
+		"paddb %%mm2, %%mm0				\n\t"
+		
+		"lea (%%"REG_a", %3, 4), %%"REG_a"		\n\t"
+
+		"movq (%2, %3, 4), %%mm2			\n\t"
+                PMAXUB(%%mm2, %%mm4)
+                PMINUB(%%mm2, %%mm3, %%mm5)
+		"psubb %%mm2, %%mm1				\n\t"
+		"paddb %%mm7, %%mm1				\n\t"
+		"pcmpgtb %%mm6, %%mm1				\n\t"
+		"paddb %%mm1, %%mm0				\n\t"
+
+		"movq (%%"REG_a"), %%mm1			\n\t"
+                PMAXUB(%%mm1, %%mm4)
+                PMINUB(%%mm1, %%mm3, %%mm5)
+		"psubb %%mm1, %%mm2				\n\t"
+		"paddb %%mm7, %%mm2				\n\t"
+		"pcmpgtb %%mm6, %%mm2				\n\t"
+		"paddb %%mm2, %%mm0				\n\t"
+
+		"movq (%%"REG_a", %3), %%mm2			\n\t"
+                PMAXUB(%%mm2, %%mm4)
+                PMINUB(%%mm2, %%mm3, %%mm5)
+		"psubb %%mm2, %%mm1				\n\t"
+		"paddb %%mm7, %%mm1				\n\t"
+		"pcmpgtb %%mm6, %%mm1				\n\t"
+		"paddb %%mm1, %%mm0				\n\t"
+
+		"movq (%%"REG_a", %3, 2), %%mm1			\n\t"
+                PMAXUB(%%mm1, %%mm4)
+                PMINUB(%%mm1, %%mm3, %%mm5)
+		"psubb %%mm1, %%mm2				\n\t"
+		"paddb %%mm7, %%mm2				\n\t"
+		"pcmpgtb %%mm6, %%mm2				\n\t"
+		"paddb %%mm2, %%mm0				\n\t"
+
+		"movq (%2, %3, 8), %%mm2			\n\t"
+                PMAXUB(%%mm2, %%mm4)
+                PMINUB(%%mm2, %%mm3, %%mm5)
+		"psubb %%mm2, %%mm1				\n\t"
+		"paddb %%mm7, %%mm1				\n\t"
+		"pcmpgtb %%mm6, %%mm1				\n\t"
+		"paddb %%mm1, %%mm0				\n\t"
+
+		"movq (%%"REG_a", %3, 4), %%mm1			\n\t"
+		"psubb %%mm1, %%mm2				\n\t"
+		"paddb %%mm7, %%mm2				\n\t"
+		"pcmpgtb %%mm6, %%mm2				\n\t"
+		"paddb %%mm2, %%mm0				\n\t"
+		"psubusb %%mm3, %%mm4				\n\t"
+
+		"pxor %%mm6, %%mm6				\n\t"
+                "movq %4, %%mm7					\n\t" // QP,..., QP
+		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
+		"psubusb %%mm4, %%mm7				\n\t" // Diff >=2QP -> 0
+		"pcmpeqb %%mm6, %%mm7				\n\t" // Diff < 2QP -> 0
+		"pcmpeqb %%mm6, %%mm7				\n\t" // Diff < 2QP -> FF
+		"movq %%mm7, %1					\n\t"
+
+		"movq %5, %%mm7					\n\t"
+		"punpcklbw %%mm7, %%mm7				\n\t"
+		"punpcklbw %%mm7, %%mm7				\n\t"
+		"punpcklbw %%mm7, %%mm7				\n\t"
+		"psubb %%mm0, %%mm6				\n\t"
+		"pcmpgtb %%mm7, %%mm6				\n\t"
+		"movq %%mm6, %0					\n\t"
+
+		: "=m" (eq_mask), "=m" (dc_mask)
+		: "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
+		: "%"REG_a
+		);
+
+	if(dc_mask & eq_mask){
+		long offset= -8*step;
+		int64_t *temp_sums= sums;
+
+		asm volatile(
+		"movq %2, %%mm0					\n\t"  // QP,..., QP
+		"pxor %%mm4, %%mm4				\n\t"
+
+		"movq (%0), %%mm6				\n\t"
+		"movq (%0, %1), %%mm5				\n\t"
+		"movq %%mm5, %%mm1				\n\t"
+		"movq %%mm6, %%mm2				\n\t"
+		"psubusb %%mm6, %%mm5				\n\t"
+		"psubusb %%mm1, %%mm2				\n\t"
+		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
+		"psubusb %%mm2, %%mm0				\n\t" // diff >= QP -> 0
+		"pcmpeqb %%mm4, %%mm0				\n\t" // diff >= QP -> FF
+
+		"pxor %%mm6, %%mm1				\n\t"
+		"pand %%mm0, %%mm1				\n\t"
+		"pxor %%mm1, %%mm6				\n\t"
+		// 0:QP  6:First
+
+		"movq (%0, %1, 8), %%mm5			\n\t"
+		"add %1, %0					\n\t" // %0 points to line 1 not 0
+		"movq (%0, %1, 8), %%mm7			\n\t"
+		"movq %%mm5, %%mm1				\n\t"
+		"movq %%mm7, %%mm2				\n\t"
+		"psubusb %%mm7, %%mm5				\n\t"
+		"psubusb %%mm1, %%mm2				\n\t"
+		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
+		"movq %2, %%mm0					\n\t"  // QP,..., QP
+		"psubusb %%mm2, %%mm0				\n\t" // diff >= QP -> 0
+		"pcmpeqb %%mm4, %%mm0				\n\t" // diff >= QP -> FF
+
+		"pxor %%mm7, %%mm1				\n\t"
+		"pand %%mm0, %%mm1				\n\t"
+		"pxor %%mm1, %%mm7				\n\t"
+		
+		"movq %%mm6, %%mm5				\n\t"
+		"punpckhbw %%mm4, %%mm6				\n\t"
+		"punpcklbw %%mm4, %%mm5				\n\t"
+		// 4:0 5/6:First 7:Last
+
+		"movq %%mm5, %%mm0				\n\t"
+		"movq %%mm6, %%mm1				\n\t"
+		"psllw $2, %%mm0				\n\t"
+		"psllw $2, %%mm1				\n\t"
+		"paddw "MANGLE(w04)", %%mm0			\n\t"
+		"paddw "MANGLE(w04)", %%mm1			\n\t"
+
+#define NEXT\
+		"movq (%0), %%mm2				\n\t"\
+		"movq (%0), %%mm3				\n\t"\
+		"add %1, %0					\n\t"\
+		"punpcklbw %%mm4, %%mm2				\n\t"\
+		"punpckhbw %%mm4, %%mm3				\n\t"\
+		"paddw %%mm2, %%mm0				\n\t"\
+		"paddw %%mm3, %%mm1				\n\t"
+
+#define PREV\
+		"movq (%0), %%mm2				\n\t"\
+		"movq (%0), %%mm3				\n\t"\
+		"add %1, %0					\n\t"\
+		"punpcklbw %%mm4, %%mm2				\n\t"\
+		"punpckhbw %%mm4, %%mm3				\n\t"\
+		"psubw %%mm2, %%mm0				\n\t"\
+		"psubw %%mm3, %%mm1				\n\t"
+
+				
+		NEXT //0
+		NEXT //1
+		NEXT //2
+		"movq %%mm0, (%3)				\n\t"
+		"movq %%mm1, 8(%3)				\n\t"
+
+		NEXT //3
+		"psubw %%mm5, %%mm0				\n\t"
+		"psubw %%mm6, %%mm1				\n\t"
+		"movq %%mm0, 16(%3)				\n\t"
+		"movq %%mm1, 24(%3)				\n\t"
+
+		NEXT //4
+		"psubw %%mm5, %%mm0				\n\t"
+		"psubw %%mm6, %%mm1				\n\t"
+		"movq %%mm0, 32(%3)				\n\t"
+		"movq %%mm1, 40(%3)				\n\t"
+
+		NEXT //5
+		"psubw %%mm5, %%mm0				\n\t"
+		"psubw %%mm6, %%mm1				\n\t"
+		"movq %%mm0, 48(%3)				\n\t"
+		"movq %%mm1, 56(%3)				\n\t"
+
+		NEXT //6
+		"psubw %%mm5, %%mm0				\n\t"
+		"psubw %%mm6, %%mm1				\n\t"
+		"movq %%mm0, 64(%3)				\n\t"
+		"movq %%mm1, 72(%3)				\n\t"
+
+		"movq %%mm7, %%mm6				\n\t"
+		"punpckhbw %%mm4, %%mm7				\n\t"
+		"punpcklbw %%mm4, %%mm6				\n\t"
+		
+		NEXT //7
+		"mov %4, %0					\n\t"
+		"add %1, %0					\n\t"
+		PREV //0
+		"movq %%mm0, 80(%3)				\n\t"
+		"movq %%mm1, 88(%3)				\n\t"
+
+		PREV //1
+		"paddw %%mm6, %%mm0				\n\t"
+		"paddw %%mm7, %%mm1				\n\t"
+		"movq %%mm0, 96(%3)				\n\t"
+		"movq %%mm1, 104(%3)				\n\t"
+		
+		PREV //2
+		"paddw %%mm6, %%mm0				\n\t"
+		"paddw %%mm7, %%mm1				\n\t"
+		"movq %%mm0, 112(%3)				\n\t"
+		"movq %%mm1, 120(%3)				\n\t"
+
+		PREV //3
+		"paddw %%mm6, %%mm0				\n\t"
+		"paddw %%mm7, %%mm1				\n\t"
+		"movq %%mm0, 128(%3)				\n\t"
+		"movq %%mm1, 136(%3)				\n\t"
+
+		PREV //4
+		"paddw %%mm6, %%mm0				\n\t"
+		"paddw %%mm7, %%mm1				\n\t"
+		"movq %%mm0, 144(%3)				\n\t"
+		"movq %%mm1, 152(%3)				\n\t"
+
+		"mov %4, %0					\n\t" //FIXME
+
+		: "+&r"(src)
+		: "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
+		);
+
+		src+= step; // src points to begin of the 8x8 Block
+
+		asm volatile(
+		"movq %4, %%mm6					\n\t"
+		"pcmpeqb %%mm5, %%mm5				\n\t"
+		"pxor %%mm6, %%mm5				\n\t"
+		"pxor %%mm7, %%mm7				\n\t"
+
+		"1:						\n\t"
+		"movq (%1), %%mm0				\n\t"
+		"movq 8(%1), %%mm1				\n\t"
+		"paddw 32(%1), %%mm0				\n\t"
+		"paddw 40(%1), %%mm1				\n\t"
+		"movq (%0, %3), %%mm2				\n\t"
+		"movq %%mm2, %%mm3				\n\t"
+		"movq %%mm2, %%mm4				\n\t"
+		"punpcklbw %%mm7, %%mm2				\n\t"
+		"punpckhbw %%mm7, %%mm3				\n\t"
+		"paddw %%mm2, %%mm0				\n\t"
+		"paddw %%mm3, %%mm1				\n\t"
+		"paddw %%mm2, %%mm0				\n\t"
+		"paddw %%mm3, %%mm1				\n\t"
+		"psrlw $4, %%mm0				\n\t"
+		"psrlw $4, %%mm1				\n\t"
+		"packuswb %%mm1, %%mm0				\n\t"
+		"pand %%mm6, %%mm0				\n\t"
+		"pand %%mm5, %%mm4				\n\t"
+		"por %%mm4, %%mm0				\n\t"
+		"movq %%mm0, (%0, %3)				\n\t"
+		"add $16, %1					\n\t"
+		"add %2, %0					\n\t"
+		" js 1b						\n\t"
+
+		: "+r"(offset), "+r"(temp_sums)
+		: "r" ((long)step), "r"(src - offset), "m"(dc_mask & eq_mask)
+		);
+	}else
+		src+= step; // src points to begin of the 8x8 Block
+
+	if(eq_mask != -1LL){
+		uint8_t *temp_src= src;
+		asm volatile(
+		"pxor %%mm7, %%mm7				\n\t"
+		"lea -40(%%"REG_SP"), %%"REG_c"			\n\t" // make space for 4 8-byte vars
+		"and "ALIGN_MASK", %%"REG_c"			\n\t" // align
+//	0	1	2	3	4	5	6	7
+//	%0	%0+%1	eax	eax+%1	eax+2%1	%0+2%1	eax+4%1	%0+4%1	(eax = line 2; %0 is moved to line 3 before lines 5 and 7; ecx = aligned scratch)
+
+		"movq (%0), %%mm0				\n\t"
+		"movq %%mm0, %%mm1				\n\t"
+		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
+		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
+
+		"movq (%0, %1), %%mm2				\n\t"
+		"lea (%0, %1, 2), %%"REG_a"			\n\t"
+		"movq %%mm2, %%mm3				\n\t"
+		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
+		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
+
+		"movq (%%"REG_a"), %%mm4			\n\t"
+		"movq %%mm4, %%mm5				\n\t"
+		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
+		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
+
+		"paddw %%mm0, %%mm0				\n\t" // 2L0
+		"paddw %%mm1, %%mm1				\n\t" // 2H0
+		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
+		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2
+
+		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
+		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
+
+		"movq (%%"REG_a", %1), %%mm2			\n\t"
+		"movq %%mm2, %%mm3				\n\t"
+		"punpcklbw %%mm7, %%mm2				\n\t" // L3
+		"punpckhbw %%mm7, %%mm3				\n\t" // H3
+
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+		"movq %%mm0, (%%"REG_c")			\n\t" // 2L0 - 5L1 + 5L2 - 2L3
+		"movq %%mm1, 8(%%"REG_c")			\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+
+		"movq (%%"REG_a", %1, 2), %%mm0			\n\t"
+		"movq %%mm0, %%mm1				\n\t"
+		"punpcklbw %%mm7, %%mm0				\n\t" // L4
+		"punpckhbw %%mm7, %%mm1				\n\t" // H4
+
+		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
+		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
+		"movq %%mm2, 16(%%"REG_c")			\n\t" // L3 - L4
+		"movq %%mm3, 24(%%"REG_c")			\n\t" // H3 - H4
+		"paddw %%mm4, %%mm4				\n\t" // 2L2
+		"paddw %%mm5, %%mm5				\n\t" // 2H2
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
+
+		"lea (%%"REG_a", %1), %0			\n\t"
+		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
+		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
+//50 opcodes so far
+		"movq (%0, %1, 2), %%mm2			\n\t"
+		"movq %%mm2, %%mm3				\n\t"
+		"punpcklbw %%mm7, %%mm2				\n\t" // L5
+		"punpckhbw %%mm7, %%mm3				\n\t" // H5
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
+
+		"movq (%%"REG_a", %1, 4), %%mm6			\n\t"
+		"punpcklbw %%mm7, %%mm6				\n\t" // L6
+		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
+		"movq (%%"REG_a", %1, 4), %%mm6			\n\t"
+		"punpckhbw %%mm7, %%mm6				\n\t" // H6
+		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
+
+		"paddw %%mm0, %%mm0				\n\t" // 2L4
+		"paddw %%mm1, %%mm1				\n\t" // 2H4
+		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
+		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6
+
+		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
+		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
+		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
+		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6
+
+		"movq (%0, %1, 4), %%mm2			\n\t"
+		"movq %%mm2, %%mm3				\n\t"
+		"punpcklbw %%mm7, %%mm2				\n\t" // L7
+		"punpckhbw %%mm7, %%mm3				\n\t" // H7
+
+		"paddw %%mm2, %%mm2				\n\t" // 2L7
+		"paddw %%mm3, %%mm3				\n\t" // 2H7
+		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
+		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
+
+		"movq (%%"REG_c"), %%mm2			\n\t" // 2L0 - 5L1 + 5L2 - 2L3
+		"movq 8(%%"REG_c"), %%mm3			\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+
+#ifdef HAVE_MMX2
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"psubw %%mm0, %%mm6				\n\t"
+		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"psubw %%mm1, %%mm6				\n\t"
+		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"psubw %%mm2, %%mm6				\n\t"
+		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"psubw %%mm3, %%mm6				\n\t"
+		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
+#else
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"pcmpgtw %%mm0, %%mm6				\n\t"
+		"pxor %%mm6, %%mm0				\n\t"
+		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"pcmpgtw %%mm1, %%mm6				\n\t"
+		"pxor %%mm6, %%mm1				\n\t"
+		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"pcmpgtw %%mm2, %%mm6				\n\t"
+		"pxor %%mm6, %%mm2				\n\t"
+		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"pcmpgtw %%mm3, %%mm6				\n\t"
+		"pxor %%mm6, %%mm3				\n\t"
+		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
+#endif
+
+#ifdef HAVE_MMX2
+		"pminsw %%mm2, %%mm0				\n\t"
+		"pminsw %%mm3, %%mm1				\n\t"
+#else
+		"movq %%mm0, %%mm6				\n\t"
+		"psubusw %%mm2, %%mm6				\n\t"
+		"psubw %%mm6, %%mm0				\n\t"
+		"movq %%mm1, %%mm6				\n\t"
+		"psubusw %%mm3, %%mm6				\n\t"
+		"psubw %%mm6, %%mm1				\n\t"
+#endif
+
+		"movd %2, %%mm2					\n\t" // QP
+		"punpcklbw %%mm7, %%mm2				\n\t"
+
+		"movq %%mm7, %%mm6				\n\t" // 0
+		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
+		"pxor %%mm6, %%mm4				\n\t"
+		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
+		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
+		"pxor %%mm7, %%mm5				\n\t"
+		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
+// 100 opcodes
+		"psllw $3, %%mm2				\n\t" // 8QP
+		"movq %%mm2, %%mm3				\n\t" // 8QP
+		"pcmpgtw %%mm4, %%mm2				\n\t"
+		"pcmpgtw %%mm5, %%mm3				\n\t"
+		"pand %%mm2, %%mm4				\n\t"
+		"pand %%mm3, %%mm5				\n\t"
+
+
+		"psubusw %%mm0, %%mm4				\n\t" // hd
+		"psubusw %%mm1, %%mm5				\n\t" // ld
+
+
+		"movq "MANGLE(w05)", %%mm2			\n\t" // 5
+		"pmullw %%mm2, %%mm4				\n\t"
+		"pmullw %%mm2, %%mm5				\n\t"
+		"movq "MANGLE(w20)", %%mm2			\n\t" // 32
+		"paddw %%mm2, %%mm4				\n\t"
+		"paddw %%mm2, %%mm5				\n\t"
+		"psrlw $6, %%mm4				\n\t"
+		"psrlw $6, %%mm5				\n\t"
+
+		"movq 16(%%"REG_c"), %%mm0			\n\t" // L3 - L4
+		"movq 24(%%"REG_c"), %%mm1			\n\t" // H3 - H4
+
+		"pxor %%mm2, %%mm2				\n\t"
+		"pxor %%mm3, %%mm3				\n\t"
+
+		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
+		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
+		"pxor %%mm2, %%mm0				\n\t"
+		"pxor %%mm3, %%mm1				\n\t"
+		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
+		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
+		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
+		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2
+
+		"pxor %%mm6, %%mm2				\n\t"
+		"pxor %%mm7, %%mm3				\n\t"
+		"pand %%mm2, %%mm4				\n\t"
+		"pand %%mm3, %%mm5				\n\t"
+
+#ifdef HAVE_MMX2
+		"pminsw %%mm0, %%mm4				\n\t"
+		"pminsw %%mm1, %%mm5				\n\t"
+#else
+		"movq %%mm4, %%mm2				\n\t"
+		"psubusw %%mm0, %%mm2				\n\t"
+		"psubw %%mm2, %%mm4				\n\t"
+		"movq %%mm5, %%mm2				\n\t"
+		"psubusw %%mm1, %%mm2				\n\t"
+		"psubw %%mm2, %%mm5				\n\t"
+#endif
+		"pxor %%mm6, %%mm4				\n\t"
+		"pxor %%mm7, %%mm5				\n\t"
+		"psubw %%mm6, %%mm4				\n\t"
+		"psubw %%mm7, %%mm5				\n\t"
+		"packsswb %%mm5, %%mm4				\n\t"
+		"movq %3, %%mm1					\n\t"
+		"pandn %%mm4, %%mm1				\n\t"
+		"movq (%0), %%mm0				\n\t"
+		"paddb   %%mm1, %%mm0				\n\t"
+		"movq %%mm0, (%0)				\n\t"
+		"movq (%0, %1), %%mm0				\n\t"
+		"psubb %%mm1, %%mm0				\n\t"
+		"movq %%mm0, (%0, %1)				\n\t"
+
+		: "+r" (temp_src)
+		: "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
+		: "%"REG_a, "%"REG_c
+		);
+	}
+/*if(step==16){
+    STOP_TIMER("step16")
+}else{
+    STOP_TIMER("stepX")
+}*/
+}
+#endif //HAVE_MMX
 
 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
@@ -2628,13 +3196,13 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[]
 	{
 #ifdef HAVE_MMX
 					asm volatile(
-						"movq (%%eax), %%mm2	\n\t" // packedYOffset
-						"movq 8(%%eax), %%mm3	\n\t" // packedYScale
-						"leal (%2,%4), %%eax	\n\t"
-						"leal (%3,%5), %%edx	\n\t"
+						"movq (%%"REG_a"), %%mm2	\n\t" // packedYOffset
+						"movq 8(%%"REG_a"), %%mm3	\n\t" // packedYScale
+						"lea (%2,%4), %%"REG_a"	\n\t"
+						"lea (%3,%5), %%"REG_d"	\n\t"
 						"pxor %%mm4, %%mm4	\n\t"
 #ifdef HAVE_MMX2
-#define SCALED_CPY(src1, src2, dst1, dst2)					\
+#define REAL_SCALED_CPY(src1, src2, dst1, dst2)					\
 						"movq " #src1 ", %%mm0	\n\t"\
 						"movq " #src1 ", %%mm5	\n\t"\
 						"movq " #src2 ", %%mm1	\n\t"\
@@ -2657,7 +3225,7 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[]
 						"movq %%mm1, " #dst2 "	\n\t"\
 
 #else //HAVE_MMX2
-#define SCALED_CPY(src1, src2, dst1, dst2)					\
+#define REAL_SCALED_CPY(src1, src2, dst1, dst2)					\
 						"movq " #src1 ", %%mm0	\n\t"\
 						"movq " #src1 ", %%mm5	\n\t"\
 						"punpcklbw %%mm4, %%mm0 \n\t"\
@@ -2684,22 +3252,24 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[]
 						"movq %%mm1, " #dst2 "	\n\t"\
 
 #endif //!HAVE_MMX2
+#define SCALED_CPY(src1, src2, dst1, dst2)\
+   REAL_SCALED_CPY(src1, src2, dst1, dst2)
 
 SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
-SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
-SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
-						"leal (%%eax,%4,4), %%eax	\n\t"
-						"leal (%%edx,%5,4), %%edx	\n\t"
-SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
+SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
+SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
+						"lea (%%"REG_a",%4,4), %%"REG_a"	\n\t"
+						"lea (%%"REG_d",%5,4), %%"REG_d"	\n\t"
+SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
 
 
 						: "=&a" (packedOffsetAndScale)
 						: "0" (packedOffsetAndScale),
 						"r"(src),
 						"r"(dst),
-						"r" (srcStride),
-						"r" (dstStride)
-						: "%edx"
+						"r" ((long)srcStride),
+						"r" ((long)dstStride)
+						: "%"REG_d
 					);
 #else
 				for(i=0; i<8; i++)
@@ -2711,27 +3281,30 @@ SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
 	{
 #ifdef HAVE_MMX
 					asm volatile(
-						"leal (%0,%2), %%eax	\n\t"
-						"leal (%1,%3), %%edx	\n\t"
+						"lea (%0,%2), %%"REG_a"	\n\t"
+						"lea (%1,%3), %%"REG_d"	\n\t"
 
-#define SIMPLE_CPY(src1, src2, dst1, dst2)				\
+#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2)				\
 						"movq " #src1 ", %%mm0	\n\t"\
 						"movq " #src2 ", %%mm1	\n\t"\
 						"movq %%mm0, " #dst1 "	\n\t"\
 						"movq %%mm1, " #dst2 "	\n\t"\
 
+#define SIMPLE_CPY(src1, src2, dst1, dst2)\
+   REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
+
 SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
-SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
-SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
-						"leal (%%eax,%2,4), %%eax	\n\t"
-						"leal (%%edx,%3,4), %%edx	\n\t"
-SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
+SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
+SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
+						"lea (%%"REG_a",%2,4), %%"REG_a"	\n\t"
+						"lea (%%"REG_d",%3,4), %%"REG_d"	\n\t"
+SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
 
 						: : "r" (src),
 						"r" (dst),
-						"r" (srcStride),
-						"r" (dstStride)
-						: "%eax", "%edx"
+						"r" ((long)srcStride),
+						"r" ((long)dstStride)
+						: "%"REG_a, "%"REG_d
 					);
 #else
 				for(i=0; i<8; i++)
@@ -2749,12 +3322,12 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride)
 #ifdef HAVE_MMX
 	asm volatile(
 		"movq (%0), %%mm0		\n\t"
-		"addl %1, %0			\n\t"
+		"add %1, %0			\n\t"
 		"movq %%mm0, (%0)		\n\t"
 		"movq %%mm0, (%0, %1)		\n\t"
 		"movq %%mm0, (%0, %1, 2)	\n\t"
 		: "+r" (src)
-		: "r" (-stride)
+		: "r" ((long)-stride)
 	);
 #else
 	int i;
@@ -2793,8 +3366,8 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 
 	//FIXME remove
 	uint64_t * const yHistogram= c.yHistogram;
-	uint8_t * const tempSrc= c.tempSrc;
-	uint8_t * const tempDst= c.tempDst;
+	uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
+	uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
 	//const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
 
 #ifdef HAVE_MMX
@@ -2814,7 +3387,8 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 		|| (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
 	else if(   (mode & V_DEBLOCK)
 		|| (mode & LINEAR_IPOL_DEINT_FILTER)
-		|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
+		|| (mode & MEDIAN_DEINT_FILTER)
+		|| (mode & V_A_DEBLOCK)) copyAhead=13;
 	else if(mode & V_X1_FILTER) copyAhead=11;
 //	else if(mode & V_RK1_FILTER) copyAhead=10;
 	else if(mode & DERING) copyAhead=9;
@@ -2905,22 +3479,22 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 */
 
 			asm(
-				"movl %4, %%eax			\n\t"
-				"shrl $2, %%eax			\n\t"
-				"andl $6, %%eax			\n\t"
-				"addl %5, %%eax			\n\t"
-				"movl %%eax, %%edx		\n\t"
-				"imul %1, %%eax			\n\t"
-				"imul %3, %%edx			\n\t"
-				"prefetchnta 32(%%eax, %0)	\n\t"
-				"prefetcht0 32(%%edx, %2)	\n\t"
-				"addl %1, %%eax			\n\t"
-				"addl %3, %%edx			\n\t"
-				"prefetchnta 32(%%eax, %0)	\n\t"
-				"prefetcht0 32(%%edx, %2)	\n\t"
-			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
-			"m" (x), "m" (copyAhead)
-			: "%eax", "%edx"
+				"mov %4, %%"REG_a"		\n\t"
+				"shr $2, %%"REG_a"		\n\t"
+				"and $6, %%"REG_a"		\n\t"
+				"add %5, %%"REG_a"		\n\t"
+				"mov %%"REG_a", %%"REG_d"	\n\t"
+				"imul %1, %%"REG_a"		\n\t"
+				"imul %3, %%"REG_d"		\n\t"
+				"prefetchnta 32(%%"REG_a", %0)	\n\t"
+				"prefetcht0 32(%%"REG_d", %2)	\n\t"
+				"add %1, %%"REG_a"		\n\t"
+				"add %3, %%"REG_d"		\n\t"
+				"prefetchnta 32(%%"REG_a", %0)	\n\t"
+				"prefetcht0 32(%%"REG_d", %2)	\n\t"
+			:: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
+			"m" ((long)x), "m" ((long)copyAhead)
+			: "%"REG_a, "%"REG_d
 			);
 
 #elif defined(HAVE_3DNOW)
@@ -2955,8 +3529,8 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 			dstBlock+=8;
 			srcBlock+=8;
 		}
-		if(width==dstStride)
-			memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride);
+		if(width==ABS(dstStride))
+			linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
 		else
 		{
 			int i;
@@ -2978,7 +3552,7 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 		uint8_t *tempBlock2= c.tempBlocks + 8;
 #endif
 		int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
-		int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*QPStride];
+		int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)];
 		int QP=0;
 		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
 		   if not than use a temporary buffer */
@@ -2987,19 +3561,19 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 			int i;
 			/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
 			   blockcopy to dst later */
-			memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
-				srcStride*MAX(height-y-copyAhead, 0) );
+			linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
+				MAX(height-y-copyAhead, 0), srcStride);
 
 			/* duplicate last line of src to fill the void upto line (copyAhead+7) */
 			for(i=MAX(height-y, 8); i<copyAhead+8; i++)
-				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
+				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride));
 
 			/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
-			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
+			linecpy(tempDst, dstBlock - dstStride, MIN(height-y+1, copyAhead+1), dstStride);
 
 			/* duplicate last line of dst to fill the void upto line (copyAhead) */
 			for(i=height-y+1; i<=copyAhead; i++)
-				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
+				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride));
 
 			dstBlock= tempDst + dstStride;
 			srcBlock= tempSrc;
@@ -3051,22 +3625,22 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 */
 
 			asm(
-				"movl %4, %%eax			\n\t"
-				"shrl $2, %%eax			\n\t"
-				"andl $6, %%eax			\n\t"
-				"addl %5, %%eax			\n\t"
-				"movl %%eax, %%edx		\n\t"
-				"imul %1, %%eax			\n\t"
-				"imul %3, %%edx			\n\t"
-				"prefetchnta 32(%%eax, %0)	\n\t"
-				"prefetcht0 32(%%edx, %2)	\n\t"
-				"addl %1, %%eax			\n\t"
-				"addl %3, %%edx			\n\t"
-				"prefetchnta 32(%%eax, %0)	\n\t"
-				"prefetcht0 32(%%edx, %2)	\n\t"
-			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
-			"m" (x), "m" (copyAhead)
-			: "%eax", "%edx"
+				"mov %4, %%"REG_a"		\n\t"
+				"shr $2, %%"REG_a"		\n\t"
+				"and $6, %%"REG_a"		\n\t"
+				"add %5, %%"REG_a"		\n\t"
+				"mov %%"REG_a", %%"REG_d"	\n\t"
+				"imul %1, %%"REG_a"		\n\t"
+				"imul %3, %%"REG_d"		\n\t"
+				"prefetchnta 32(%%"REG_a", %0)	\n\t"
+				"prefetcht0 32(%%"REG_d", %2)	\n\t"
+				"add %1, %%"REG_a"		\n\t"
+				"add %3, %%"REG_d"		\n\t"
+				"prefetchnta 32(%%"REG_a", %0)	\n\t"
+				"prefetcht0 32(%%"REG_d", %2)	\n\t"
+			:: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
+			"m" ((long)x), "m" ((long)copyAhead)
+			: "%"REG_a, "%"REG_d
 			);
 
 #elif defined(HAVE_3DNOW)
@@ -3110,6 +3684,8 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 						RENAME(doVertLowPass)(dstBlock, stride, &c);
 					else if(t==2)
 						RENAME(doVertDefFilter)(dstBlock, stride, &c);
+				}else if(mode & V_A_DEBLOCK){
+					RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
 				}
 			}
 
@@ -3131,6 +3707,8 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 						RENAME(doVertLowPass)(tempBlock1, 16, &c);
 					else if(t==2)
 						RENAME(doVertDefFilter)(tempBlock1, 16, &c);
+				}else if(mode & H_A_DEBLOCK){
+					RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
 				}
 
 				RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
@@ -3140,12 +3718,29 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 					horizX1Filter(dstBlock-4, stride, QP);
 				else if(mode & H_DEBLOCK)
 				{
+#ifdef HAVE_ALTIVEC
+					unsigned char __attribute__ ((aligned(16))) tempBlock[272];
+					transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
+
+					const int t=vertClassify_altivec(tempBlock-48, 16, &c);
+					if(t==1) {
+						doVertLowPass_altivec(tempBlock-48, 16, &c);
+                                                transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+                                        }
+					else if(t==2) {
+						doVertDefFilter_altivec(tempBlock-48, 16, &c);
+                                                transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+                                        }
+#else
 					const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
 
 					if(t==1)
 						RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
 					else if(t==2)
 						RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
+#endif
+				}else if(mode & H_A_DEBLOCK){
+					RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
 				}
 #endif
 				if(mode & DERING)
@@ -3190,8 +3785,8 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 		if(y+15 >= height)
 		{
 			uint8_t *dstBlock= &(dst[y*dstStride]);
-			if(width==dstStride)
-				memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y));
+			if(width==ABS(dstStride))
+				linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
 			else
 			{
 				int i;
diff --git a/src/libffmpeg/libavcodec/loco.c b/src/libffmpeg/libavcodec/loco.c
new file mode 100644
index 000000000..6f90c1ef1
--- /dev/null
+++ b/src/libffmpeg/libavcodec/loco.c
@@ -0,0 +1,285 @@
+/*
+ * LOCO codec
+ * Copyright (c) 2005 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+ 
+/**
+ * @file loco.c
+ * LOCO codec.
+ */
+ 
+#include "avcodec.h"
+#include "common.h"
+#include "bitstream.h"
+#include "golomb.h"
+
+enum LOCO_MODE {LOCO_UNKN=0, LOCO_CYUY2=-1, LOCO_CRGB=-2, LOCO_CRGBA=-3, LOCO_CYV12=-4, /* colorspace ids; decode_init() reads one from extradata+4 */
+ LOCO_YUY2=1, LOCO_UYVY=2, LOCO_RGB=3, LOCO_RGBA=4, LOCO_YV12=5}; /* NOTE(review): what distinguishes the negative "C" ids is not visible here -- confirm */
+
+typedef struct LOCOContext{ /* private decoder state, reached through avctx->priv_data */
+    AVCodecContext *avctx; /* owning codec context, stored by decode_init() */
+    AVFrame pic; /* picture buffer released and re-acquired by decode_frame() */
+    int lossy; /* 0 for version 1, otherwise read from extradata+8; copied into RICEContext.lossy */
+    int mode; /* enum LOCO_MODE value read from extradata+4 */
+} LOCOContext;
+
+typedef struct RICEContext{ /* residual reader state; loco_decode_plane() sets up a fresh one per plane */
+    GetBitContext gb; /* bit reader initialised over the plane's input buffer */
+    int save, run, run2; /* internal rice decoder state */
+    int sum, count; /* sum and count for getting rice parameter */
+    int lossy; /* copy of LOCOContext.lossy; added to nonzero magnitudes in loco_get_rice() */
+}RICEContext;
+
+static int loco_get_rice_param(RICEContext *r) /* derives the Golomb parameter from the running sum/count */
+{
+    int cnt = 0;
+    int val = r->count;
+    
+    while(r->sum > val && cnt < 9) { /* smallest cnt with (count << cnt) >= sum, capped at 9 */
+        val <<= 1;
+        cnt++;
+    }
+    
+    return cnt;
+}
+
+static inline void loco_update_rice_param(RICEContext *r, int val) /* folds one decoded value into the running statistics */
+{
+    r->sum += val;
+    r->count++;
+    
+    if(r->count == 16) { /* once 16 samples are reached, halve both counters */
+        r->sum >>= 1;
+        r->count >>= 1;
+    }
+}
+
+static inline int loco_get_rice(RICEContext *r) /* reads and returns the next signed residual, updating the run/statistics state */
+{
+    int v;
+    if (r->run > 0) { /* we have zero run */
+        r->run--;
+        loco_update_rice_param(r, 0);
+        return 0;
+    }
+    v = get_ur_golomb_jpegls(&r->gb, loco_get_rice_param(r), INT_MAX, 0);
+    loco_update_rice_param(r, (v+1)>>1); /* statistics are fed the folded value (v+1)>>1, not the raw code */
+    if (!v) {
+        if (r->save >= 0) { /* a count of further zeros is read; following calls return 0 while run > 0 */
+            r->run = get_ur_golomb_jpegls(&r->gb, 2, INT_MAX, 0);
+            if(r->run > 1)
+                r->save += r->run + 1;
+            else
+                r->save -= 3;
+        }
+        else
+            r->run2++; /* save < 0: only count consecutive zero codes */
+    } else {
+        v = ((v>>1) + r->lossy) ^ -(v&1); /* even code -> (v>>1)+lossy, odd code -> bitwise complement of that */
+        if (r->run2 > 0) { /* a nonzero value ends the counted zero streak; adjust save accordingly */
+            if (r->run2 > 2)
+                r->save += r->run2;
+            else
+                r->save -= 3;
+            r->run2 = 0;
+        }
+    }
+    
+    return v;
+}
+
+/* LOCO main predictor - LOCO-I/JPEG-LS predictor */
+static inline int loco_predict(uint8_t* data, int stride, int step) /* data points at the sample being predicted */
+{
+    int a, b, c;
+    
+    a = data[-stride]; /* same column, previously decoded row */
+    b = data[-step]; /* previous sample of this plane in the current row */
+    c = data[-stride - step]; /* previous sample in the previously decoded row */
+    
+    return mid_pred(a, a + b - c, b); /* presumably the median of the three candidates -- mid_pred is defined elsewhere, confirm */
+}
+
+static int loco_decode_plane(LOCOContext *l, uint8_t *data, int width, int height,
+                             int stride, uint8_t *buf, int buf_size, int step) /* stride: offset between rows (callers may pass it negative); step: offset between samples within a row */
+{ /* Decodes width x height residuals from buf into data and returns the number of input bytes consumed. */
+    RICEContext rc;
+    int val;
+    int i, j;
+    
+    init_get_bits(&rc.gb, buf, buf_size*8);
+    rc.save = 0;
+    rc.run = 0;
+    rc.run2 = 0;
+    rc.lossy = l->lossy; 
+    
+    rc.sum = 8; /* starting statistics: sum 8 over count 1 */
+    rc.count = 1;
+    
+    /* restore top left pixel */
+    val = loco_get_rice(&rc);
+    data[0] = 128 + val; /* the very first sample is coded relative to 128 */
+    /* restore top line */
+    for (i = 1; i < width; i++) {
+        val = loco_get_rice(&rc);
+        data[i * step] = data[i * step - step] + val; /* first row: predicted from the left neighbour only */
+    }
+    data += stride;
+    for (j = 1; j < height; j++) {
+        /* restore left column */
+        val = loco_get_rice(&rc);
+        data[0] = data[-stride] + val; /* first column: predicted from the sample in the previous row */
+        /* restore all other pixels */
+        for (i = 1; i < width; i++) {
+            val = loco_get_rice(&rc);
+            data[i * step] = loco_predict(&data[i * step], stride, step) + val;
+        }
+        data += stride;
+    }
+    
+    return ((get_bits_count(&rc.gb) + 7) >> 3); /* bits read so far, rounded up to whole bytes */
+}
+
+static int decode_frame(AVCodecContext *avctx, 
+                        void *data, int *data_size,
+                        uint8_t *buf, int buf_size)
+{ /* Decodes one packet into l->pic, plane by plane according to l->mode, and copies the picture to *data. */
+    LOCOContext * const l = avctx->priv_data;
+    AVFrame * const p= (AVFrame*)&l->pic;
+    int decoded;
+
+    if(p->data[0]) /* give back the picture from the previous call */
+        avctx->release_buffer(avctx, p);
+
+    p->reference = 0;
+    if(avctx->get_buffer(avctx, p) < 0){
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+    p->key_frame = 1;
+
+    switch(l->mode) { /* each plane is stored back to back; advance buf by what the previous plane consumed */
+    case LOCO_CYUY2: case LOCO_YUY2: case LOCO_UYVY:
+        decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
+                                    p->linesize[0], buf, buf_size, 1);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height,
+                                    p->linesize[1], buf, buf_size, 1);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height,
+                                    p->linesize[2], buf, buf_size, 1);
+        break;
+    case LOCO_CYV12: case LOCO_YV12: /* second coded plane goes to data[2], third to data[1] */
+        decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
+                                    p->linesize[0], buf, buf_size, 1);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[2], avctx->width / 2, avctx->height / 2,
+                                    p->linesize[2], buf, buf_size, 1);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[1], avctx->width / 2, avctx->height / 2,
+                                    p->linesize[1], buf, buf_size, 1);
+        break;
+    case LOCO_CRGB: case LOCO_RGB: /* three interleaved components, written bottom-up (start at last row, negative stride) */
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1), avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 3);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 1, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 3);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[0] + p->linesize[0]*(avctx->height-1) + 2, avctx->width, avctx->height,
+                                    -p->linesize[0], buf, buf_size, 3);
+        break;
+    case LOCO_CRGBA: case LOCO_RGBA: /* decode_init() accepts LOCO_CRGBA too, so it must be decoded here; four interleaved components */
+        decoded = loco_decode_plane(l, p->data[0], avctx->width, avctx->height,
+                                    p->linesize[0], buf, buf_size, 4);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[0] + 1, avctx->width, avctx->height,
+                                    p->linesize[0], buf, buf_size, 4);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[0] + 2, avctx->width, avctx->height,
+                                    p->linesize[0], buf, buf_size, 4);
+        buf += decoded; buf_size -= decoded;
+        decoded = loco_decode_plane(l, p->data[0] + 3, avctx->width, avctx->height,
+                                    p->linesize[0], buf, buf_size, 4);
+        break;
+    }
+
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = l->pic;
+    
+    return buf_size; /* NOTE(review): this is the input size minus what the leading planes consumed, not a consumed-byte count -- confirm callers expect that */
+}
+
+static int decode_init(AVCodecContext *avctx){ /* reads version, mode and lossy from extradata and selects the output pixel format; returns 0 or -1 */
+    LOCOContext * const l = avctx->priv_data;
+    int version;
+
+    l->avctx = avctx;
+    if (avctx->extradata_size < 12) { /* offsets 0, 4 and 8 are read below */
+        av_log(avctx, AV_LOG_ERROR, "Extradata size must be >= 12 instead of %i\n",
+               avctx->extradata_size);
+        return -1;
+    }
+    version = LE_32(avctx->extradata);
+    switch(version) {
+    case 1: /* version 1 carries no lossy parameter */
+        l->lossy = 0;
+        break;
+    case 2:
+        l->lossy = LE_32(avctx->extradata + 8);
+        break;
+    default: /* unknown version: handled like version 2, plus a notice to the user */
+        l->lossy = LE_32(avctx->extradata + 8);
+        av_log(avctx, AV_LOG_INFO, "This is LOCO codec version %i, please upload file for study\n", version);
+    }
+    
+    l->mode = LE_32(avctx->extradata + 4);
+    switch(l->mode) { /* map the stream colorspace id to the pixel format the decoder writes */
+    case LOCO_CYUY2: case LOCO_YUY2: case LOCO_UYVY:
+        avctx->pix_fmt = PIX_FMT_YUV422P;
+        break;
+    case LOCO_CRGB: case LOCO_RGB:
+        avctx->pix_fmt = PIX_FMT_BGR24;
+        break;
+    case LOCO_CYV12: case LOCO_YV12:
+        avctx->pix_fmt = PIX_FMT_YUV420P;
+        break;
+    case LOCO_CRGBA: case LOCO_RGBA:
+        avctx->pix_fmt = PIX_FMT_RGBA32;
+        break;
+    default: /* LOCO_UNKN and any unlisted value are rejected */
+        av_log(avctx, AV_LOG_INFO, "Unknown colorspace, index = %i\n", l->mode);
+        return -1;
+    }
+    if(avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_INFO, "lossy:%i, version:%i, mode: %i\n", l->lossy, version, l->mode);
+
+    return 0;
+}
+
+AVCodec loco_decoder = { /* NOTE(review): positional initializer; the field roles noted below assume the AVCodec layout in avcodec.h -- confirm */
+    "loco",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_LOCO,
+    sizeof(LOCOContext), /* size of the private context that decode_init()/decode_frame() read from avctx->priv_data */
+    decode_init,
+    NULL, /* presumably the encode callback; this file defines no encoder */
+    NULL, /* presumably the close callback; none is defined in this file */
+    decode_frame,
+    CODEC_CAP_DR1,
+};
diff --git a/src/libffmpeg/libavcodec/mace.c b/src/libffmpeg/libavcodec/mace.c
index 8a4a20568..80cd28393 100644
--- a/src/libffmpeg/libavcodec/mace.c
+++ b/src/libffmpeg/libavcodec/mace.c
@@ -242,7 +242,8 @@ typedef struct MACEContext {
 static void chomp3(MACEContext *ctx,
             uint8_t val,
             const uint16_t tab1[],
-            const uint16_t tab2[][8])
+            const uint16_t tab2[][8],
+            uint32_t numChannels)
 {
   short current;
 
@@ -252,7 +253,8 @@ static void chomp3(MACEContext *ctx,
   else current+=ctx->lev;
   ctx->lev=current-(current >> 3);
 //  *ctx->outPtr++=current >> 8;
-  *ctx->outPtr++=current;
+  *ctx->outPtr=current;
+  ctx->outPtr+=numChannels;
   if ( ( ctx->index += tab1[val]-(ctx->index>>5) ) < 0 ) ctx->index = 0;
 }
 /* \\\ */
@@ -281,13 +283,13 @@ static void Exp1to3(MACEContext *ctx,
 
    while (cnt>0) {
      pkt=inBuffer[0];
-     chomp3(ctx, pkt       & 7, MACEtab1, MACEtab2);
-     chomp3(ctx,(pkt >> 3) & 3, MACEtab3, MACEtab4);
-     chomp3(ctx, pkt >> 5     , MACEtab1, MACEtab2);
+     chomp3(ctx, pkt       & 7, MACEtab1, MACEtab2, numChannels);
+     chomp3(ctx,(pkt >> 3) & 3, MACEtab3, MACEtab4, numChannels);
+     chomp3(ctx, pkt >> 5     , MACEtab1, MACEtab2, numChannels);
      pkt=inBuffer[1];
-     chomp3(ctx, pkt       & 7, MACEtab1, MACEtab2);
-     chomp3(ctx,(pkt >> 3) & 3, MACEtab3, MACEtab4);
-     chomp3(ctx, pkt >> 5     , MACEtab1, MACEtab2);
+     chomp3(ctx, pkt       & 7, MACEtab1, MACEtab2, numChannels);
+     chomp3(ctx,(pkt >> 3) & 3, MACEtab3, MACEtab4, numChannels);
+     chomp3(ctx, pkt >> 5     , MACEtab1, MACEtab2, numChannels);
 
      inBuffer+=numChannels*2;
      --cnt;
@@ -306,7 +308,8 @@ static void Exp1to3(MACEContext *ctx,
 static void chomp6(MACEContext *ctx,
             uint8_t val,
             const uint16_t tab1[],
-            const uint16_t tab2[][8])
+            const uint16_t tab2[][8],
+            uint32_t numChannels)
 {
   short current;
 
@@ -329,9 +332,10 @@ static void chomp6(MACEContext *ctx,
 
 //  *ctx->outPtr++=(ctx->previous+ctx->prev2-((ctx->prev2-current) >> 2)) >> 8;
 //  *ctx->outPtr++=(ctx->previous+current+((ctx->prev2-current) >> 2)) >> 8;
-  *ctx->outPtr++=(ctx->previous+ctx->prev2-((ctx->prev2-current) >> 2));
-  *ctx->outPtr++=(ctx->previous+current+((ctx->prev2-current) >> 2));
-
+  *ctx->outPtr=(ctx->previous+ctx->prev2-((ctx->prev2-current) >> 2));
+  ctx->outPtr+=numChannels;
+  *ctx->outPtr=(ctx->previous+current+((ctx->prev2-current) >> 2));
+  ctx->outPtr+=numChannels;
   ctx->prev2=ctx->previous;
   ctx->previous=current;
 
@@ -366,9 +370,9 @@ static void Exp1to6(MACEContext *ctx,
    while (cnt>0) {
      pkt=*inBuffer;
 
-     chomp6(ctx, pkt >> 5     , MACEtab1, MACEtab2);
-     chomp6(ctx,(pkt >> 3) & 3, MACEtab3, MACEtab4);
-     chomp6(ctx, pkt       & 7, MACEtab1, MACEtab2);
+     chomp6(ctx, pkt >> 5     , MACEtab1, MACEtab2, numChannels);
+     chomp6(ctx,(pkt >> 3) & 3, MACEtab3, MACEtab4, numChannels);
+     chomp6(ctx, pkt       & 7, MACEtab1, MACEtab2, numChannels);
 
      inBuffer+=numChannels;
      --cnt;
diff --git a/src/libffmpeg/libavcodec/mdec.c b/src/libffmpeg/libavcodec/mdec.c
index ef4e6ec0a..d6e5d044a 100644
--- a/src/libffmpeg/libavcodec/mdec.c
+++ b/src/libffmpeg/libavcodec/mdec.c
@@ -81,7 +81,7 @@ static inline int mdec_decode_block_intra(MDECContext *a, DCTELEM *block, int n)
         /* now quantify & encode AC coefs */
         for(;;) {
             UPDATE_CACHE(re, &a->gb);
-            GET_RL_VLC(level, run, re, &a->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2);
+            GET_RL_VLC(level, run, re, &a->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
             
             if(level == 127){
                 break;
@@ -163,11 +163,6 @@ static int decode_frame(AVCodecContext *avctx,
     AVFrame * const p= (AVFrame*)&a->picture;
     int i;
 
-    /* special case for last picture */
-    if (buf_size == 0) {
-        return 0;
-    }
-
     if(p->data[0])
         avctx->release_buffer(avctx, p);
 
@@ -222,8 +217,8 @@ static void mdec_common_init(AVCodecContext *avctx){
 
     dsputil_init(&a->dsp, avctx);
 
-    a->mb_width   = (avctx->width  + 15) / 16;
-    a->mb_height  = (avctx->height + 15) / 16;
+    a->mb_width   = (avctx->coded_width  + 15) / 16;
+    a->mb_height  = (avctx->coded_height + 15) / 16;
 
     avctx->coded_frame= (AVFrame*)&a->picture;
     a->avctx= avctx;
diff --git a/src/libffmpeg/libavcodec/mem.c b/src/libffmpeg/libavcodec/mem.c
index c5ca166d3..462d674e4 100644
--- a/src/libffmpeg/libavcodec/mem.c
+++ b/src/libffmpeg/libavcodec/mem.c
@@ -45,8 +45,20 @@
 void *av_malloc(unsigned int size)
 {
     void *ptr;
+#ifdef MEMALIGN_HACK
+    int diff;
+#endif
+
+    /* lets disallow possible ambiguous cases */
+    if(size > INT_MAX)
+        return NULL;
     
-#if defined (HAVE_MEMALIGN)
+#ifdef MEMALIGN_HACK
+    ptr = malloc(size+16+1);
+    diff= ((-(int)ptr - 1)&15) + 1;
+    ptr += diff;
+    ((char*)ptr)[-1]= diff;
+#elif defined (HAVE_MEMALIGN) 
     ptr = memalign(16,size);
     /* Why 64? 
        Indeed, we should align it:
@@ -87,7 +99,22 @@ void *av_malloc(unsigned int size)
  */
 void *av_realloc(void *ptr, unsigned int size)
 {
+#ifdef MEMALIGN_HACK
+    int diff;
+#endif
+
+    /* lets disallow possible ambiguous cases */
+    if(size > INT_MAX)
+        return NULL;
+#ifdef MEMALIGN_HACK
+    //FIXME this isnt aligned correctly though it probably isnt needed
+    if(!ptr) return av_malloc(size);
+    diff= ((char*)ptr)[-1]; /* offset that av_malloc stored in the byte just before the user pointer */
+    ptr= realloc((char*)ptr - diff, size + diff); /* resize starting from the real malloc()ed address */
+    return ptr ? (char*)ptr + diff : NULL; /* a failed realloc must stay NULL instead of becoming NULL+diff */
+#else
     return realloc(ptr, size);
+#endif
 }
 
 /* NOTE: ptr = NULL is explicetly allowed */
@@ -95,6 +122,10 @@ void av_free(void *ptr)
 {
     /* XXX: this test should not be needed on most libcs */
     if (ptr)
+#ifdef MEMALIGN_HACK
+        free(ptr - ((char*)ptr)[-1]);
+#else
         free(ptr);
+#endif
 }
 
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index 4e2305aef..4c2b4793b 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -1,6 +1,8 @@
 /*
  * MJPEG encoder and decoder
  * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2003 Alex Beregszaszi
+ * Copyright (c) 2003-2004 Michael Niedermayer
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -398,6 +400,19 @@ static void jpeg_put_comments(MpegEncContext *s)
         ptr[0] = size >> 8;
         ptr[1] = size;
     }
+
+    if(  s->avctx->pix_fmt == PIX_FMT_YUV420P 
+       ||s->avctx->pix_fmt == PIX_FMT_YUV422P
+       ||s->avctx->pix_fmt == PIX_FMT_YUV444P){
+        put_marker(p, COM);
+        flush_put_bits(p);
+        ptr = pbBufPtr(p);
+        put_bits(p, 16, 0); /* patched later */
+        put_string(p, "CS=ITU601", 1);
+        size = strlen("CS=ITU601")+3;
+        ptr[0] = size >> 8;
+        ptr[1] = size;
+    }
 }
 
 void mjpeg_picture_header(MpegEncContext *s)
@@ -657,11 +672,11 @@ static int encode_picture_lossless(AVCodecContext *avctx, unsigned char *buf, in
     mjpeg_picture_header(s);
 
     s->header_bits= put_bits_count(&s->pb);
-
+    
     if(avctx->pix_fmt == PIX_FMT_RGBA32){
         int x, y, i;
         const int linesize= p->linesize[0];
-        uint16_t buffer[2048][4];
+        uint16_t (*buffer)[4]= (void *) s->rd_scratchpad;
         int left[3], top[3], topleft[3];
 
         for(i=0; i<3; i++){
@@ -672,6 +687,11 @@ static int encode_picture_lossless(AVCodecContext *avctx, unsigned char *buf, in
             const int modified_predictor= y ? predictor : 1;
             uint8_t *ptr = p->data[0] + (linesize * y);
 
+            if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < width*3*4){
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
+            
             for(i=0; i<3; i++){
                 top[i]= left[i]= topleft[i]= buffer[0][i];
             }
@@ -705,6 +725,10 @@ static int encode_picture_lossless(AVCodecContext *avctx, unsigned char *buf, in
         const int mb_height = (height + s->mjpeg_vsample[0] - 1) / s->mjpeg_vsample[0];
         
         for(mb_y = 0; mb_y < mb_height; mb_y++) {
+            if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < mb_width * 4 * 3 * s->mjpeg_hsample[0] * s->mjpeg_vsample[0]){
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
             for(mb_x = 0; mb_x < mb_width; mb_x++) {
                 if(mb_x==0 || mb_y==0){
                     for(i=0;i<3;i++) {
@@ -825,7 +849,7 @@ typedef struct MJpegDecodeContext {
     int last_dc[MAX_COMPONENTS]; /* last DEQUANTIZED dc (XXX: am I right to do that ?) */
     AVFrame picture; /* picture structure */
     int linesize[MAX_COMPONENTS];                   ///< linesize << interlaced
-    uint8_t *qscale_table;
+    int8_t *qscale_table;
     DCTELEM block[64] __align8;
     ScanTable scantable;
     void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
@@ -834,13 +858,16 @@ typedef struct MJpegDecodeContext {
     int restart_count;
 
     int buggy_avid;
+    int cs_itu601;
     int interlace_polarity;
+
+    int mjpb_skiptosod;
 } MJpegDecodeContext;
 
 static int mjpeg_decode_dht(MJpegDecodeContext *s);
 
 static int build_vlc(VLC *vlc, const uint8_t *bits_table, const uint8_t *val_table, 
-                      int nb_codes)
+                      int nb_codes, int use_static)
 {
     uint8_t huff_size[256];
     uint16_t huff_code[256];
@@ -848,7 +875,7 @@ static int build_vlc(VLC *vlc, const uint8_t *bits_table, const uint8_t *val_tab
     memset(huff_size, 0, sizeof(huff_size));
     build_huffman_codes(huff_size, huff_code, bits_table, val_table);
     
-    return init_vlc(vlc, 9, nb_codes, huff_size, 1, 1, huff_code, 2, 2);
+    return init_vlc(vlc, 9, nb_codes, huff_size, 1, 1, huff_code, 2, 2, use_static);
 }
 
 static int mjpeg_decode_init(AVCodecContext *avctx)
@@ -876,12 +903,12 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
 	return -1;
     s->start_code = -1;
     s->first_picture = 1;
-    s->org_height = avctx->height;
+    s->org_height = avctx->coded_height;
     
-    build_vlc(&s->vlcs[0][0], bits_dc_luminance, val_dc_luminance, 12);
-    build_vlc(&s->vlcs[0][1], bits_dc_chrominance, val_dc_chrominance, 12);
-    build_vlc(&s->vlcs[1][0], bits_ac_luminance, val_ac_luminance, 251);
-    build_vlc(&s->vlcs[1][1], bits_ac_chrominance, val_ac_chrominance, 251);
+    build_vlc(&s->vlcs[0][0], bits_dc_luminance, val_dc_luminance, 12, 0);
+    build_vlc(&s->vlcs[0][1], bits_dc_chrominance, val_dc_chrominance, 12, 0);
+    build_vlc(&s->vlcs[1][0], bits_ac_luminance, val_ac_luminance, 251, 0);
+    build_vlc(&s->vlcs[1][1], bits_ac_chrominance, val_ac_chrominance, 251, 0);
 
     if (avctx->flags & CODEC_FLAG_EXTERN_HUFF)
     {
@@ -894,6 +921,69 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+
+/**
+ * Scans buf for 0xFFD8 (JPEG SOI) markers to locate the end of the current frame.
+ * @return offset of the next frame's first byte (-1 if its 0xFF ended the previous call), 0 at EOF inside a frame, or END_NOT_FOUND
+ */
+static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size){
+    int vop_found, i;
+    uint16_t state; /* last two bytes seen; carried across calls through pc->state */
+    
+    vop_found= pc->frame_start_found;
+    state= pc->state;
+    
+    i=0;
+    if(!vop_found){ /* still looking for the marker that opens the current frame */
+        for(i=0; i<buf_size; i++){
+            state= (state<<8) | buf[i];
+            if(state == 0xFFD8){
+                i++; /* continue the search after the marker's last byte */
+                vop_found=1;
+                break;
+            }
+        }
+    }
+
+    if(vop_found){
+        /* EOF considered as end of frame */
+        if (buf_size == 0)
+            return 0;
+        for(; i<buf_size; i++){
+            state= (state<<8) | buf[i];
+            if(state == 0xFFD8){ /* next marker found: the current frame ends just before it */
+                pc->frame_start_found=0;
+                pc->state=0; 
+                return i-1; /* index of the marker's 0xFF byte; -1 when i is 0 */
+            }
+        }
+    }
+    pc->frame_start_found= vop_found; /* no boundary in this buffer: save the scan state for the next call */
+    pc->state= state;
+    return END_NOT_FOUND;
+}
+
+static int jpeg_parse(AVCodecParserContext *s,
+                           AVCodecContext *avctx,
+                           uint8_t **poutbuf, int *poutbuf_size, 
+                           const uint8_t *buf, int buf_size)
+{
+    ParseContext *pc = s->priv_data;
+    int next; /* frame boundary from find_frame_end(), or END_NOT_FOUND */
+    
+    next= find_frame_end(pc, buf, buf_size);
+
+    if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) { /* negative result: no frame is output for this call */
+        *poutbuf = NULL;
+        *poutbuf_size = 0;
+        return buf_size; /* NOTE(review): presumably the number of input bytes consumed - confirm against the parser API */
+    }
+
+    *poutbuf = (uint8_t *)buf; /* buf and buf_size were passed by address to ff_combine_frame() and may have been updated */
+    *poutbuf_size = buf_size;
+    return next;
+}
+
 /* quantize tables */
 static int mjpeg_decode_dqt(MJpegDecodeContext *s)
 {
@@ -969,7 +1059,7 @@ static int mjpeg_decode_dht(MJpegDecodeContext *s)
         free_vlc(&s->vlcs[class][index]);
         dprintf("class=%d index=%d nb_codes=%d\n",
                class, index, code_max + 1);
-        if(build_vlc(&s->vlcs[class][index], bits_table, val_table, code_max + 1) < 0){
+        if(build_vlc(&s->vlcs[class][index], bits_table, val_table, code_max + 1, 0) < 0){
             return -1;
         }
     }
@@ -993,7 +1083,10 @@ static int mjpeg_decode_sof(MJpegDecodeContext *s)
     }
     height = get_bits(&s->gb, 16);
     width = get_bits(&s->gb, 16);
+    
     dprintf("sof0: picture: %dx%d\n", width, height);
+    if(avcodec_check_dimensions(s->avctx, width, height))
+        return -1;
 
     nb_components = get_bits(&s->gb, 8);
     if (nb_components <= 0 ||
@@ -1028,8 +1121,7 @@ static int mjpeg_decode_sof(MJpegDecodeContext *s)
             
         s->width = width;
         s->height = height;
-        s->avctx->width = s->width;
-        s->avctx->height = s->height;
+        avcodec_set_dimensions(s->avctx, width, height);
 
         /* test interlaced mode */
         if (s->first_picture &&
@@ -1055,16 +1147,16 @@ static int mjpeg_decode_sof(MJpegDecodeContext *s)
         if(s->rgb){
             s->avctx->pix_fmt = PIX_FMT_RGBA32;
         }else if(s->nb_components==3)
-            s->avctx->pix_fmt = PIX_FMT_YUV444P;
+            s->avctx->pix_fmt = s->cs_itu601 ? PIX_FMT_YUV444P : PIX_FMT_YUVJ444P;
         else
             s->avctx->pix_fmt = PIX_FMT_GRAY8;
         break;
     case 0x21:
-        s->avctx->pix_fmt = PIX_FMT_YUV422P;
+        s->avctx->pix_fmt = s->cs_itu601 ? PIX_FMT_YUV422P : PIX_FMT_YUVJ422P;
         break;
     default:
     case 0x22:
-        s->avctx->pix_fmt = PIX_FMT_YUV420P;
+        s->avctx->pix_fmt = s->cs_itu601 ? PIX_FMT_YUV420P : PIX_FMT_YUVJ420P;
         break;
     }
 
@@ -1162,11 +1254,14 @@ static int decode_block(MJpegDecodeContext *s, DCTELEM *block,
 
 static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor, int point_transform){
     int i, mb_x, mb_y;
-    uint16_t buffer[2048][4];
+    uint16_t buffer[32768][4];
     int left[3], top[3], topleft[3];
     const int linesize= s->linesize[0];
     const int mask= (1<<s->bits)-1;
     
+    if((unsigned)s->mb_width > 32768) //dynamic alloc
+        return -1;
+    
     for(i=0; i<3; i++){
         buffer[0][i]= 1 << (s->bits + point_transform - 1);
     }
@@ -1336,8 +1431,8 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s){
                     }
 //		    dprintf("mb: %d %d processed\n", mb_y, mb_x);
                     ptr = s->picture.data[c] + 
-                        (s->linesize[c] * (v * mb_y + y) * 8) + 
-                        (h * mb_x + x) * 8;
+                        (((s->linesize[c] * (v * mb_y + y) * 8) + 
+                        (h * mb_x + x) * 8) >> s->avctx->lowres);
                     if (s->interlaced && s->bottom_field)
                         ptr += s->linesize[c] >> 1;
 //av_log(NULL, AV_LOG_DEBUG, "%d %d %d %d %d %d %d %d \n", mb_x, mb_y, x, y, c, s->bottom_field, (v * mb_y + y) * 8, (h * mb_x + x) * 8);
@@ -1453,6 +1548,10 @@ static int mjpeg_decode_sos(MJpegDecodeContext *s)
     if(s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG, "%s %s p:%d >>:%d\n", s->lossless ? "lossless" : "sequencial DCT", s->rgb ? "RGB" : "", predictor, point_transform);
     
+    /* mjpeg-b can have padding bytes between sos and image data, skip them */
+    for (i = s->mjpb_skiptosod; i > 0; i--)
+        skip_bits(&s->gb, 8);
+
     if(s->lossless){
             if(s->rgb){
                 if(ljpeg_decode_rgb_scan(s, predictor, point_transform) < 0)
@@ -1652,6 +1751,9 @@ static int mjpeg_decode_com(MJpegDecodeContext *s)
 		//	if (s->first_picture)
 		//	    printf("mjpeg: workarounding buggy AVID\n");
 	    }
+            else if(!strcmp(cbuf, "CS=ITU601")){
+                s->cs_itu601= 1;
+            }
 
 	    av_free(cbuf);
 	}
@@ -1724,10 +1826,6 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
     int start_code;
     AVFrame *picture = data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     buf_ptr = buf;
     buf_end = buf + buf_size;
     while (buf_ptr < buf_end) {
@@ -1762,9 +1860,9 @@ static int mjpeg_decode_frame(AVCodecContext *avctx,
 			*(dst++) = x;
 			if (x == 0xff)
 			{
-			    while(*src == 0xff) src++;
+                            while(src<buf_end && x == 0xff)
+                                x = *(src++);
 
-			    x = *(src++);
 			    if (x >= 0xd0 && x <= 0xd7)
 				*(dst++) = x;
 			    else if (x)
@@ -1898,11 +1996,7 @@ static int mjpegb_decode_frame(AVCodecContext *avctx,
     AVFrame *picture = data;
     GetBitContext hgb; /* for the header */
     uint32_t dqt_offs, dht_offs, sof_offs, sos_offs, second_field_offs;
-    uint32_t field_size;
-
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
+    uint32_t field_size, sod_offs;
 
     buf_ptr = buf;
     buf_end = buf + buf_size;
@@ -1910,26 +2004,27 @@ static int mjpegb_decode_frame(AVCodecContext *avctx,
 read_header:
     /* reset on every SOI */
     s->restart_interval = 0;
+    s->mjpb_skiptosod = 0;
 
     init_get_bits(&hgb, buf_ptr, /*buf_size*/(buf_end - buf_ptr)*8);
 
     skip_bits(&hgb, 32); /* reserved zeros */
     
-    if (get_bits(&hgb, 32) != be2me_32(ff_get_fourcc("mjpg")))
+    if (get_bits_long(&hgb, 32) != be2me_32(ff_get_fourcc("mjpg")))
     {
 	dprintf("not mjpeg-b (bad fourcc)\n");
 	return 0;
     }
 
-    field_size = get_bits(&hgb, 32); /* field size */
+    field_size = get_bits_long(&hgb, 32); /* field size */
     dprintf("field size: 0x%x\n", field_size);
     skip_bits(&hgb, 32); /* padded field size */
-    second_field_offs = get_bits(&hgb, 32);
+    second_field_offs = get_bits_long(&hgb, 32);
     dprintf("second field offs: 0x%x\n", second_field_offs);
     if (second_field_offs)
 	s->interlaced = 1;
 
-    dqt_offs = get_bits(&hgb, 32);
+    dqt_offs = get_bits_long(&hgb, 32);
     dprintf("dqt offs: 0x%x\n", dqt_offs);
     if (dqt_offs)
     {
@@ -1938,7 +2033,7 @@ read_header:
 	mjpeg_decode_dqt(s);
     }
     
-    dht_offs = get_bits(&hgb, 32);
+    dht_offs = get_bits_long(&hgb, 32);
     dprintf("dht offs: 0x%x\n", dht_offs);
     if (dht_offs)
     {
@@ -1947,7 +2042,7 @@ read_header:
 	mjpeg_decode_dht(s);
     }
 
-    sof_offs = get_bits(&hgb, 32);
+    sof_offs = get_bits_long(&hgb, 32);
     dprintf("sof offs: 0x%x\n", sof_offs);
     if (sof_offs)
     {
@@ -1957,18 +2052,19 @@ read_header:
 	    return -1;
     }
 
-    sos_offs = get_bits(&hgb, 32);
+    sos_offs = get_bits_long(&hgb, 32);
     dprintf("sos offs: 0x%x\n", sos_offs);
+    sod_offs = get_bits_long(&hgb, 32);
+    dprintf("sod offs: 0x%x\n", sod_offs);
     if (sos_offs)
     {
 //	init_get_bits(&s->gb, buf+sos_offs, (buf_end - (buf+sos_offs))*8);
 	init_get_bits(&s->gb, buf+sos_offs, field_size*8);
+	s->mjpb_skiptosod = (sod_offs - sos_offs - show_bits(&s->gb, 16));
 	s->start_code = SOS;
 	mjpeg_decode_sos(s);
     }
 
-    skip_bits(&hgb, 32); /* start of data offset */
-
     if (s->interlaced) {
         s->bottom_field ^= 1;
         /* if not bottom field, do not output image yet */
@@ -2011,10 +2107,6 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
     uint8_t *buf_ptr, *buf_end, *recoded;
     int i = 0, j = 0;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     if (!avctx->width || !avctx->height)
 	return -1;
 
@@ -2039,10 +2131,10 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
     j += sizeof(sp5x_data_dht);
 
     memcpy(recoded+j, &sp5x_data_sof[0], sizeof(sp5x_data_sof));
-    recoded[j+5] = (avctx->height >> 8) & 0xFF;
-    recoded[j+6] = avctx->height & 0xFF;
-    recoded[j+7] = (avctx->width >> 8) & 0xFF;
-    recoded[j+8] = avctx->width & 0xFF;
+    recoded[j+5] = (avctx->coded_height >> 8) & 0xFF;
+    recoded[j+6] = avctx->coded_height & 0xFF;
+    recoded[j+7] = (avctx->coded_width >> 8) & 0xFF;
+    recoded[j+8] = avctx->coded_width & 0xFF;
     j += sizeof(sp5x_data_sof);
 
     memcpy(recoded+j, &sp5x_data_sos[0], sizeof(sp5x_data_sos));
@@ -2066,8 +2158,8 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
 #else
     /* SOF */
     s->bits = 8;
-    s->width = avctx->width;
-    s->height = avctx->height;
+    s->width  = avctx->coded_width;
+    s->height = avctx->coded_height;
     s->nb_components = 3;
     s->component_id[0] = 0;
     s->h_count[0] = 2;
@@ -2085,7 +2177,7 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
     s->v_max = 2;
     
     s->qscale_table = av_mallocz((s->width+15)/16);
-    avctx->pix_fmt = PIX_FMT_YUV420P;
+    avctx->pix_fmt = s->cs_itu601 ? PIX_FMT_YUV420P : PIX_FMT_YUVJ420P; /* YUVJ variant unless a "CS=ITU601" comment was seen */
     s->interlaced = 0;
     
     s->picture.reference = 0;
@@ -2223,3 +2315,12 @@ AVCodec ljpeg_encoder = { //FIXME avoid MPV_* lossless jpeg shouldnt need them
     MPV_encode_end,
 };
 #endif
+
+AVCodecParser mjpeg_parser = { /* parser entry for CODEC_ID_MJPEG; jpeg_parse() does the frame splitting */
+    { CODEC_ID_MJPEG },
+    sizeof(ParseContext),
+    NULL, /* NOTE(review): presumably the optional init callback slot - confirm against AVCodecParser */
+    jpeg_parse,
+    ff_parse_close,
+};
+
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 242bb13e7..9aaad6daa 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -178,7 +178,7 @@ static always_inline int cmp(MpegEncContext *s, const int x, const int y, const
         }else
             d= 256*256*256*32;
     }else{
-        int uvdxy = 0;
+        int uvdxy;
         if(dxy){
             if(qpel){
                 c->qpel_put[size][dxy](c->temp, ref[0] + x + y*stride, stride); //FIXME prototype (add h)
@@ -222,23 +222,11 @@ static always_inline int cmp(MpegEncContext *s, const int x, const int y, const
 
 #include "motion_est_template.c"
 
-static inline int get_penalty_factor(MpegEncContext *s, int type){
-    switch(type&0xFF){
-    default:
-    case FF_CMP_SAD:
-        return s->qscale*2;
-    case FF_CMP_DCT:
-        return s->qscale*3;
-    case FF_CMP_SATD:
-        return s->qscale*6;
-    case FF_CMP_SSE:
-        return s->qscale*s->qscale*2;
-    case FF_CMP_BIT:
-        return 1;
-    case FF_CMP_RD:
-    case FF_CMP_PSNR:
-        return (s->qscale*s->qscale*185 + 64)>>7;
-    }
+static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){ /* placeholder compare: ignores its arguments */
+    return 0;
+}
+
+static void zero_hpel(uint8_t *a, const uint8_t *b, int stride, int h){ /* placeholder pixel op: intentionally empty */
 }
 
 void ff_init_me(MpegEncContext *s){
@@ -269,10 +257,11 @@ void ff_init_me(MpegEncContext *s){
             c->sub_motion_search= sad_hpel_motion_search; // 2050 vs. 2450 cycles
         else
             c->sub_motion_search= hpel_motion_search;
-        c->hpel_avg= s->dsp.avg_pixels_tab;
-        if(s->no_rounding) c->hpel_put= s->dsp.put_no_rnd_pixels_tab;
-        else               c->hpel_put= s->dsp.put_pixels_tab;
     }
+    c->hpel_avg= s->dsp.avg_pixels_tab;
+    if(s->no_rounding) c->hpel_put= s->dsp.put_no_rnd_pixels_tab;
+    else               c->hpel_put= s->dsp.put_pixels_tab;
+
     if(s->linesize){
         c->stride  = s->linesize; 
         c->uvstride= s->uvlinesize;
@@ -281,6 +270,22 @@ void ff_init_me(MpegEncContext *s){
         c->uvstride=  8*s->mb_width + 16;
     }
 
+    // 8x8 fullpel search would need a 4x4 chroma compare, which we don't have yet, and even if we had it, the motion estimation code doesn't expect it
+    if(s->codec_id != CODEC_ID_SNOW){
+        if((c->avctx->me_cmp&FF_CMP_CHROMA) && !s->dsp.me_cmp[2]){
+            s->dsp.me_cmp[2]= zero_cmp;
+        }
+        if((c->avctx->me_sub_cmp&FF_CMP_CHROMA) && !s->dsp.me_sub_cmp[2]){
+            s->dsp.me_sub_cmp[2]= zero_cmp;
+        }
+        c->hpel_put[2][0]= c->hpel_put[2][1]=
+        c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel;
+    }
+
+    if(s->codec_id == CODEC_ID_H261){
+        c->sub_motion_search= no_sub_motion_search;
+    }
+
     c->temp= c->scratchpad;
 }
       
@@ -315,6 +320,7 @@ static inline void no_motion_search(MpegEncContext * s,
     *my_ptr = 16 * s->mb_y;
 }
 
+#if 0  /* the use of these functions is inside #if 0 */
 static int full_motion_search(MpegEncContext * s,
                               int *mx_ptr, int *my_ptr, int range,
                               int xmin, int ymin, int xmax, int ymax, uint8_t *ref_picture)
@@ -535,7 +541,7 @@ static int phods_motion_search(MpegEncContext * s,
     *my_ptr = my;
     return dminy;
 }
-
+#endif /* 0 */
 
 #define Z_THRESHOLD 256
 
@@ -693,6 +699,12 @@ static inline void get_limits(MpegEncContext *s, int x, int y)
         c->ymin = - y - 16;
         c->xmax = - x + s->mb_width *16;
         c->ymax = - y + s->mb_height*16;
+    } else if (s->out_format == FMT_H261){
+        // Search range of H261 is different from other codec standards
+        c->xmin = (x > 15) ? - 15 : 0;
+        c->ymin = (y > 15) ? - 15 : 0;
+        c->xmax = (x < s->mb_width * 16 - 16) ? 15 : 0;              
+        c->ymax = (y < s->mb_height * 16 - 16) ? 15 : 0;
     } else {
         c->xmin = - x;
         c->ymin = - y;
@@ -722,7 +734,6 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift)
     int dmin_sum=0, mx4_sum=0, my4_sum=0;
     int same=1;
     const int stride= c->stride;
-    const int uvstride= c->uvstride;
     uint8_t *mv_penalty= c->current_mv_penalty;
 
     init_mv4_ref(c);
@@ -873,7 +884,6 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
     uint8_t * const mv_penalty= c->current_mv_penalty;
     int same=1;
     const int stride= 2*s->linesize;
-    const int uvstride= 2*s->uvlinesize;
     int dmin_sum= 0;
     const int mot_stride= s->mb_stride;
     const int xy= s->mb_x + s->mb_y*mot_stride;
@@ -983,6 +993,16 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
     }
 }
 
+static void clip_input_mv(MpegEncContext * s, int16_t *mv, int interlaced){ /* clamps mv[0]/mv[1] in place to the s->me limits */
+    int ymax= s->me.ymax>>interlaced; /* vertical limits are right-shifted by interlaced (0 or 1 at the visible call sites) */
+    int ymin= s->me.ymin>>interlaced;
+    
+    if(mv[0] < s->me.xmin) mv[0] = s->me.xmin;
+    if(mv[0] > s->me.xmax) mv[0] = s->me.xmax;
+    if(mv[1] <       ymin) mv[1] =       ymin;
+    if(mv[1] >       ymax) mv[1] =       ymax;
+}
+
 static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int p_type){
     MotionEstContext * const c= &s->me;
     Picture *p= s->current_picture_ptr;
@@ -997,9 +1017,18 @@ static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int
     me_cmp_func cmpf= s->dsp.sse[0];
     me_cmp_func chroma_cmpf= s->dsp.sse[1];
     
-    assert(p_type==0 || !USES_LIST(mb_type, 1));
+    if(p_type && USES_LIST(mb_type, 1)){
+        av_log(c->avctx, AV_LOG_ERROR, "backward motion vector in P frame\n");
+        return INT_MAX/2;
+    }
     assert(IS_INTRA(mb_type) || USES_LIST(mb_type,0) || USES_LIST(mb_type,1));
     
+    for(i=0; i<4; i++){
+        int xy= s->block_index[i];
+        clip_input_mv(s, p->motion_val[0][xy], !!IS_INTERLACED(mb_type));
+        clip_input_mv(s, p->motion_val[1][xy], !!IS_INTERLACED(mb_type));
+    }
+
     if(IS_INTERLACED(mb_type)){
         int xy2= xy  + s->b8_stride;
         s->mb_type[mb_xy]=CANDIDATE_MB_TYPE_INTRA;
@@ -1008,7 +1037,7 @@ static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int
         
         if(!(s->flags & CODEC_FLAG_INTERLACED_ME)){
             av_log(c->avctx, AV_LOG_ERROR, "Interlaced macroblock selected but interlaced motion estimation disabled\n");
-            return -1;
+            return INT_MAX/2;
         }
 
         if(USES_LIST(mb_type, 0)){
@@ -1069,7 +1098,7 @@ static inline int check_input_motion(MpegEncContext * s, int mb_x, int mb_y, int
     }else if(IS_8X8(mb_type)){
         if(!(s->flags & CODEC_FLAG_4MV)){
             av_log(c->avctx, AV_LOG_ERROR, "4MV macroblock selected but 4MV encoding disabled\n");
-            return -1;
+            return INT_MAX/2;
         }
         cmpf= s->dsp.sse[1];
         chroma_cmpf= s->dsp.sse[1];
@@ -1127,9 +1156,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
     assert(s->linesize == c->stride);
     assert(s->uvlinesize == c->uvstride);
 
-    c->penalty_factor    = get_penalty_factor(s, c->avctx->me_cmp);
-    c->sub_penalty_factor= get_penalty_factor(s, c->avctx->me_sub_cmp);
-    c->mb_penalty_factor = get_penalty_factor(s, c->avctx->mb_cmp);
+    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
+    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
+    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
     c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
@@ -1222,7 +1251,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
             }
 
         }
-        dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift);       
+        dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift, 0, 16);       
 
         break;
     }
@@ -1298,7 +1327,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 
         dmin= c->sub_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
         if(c->avctx->me_sub_cmp != c->avctx->mb_cmp && !c->skip)
-            dmin= get_mb_score(s, mx, my, 0, 0);
+            dmin= ff_get_mb_score(s, mx, my, 0, 0, 0, 16, 1);
 
         if((s->flags&CODEC_FLAG_4MV)
            && !c->skip && varc>50 && vard>10){
@@ -1390,7 +1419,7 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
     
     assert(s->quarter_sample==0 || s->quarter_sample==1);
 
-    c->pre_penalty_factor    = get_penalty_factor(s, c->avctx->me_pre_cmp);
+    c->pre_penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_pre_cmp);
     c->current_mv_penalty= c->mv_penalty[s->f_code] + MAX_MV;
 
     get_limits(s, 16*mb_x, 16*mb_y);
@@ -1423,7 +1452,7 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
         c->pred_y = P_MEDIAN[1];
     }
 
-    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift);       
+    dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, 0, s->p_mv_table, (1<<16)>>shift, 0, 16);       
 
     s->p_mv_table[xy][0] = mx<<shift;
     s->p_mv_table[xy][1] = my<<shift;
@@ -1443,9 +1472,9 @@ static int ff_estimate_motion_b(MpegEncContext * s,
     uint8_t * const mv_penalty= c->mv_penalty[f_code] + MAX_MV;
     int mv_scale;
         
-    c->penalty_factor    = get_penalty_factor(s, c->avctx->me_cmp);
-    c->sub_penalty_factor= get_penalty_factor(s, c->avctx->me_sub_cmp);
-    c->mb_penalty_factor = get_penalty_factor(s, c->avctx->mb_cmp);
+    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
+    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
+    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
     c->current_mv_penalty= mv_penalty;
 
     get_limits(s, 16*mb_x, 16*mb_y);
@@ -1506,7 +1535,7 @@ static int ff_estimate_motion_b(MpegEncContext * s,
             mv_scale= ((s->pb_time - s->pp_time)<<16) / (s->pp_time<<shift);
         }
         
-        dmin = epzs_motion_search(s, &mx, &my, P, 0, ref_index, s->p_mv_table, mv_scale);
+        dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, ref_index, s->p_mv_table, mv_scale, 0, 16);
  
         break;
     }
@@ -1514,7 +1543,7 @@ static int ff_estimate_motion_b(MpegEncContext * s,
     dmin= c->sub_motion_search(s, &mx, &my, dmin, 0, ref_index, 0, 16);
                                    
     if(c->avctx->me_sub_cmp != c->avctx->mb_cmp && !c->skip)
-        dmin= get_mb_score(s, mx, my, 0, ref_index);
+        dmin= ff_get_mb_score(s, mx, my, 0, ref_index, 0, 16, 1);
 
 //printf("%d %d %d %d//", s->mb_x, s->mb_y, mx, my);
 //    s->mb_type[mb_y*s->mb_width + mb_x]= mb_type;
@@ -1697,14 +1726,14 @@ static inline int direct_search(MpegEncContext * s, int mb_x, int mb_y)
         P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
     }
  
-    dmin = epzs_motion_search(s, &mx, &my, P, 0, 0, mv_table, 1<<(16-shift));
+    dmin = ff_epzs_motion_search(s, &mx, &my, P, 0, 0, mv_table, 1<<(16-shift), 0, 16);
     if(c->sub_flags&FLAG_QPEL) 
         dmin = qpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
     else
         dmin = hpel_motion_search(s, &mx, &my, dmin, 0, 0, 0, 16);
     
     if(c->avctx->me_sub_cmp != c->avctx->mb_cmp && !c->skip)
-        dmin= get_mb_score(s, mx, my, 0, 0);
+        dmin= ff_get_mb_score(s, mx, my, 0, 0, 0, 16, 1);
     
     get_limits(s, 16*mb_x, 16*mb_y); //restore c->?min/max, maybe not needed
 
@@ -1726,6 +1755,7 @@ void ff_estimate_b_frame_motion(MpegEncContext * s,
     const int xy = mb_y*s->mb_stride + mb_x;
     init_ref(c, s->new_picture.data, s->last_picture.data, s->next_picture.data, 16*mb_x, 16*mb_y, 2);
 
+    get_limits(s, 16*mb_x, 16*mb_y);
     
     c->skip=0;
     if(c->avctx->me_threshold){
@@ -1864,7 +1894,7 @@ int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
 {
     if(s->me_method>=ME_EPZS){
         int score[8];
-        int i, y;
+        int i, y, range= s->avctx->me_range;
         uint8_t * fcode_tab= s->fcode_tab;
         int best_fcode=-1;
         int best_score=-10000000;
@@ -1876,10 +1906,18 @@ int ff_get_best_fcode(MpegEncContext * s, int16_t (*mv_table)[2], int type)
             int xy= y*s->mb_stride;
             for(x=0; x<s->mb_width; x++){
                 if(s->mb_type[xy] & type){
-                    int fcode= FFMAX(fcode_tab[mv_table[xy][0] + MAX_MV],
-                                     fcode_tab[mv_table[xy][1] + MAX_MV]);
+                    int mx= mv_table[xy][0];
+                    int my= mv_table[xy][1];
+                    int fcode= FFMAX(fcode_tab[mx + MAX_MV],
+                                     fcode_tab[my + MAX_MV]);
                     int j;
                     
+                    if(range){
+                        if(mx >= range || mx < -range || 
+                           my >= range || my < -range)
+                            continue;
+                    }
+                    
                     for(j=0; j<fcode && j<8; j++){
                         if(s->pict_type==B_TYPE || s->current_picture.mc_mb_var[xy] < s->current_picture.mb_var[xy])
                             score[j]-= 170;
diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c
index 8cfb24955..d8101ec33 100644
--- a/src/libffmpeg/libavcodec/motion_est_template.c
+++ b/src/libffmpeg/libavcodec/motion_est_template.c
@@ -25,11 +25,11 @@
 
 //lets hope gcc will remove the unused vars ...(gcc 3.2.2 seems to do it ...)
 #define LOAD_COMMON\
-    uint32_t * const score_map= c->score_map;\
-    const int xmin= c->xmin;\
-    const int ymin= c->ymin;\
-    const int xmax= c->xmax;\
-    const int ymax= c->ymax;\
+    uint32_t attribute_unused * const score_map= c->score_map;\
+    const int attribute_unused xmin= c->xmin;\
+    const int attribute_unused ymin= c->ymin;\
+    const int attribute_unused xmax= c->xmax;\
+    const int attribute_unused ymax= c->ymax;\
     uint8_t *mv_penalty= c->current_mv_penalty;\
     const int pred_x= c->pred_x;\
     const int pred_y= c->pred_y;\
@@ -221,13 +221,21 @@ static int hpel_motion_search(MpegEncContext * s,
 }
 #endif
 
-static int inline get_mb_score(MpegEncContext * s, int mx, int my, int src_index,
-                               int ref_index)
+static int no_sub_motion_search(MpegEncContext * s, /* sub_motion_search variant that does no refinement */
+          int *mx_ptr, int *my_ptr, int dmin,
+                                  int src_index, int ref_index,
+                                  int size, int h)
+{
+    (*mx_ptr)<<=1; /* NOTE(review): presumably converts full-pel to half-pel units - confirm */
+    (*my_ptr)<<=1;
+    return dmin; /* the incoming score is returned unchanged */
+}
+
+int inline ff_get_mb_score(MpegEncContext * s, int mx, int my, int src_index,
+                               int ref_index, int size, int h, int add_rate)
 {
 //    const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp;
     MotionEstContext * const c= &s->me;
-    const int size= 0;
-    const int h= 16;
     const int penalty_factor= c->mb_penalty_factor;
     const int flags= c->mb_flags;
     const int qpel= flags & FLAG_QPEL;
@@ -242,12 +250,12 @@ static int inline get_mb_score(MpegEncContext * s, int mx, int my, int src_index
     cmp_sub= s->dsp.mb_cmp[size];
     chroma_cmp_sub= s->dsp.mb_cmp[size+1];
     
-    assert(!c->skip);
-    assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp);
+//    assert(!c->skip);
+//    assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp);
 
     d= cmp(s, mx>>(qpel+1), my>>(qpel+1), mx&mask, my&mask, size, h, ref_index, src_index, cmp_sub, chroma_cmp_sub, flags);
     //FIXME check cbp before adding penalty for (0,0) vector
-    if(mx || my || size>0)
+    if(add_rate && (mx || my || size>0))
         d += (mv_penalty[mx - pred_x] + mv_penalty[my - pred_y])*penalty_factor;
         
     return d;
@@ -323,15 +331,16 @@ static int qpel_motion_search(MpegEncContext * s,
 
             for(ny= -3; ny <= 3; ny++){
                 for(nx= -3; nx <= 3; nx++){
-                    const int t2= nx*nx*(tr + tl - 2*t) + 4*nx*(tr-tl) + 32*t;
-                    const int c2= nx*nx*( r +  l - 2*c) + 4*nx*( r- l) + 32*c;
-                    const int b2= nx*nx*(br + bl - 2*b) + 4*nx*(br-bl) + 32*b;
-                    int score= ny*ny*(b2 + t2 - 2*c2) + 4*ny*(b2 - t2) + 32*c2;
+                    //FIXME this could overflow (unlikely though)
+                    const int64_t t2= nx*nx*(tr + tl - 2*t) + 4*nx*(tr-tl) + 32*t;
+                    const int64_t c2= nx*nx*( r +  l - 2*c) + 4*nx*( r- l) + 32*c;
+                    const int64_t b2= nx*nx*(br + bl - 2*b) + 4*nx*(br-bl) + 32*b;
+                    int score= (ny*ny*(b2 + t2 - 2*c2) + 4*ny*(b2 - t2) + 32*c2 + 512)>>10;
                     int i;
                     
                     if((nx&3)==0 && (ny&3)==0) continue;
                     
-                    score += 1024*(mv_penalty[4*mx + nx - pred_x] + mv_penalty[4*my + ny - pred_y])*penalty_factor;
+                    score += (mv_penalty[4*mx + nx - pred_x] + mv_penalty[4*my + ny - pred_y])*penalty_factor;
                     
 //                    if(nx&1) score-=1024*c->penalty_factor;
 //                    if(ny&1) score-=1024*c->penalty_factor;
@@ -350,6 +359,7 @@ static int qpel_motion_search(MpegEncContext * s,
             }
         }else{
             int tl;
+            //FIXME this could overflow (unlikely though)
             const int cx = 4*(r - l);
             const int cx2= r + l - 2*c; 
             const int cy = 4*(b - t);
@@ -372,6 +382,7 @@ static int qpel_motion_search(MpegEncContext * s,
             
             for(ny= -3; ny <= 3; ny++){
                 for(nx= -3; nx <= 3; nx++){
+                    //FIXME this could overflow (unlikely though)
                     int score= ny*nx*cxy + nx*nx*cx2 + ny*ny*cy2 + nx*cx + ny*cy + 32*c; //FIXME factor
                     int i;
                     
@@ -487,6 +498,10 @@ static int qpel_motion_search(MpegEncContext * s,
 {\
     const int key= ((y)<<ME_MAP_MV_BITS) + (x) + map_generation;\
     const int index= (((y)<<ME_MAP_SHIFT) + (x))&(ME_MAP_SIZE-1);\
+    assert((x) >= xmin);\
+    assert((x) <= xmax);\
+    assert((y) >= ymin);\
+    assert((y) <= ymax);\
 /*printf("check_mv %d %d\n", x, y);*/\
     if(map[index]!=key){\
         d= cmp(s, x, y, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);\
@@ -670,7 +685,7 @@ if(256*256*256*64 % (stats[0]+1)==0){
     }\
 }
 
-#define MAX_SAB_SIZE 16
+#define MAX_SAB_SIZE ME_MAP_SIZE
 static int sab_diamond_search(MpegEncContext * s, int *best, int dmin,
                                        int src_index, int ref_index, int const penalty_factor,
                                        int size, int h, int flags)
@@ -844,15 +859,13 @@ static always_inline int diamond_search(MpegEncContext * s, int *best, int dmin,
 
 static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx_ptr, int *my_ptr,
                              int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], 
-                             int ref_mv_scale, int flags)
+                             int ref_mv_scale, int flags, int size, int h)
 {
     MotionEstContext * const c= &s->me;
     int best[2]={0, 0};
     int d, dmin;
     int map_generation;
-    const int penalty_factor= c->penalty_factor;
-    const int size=0;
-    const int h=16;
+    int penalty_factor;
     const int ref_mv_stride= s->mb_stride; //pass as arg  FIXME
     const int ref_mv_xy= s->mb_x + s->mb_y*ref_mv_stride; //add to last_mv beforepassing FIXME
     me_cmp_func cmpf, chroma_cmpf;
@@ -860,11 +873,19 @@ static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx
     LOAD_COMMON
     LOAD_COMMON2
     
-    cmpf= s->dsp.me_cmp[size];
-    chroma_cmpf= s->dsp.me_cmp[size+1];
+    if(c->pre_pass){
+        penalty_factor= c->pre_penalty_factor;
+        cmpf= s->dsp.me_pre_cmp[size];
+        chroma_cmpf= s->dsp.me_pre_cmp[size+1];
+    }else{
+        penalty_factor= c->penalty_factor;
+        cmpf= s->dsp.me_cmp[size];
+        chroma_cmpf= s->dsp.me_cmp[size+1];
+    }
     
     map_generation= update_map_generation(c);
 
+    assert(cmpf);
     dmin= cmp(s, 0, 0, 0, 0, size, h, ref_index, src_index, cmpf, chroma_cmpf, flags);
     map[0]= map_generation;
     score_map[0]= dmin;
@@ -875,7 +896,7 @@ static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx
         CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
                         (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
     }else{
-        if(dmin<256 && ( P_LEFT[0]    |P_LEFT[1]
+        if(dmin<h*h && ( P_LEFT[0]    |P_LEFT[1]
                         |P_TOP[0]     |P_TOP[1]
                         |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){
             *mx_ptr= 0;
@@ -884,7 +905,7 @@ static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx
             return dmin;
         }
         CHECK_MV(P_MEDIAN[0]>>shift, P_MEDIAN[1]>>shift)
-        if(dmin>256*2){
+        if(dmin>h*h*2){
             CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16, 
                             (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
             CHECK_MV(P_LEFT[0]    >>shift, P_LEFT[1]    >>shift)
@@ -892,7 +913,7 @@ static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx
             CHECK_MV(P_TOPRIGHT[0]>>shift, P_TOPRIGHT[1]>>shift)
         }
     }
-    if(dmin>256*4){
+    if(dmin>h*h*4){
         if(c->pre_pass){
             CHECK_CLIPED_MV((last_mv[ref_mv_xy-1][0]*ref_mv_scale + (1<<15))>>16, 
                             (last_mv[ref_mv_xy-1][1]*ref_mv_scale + (1<<15))>>16)
@@ -941,19 +962,18 @@ static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx
 }
 
 //this function is dedicated to the braindamaged gcc
-static inline int epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr,
+inline int ff_epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr, /* wrapper around epzs_motion_search_internal(); returns its result */
                              int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], 
-                             int ref_mv_scale)
+                             int ref_mv_scale, int size, int h) /* size and h are forwarded to the internal search */
 {
     MotionEstContext * const c= &s->me;
 //FIXME convert other functions in the same way if faster
-    switch(c->flags){
-    case 0:
-        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, 0);
+    if(c->flags==0 && h==16 && size==0){ /* case flags==0, size==0, h==16: pass literals (presumably so the always_inline body can be specialised -- confirm) */
+        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, 0, 0, 16);
 //    case FLAG_QPEL:
 //        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, FLAG_QPEL);
-    default:
-        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, c->flags);
+    }else{ /* every other combination: pass the runtime flags, size and h through */
+        return epzs_motion_search_internal(s, mx_ptr, my_ptr, P, src_index, ref_index, last_mv, ref_mv_scale, c->flags, size, h);
     }
 }
 
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index 872c39c0c..40ef61503 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -65,6 +65,20 @@ static void mpeg1_encode_block(MpegEncContext *s,
                          int component);
 static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code);    // RAL: f_code parameter added
 #endif //CONFIG_ENCODERS
+static inline int mpeg1_decode_block_inter(MpegEncContext *s, 
+                              DCTELEM *block, 
+                              int n); /* forward declarations of the block decoders; mpeg_decode_mb() calls them */
+static inline int mpeg1_decode_block_intra(MpegEncContext *s, 
+                              DCTELEM *block, 
+                              int n);
+static inline int mpeg1_fast_decode_block_inter(MpegEncContext *s, DCTELEM *block, int n); /* called when CODEC_FLAG2_FAST is set; the caller ignores its return value */
+static inline int mpeg2_decode_block_non_intra(MpegEncContext *s, 
+                                        DCTELEM *block, 
+                                        int n);
+static inline int mpeg2_decode_block_intra(MpegEncContext *s, 
+                                    DCTELEM *block, 
+                                    int n);
+static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s, DCTELEM *block, int n); /* called when CODEC_FLAG2_FAST is set; the caller ignores its return value */
 static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred);
 static void exchange_uv(MpegEncContext *s);
 
@@ -97,16 +111,19 @@ static uint8_t mpeg1_index_run[2][64];
 static int8_t mpeg1_max_level[2][64];
 #endif //CONFIG_ENCODERS
 
-static void init_2d_vlc_rl(RLTable *rl)
+static void init_2d_vlc_rl(RLTable *rl, int use_static)
 {
     int i;
     
     init_vlc(&rl->vlc, TEX_VLC_BITS, rl->n + 2, 
              &rl->table_vlc[0][1], 4, 2,
-             &rl->table_vlc[0][0], 4, 2);
+             &rl->table_vlc[0][0], 4, 2, use_static);
+
+    if(use_static)    
+        rl->rl_vlc[0]= av_mallocz_static(rl->vlc.table_size*sizeof(RL_VLC_ELEM));
+    else
+        rl->rl_vlc[0]= av_malloc(rl->vlc.table_size*sizeof(RL_VLC_ELEM));
 
-    
-    rl->rl_vlc[0]= av_malloc(rl->vlc.table_size*sizeof(RL_VLC_ELEM));
     for(i=0; i<rl->vlc.table_size; i++){
         int code= rl->vlc.table[i][0];
         int len = rl->vlc.table[i][1];
@@ -296,6 +313,7 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
                 s->mb_width * s->mb_height <= 396 &&
                 s->mb_width * s->mb_height * framerate.num <= framerate.den*396*25 &&
                 framerate.num <= framerate.den*30 &&
+                s->avctx->me_range && s->avctx->me_range < 128 &&
                 vbv_buffer_size <= 20 &&
                 v <= 1856000/400 &&
                 s->codec_id == CODEC_ID_MPEG1VIDEO;
@@ -309,8 +327,19 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
                 put_header(s, EXT_START_CODE);
                 put_bits(&s->pb, 4, 1); //seq ext
                 put_bits(&s->pb, 1, 0); //esc
-                put_bits(&s->pb, 3, 4); //profile
-                put_bits(&s->pb, 4, 8); //level
+                
+                if(s->avctx->profile == FF_PROFILE_UNKNOWN){
+                    put_bits(&s->pb, 3, 4); //profile
+                }else{
+                    put_bits(&s->pb, 3, s->avctx->profile); //profile
+                }
+
+                if(s->avctx->level == FF_LEVEL_UNKNOWN){
+                    put_bits(&s->pb, 4, 8); //level
+                }else{
+                    put_bits(&s->pb, 4, s->avctx->level); //level
+                }
+
                 put_bits(&s->pb, 1, s->progressive_sequence);
                 put_bits(&s->pb, 2, 1); //chroma format 4:2:0
                 put_bits(&s->pb, 2, 0); //horizontal size ext
@@ -691,7 +720,7 @@ void mpeg1_encode_mb(MpegEncContext *s,
 // RAL: Parameter added: f_or_b_code
 static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code)
 {
-    int code, bit_size, l, m, bits, range, sign;
+    int code, bit_size, l, bits, range, sign;
 
     if (val == 0) {
         /* zero vector */
@@ -703,13 +732,8 @@ static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code)
         bit_size = f_or_b_code - 1;
         range = 1 << bit_size;
         /* modulo encoding */
-        l = 16 * range;
-        m = 2 * l;
-        if (val < -l) {
-            val += m;
-        } else if (val >= l) {
-            val -= m;
-        }
+        l= INT_BIT - 5 - bit_size;
+        val= (val<<l)>>l;
 
         if (val >= 0) {
             val--;
@@ -749,7 +773,7 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
 	int i;
 
         done=1;
-        init_rl(&rl_mpeg1);
+        init_rl(&rl_mpeg1, 1);
 
 	for(i=0; i<64; i++)
 	{
@@ -977,31 +1001,31 @@ static void init_vlcs(void)
 
         init_vlc(&dc_lum_vlc, DC_VLC_BITS, 12, 
                  vlc_dc_lum_bits, 1, 1,
-                 vlc_dc_lum_code, 2, 2);
+                 vlc_dc_lum_code, 2, 2, 1);
         init_vlc(&dc_chroma_vlc,  DC_VLC_BITS, 12, 
                  vlc_dc_chroma_bits, 1, 1,
-                 vlc_dc_chroma_code, 2, 2);
+                 vlc_dc_chroma_code, 2, 2, 1);
         init_vlc(&mv_vlc, MV_VLC_BITS, 17, 
                  &mbMotionVectorTable[0][1], 2, 1,
-                 &mbMotionVectorTable[0][0], 2, 1);
+                 &mbMotionVectorTable[0][0], 2, 1, 1);
         init_vlc(&mbincr_vlc, MBINCR_VLC_BITS, 36, 
                  &mbAddrIncrTable[0][1], 2, 1,
-                 &mbAddrIncrTable[0][0], 2, 1);
+                 &mbAddrIncrTable[0][0], 2, 1, 1);
         init_vlc(&mb_pat_vlc, MB_PAT_VLC_BITS, 64,
                  &mbPatTable[0][1], 2, 1,
-                 &mbPatTable[0][0], 2, 1);
+                 &mbPatTable[0][0], 2, 1, 1);
         
         init_vlc(&mb_ptype_vlc, MB_PTYPE_VLC_BITS, 7, 
                  &table_mb_ptype[0][1], 2, 1,
-                 &table_mb_ptype[0][0], 2, 1);
+                 &table_mb_ptype[0][0], 2, 1, 1);
         init_vlc(&mb_btype_vlc, MB_BTYPE_VLC_BITS, 11, 
                  &table_mb_btype[0][1], 2, 1,
-                 &table_mb_btype[0][0], 2, 1);
-        init_rl(&rl_mpeg1);
-        init_rl(&rl_mpeg2);
+                 &table_mb_btype[0][0], 2, 1, 1);
+        init_rl(&rl_mpeg1, 1);
+        init_rl(&rl_mpeg2, 1);
 
-        init_2d_vlc_rl(&rl_mpeg1);
-        init_2d_vlc_rl(&rl_mpeg2);
+        init_2d_vlc_rl(&rl_mpeg1, 1);
+        init_2d_vlc_rl(&rl_mpeg2, 1);
     }
 }
 
@@ -1023,337 +1047,6 @@ static inline int get_qscale(MpegEncContext *s)
     }
 }
 
-static inline int decode_dc(GetBitContext *gb, int component)
-{
-    int code, diff;
-
-    if (component == 0) {
-        code = get_vlc2(gb, dc_lum_vlc.table, DC_VLC_BITS, 2);
-    } else {
-        code = get_vlc2(gb, dc_chroma_vlc.table, DC_VLC_BITS, 2);
-    }
-    if (code < 0){
-        av_log(NULL, AV_LOG_ERROR, "invalid dc code at\n");
-        return 0xffff;
-    }
-    if (code == 0) {
-        diff = 0;
-    } else {
-        diff = get_xbits(gb, code);
-    }
-    return diff;
-}
-
-static inline int mpeg1_decode_block_intra(MpegEncContext *s, 
-                               DCTELEM *block, 
-                               int n)
-{
-    int level, dc, diff, i, j, run;
-    int component;
-    RLTable *rl = &rl_mpeg1;
-    uint8_t * const scantable= s->intra_scantable.permutated;
-    const uint16_t *quant_matrix= s->intra_matrix;
-    const int qscale= s->qscale;
-
-    /* DC coef */
-    component = (n <= 3 ? 0 : n - 4 + 1);
-    diff = decode_dc(&s->gb, component);
-    if (diff >= 0xffff)
-        return -1;
-    dc = s->last_dc[component];
-    dc += diff;
-    s->last_dc[component] = dc;
-    block[0] = dc<<3;
-    dprintf("dc=%d diff=%d\n", dc, diff);
-    i = 0;
-    {
-        OPEN_READER(re, &s->gb);    
-        /* now quantify & encode AC coefs */
-        for(;;) {
-            UPDATE_CACHE(re, &s->gb);
-            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2);
-            
-            if(level == 127){
-                break;
-            } else if(level != 0) {
-                i += run;
-                j = scantable[i];
-                level= (level*qscale*quant_matrix[j])>>4;
-                level= (level-1)|1;
-                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                LAST_SKIP_BITS(re, &s->gb, 1);
-            } else {
-                /* escape */
-                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
-                UPDATE_CACHE(re, &s->gb);
-                level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
-                if (level == -128) {
-                    level = SHOW_UBITS(re, &s->gb, 8) - 256; LAST_SKIP_BITS(re, &s->gb, 8);
-                } else if (level == 0) {
-                    level = SHOW_UBITS(re, &s->gb, 8)      ; LAST_SKIP_BITS(re, &s->gb, 8);
-                }
-                i += run;
-                j = scantable[i];
-                if(level<0){
-                    level= -level;
-                    level= (level*qscale*quant_matrix[j])>>4;
-                    level= (level-1)|1;
-                    level= -level;
-                }else{
-                    level= (level*qscale*quant_matrix[j])>>4;
-                    level= (level-1)|1;
-                }
-            }
-            if (i > 63){
-                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-
-            block[j] = level;
-        }
-        CLOSE_READER(re, &s->gb);
-    }
-    s->block_last_index[n] = i;
-   return 0;
-}
-
-static inline int mpeg1_decode_block_inter(MpegEncContext *s, 
-                               DCTELEM *block, 
-                               int n)
-{
-    int level, i, j, run;
-    RLTable *rl = &rl_mpeg1;
-    uint8_t * const scantable= s->intra_scantable.permutated;
-    const uint16_t *quant_matrix= s->inter_matrix;
-    const int qscale= s->qscale;
-
-    {
-        int v;
-        OPEN_READER(re, &s->gb);
-        i = -1;
-        /* special case for the first coef. no need to add a second vlc table */
-        UPDATE_CACHE(re, &s->gb);
-        v= SHOW_UBITS(re, &s->gb, 2);
-        if (v & 2) {
-            LAST_SKIP_BITS(re, &s->gb, 2);
-            level= (3*qscale*quant_matrix[0])>>5;
-            level= (level-1)|1;
-            if(v&1)
-                level= -level;
-            block[0] = level;
-            i++;
-        }
-
-        /* now quantify & encode AC coefs */
-        for(;;) {
-            UPDATE_CACHE(re, &s->gb);
-            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2);
-            
-            if(level == 127){
-                break;
-            } else if(level != 0) {
-                i += run;
-                j = scantable[i];
-                level= ((level*2+1)*qscale*quant_matrix[j])>>5;
-                level= (level-1)|1;
-                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                LAST_SKIP_BITS(re, &s->gb, 1);
-            } else {
-                /* escape */
-                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
-                UPDATE_CACHE(re, &s->gb);
-                level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
-                if (level == -128) {
-                    level = SHOW_UBITS(re, &s->gb, 8) - 256; LAST_SKIP_BITS(re, &s->gb, 8);
-                } else if (level == 0) {
-                    level = SHOW_UBITS(re, &s->gb, 8)      ; LAST_SKIP_BITS(re, &s->gb, 8);
-                }
-                i += run;
-                j = scantable[i];
-                if(level<0){
-                    level= -level;
-                    level= ((level*2+1)*qscale*quant_matrix[j])>>5;
-                    level= (level-1)|1;
-                    level= -level;
-                }else{
-                    level= ((level*2+1)*qscale*quant_matrix[j])>>5;
-                    level= (level-1)|1;
-                }
-            }
-            if (i > 63){
-                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-
-            block[j] = level;
-        }
-        CLOSE_READER(re, &s->gb);
-    }
-    s->block_last_index[n] = i;
-    return 0;
-}
-
-/* Also does unquantization here, since I will never support mpeg2
-   encoding */
-static inline int mpeg2_decode_block_non_intra(MpegEncContext *s, 
-                               DCTELEM *block, 
-                               int n)
-{
-    int level, i, j, run;
-    RLTable *rl = &rl_mpeg1;
-    uint8_t * const scantable= s->intra_scantable.permutated;
-    const uint16_t *quant_matrix;
-    const int qscale= s->qscale;
-    int mismatch;
-
-    mismatch = 1;
-
-    {
-        int v;
-        OPEN_READER(re, &s->gb);
-        i = -1;
-        if (n < 4)
-            quant_matrix = s->inter_matrix;
-        else
-            quant_matrix = s->chroma_inter_matrix;
-
-        /* special case for the first coef. no need to add a second vlc table */
-        UPDATE_CACHE(re, &s->gb);
-        v= SHOW_UBITS(re, &s->gb, 2);
-        if (v & 2) {
-            LAST_SKIP_BITS(re, &s->gb, 2);
-            level= (3*qscale*quant_matrix[0])>>5;
-            if(v&1)
-                level= -level;
-            block[0] = level;
-            mismatch ^= level;
-            i++;
-        }
-
-        /* now quantify & encode AC coefs */
-        for(;;) {
-            UPDATE_CACHE(re, &s->gb);
-            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2);
-            
-            if(level == 127){
-                break;
-            } else if(level != 0) {
-                i += run;
-                j = scantable[i];
-                level= ((level*2+1)*qscale*quant_matrix[j])>>5;
-                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                LAST_SKIP_BITS(re, &s->gb, 1);
-            } else {
-                /* escape */
-                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
-                UPDATE_CACHE(re, &s->gb);
-                level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
-
-                i += run;
-                j = scantable[i];
-                if(level<0){
-                    level= ((-level*2+1)*qscale*quant_matrix[j])>>5;
-                    level= -level;
-                }else{
-                    level= ((level*2+1)*qscale*quant_matrix[j])>>5;
-                }
-            }
-            if (i > 63){
-                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-            
-            mismatch ^= level;
-            block[j] = level;
-        }
-        CLOSE_READER(re, &s->gb);
-    }
-    block[63] ^= (mismatch & 1);
-    
-    s->block_last_index[n] = i;
-    return 0;
-}
-
-static inline int mpeg2_decode_block_intra(MpegEncContext *s, 
-                               DCTELEM *block, 
-                               int n)
-{
-    int level, dc, diff, i, j, run;
-    int component;
-    RLTable *rl;
-    uint8_t * const scantable= s->intra_scantable.permutated;
-    const uint16_t *quant_matrix;
-    const int qscale= s->qscale;
-    int mismatch;
-
-    /* DC coef */
-    if (n < 4){
-        quant_matrix = s->intra_matrix;
-        component = 0; 
-    }else{
-        quant_matrix = s->chroma_intra_matrix;
-        component = (n&1) + 1;
-    }
-    diff = decode_dc(&s->gb, component);
-    if (diff >= 0xffff)
-        return -1;
-    dc = s->last_dc[component];
-    dc += diff;
-    s->last_dc[component] = dc;
-    block[0] = dc << (3 - s->intra_dc_precision);
-    dprintf("dc=%d\n", block[0]);
-    mismatch = block[0] ^ 1;
-    i = 0;
-    if (s->intra_vlc_format)
-        rl = &rl_mpeg2;
-    else
-        rl = &rl_mpeg1;
-
-    {
-        OPEN_READER(re, &s->gb);    
-        /* now quantify & encode AC coefs */
-        for(;;) {
-            UPDATE_CACHE(re, &s->gb);
-            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2);
-            
-            if(level == 127){
-                break;
-            } else if(level != 0) {
-                i += run;
-                j = scantable[i];
-                level= (level*qscale*quant_matrix[j])>>4;
-                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                LAST_SKIP_BITS(re, &s->gb, 1);
-            } else {
-                /* escape */
-                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
-                UPDATE_CACHE(re, &s->gb);
-                level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
-                i += run;
-                j = scantable[i];
-                if(level<0){
-                    level= (-level*qscale*quant_matrix[j])>>4;
-                    level= -level;
-                }else{
-                    level= (level*qscale*quant_matrix[j])>>4;
-                }
-            }
-            if (i > 63){
-                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                return -1;
-            }
-            
-            mismatch^= level;
-            block[j] = level;
-        }
-        CLOSE_READER(re, &s->gb);
-    }
-    block[63]^= mismatch&1;
-    
-    s->block_last_index[n] = i;
-    return 0;
-}
-
 /* motion type (for mpeg2) */
 #define MT_FIELD 1
 #define MT_FRAME 2
@@ -1364,7 +1057,8 @@ static int mpeg_decode_mb(MpegEncContext *s,
                           DCTELEM block[12][64])
 {
     int i, j, k, cbp, val, mb_type, motion_type;
-    
+    const int mb_block_count = 4 + (1<< s->chroma_format);
+
     dprintf("decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
 
     assert(s->mb_skiped==0);
@@ -1393,6 +1087,15 @@ static int mpeg_decode_mb(MpegEncContext *s,
             s->mb_skiped = 1;
             s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride ]= MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
         } else {
+            int mb_type;
+            
+            if(s->mb_x)
+                mb_type= s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride - 1];
+            else
+                mb_type= s->current_picture.mb_type[ s->mb_width + (s->mb_y-1)*s->mb_stride - 1]; // FIXME not sure if this is allowed in mpeg at all, 
+            if(IS_INTRA(mb_type))
+                return -1;
+            
             /* if B type, reuse previous vectors and directions */
             s->mv[0][0][0] = s->last_mv[0][0][0];
             s->mv[0][0][1] = s->last_mv[0][0][1];
@@ -1400,7 +1103,7 @@ static int mpeg_decode_mb(MpegEncContext *s,
             s->mv[1][0][1] = s->last_mv[1][0][1];
 
             s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride ]= 
-                s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride - 1] | MB_TYPE_SKIP;
+                mb_type | MB_TYPE_SKIP;
 //            assert(s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride - 1]&(MB_TYPE_16x16|MB_TYPE_16x8));
 
             if((s->mv[0][0][0]|s->mv[0][0][1]|s->mv[1][0][0]|s->mv[1][0][1])==0) 
@@ -1477,7 +1180,7 @@ static int mpeg_decode_mb(MpegEncContext *s,
 #endif
 
         if (s->codec_id == CODEC_ID_MPEG2VIDEO) {
-            for(i=0;i<4+(1<<s->chroma_format);i++) {
+            for(i=0;i<mb_block_count;i++) {
                 if (mpeg2_decode_block_intra(s, s->pblocks[i], i) < 0)
                     return -1;
             }
@@ -1661,11 +1364,9 @@ static int mpeg_decode_mb(MpegEncContext *s,
                 av_log(s->avctx, AV_LOG_ERROR, "invalid cbp at %d %d\n", s->mb_x, s->mb_y);
                 return -1;
             }
-            if(s->chroma_format == 2){//CHROMA422
-                 cbp|= ( get_bits(&s->gb,2) ) << 6;
-            }else
-            if(s->chroma_format >  2){//CHROMA444
-                 cbp|= ( get_bits(&s->gb,6) ) << 6;
+            if(mb_block_count > 6){
+	         cbp<<= mb_block_count-6;
+		 cbp |= get_bits(&s->gb, mb_block_count-6);
             }
 
 #ifdef HAVE_XVMC
@@ -1679,44 +1380,48 @@ static int mpeg_decode_mb(MpegEncContext *s,
 #endif
 
             if (s->codec_id == CODEC_ID_MPEG2VIDEO) {
-                for(i=0;i<6;i++) {
-                    if (cbp & (1<<(5-i)) ) {
-                        if (mpeg2_decode_block_non_intra(s, s->pblocks[i], i) < 0)
-                            return -1;
-                    } else {
-                        s->block_last_index[i] = -1;
-                    }
-                }
-                if (s->chroma_format >= 2) {
-                    if (s->chroma_format == 2) {//CHROMA_422)
-                        for(i=6;i<8;i++) {
-                            if (cbp & (1<<(6+7-i)) ) {
-                                if (mpeg2_decode_block_non_intra(s, s->pblocks[i], i) < 0)
-                                    return -1;
-                            } else {
-                                s->block_last_index[i] = -1;
-                            }
+                if(s->flags2 & CODEC_FLAG2_FAST){
+                    for(i=0;i<6;i++) {
+                        if(cbp & 32) {
+                            mpeg2_fast_decode_block_non_intra(s, s->pblocks[i], i);
+                        } else {
+                            s->block_last_index[i] = -1;
                         }
-                    }else{ /*CHROMA_444*/
-                        for(i=6;i<12;i++) {
-                            if (cbp & (1<<(6+11-i)) ) {
-                                if (mpeg2_decode_block_non_intra(s, s->pblocks[i], i) < 0)
-                                    return -1;
-                            } else {
-                                s->block_last_index[i] = -1;
-                            }
+                        cbp+=cbp;
+                    }
+                }else{
+                    cbp<<= 12-mb_block_count;
+    
+                    for(i=0;i<mb_block_count;i++) {
+                        if ( cbp & (1<<11) ) {
+                            if (mpeg2_decode_block_non_intra(s, s->pblocks[i], i) < 0)
+                                return -1;
+                        } else {
+                            s->block_last_index[i] = -1;
                         }
+                        cbp+=cbp;
                     }
                 }
             } else {
-                for(i=0;i<6;i++) {
-                    if (cbp & 32) {
-                        if (mpeg1_decode_block_inter(s, s->pblocks[i], i) < 0)
-                            return -1;
-                    } else {
-                        s->block_last_index[i] = -1;
+                if(s->flags2 & CODEC_FLAG2_FAST){
+                    for(i=0;i<6;i++) {
+                        if (cbp & 32) {
+                            mpeg1_fast_decode_block_inter(s, s->pblocks[i], i);
+                        } else {
+                            s->block_last_index[i] = -1;
+                        }
+                        cbp+=cbp;
+                    }
+                }else{
+                    for(i=0;i<6;i++) {
+                        if (cbp & 32) {
+                            if (mpeg1_decode_block_inter(s, s->pblocks[i], i) < 0)
+                                return -1;
+                        } else {
+                            s->block_last_index[i] = -1;
+                        }
+                        cbp+=cbp;
                     }
-                    cbp+=cbp;
                 }
             }
         }else{
@@ -1756,11 +1461,471 @@ static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred)
     val += pred;
     
     /* modulo decoding */
-    l = 1 << (shift+4);
-    val = ((val + l)&(l*2-1)) - l;
+    l= INT_BIT - 5 - shift;
+    val = (val<<l)>>l;
     return val;
 }
 
+static inline int decode_dc(GetBitContext *gb, int component)
+{
+    int code, diff;
+    /* Read one DC differential from gb: component 0 uses dc_lum_vlc, any other value uses dc_chroma_vlc. Returns the differential, or 0xffff when the VLC lookup fails. */
+    if (component == 0) {
+        code = get_vlc2(gb, dc_lum_vlc.table, DC_VLC_BITS, 2);
+    } else {
+        code = get_vlc2(gb, dc_chroma_vlc.table, DC_VLC_BITS, 2);
+    }
+    if (code < 0){
+        av_log(NULL, AV_LOG_ERROR, "invalid dc code at\n"); /* NOTE(review): the message ends in "at" but no position is printed */
+        return 0xffff; /* error sentinel; the block decoders in this file test diff >= 0xffff */
+    }
+    if (code == 0) {
+        diff = 0; /* code 0: no further bits are read */
+    } else {
+        diff = get_xbits(gb, code); /* code is passed as the second argument -- presumably a bit count; confirm against get_xbits() */
+    }
+    return diff;
+}
+
+static inline int mpeg1_decode_block_intra(MpegEncContext *s, 
+                               DCTELEM *block, 
+                               int n)
+{
+    int level, dc, diff, i, j, run;
+    int component;
+    RLTable *rl = &rl_mpeg1;
+    uint8_t * const scantable= s->intra_scantable.permutated;
+    const uint16_t *quant_matrix= s->intra_matrix;
+    const int qscale= s->qscale;
+    /* Decode intra block n into block[]: the DC term comes from decode_dc() plus the last_dc predictor, the AC terms are run/level codes scaled by intra_matrix and qscale. Returns 0, or -1 on an invalid DC code or a coefficient index above 63. */
+    /* DC coef */
+    component = (n <= 3 ? 0 : n - 4 + 1); /* blocks 0..3 -> 0, block 4 -> 1, block 5 -> 2 */
+    diff = decode_dc(&s->gb, component);
+    if (diff >= 0xffff) /* decode_dc() error sentinel */
+        return -1;
+    dc = s->last_dc[component];
+    dc += diff;
+    s->last_dc[component] = dc; /* remember the predictor for the next block of this component */
+    block[0] = dc<<3;
+    dprintf("dc=%d diff=%d\n", dc, diff);
+    i = 0;
+    {
+        OPEN_READER(re, &s->gb);    
+        /* now dequantize & decode AC coefs */
+        for(;;) {
+            UPDATE_CACHE(re, &s->gb);
+            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            
+            if(level == 127){ /* level 127 ends the coefficient loop */
+                break;
+            } else if(level != 0) {
+                i += run;
+                j = scantable[i & 63]; /* i may exceed 63 on damaged input: mask so both table reads stay in range; the check below still rejects the block */
+                level= (level*qscale*quant_matrix[j])>>4;
+                level= (level-1)|1; /* result is always odd */
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); /* (x ^ m) - m negates x when m is -1; assumes SHOW_SBITS(..., 1) yields 0 or -1 */
+                LAST_SKIP_BITS(re, &s->gb, 1);
+            } else {
+                /* escape: 6-bit run (+1), 8-bit signed level; -128 and 0 select a second byte with the extended level */
+                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
+                UPDATE_CACHE(re, &s->gb);
+                level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
+                if (level == -128) {
+                    level = SHOW_UBITS(re, &s->gb, 8) - 256; LAST_SKIP_BITS(re, &s->gb, 8);
+                } else if (level == 0) {
+                    level = SHOW_UBITS(re, &s->gb, 8)      ; LAST_SKIP_BITS(re, &s->gb, 8);
+                }
+                i += run;
+                j = scantable[i & 63]; /* same in-range masking as in the non-escape branch */
+                if(level<0){
+                    level= -level;
+                    level= (level*qscale*quant_matrix[j])>>4;
+                    level= (level-1)|1;
+                    level= -level;
+                }else{
+                    level= (level*qscale*quant_matrix[j])>>4;
+                    level= (level-1)|1;
+                }
+            }
+            if (i > 63){
+                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+
+            block[j] = level;
+        }
+        CLOSE_READER(re, &s->gb);
+    }
+    s->block_last_index[n] = i;
+   return 0;
+}
+
+static inline int mpeg1_decode_block_inter(MpegEncContext *s, 
+                               DCTELEM *block, 
+                               int n)
+{
+    int level, i, j, run;
+    RLTable *rl = &rl_mpeg1;
+    uint8_t * const scantable= s->intra_scantable.permutated;
+    const uint16_t *quant_matrix= s->inter_matrix;
+    const int qscale= s->qscale;
+    /* Decode inter block n into block[]: run/level codes are scaled by inter_matrix and qscale; the first coefficient has a 2-bit short form. Returns 0, or -1 when the coefficient index exceeds 63. */
+    {
+        int v;
+        OPEN_READER(re, &s->gb);
+        i = -1;
+        /* special case for the first coef. no need to add a second vlc table */
+        UPDATE_CACHE(re, &s->gb);
+        v= SHOW_UBITS(re, &s->gb, 2);
+        if (v & 2) { /* upper peeked bit set: short form, lower bit is the sign */
+            LAST_SKIP_BITS(re, &s->gb, 2);
+            level= (3*qscale*quant_matrix[0])>>5;
+            level= (level-1)|1; /* result is always odd */
+            if(v&1)
+                level= -level;
+            block[0] = level;
+            i++;
+        }
+
+        /* now dequantize & decode AC coefs */
+        for(;;) {
+            UPDATE_CACHE(re, &s->gb);
+            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            
+            if(level == 127){ /* level 127 ends the coefficient loop */
+                break;
+            } else if(level != 0) {
+                i += run;
+                j = scantable[i & 63]; /* i may exceed 63 on damaged input: mask so both table reads stay in range; the check below still rejects the block */
+                level= ((level*2+1)*qscale*quant_matrix[j])>>5;
+                level= (level-1)|1;
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); /* (x ^ m) - m negates x when m is -1; assumes SHOW_SBITS(..., 1) yields 0 or -1 */
+                LAST_SKIP_BITS(re, &s->gb, 1);
+            } else {
+                /* escape: 6-bit run (+1), 8-bit signed level; -128 and 0 select a second byte with the extended level */
+                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
+                UPDATE_CACHE(re, &s->gb);
+                level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
+                if (level == -128) {
+                    level = SHOW_UBITS(re, &s->gb, 8) - 256; LAST_SKIP_BITS(re, &s->gb, 8);
+                } else if (level == 0) {
+                    level = SHOW_UBITS(re, &s->gb, 8)      ; LAST_SKIP_BITS(re, &s->gb, 8);
+                }
+                i += run;
+                j = scantable[i & 63]; /* same in-range masking as in the non-escape branch */
+                if(level<0){
+                    level= -level;
+                    level= ((level*2+1)*qscale*quant_matrix[j])>>5;
+                    level= (level-1)|1;
+                    level= -level;
+                }else{
+                    level= ((level*2+1)*qscale*quant_matrix[j])>>5;
+                    level= (level-1)|1;
+                }
+            }
+            if (i > 63){
+                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+
+            block[j] = level;
+        }
+        CLOSE_READER(re, &s->gb);
+    }
+    s->block_last_index[n] = i;
+    return 0;
+}
+
+static inline int mpeg1_fast_decode_block_inter(MpegEncContext *s, DCTELEM *block, int n)
+{
+    int level, i, j, run;
+    RLTable *rl = &rl_mpeg1;
+    uint8_t * const scantable= s->intra_scantable.permutated;
+    const int qscale= s->qscale;
+    /* Variant of the inter block decoder that scales levels by qscale only (no quant matrix) and never checks the coefficient index. Always returns 0. */
+    {
+        int v;
+        OPEN_READER(re, &s->gb);
+        i = -1;
+        /* special case for the first coef. no need to add a second vlc table */
+        UPDATE_CACHE(re, &s->gb);
+        v= SHOW_UBITS(re, &s->gb, 2);
+        if (v & 2) { /* upper peeked bit set: short form, lower bit is the sign */
+            LAST_SKIP_BITS(re, &s->gb, 2);
+            level= (3*qscale)>>1;
+            level= (level-1)|1; /* result is always odd */
+            if(v&1)
+                level= -level;
+            block[0] = level;
+            i++;
+        }
+
+        /* now dequantize & decode AC coefs */
+        for(;;) {
+            UPDATE_CACHE(re, &s->gb);
+            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            
+            if(level == 127){ /* level 127 ends the coefficient loop */
+                break;
+            } else if(level != 0) {
+                i += run;
+                j = scantable[i];
+                level= ((level*2+1)*qscale)>>1;
+                level= (level-1)|1;
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); /* (x ^ m) - m negates x when m is -1; assumes SHOW_SBITS(..., 1) yields 0 or -1 */
+                LAST_SKIP_BITS(re, &s->gb, 1);
+            } else {
+                /* escape: 6-bit run (+1), 8-bit signed level; -128 and 0 select a second byte with the extended level */
+                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
+                UPDATE_CACHE(re, &s->gb);
+                level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
+                if (level == -128) {
+                    level = SHOW_UBITS(re, &s->gb, 8) - 256; LAST_SKIP_BITS(re, &s->gb, 8);
+                } else if (level == 0) {
+                    level = SHOW_UBITS(re, &s->gb, 8)      ; LAST_SKIP_BITS(re, &s->gb, 8);
+                }
+                i += run;
+                j = scantable[i];
+                if(level<0){
+                    level= -level;
+                    level= ((level*2+1)*qscale)>>1;
+                    level= (level-1)|1;
+                    level= -level;
+                }else{
+                    level= ((level*2+1)*qscale)>>1;
+                    level= (level-1)|1;
+                }
+            }
+            /* NOTE(review): unlike mpeg1_decode_block_inter() there is no "i > 63" check before this store -- presumably intentional for the fast path; confirm damaged input cannot reach it */
+            block[j] = level;
+        }
+        CLOSE_READER(re, &s->gb);
+    }
+    s->block_last_index[n] = i;
+    return 0;
+}
+
+
+static inline int mpeg2_decode_block_non_intra(MpegEncContext *s, 
+                               DCTELEM *block, 
+                               int n)
+{
+    int level, i, j, run;
+    RLTable *rl = &rl_mpeg1;
+    uint8_t * const scantable= s->intra_scantable.permutated;
+    const uint16_t *quant_matrix;
+    const int qscale= s->qscale;
+    int mismatch;
+    /* Decode non-intra block n into block[] using inter_matrix (n < 4) or chroma_inter_matrix. mismatch collects the XOR of every stored level and its low bit is folded into block[63]. Returns 0, or -1 when the coefficient index exceeds 63. */
+    mismatch = 1;
+
+    {
+        int v;
+        OPEN_READER(re, &s->gb);
+        i = -1;
+        if (n < 4)
+            quant_matrix = s->inter_matrix;
+        else
+            quant_matrix = s->chroma_inter_matrix;
+
+        /* special case for the first coef. no need to add a second vlc table */
+        UPDATE_CACHE(re, &s->gb);
+        v= SHOW_UBITS(re, &s->gb, 2);
+        if (v & 2) { /* upper peeked bit set: short form, lower bit is the sign */
+            LAST_SKIP_BITS(re, &s->gb, 2);
+            level= (3*qscale*quant_matrix[0])>>5;
+            if(v&1)
+                level= -level;
+            block[0] = level;
+            mismatch ^= level;
+            i++;
+        }
+
+        /* now dequantize & decode AC coefs */
+        for(;;) {
+            UPDATE_CACHE(re, &s->gb);
+            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            
+            if(level == 127){ /* level 127 ends the coefficient loop */
+                break;
+            } else if(level != 0) {
+                i += run;
+                j = scantable[i & 63]; /* i may exceed 63 on damaged input: mask so both table reads stay in range; the check below still rejects the block */
+                level= ((level*2+1)*qscale*quant_matrix[j])>>5;
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); /* (x ^ m) - m negates x when m is -1; assumes SHOW_SBITS(..., 1) yields 0 or -1 */
+                LAST_SKIP_BITS(re, &s->gb, 1);
+            } else {
+                /* escape: 6-bit run (+1) followed by a 12-bit signed level */
+                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
+                UPDATE_CACHE(re, &s->gb);
+                level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
+
+                i += run;
+                j = scantable[i & 63]; /* same in-range masking as in the non-escape branch */
+                if(level<0){
+                    level= ((-level*2+1)*qscale*quant_matrix[j])>>5;
+                    level= -level;
+                }else{
+                    level= ((level*2+1)*qscale*quant_matrix[j])>>5;
+                }
+            }
+            if (i > 63){
+                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+            
+            mismatch ^= level;
+            block[j] = level;
+        }
+        CLOSE_READER(re, &s->gb);
+    }
+    block[63] ^= (mismatch & 1);
+    
+    s->block_last_index[n] = i;
+    return 0;
+}
+
+static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s, 
+                               DCTELEM *block, 
+                               int n)
+{
+    int level, i, j, run;
+    RLTable *rl = &rl_mpeg1;
+    uint8_t * const scantable= s->intra_scantable.permutated;
+    const int qscale= s->qscale;
+    int v;
+    OPEN_READER(re, &s->gb);
+    i = -1;
+    /* Variant of the non-intra block decoder that scales levels by qscale only (no quant matrix), keeps no mismatch accumulator and never checks the coefficient index. Always returns 0. */
+    /* special case for the first coef. no need to add a second vlc table */
+    UPDATE_CACHE(re, &s->gb);
+    v= SHOW_UBITS(re, &s->gb, 2);
+    if (v & 2) { /* upper peeked bit set: short form, lower bit is the sign */
+        LAST_SKIP_BITS(re, &s->gb, 2);
+        level= (3*qscale)>>1;
+        if(v&1)
+            level= -level;
+        block[0] = level;
+        i++;
+    }
+
+    /* now dequantize & decode AC coefs */
+    for(;;) {
+        UPDATE_CACHE(re, &s->gb);
+        GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+        
+        if(level == 127){ /* level 127 ends the coefficient loop */
+            break;
+        } else if(level != 0) {
+            i += run;
+            j = scantable[i];
+            level= ((level*2+1)*qscale)>>1;
+            level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); /* (x ^ m) - m negates x when m is -1; assumes SHOW_SBITS(..., 1) yields 0 or -1 */
+            LAST_SKIP_BITS(re, &s->gb, 1);
+        } else {
+            /* escape: 6-bit run (+1) followed by a 12-bit signed level */
+            run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
+            UPDATE_CACHE(re, &s->gb);
+            level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
+
+            i += run;
+            j = scantable[i];
+            if(level<0){
+                level= ((-level*2+1)*qscale)>>1;
+                level= -level;
+            }else{
+                level= ((level*2+1)*qscale)>>1;
+            }
+        }
+        /* NOTE(review): unlike mpeg2_decode_block_non_intra() there is no "i > 63" check before this store -- presumably intentional for the fast path; confirm damaged input cannot reach it */
+        block[j] = level;
+    }
+    CLOSE_READER(re, &s->gb);
+    s->block_last_index[n] = i;
+    return 0;
+}
+
+
+static inline int mpeg2_decode_block_intra(MpegEncContext *s, 
+                               DCTELEM *block, 
+                               int n)
+{
+    int level, dc, diff, i, j, run;
+    int component;
+    RLTable *rl;
+    uint8_t * const scantable= s->intra_scantable.permutated;
+    const uint16_t *quant_matrix;
+    const int qscale= s->qscale;
+    int mismatch;
+    /* Decode intra block n into block[]: DC from decode_dc() plus the last_dc predictor, AC run/level codes scaled by intra_matrix (n < 4) or chroma_intra_matrix. The low bit of the XOR of all stored values is folded into block[63]. Returns 0, or -1 on an invalid DC code or a coefficient index above 63. */
+    /* DC coef */
+    if (n < 4){
+        quant_matrix = s->intra_matrix;
+        component = 0; 
+    }else{
+        quant_matrix = s->chroma_intra_matrix;
+        component = (n&1) + 1; /* even n -> 1, odd n -> 2 */
+    }
+    diff = decode_dc(&s->gb, component);
+    if (diff >= 0xffff) /* decode_dc() error sentinel */
+        return -1;
+    dc = s->last_dc[component];
+    dc += diff;
+    s->last_dc[component] = dc; /* remember the predictor for the next block of this component */
+    block[0] = dc << (3 - s->intra_dc_precision);
+    dprintf("dc=%d\n", block[0]);
+    mismatch = block[0] ^ 1;
+    i = 0;
+    if (s->intra_vlc_format) /* non-zero selects the rl_mpeg2 table instead of rl_mpeg1 */
+        rl = &rl_mpeg2;
+    else
+        rl = &rl_mpeg1;
+
+    {
+        OPEN_READER(re, &s->gb);    
+        /* now dequantize & decode AC coefs */
+        for(;;) {
+            UPDATE_CACHE(re, &s->gb);
+            GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+            
+            if(level == 127){ /* level 127 ends the coefficient loop */
+                break;
+            } else if(level != 0) {
+                i += run;
+                j = scantable[i & 63]; /* i may exceed 63 on damaged input: mask so both table reads stay in range; the check below still rejects the block */
+                level= (level*qscale*quant_matrix[j])>>4;
+                level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1); /* (x ^ m) - m negates x when m is -1; assumes SHOW_SBITS(..., 1) yields 0 or -1 */
+                LAST_SKIP_BITS(re, &s->gb, 1);
+            } else {
+                /* escape: 6-bit run (+1) followed by a 12-bit signed level */
+                run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
+                UPDATE_CACHE(re, &s->gb);
+                level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
+                i += run;
+                j = scantable[i & 63]; /* same in-range masking as in the non-escape branch */
+                if(level<0){
+                    level= (-level*qscale*quant_matrix[j])>>4;
+                    level= -level;
+                }else{
+                    level= (level*qscale*quant_matrix[j])>>4;
+                }
+            }
+            if (i > 63){
+                av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                return -1;
+            }
+            
+            mismatch^= level;
+            block[j] = level;
+        }
+        CLOSE_READER(re, &s->gb);
+    }
+    block[63]^= mismatch&1;
+    
+    s->block_last_index[n] = i;
+    return 0;
+}
+
 typedef struct Mpeg1Context {
     MpegEncContext mpeg_enc_ctx;
     int mpeg_enc_ctx_allocated; /* true if decoding context allocated */
@@ -1769,6 +1934,7 @@ typedef struct Mpeg1Context {
     int slice_count;
     int swap_uv;//indicate VCR2
     int save_aspect_info;
+    AVRational frame_rate_ext;       ///< MPEG-2 specific framerate modifier
 
 } Mpeg1Context;
 
@@ -1801,8 +1967,8 @@ static int mpeg_decode_init(AVCodecContext *avctx)
 
 static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm, 
                                      const uint8_t *new_perm){
-uint16_t temp_matrix[64];
-int i;
+    uint16_t temp_matrix[64];
+    int i;
 
     memcpy(temp_matrix,matrix,64*sizeof(uint16_t));
     
@@ -1814,16 +1980,15 @@ int i;
 //Call this function when we know all parameters
 //it may be called in different places for mpeg1 and mpeg2
 static int mpeg_decode_postinit(AVCodecContext *avctx){
-Mpeg1Context *s1 = avctx->priv_data;
-MpegEncContext *s = &s1->mpeg_enc_ctx;
-uint8_t old_permutation[64];
-
+    Mpeg1Context *s1 = avctx->priv_data;
+    MpegEncContext *s = &s1->mpeg_enc_ctx;
+    uint8_t old_permutation[64];
 
     if (
     	(s1->mpeg_enc_ctx_allocated == 0)|| 
-        avctx->width  != s->width ||
-        avctx->height != s->height||
-//      s1->save_aspect_info != avctx->aspect_ratio_info||
+        avctx->coded_width  != s->width ||
+        avctx->coded_height != s->height||
+        s1->save_aspect_info != s->aspect_ratio_info||
         0)
     {
     
@@ -1834,8 +1999,7 @@ uint8_t old_permutation[64];
 	if( (s->width == 0 )||(s->height == 0))
 	    return -2;
 
-        avctx->width = s->width;
-        avctx->height = s->height;
+        avcodec_set_dimensions(avctx, s->width, s->height);
         avctx->bit_rate = s->bit_rate;
         s1->save_aspect_info = s->aspect_ratio_info;
 
@@ -1856,8 +2020,8 @@ uint8_t old_permutation[64];
             av_reduce(
                 &s->avctx->frame_rate, 
                 &s->avctx->frame_rate_base, 
-                frame_rate_tab[s->frame_rate_index].num * (s->frame_rate_ext_n+1),
-                frame_rate_tab[s->frame_rate_index].den * (s->frame_rate_ext_d+1),
+                frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num,
+                frame_rate_tab[s->frame_rate_index].den * s1->frame_rate_ext.den,
                 1<<30);
         //mpeg2 aspect
             if(s->aspect_ratio_info > 1){
@@ -1956,6 +2120,8 @@ static int mpeg1_decode_picture(AVCodecContext *avctx,
 
     ref = get_bits(&s->gb, 10); /* temporal ref */
     s->pict_type = get_bits(&s->gb, 3);
+    if(s->pict_type == 0 || s->pict_type > 3)
+        return -1;
 
     vbv_delay= get_bits(&s->gb, 16);
     if (s->pict_type == P_TYPE || s->pict_type == B_TYPE) {
@@ -1977,8 +2143,8 @@ static int mpeg1_decode_picture(AVCodecContext *avctx,
     s->current_picture.pict_type= s->pict_type;
     s->current_picture.key_frame= s->pict_type == I_TYPE;
     
-//    if(avctx->debug & FF_DEBUG_PICT_INFO)
-//        av_log(avctx, AV_LOG_DEBUG, "vbv_delay %d, ref %d\n", vbv_delay, ref);
+    if(avctx->debug & FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_DEBUG, "vbv_delay %d, ref %d type:%d\n", vbv_delay, ref, s->pict_type);
     
     s->y_dc_scale = 8;
     s->c_dc_scale = 8;
@@ -1986,15 +2152,15 @@ static int mpeg1_decode_picture(AVCodecContext *avctx,
     return 0;
 }
 
-static void mpeg_decode_sequence_extension(MpegEncContext *s)
+static void mpeg_decode_sequence_extension(Mpeg1Context *s1)
 {
+    MpegEncContext *s= &s1->mpeg_enc_ctx;
     int horiz_size_ext, vert_size_ext;
     int bit_rate_ext;
-    int level, profile;
 
     skip_bits(&s->gb, 1); /* profil and level esc*/
-    profile= get_bits(&s->gb, 3);
-    level= get_bits(&s->gb, 4);
+    s->avctx->profile= get_bits(&s->gb, 3);
+    s->avctx->level= get_bits(&s->gb, 4);
     s->progressive_sequence = get_bits1(&s->gb); /* progressive_sequence */
     s->chroma_format = get_bits(&s->gb, 2); /* chroma_format 1=420, 2=422, 3=444 */
     horiz_size_ext = get_bits(&s->gb, 2);
@@ -2002,15 +2168,15 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
     s->width |= (horiz_size_ext << 12);
     s->height |= (vert_size_ext << 12);
     bit_rate_ext = get_bits(&s->gb, 12);  /* XXX: handle it */
-    s->bit_rate += (bit_rate_ext << 12) * 400;
+    s->bit_rate += (bit_rate_ext << 18) * 400;
     skip_bits1(&s->gb); /* marker */
     s->avctx->rc_buffer_size += get_bits(&s->gb, 8)*1024*16<<10;
 
     s->low_delay = get_bits1(&s->gb);
     if(s->flags & CODEC_FLAG_LOW_DELAY) s->low_delay=1;
 
-    s->frame_rate_ext_n = get_bits(&s->gb, 2);
-    s->frame_rate_ext_d = get_bits(&s->gb, 5);
+    s1->frame_rate_ext.num = get_bits(&s->gb, 2)+1;
+    s1->frame_rate_ext.den = get_bits(&s->gb, 5)+1;
 
     dprintf("sequence extension\n");
     s->codec_id= s->avctx->codec_id= CODEC_ID_MPEG2VIDEO;
@@ -2018,7 +2184,7 @@ static void mpeg_decode_sequence_extension(MpegEncContext *s)
 
     if(s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_DEBUG, "profile: %d, level: %d vbv buffer: %d, bitrate:%d\n", 
-               profile, level, s->avctx->rc_buffer_size, s->bit_rate);
+               s->avctx->profile, s->avctx->level, s->avctx->rc_buffer_size, s->bit_rate);
 
 }
 
@@ -2176,7 +2342,7 @@ static void mpeg_decode_extension(AVCodecContext *avctx,
     ext_type = get_bits(&s->gb, 4);
     switch(ext_type) {
     case 0x1:
-        mpeg_decode_sequence_extension(s);
+        mpeg_decode_sequence_extension(s1);
         break;
     case 0x2:
         mpeg_decode_sequence_display_extension(s1);
@@ -2194,9 +2360,7 @@ static void mpeg_decode_extension(AVCodecContext *avctx,
 }
 
 static void exchange_uv(MpegEncContext *s){
-short * tmp;
-
-    tmp = s->pblocks[4];
+    short * tmp = s->pblocks[4];
     s->pblocks[4] = s->pblocks[5];
     s->pblocks[5] = tmp;
 }
@@ -2266,6 +2430,7 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
     AVCodecContext *avctx= s->avctx;
     int ret;
     const int field_pic= s->picture_structure != PICT_FRAME;
+    const int lowres= s->avctx->lowres;
 
     s->resync_mb_x=
     s->resync_mb_y= -1;
@@ -2335,7 +2500,9 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
 #endif
 
 	s->dsp.clear_blocks(s->block[0]);
-
+        if(!s->chroma_y_shift){
+            s->dsp.clear_blocks(s->block[6]);
+        }
         ret = mpeg_decode_mb(s, s->block);
         s->chroma_qscale= s->qscale;
 
@@ -2373,15 +2540,16 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
             }
         }
 
-        s->dest[0] += 16;
-        s->dest[1] += 8;
-        s->dest[2] += 8;
+        s->dest[0] += 16 >> lowres;
+        s->dest[1] += 16 >> (s->chroma_x_shift + lowres);
+        s->dest[2] += 16 >> (s->chroma_x_shift + lowres);
 
         MPV_decode_mb(s, s->block);
         
         if (++s->mb_x >= s->mb_width) {
+            const int mb_size= 16>>s->avctx->lowres;
 
-            ff_draw_horiz_band(s, 16*s->mb_y, 16);
+            ff_draw_horiz_band(s, mb_size*s->mb_y, mb_size);
 
             s->mb_x = 0;
             s->mb_y++;
@@ -2557,7 +2725,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
 #ifdef DEBUG
         dprintf("intra matrix present\n");
         for(i=0;i<64;i++)
-            dprintf(" %d", s->intra_matrix[s->dsp.idct_permutation[i]);
+            dprintf(" %d", s->intra_matrix[s->dsp.idct_permutation[i]]);
         printf("\n");
 #endif
     } else {
@@ -2582,7 +2750,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
 #ifdef DEBUG
         dprintf("non intra matrix present\n");
         for(i=0;i<64;i++)
-            dprintf(" %d", s->inter_matrix[s->dsp.idct_permutation[i]);
+            dprintf(" %d", s->inter_matrix[s->dsp.idct_permutation[i]]);
         printf("\n");
 #endif
     } else {
@@ -2629,8 +2797,8 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     if (s1->mpeg_enc_ctx_allocated) {
         MPV_common_end(s);
     }
-    s->width = avctx->width;
-    s->height = avctx->height;
+    s->width  = avctx->coded_width;
+    s->height = avctx->coded_height;
     avctx->has_b_frames= 0; //true?
     s->low_delay= 1;
 
@@ -2786,12 +2954,14 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
     MpegEncContext *s2 = &s->mpeg_enc_ctx;
     dprintf("fill_buffer\n");
 
-    /* special case for last picture */
-    if (buf_size == 0 && s2->low_delay==0 && s2->next_picture_ptr) {
-        *picture= *(AVFrame*)s2->next_picture_ptr;
-        s2->next_picture_ptr= NULL;
+    if (buf_size == 0) {
+	/* special case for last picture */
+	if (s2->low_delay==0 && s2->next_picture_ptr) {
+	    *picture= *(AVFrame*)s2->next_picture_ptr;
+	    s2->next_picture_ptr= NULL;
 
-        *data_size = sizeof(AVFrame);
+	    *data_size = sizeof(AVFrame);
+	}
         return 0;
     }
 
@@ -2886,6 +3056,11 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
                         if(avctx->hurry_up>=5) break;
                         
                         if (!s->mpeg_enc_ctx_allocated) break;
+
+                        if(s2->codec_id == CODEC_ID_MPEG2VIDEO){
+                            if(mb_y < avctx->skip_top || mb_y >= s2->mb_height - avctx->skip_bottom)
+                                break;
+                        }
                         
                         if(s2->first_slice){
                             s2->first_slice=0;
@@ -2943,7 +3118,7 @@ AVCodec mpeg1video_decoder = {
     NULL,
     mpeg_decode_end,
     mpeg_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
     .flush= ff_mpeg_flush,
 };
 
@@ -2956,7 +3131,7 @@ AVCodec mpeg2video_decoder = {
     NULL,
     mpeg_decode_end,
     mpeg_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
     .flush= ff_mpeg_flush,
 };
 
@@ -2970,7 +3145,7 @@ AVCodec mpegvideo_decoder = {
     NULL,
     mpeg_decode_end,
     mpeg_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
     .flush= ff_mpeg_flush,
 };
 
@@ -2985,10 +3160,10 @@ AVCodec mpeg1video_encoder = {
     MPV_encode_picture,
     MPV_encode_end,
     .supported_framerates= frame_rate_tab+1,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
+    .capabilities= CODEC_CAP_DELAY,
 };
 
-#ifdef CONFIG_RISKY
-
 AVCodec mpeg2video_encoder = {
     "mpeg2video",
     CODEC_TYPE_VIDEO,
@@ -2998,14 +3173,17 @@ AVCodec mpeg2video_encoder = {
     MPV_encode_picture,
     MPV_encode_end,
     .supported_framerates= frame_rate_tab+1,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
+    .capabilities= CODEC_CAP_DELAY,
 };
 #endif
-#endif
 
 #ifdef HAVE_XVMC
 static int mpeg_mc_decode_init(AVCodecContext *avctx){
     Mpeg1Context *s;
 
+    if( avctx->thread_count > 1) 
+        return -1;
     if( !(avctx->slice_flags & SLICE_FLAG_CODED_ORDER) )
         return -1;
     if( !(avctx->slice_flags & SLICE_FLAG_ALLOW_FIELD) ){
@@ -3029,7 +3207,7 @@ AVCodec mpeg_xvmc_decoder = {
     NULL,
     mpeg_decode_end,
     mpeg_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED| CODEC_CAP_HWACCEL,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED| CODEC_CAP_HWACCEL | CODEC_CAP_DELAY,
     .flush= ff_mpeg_flush,
 };
 
diff --git a/src/libffmpeg/libavcodec/mpegaudio.h b/src/libffmpeg/libavcodec/mpegaudio.h
index e50e8bd6f..072c41bda 100644
--- a/src/libffmpeg/libavcodec/mpegaudio.h
+++ b/src/libffmpeg/libavcodec/mpegaudio.h
@@ -18,6 +18,10 @@
 #define MPA_DUAL    2
 #define MPA_MONO    3
 
+/* header + layer + bitrate + freq + lsf/mpeg25 */
+#define SAME_HEADER_MASK \
+   (0xffe00000 | (3 << 17) | (0xf << 12) | (3 << 10) | (3 << 19))
+
 int l2_select_table(int bitrate, int nb_channels, int freq, int lsf);
 int mpa_decode_header(AVCodecContext *avctx, uint32_t head);
 
@@ -29,3 +33,20 @@ extern const int sblimit_table[5];
 extern const int quant_steps[17];
 extern const int quant_bits[17];
 extern const int32_t mpa_enwindow[257];
+
+/* fast header check for resync: returns 0 when the 32-bit header passes every test below, -1 as soon as one fails */
+static inline int ff_mpa_check_header(uint32_t header){
+    /* header: the top 11 bits must all be set */
+    if ((header & 0xffe00000) != 0xffe00000)
+        return -1;
+    /* layer check: the 2-bit field at bits 17..18 must not be 0 */
+    if ((header & (3<<17)) == 0)
+        return -1;
+    /* bit rate: the 4-bit field at bits 12..15 must not be all ones */
+    if ((header & (0xf<<12)) == 0xf<<12)
+        return -1;
+    /* frequency: the 2-bit field at bits 10..11 must not be all ones */
+    if ((header & (3<<10)) == 3<<10)
+        return -1;
+    return 0;
+}
diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c
index f9cb389aa..196d77d2a 100644
--- a/src/libffmpeg/libavcodec/mpegaudiodec.c
+++ b/src/libffmpeg/libavcodec/mpegaudiodec.c
@@ -24,6 +24,7 @@
 
 //#define DEBUG
 #include "avcodec.h"
+#include "bitstream.h"
 #include "mpegaudio.h"
 #include "dsputil.h"
 
@@ -47,6 +48,18 @@
 #define WFRAC_BITS  14   /* fractional bits for window */
 #endif
 
+#if defined(USE_HIGHPRECISION) && defined(CONFIG_AUDIO_NONSHORT) /* both must be defined to select the 32-bit variant */
+typedef int32_t OUT_INT; /* 32-bit OUT_INT with matching limits below */
+#define OUT_MAX INT32_MAX
+#define OUT_MIN INT32_MIN
+#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 31)
+#else
+typedef int16_t OUT_INT; /* default: 16-bit OUT_INT with matching limits below */
+#define OUT_MAX INT16_MAX
+#define OUT_MIN INT16_MIN
+#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
+#endif
+
 #define FRAC_ONE    (1 << FRAC_BITS)
 
 #define MULL(a,b) (((int64_t)(a) * (int64_t)(b)) >> FRAC_BITS)
@@ -56,6 +69,12 @@
 #define FIXR(a)   ((int)((a) * FRAC_ONE + 0.5))
 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS)
 
+#define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5))
+//#define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) //gcc 3.4 creates an incredibly bloated mess out of this
+static always_inline int MULH(int a, int b){ /* widens both operands to 64 bits, multiplies, and returns the product shifted right by 32 */
+    return ((int64_t)(a) * (int64_t)(b))>>32;
+}
+
 #if FRAC_BITS <= 15
 typedef int16_t MPA_INT;
 #else
@@ -97,8 +116,19 @@ typedef struct MPADecodeContext {
     int frame_count;
 #endif
     void (*compute_antialias)(struct MPADecodeContext *s, struct GranuleDef *g);
+    int adu_mode; ///< 0 for standard mp3, 1 for adu formatted mp3
+    unsigned int dither_state;
 } MPADecodeContext;
 
+/**
+ * Context for MP3On4 decoder: groups several MPADecodeContext instances.
+ */
+typedef struct MP3On4DecodeContext {
+    int frames;   ///< number of mp3 frames per block (number of mp3 decoder instances)
+    int chan_cfg; ///< channel config number
+    MPADecodeContext *mp3decctx[5]; ///< one MPADecodeContext pointer per decoder instance (room for 5)
+} MP3On4DecodeContext;
+
 /* layer 3 "granule" */
 typedef struct GranuleDef {
     uint8_t scfsi;
@@ -141,13 +171,9 @@ static VLC huff_quad_vlc[2];
 /* computed from band_size_long */
 static uint16_t band_index_long[9][23];
 /* XXX: free when all decoders are closed */
-#define TABLE_4_3_SIZE (8191 + 16)
+#define TABLE_4_3_SIZE ((8191 + 16)*4) /* 4 entries per value, indexed as 4*value + (exponent&3) in l3_unscale(); fully parenthesized so it expands safely inside larger expressions */
 static int8_t  *table_4_3_exp;
-#if FRAC_BITS <= 15
-static uint16_t *table_4_3_value;
-#else
 static uint32_t *table_4_3_value;
-#endif
 /* intensity stereo coef table */
 static int32_t is_table[2][16];
 static int32_t is_table_lsf[2][2][16];
@@ -170,14 +196,7 @@ static int32_t scale_factor_mult2[3][3] = {
     SCALE_GEN(4.0 / 9.0), /* 9 steps */
 };
 
-/* 2^(n/4) */
-static uint32_t scale_factor_mult3[4] = {
-    FIXR(1.0),
-    FIXR(1.18920711500272106671),
-    FIXR(1.41421356237309504880),
-    FIXR(1.68179283050742908605),
-};
-
+void ff_mpa_synth_init(MPA_INT *window);
 static MPA_INT window[512] __attribute__((aligned(16)));
     
 /* layer 1 unscaling */
@@ -214,30 +233,18 @@ static inline int l2_unscale_group(int steps, int mant, int scale_factor)
 /* compute value^(4/3) * 2^(exponent/4). It normalized to FRAC_BITS */
 static inline int l3_unscale(int value, int exponent)
 {
-#if FRAC_BITS <= 15    
     unsigned int m;
-#else
-    uint64_t m;
-#endif
     int e;
 
-    e = table_4_3_exp[value];
-    e += (exponent >> 2);
-    e = FRAC_BITS - e;
-#if FRAC_BITS <= 15    
+    e = table_4_3_exp  [4*value + (exponent&3)];
+    m = table_4_3_value[4*value + (exponent&3)];
+    e -= (exponent >> 2);
+    assert(e>=1);
     if (e > 31)
-        e = 31;
-#endif
-    m = table_4_3_value[value];
-#if FRAC_BITS <= 15    
-    m = (m * scale_factor_mult3[exponent & 3]);
+        return 0;
     m = (m + (1 << (e-1))) >> e;
+
     return m;
-#else
-    m = MUL64(m, scale_factor_mult3[exponent & 3]);
-    m = (m + (uint64_t_C(1) << (e-1))) >> e;
-    return m;
-#endif
 }
 
 /* all integer n^(4/3) computation code */
@@ -250,11 +257,13 @@ static inline int l3_unscale(int value, int exponent)
 
 static int dev_4_3_coefs[DEV_ORDER];
 
+#if 0 /* unused */
 static int pow_mult3[3] = {
     POW_FIX(1.0),
     POW_FIX(1.25992104989487316476),
     POW_FIX(1.58740105196819947474),
 };
+#endif
 
 static void int_pow_init(void)
 {
@@ -267,6 +276,7 @@ static void int_pow_init(void)
     }
 }
 
+#if 0 /* unused, remove? */
 /* return the mantissa and the binary exponent */
 static int int_pow(int i, int *exp_ptr)
 {
@@ -311,6 +321,7 @@ static int int_pow(int i, int *exp_ptr)
     *exp_ptr = eq;
     return a;
 }
+#endif
 
 static int decode_init(AVCodecContext * avctx)
 {
@@ -318,7 +329,13 @@ static int decode_init(AVCodecContext * avctx)
     static int init=0;
     int i, j, k;
 
-    if(avctx->antialias_algo == FF_AA_INT)
+#if defined(USE_HIGHPRECISION) && defined(CONFIG_AUDIO_NONSHORT)
+    avctx->sample_fmt= SAMPLE_FMT_S32;
+#else
+    avctx->sample_fmt= SAMPLE_FMT_S16;
+#endif    
+    
+    if(avctx->antialias_algo != FF_AA_FLOAT)
         s->compute_antialias= compute_antialias_integer;
     else
         s->compute_antialias= compute_antialias_float;
@@ -348,20 +365,7 @@ static int decode_init(AVCodecContext * avctx)
                     scale_factor_mult[i][2]);
         }
         
-        /* window */
-        /* max = 18760, max sum over all 16 coefs : 44736 */
-        for(i=0;i<257;i++) {
-            int v;
-            v = mpa_enwindow[i];
-#if WFRAC_BITS < 16
-            v = (v + (1 << (16 - WFRAC_BITS - 1))) >> (16 - WFRAC_BITS);
-#endif
-            window[i] = v;
-            if ((i & 63) != 0)
-                v = -v;
-            if (i != 0)
-                window[512 - i] = v;
-        }
+	ff_mpa_synth_init(window);
         
         /* huffman decode tables */
         huff_code_table[0] = NULL;
@@ -375,7 +379,7 @@ static int decode_init(AVCodecContext * avctx)
             n = xsize * xsize;
             /* XXX: fail test */
             init_vlc(&huff_vlc[i], 8, n, 
-                     h->bits, 1, 1, h->codes, 2, 2);
+                     h->bits, 1, 1, h->codes, 2, 2, 1);
             
             code_table = av_mallocz(n);
             j = 0;
@@ -387,7 +391,7 @@ static int decode_init(AVCodecContext * avctx)
         }
         for(i=0;i<2;i++) {
             init_vlc(&huff_quad_vlc[i], i == 0 ? 7 : 4, 16, 
-                     mpa_quad_bits[i], 1, 1, mpa_quad_codes[i], 1, 1);
+                     mpa_quad_bits[i], 1, 1, mpa_quad_codes[i], 1, 1, 1);
         }
 
         for(i=0;i<9;i++) {
@@ -409,32 +413,17 @@ static int decode_init(AVCodecContext * avctx)
         
         int_pow_init();
         for(i=1;i<TABLE_4_3_SIZE;i++) {
+            double f, fm;
             int e, m;
-            m = int_pow(i, &e);
-#if 0
-            /* test code */
-            {
-                double f, fm;
-                int e1, m1;
-                f = pow((double)i, 4.0 / 3.0);
-                fm = frexp(f, &e1);
-                m1 = FIXR(2 * fm);
-#if FRAC_BITS <= 15
-                if ((unsigned short)m1 != m1) {
-                    m1 = m1 >> 1;
-                    e1++;
-                }
-#endif
-                e1--;
-                if (m != m1 || e != e1) {
-                    printf("%4d: m=%x m1=%x e=%d e1=%d\n",
-                           i, m, m1, e, e1);
-                }
-            }
-#endif
+            f = pow((double)(i/4), 4.0 / 3.0) * pow(2, (i&3)*0.25);
+            fm = frexp(f, &e);
+            m = (uint32_t)(fm*(1LL<<31) + 0.5);
+            e+= FRAC_BITS - 31 + 5;
+
             /* normalized to FRAC_BITS */
             table_4_3_value[i] = m;
-            table_4_3_exp[i] = e;
+//            av_log(NULL, AV_LOG_DEBUG, "%d %d %f\n", i, m, pow((double)i, 4.0 / 3.0));
+            table_4_3_exp[i] = -e;
         }
         
         for(i=0;i<7;i++) {
@@ -473,38 +462,47 @@ static int decode_init(AVCodecContext * avctx)
             ci = ci_table[i];
             cs = 1.0 / sqrt(1.0 + ci * ci);
             ca = cs * ci;
-            csa_table[i][0] = FIX(cs);
-            csa_table[i][1] = FIX(ca);
-            csa_table[i][2] = FIX(ca) + FIX(cs);
-            csa_table[i][3] = FIX(ca) - FIX(cs); 
+            csa_table[i][0] = FIXHR(cs/4);
+            csa_table[i][1] = FIXHR(ca/4);
+            csa_table[i][2] = FIXHR(ca/4) + FIXHR(cs/4);
+            csa_table[i][3] = FIXHR(ca/4) - FIXHR(cs/4); 
             csa_table_float[i][0] = cs;
             csa_table_float[i][1] = ca;
             csa_table_float[i][2] = ca + cs;
             csa_table_float[i][3] = ca - cs; 
 //            printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca));
+//            av_log(NULL, AV_LOG_DEBUG,"%f %f %f %f\n", cs, ca, ca+cs, ca-cs);
         }
 
         /* compute mdct windows */
         for(i=0;i<36;i++) {
-            int v;
-            v = FIXR(sin(M_PI * (i + 0.5) / 36.0));
-            mdct_win[0][i] = v;
-            mdct_win[1][i] = v;
-            mdct_win[3][i] = v;
-        }
-        for(i=0;i<6;i++) {
-            mdct_win[1][18 + i] = FIXR(1.0);
-            mdct_win[1][24 + i] = FIXR(sin(M_PI * ((i + 6) + 0.5) / 12.0));
-            mdct_win[1][30 + i] = FIXR(0.0);
-
-            mdct_win[3][i] = FIXR(0.0);
-            mdct_win[3][6 + i] = FIXR(sin(M_PI * (i + 0.5) / 12.0));
-            mdct_win[3][12 + i] = FIXR(1.0);
+            for(j=0; j<4; j++){
+                double d;
+                
+                if(j==2 && i%3 != 1)
+                    continue;
+                
+                d= sin(M_PI * (i + 0.5) / 36.0);
+                if(j==1){
+                    if     (i>=30) d= 0;
+                    else if(i>=24) d= sin(M_PI * (i - 18 + 0.5) / 12.0);
+                    else if(i>=18) d= 1;
+                }else if(j==3){
+                    if     (i<  6) d= 0;
+                    else if(i< 12) d= sin(M_PI * (i -  6 + 0.5) / 12.0);
+                    else if(i< 18) d= 1;
+                }
+                //merge last stage of imdct into the window coefficients
+                d*= 0.5 / cos(M_PI*(2*i + 19)/72);
+
+                if(j==2)
+                    mdct_win[j][i/3] = FIXHR((d / (1<<5)));
+                else
+                    mdct_win[j][i  ] = FIXHR((d / (1<<5)));
+//                av_log(NULL, AV_LOG_DEBUG, "%2d %d %f\n", i,j,d / (1<<5));
+            }
         }
 
-        for(i=0;i<12;i++)
-            mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0));
-        
         /* NOTE: we do frequency inversion adter the MDCT by changing
            the sign of the right window coefs */
         for(j=0;j<4;j++) {
@@ -531,6 +529,8 @@ static int decode_init(AVCodecContext * avctx)
 #ifdef DEBUG
     s->frame_count = 0;
 #endif
+    if (avctx->codec_id == CODEC_ID_MP3ADU)
+        s->adu_mode = 1;
     return 0;
 }
 
@@ -753,18 +753,17 @@ static void dct32(int32_t *out, int32_t *tab)
     out[31] = tab[31];
 }
 
-#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
-
 #if FRAC_BITS <= 15
 
-static inline int round_sample(int sum)
+static inline int round_sample(int *sum)
 {
     int sum1;
-    sum1 = (sum + (1 << (OUT_SHIFT - 1))) >> OUT_SHIFT;
-    if (sum1 < -32768)
-        sum1 = -32768;
-    else if (sum1 > 32767)
-        sum1 = 32767;
+    sum1 = (*sum) >> OUT_SHIFT;
+    *sum &= (1<<OUT_SHIFT)-1;
+    if (sum1 < OUT_MIN)
+        sum1 = OUT_MIN;
+    else if (sum1 > OUT_MAX)
+        sum1 = OUT_MAX;
     return sum1;
 }
 
@@ -790,14 +789,15 @@ static inline int round_sample(int sum)
 
 #else
 
-static inline int round_sample(int64_t sum) 
+static inline int round_sample(int64_t *sum) 
 {
     int sum1;
-    sum1 = (int)((sum + (int64_t_C(1) << (OUT_SHIFT - 1))) >> OUT_SHIFT);
-    if (sum1 < -32768)
-        sum1 = -32768;
-    else if (sum1 > 32767)
-        sum1 = 32767;
+    sum1 = (int)((*sum) >> OUT_SHIFT);
+    *sum &= (1<<OUT_SHIFT)-1;
+    if (sum1 < OUT_MIN)
+        sum1 = OUT_MIN;
+    else if (sum1 > OUT_MAX)
+        sum1 = OUT_MAX;
     return sum1;
 }
 
@@ -846,29 +846,48 @@ static inline int round_sample(int64_t sum)
     sum2 op2 MULS((w2)[7 * 64], tmp);\
 }
 
+/* Build the 512-entry synthesis window from mpa_enwindow[0..256],
+   rounding to WFRAC_BITS when that is less than 16. */
+void ff_mpa_synth_init(MPA_INT *window)
+{
+    int i;
+
+    /* max = 18760, max sum over all 16 coefs : 44736 */
+    for(i=0;i<257;i++) {
+        int v = mpa_enwindow[i];
+#if WFRAC_BITS < 16
+        v = (v + (1 << (16 - WFRAC_BITS - 1))) >> (16 - WFRAC_BITS);
+#endif
+        window[i] = v;
+        /* mirror into the upper half; negated unless i is a multiple of 64 */
+        if (i != 0)
+            window[512 - i] = (i & 63) ? -v : v;
+    }
+}
 
 /* 32 sub band synthesis filter. Input: 32 sub band samples, Output:
    32 samples. */
 /* XXX: optimize by avoiding ring buffer usage */
-static void synth_filter(MPADecodeContext *s1,
-                         int ch, int16_t *samples, int incr, 
+void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
+			 MPA_INT *window, int *dither_state,
+                         OUT_INT *samples, int incr, 
                          int32_t sb_samples[SBLIMIT])
 {
     int32_t tmp[32];
     register MPA_INT *synth_buf;
-    const register MPA_INT *w, *w2, *p;
+    register const MPA_INT *w, *w2, *p;
     int j, offset, v;
-    int16_t *samples2;
+    OUT_INT *samples2;
 #if FRAC_BITS <= 15
     int sum, sum2;
 #else
     int64_t sum, sum2;
 #endif
-    
+
     dct32(tmp, sb_samples);
     
-    offset = s1->synth_buf_offset[ch];
-    synth_buf = s1->synth_buf[ch] + offset;
+    offset = *synth_buf_offset;
+    synth_buf = synth_buf_ptr + offset;
 
     for(j=0;j<32;j++) {
         v = tmp[j];
@@ -889,148 +908,116 @@ static void synth_filter(MPADecodeContext *s1,
     w = window;
     w2 = window + 31;
 
-    sum = 0;
+    sum = *dither_state;
     p = synth_buf + 16;
     SUM8(sum, +=, w, p);
     p = synth_buf + 48;
     SUM8(sum, -=, w + 32, p);
-    *samples = round_sample(sum);
+    *samples = round_sample(&sum);
     samples += incr;
     w++;
 
     /* we calculate two samples at the same time to avoid one memory
        access per two sample */
     for(j=1;j<16;j++) {
-        sum = 0;
         sum2 = 0;
         p = synth_buf + 16 + j;
         SUM8P2(sum, +=, sum2, -=, w, w2, p);
         p = synth_buf + 48 - j;
         SUM8P2(sum, -=, sum2, -=, w + 32, w2 + 32, p);
 
-        *samples = round_sample(sum);
+        *samples = round_sample(&sum);
         samples += incr;
-        *samples2 = round_sample(sum2);
+        sum += sum2;
+        *samples2 = round_sample(&sum);
         samples2 -= incr;
         w++;
         w2--;
     }
     
     p = synth_buf + 32;
-    sum = 0;
     SUM8(sum, -=, w + 32, p);
-    *samples = round_sample(sum);
+    *samples = round_sample(&sum);
+    *dither_state= sum;
 
     offset = (offset - 32) & 511;
-    s1->synth_buf_offset[ch] = offset;
+    *synth_buf_offset = offset;
 }
 
-/* cos(pi*i/24) */
-#define C1  FIXR(0.99144486137381041114)
-#define C3  FIXR(0.92387953251128675612)
-#define C5  FIXR(0.79335334029123516458)
-#define C7  FIXR(0.60876142900872063941)
-#define C9  FIXR(0.38268343236508977173)
-#define C11 FIXR(0.13052619222005159154)
-
-/* 12 points IMDCT. We compute it "by hand" by factorizing obvious
-   cases. */
-static void imdct12(int *out, int *in)
-{
-    int tmp;
-    int64_t in1_3, in1_9, in4_3, in4_9;
-
-    in1_3 = MUL64(in[1], C3);
-    in1_9 = MUL64(in[1], C9);
-    in4_3 = MUL64(in[4], C3);
-    in4_9 = MUL64(in[4], C9);
-    
-    tmp = FRAC_RND(MUL64(in[0], C7) - in1_3 - MUL64(in[2], C11) + 
-                   MUL64(in[3], C1) - in4_9 - MUL64(in[5], C5));
-    out[0] = tmp;
-    out[5] = -tmp;
-    tmp = FRAC_RND(MUL64(in[0] - in[3], C9) - in1_3 + 
-                   MUL64(in[2] + in[5], C3) - in4_9);
-    out[1] = tmp;
-    out[4] = -tmp;
-    tmp = FRAC_RND(MUL64(in[0], C11) - in1_9 + MUL64(in[2], C7) -
-                   MUL64(in[3], C5) + in4_3 - MUL64(in[5], C1));
-    out[2] = tmp;
-    out[3] = -tmp;
-    tmp = FRAC_RND(MUL64(-in[0], C5) + in1_9 + MUL64(in[2], C1) + 
-                   MUL64(in[3], C11) - in4_3 - MUL64(in[5], C7));
-    out[6] = tmp;
-    out[11] = tmp;
-    tmp = FRAC_RND(MUL64(-in[0] + in[3], C3) - in1_9 + 
-                   MUL64(in[2] + in[5], C9) + in4_3);
-    out[7] = tmp;
-    out[10] = tmp;
-    tmp = FRAC_RND(-MUL64(in[0], C1) - in1_3 - MUL64(in[2], C5) -
-                   MUL64(in[3], C7) - in4_9 - MUL64(in[5], C11));
-    out[8] = tmp;
-    out[9] = tmp;
-}
-
-#undef C1
-#undef C3
-#undef C5
-#undef C7
-#undef C9
-#undef C11
-
-/* cos(pi*i/18) */
-#define C1 FIXR(0.98480775301220805936)
-#define C2 FIXR(0.93969262078590838405)
-#define C3 FIXR(0.86602540378443864676)
-#define C4 FIXR(0.76604444311897803520)
-#define C5 FIXR(0.64278760968653932632)
-#define C6 FIXR(0.5)
-#define C7 FIXR(0.34202014332566873304)
-#define C8 FIXR(0.17364817766693034885)
+#define C3 FIXHR(0.86602540378443864676/2)
 
 /* 0.5 / cos(pi*(2*i+1)/36) */
 static const int icos36[9] = {
     FIXR(0.50190991877167369479),
-    FIXR(0.51763809020504152469),
+    FIXR(0.51763809020504152469), //0
     FIXR(0.55168895948124587824),
     FIXR(0.61038729438072803416),
-    FIXR(0.70710678118654752439),
+    FIXR(0.70710678118654752439), //1
     FIXR(0.87172339781054900991),
     FIXR(1.18310079157624925896),
-    FIXR(1.93185165257813657349),
+    FIXR(1.93185165257813657349), //2
     FIXR(5.73685662283492756461),
 };
 
-static const int icos72[18] = {
-    /* 0.5 / cos(pi*(2*i+19)/72) */
-    FIXR(0.74009361646113053152),
-    FIXR(0.82133981585229078570),
-    FIXR(0.93057949835178895673),
-    FIXR(1.08284028510010010928),
-    FIXR(1.30656296487637652785),
-    FIXR(1.66275476171152078719),
-    FIXR(2.31011315767264929558),
-    FIXR(3.83064878777019433457),
-    FIXR(11.46279281302667383546),
-
-    /* 0.5 / cos(pi*(2*(i + 18) +19)/72) */
-    FIXR(-0.67817085245462840086),
-    FIXR(-0.63023620700513223342),
-    FIXR(-0.59284452371708034528),
-    FIXR(-0.56369097343317117734),
-    FIXR(-0.54119610014619698439),
-    FIXR(-0.52426456257040533932),
-    FIXR(-0.51213975715725461845),
-    FIXR(-0.50431448029007636036),
-    FIXR(-0.50047634258165998492),
-};
+/* 12 points IMDCT, computed "by hand" by factorizing obvious cases.
+   Reads six inputs in[0], in[3], ..., in[15]; writes out[0..11]. */
+static void imdct12(int *out, int *in)
+{
+    int in0, in1, in2, in3, in4, in5, t1, t2;
+    /* inputs are read with a stride of 3; in1..in5 begin as neighbour sums */
+    in0= in[0*3];
+    in1= in[1*3] + in[0*3];
+    in2= in[2*3] + in[1*3];
+    in3= in[3*3] + in[2*3];
+    in4= in[4*3] + in[3*3];
+    in5= in[5*3] + in[4*3];
+    in5 += in3;
+    in3 += in1;
+
+    in2= MULH(2*in2, C3); /* C3 is defined with a /2; presumably why the operand is doubled */
+    in3= MULH(2*in3, C3);
+    /* every result below is stored to two output slots at once */
+    t1 = in0 - in4;
+    t2 = MULL(in1 - in5, icos36[4]);
+
+    out[ 7]= 
+    out[10]= t1 + t2;
+    out[ 1]=
+    out[ 4]= t1 - t2;
+
+    in0 += in4>>1;
+    in4 = in0 + in2;
+    in1 += in5>>1;
+    in5 = MULL(in1 + in3, icos36[1]);    
+    out[ 8]= 
+    out[ 9]= in4 + in5;
+    out[ 2]=
+    out[ 3]= in4 - in5;
+    
+    in0 -= in2;
+    in1 = MULL(in1 - in3, icos36[7]);
+    out[ 0]=
+    out[ 5]= in0 - in1;
+    out[ 6]=
+    out[11]= in0 + in1;    
+}
+
+/* cos(pi*i/18) / 2 (each constant below is written pre-halved) */
+#define C1 FIXHR(0.98480775301220805936/2)
+#define C2 FIXHR(0.93969262078590838405/2)
+#define C3 FIXHR(0.86602540378443864676/2)
+#define C4 FIXHR(0.76604444311897803520/2)
+#define C5 FIXHR(0.64278760968653932632/2)
+#define C6 FIXHR(0.5/2)
+#define C7 FIXHR(0.34202014332566873304/2)
+#define C8 FIXHR(0.17364817766693034885/2)
+
 
 /* using Lee like decomposition followed by hand coded 9 points DCT */
-static void imdct36(int *out, int *in)
+static void imdct36(int *out, int *buf, int *in, int *win)
 {
     int i, j, t0, t1, t2, t3, s0, s1, s2, s3;
     int tmp[18], *tmp1, *in1;
-    int64_t in3_3, in6_6;
 
     for(i=17;i>=1;i--)
         in[i] += in[i-1];
@@ -1040,30 +1027,61 @@ static void imdct36(int *out, int *in)
     for(j=0;j<2;j++) {
         tmp1 = tmp + j;
         in1 = in + j;
+#if 0
+//more accurate but slower
+        int64_t t0, t1, t2, t3;
+        t2 = in1[2*4] + in1[2*8] - in1[2*2];
+        
+        t3 = (in1[2*0] + (int64_t)(in1[2*6]>>1))<<32;
+        t1 = in1[2*0] - in1[2*6];
+        tmp1[ 6] = t1 - (t2>>1);
+        tmp1[16] = t1 + t2;
+
+        t0 = MUL64(2*(in1[2*2] + in1[2*4]),    C2);
+        t1 = MUL64(   in1[2*4] - in1[2*8] , -2*C8);
+        t2 = MUL64(2*(in1[2*2] + in1[2*8]),   -C4);
+        
+        tmp1[10] = (t3 - t0 - t2) >> 32;
+        tmp1[ 2] = (t3 + t0 + t1) >> 32;
+        tmp1[14] = (t3 + t2 - t1) >> 32;
+        
+        tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3);
+        t2 = MUL64(2*(in1[2*1] + in1[2*5]),    C1);
+        t3 = MUL64(   in1[2*5] - in1[2*7] , -2*C7);
+        t0 = MUL64(2*in1[2*3], C3);
+
+        t1 = MUL64(2*(in1[2*1] + in1[2*7]),   -C5);
 
-        in3_3 = MUL64(in1[2*3], C3);
-        in6_6 = MUL64(in1[2*6], C6);
-
-        tmp1[0] = FRAC_RND(MUL64(in1[2*1], C1) + in3_3 + 
-                           MUL64(in1[2*5], C5) + MUL64(in1[2*7], C7));
-        tmp1[2] = in1[2*0] + FRAC_RND(MUL64(in1[2*2], C2) + 
-                                      MUL64(in1[2*4], C4) + in6_6 + 
-                                      MUL64(in1[2*8], C8));
-        tmp1[4] = FRAC_RND(MUL64(in1[2*1] - in1[2*5] - in1[2*7], C3));
-        tmp1[6] = FRAC_RND(MUL64(in1[2*2] - in1[2*4] - in1[2*8], C6)) - 
-            in1[2*6] + in1[2*0];
-        tmp1[8] = FRAC_RND(MUL64(in1[2*1], C5) - in3_3 - 
-                           MUL64(in1[2*5], C7) + MUL64(in1[2*7], C1));
-        tmp1[10] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C8) - 
-                                       MUL64(in1[2*4], C2) + in6_6 + 
-                                       MUL64(in1[2*8], C4));
-        tmp1[12] = FRAC_RND(MUL64(in1[2*1], C7) - in3_3 + 
-                            MUL64(in1[2*5], C1) - 
-                            MUL64(in1[2*7], C5));
-        tmp1[14] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C4) + 
-                                       MUL64(in1[2*4], C8) + in6_6 - 
-                                       MUL64(in1[2*8], C2));
-        tmp1[16] = in1[2*0] - in1[2*2] + in1[2*4] - in1[2*6] + in1[2*8];
+        tmp1[ 0] = (t2 + t3 + t0) >> 32;
+        tmp1[12] = (t2 + t1 - t0) >> 32;
+        tmp1[ 8] = (t3 - t1 - t0) >> 32;
+#else
+        t2 = in1[2*4] + in1[2*8] - in1[2*2];
+        
+        t3 = in1[2*0] + (in1[2*6]>>1);
+        t1 = in1[2*0] - in1[2*6];
+        tmp1[ 6] = t1 - (t2>>1);
+        tmp1[16] = t1 + t2;
+
+        t0 = MULH(2*(in1[2*2] + in1[2*4]),    C2);
+        t1 = MULH(   in1[2*4] - in1[2*8] , -2*C8);
+        t2 = MULH(2*(in1[2*2] + in1[2*8]),   -C4);
+        
+        tmp1[10] = t3 - t0 - t2;
+        tmp1[ 2] = t3 + t0 + t1;
+        tmp1[14] = t3 + t2 - t1;
+        
+        tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3);
+        t2 = MULH(2*(in1[2*1] + in1[2*5]),    C1);
+        t3 = MULH(   in1[2*5] - in1[2*7] , -2*C7);
+        t0 = MULH(2*in1[2*3], C3);
+
+        t1 = MULH(2*(in1[2*1] + in1[2*7]),   -C5);
+
+        tmp1[ 0] = t2 + t3 + t0;
+        tmp1[12] = t2 + t1 - t0;
+        tmp1[ 8] = t3 - t1 - t0;
+#endif
     }
 
     i = 0;
@@ -1078,54 +1096,32 @@ static void imdct36(int *out, int *in)
         s1 = MULL(t3 + t2, icos36[j]);
         s3 = MULL(t3 - t2, icos36[8 - j]);
         
-        t0 = MULL(s0 + s1, icos72[9 + 8 - j]);
-        t1 = MULL(s0 - s1, icos72[8 - j]);
-        out[18 + 9 + j] = t0;
-        out[18 + 8 - j] = t0;
-        out[9 + j] = -t1;
-        out[8 - j] = t1;
+        t0 = s0 + s1;
+        t1 = s0 - s1;
+        out[(9 + j)*SBLIMIT] =  MULH(t1, win[9 + j]) + buf[9 + j];
+        out[(8 - j)*SBLIMIT] =  MULH(t1, win[8 - j]) + buf[8 - j];
+        buf[9 + j] = MULH(t0, win[18 + 9 + j]);
+        buf[8 - j] = MULH(t0, win[18 + 8 - j]);
         
-        t0 = MULL(s2 + s3, icos72[9+j]);
-        t1 = MULL(s2 - s3, icos72[j]);
-        out[18 + 9 + (8 - j)] = t0;
-        out[18 + j] = t0;
-        out[9 + (8 - j)] = -t1;
-        out[j] = t1;
+        t0 = s2 + s3;
+        t1 = s2 - s3;
+        out[(9 + 8 - j)*SBLIMIT] =  MULH(t1, win[9 + 8 - j]) + buf[9 + 8 - j];
+        out[(        j)*SBLIMIT] =  MULH(t1, win[        j]) + buf[        j];
+        buf[9 + 8 - j] = MULH(t0, win[18 + 9 + 8 - j]);
+        buf[      + j] = MULH(t0, win[18         + j]);
         i += 4;
     }
 
     s0 = tmp[16];
     s1 = MULL(tmp[17], icos36[4]);
-    t0 = MULL(s0 + s1, icos72[9 + 4]);
-    t1 = MULL(s0 - s1, icos72[4]);
-    out[18 + 9 + 4] = t0;
-    out[18 + 8 - 4] = t0;
-    out[9 + 4] = -t1;
-    out[8 - 4] = t1;
-}
-
-/* fast header check for resync */
-static int check_header(uint32_t header)
-{
-    /* header */
-    if ((header & 0xffe00000) != 0xffe00000)
-	return -1;
-    /* layer check */
-    if (((header >> 17) & 3) == 0)
-	return -1;
-    /* bit rate */
-    if (((header >> 12) & 0xf) == 0xf)
-	return -1;
-    /* frequency */
-    if (((header >> 10) & 3) == 3)
-	return -1;
-    return 0;
+    t0 = s0 + s1;
+    t1 = s0 - s1;
+    out[(9 + 4)*SBLIMIT] =  MULH(t1, win[9 + 4]) + buf[9 + 4];
+    out[(8 - 4)*SBLIMIT] =  MULH(t1, win[8 - 4]) + buf[8 - 4];
+    buf[9 + 4] = MULH(t0, win[18 + 9 + 4]);
+    buf[8 - 4] = MULH(t0, win[18 + 8 - 4]);
 }
 
-/* header + layer + bitrate + freq + lsf/mpeg25 */
-#define SAME_HEADER_MASK \
-   (0xffe00000 | (3 << 17) | (0xf << 12) | (3 << 10) | (3 << 19))
-
 /* header decoding. MUST check the header before because no
    consistency check is done there. Return 1 if free format found and
    that the frame size must be computed externally */
@@ -1233,7 +1229,7 @@ int mpa_decode_header(AVCodecContext *avctx, uint32_t head)
     MPADecodeContext s1, *s = &s1;
     memset( s, 0, sizeof(MPADecodeContext) );
 
-    if (check_header(head) != 0)
+    if (ff_mpa_check_header(head) != 0)
         return -1;
 
     if (decode_header(s, head) != 0) {
@@ -1920,8 +1916,8 @@ static void compute_stereo(MPADecodeContext *s,
 static void compute_antialias_integer(MPADecodeContext *s,
                               GranuleDef *g)
 {
-    int32_t *ptr, *p0, *p1, *csa;
-    int n, i, j;
+    int32_t *ptr, *csa;
+    int n, i;
 
     /* we antialias only "long" bands */
     if (g->block_type == 2) {
@@ -1935,35 +1931,24 @@ static void compute_antialias_integer(MPADecodeContext *s,
     
     ptr = g->sb_hybrid + 18;
     for(i = n;i > 0;i--) {
-        p0 = ptr - 1;
-        p1 = ptr;
-        csa = &csa_table[0][0];       
-        for(j=0;j<4;j++) {
-            int tmp0 = *p0;
-            int tmp1 = *p1;
-#if 0
-            *p0 = FRAC_RND(MUL64(tmp0, csa[0]) - MUL64(tmp1, csa[1]));
-            *p1 = FRAC_RND(MUL64(tmp0, csa[1]) + MUL64(tmp1, csa[0]));
-#else
-            int64_t tmp2= MUL64(tmp0 + tmp1, csa[0]);
-            *p0 = FRAC_RND(tmp2 - MUL64(tmp1, csa[2]));
-            *p1 = FRAC_RND(tmp2 + MUL64(tmp0, csa[3]));
-#endif
-            p0--; p1++;
-            csa += 4;
-            tmp0 = *p0;
-            tmp1 = *p1;
-#if 0
-            *p0 = FRAC_RND(MUL64(tmp0, csa[0]) - MUL64(tmp1, csa[1]));
-            *p1 = FRAC_RND(MUL64(tmp0, csa[1]) + MUL64(tmp1, csa[0]));
-#else
-            tmp2= MUL64(tmp0 + tmp1, csa[0]);
-            *p0 = FRAC_RND(tmp2 - MUL64(tmp1, csa[2]));
-            *p1 = FRAC_RND(tmp2 + MUL64(tmp0, csa[3]));
-#endif
-            p0--; p1++;
-            csa += 4;
-        }
+        int tmp0, tmp1, tmp2;
+        csa = &csa_table[0][0];
+#define INT_AA(j) \
+            tmp0 = ptr[-1-j];\
+            tmp1 = ptr[   j];\
+            tmp2= MULH(tmp0 + tmp1, csa[0+4*j]);\
+            ptr[-1-j] = 4*(tmp2 - MULH(tmp1, csa[2+4*j]));\
+            ptr[   j] = 4*(tmp2 + MULH(tmp0, csa[3+4*j]));
+
+        INT_AA(0)
+        INT_AA(1)
+        INT_AA(2)
+        INT_AA(3)
+        INT_AA(4)
+        INT_AA(5)
+        INT_AA(6)
+        INT_AA(7)
+            
         ptr += 18;       
     }
 }
@@ -1971,8 +1956,8 @@ static void compute_antialias_integer(MPADecodeContext *s,
 static void compute_antialias_float(MPADecodeContext *s,
                               GranuleDef *g)
 {
-    int32_t *ptr, *p0, *p1;
-    int n, i, j;
+    int32_t *ptr;
+    int n, i;
 
     /* we antialias only "long" bands */
     if (g->block_type == 2) {
@@ -1986,35 +1971,23 @@ static void compute_antialias_float(MPADecodeContext *s,
     
     ptr = g->sb_hybrid + 18;
     for(i = n;i > 0;i--) {
+        float tmp0, tmp1;
         float *csa = &csa_table_float[0][0];       
-        p0 = ptr - 1;
-        p1 = ptr;
-        for(j=0;j<4;j++) {
-            float tmp0 = *p0;
-            float tmp1 = *p1;
-#if 1
-            *p0 = lrintf(tmp0 * csa[0] - tmp1 * csa[1]);
-            *p1 = lrintf(tmp0 * csa[1] + tmp1 * csa[0]);
-#else
-            float tmp2= (tmp0 + tmp1) * csa[0];
-            *p0 = lrintf(tmp2 - tmp1 * csa[2]);
-            *p1 = lrintf(tmp2 + tmp0 * csa[3]);
-#endif
-            p0--; p1++;
-            csa += 4;
-            tmp0 = *p0;
-            tmp1 = *p1;
-#if 1
-            *p0 = lrintf(tmp0 * csa[0] - tmp1 * csa[1]);
-            *p1 = lrintf(tmp0 * csa[1] + tmp1 * csa[0]);
-#else
-            tmp2= (tmp0 + tmp1) * csa[0];
-            *p0 = lrintf(tmp2 - tmp1 * csa[2]);
-            *p1 = lrintf(tmp2 + tmp0 * csa[3]);
-#endif
-            p0--; p1++;
-            csa += 4;
-        }
+#define FLOAT_AA(j)\
+        tmp0= ptr[-1-j];\
+        tmp1= ptr[   j];\
+        ptr[-1-j] = lrintf(tmp0 * csa[0+4*j] - tmp1 * csa[1+4*j]);\
+        ptr[   j] = lrintf(tmp0 * csa[1+4*j] + tmp1 * csa[0+4*j]);
+        
+        FLOAT_AA(0)
+        FLOAT_AA(1)
+        FLOAT_AA(2)
+        FLOAT_AA(3)
+        FLOAT_AA(4)
+        FLOAT_AA(5)
+        FLOAT_AA(6)
+        FLOAT_AA(7)
+
         ptr += 18;       
     }
 }
@@ -2024,11 +1997,9 @@ static void compute_imdct(MPADecodeContext *s,
                           int32_t *sb_samples,
                           int32_t *mdct_buf)
 {
-    int32_t *ptr, *win, *win1, *buf, *buf2, *out_ptr, *ptr1;
-    int32_t in[6];
-    int32_t out[36];
+    int32_t *ptr, *win, *win1, *buf, *out_ptr, *ptr1;
     int32_t out2[12];
-    int i, j, k, mdct_long_end, v, sblimit;
+    int i, j, mdct_long_end, v, sblimit;
 
     /* find last non zero block */
     ptr = g->sb_hybrid + 576;
@@ -2054,7 +2025,6 @@ static void compute_imdct(MPADecodeContext *s,
     buf = mdct_buf;
     ptr = g->sb_hybrid;
     for(j=0;j<mdct_long_end;j++) {
-        imdct36(out, ptr);
         /* apply window & overlap with previous buffer */
         out_ptr = sb_samples + j;
         /* select window */
@@ -2064,45 +2034,38 @@ static void compute_imdct(MPADecodeContext *s,
             win1 = mdct_win[g->block_type];
         /* select frequency inversion */
         win = win1 + ((4 * 36) & -(j & 1));
-        for(i=0;i<18;i++) {
-            *out_ptr = MULL(out[i], win[i]) + buf[i];
-            buf[i] = MULL(out[i + 18], win[i + 18]);
-            out_ptr += SBLIMIT;
-        }
+        imdct36(out_ptr, buf, ptr, win);
+        out_ptr += 18*SBLIMIT;
         ptr += 18;
         buf += 18;
     }
     for(j=mdct_long_end;j<sblimit;j++) {
-        for(i=0;i<6;i++) {
-            out[i] = 0;
-            out[6 + i] = 0;
-            out[30+i] = 0;
-        }
         /* select frequency inversion */
         win = mdct_win[2] + ((4 * 36) & -(j & 1));
-        buf2 = out + 6;
-        for(k=0;k<3;k++) {
-            /* reorder input for short mdct */
-            ptr1 = ptr + k;
-            for(i=0;i<6;i++) {
-                in[i] = *ptr1;
-                ptr1 += 3;
-            }
-            imdct12(out2, in);
-            /* apply 12 point window and do small overlap */
-            for(i=0;i<6;i++) {
-                buf2[i] = MULL(out2[i], win[i]) + buf2[i];
-                buf2[i + 6] = MULL(out2[i + 6], win[i + 6]);
-            }
-            buf2 += 6;
-        }
-        /* overlap */
         out_ptr = sb_samples + j;
-        for(i=0;i<18;i++) {
-            *out_ptr = out[i] + buf[i];
-            buf[i] = out[i + 18];
+        
+        for(i=0; i<6; i++){
+            *out_ptr = buf[i];
             out_ptr += SBLIMIT;
         }
+        imdct12(out2, ptr + 0);
+        for(i=0;i<6;i++) {
+            *out_ptr = MULH(out2[i], win[i]) + buf[i + 6*1];
+            buf[i + 6*2] = MULH(out2[i + 6], win[i + 6]);
+            out_ptr += SBLIMIT;
+        }
+        imdct12(out2, ptr + 1);
+        for(i=0;i<6;i++) {
+            *out_ptr = MULH(out2[i], win[i]) + buf[i + 6*2];
+            buf[i + 6*0] = MULH(out2[i + 6], win[i + 6]);
+            out_ptr += SBLIMIT;
+        }
+        imdct12(out2, ptr + 2);
+        for(i=0;i<6;i++) {
+            buf[i + 6*0] = MULH(out2[i], win[i]) + buf[i + 6*0];
+            buf[i + 6*1] = MULH(out2[i + 6], win[i + 6]);
+            buf[i + 6*2] = 0;
+        }
         ptr += 18;
         buf += 18;
     }
@@ -2129,7 +2092,7 @@ void sample_dump(int fnum, int32_t *tab, int n)
     
     f = files[fnum];
     if (!f) {
-        sprintf(buf, "/tmp/out%d.%s.pcm", 
+        snprintf(buf, sizeof(buf), "/tmp/out%d.%s.pcm", 
                 fnum, 
 #ifdef USE_HIGHPRECISION
                 "hp"
@@ -2145,11 +2108,11 @@ void sample_dump(int fnum, int32_t *tab, int n)
     
     if (fnum == 0) {
         static int pos = 0;
-        printf("pos=%d\n", pos);
+        av_log(NULL, AV_LOG_DEBUG, "pos=%d\n", pos);
         for(i=0;i<n;i++) {
-            printf(" %0.4f", (double)tab[i] / FRAC_ONE);
+            av_log(NULL, AV_LOG_DEBUG, " %0.4f", (double)tab[i] / FRAC_ONE);
             if ((i % 18) == 17)
-                printf("\n");
+                av_log(NULL, AV_LOG_DEBUG, "\n");
         }
         pos += n;
     }
@@ -2297,9 +2260,11 @@ static int mp_decode_layer3(MPADecodeContext *s)
         }
     }
 
+  if (!s->adu_mode) {
     /* now we get bits from the main_data_begin offset */
     dprintf("seekback: %d\n", main_data_begin);
     seek_to_maindata(s, main_data_begin);
+  }
 
     for(gr=0;gr<nb_granules;gr++) {
         for(ch=0;ch<s->nb_channels;ch++) {
@@ -2459,10 +2424,10 @@ static int mp_decode_layer3(MPADecodeContext *s)
 }
 
 static int mp_decode_frame(MPADecodeContext *s, 
-                           short *samples)
+                           OUT_INT *samples)
 {
     int i, nb_frames, ch;
-    short *samples_ptr;
+    OUT_INT *samples_ptr;
 
     init_get_bits(&s->gb, s->inbuf + HEADER_SIZE, 
                   (s->inbuf_ptr - s->inbuf - HEADER_SIZE)*8);
@@ -2499,7 +2464,9 @@ static int mp_decode_frame(MPADecodeContext *s,
     for(ch=0;ch<s->nb_channels;ch++) {
         samples_ptr = samples + ch;
         for(i=0;i<nb_frames;i++) {
-            synth_filter(s, ch, samples_ptr, s->nb_channels,
+            ff_mpa_synth_filter(s->synth_buf[ch], &(s->synth_buf_offset[ch]),
+			 window, &s->dither_state,
+			 samples_ptr, s->nb_channels,
                          s->sb_samples[ch][i]);
             samples_ptr += 32 * s->nb_channels;
         }
@@ -2507,7 +2474,7 @@ static int mp_decode_frame(MPADecodeContext *s,
 #ifdef DEBUG
     s->frame_count++;        
 #endif
-    return nb_frames * 32 * sizeof(short) * s->nb_channels;
+    return nb_frames * 32 * sizeof(OUT_INT) * s->nb_channels;
 }
 
 static int decode_frame(AVCodecContext * avctx,
@@ -2518,7 +2485,7 @@ static int decode_frame(AVCodecContext * avctx,
     uint32_t header;
     uint8_t *buf_ptr;
     int len, out_size;
-    short *out_samples = data;
+    OUT_INT *out_samples = data;
 
     buf_ptr = buf;
     while (buf_size > 0) {
@@ -2551,7 +2518,7 @@ static int decode_frame(AVCodecContext * avctx,
 		header = (s->inbuf[0] << 24) | (s->inbuf[1] << 16) |
 		    (s->inbuf[2] << 8) | s->inbuf[3];
 
-		if (check_header(header) < 0) {
+		if (ff_mpa_check_header(header) < 0) {
 		    /* no sync found : move by one byte (inefficient, but simple!) */
 		    memmove(s->inbuf, s->inbuf + 1, s->inbuf_ptr - s->inbuf - 1);
 		    s->inbuf_ptr--;
@@ -2668,6 +2635,226 @@ static int decode_frame(AVCodecContext * avctx,
     return buf_ptr - buf;
 }
 
+
+/* Decode one mp3 ADU packet; the sync word is restored before the header is
+ * checked. The whole packet is always reported as consumed (returns buf_size). */
+static int decode_frame_adu(AVCodecContext * avctx,
+			void *data, int *data_size,
+			uint8_t * buf, int buf_size)
+{
+    MPADecodeContext *dec = avctx->priv_data;
+    uint32_t hdr;
+    int copy_len, produced;
+
+    /* Nothing is produced unless a frame gets handled below. */
+    *data_size = 0;
+
+    // Discard too short frames
+    if (buf_size < HEADER_SIZE)
+        return buf_size;
+
+    /* Stage the packet in the decoder input buffer, clamped to
+     * MPA_MAX_CODED_FRAME_SIZE bytes. */
+    copy_len = buf_size;
+    if (copy_len > MPA_MAX_CODED_FRAME_SIZE)
+        copy_len = MPA_MAX_CODED_FRAME_SIZE;
+    memcpy(dec->inbuf, buf, copy_len);
+    dec->inbuf_ptr = dec->inbuf + copy_len;
+
+    // Get header and restore sync word
+    hdr = (dec->inbuf[0] << 24) | (dec->inbuf[1] << 16) |
+          (dec->inbuf[2] << 8) | dec->inbuf[3] | 0xffe00000;
+
+    // Bad header, discard frame
+    if (ff_mpa_check_header(hdr) < 0)
+        return buf_size;
+
+    decode_header(dec, hdr);
+    /* update codec info */
+    avctx->sample_rate = dec->sample_rate;
+    avctx->channels = dec->nb_channels;
+    avctx->bit_rate = dec->bit_rate;
+    avctx->sub_id = dec->layer;
+    dec->frame_size = copy_len;
+    avctx->frame_size = dec->frame_size;
+
+    if (!avctx->parse_only) {
+        produced = mp_decode_frame(dec, data);
+    } else {
+        /* simply return the frame data */
+        *(uint8_t **)data = dec->inbuf;
+        produced = dec->inbuf_ptr - dec->inbuf;
+    }
+
+    *data_size = produced;
+    return buf_size;
+}
+
+
+/* Next 3 arrays are indexed by channel config number (passed via codecdata); unlisted entries 9..15 are 0 */
+static int mp3Frames[16] = {0,1,1,2,3,3,4,5,2};   /* number of mp3 decoder instances; 0 makes decode_init_mp3on4() reject the config */
+static int mp3Channels[16] = {0,1,2,3,4,5,6,8,4}; /* total output channels */
+/* per decoder offset of its first channel in the interleaved output buffer, assume output order is FL FR BL BR C LFE */
+static int chan_offset[9][5] = {
+    {0},
+    {0},            // C
+    {0},            // FLR
+    {2,0},          // C FLR
+    {2,0,3},        // C FLR BS
+    {4,0,2},        // C FLR BLRS
+    {4,0,2,5},      // C FLR BLRS LFE
+    {4,0,2,6,5},    // C FLR BLRS BLR LFE
+    {0,2}           // FLR BLRS
+};
+
+
+/* Init one MPADecodeContext per mp3 frame; returns 0, or -1 on bad extradata or failed allocation. */
+static int decode_init_mp3on4(AVCodecContext * avctx)
+{
+    MP3On4DecodeContext *s = avctx->priv_data;
+    int i;
+
+    if ((avctx->extradata_size < 2) || (avctx->extradata == NULL)) {
+        av_log(avctx, AV_LOG_ERROR, "Codec extradata missing or too short.\n");
+        return -1;
+    }
+
+    s->chan_cfg = (((unsigned char *)avctx->extradata)[1] >> 3) & 0x0f;
+    s->frames = mp3Frames[s->chan_cfg];
+    if(!s->frames) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid channel config number.\n");
+        return -1;
+    }
+    avctx->channels = mp3Channels[s->chan_cfg];
+
+    /* Init the first mp3 decoder in the standard way so that all tables get built.
+     * avctx->priv_data points at its context meanwhile, so decode_init() needs no
+     * change; the other decoders are set up below from the first context. */
+    s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext));
+    if (!s->mp3decctx[0]) // decode_init() would dereference a NULL priv_data
+        return -1;
+    avctx->priv_data = s->mp3decctx[0];
+    decode_init(avctx);
+    avctx->priv_data = s; // Restore mp3on4 context pointer
+    s->mp3decctx[0]->adu_mode = 1; // Set adu mode
+
+    /* One context per further frame (1 or 2 channels each, up to 5 frames) */
+    for (i = 1; i < s->frames; i++) {
+        s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext));
+        if (!s->mp3decctx[i]) {
+            while (i-- > 0) // undo the allocations made so far
+                av_freep(&s->mp3decctx[i]);
+            return -1;
+        }
+        s->mp3decctx[i]->compute_antialias = s->mp3decctx[0]->compute_antialias;
+        s->mp3decctx[i]->inbuf = &s->mp3decctx[i]->inbuf1[0][BACKSTEP_SIZE];
+        s->mp3decctx[i]->inbuf_ptr = s->mp3decctx[i]->inbuf;
+        s->mp3decctx[i]->adu_mode = 1;
+    }
+    return 0;
+}
+
+
+static int decode_close_mp3on4(AVCodecContext * avctx)
+{
+    MP3On4DecodeContext *ctx = avctx->priv_data;
+    int idx = 0;
+
+    /* release every per-frame decoder context that is set */
+    for (; idx < ctx->frames; idx++) {
+        if (ctx->mp3decctx[idx] != NULL) av_free(ctx->mp3decctx[idx]);
+    }
+    return 0;
+}
+
+
+/* Decode one MP3on4 packet: one size-prefixed mp3 frame per decoder context.
+ * A truncated or invalid frame discards the packet. Always returns buf_size. */
+static int decode_frame_mp3on4(AVCodecContext * avctx,
+			void *data, int *data_size,
+			uint8_t * buf, int buf_size)
+{
+    MP3On4DecodeContext *s = avctx->priv_data;
+    MPADecodeContext *m;
+    int len, out_size = 0;
+    uint32_t header;
+    OUT_INT *out_samples = data;
+    OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS];
+    OUT_INT *outptr, *bp;
+    int fsize;
+    unsigned char *start2 = buf, *start;
+    int fr, i, j, n;
+    int off = avctx->channels;
+    int *coff = chan_offset[s->chan_cfg];
+
+    len = buf_size;
+    *data_size = 0; // every early return below discards the packet
+    // Discard too short frames
+    if (buf_size < HEADER_SIZE)
+        return buf_size;
+    // With a single decoder no interleave is needed: decode straight to the output
+    outptr = s->frames == 1 ? out_samples : decoded_buf;
+
+    for (fr = 0; fr < s->frames; fr++) {
+        // Earlier frames may have used up the input: do not read past it
+        if (len < HEADER_SIZE)
+            return buf_size;
+        start = start2;
+        fsize = (start[0] << 4) | (start[1] >> 4);
+        if (fsize > len)
+            fsize = len;
+        start2 += fsize; // advance by the clamped size so start2 stays within buf
+        len -= fsize;
+        if (fsize > MPA_MAX_CODED_FRAME_SIZE)
+            fsize = MPA_MAX_CODED_FRAME_SIZE;
+        // A frame shorter than a header would be parsed from stale inbuf bytes
+        if (fsize < HEADER_SIZE)
+            return buf_size;
+        m = s->mp3decctx[fr];
+        assert (m != NULL);
+        /* copy original to new */
+        m->inbuf_ptr = m->inbuf + fsize;
+        memcpy(m->inbuf, start, fsize);
+        // Get header
+        header = (m->inbuf[0] << 24) | (m->inbuf[1] << 16) |
+                  (m->inbuf[2] << 8) | m->inbuf[3] | 0xfff00000;
+        if (ff_mpa_check_header(header) < 0) // Bad header, discard block
+            return buf_size;
+        decode_header(m, header);
+        mp_decode_frame(m, outptr); // not decoded_buf: a single decoder writes to the output
+
+        n = MPA_FRAME_SIZE * m->nb_channels;
+        out_size += n * sizeof(OUT_INT);
+        if(s->frames > 1) {
+            /* interleave output data */
+            bp = out_samples + coff[fr];
+            if(m->nb_channels == 1) {
+                for(j = 0; j < n; j++) {
+                    *bp = decoded_buf[j];
+                    bp += off;
+                }
+            } else {
+                for(j = 0; j < n; j++) {
+                    bp[0] = decoded_buf[j++];
+                    bp[1] = decoded_buf[j];
+                    bp += off;
+                }
+            }
+        }
+    }
+
+    /* update codec info */
+    avctx->sample_rate = s->mp3decctx[0]->sample_rate;
+    avctx->frame_size= buf_size;
+    avctx->bit_rate = 0;
+    for (i = 0; i < s->frames; i++)
+        avctx->bit_rate += s->mp3decctx[i]->bit_rate;
+
+    *data_size = out_size;
+    return buf_size;
+}
+
+
 AVCodec mp2_decoder =
 {
     "mp2",
@@ -2693,3 +2880,29 @@ AVCodec mp3_decoder =
     decode_frame,
     CODEC_CAP_PARSE_ONLY,
 };
+
+AVCodec mp3adu_decoder =
+{
+    "mp3adu",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_MP3ADU,
+    sizeof(MPADecodeContext), /* decode_frame_adu() uses avctx->priv_data as this type */
+    decode_init,
+    NULL,                     /* presumably the unused encode slot - TODO confirm against AVCodec */
+    NULL,                     /* presumably the close slot (nothing to free) - TODO confirm */
+    decode_frame_adu,
+    CODEC_CAP_PARSE_ONLY,     /* decode_frame_adu() honours avctx->parse_only */
+};
+
+AVCodec mp3on4_decoder =
+{
+    "mp3on4",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_MP3ON4,
+    sizeof(MP3On4DecodeContext), /* the *_mp3on4 callbacks use avctx->priv_data as this type */
+    decode_init_mp3on4,
+    NULL,                        /* presumably the unused encode slot - TODO confirm against AVCodec */
+    decode_close_mp3on4,         /* frees the per-frame decoder contexts */
+    decode_frame_mp3on4,
+    0                            /* presumably "no capability flags"; decode_frame_mp3on4() never reads parse_only */
+};
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 460fdbb32..6eecd0259 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -117,9 +117,10 @@ static uint8_t default_fcode_tab[MAX_MV*2+1];
 enum PixelFormat ff_yuv420p_list[2]= {PIX_FMT_YUV420P, -1};
 
 static void convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[2][64],
-                           const uint16_t *quant_matrix, int bias, int qmin, int qmax)
+                           const uint16_t *quant_matrix, int bias, int qmin, int qmax, int intra)
 {
     int qscale;
+    int shift=0;
 
     for(qscale=qmin; qscale<=qmax; qscale++){
         int i;
@@ -169,6 +170,23 @@ static void convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[
                 qmat16[qscale][1][i]= ROUNDED_DIV(bias<<(16-QUANT_BIAS_SHIFT), qmat16[qscale][0][i]);
             }
         }
+        
+        for(i=intra; i<64; i++){
+            int64_t max= 8191;
+            if (dsp->fdct == fdct_ifast
+#ifndef FAAN_POSTSCALE
+                   || dsp->fdct == ff_faandct
+#endif
+                   ) {
+                max= (8191LL*aanscales[i]) >> 14;
+            }
+            while(((max * qmat[qscale][i]) >> shift) > INT_MAX){ 
+                shift++;
+            }
+        }
+    }
+    if(shift){
+        av_log(NULL, AV_LOG_INFO, "Warning, QMAT_SHIFT is larger then %d, overflows possible\n", QMAT_SHIFT - shift);
     }
 }
 
@@ -231,7 +249,7 @@ int DCT_common_init(MpegEncContext *s)
 #ifdef CONFIG_ENCODERS
     s->dct_quantize= dct_quantize_c;
     s->denoise_dct= denoise_dct_c;
-#endif
+#endif //CONFIG_ENCODERS
         
 #ifdef HAVE_MMX
     MPV_common_init_mmx(s);
@@ -375,15 +393,15 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){
         pic->mb_type= pic->mb_type_base + s->mb_stride+1;
         if(s->out_format == FMT_H264){
             for(i=0; i<2; i++){
-                CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b4_array_size+2)  * sizeof(int16_t))
-                pic->motion_val[i]= pic->motion_val_base[i]+2;
+                CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b4_array_size+4)  * sizeof(int16_t))
+                pic->motion_val[i]= pic->motion_val_base[i]+4;
                 CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t))
             }
             pic->motion_subsample_log2= 2;
         }else if(s->out_format == FMT_H263 || s->encoding || (s->avctx->debug&FF_DEBUG_MV) || (s->avctx->debug_mv)){
             for(i=0; i<2; i++){
-                CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+2) * sizeof(int16_t))
-                pic->motion_val[i]= pic->motion_val_base[i]+2;
+                CHECKED_ALLOCZ(pic->motion_val_base[i], 2 * (b8_array_size+4) * sizeof(int16_t))
+                pic->motion_val[i]= pic->motion_val_base[i]+4;
                 CHECKED_ALLOCZ(pic->ref_index[i], b8_array_size * sizeof(uint8_t))
             }
             pic->motion_subsample_log2= 3;
@@ -447,7 +465,7 @@ static int init_duplicate_context(MpegEncContext *s, MpegEncContext *base){
     s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*17;
 
      //FIXME should be linesize instead of s->width*2 but that isnt known before get_buffer()
-    CHECKED_ALLOCZ(s->me.scratchpad,  (s->width+64)*2*16*2*sizeof(uint8_t)) 
+    CHECKED_ALLOCZ(s->me.scratchpad,  (s->width+64)*4*16*2*sizeof(uint8_t)) 
     s->rd_scratchpad=   s->me.scratchpad;
     s->b_scratchpad=    s->me.scratchpad;
     s->obmc_scratchpad= s->me.scratchpad + 16;
@@ -584,7 +602,6 @@ static void MPV_encode_defaults(MpegEncContext *s){
         done=1;
 
         default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) );
-        memset(default_mv_penalty, 0, sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1));
         memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1));
 
         for(i=-16; i<16; i++){
@@ -609,6 +626,9 @@ int MPV_common_init(MpegEncContext *s)
         return -1;
     }
 
+    if((s->width || s->height) && avcodec_check_dimensions(s->avctx, s->width, s->height))
+        return -1;
+
     dsputil_init(&s->dsp, s->avctx);
     DCT_common_init(s);
 
@@ -623,6 +643,10 @@ int MPV_common_init(MpegEncContext *s)
     mb_array_size= s->mb_height * s->mb_stride;
     mv_table_size= (s->mb_height+2) * s->mb_stride + 1;
 
+    /* set chroma shifts */
+    avcodec_get_chroma_sub_sample(s->avctx->pix_fmt,&(s->chroma_x_shift),
+                                                    &(s->chroma_y_shift) );
+
     /* set default edge pos, will be overriden in decode_header if needed */
     s->h_edge_pos= s->mb_width*16;
     s->v_edge_pos= s->mb_height*16;
@@ -728,9 +752,6 @@ int MPV_common_init(MpegEncContext *s)
         CHECKED_ALLOCZ(s->coded_block_base, y_size);
         s->coded_block= s->coded_block_base + s->b8_stride + 1;
         
-        /* divx501 bitstream reorder buffer */
-        CHECKED_ALLOCZ(s->bitstream_buffer, BITSTREAM_BUFFER_SIZE);
-
         /* cbp, ac_pred, pred_dir */
         CHECKED_ALLOCZ(s->cbp_table  , mb_array_size * sizeof(uint8_t))
         CHECKED_ALLOCZ(s->pred_dir_table, mb_array_size * sizeof(uint8_t))
@@ -835,6 +856,8 @@ void MPV_common_end(MpegEncContext *s)
     av_freep(&s->mbskip_table);
     av_freep(&s->prev_pict_types);
     av_freep(&s->bitstream_buffer);
+    s->allocated_bitstream_buffer_size=0;
+
     av_freep(&s->avctx->stats_out);
     av_freep(&s->ac_stats);
     av_freep(&s->error_status_table);
@@ -858,9 +881,12 @@ void MPV_common_end(MpegEncContext *s)
     s->last_picture_ptr=
     s->next_picture_ptr=
     s->current_picture_ptr= NULL;
+    s->linesize= s->uvlinesize= 0;
 
     for(i=0; i<3; i++)
         av_freep(&s->visualization_buffer[i]);
+
+    avcodec_default_free_buffers(s->avctx);
 }
 
 #ifdef CONFIG_ENCODERS
@@ -874,7 +900,22 @@ int MPV_encode_init(AVCodecContext *avctx)
     
     MPV_encode_defaults(s);
 
-    avctx->pix_fmt = PIX_FMT_YUV420P; // FIXME
+    if(avctx->pix_fmt != PIX_FMT_YUVJ420P && avctx->pix_fmt != PIX_FMT_YUV420P){
+        av_log(avctx, AV_LOG_ERROR, "only YUV420 is supported\n");
+        return -1;
+    }
+
+    if(avctx->codec_id == CODEC_ID_MJPEG || avctx->codec_id == CODEC_ID_LJPEG){
+        if(avctx->strict_std_compliance>=0 && avctx->pix_fmt != PIX_FMT_YUVJ420P){
+            av_log(avctx, AV_LOG_ERROR, "colorspace not supported in jpeg\n");
+            return -1;
+        }
+    }else{
+        if(avctx->strict_std_compliance>=0 && avctx->pix_fmt != PIX_FMT_YUV420P){
+            av_log(avctx, AV_LOG_ERROR, "colorspace not supported\n");
+            return -1;
+        }
+    }
 
     s->bit_rate = avctx->bit_rate;
     s->width = avctx->width;
@@ -897,6 +938,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->mpeg_quant= avctx->mpeg_quant;
     s->rtp_mode= !!avctx->rtp_payload_size;
     s->intra_dc_precision= avctx->intra_dc_precision;
+    s->user_specified_pts = AV_NOPTS_VALUE;
 
     if (s->gop_size <= 1) {
         s->intra_only = 1;
@@ -915,6 +957,7 @@ int MPV_encode_init(AVCodecContext *avctx)
                         || s->avctx->temporal_cplx_masking 
                         || s->avctx->spatial_cplx_masking
                         || s->avctx->p_masking
+                        || s->avctx->border_masking
                         || (s->flags&CODEC_FLAG_QP_RD))
                        && !s->fixed_qscale;
     
@@ -931,6 +974,16 @@ int MPV_encode_init(AVCodecContext *avctx)
         av_log(avctx, AV_LOG_INFO, "Warning min_rate > 0 but min_rate != max_rate isnt recommanded!\n");
     }
     
+    if(avctx->rc_min_rate && avctx->rc_min_rate > avctx->bit_rate){
+        av_log(avctx, AV_LOG_INFO, "bitrate below min bitrate\n");
+        return -1;
+    }
+    
+    if(avctx->rc_max_rate && avctx->rc_max_rate < avctx->bit_rate){
+        av_log(avctx, AV_LOG_INFO, "bitrate above max bitrate\n");
+        return -1;
+    }
+        
     if(   s->avctx->rc_max_rate && s->avctx->rc_min_rate == s->avctx->rc_max_rate 
        && (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO)
        && 90000LL * (avctx->rc_buffer_size-1) > s->avctx->rc_max_rate*0xFFFFLL){
@@ -1005,6 +1058,21 @@ int MPV_encode_init(AVCodecContext *avctx)
     if(s->avctx->thread_count > 1)
         s->rtp_mode= 1;
 
+    if(!avctx->frame_rate || !avctx->frame_rate_base){
+        av_log(avctx, AV_LOG_ERROR, "framerate not set\n");
+        return -1;
+    }
+    
+    i= (INT_MAX/2+128)>>8;
+    if(avctx->me_threshold >= i){
+        av_log(avctx, AV_LOG_ERROR, "me_threshold too large, max is %d\n", i - 1);
+        return -1;
+    }
+    if(avctx->mb_threshold >= i){
+        av_log(avctx, AV_LOG_ERROR, "mb_threshold too large, max is %d\n", i - 1);
+        return -1;
+    }
+        
     i= ff_gcd(avctx->frame_rate, avctx->frame_rate_base);
     if(i > 1){
         av_log(avctx, AV_LOG_INFO, "removing common factors from framerate\n");
@@ -1065,7 +1133,11 @@ int MPV_encode_init(AVCodecContext *avctx)
         avctx->delay=0;
         s->low_delay=1;
         break;
-#ifdef CONFIG_RISKY
+    case CODEC_ID_H261:
+        s->out_format = FMT_H261;
+        avctx->delay=0;
+        s->low_delay=1;
+        break;
     case CODEC_ID_H263:
         if (h263_get_picture_format(s->width, s->height) == 7) {
             av_log(avctx, AV_LOG_INFO, "Input picture size isn't suitable for h263 codec! try h263+\n");
@@ -1107,6 +1179,16 @@ int MPV_encode_init(AVCodecContext *avctx)
         avctx->delay=0;
         s->low_delay=1;
         break;
+    case CODEC_ID_RV20:
+        s->out_format = FMT_H263;
+        avctx->delay=0;
+        s->low_delay=1;
+        s->modified_quant=1;
+        s->h263_aic=1;
+        s->h263_plus=1;
+        s->loop_filter=1;
+        s->unrestricted_mv= s->obmc || s->loop_filter || s->umvplus;
+        break;
     case CODEC_ID_MPEG4:
         s->out_format = FMT_H263;
         s->h263_pred = 1;
@@ -1162,11 +1244,12 @@ int MPV_encode_init(AVCodecContext *avctx)
         avctx->delay=0;
         s->low_delay=1;
         break;
-#endif
 #endif /* #if 0 */
     default:
         return -1;
     }
+    
+    avctx->has_b_frames= !s->low_delay;
 
     s->encoding = 1;
 
@@ -1181,39 +1264,31 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->quant_precision=5;
     
     ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp);
+    ff_set_cmp(&s->dsp, s->dsp.frame_skip_cmp, s->avctx->frame_skip_cmp);
     
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-    ff_init_me(s);
-#endif /* #if 0 */
-
-#ifdef CONFIG_ENCODERS
-/* xine: do not need this for decode or MPEG-1 encoding modes */
-#if 0
-#ifdef CONFIG_RISKY
+    if (s->out_format == FMT_H261)
+        ff_h261_encode_init(s);
     if (s->out_format == FMT_H263)
         h263_encode_init(s);
     if(s->msmpeg4_version)
         ff_msmpeg4_encode_init(s);
-#endif
 #endif /* #if 0 */
-/* xine: we do want this for MPEG-1 encoding */
+/* xine: we DO want this for MPEG-1 encoding */
     if (s->out_format == FMT_MPEG1)
         ff_mpeg1_encode_init(s);
-#endif
 
     /* init q matrix */
     for(i=0;i<64;i++) {
         int j= s->dsp.idct_permutation[i];
-#ifdef CONFIG_RISKY
         if(s->codec_id==CODEC_ID_MPEG4 && s->mpeg_quant){
             s->intra_matrix[j] = ff_mpeg4_default_intra_matrix[i];
             s->inter_matrix[j] = ff_mpeg4_default_non_intra_matrix[i];
-        }else if(s->out_format == FMT_H263){
+        }else if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
             s->intra_matrix[j] =
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
         }else
-#endif
         { /* mpeg1/2 */
             s->intra_matrix[j] = ff_mpeg1_default_intra_matrix[i];
             s->inter_matrix[j] = ff_mpeg1_default_non_intra_matrix[i];
@@ -1228,9 +1303,9 @@ int MPV_encode_init(AVCodecContext *avctx)
     /* for mjpeg, we do include qscale in the matrix */
     if (s->out_format != FMT_MJPEG) {
         convert_matrix(&s->dsp, s->q_intra_matrix, s->q_intra_matrix16, 
-                       s->intra_matrix, s->intra_quant_bias, 1, 31);
+                       s->intra_matrix, s->intra_quant_bias, avctx->qmin, 31, 1);
         convert_matrix(&s->dsp, s->q_inter_matrix, s->q_inter_matrix16, 
-                       s->inter_matrix, s->inter_quant_bias, 1, 31);
+                       s->inter_matrix, s->inter_quant_bias, avctx->qmin, 31, 0);
     }
 
     if(ff_rate_control_init(s) < 0)
@@ -1263,12 +1338,16 @@ int MPV_encode_end(AVCodecContext *avctx)
 
 #endif //CONFIG_ENCODERS
 
-void init_rl(RLTable *rl)
+void init_rl(RLTable *rl, int use_static)
 {
     int8_t max_level[MAX_RUN+1], max_run[MAX_LEVEL+1];
     uint8_t index_run[MAX_RUN+1];
     int last, run, level, start, end, i;
 
+    /* If table is static, we can quit if rl->max_level[0] is not NULL */
+    if(use_static && rl->max_level[0])
+        return;
+
     /* compute max_level[], max_run[] and index_run[] */
     for(last=0;last<2;last++) {
         if (last == 0) {
@@ -1292,11 +1371,20 @@ void init_rl(RLTable *rl)
             if (run > max_run[level])
                 max_run[level] = run;
         }
-        rl->max_level[last] = av_malloc(MAX_RUN + 1);
+        if(use_static)
+            rl->max_level[last] = av_mallocz_static(MAX_RUN + 1);
+        else
+            rl->max_level[last] = av_malloc(MAX_RUN + 1);
         memcpy(rl->max_level[last], max_level, MAX_RUN + 1);
-        rl->max_run[last] = av_malloc(MAX_LEVEL + 1);
+        if(use_static)
+            rl->max_run[last] = av_mallocz_static(MAX_LEVEL + 1);
+        else
+            rl->max_run[last] = av_malloc(MAX_LEVEL + 1);
         memcpy(rl->max_run[last], max_run, MAX_LEVEL + 1);
-        rl->index_run[last] = av_malloc(MAX_RUN + 1);
+        if(use_static)
+            rl->index_run[last] = av_mallocz_static(MAX_RUN + 1);
+        else
+            rl->index_run[last] = av_malloc(MAX_RUN + 1);
         memcpy(rl->index_run[last], index_run, MAX_RUN + 1);
     }
 }
@@ -1409,7 +1497,8 @@ alloc:
             pic= (AVFrame*)&s->picture[i];
         }
 
-        pic->reference= s->pict_type != B_TYPE && !s->dropable ? 3 : 0;
+        pic->reference= (s->pict_type != B_TYPE || s->codec_id == CODEC_ID_H264)
+                        && !s->dropable ? 3 : 0;
 
         pic->coded_picture_number= s->coded_picture_number++;
         
@@ -1472,7 +1561,7 @@ alloc:
     if(s->mpeg_quant || s->codec_id == CODEC_ID_MPEG2VIDEO){
         s->dct_unquantize_intra = s->dct_unquantize_mpeg2_intra;
         s->dct_unquantize_inter = s->dct_unquantize_mpeg2_inter;
-    }else if(s->out_format == FMT_H263){
+    }else if(s->out_format == FMT_H263 || s->out_format == FMT_H261){
         s->dct_unquantize_intra = s->dct_unquantize_h263_intra;
         s->dct_unquantize_inter = s->dct_unquantize_h263_inter;
     }else{
@@ -1504,7 +1593,7 @@ void MPV_frame_end(MpegEncContext *s)
         XVMC_field_end(s);
     }else
 #endif
-    if(s->unrestricted_mv && s->pict_type != B_TYPE && !s->intra_only && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
+    if(s->unrestricted_mv && s->current_picture.reference && !s->intra_only && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
             draw_edges(s->current_picture.data[0], s->linesize  , s->h_edge_pos   , s->v_edge_pos   , EDGE_WIDTH  );
             draw_edges(s->current_picture.data[1], s->uvlinesize, s->h_edge_pos>>1, s->v_edge_pos>>1, EDGE_WIDTH/2);
             draw_edges(s->current_picture.data[2], s->uvlinesize, s->h_edge_pos>>1, s->v_edge_pos>>1, EDGE_WIDTH/2);
@@ -1540,6 +1629,7 @@ void MPV_frame_end(MpegEncContext *s)
     memset(&s->next_picture, 0, sizeof(Picture));
     memset(&s->current_picture, 0, sizeof(Picture));
 #endif
+    s->avctx->coded_frame= (AVFrame*)s->current_picture_ptr;
 }
 
 /**
@@ -1713,11 +1803,15 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
         uint8_t *ptr;
         int i;
         int h_chroma_shift, v_chroma_shift;
+        const int width = s->avctx->width;
+        const int height= s->avctx->height;
+        const int mv_sample_log2= 4 - pict->motion_subsample_log2;
+        const int mv_stride= (s->mb_width << mv_sample_log2) + 1;
         s->low_delay=0; //needed to see the vectors without trashing the buffers
 
         avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
         for(i=0; i<3; i++){
-            memcpy(s->visualization_buffer[i], pict->data[i], (i==0) ? pict->linesize[i]*s->height:pict->linesize[i]*s->height >> v_chroma_shift);
+            memcpy(s->visualization_buffer[i], pict->data[i], (i==0) ? pict->linesize[i]*height:pict->linesize[i]*height >> v_chroma_shift);
             pict->data[i]= s->visualization_buffer[i];
         }
         pict->type= FF_BUFFER_TYPE_COPY;
@@ -1748,38 +1842,51 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
                     if(!USES_LIST(pict->mb_type[mb_index], direction))
                         continue;
 
-                    //FIXME for h264
                     if(IS_8X8(pict->mb_type[mb_index])){
                       int i;
                       for(i=0; i<4; i++){
                         int sx= mb_x*16 + 4 + 8*(i&1);
                         int sy= mb_y*16 + 4 + 8*(i>>1);
-                        int xy= mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*s->b8_stride;
+                        int xy= (mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*mv_stride) << (mv_sample_log2-1);
                         int mx= (pict->motion_val[direction][xy][0]>>shift) + sx;
                         int my= (pict->motion_val[direction][xy][1]>>shift) + sy;
-                        draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
+                        draw_arrow(ptr, sx, sy, mx, my, width, height, s->linesize, 100);
                       }
                     }else if(IS_16X8(pict->mb_type[mb_index])){
                       int i;
                       for(i=0; i<2; i++){
                         int sx=mb_x*16 + 8;
                         int sy=mb_y*16 + 4 + 8*i;
-                        int xy= mb_x*2 + (mb_y*2 + i)*s->b8_stride;
+                        int xy= (mb_x*2 + (mb_y*2 + i)*mv_stride) << (mv_sample_log2-1);
+                        int mx=(pict->motion_val[direction][xy][0]>>shift);
+                        int my=(pict->motion_val[direction][xy][1]>>shift);
+                        
+                        if(IS_INTERLACED(pict->mb_type[mb_index]))
+                            my*=2;
+                        
+                        draw_arrow(ptr, sx, sy, mx+sx, my+sy, width, height, s->linesize, 100);
+                      }
+                    }else if(IS_8X16(pict->mb_type[mb_index])){
+                      int i;
+                      for(i=0; i<2; i++){
+                        int sx=mb_x*16 + 4 + 8*i;
+                        int sy=mb_y*16 + 8;
+                        int xy= (mb_x*2 + i + mb_y*2*mv_stride) << (mv_sample_log2-1);
                         int mx=(pict->motion_val[direction][xy][0]>>shift);
                         int my=(pict->motion_val[direction][xy][1]>>shift);
                         
                         if(IS_INTERLACED(pict->mb_type[mb_index]))
                             my*=2;
                         
-                        draw_arrow(ptr, sx, sy, mx+sx, my+sy, s->width, s->height, s->linesize, 100);
+                        draw_arrow(ptr, sx, sy, mx+sx, my+sy, width, height, s->linesize, 100);
                       }
                     }else{
                       int sx= mb_x*16 + 8;
                       int sy= mb_y*16 + 8;
-                      int xy= mb_x*2 + mb_y*2*s->b8_stride;
+                      int xy= (mb_x + mb_y*mv_stride) << mv_sample_log2;
                       int mx= (pict->motion_val[direction][xy][0]>>shift) + sx;
                       int my= (pict->motion_val[direction][xy][1]>>shift) + sy;
-                      draw_arrow(ptr, sx, sy, mx, my, s->width, s->height, s->linesize, 100);
+                      draw_arrow(ptr, sx, sy, mx, my, width, height, s->linesize, 100);
                     }
                   }                  
                 }
@@ -1842,6 +1949,21 @@ v= (int)(128 + r*sin(theta*3.141592/180));
                         for(y=0; y<16; y++)
                             pict->data[0][16*mb_x + 8 + (16*mb_y + y)*pict->linesize[0]]^= 0x80;
                     }
+                    if(IS_8X8(mb_type) && mv_sample_log2 >= 2){
+                        int dm= 1 << (mv_sample_log2-2);
+                        for(i=0; i<4; i++){
+                            int sx= mb_x*16 + 8*(i&1);
+                            int sy= mb_y*16 + 8*(i>>1);
+                            int xy= (mb_x*2 + (i&1) + (mb_y*2 + (i>>1))*mv_stride) << (mv_sample_log2-1);
+                            //FIXME bidir
+                            int32_t *mv = (int32_t*)&pict->motion_val[0][xy];
+                            if(mv[0] != mv[dm] || mv[dm*mv_stride] != mv[dm*(mv_stride+1)])
+                                for(y=0; y<8; y++)
+                                    pict->data[0][sx + 4 + (sy + y)*pict->linesize[0]]^= 0x80;
+                            if(mv[0] != mv[dm*mv_stride] || mv[dm] != mv[dm*(mv_stride+1)])
+                                *(uint64_t*)(pict->data[0] + sx + (sy + 4)*pict->linesize[0])^= 0x8080808080808080ULL;
+                        }
+                    }
                         
                     if(IS_INTERLACED(mb_type) && s->codec_id == CODEC_ID_H264){
                         // hmm
@@ -1891,10 +2013,37 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, uint8_t *ref, int st
 
 static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
     AVFrame *pic=NULL;
+    int64_t pts;
     int i;
     const int encoding_delay= s->max_b_frames;
     int direct=1;
     
+    if(pic_arg){
+        pts= pic_arg->pts;
+        pic_arg->display_picture_number= s->input_picture_number++;
+
+        if(pts != AV_NOPTS_VALUE){ 
+            if(s->user_specified_pts != AV_NOPTS_VALUE){
+                int64_t time= av_rescale(pts, s->avctx->frame_rate, s->avctx->frame_rate_base*(int64_t)AV_TIME_BASE);
+                int64_t last= av_rescale(s->user_specified_pts, s->avctx->frame_rate, s->avctx->frame_rate_base*(int64_t)AV_TIME_BASE);
+            
+                if(time <= last){            
+                    av_log(s->avctx, AV_LOG_ERROR, "Error, Invalid timestamp=%Ld, last=%Ld\n", pts, s->user_specified_pts);
+                    return -1;
+                }
+            }
+            s->user_specified_pts= pts;
+        }else{
+            if(s->user_specified_pts != AV_NOPTS_VALUE){
+                s->user_specified_pts= 
+                pts= s->user_specified_pts + AV_TIME_BASE*(int64_t)s->avctx->frame_rate_base / s->avctx->frame_rate;
+                av_log(s->avctx, AV_LOG_INFO, "Warning: AVFrame.pts=? trying to guess (%Ld)\n", pts);
+            }else{
+                pts= av_rescale(pic_arg->display_picture_number*(int64_t)s->avctx->frame_rate_base, AV_TIME_BASE, s->avctx->frame_rate);
+            }
+        }
+    }
+
   if(pic_arg){
     if(encoding_delay && !(s->flags&CODEC_FLAG_INPUT_PRESERVED)) direct=0;
     if(pic_arg->linesize[0] != s->linesize) direct=0;
@@ -1954,18 +2103,7 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
         }
     }
     copy_picture_attributes(s, pic, pic_arg);
-    
-    pic->display_picture_number= s->input_picture_number++;
-    if(pic->pts != AV_NOPTS_VALUE){ 
-        s->user_specified_pts= pic->pts;
-    }else{
-        if(s->user_specified_pts){
-            pic->pts= s->user_specified_pts + AV_TIME_BASE*(int64_t)s->avctx->frame_rate_base / s->avctx->frame_rate;
-            av_log(s->avctx, AV_LOG_INFO, "Warning: AVFrame.pts=? trying to guess (%Ld)\n", pic->pts);
-        }else{
-            pic->pts= av_rescale(pic->display_picture_number*(int64_t)s->avctx->frame_rate_base, AV_TIME_BASE, s->avctx->frame_rate);
-        }
-    }
+    pic->pts= pts; //we set this here to avoid modifiying pic_arg
   }
   
     /* shift buffer entries */
@@ -1977,6 +2115,38 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
     return 0;
 }
 
+/* Returns 1 if the summed difference between p and ref is below the skip limits
+ * (so p may be dropped), else 0; per-block scores are combined per frame_skip_exp. */
+static int skip_check(MpegEncContext *s, Picture *p, Picture *ref){
+    int x, y, plane;
+    int score=0;
+    int64_t score64=0;
+    for(plane=0; plane<3; plane++){
+        const int stride= p->linesize[plane];
+        const int bw= plane ? 1 : 2; // plane 0 is walked as 2x2 8x8 blocks per macroblock, planes 1/2 as one
+        for(y=0; y<s->mb_height*bw; y++){
+            for(x=0; x<s->mb_width*bw; x++){
+                int v= s->dsp.frame_skip_cmp[1](s, p->data[plane] + 8*(x + y*stride), ref->data[plane] + 8*(x + y*stride), stride, 8);
+                // widen to 64 bit before multiplying so that powers of v cannot overflow int
+                switch(s->avctx->frame_skip_exp){
+                    case 0: score= FFMAX(score, v); break;
+                    case 1: score+= ABS(v);break;
+                    case 2: score64+= v*(int64_t)v;break;
+                    case 3: score64+= ABS(v*(int64_t)v*v);break;
+                    case 4: score64+= (v*(int64_t)v)*(v*(int64_t)v);break;
+                }
+            }
+        }
+    }
+    // exponents 0 and 1 accumulate in the int score, the higher ones directly in score64
+    if(score) score64= score;
+    if(score64 < s->avctx->frame_skip_threshold)
+        return 1;
+    if(score64 < ((s->avctx->frame_skip_factor * (int64_t)s->lambda)>>8))
+        return 1;
+    return 0;
+}
+
 static void select_input_picture(MpegEncContext *s){
     int i;
 
@@ -1992,28 +2162,43 @@ static void select_input_picture(MpegEncContext *s){
             s->reordered_input_picture[0]->coded_picture_number= s->coded_picture_number++;
         }else{
             int b_frames;
+
+            if(s->avctx->frame_skip_threshold || s->avctx->frame_skip_factor){
+                if(skip_check(s, s->input_picture[0], s->next_picture_ptr)){
+//av_log(NULL, AV_LOG_DEBUG, "skip %p %Ld\n", s->input_picture[0]->data[0], s->input_picture[0]->pts);
+                
+                    if(s->input_picture[0]->type == FF_BUFFER_TYPE_SHARED){
+                        for(i=0; i<4; i++)
+                            s->input_picture[0]->data[i]= NULL;
+                        s->input_picture[0]->type= 0;            
+                    }else{
+                        assert(   s->input_picture[0]->type==FF_BUFFER_TYPE_USER 
+                               || s->input_picture[0]->type==FF_BUFFER_TYPE_INTERNAL);
             
+                        s->avctx->release_buffer(s->avctx, (AVFrame*)s->input_picture[0]);
+                    }
+
+                    goto no_output_pic;
+                }
+            }
+
             if(s->flags&CODEC_FLAG_PASS2){
                 for(i=0; i<s->max_b_frames+1; i++){
                     int pict_num= s->input_picture[0]->display_picture_number + i;
-                    int pict_type= s->rc_context.entry[pict_num].new_pict_type;
-                    s->input_picture[i]->pict_type= pict_type;
-                    
-                    if(i + 1 >= s->rc_context.num_entries) break;
+
+                    if(pict_num >= s->rc_context.num_entries) 
+                        break;
+                    if(!s->input_picture[i]){
+                        s->rc_context.entry[pict_num-1].new_pict_type = P_TYPE;
+                        break;
+                    }
+
+                    s->input_picture[i]->pict_type= 
+                        s->rc_context.entry[pict_num].new_pict_type;
                 }
             }
 
-            if(s->input_picture[0]->pict_type){
-                /* user selected pict_type */
-                for(b_frames=0; b_frames<s->max_b_frames+1; b_frames++){
-                    if(s->input_picture[b_frames]->pict_type!=B_TYPE) break;
-                }
-            
-                if(b_frames > s->max_b_frames){
-                    av_log(s->avctx, AV_LOG_ERROR, "warning, too many bframes in a row\n");
-                    b_frames = s->max_b_frames;
-                }
-            }else if(s->avctx->b_frame_strategy==0){
+            if(s->avctx->b_frame_strategy==0){
                 b_frames= s->max_b_frames;
                 while(b_frames && !s->input_picture[b_frames]) b_frames--;
             }else if(s->avctx->b_frame_strategy==1){
@@ -2043,10 +2228,24 @@ static void select_input_picture(MpegEncContext *s){
 //static int b_count=0;
 //b_count+= b_frames;
 //av_log(s->avctx, AV_LOG_DEBUG, "b_frames: %d\n", b_count);
+
+            for(i= b_frames - 1; i>=0; i--){
+                int type= s->input_picture[i]->pict_type;
+                if(type && type != B_TYPE)
+                    b_frames= i;
+            }
+            if(s->input_picture[b_frames]->pict_type == B_TYPE && b_frames == s->max_b_frames){
+                av_log(s->avctx, AV_LOG_ERROR, "warning, too many bframes in a row\n");
+            }
+
             if(s->picture_in_gop_number + b_frames >= s->gop_size){
+              if((s->flags2 & CODEC_FLAG2_STRICT_GOP) && s->gop_size > s->picture_in_gop_number){
+                    b_frames= s->gop_size - s->picture_in_gop_number - 1;
+              }else{
                 if(s->flags & CODEC_FLAG_CLOSED_GOP)
                     b_frames=0;
                 s->input_picture[b_frames]->pict_type= I_TYPE;
+              }
             }
             
             if(   (s->flags & CODEC_FLAG_CLOSED_GOP)
@@ -2065,7 +2264,7 @@ static void select_input_picture(MpegEncContext *s){
             }
         }
     }
-    
+no_output_pic:
     if(s->reordered_input_picture[0]){
         s->reordered_input_picture[0]->reference= s->reordered_input_picture[0]->pict_type!=B_TYPE ? 3 : 0;
 
@@ -2116,7 +2315,7 @@ int MPV_encode_picture(AVCodecContext *avctx,
     AVFrame *pic_arg = data;
     int i, stuffing_count;
 
-    if(avctx->pix_fmt != PIX_FMT_YUV420P){
+    if(avctx->pix_fmt != PIX_FMT_YUV420P && avctx->pix_fmt != PIX_FMT_YUVJ420P){
         av_log(avctx, AV_LOG_ERROR, "this codec supports only YUV420P\n");
         return -1;
     }
@@ -2133,7 +2332,8 @@ int MPV_encode_picture(AVCodecContext *avctx,
 
     s->picture_in_gop_number++;
 
-    load_input_picture(s, pic_arg);
+    if(load_input_picture(s, pic_arg) < 0)
+        return -1;
     
     select_input_picture(s);
     
@@ -2171,11 +2371,18 @@ int MPV_encode_picture(AVCodecContext *avctx,
             avctx->error[i] += s->current_picture_ptr->error[i];
         }
 
+        if(s->flags&CODEC_FLAG_PASS1)
+            assert(avctx->header_bits + avctx->mv_bits + avctx->misc_bits + avctx->i_tex_bits + avctx->p_tex_bits == put_bits_count(&s->pb));
         flush_put_bits(&s->pb);
         s->frame_bits  = put_bits_count(&s->pb);
 
         stuffing_count= ff_vbv_update(s, s->frame_bits);
         if(stuffing_count){
+            if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < stuffing_count + 50){
+                av_log(s->avctx, AV_LOG_ERROR, "stuffing too large\n");
+                return -1;
+            }
+
             switch(s->codec_id){
             case CODEC_ID_MPEG1VIDEO:
             case CODEC_ID_MPEG2VIDEO:
@@ -2480,8 +2687,50 @@ static inline int hpel_motion(MpegEncContext *s,
     return emu;
 }
 
+/* Lowres variant of hpel_motion(): predicts one w x h block from src with
+ * pix_op[lowres]. Returns 1 when the edge emulation buffer was used, else 0. */
+static inline int hpel_motion_lowres(MpegEncContext *s, 
+                                  uint8_t *dest, uint8_t *src,
+                                  int field_based, int field_select,
+                                  int src_x, int src_y,
+                                  int width, int height, int stride,
+                                  int h_edge_pos, int v_edge_pos,
+                                  int w, int h, h264_chroma_mc_func *pix_op,
+                                  int motion_x, int motion_y)
+{
+    const int lowres= s->avctx->lowres;
+    const int s_mask= (2<<lowres)-1;
+    int emu=0;
+    int sx, sy;
+    if(s->quarter_sample){
+        motion_x/=2;
+        motion_y/=2;
+    }
+    // low lowres+1 bits of the vector: sub-sample phase; remaining bits: whole-sample offset
+    sx= motion_x & s_mask;
+    sy= motion_y & s_mask;
+    src_x += motion_x >> (lowres+1);
+    src_y += motion_y >> (lowres+1);
+    src += src_y * stride + src_x;
+    if(   (unsigned)src_x > h_edge_pos                 - (!!sx) - w
+       || (unsigned)src_y >(v_edge_pos >> field_based) - (!!sy) - h){
+        ff_emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, w+1, (h+1)<<field_based,
+                            src_x, src_y<<field_based, h_edge_pos, v_edge_pos);
+        src= s->edge_emu_buffer;
+        emu=1;
+    }
+    // (x << 2) >> lowres equals x << (2 - lowres) for lowres <= 2, but unlike the
+    // latter it never uses a negative shift count (undefined in C) for lowres > 2
+    sx= (sx << 2) >> lowres;
+    sy= (sy << 2) >> lowres;
+    if(field_select)
+        src += s->linesize;
+    pix_op[lowres](dest, src, stride, h, sx, sy);
+    return emu;
+}
+
 /* apply one mpeg motion vector to the three components */
-static inline void mpeg_motion(MpegEncContext *s,
+static always_inline void mpeg_motion(MpegEncContext *s,
                                uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                int field_based, int bottom_field, int field_select,
                                uint8_t **ref_picture, op_pixels_func (*pix_op)[4],
@@ -2504,7 +2753,7 @@ if(s->quarter_sample)
 
     dxy = ((motion_y & 1) << 1) | (motion_x & 1);
     src_x = s->mb_x* 16               + (motion_x >> 1);
-    src_y = s->mb_y*(16>>field_based) + (motion_y >> 1);
+    src_y =(s->mb_y<<(4-field_based)) + (motion_y >> 1);
 
     if (s->out_format == FMT_H263) {
         if((s->workaround_bugs & FF_BUG_HPEL_CHROMA) && field_based){
@@ -2512,18 +2761,39 @@ if(s->quarter_sample)
             my = motion_y >>1;
             uvdxy = ((my & 1) << 1) | (mx & 1);
             uvsrc_x = s->mb_x* 8               + (mx >> 1);
-            uvsrc_y = s->mb_y*(8>>field_based) + (my >> 1);
+            uvsrc_y = (s->mb_y<<(3-field_based)) + (my >> 1);
         }else{
             uvdxy = dxy | (motion_y & 2) | ((motion_x & 2) >> 1);
             uvsrc_x = src_x>>1;
             uvsrc_y = src_y>>1;
         }
+    }else if(s->out_format == FMT_H261){//even chroma mv's are full pel in H261
+        mx = motion_x / 4;
+        my = motion_y / 4;
+        uvdxy = 0;
+        uvsrc_x = s->mb_x*8 + mx;
+        uvsrc_y = s->mb_y*8 + my;
     } else {
-        mx = motion_x / 2;
-        my = motion_y / 2;
-        uvdxy = ((my & 1) << 1) | (mx & 1);
-        uvsrc_x = s->mb_x* 8               + (mx >> 1);
-        uvsrc_y = s->mb_y*(8>>field_based) + (my >> 1);
+        if(s->chroma_y_shift){
+            mx = motion_x / 2;
+            my = motion_y / 2;
+            uvdxy = ((my & 1) << 1) | (mx & 1);
+            uvsrc_x = s->mb_x* 8               + (mx >> 1);
+            uvsrc_y = (s->mb_y<<(3-field_based)) + (my >> 1);
+        } else {
+            if(s->chroma_x_shift){
+            //Chroma422
+                mx = motion_x / 2;
+                uvdxy = ((motion_y & 1) << 1) | (mx & 1);
+                uvsrc_x = s->mb_x* 8           + (mx >> 1);
+                uvsrc_y = src_y;
+            } else {
+            //Chroma444
+                uvdxy = dxy;
+                uvsrc_x = src_x;
+                uvsrc_y = src_y;
+            }
+        }
     }
 
     ptr_y  = ref_picture[0] + src_y * linesize + src_x;
@@ -2532,6 +2802,11 @@ if(s->quarter_sample)
 
     if(   (unsigned)src_x > s->h_edge_pos - (motion_x&1) - 16
        || (unsigned)src_y >    v_edge_pos - (motion_y&1) - h){
+            if(s->codec_id == CODEC_ID_MPEG2VIDEO ||
+               s->codec_id == CODEC_ID_MPEG1VIDEO){
+                av_log(s->avctx,AV_LOG_DEBUG,"MPEG motion vector out of boundary\n");
+                return ;
+            }
             ff_emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize, 17, 17+field_based,
                              src_x, src_y<<field_based, s->h_edge_pos, s->v_edge_pos);
             ptr_y = s->edge_emu_buffer;
@@ -2561,10 +2836,111 @@ if(s->quarter_sample)
     pix_op[0][dxy](dest_y, ptr_y, linesize, h);
     
     if(!(s->flags&CODEC_FLAG_GRAY)){
-        pix_op[1][uvdxy](dest_cb, ptr_cb, uvlinesize, h >> 1);
-        pix_op[1][uvdxy](dest_cr, ptr_cr, uvlinesize, h >> 1);
+        pix_op[s->chroma_x_shift][uvdxy](dest_cb, ptr_cb, uvlinesize, h >> s->chroma_y_shift);
+        pix_op[s->chroma_x_shift][uvdxy](dest_cr, ptr_cr, uvlinesize, h >> s->chroma_y_shift);
+    }
+    if(s->out_format == FMT_H261){
+        ff_h261_loop_filter(s);
+    }
+}
+
+/* Lowres variant of mpeg_motion(): applies one motion vector to the luma block
+ * (pix_op[lowres-1], so lowres must be >= 1) and to both chroma blocks (pix_op[lowres]). */
+static always_inline void mpeg_motion_lowres(MpegEncContext *s,
+                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                               int field_based, int bottom_field, int field_select,
+                               uint8_t **ref_picture, h264_chroma_mc_func *pix_op,
+                               int motion_x, int motion_y, int h)
+{
+    uint8_t *ptr_y, *ptr_cb, *ptr_cr;
+    int mx, my, src_x, src_y, uvsrc_x, uvsrc_y, uvlinesize, linesize, sx, sy, uvsx, uvsy;
+    const int lowres= s->avctx->lowres;
+    const int block_s= 8>>lowres;
+    const int s_mask= (2<<lowres)-1;
+    const int h_edge_pos = s->h_edge_pos >> lowres;
+    const int v_edge_pos = s->v_edge_pos >> lowres;
+    linesize   = s->current_picture.linesize[0] << field_based;
+    uvlinesize = s->current_picture.linesize[1] << field_based;
+    if(s->quarter_sample){ //FIXME obviously not perfect but qpel won't work in lowres anyway
+        motion_x/=2;
+        motion_y/=2;
+    }
+    
+    if(field_based){
+        motion_y += (bottom_field - field_select)*((1<<lowres)-1);
+    }
+    // low lowres+1 bits of the vector: sub-sample phase; remaining bits: whole-sample offset
+    sx= motion_x & s_mask;
+    sy= motion_y & s_mask;
+    src_x = s->mb_x*2*block_s               + (motion_x >> (lowres+1));
+    src_y =(s->mb_y*2*block_s>>field_based) + (motion_y >> (lowres+1));
+    // chroma position and phase are derived differently for each output format
+    if (s->out_format == FMT_H263) {
+        uvsx = ((motion_x>>1) & s_mask) | (sx&1);
+        uvsy = ((motion_y>>1) & s_mask) | (sy&1);
+        uvsrc_x = src_x>>1;
+        uvsrc_y = src_y>>1;
+    }else if(s->out_format == FMT_H261){//even chroma mv's are full pel in H261
+        mx = motion_x / 4;
+        my = motion_y / 4;
+        uvsx = (2*mx) & s_mask;
+        uvsy = (2*my) & s_mask;
+        uvsrc_x = s->mb_x*block_s               + (mx >> lowres);
+        uvsrc_y = s->mb_y*block_s               + (my >> lowres);
+    } else {
+        mx = motion_x / 2;
+        my = motion_y / 2;
+        uvsx = mx & s_mask;
+        uvsy = my & s_mask;
+        uvsrc_x = s->mb_x*block_s               + (mx >> (lowres+1));
+        uvsrc_y =(s->mb_y*block_s>>field_based) + (my >> (lowres+1));
+    }
+
+    ptr_y  = ref_picture[0] + src_y * linesize + src_x;
+    ptr_cb = ref_picture[1] + uvsrc_y * uvlinesize + uvsrc_x;
+    ptr_cr = ref_picture[2] + uvsrc_y * uvlinesize + uvsrc_x;
+    // source block crosses the edge limits: read all planes through the edge emulation buffer
+    if(   (unsigned)src_x > h_edge_pos                 - (!!sx) - 2*block_s
+       || (unsigned)src_y >(v_edge_pos >> field_based) - (!!sy) - h){
+            ff_emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize, 17, 17+field_based,
+                             src_x, src_y<<field_based, h_edge_pos, v_edge_pos);
+            ptr_y = s->edge_emu_buffer;
+            if(!(s->flags&CODEC_FLAG_GRAY)){
+                uint8_t *uvbuf= s->edge_emu_buffer+18*s->linesize;
+                ff_emulated_edge_mc(uvbuf  , ptr_cb, s->uvlinesize, 9, 9+field_based, 
+                                 uvsrc_x, uvsrc_y<<field_based, h_edge_pos>>1, v_edge_pos>>1);
+                ff_emulated_edge_mc(uvbuf+16, ptr_cr, s->uvlinesize, 9, 9+field_based, 
+                                 uvsrc_x, uvsrc_y<<field_based, h_edge_pos>>1, v_edge_pos>>1);
+                ptr_cb= uvbuf;
+                ptr_cr= uvbuf+16;
+            }
+    }
+
+    if(bottom_field){ //FIXME use this for field pix too instead of the obnoxious hack which changes picture.data
+        dest_y += s->linesize;
+        dest_cb+= s->uvlinesize;
+        dest_cr+= s->uvlinesize;
+    }
+
+    if(field_select){
+        ptr_y += s->linesize;
+        ptr_cb+= s->uvlinesize;
+        ptr_cr+= s->uvlinesize;
     }
+    // (x << 2) >> lowres equals x << (2 - lowres) for lowres <= 2 without ever shifting by a negative count
+    sx= (sx << 2) >> lowres;
+    sy= (sy << 2) >> lowres;
+    pix_op[lowres-1](dest_y, ptr_y, linesize, h, sx, sy);
+    
+    if(!(s->flags&CODEC_FLAG_GRAY)){
+        uvsx= (uvsx << 2) >> lowres;
+        uvsy= (uvsy << 2) >> lowres;
+        pix_op[lowres](dest_cb, ptr_cb, uvlinesize, h >> s->chroma_y_shift, uvsx, uvsy);
+        pix_op[lowres](dest_cr, ptr_cr, uvlinesize, h >> s->chroma_y_shift, uvsx, uvsy);
+    }
+    //FIXME h261 lowres loop filter
 }
+
 //FIXME move to dsputil, avg variant, 16x16 version
 static inline void put_obmc(uint8_t *dst, uint8_t *src[5], int stride){
     int x;
@@ -2792,6 +3168,56 @@ static inline void chroma_4mv_motion(MpegEncContext *s,
     pix_op[dxy](dest_cr, ptr, s->uvlinesize, 8);
 }
 
+/* Lowres variant of chroma_4mv_motion(): predicts the cb and cr blocks of a macroblock
+ * from one vector obtained by passing (mx, my) through ff_h263_round_chroma(). */
+static inline void chroma_4mv_motion_lowres(MpegEncContext *s,
+                                     uint8_t *dest_cb, uint8_t *dest_cr,
+                                     uint8_t **ref_picture,
+                                     h264_chroma_mc_func *pix_op,
+                                     int mx, int my){
+    const int lowres= s->avctx->lowres;
+    const int block_s= 8>>lowres;
+    const int s_mask= (2<<lowres)-1;
+    const int h_edge_pos = s->h_edge_pos >> (lowres+1);
+    const int v_edge_pos = s->v_edge_pos >> (lowres+1);
+    int emu=0, src_x, src_y, offset, sx, sy;
+    uint8_t *ptr;
+    if(s->quarter_sample){
+        mx/=2;
+        my/=2;
+    }
+    /* In case of 8X8, we construct a single chroma motion vector
+       with a special rounding */
+    mx= ff_h263_round_chroma(mx);
+    my= ff_h263_round_chroma(my);
+    // low lowres+1 bits of the vector: sub-sample phase; remaining bits: whole-sample offset
+    sx= mx & s_mask;
+    sy= my & s_mask;
+    src_x = s->mb_x*block_s + (mx >> (lowres+1));
+    src_y = s->mb_y*block_s + (my >> (lowres+1));
+    offset = src_y * s->uvlinesize + src_x;
+    ptr = ref_picture[1] + offset;
+    if(s->flags&CODEC_FLAG_EMU_EDGE){
+        if(   (unsigned)src_x > h_edge_pos - (!!sx) - block_s
+           || (unsigned)src_y > v_edge_pos - (!!sy) - block_s){
+            ff_emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, src_x, src_y, h_edge_pos, v_edge_pos);
+            ptr= s->edge_emu_buffer;
+            emu=1;
+        }
+    }     
+    // (x << 2) >> lowres equals x << (2 - lowres) for lowres <= 2 without ever shifting by a negative count
+    sx= (sx << 2) >> lowres;
+    sy= (sy << 2) >> lowres;
+    pix_op[lowres](dest_cb, ptr, s->uvlinesize, block_s, sx, sy);
+    // cr uses the same offset and phase; redo the edge emulation if cb needed it
+    ptr = ref_picture[2] + offset;
+    if(emu){
+        ff_emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, src_x, src_y, h_edge_pos, v_edge_pos);
+        ptr= s->edge_emu_buffer;
+    }
+    pix_op[lowres](dest_cr, ptr, s->uvlinesize, block_s, sx, sy);
+}
+
 /**
  * motion compesation of a single macroblock
  * @param s context
@@ -2879,7 +3305,6 @@ static inline void MPV_motion(MpegEncContext *s,
    
     switch(s->mv_type) {
     case MV_TYPE_16X16:
-#ifdef CONFIG_RISKY
         if(s->mcsel){
             if(s->real_sprite_warping_points==1){
                 gmc1_motion(s, dest_y, dest_cb, dest_cr,
@@ -2898,7 +3323,6 @@ static inline void MPV_motion(MpegEncContext *s,
                         ref_picture, pix_op,
                         s->mv[dir][0][0], s->mv[dir][0][1], 16);
         }else
-#endif
         {
             mpeg_motion(s, dest_y, dest_cb, dest_cr, 
                         0, 0, 0,
@@ -3006,8 +3430,8 @@ static inline void MPV_motion(MpegEncContext *s,
                         s->mv[dir][i][0], s->mv[dir][i][1] + 16*i, 8);
                 
             dest_y += 16*s->linesize;
-            dest_cb+=  8*s->uvlinesize;
-            dest_cr+=  8*s->uvlinesize;
+            dest_cb+= (16>>s->chroma_y_shift)*s->uvlinesize;
+            dest_cr+= (16>>s->chroma_y_shift)*s->uvlinesize;
         }        
         break;
     case MV_TYPE_DMV:
@@ -3043,6 +3467,131 @@ static inline void MPV_motion(MpegEncContext *s,
     }
 }
 
+/**
+ * lowres motion compensation of a single macroblock
+ * @param s context
+ * @param dest_y luma destination pointer
+ * @param dest_cb chroma cb/u destination pointer
+ * @param dest_cr chroma cr/v destination pointer
+ * @param dir direction (0->forward, 1->backward)
+ * @param ref_picture array[3] of pointers to the 3 planes of the reference picture
+ * @param pix_op table of h264_chroma_mc_func prediction functions (put or avg)
+ * the motion vectors are taken from s->mv and the MV type from s->mv_type
+ */
+static inline void MPV_motion_lowres(MpegEncContext *s, 
+                              uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                              int dir, uint8_t **ref_picture, 
+                              h264_chroma_mc_func *pix_op)
+{
+    int mx, my;
+    int mb_x, mb_y, i;
+    const int lowres= s->avctx->lowres;
+    const int block_s= 8>>lowres;    
+
+    mb_x = s->mb_x;
+    mb_y = s->mb_y;
+
+    switch(s->mv_type) {
+    case MV_TYPE_16X16:
+        mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr, 
+                    0, 0, 0,
+                    ref_picture, pix_op,
+                    s->mv[dir][0][0], s->mv[dir][0][1], 2*block_s);
+        break;
+    case MV_TYPE_8X8:
+        mx = 0; /* mx/my collect the sum of the four luma vectors for the chroma prediction below */
+        my = 0;
+            for(i=0;i<4;i++) {
+                hpel_motion_lowres(s, dest_y + ((i & 1) + (i >> 1) * s->linesize)*block_s,
+                            ref_picture[0], 0, 0,
+                            (2*mb_x + (i & 1))*block_s, (2*mb_y + (i >>1))*block_s,
+                            s->width, s->height, s->linesize,
+                            s->h_edge_pos >> lowres, s->v_edge_pos >> lowres,
+                            block_s, block_s, pix_op,
+                            s->mv[dir][i][0], s->mv[dir][i][1]);
+
+                mx += s->mv[dir][i][0];
+                my += s->mv[dir][i][1];
+            }
+
+        if(!(s->flags&CODEC_FLAG_GRAY))
+            chroma_4mv_motion_lowres(s, dest_cb, dest_cr, ref_picture, pix_op, mx, my);
+        break;
+    case MV_TYPE_FIELD:
+        if (s->picture_structure == PICT_FRAME) {
+            /* top field */       
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                        1, 0, s->field_select[dir][0],
+                        ref_picture, pix_op,
+                        s->mv[dir][0][0], s->mv[dir][0][1], block_s);
+            /* bottom field */
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                        1, 1, s->field_select[dir][1],
+                        ref_picture, pix_op,
+                        s->mv[dir][1][0], s->mv[dir][1][1], block_s);
+        } else {
+            /* in this case the prediction is read from the current picture instead of ref_picture */
+            if(s->picture_structure != s->field_select[dir][0] + 1 && s->pict_type != B_TYPE && !s->first_field){
+                ref_picture= s->current_picture_ptr->data;
+            } 
+
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                        0, 0, s->field_select[dir][0],
+                        ref_picture, pix_op,
+                        s->mv[dir][0][0], s->mv[dir][0][1], 2*block_s);
+        }
+        break;
+    case MV_TYPE_16X8:
+        for(i=0; i<2; i++){
+            uint8_t ** ref2picture;
+
+            if(s->picture_structure == s->field_select[dir][i] + 1 || s->pict_type == B_TYPE || s->first_field){
+                ref2picture= ref_picture;
+            }else{
+                ref2picture= s->current_picture_ptr->data;
+            } 
+
+            mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr, 
+                        0, 0, s->field_select[dir][i],
+                        ref2picture, pix_op,
+                        s->mv[dir][i][0], s->mv[dir][i][1] + 2*block_s*i, block_s);
+                
+            /* advance the destination pointers to the lower half for the second pass */
+            dest_y += 2*block_s*s->linesize;
+            dest_cb+= (2*block_s>>s->chroma_y_shift)*s->uvlinesize;
+            dest_cr+= (2*block_s>>s->chroma_y_shift)*s->uvlinesize;
+        }        
+        break;
+    case MV_TYPE_DMV:
+        if(s->picture_structure == PICT_FRAME){
+            for(i=0; i<2; i++){
+                int j;
+                for(j=0; j<2; j++){
+                    mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr,
+                                1, j, j^i,
+                                ref_picture, pix_op,
+                                s->mv[dir][2*i + j][0], s->mv[dir][2*i + j][1], block_s);
+                }
+                pix_op = s->dsp.avg_h264_chroma_pixels_tab; /* the second pass (i==1) averages onto the first */
+            }
+        }else{
+            for(i=0; i<2; i++){
+                mpeg_motion_lowres(s, dest_y, dest_cb, dest_cr, 
+                            0, 0, s->picture_structure != i+1,
+                            ref_picture, pix_op,
+                            s->mv[dir][2*i][0],s->mv[dir][2*i][1],2*block_s);
+
+                // after put we make avg of the same block
+                pix_op = s->dsp.avg_h264_chroma_pixels_tab;
+
+                //opposite parity is always in the same frame if this is second field
+                if(!s->first_field){
+                    ref_picture = s->current_picture_ptr->data;    
+                }
+            }
+        }
+    break;
+    default: assert(0);
+    }
+}
 
 /* put block[] to dest[] */
 static inline void put_dct(MpegEncContext *s, 
@@ -3114,7 +3663,7 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
    s->mv       : motion vector
    s->interlaced_dct : true if interlaced dct used (mpeg2)
  */
-void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
+static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM block[12][64], int lowres_flag)
 {
     int mb_x, mb_y;
     const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
@@ -3160,7 +3709,8 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         qpel_mc_func (*op_qpix)[16];
         const int linesize= s->current_picture.linesize[0]; //not s->linesize as this woulnd be wrong for field pics
         const int uvlinesize= s->current_picture.linesize[1];
-        const int readable= s->pict_type != B_TYPE || s->encoding || s->avctx->draw_horiz_band;
+        const int readable= s->pict_type != B_TYPE || s->encoding || s->avctx->draw_horiz_band || lowres_flag;
+        const int block_size= lowres_flag ? 8>>s->avctx->lowres : 8;
 
         /* avoid copy if macroblock skipped in last frame too */
         /* skip only during decoding as we might trash the buffers during encoding a bit */
@@ -3188,14 +3738,10 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
                 *mbskip_ptr = 0; /* not skipped */
             }
         }
-
-        if (s->interlaced_dct) {
-            dct_linesize = linesize * 2;
-            dct_offset = linesize;
-        } else {
-            dct_linesize = linesize;
-            dct_offset = linesize * 8;
-        }
+        
+        dct_linesize = linesize << s->interlaced_dct;
+        dct_offset =(s->interlaced_dct)? linesize : linesize*block_size;
+        
         if(readable){
             dest_y=  s->dest[0];
             dest_cb= s->dest[1];
@@ -3203,27 +3749,39 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         }else{
             dest_y = s->b_scratchpad;
             dest_cb= s->b_scratchpad+16*linesize;
-            dest_cr= s->b_scratchpad+16*linesize+8;
+            dest_cr= s->b_scratchpad+32*linesize;
         }
+
         if (!s->mb_intra) {
             /* motion handling */
             /* decoding or more than one mb_type (MC was allready done otherwise) */
             if(!s->encoding){
-                if ((!s->no_rounding) || s->pict_type==B_TYPE){                
-		    op_pix = s->dsp.put_pixels_tab;
-                    op_qpix= s->dsp.put_qpel_pixels_tab;
-                }else{
-                    op_pix = s->dsp.put_no_rnd_pixels_tab;
-                    op_qpix= s->dsp.put_no_rnd_qpel_pixels_tab;
-                }
+                if(lowres_flag){
+                    h264_chroma_mc_func *op_pix = s->dsp.put_h264_chroma_pixels_tab;
 
-                if (s->mv_dir & MV_DIR_FORWARD) {
-                    MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.data, op_pix, op_qpix);
-		    op_pix = s->dsp.avg_pixels_tab;
-                    op_qpix= s->dsp.avg_qpel_pixels_tab;
-                }
-                if (s->mv_dir & MV_DIR_BACKWARD) {
-                    MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix, op_qpix);
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.data, op_pix);
+                        op_pix = s->dsp.avg_h264_chroma_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix);
+                    }
+                }else{
+                    if ((!s->no_rounding) || s->pict_type==B_TYPE){                
+                        op_pix = s->dsp.put_pixels_tab;
+                        op_qpix= s->dsp.put_qpel_pixels_tab;
+                    }else{
+                        op_pix = s->dsp.put_no_rnd_pixels_tab;
+                        op_qpix= s->dsp.put_no_rnd_qpel_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_FORWARD) {
+                        MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.data, op_pix, op_qpix);
+                        op_pix = s->dsp.avg_pixels_tab;
+                        op_qpix= s->dsp.avg_qpel_pixels_tab;
+                    }
+                    if (s->mv_dir & MV_DIR_BACKWARD) {
+                        MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix, op_qpix);
+                    }
                 }
             }
 
@@ -3233,63 +3791,100 @@ void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             /* add dct residue */
             if(s->encoding || !(   s->h263_msmpeg4 || s->codec_id==CODEC_ID_MPEG1VIDEO || s->codec_id==CODEC_ID_MPEG2VIDEO
                                 || (s->codec_id==CODEC_ID_MPEG4 && !s->mpeg_quant))){
-                add_dequant_dct(s, block[0], 0, dest_y, dct_linesize, s->qscale);
-                add_dequant_dct(s, block[1], 1, dest_y + 8, dct_linesize, s->qscale);
-                add_dequant_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize, s->qscale);
-                add_dequant_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize, s->qscale);
+                add_dequant_dct(s, block[0], 0, dest_y                          , dct_linesize, s->qscale);
+                add_dequant_dct(s, block[1], 1, dest_y              + block_size, dct_linesize, s->qscale);
+                add_dequant_dct(s, block[2], 2, dest_y + dct_offset             , dct_linesize, s->qscale);
+                add_dequant_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize, s->qscale);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
                     add_dequant_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
                     add_dequant_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
                 }
             } else if(s->codec_id != CODEC_ID_WMV2){
-                add_dct(s, block[0], 0, dest_y, dct_linesize);
-                add_dct(s, block[1], 1, dest_y + 8, dct_linesize);
-                add_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize);
-                add_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize);
+                add_dct(s, block[0], 0, dest_y                          , dct_linesize);
+                add_dct(s, block[1], 1, dest_y              + block_size, dct_linesize);
+                add_dct(s, block[2], 2, dest_y + dct_offset             , dct_linesize);
+                add_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
-                    add_dct(s, block[4], 4, dest_cb, uvlinesize);
-                    add_dct(s, block[5], 5, dest_cr, uvlinesize);
-                }
-            } 
-#ifdef CONFIG_RISKY
+                    if(s->chroma_y_shift){//Chroma420
+                        add_dct(s, block[4], 4, dest_cb, uvlinesize);
+                        add_dct(s, block[5], 5, dest_cr, uvlinesize);
+                    }else{
+                        //chroma422
+                        dct_linesize = uvlinesize << s->interlaced_dct;
+                        dct_offset =(s->interlaced_dct)? uvlinesize : uvlinesize*8;
+
+                        add_dct(s, block[4], 4, dest_cb, dct_linesize);
+                        add_dct(s, block[5], 5, dest_cr, dct_linesize);
+                        add_dct(s, block[6], 6, dest_cb+dct_offset, dct_linesize);
+                        add_dct(s, block[7], 7, dest_cr+dct_offset, dct_linesize);
+                        if(!s->chroma_x_shift){//Chroma444
+                            add_dct(s, block[8], 8, dest_cb+8, dct_linesize);
+                            add_dct(s, block[9], 9, dest_cr+8, dct_linesize);
+                            add_dct(s, block[10], 10, dest_cb+8+dct_offset, dct_linesize);
+                            add_dct(s, block[11], 11, dest_cr+8+dct_offset, dct_linesize);
+                        }
+                    }
+                }//fi gray
+            }
             else{
                 ff_wmv2_add_mb(s, block, dest_y, dest_cb, dest_cr);
             }
-#endif
         } else {
             /* dct only in intra block */
             if(s->encoding || !(s->codec_id==CODEC_ID_MPEG1VIDEO || s->codec_id==CODEC_ID_MPEG2VIDEO)){
-                put_dct(s, block[0], 0, dest_y, dct_linesize, s->qscale);
-                put_dct(s, block[1], 1, dest_y + 8, dct_linesize, s->qscale);
-                put_dct(s, block[2], 2, dest_y + dct_offset, dct_linesize, s->qscale);
-                put_dct(s, block[3], 3, dest_y + dct_offset + 8, dct_linesize, s->qscale);
+                put_dct(s, block[0], 0, dest_y                          , dct_linesize, s->qscale);
+                put_dct(s, block[1], 1, dest_y              + block_size, dct_linesize, s->qscale);
+                put_dct(s, block[2], 2, dest_y + dct_offset             , dct_linesize, s->qscale);
+                put_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize, s->qscale);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
                     put_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
                     put_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
                 }
             }else{
-                s->dsp.idct_put(dest_y                 , dct_linesize, block[0]);
-                s->dsp.idct_put(dest_y              + 8, dct_linesize, block[1]);
-                s->dsp.idct_put(dest_y + dct_offset    , dct_linesize, block[2]);
-                s->dsp.idct_put(dest_y + dct_offset + 8, dct_linesize, block[3]);
+                s->dsp.idct_put(dest_y                          , dct_linesize, block[0]);
+                s->dsp.idct_put(dest_y              + block_size, dct_linesize, block[1]);
+                s->dsp.idct_put(dest_y + dct_offset             , dct_linesize, block[2]);
+                s->dsp.idct_put(dest_y + dct_offset + block_size, dct_linesize, block[3]);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
-                    s->dsp.idct_put(dest_cb, uvlinesize, block[4]);
-                    s->dsp.idct_put(dest_cr, uvlinesize, block[5]);
-                }
+                    if(s->chroma_y_shift){
+                        s->dsp.idct_put(dest_cb, uvlinesize, block[4]);
+                        s->dsp.idct_put(dest_cr, uvlinesize, block[5]);
+                    }else{
+
+                        dct_linesize = uvlinesize << s->interlaced_dct;
+                        dct_offset =(s->interlaced_dct)? uvlinesize : uvlinesize*8;
+
+                        s->dsp.idct_put(dest_cb,              dct_linesize, block[4]);
+                        s->dsp.idct_put(dest_cr,              dct_linesize, block[5]);
+                        s->dsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]);
+                        s->dsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]);
+                        if(!s->chroma_x_shift){//Chroma444
+                            s->dsp.idct_put(dest_cb + 8,              dct_linesize, block[8]);
+                            s->dsp.idct_put(dest_cr + 8,              dct_linesize, block[9]);
+                            s->dsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]);
+                            s->dsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]);
+                        }
+                    }
+                }//gray
             }
         }
         if(!readable){
             s->dsp.put_pixels_tab[0][0](s->dest[0], dest_y ,   linesize,16);
-            s->dsp.put_pixels_tab[1][0](s->dest[1], dest_cb, uvlinesize, 8);
-            s->dsp.put_pixels_tab[1][0](s->dest[2], dest_cr, uvlinesize, 8);
+            s->dsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[1], dest_cb, uvlinesize,16 >> s->chroma_y_shift);
+            s->dsp.put_pixels_tab[s->chroma_x_shift][0](s->dest[2], dest_cr, uvlinesize,16 >> s->chroma_y_shift);
         }
     }
 }
 
+void MPV_decode_mb(MpegEncContext *s, DCTELEM block[12][64]){
+    if(s->avctx->lowres) MPV_decode_mb_internal(s, block, 1);
+    else                  MPV_decode_mb_internal(s, block, 0);
+}
+
 #ifdef CONFIG_ENCODERS
 
 static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int threshold)
@@ -3389,7 +3984,7 @@ void ff_draw_horiz_band(MpegEncContext *s, int y, int h){
             if(s->first_field  && !(s->avctx->slice_flags&SLICE_FLAG_ALLOW_FIELD)) return;
         }
 
-        h= FFMIN(h, s->height - y);
+        h= FFMIN(h, s->avctx->height - y);
 
         if(s->pict_type==B_TYPE || s->low_delay || (s->avctx->slice_flags&SLICE_FLAG_CODED_ORDER)) 
             src= (AVFrame*)s->current_picture_ptr;
@@ -3406,7 +4001,7 @@ void ff_draw_horiz_band(MpegEncContext *s, int y, int h){
         }else{
             offset[0]= y * s->linesize;;
             offset[1]= 
-            offset[2]= (y>>1) * s->uvlinesize;;
+            offset[2]= (y >> s->chroma_y_shift) * s->uvlinesize;
             offset[3]= 0;
         }
 
@@ -3420,6 +4015,7 @@ void ff_draw_horiz_band(MpegEncContext *s, int y, int h){
 void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     const int linesize= s->current_picture.linesize[0]; //not s->linesize as this woulnd be wrong for field pics
     const int uvlinesize= s->current_picture.linesize[1];
+    const int mb_size= 4 - s->avctx->lowres;
         
     s->block_index[0]= s->b8_stride*(s->mb_y*2    ) - 2 + s->mb_x*2;
     s->block_index[1]= s->b8_stride*(s->mb_y*2    ) - 1 + s->mb_x*2;
@@ -3427,16 +4023,18 @@ void ff_init_block_index(MpegEncContext *s){ //FIXME maybe rename
     s->block_index[3]= s->b8_stride*(s->mb_y*2 + 1) - 1 + s->mb_x*2;
     s->block_index[4]= s->mb_stride*(s->mb_y + 1)                + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
     s->block_index[5]= s->mb_stride*(s->mb_y + s->mb_height + 2) + s->b8_stride*s->mb_height*2 + s->mb_x - 1;
-    
-    if(s->pict_type==B_TYPE && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME){
-        s->dest[0] = s->current_picture.data[0] + s->mb_x * 16 - 16;
-        s->dest[1] = s->current_picture.data[1] + s->mb_x * 8 - 8;
-        s->dest[2] = s->current_picture.data[2] + s->mb_x * 8 - 8;
-    }else{
-        s->dest[0] = s->current_picture.data[0] + (s->mb_y * 16* linesize  ) + s->mb_x * 16 - 16;
-        s->dest[1] = s->current_picture.data[1] + (s->mb_y * 8 * uvlinesize) + s->mb_x * 8 - 8;
-        s->dest[2] = s->current_picture.data[2] + (s->mb_y * 8 * uvlinesize) + s->mb_x * 8 - 8;
-    }    
+    //block_index is not used by mpeg2, so it is not affected by chroma_format
+
+    s->dest[0] = s->current_picture.data[0] + ((s->mb_x - 1) << mb_size);
+    s->dest[1] = s->current_picture.data[1] + ((s->mb_x - 1) << (mb_size - s->chroma_x_shift));
+    s->dest[2] = s->current_picture.data[2] + ((s->mb_x - 1) << (mb_size - s->chroma_x_shift));
+
+    if(!(s->pict_type==B_TYPE && s->avctx->draw_horiz_band && s->picture_structure==PICT_FRAME))
+    {
+        s->dest[0] += s->mb_y *   linesize << mb_size;
+        s->dest[1] += s->mb_y * uvlinesize << (mb_size - s->chroma_y_shift);
+        s->dest[2] += s->mb_y * uvlinesize << (mb_size - s->chroma_y_shift);
+    }
 }
 
 #ifdef CONFIG_ENCODERS
@@ -3506,7 +4104,8 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             }
         }
         ff_set_qscale(s, last_qp + s->dquant);
-    }
+    }else if(s->flags&CODEC_FLAG_QP_RD)
+        ff_set_qscale(s, s->qscale + s->dquant);
 
     wrap_y = s->linesize;
     wrap_c = s->uvlinesize;
@@ -3515,12 +4114,13 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     ptr_cr = s->new_picture.data[2] + (mb_y * 8 * wrap_c) + mb_x * 8;
 
     if(mb_x*16+16 > s->width || mb_y*16+16 > s->height){
-        ff_emulated_edge_mc(s->edge_emu_buffer            , ptr_y , wrap_y,16,16,mb_x*16,mb_y*16, s->width   , s->height);
-        ptr_y= s->edge_emu_buffer;
-        ff_emulated_edge_mc(s->edge_emu_buffer+18*wrap_y  , ptr_cb, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
-        ptr_cb= s->edge_emu_buffer+18*wrap_y;
-        ff_emulated_edge_mc(s->edge_emu_buffer+18*wrap_y+9, ptr_cr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
-        ptr_cr= s->edge_emu_buffer+18*wrap_y+9;
+        uint8_t *ebuf= s->edge_emu_buffer + 32;
+        ff_emulated_edge_mc(ebuf            , ptr_y , wrap_y,16,16,mb_x*16,mb_y*16, s->width   , s->height);
+        ptr_y= ebuf;
+        ff_emulated_edge_mc(ebuf+18*wrap_y  , ptr_cb, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+        ptr_cb= ebuf+18*wrap_y;
+        ff_emulated_edge_mc(ebuf+18*wrap_y+8, ptr_cr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+        ptr_cr= ebuf+18*wrap_y+8;
     }
 
     if (s->mb_intra) {
@@ -3701,7 +4301,6 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         mpeg1_encode_mb(s, s->block, motion_x, motion_y); break;
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_RISKY
     case CODEC_ID_MPEG4:
         mpeg4_encode_mb(s, s->block, motion_x, motion_y); break;
     case CODEC_ID_MSMPEG4V2:
@@ -3710,12 +4309,14 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         msmpeg4_encode_mb(s, s->block, motion_x, motion_y); break;
     case CODEC_ID_WMV2:
          ff_wmv2_encode_mb(s, s->block, motion_x, motion_y); break;
+    case CODEC_ID_H261:
+        ff_h261_encode_mb(s, s->block, motion_x, motion_y); break;
     case CODEC_ID_H263:
     case CODEC_ID_H263P:
     case CODEC_ID_FLV1:
     case CODEC_ID_RV10:
+    case CODEC_ID_RV20:
         h263_encode_mb(s, s->block, motion_x, motion_y); break;
-#endif
     case CODEC_ID_MJPEG:
         mjpeg_encode_mb(s, s->block); break;
 #endif /* #if 0 */
@@ -3740,6 +4341,8 @@ void ff_mpeg_flush(AVCodecContext *avctx){
     }
     s->current_picture_ptr = s->last_picture_ptr = s->next_picture_ptr = NULL;
     
+    s->mb_x= s->mb_y= 0;
+    
     s->parse_context.state= -1;
     s->parse_context.frame_start_found= 0;
     s->parse_context.overread= 0;
@@ -3917,9 +4520,15 @@ static int sse_mb(MpegEncContext *s){
     if(s->mb_y*16 + 16 > s->height) h= s->height- s->mb_y*16;
 
     if(w==16 && h==16)
+      if(s->avctx->mb_cmp == FF_CMP_NSSE){
+        return  s->dsp.nsse[0](s, s->new_picture.data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16)
+               +s->dsp.nsse[1](s, s->new_picture.data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8)
+               +s->dsp.nsse[1](s, s->new_picture.data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8);
+      }else{
         return  s->dsp.sse[0](NULL, s->new_picture.data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16)
                +s->dsp.sse[1](NULL, s->new_picture.data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8)
                +s->dsp.sse[1](NULL, s->new_picture.data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8);
+      }
     else
         return  sse(s, s->new_picture.data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], w, h, s->linesize)
                +sse(s, s->new_picture.data[1] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,s->dest[1], w>>1, h>>1, s->uvlinesize)
@@ -3971,7 +4580,7 @@ static int estimate_motion_thread(AVCodecContext *c, void *arg){
     }
     return 0;
 }
-#endif
+#endif /* #if 0 */
 
 static int mb_var_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= arg;
@@ -4011,6 +4620,9 @@ static void write_slice_end(MpegEncContext *s){
 
     align_put_bits(&s->pb);
     flush_put_bits(&s->pb);
+    
+    if((s->flags&CODEC_FLAG_PASS1) && !s->partitioned_frame)
+        s->misc_bits+= get_bits_diff(s);
 }
 
 static int encode_thread(AVCodecContext *c, void *arg){
@@ -4018,16 +4630,16 @@ static int encode_thread(AVCodecContext *c, void *arg){
     int mb_x, mb_y, pdif = 0;
     int i, j;
     MpegEncContext best_s, backup_s;
-    uint8_t bit_buf[2][3000];
-    uint8_t bit_buf2[2][3000];
-    uint8_t bit_buf_tex[2][3000];
+    uint8_t bit_buf[2][MAX_MB_BYTES];
+    uint8_t bit_buf2[2][MAX_MB_BYTES];
+    uint8_t bit_buf_tex[2][MAX_MB_BYTES];
     PutBitContext pb[2], pb2[2], tex_pb[2];
 //printf("%d->%d\n", s->resync_mb_y, s->end_mb_y);
 
     for(i=0; i<2; i++){
-        init_put_bits(&pb    [i], bit_buf    [i], 3000);
-        init_put_bits(&pb2   [i], bit_buf2   [i], 3000);
-        init_put_bits(&tex_pb[i], bit_buf_tex[i], 3000);
+        init_put_bits(&pb    [i], bit_buf    [i], MAX_MB_BYTES);
+        init_put_bits(&pb2   [i], bit_buf2   [i], MAX_MB_BYTES);
+        init_put_bits(&tex_pb[i], bit_buf_tex[i], MAX_MB_BYTES);
     }
 
     s->last_bits= put_bits_count(&s->pb);
@@ -4052,7 +4664,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
      
     s->last_mv_dir = 0;
 
-#ifdef CONFIG_RISKY
     switch(s->codec_id){
     case CODEC_ID_H263:
     case CODEC_ID_H263P:
@@ -4067,7 +4678,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
         break;
 #endif /* #if 0 */
     }
-#endif
 
     s->resync_mb_x=0;
     s->resync_mb_y=0; 
@@ -4082,17 +4692,35 @@ static int encode_thread(AVCodecContext *c, void *arg){
         ff_init_block_index(s);
         
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
-            const int xy= mb_y*s->mb_stride + mb_x;
+            int xy= mb_y*s->mb_stride + mb_x; // removed const, H261 needs to adjust this
             int mb_type= s->mb_type[xy];
 //            int d;
             int dmin= INT_MAX;
             int dir;
 
+            if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < MAX_MB_BYTES){
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
+            if(s->data_partitioning){
+                if(   s->pb2   .buf_end - s->pb2   .buf - (put_bits_count(&s->    pb2)>>3) < MAX_MB_BYTES
+                   || s->tex_pb.buf_end - s->tex_pb.buf - (put_bits_count(&s->tex_pb )>>3) < MAX_MB_BYTES){
+                    av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                    return -1;
+                }
+            }
+
             s->mb_x = mb_x;
+            s->mb_y = mb_y;  // moved into loop, can get changed by H.261
             ff_update_block_index(s);
 
+            if(s->codec_id == CODEC_ID_H261){
+                ff_h261_reorder_mb_index(s);
+                xy= s->mb_y*s->mb_stride + s->mb_x;
+                mb_type= s->mb_type[xy];
+            }
+
             /* write gob / video packet header  */
-#ifdef CONFIG_RISKY
             if(s->rtp_mode){
                 int current_packet_size, is_gob_start;
                 
@@ -4131,7 +4759,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     current_packet_size= pbBufPtr(&s->pb) - s->ptr_lastgob;
                     
                     if(s->avctx->error_rate && s->resync_mb_x + s->resync_mb_y > 0){
-                        int r= put_bits_count(&s->pb)/8 + s->picture_number + s->codec_id + s->mb_x + s->mb_y;
+                        int r= put_bits_count(&s->pb)/8 + s->picture_number + 16 + s->mb_x + s->mb_y;
                         int d= 100 / s->avctx->error_rate;
                         if(r % d == 0){
                             current_packet_size=0;
@@ -4141,9 +4769,11 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             assert(pbBufPtr(&s->pb) == s->ptr_lastgob);
                         }
                     }
-        
-                    if (s->avctx->rtp_callback)
-                        s->avctx->rtp_callback(s->avctx, s->ptr_lastgob, current_packet_size, 0);
+
+                    if (s->avctx->rtp_callback){
+                        int number_mb = (mb_y - s->resync_mb_y)*s->mb_width + mb_x - s->resync_mb_x;
+                        s->avctx->rtp_callback(s->avctx, s->ptr_lastgob, current_packet_size, number_mb);
+                    }
                     
                     switch(s->codec_id){
 /* xine: do not need this for decode or MPEG-1 encoding modes */
@@ -4179,7 +4809,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     s->resync_mb_y=mb_y;
                 }
             }
-#endif
 
             if(  (s->resync_mb_x   == s->mb_x)
                && s->resync_mb_y+1 == s->mb_y){
@@ -4189,7 +4818,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
             s->mb_skiped=0;
             s->dquant=0; //only for QP_RD
 
-            if(mb_type & (mb_type-1) || (s->flags & CODEC_FLAG_QP_RD)){ // more than 1 MB type possible
+            if(mb_type & (mb_type-1) || (s->flags & CODEC_FLAG_QP_RD)){ // more than 1 MB type possible or CODEC_FLAG_QP_RD
                 int next_block=0;
                 int pb_bits_count, pb2_bits_count, tex_pb_bits_count;
 
@@ -4280,9 +4909,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     s->mb_intra= 0;
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_RISKY
                     ff_mpeg4_set_direct_mv(s, mx, my);
-#endif
 #endif /* #if 0 */
                     encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_DIRECT, pb, pb2, tex_pb, 
                                  &dmin, &next_block, mx, my);
@@ -4366,7 +4993,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             if(qp < s->avctx->qmin || qp > s->avctx->qmax)
                                 break;
                             backup_s.dquant= dquant;
-                            if(s->mb_intra){
+                            if(s->mb_intra && s->dc_val[0]){
                                 for(i=0; i<6; i++){
                                     dc[i]= s->dc_val[0][ s->block_index[i] ];
                                     memcpy(ac[i], s->ac_val[0][s->block_index[i]], sizeof(DCTELEM)*16);
@@ -4376,7 +5003,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_INTER /* wrong but unused */, pb, pb2, tex_pb, 
                                          &dmin, &next_block, s->mv[mvdir][0][0], s->mv[mvdir][0][1]);
                             if(best_s.qscale != qp){
-                                if(s->mb_intra){
+                                if(s->mb_intra && s->dc_val[0]){
                                     for(i=0; i<6; i++){
                                         s->dc_val[0][ s->block_index[i] ]= dc[i];
                                         memcpy(s->ac_val[0][s->block_index[i]], ac[i], sizeof(DCTELEM)*16);
@@ -4414,10 +5041,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 }
                 s->last_bits= put_bits_count(&s->pb);
                
-#ifdef CONFIG_RISKY
                 if (s->out_format == FMT_H263 && s->pict_type!=B_TYPE)
                     ff_h263_update_motion_val(s);
-#endif
         
                 if(next_block==0){ //FIXME 16 vs linesize16
                     s->dsp.put_pixels_tab[0][0](s->dest[0], s->rd_scratchpad                     , s->linesize  ,16);
@@ -4473,9 +5098,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     motion_y=s->b_direct_mv_table[xy][1];
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_RISKY
-                    ff_mpeg4_set_direct_mv(s, motion_x, motion_y);
-#endif
+                    ff_mpeg4_set_direct_mv(s, motion_x, motion_y);
 #endif /* #if 0 */
                     break;
                 case CANDIDATE_MB_TYPE_BIDIR:
@@ -4546,10 +5169,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 // RAL: Update last macrobloc type
                 s->last_mv_dir = s->mv_dir;
             
-#ifdef CONFIG_RISKY
                 if (s->out_format == FMT_H263 && s->pict_type!=B_TYPE)
                     ff_h263_update_motion_val(s);
-#endif
 		
                 MPV_decode_mb(s, s->block);
             }
@@ -4577,29 +5198,30 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     s, s->new_picture    .data[2] + s->mb_x*8  + s->mb_y*s->uvlinesize*8,
                     s->dest[2], w>>1, h>>1, s->uvlinesize);
             }
-            if(s->loop_filter)
-                ff_h263_loop_filter(s);
+            if(s->loop_filter){
+                if(s->out_format == FMT_H263)
+                    ff_h263_loop_filter(s);
+            }
 //printf("MB %d %d bits\n", s->mb_x+s->mb_y*s->mb_stride, put_bits_count(&s->pb));
         }
     }
 
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_RISKY
     //not beautifull here but we must write it before flushing so it has to be here
     if (s->msmpeg4_version && s->msmpeg4_version<4 && s->pict_type == I_TYPE)
         msmpeg4_encode_ext_header(s);
-#endif
 #endif /* #if 0 */
 
     write_slice_end(s);
 
     /* Send the last GOB if RTP */    
     if (s->avctx->rtp_callback) {
+        int number_mb = (mb_y - s->resync_mb_y)*s->mb_width - s->resync_mb_x;
         pdif = pbBufPtr(&s->pb) - s->ptr_lastgob;
         /* Call the RTP callback to send the last GOB */
         emms_c();
-        s->avctx->rtp_callback(s->avctx, s->ptr_lastgob, pdif, 0);
+        s->avctx->rtp_callback(s->avctx, s->ptr_lastgob, pdif, number_mb);
     }
 
     return 0;
@@ -4618,7 +5240,6 @@ static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src)
     MERGE(dct_count[0]); //note, the other dct vars are not part of the context
     MERGE(dct_count[1]);
     MERGE(mv_bits);
-    MERGE(header_bits);
     MERGE(i_tex_bits);
     MERGE(p_tex_bits);
     MERGE(i_count);
@@ -4655,17 +5276,15 @@ static void encode_picture(MpegEncContext *s, int picture_number)
 
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_RISKY
     /* we need to initialize some time vars before we can encode b-frames */
     // RAL: Condition added for MPEG1VIDEO
     if (s->codec_id == CODEC_ID_MPEG1VIDEO || s->codec_id == CODEC_ID_MPEG2VIDEO || (s->h263_pred && !s->h263_msmpeg4))
         ff_set_mpeg4_time(s, s->picture_number);  //FIXME rename and use has_b_frames or similar
-#endif
 #endif /* #if 0 */
         
     s->me.scene_change_score=0;
     
-    s->lambda= s->current_picture_ptr->quality; //FIXME qscale / ... stuff for ME ratedistoration
+//    s->lambda= s->current_picture_ptr->quality; //FIXME qscale / ... stuff for ME ratedistoration
     
     if(s->pict_type==I_TYPE){
         if(s->msmpeg4_version >= 3) s->no_rounding=1;
@@ -4686,6 +5305,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
 
     /* Estimate motion for every MB */
     if(s->pict_type != I_TYPE){
+        s->lambda = (s->lambda * s->avctx->me_penalty_compensation + 128)>>8;
+        s->lambda2= (s->lambda2* s->avctx->me_penalty_compensation + 128)>>8;
         if(s->pict_type != B_TYPE && s->avctx->me_threshold==0){
             if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
                 s->avctx->execute(s->avctx, pre_estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
@@ -4782,7 +5403,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     if(s->adaptive_quant){
 /* xine: do not need this for decode or MPEG-1 encoding modes */
 #if 0
-#ifdef CONFIG_RISKY
         switch(s->codec_id){
         case CODEC_ID_MPEG4:
             ff_clean_mpeg4_qscales(s);
@@ -4793,7 +5413,6 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             ff_clean_h263_qscales(s);
             break;
         }
-#endif
 #endif /* #if 0 */
 
         s->lambda= s->lambda_table[0];
@@ -4815,12 +5434,14 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->intra_matrix[j] = CLAMP_TO_8BIT((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
         }
         convert_matrix(&s->dsp, s->q_intra_matrix, s->q_intra_matrix16, 
-                       s->intra_matrix, s->intra_quant_bias, 8, 8);
+                       s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
         s->qscale= 8;
     }
     
     //FIXME var duplication
+    s->current_picture_ptr->key_frame=
     s->current_picture.key_frame= s->pict_type == I_TYPE; //FIXME pic_ptr
+    s->current_picture_ptr->pict_type=
     s->current_picture.pict_type= s->pict_type;
 
     if(s->current_picture.key_frame)
@@ -4833,7 +5454,9 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     case FMT_MJPEG:
         mjpeg_picture_header(s);
         break;
-#ifdef CONFIG_RISKY
+    case FMT_H261:
+        ff_h261_encode_picture_header(s, picture_number);
+        break;
     case FMT_H263:
         if (s->codec_id == CODEC_ID_WMV2) 
             ff_wmv2_encode_picture_header(s, picture_number);
@@ -4843,12 +5466,13 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             mpeg4_encode_picture_header(s, picture_number);
         else if (s->codec_id == CODEC_ID_RV10) 
             rv10_encode_picture_header(s, picture_number);
+        else if (s->codec_id == CODEC_ID_RV20) 
+            rv20_encode_picture_header(s, picture_number);
         else if (s->codec_id == CODEC_ID_FLV1)
             ff_flv_encode_picture_header(s, picture_number);
         else
             h263_encode_picture_header(s, picture_number);
         break;
-#endif
 #endif /* #if 0 */
     case FMT_MPEG1:
         mpeg1_encode_picture_header(s, picture_number);
@@ -5216,7 +5840,7 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise?
                         DCTELEM *block, int16_t *weight, DCTELEM *orig,
                         int n, int qscale){
     int16_t rem[64];
-    DCTELEM d1[64];
+    DCTELEM d1[64] __align16;
     const int *qmat;
     const uint8_t *scantable= s->intra_scantable.scantable;
     const uint8_t *perm_scantable= s->intra_scantable.permutated;
@@ -5855,82 +6479,7 @@ static void dct_unquantize_h263_inter_c(MpegEncContext *s,
     }
 }
 
-static const AVOption mpeg4_options[] =
-{
-    AVOPTION_CODEC_INT("bitrate", "desired video bitrate", bit_rate, 4, 240000000, 800000),
-    AVOPTION_CODEC_INT("ratetol", "number of bits the bitstream is allowed to diverge from the reference"
-		       "the reference can be CBR (for CBR pass1) or VBR (for pass2)",
-		       bit_rate_tolerance, 4, 240000000, 8000),
-    AVOPTION_CODEC_INT("qmin", "minimum quantizer", qmin, 1, 31, 2),
-    AVOPTION_CODEC_INT("qmax", "maximum quantizer", qmax, 1, 31, 31),
-    AVOPTION_CODEC_STRING("rc_eq", "rate control equation",
-			  rc_eq, "tex^qComp,option1,options2", 0),
-    AVOPTION_CODEC_INT("rc_minrate", "rate control minimum bitrate",
-		       rc_min_rate, 4, 24000000, 0),
-    AVOPTION_CODEC_INT("rc_maxrate", "rate control maximum bitrate",
-		       rc_max_rate, 4, 24000000, 0),
-    AVOPTION_CODEC_DOUBLE("rc_buf_aggresivity", "rate control buffer aggresivity",
-			  rc_buffer_aggressivity, 4, 24000000, 0),
-    AVOPTION_CODEC_DOUBLE("rc_initial_cplx", "initial complexity for pass1 ratecontrol",
-			  rc_initial_cplx, 0., 9999999., 0),
-    AVOPTION_CODEC_DOUBLE("i_quant_factor", "qscale factor between p and i frames",
-			  i_quant_factor, 0., 0., 0),
-    AVOPTION_CODEC_DOUBLE("i_quant_offset", "qscale offset between p and i frames",
-			  i_quant_factor, -999999., 999999., 0),
-    AVOPTION_CODEC_INT("dct_algo", "dct alghorithm",
-		       dct_algo, 0, 5, 0), // fixme - "Auto,FastInt,Int,MMX,MLib,Altivec"
-    AVOPTION_CODEC_DOUBLE("lumi_masking", "luminance masking",
-			  lumi_masking, 0., 999999., 0),
-    AVOPTION_CODEC_DOUBLE("temporal_cplx_masking", "temporary complexity masking",
-			  temporal_cplx_masking, 0., 999999., 0),
-    AVOPTION_CODEC_DOUBLE("spatial_cplx_masking", "spatial complexity masking",
-			  spatial_cplx_masking, 0., 999999., 0),
-    AVOPTION_CODEC_DOUBLE("p_masking", "p block masking",
-			  p_masking, 0., 999999., 0),
-    AVOPTION_CODEC_DOUBLE("dark_masking", "darkness masking",
-			  dark_masking, 0., 999999., 0),
-    AVOPTION_CODEC_INT("idct_algo", "idct alghorithm",
-		       idct_algo, 0, 8, 0), // fixme - "Auto,Int,Simple,SimpleMMX,LibMPEG2MMX,PS2,MLib,ARM,Altivec"
-
-    AVOPTION_CODEC_INT("mb_qmin", "minimum MB quantizer",
-		       mb_qmin, 0, 8, 0),
-    AVOPTION_CODEC_INT("mb_qmax", "maximum MB quantizer",
-		       mb_qmin, 0, 8, 0),
-
-    AVOPTION_CODEC_INT("me_cmp", "ME compare function",
-		       me_cmp, 0, 24000000, 0),
-    AVOPTION_CODEC_INT("me_sub_cmp", "subpixel ME compare function",
-		       me_sub_cmp, 0, 24000000, 0),
-
-
-    AVOPTION_CODEC_INT("dia_size", "ME diamond size & shape",
-		       dia_size, 0, 24000000, 0),
-    AVOPTION_CODEC_INT("last_predictor_count", "amount of previous MV predictors",
-		       last_predictor_count, 0, 24000000, 0),
-
-    AVOPTION_CODEC_INT("pre_me", "pre pass for ME",
-		       pre_me, 0, 24000000, 0),
-    AVOPTION_CODEC_INT("me_pre_cmp", "ME pre pass compare function",
-		       me_pre_cmp, 0, 24000000, 0),
-
-    AVOPTION_CODEC_INT("me_range", "maximum ME search range",
-		       me_range, 0, 24000000, 0),
-    AVOPTION_CODEC_INT("pre_dia_size", "ME pre pass diamod size & shape",
-		       pre_dia_size, 0, 24000000, 0),
-    AVOPTION_CODEC_INT("me_subpel_quality", "subpel ME quality",
-		       me_subpel_quality, 0, 24000000, 0),
-    AVOPTION_CODEC_INT("me_range", "maximum ME search range",
-		       me_range, 0, 24000000, 0),
-    AVOPTION_CODEC_FLAG("psnr", "calculate PSNR of compressed frames",
-		        flags, CODEC_FLAG_PSNR, 0),
-    AVOPTION_CODEC_RCOVERRIDE("rc_override", "ratecontrol override (=startframe,endframe,qscale,quality_factor)",
-			      rc_override),
-    AVOPTION_SUB(avoptions_common),
-    AVOPTION_END()
-};
-
 #ifdef CONFIG_ENCODERS
-#ifdef CONFIG_RISKY
 AVCodec h263_encoder = {
     "h263",
     CODEC_TYPE_VIDEO,
@@ -5939,6 +6488,7 @@ AVCodec h263_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec h263p_encoder = {
@@ -5949,6 +6499,7 @@ AVCodec h263p_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec flv_encoder = {
@@ -5959,6 +6510,7 @@ AVCodec flv_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec rv10_encoder = {
@@ -5969,6 +6521,18 @@ AVCodec rv10_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
+};
+
+AVCodec rv20_encoder = {
+    "rv20",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_RV20,
+    sizeof(MpegEncContext),
+    MPV_encode_init,
+    MPV_encode_picture,
+    MPV_encode_end,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec mpeg4_encoder = {
@@ -5979,7 +6543,8 @@ AVCodec mpeg4_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .options = mpeg4_options,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
+    .capabilities= CODEC_CAP_DELAY,
 };
 
 AVCodec msmpeg4v1_encoder = {
@@ -5990,7 +6555,7 @@ AVCodec msmpeg4v1_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .options = mpeg4_options,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec msmpeg4v2_encoder = {
@@ -6001,7 +6566,7 @@ AVCodec msmpeg4v2_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .options = mpeg4_options,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec msmpeg4v3_encoder = {
@@ -6012,7 +6577,7 @@ AVCodec msmpeg4v3_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .options = mpeg4_options,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
 AVCodec wmv1_encoder = {
@@ -6023,11 +6588,9 @@ AVCodec wmv1_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .options = mpeg4_options,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
 };
 
-#endif
-
 AVCodec mjpeg_encoder = {
     "mjpeg",
     CODEC_TYPE_VIDEO,
@@ -6036,6 +6599,7 @@ AVCodec mjpeg_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUVJ420P, -1},
 };
 
 #endif //CONFIG_ENCODERS
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 715fb6d92..a1c459e97 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -27,11 +27,13 @@
 #define AVCODEC_MPEGVIDEO_H
 
 #include "dsputil.h"
+#include "bitstream.h"
 
 #define FRAME_SKIPED 100 ///< return value for header parsers if frame is not coded
 
 enum OutputFormat {
     FMT_MPEG1,
+    FMT_H261,
     FMT_H263,
     FMT_MJPEG, 
     FMT_H264,
@@ -49,7 +51,7 @@ enum OutputFormat {
 
 #define MAX_THREADS 8
 
-#define MAX_PICTURE_COUNT 15
+#define MAX_PICTURE_COUNT 32
 
 #define ME_MAP_SIZE 64
 #define ME_MAP_SHIFT 3
@@ -66,6 +68,8 @@ enum OutputFormat {
 #define SI_TYPE FF_SI_TYPE  ///< Switching Intra
 #define SP_TYPE FF_SP_TYPE  ///< Switching Predicted
 
+#define MAX_MB_BYTES (30*16*16*3/8 + 120)
+
 typedef struct Predictor{
     double coeff;
     double count;
@@ -169,6 +173,8 @@ typedef struct Picture{
     int frame_num;              ///< h264 frame_num
     int pic_id;                 ///< h264 pic_num or long_term_pic_idx
     int long_ref;               ///< 1->long term reference 0->short term reference
+    int ref_poc[2][16];         ///< h264 POCs of the frames used as reference
+    int ref_count[2];           ///< number of entries in ref_poc
 
     int mb_var_sum;             ///< sum of MB variance for current frame 
     int mc_mb_var_sum;          ///< motion compensated MB variance for current frame 
@@ -262,7 +268,7 @@ typedef struct MpegEncContext {
     int h263_msmpeg4; ///< generate MSMPEG4 compatible stream (deprecated, use msmpeg4_version instead)
     int h263_flv;     ///< use flv h263 header 
     
-    int codec_id;     /* see CODEC_ID_xxx */
+    enum CodecID codec_id;     /* see CODEC_ID_xxx */
     int fixed_qscale; ///< fixed qscale if non zero 
     int encoding;     ///< true if we are encoding (vs decoding) 
     int flags;        ///< AVCodecContext.flags (HQ, MV4, ...) 
@@ -368,8 +374,6 @@ typedef struct MpegEncContext {
     int last_non_b_pict_type;   ///< used for mpeg4 gmc b-frames & ratecontrol 
     int dropable;
     int frame_rate_index;
-    int frame_rate_ext_n;       ///< MPEG-2 specific framerate modificators (numerator)
-    int frame_rate_ext_d;       ///< MPEG-2 specific framerate modificators (denominator)
 
     /* motion compensation */
     int unrestricted_mv;        ///< mv can point outside of the coded picture 
@@ -599,9 +603,9 @@ typedef struct MpegEncContext {
     int divx_version;
     int divx_build;
     int divx_packed;
-#define BITSTREAM_BUFFER_SIZE 1024*256
     uint8_t *bitstream_buffer; //Divx 5.01 puts several frames in a single one, this is used to reorder them
     int bitstream_buffer_size;
+    int allocated_bitstream_buffer_size;
     
     int xvid_build;
     
@@ -669,6 +673,8 @@ typedef struct MpegEncContext {
 #define CHROMA_420 1
 #define CHROMA_422 2
 #define CHROMA_444 3
+    int chroma_x_shift;//depends on pix_format, which depends on chroma_format
+    int chroma_y_shift;
 
     int progressive_frame;
     int full_pel[2];
@@ -703,6 +709,10 @@ typedef struct MpegEncContext {
                            DCTELEM *block/*align 16*/, int n, int qscale);
     void (*dct_unquantize_h263_inter)(struct MpegEncContext *s, 
                            DCTELEM *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_h261_intra)(struct MpegEncContext *s, 
+                           DCTELEM *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_h261_inter)(struct MpegEncContext *s, 
+                           DCTELEM *block/*align 16*/, int n, int qscale);
     void (*dct_unquantize_intra)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both)
                            DCTELEM *block/*align 16*/, int n, int qscale);
     void (*dct_unquantize_inter)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both)
@@ -717,7 +727,7 @@ int DCT_common_init(MpegEncContext *s);
 void MPV_decode_defaults(MpegEncContext *s);
 int MPV_common_init(MpegEncContext *s);
 void MPV_common_end(MpegEncContext *s);
-void MPV_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
+void MPV_decode_mb(MpegEncContext *s, DCTELEM block[12][64]);
 int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx);
 void MPV_frame_end(MpegEncContext *s);
 int MPV_encode_init(AVCodecContext *avctx);
@@ -768,15 +778,17 @@ extern enum PixelFormat ff_yuv420p_list[2];
 void ff_init_block_index(MpegEncContext *s);
 
 static inline void ff_update_block_index(MpegEncContext *s){
+    const int block_size= 8>>s->avctx->lowres;
+
     s->block_index[0]+=2;
     s->block_index[1]+=2;
     s->block_index[2]+=2;
     s->block_index[3]+=2;
     s->block_index[4]++;
     s->block_index[5]++;
-    s->dest[0]+= 16;
-    s->dest[1]+= 8;
-    s->dest[2]+= 8;
+    s->dest[0]+= 2*block_size;
+    s->dest[1]+= block_size;
+    s->dest[2]+= block_size;
 }
 
 static inline int get_bits_diff(MpegEncContext *s){
@@ -799,7 +811,11 @@ void ff_fix_long_mvs(MpegEncContext * s, uint8_t *field_select_table, int field_
                      int16_t (*mv_table)[2], int f_code, int type, int truncate);
 void ff_init_me(MpegEncContext *s);
 int ff_pre_estimate_p_frame_motion(MpegEncContext * s, int mb_x, int mb_y);
-
+inline int ff_epzs_motion_search(MpegEncContext * s, int *mx_ptr, int *my_ptr,
+                             int P[10][2], int src_index, int ref_index, int16_t (*last_mv)[2], 
+                             int ref_mv_scale, int size, int h);                             
+int inline ff_get_mb_score(MpegEncContext * s, int mx, int my, int src_index,
+                               int ref_index, int size, int h, int add_rate);
 
 /* mpeg12.c */
 extern const int16_t ff_mpeg1_default_intra_matrix[64];
@@ -830,8 +846,8 @@ typedef struct RLTable {
     RL_VLC_ELEM *rl_vlc[32];       ///< decoding only 
 } RLTable;
 
-void init_rl(RLTable *rl);
-void init_vlc_rl(RLTable *rl);
+void init_rl(RLTable *rl, int use_static);
+void init_vlc_rl(RLTable *rl, int use_static);
 
 static inline int get_rl_index(const RLTable *rl, int last, int run, int level)
 {
@@ -852,6 +868,15 @@ extern const int16_t ff_mpeg4_default_non_intra_matrix[64];
 extern const uint8_t ff_h263_chroma_qscale_table[32];
 extern const uint8_t ff_h263_loop_filter_strength[32];
 
+/* h261.c */
+void ff_h261_loop_filter(MpegEncContext *s);
+void ff_h261_reorder_mb_index(MpegEncContext* s);
+void ff_h261_encode_mb(MpegEncContext *s,
+                    DCTELEM block[6][64],
+                    int motion_x, int motion_y);
+void ff_h261_encode_picture_header(MpegEncContext * s, int picture_number);
+void ff_h261_encode_init(MpegEncContext *s);
+
 
 /* h263.c, h263dec.c */
 int ff_h263_decode_init(AVCodecContext *avctx);
@@ -912,6 +937,7 @@ int ff_mpeg4_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size);
 /* rv10.c */
 void rv10_encode_picture_header(MpegEncContext *s, int picture_number);
 int rv_decode_dc(MpegEncContext *s, int n);
+void rv20_encode_picture_header(MpegEncContext *s, int picture_number);
 
 
 /* msmpeg4.c */
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 701a8da92..a3140abb8 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -59,6 +59,9 @@
 static uint32_t v2_dc_lum_table[512][2];
 static uint32_t v2_dc_chroma_table[512][2];
 
+static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n);
+static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
+                                       int n, int coded, const uint8_t *scantable);
 static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr);
 static int msmpeg4_decode_motion(MpegEncContext * s, 
                                  int *mx_ptr, int *my_ptr);
@@ -67,12 +70,13 @@ static void init_h263_dc_for_msmpeg4(void);
 static inline void msmpeg4_memsetw(short *tab, int val, int n);
 #ifdef CONFIG_ENCODERS
 static int get_size_of_code(MpegEncContext * s, RLTable *rl, int last, int run, int level, int intra);
-static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr);
 #endif //CONFIG_ENCODERS
 static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 static int wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 
+/* vc9 externs */
+extern uint8_t wmv3_dc_scale_table[32];
 
 #ifdef DEBUG
 int intra_count = 0;
@@ -173,6 +177,14 @@ static void common_init(MpegEncContext * s)
         s->y_dc_scale_table= wmv1_y_dc_scale_table;
         s->c_dc_scale_table= wmv1_c_dc_scale_table;
         break;
+    case 6:
+/* xine: comment this out as WMV3 support is incomplete */
+#if 0
+        s->y_dc_scale_table= wmv3_dc_scale_table;
+        s->c_dc_scale_table= wmv3_dc_scale_table;
+#endif /* #if 0 */
+        break;
+
     }
 
     
@@ -237,7 +249,7 @@ void ff_msmpeg4_encode_init(MpegEncContext *s)
         init_mv_table(&mv_tables[0]);
         init_mv_table(&mv_tables[1]);
         for(i=0;i<NB_RL_TABLES;i++)
-            init_rl(&rl_table[i]);
+            init_rl(&rl_table[i], 1);
 
         for(i=0; i<NB_RL_TABLES; i++){
             int level;
@@ -522,129 +534,6 @@ static inline void handle_slices(MpegEncContext *s){
     }
 }
 
-/* Encoding of a block. Very similar to MPEG4 except for a different
-   escape coding (same as H263) and more vlc tables.
- */
-static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
-{
-    int level, run, last, i, j, last_index;
-    int last_non_zero, sign, slevel;
-    int code, run_diff, dc_pred_dir;
-    const RLTable *rl;
-    const uint8_t *scantable;
-
-    if (s->mb_intra) {
-        set_stat(ST_DC);
-        msmpeg4_encode_dc(s, block[0], n, &dc_pred_dir);
-        i = 1;
-        if (n < 4) {
-            rl = &rl_table[s->rl_table_index];
-        } else {
-            rl = &rl_table[3 + s->rl_chroma_table_index];
-        }
-        run_diff = 0;
-        scantable= s->intra_scantable.permutated;
-        set_stat(ST_INTRA_AC);
-    } else {
-        i = 0;
-        rl = &rl_table[3 + s->rl_table_index];
-        if(s->msmpeg4_version<=2)
-            run_diff = 0;
-        else
-            run_diff = 1;
-        scantable= s->inter_scantable.permutated;
-        set_stat(ST_INTER_AC);
-    }
-
-    /* recalculate block_last_index for M$ wmv1 */
-    if(s->msmpeg4_version>=4 && s->block_last_index[n]>0){
-        for(last_index=63; last_index>=0; last_index--){
-            if(block[scantable[last_index]]) break;
-        }
-        s->block_last_index[n]= last_index;
-    }else
-        last_index = s->block_last_index[n];
-    /* AC coefs */
-    last_non_zero = i - 1;
-    for (; i <= last_index; i++) {
-	j = scantable[i];
-	level = block[j];
-	if (level) {
-	    run = i - last_non_zero - 1;
-	    last = (i == last_index);
-	    sign = 0;
-	    slevel = level;
-	    if (level < 0) {
-		sign = 1;
-		level = -level;
-	    }
-
-            if(level<=MAX_LEVEL && run<=MAX_RUN){
-                s->ac_stats[s->mb_intra][n>3][level][run][last]++;
-            }
-#if 0
-else
-    s->ac_stats[s->mb_intra][n>3][40][63][0]++; //esc3 like
-#endif
-            code = get_rl_index(rl, last, run, level);
-            put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-            if (code == rl->n) {
-                int level1, run1;
-
-                level1 = level - rl->max_level[last][run];
-                if (level1 < 1) 
-                    goto esc2;
-                code = get_rl_index(rl, last, run, level1);
-                if (code == rl->n) {
-                esc2:
-                    put_bits(&s->pb, 1, 0);
-                    if (level > MAX_LEVEL)
-                        goto esc3;
-                    run1 = run - rl->max_run[last][level] - run_diff;
-                    if (run1 < 0)
-                        goto esc3;
-                    code = get_rl_index(rl, last, run1, level);
-                    if (code == rl->n) {
-                    esc3:
-                        /* third escape */
-                        put_bits(&s->pb, 1, 0);
-                        put_bits(&s->pb, 1, last);
-                        if(s->msmpeg4_version>=4){
-                            if(s->esc3_level_length==0){
-                                s->esc3_level_length=8;
-                                s->esc3_run_length= 6;
-                                if(s->qscale<8)
-                                    put_bits(&s->pb, 6, 3);
-                                else
-                                    put_bits(&s->pb, 8, 3);
-                            }
-                            put_bits(&s->pb, s->esc3_run_length, run);
-                            put_bits(&s->pb, 1, sign);
-                            put_bits(&s->pb, s->esc3_level_length, level);
-                        }else{
-                            put_bits(&s->pb, 6, run);
-                            put_bits(&s->pb, 8, slevel & 0xff);
-                        }
-                    } else {
-                        /* second escape */
-                        put_bits(&s->pb, 1, 1);
-                        put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-                        put_bits(&s->pb, 1, sign);
-                    }
-                } else {
-                    /* first escape */
-                    put_bits(&s->pb, 1, 1);
-                    put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
-                    put_bits(&s->pb, 1, sign);
-                }
-            } else {
-                put_bits(&s->pb, 1, sign);
-            }
-	    last_non_zero = i;
-	}
-    }
-}
-
 void msmpeg4_encode_mb(MpegEncContext * s, 
                        DCTELEM block[6][64],
                        int motion_x, int motion_y)
@@ -750,7 +639,7 @@ void msmpeg4_encode_mb(MpegEncContext * s,
             if (s->pict_type == I_TYPE) {
                 set_stat(ST_INTRA_MB);
                 put_bits(&s->pb, 
-                         table_mb_intra[coded_cbp][1], table_mb_intra[coded_cbp][0]);
+                         ff_msmp4_mb_i_table[coded_cbp][1], ff_msmp4_mb_i_table[coded_cbp][0]);
             } else {
                 if (s->use_skip_mb_code)
                     put_bits(&s->pb, 1, 0);	/* mb coded */
@@ -837,7 +726,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n,
        necessitate to modify mpegvideo.c. The problem comes from the
        fact they decided to store the quantized DC (which would lead
        to problems if Q could vary !) */
-#if defined ARCH_X86 && !defined PIC
+#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined PIC
     asm volatile(
         "movl %3, %%eax		\n\t"
 	"shrl $1, %%eax		\n\t"
@@ -1003,15 +892,15 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
 
         if (s->dc_table_index == 0) {
             if (n < 4) {
-                put_bits(&s->pb, table0_dc_lum[code][1], table0_dc_lum[code][0]);
+                put_bits(&s->pb, ff_table0_dc_lum[code][1], ff_table0_dc_lum[code][0]);
             } else {
-                put_bits(&s->pb, table0_dc_chroma[code][1], table0_dc_chroma[code][0]);
+                put_bits(&s->pb, ff_table0_dc_chroma[code][1], ff_table0_dc_chroma[code][0]);
             }
         } else {
             if (n < 4) {
-                put_bits(&s->pb, table1_dc_lum[code][1], table1_dc_lum[code][0]);
+                put_bits(&s->pb, ff_table1_dc_lum[code][1], ff_table1_dc_lum[code][0]);
             } else {
-                put_bits(&s->pb, table1_dc_chroma[code][1], table1_dc_chroma[code][0]);
+                put_bits(&s->pb, ff_table1_dc_chroma[code][1], ff_table1_dc_chroma[code][0]);
             }
         }
             
@@ -1024,14 +913,136 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
     }
 }
 
+/* Encoding of a block. Very similar to MPEG4 except for a different
+   escape coding (same as H263) and more vlc tables. Intra blocks first emit the DC via
+   msmpeg4_encode_dc(); each nonzero coefficient up to last_index is then written to s->pb. */
+static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int n)
+{
+    int level, run, last, i, j, last_index;
+    int last_non_zero, sign, slevel;
+    int code, run_diff, dc_pred_dir;
+    const RLTable *rl;
+    const uint8_t *scantable;
+    /* select the RL table, the scan order and the first coefficient index (i) for this block */
+    if (s->mb_intra) {
+        set_stat(ST_DC);
+        msmpeg4_encode_dc(s, block[0], n, &dc_pred_dir);
+        i = 1; /* coefficient 0 (DC) was handled above */
+        if (n < 4) {
+            rl = &rl_table[s->rl_table_index];
+        } else {
+            rl = &rl_table[3 + s->rl_chroma_table_index];
+        }
+        run_diff = 0;
+        scantable= s->intra_scantable.permutated;
+        set_stat(ST_INTRA_AC);
+    } else {
+        i = 0;
+        rl = &rl_table[3 + s->rl_table_index];
+        if(s->msmpeg4_version<=2)
+            run_diff = 0;
+        else
+            run_diff = 1;
+        scantable= s->inter_scantable.permutated;
+        set_stat(ST_INTER_AC);
+    }
+
+    /* recalculate block_last_index for M$ wmv1 */
+    if(s->msmpeg4_version>=4 && s->block_last_index[n]>0){
+        for(last_index=63; last_index>=0; last_index--){
+            if(block[scantable[last_index]]) break;
+        }
+        s->block_last_index[n]= last_index;
+    }else
+        last_index = s->block_last_index[n];
+    /* AC coefs */
+    last_non_zero = i - 1;
+    for (; i <= last_index; i++) {
+	j = scantable[i];
+	level = block[j];
+	if (level) {
+	    run = i - last_non_zero - 1; /* zero coefficients skipped since the previous nonzero one */
+	    last = (i == last_index);
+	    sign = 0;
+	    slevel = level; /* signed value, kept for the 8 bit escape of versions < 4 */
+	    if (level < 0) {
+		sign = 1;
+		level = -level;
+	    }
+            /* count this (level, run, last) combination when it lies inside the ac_stats bounds */
+            if(level<=MAX_LEVEL && run<=MAX_RUN){
+                s->ac_stats[s->mb_intra][n>3][level][run][last]++;
+            }
+#if 0
+else
+    s->ac_stats[s->mb_intra][n>3][40][63][0]++; //esc3 like
+#endif
+            code = get_rl_index(rl, last, run, level);
+            put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+            if (code == rl->n) { /* index rl->n is the escape code: no direct table entry */
+                int level1, run1;
+                /* first escape attempt: same run, level reduced by max_level[last][run] */
+                level1 = level - rl->max_level[last][run];
+                if (level1 < 1) 
+                    goto esc2;
+                code = get_rl_index(rl, last, run, level1);
+                if (code == rl->n) {
+                esc2:
+                    put_bits(&s->pb, 1, 0);
+                    if (level > MAX_LEVEL)
+                        goto esc3;
+                    run1 = run - rl->max_run[last][level] - run_diff;
+                    if (run1 < 0)
+                        goto esc3;
+                    code = get_rl_index(rl, last, run1, level);
+                    if (code == rl->n) {
+                    esc3:
+                        /* third escape */
+                        put_bits(&s->pb, 1, 0);
+                        put_bits(&s->pb, 1, last);
+                        if(s->msmpeg4_version>=4){
+                            if(s->esc3_level_length==0){ /* field lengths not written yet: fix and emit them */
+                                s->esc3_level_length=8;
+                                s->esc3_run_length= 6;
+                                if(s->qscale<8)
+                                    put_bits(&s->pb, 6, 3);
+                                else
+                                    put_bits(&s->pb, 8, 3);
+                            }
+                            put_bits(&s->pb, s->esc3_run_length, run);
+                            put_bits(&s->pb, 1, sign);
+                            put_bits(&s->pb, s->esc3_level_length, level);
+                        }else{
+                            put_bits(&s->pb, 6, run);
+                            put_bits(&s->pb, 8, slevel & 0xff);
+                        }
+                    } else {
+                        /* second escape */
+                        put_bits(&s->pb, 1, 1);
+                        put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+                        put_bits(&s->pb, 1, sign);
+                    }
+                } else {
+                    /* first escape */
+                    put_bits(&s->pb, 1, 1);
+                    put_bits(&s->pb, rl->table_vlc[code][1], rl->table_vlc[code][0]);
+                    put_bits(&s->pb, 1, sign);
+                }
+            } else {
+                put_bits(&s->pb, 1, sign);
+            }
+	    last_non_zero = i;
+	}
+    }
+}
 
 /****************************************/
 /* decoding stuff */
 
 static VLC mb_non_intra_vlc[4];
-static VLC mb_intra_vlc;
-static VLC dc_lum_vlc[2];
-static VLC dc_chroma_vlc[2];
+VLC ff_msmp4_mb_i_vlc;
+VLC ff_msmp4_dc_luma_vlc[2];
+VLC ff_msmp4_dc_chroma_vlc[2];
 static VLC v2_dc_lum_vlc;
 static VLC v2_dc_chroma_vlc;
 static VLC cbpy_vlc;
@@ -1110,69 +1121,69 @@ int ff_msmpeg4_decode_init(MpegEncContext *s)
         done = 1;
 
         for(i=0;i<NB_RL_TABLES;i++) {
-            init_rl(&rl_table[i]);
-            init_vlc_rl(&rl_table[i]);
+            init_rl(&rl_table[i], 1);
+            init_vlc_rl(&rl_table[i], 1);
         }
         for(i=0;i<2;i++) {
             mv = &mv_tables[i];
             init_vlc(&mv->vlc, MV_VLC_BITS, mv->n + 1, 
                      mv->table_mv_bits, 1, 1,
-                     mv->table_mv_code, 2, 2);
+                     mv->table_mv_code, 2, 2, 1);
         }
 
-        init_vlc(&dc_lum_vlc[0], DC_VLC_BITS, 120, 
-                 &table0_dc_lum[0][1], 8, 4,
-                 &table0_dc_lum[0][0], 8, 4);
-        init_vlc(&dc_chroma_vlc[0], DC_VLC_BITS, 120, 
-                 &table0_dc_chroma[0][1], 8, 4,
-                 &table0_dc_chroma[0][0], 8, 4);
-        init_vlc(&dc_lum_vlc[1], DC_VLC_BITS, 120, 
-                 &table1_dc_lum[0][1], 8, 4,
-                 &table1_dc_lum[0][0], 8, 4);
-        init_vlc(&dc_chroma_vlc[1], DC_VLC_BITS, 120, 
-                 &table1_dc_chroma[0][1], 8, 4,
-                 &table1_dc_chroma[0][0], 8, 4);
+        init_vlc(&ff_msmp4_dc_luma_vlc[0], DC_VLC_BITS, 120, 
+                 &ff_table0_dc_lum[0][1], 8, 4,
+                 &ff_table0_dc_lum[0][0], 8, 4, 1);
+        init_vlc(&ff_msmp4_dc_chroma_vlc[0], DC_VLC_BITS, 120, 
+                 &ff_table0_dc_chroma[0][1], 8, 4,
+                 &ff_table0_dc_chroma[0][0], 8, 4, 1);
+        init_vlc(&ff_msmp4_dc_luma_vlc[1], DC_VLC_BITS, 120, 
+                 &ff_table1_dc_lum[0][1], 8, 4,
+                 &ff_table1_dc_lum[0][0], 8, 4, 1);
+        init_vlc(&ff_msmp4_dc_chroma_vlc[1], DC_VLC_BITS, 120, 
+                 &ff_table1_dc_chroma[0][1], 8, 4,
+                 &ff_table1_dc_chroma[0][0], 8, 4, 1);
     
         init_vlc(&v2_dc_lum_vlc, DC_VLC_BITS, 512, 
                  &v2_dc_lum_table[0][1], 8, 4,
-                 &v2_dc_lum_table[0][0], 8, 4);
+                 &v2_dc_lum_table[0][0], 8, 4, 1);
         init_vlc(&v2_dc_chroma_vlc, DC_VLC_BITS, 512, 
                  &v2_dc_chroma_table[0][1], 8, 4,
-                 &v2_dc_chroma_table[0][0], 8, 4);
+                 &v2_dc_chroma_table[0][0], 8, 4, 1);
     
         init_vlc(&cbpy_vlc, CBPY_VLC_BITS, 16,
                  &cbpy_tab[0][1], 2, 1,
-                 &cbpy_tab[0][0], 2, 1);
+                 &cbpy_tab[0][0], 2, 1, 1);
         init_vlc(&v2_intra_cbpc_vlc, V2_INTRA_CBPC_VLC_BITS, 4,
                  &v2_intra_cbpc[0][1], 2, 1,
-                 &v2_intra_cbpc[0][0], 2, 1);
+                 &v2_intra_cbpc[0][0], 2, 1, 1);
         init_vlc(&v2_mb_type_vlc, V2_MB_TYPE_VLC_BITS, 8,
                  &v2_mb_type[0][1], 2, 1,
-                 &v2_mb_type[0][0], 2, 1);
+                 &v2_mb_type[0][0], 2, 1, 1);
         init_vlc(&v2_mv_vlc, V2_MV_VLC_BITS, 33,
                  &mvtab[0][1], 2, 1,
-                 &mvtab[0][0], 2, 1);
+                 &mvtab[0][0], 2, 1, 1);
 
         for(i=0; i<4; i++){
             init_vlc(&mb_non_intra_vlc[i], MB_NON_INTRA_VLC_BITS, 128, 
                      &wmv2_inter_table[i][0][1], 8, 4,
-                     &wmv2_inter_table[i][0][0], 8, 4); //FIXME name?
+                     &wmv2_inter_table[i][0][0], 8, 4, 1); //FIXME name?
         }
         
-        init_vlc(&mb_intra_vlc, MB_INTRA_VLC_BITS, 64, 
-                 &table_mb_intra[0][1], 4, 2,
-                 &table_mb_intra[0][0], 4, 2);
+        init_vlc(&ff_msmp4_mb_i_vlc, MB_INTRA_VLC_BITS, 64, 
+                 &ff_msmp4_mb_i_table[0][1], 4, 2,
+                 &ff_msmp4_mb_i_table[0][0], 4, 2, 1);
         
         init_vlc(&v1_intra_cbpc_vlc, V1_INTRA_CBPC_VLC_BITS, 8, 
                  intra_MCBPC_bits, 1, 1,
-                 intra_MCBPC_code, 1, 1);
+                 intra_MCBPC_code, 1, 1, 1);
         init_vlc(&v1_inter_cbpc_vlc, V1_INTER_CBPC_VLC_BITS, 25, 
                  inter_MCBPC_bits, 1, 1,
-                 inter_MCBPC_code, 1, 1);
+                 inter_MCBPC_code, 1, 1, 1);
         
         init_vlc(&inter_intra_vlc, INTER_INTRA_VLC_BITS, 4, 
                  &table_inter_intra[0][1], 2, 1,
-                 &table_inter_intra[0][0], 2, 1);
+                 &table_inter_intra[0][0], 2, 1, 1);
     }
     
     switch(s->msmpeg4_version){
@@ -1186,6 +1197,8 @@ int ff_msmpeg4_decode_init(MpegEncContext *s)
         break;
     case 5:
         s->decode_mb= wmv2_decode_mb;
+    case 6:
+        //FIXME + TODO VC9 decode mb
         break;
     }
     
@@ -1194,16 +1207,6 @@ int ff_msmpeg4_decode_init(MpegEncContext *s)
     return 0;
 }
 
-static int decode012(GetBitContext *gb)
-{
-    int n;
-    n = get_bits1(gb);
-    if (n == 0)
-        return 0;
-    else
-        return get_bits1(gb) + 1;
-}
-
 int msmpeg4_decode_picture_header(MpegEncContext * s)
 {
     int code;
@@ -1477,6 +1480,183 @@ static int msmpeg4v2_decode_motion(MpegEncContext * s, int pred, int f_code)
     return val;
 }
 
+static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
+{
+    int cbp, code, i;
+    /* Decode one macroblock (header, motion vector, six blocks); returns 0, or -1 on a bitstream error. */
+    if (s->pict_type == P_TYPE) {
+        if (s->use_skip_mb_code) {
+            if (get_bits1(&s->gb)) {
+                /* skip mb */
+                s->mb_intra = 0;
+                for(i=0;i<6;i++)
+                    s->block_last_index[i] = -1;
+                s->mv_dir = MV_DIR_FORWARD;
+                s->mv_type = MV_TYPE_16X16;
+                s->mv[0][0][0] = 0;
+                s->mv[0][0][1] = 0;
+                s->mb_skiped = 1;
+                return 0;
+            }
+        }
+        /* macroblock type code; which VLC table is read depends on msmpeg4_version */
+        if(s->msmpeg4_version==2)
+            code = get_vlc2(&s->gb, v2_mb_type_vlc.table, V2_MB_TYPE_VLC_BITS, 1);
+        else
+            code = get_vlc2(&s->gb, v1_inter_cbpc_vlc.table, V1_INTER_CBPC_VLC_BITS, 3);
+        if(code<0 || code>7){
+            av_log(s->avctx, AV_LOG_ERROR, "cbpc %d invalid at %d %d\n", code, s->mb_x, s->mb_y);
+            return -1;
+        }
+
+        s->mb_intra = code >>2;
+        /* bit 2 of the code is the intra flag, bits 0-1 are the coded flags of blocks 4 and 5 */
+        cbp = code & 0x3;
+    } else {
+        s->mb_intra = 1;
+        if(s->msmpeg4_version==2)
+            cbp= get_vlc2(&s->gb, v2_intra_cbpc_vlc.table, V2_INTRA_CBPC_VLC_BITS, 1);
+        else
+            cbp= get_vlc2(&s->gb, v1_intra_cbpc_vlc.table, V1_INTRA_CBPC_VLC_BITS, 1);
+        if(cbp<0 || cbp>3){
+            av_log(s->avctx, AV_LOG_ERROR, "cbpc %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y);
+            return -1;
+        }
+    }
+
+    if (!s->mb_intra) {
+        int mx, my, cbpy;
+        
+        cbpy= get_vlc2(&s->gb, cbpy_vlc.table, CBPY_VLC_BITS, 1);
+        if(cbpy<0){
+            av_log(s->avctx, AV_LOG_ERROR, "cbpy %d invalid at %d %d\n", cbpy, s->mb_x, s->mb_y);
+            return -1;
+        }
+        /* cbpy supplies the coded flags of blocks 0-3 (cbp bits 2-5) */
+        cbp|= cbpy<<2;
+        if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C;
+        
+        h263_pred_motion(s, 0, 0, &mx, &my);
+        mx= msmpeg4v2_decode_motion(s, mx, 1);
+        my= msmpeg4v2_decode_motion(s, my, 1);
+        
+        s->mv_dir = MV_DIR_FORWARD;
+        s->mv_type = MV_TYPE_16X16;
+        s->mv[0][0][0] = mx;
+        s->mv[0][0][1] = my;
+    } else {
+        if(s->msmpeg4_version==2){
+            s->ac_pred = get_bits1(&s->gb);
+            cbp|= get_vlc2(&s->gb, cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
+        } else{
+            s->ac_pred = 0;
+            cbp|= get_vlc2(&s->gb, cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
+            if(s->pict_type==P_TYPE) cbp^=0x3C;
+        }
+    }
+    /* bit (5 - i) of cbp is handed to msmpeg4_decode_block() as the 'coded' flag of block i */
+    for (i = 0; i < 6; i++) {
+        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
+	{
+             av_log(s->avctx, AV_LOG_ERROR, "\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
+             return -1;
+	}
+    }
+    return 0;
+}
+
+static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
+{
+    int cbp, code, i;
+    uint8_t *coded_val;
+    uint32_t * const mb_type_ptr= &s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride ];
+    /* Decode one macroblock (header, motion vector or intra flags, six blocks); returns 0, or -1 on error. */
+    if (s->pict_type == P_TYPE) {
+        set_stat(ST_INTER_MB);
+        if (s->use_skip_mb_code) {
+            if (get_bits1(&s->gb)) {
+                /* skip mb */
+                s->mb_intra = 0;
+                for(i=0;i<6;i++)
+                    s->block_last_index[i] = -1;
+                s->mv_dir = MV_DIR_FORWARD;
+                s->mv_type = MV_TYPE_16X16;
+                s->mv[0][0][0] = 0;
+                s->mv[0][0][1] = 0;
+                s->mb_skiped = 1;
+                *mb_type_ptr = MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
+
+                return 0;
+            }
+        }
+        
+        code = get_vlc2(&s->gb, mb_non_intra_vlc[DEFAULT_INTER_INDEX].table, MB_NON_INTRA_VLC_BITS, 3);
+        if (code < 0)
+            return -1;
+	//s->mb_intra = (code & 0x40) ? 0 : 1;
+	s->mb_intra = (~code & 0x40) >> 6;
+        /* the low six bits of the code are the coded block pattern */
+        cbp = code & 0x3f;
+    } else {
+        set_stat(ST_INTRA_MB);
+        s->mb_intra = 1;
+        code = get_vlc2(&s->gb, ff_msmp4_mb_i_vlc.table, MB_INTRA_VLC_BITS, 2);
+        if (code < 0)
+            return -1;
+        /* predict coded block pattern */
+        cbp = 0;
+        for(i=0;i<6;i++) {
+            int val = ((code >> (5 - i)) & 1);
+            if (i < 4) { /* only blocks 0-3 are XORed with a prediction; the result is stored back via coded_val */
+                int pred = coded_block_pred(s, i, &coded_val);
+                val = val ^ pred;
+                *coded_val = val;
+            }
+            cbp |= val << (5 - i);
+        }
+    }
+
+    if (!s->mb_intra) {
+        int mx, my;
+//printf("P at %d %d\n", s->mb_x, s->mb_y);
+        if(s->per_mb_rl_table && cbp){
+            s->rl_table_index = decode012(&s->gb);
+            s->rl_chroma_table_index = s->rl_table_index;
+        }
+        set_stat(ST_MV);
+        h263_pred_motion(s, 0, 0, &mx, &my);
+        if (msmpeg4_decode_motion(s, &mx, &my) < 0)
+            return -1;
+        s->mv_dir = MV_DIR_FORWARD;
+        s->mv_type = MV_TYPE_16X16;
+        s->mv[0][0][0] = mx;
+        s->mv[0][0][1] = my;
+        *mb_type_ptr = MB_TYPE_L0 | MB_TYPE_16x16;
+    } else {
+//printf("I at %d %d %d %06X\n", s->mb_x, s->mb_y, ((cbp&3)? 1 : 0) +((cbp&0x3C)? 2 : 0), show_bits(&s->gb, 24));
+        set_stat(ST_INTRA_MB);
+        s->ac_pred = get_bits1(&s->gb);
+        *mb_type_ptr = MB_TYPE_INTRA;
+        if(s->inter_intra_pred){
+            s->h263_aic_dir= get_vlc2(&s->gb, inter_intra_vlc.table, INTER_INTRA_VLC_BITS, 1);
+//            printf("%d%d %d %d/", s->ac_pred, s->h263_aic_dir, s->mb_x, s->mb_y);
+        }
+        if(s->per_mb_rl_table && cbp){
+            s->rl_table_index = decode012(&s->gb);
+            s->rl_chroma_table_index = s->rl_table_index;
+        }
+    }
+    /* bit (5 - i) of cbp is handed to msmpeg4_decode_block() as the 'coded' flag of block i */
+    for (i = 0; i < 6; i++) {
+        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
+	{
+	    av_log(s->avctx, AV_LOG_ERROR, "\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
+	    return -1;
+	}
+    }
+    
+    return 0;
+}
 //#define ERROR_DETAILS
 static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                               int n, int coded, const uint8_t *scan_table)
@@ -1554,7 +1734,7 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
     OPEN_READER(re, &s->gb);
     for(;;) {
         UPDATE_CACHE(re, &s->gb);
-        GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2);
+        GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2, 0);
         if (level==0) {
             int cache;
             cache= GET_CACHE(re, &s->gb);
@@ -1652,7 +1832,7 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
 #else
                     SKIP_BITS(re, &s->gb, 2);
 #endif
-                    GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2);
+                    GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2, 1);
                     i+= run + rl->max_run[run>>7][level/qmul] + run_diff; //FIXME opt indexing
                     level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
                     LAST_SKIP_BITS(re, &s->gb, 1);
@@ -1671,7 +1851,7 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
 #else
                 SKIP_BITS(re, &s->gb, 1);
 #endif
-                GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2);
+                GET_RL_VLC(level, run, re, &s->gb, rl_vlc, TEX_VLC_BITS, 2, 1);
                 i+= run;
                 level = level + rl->max_level[run>>7][(run-1)&63] * qmul;//FIXME opt indexing
                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
@@ -1728,184 +1908,6 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
     return 0;
 }
 
-static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
-{
-    int cbp, code, i;
-    
-    if (s->pict_type == P_TYPE) {
-        if (s->use_skip_mb_code) {
-            if (get_bits1(&s->gb)) {
-                /* skip mb */
-                s->mb_intra = 0;
-                for(i=0;i<6;i++)
-                    s->block_last_index[i] = -1;
-                s->mv_dir = MV_DIR_FORWARD;
-                s->mv_type = MV_TYPE_16X16;
-                s->mv[0][0][0] = 0;
-                s->mv[0][0][1] = 0;
-                s->mb_skiped = 1;
-                return 0;
-            }
-        }
-
-        if(s->msmpeg4_version==2)
-            code = get_vlc2(&s->gb, v2_mb_type_vlc.table, V2_MB_TYPE_VLC_BITS, 1);
-        else
-            code = get_vlc2(&s->gb, v1_inter_cbpc_vlc.table, V1_INTER_CBPC_VLC_BITS, 3);
-        if(code<0 || code>7){
-            av_log(s->avctx, AV_LOG_ERROR, "cbpc %d invalid at %d %d\n", code, s->mb_x, s->mb_y);
-            return -1;
-        }
-
-        s->mb_intra = code >>2;
-    
-        cbp = code & 0x3;
-    } else {
-        s->mb_intra = 1;
-        if(s->msmpeg4_version==2)
-            cbp= get_vlc2(&s->gb, v2_intra_cbpc_vlc.table, V2_INTRA_CBPC_VLC_BITS, 1);
-        else
-            cbp= get_vlc2(&s->gb, v1_intra_cbpc_vlc.table, V1_INTRA_CBPC_VLC_BITS, 1);
-        if(cbp<0 || cbp>3){
-            av_log(s->avctx, AV_LOG_ERROR, "cbpc %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y);
-            return -1;
-        }
-    }
-
-    if (!s->mb_intra) {
-        int mx, my, cbpy;
-        
-        cbpy= get_vlc2(&s->gb, cbpy_vlc.table, CBPY_VLC_BITS, 1);
-        if(cbpy<0){
-            av_log(s->avctx, AV_LOG_ERROR, "cbpy %d invalid at %d %d\n", cbp, s->mb_x, s->mb_y);
-            return -1;
-        }
-
-        cbp|= cbpy<<2;
-        if(s->msmpeg4_version==1 || (cbp&3) != 3) cbp^= 0x3C;
-        
-        h263_pred_motion(s, 0, 0, &mx, &my);
-        mx= msmpeg4v2_decode_motion(s, mx, 1);
-        my= msmpeg4v2_decode_motion(s, my, 1);
-        
-        s->mv_dir = MV_DIR_FORWARD;
-        s->mv_type = MV_TYPE_16X16;
-        s->mv[0][0][0] = mx;
-        s->mv[0][0][1] = my;
-    } else {
-        if(s->msmpeg4_version==2){
-            s->ac_pred = get_bits1(&s->gb);
-            cbp|= get_vlc2(&s->gb, cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
-        } else{
-            s->ac_pred = 0;
-            cbp|= get_vlc2(&s->gb, cbpy_vlc.table, CBPY_VLC_BITS, 1)<<2; //FIXME check errors
-            if(s->pict_type==P_TYPE) cbp^=0x3C;
-        }
-    }
-
-    for (i = 0; i < 6; i++) {
-        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
-	{
-             av_log(s->avctx, AV_LOG_ERROR, "\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
-             return -1;
-	}
-    }
-    return 0;
-}
-
-static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
-{
-    int cbp, code, i;
-    uint8_t *coded_val;
-    uint32_t * const mb_type_ptr= &s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride ];
-
-    if (s->pict_type == P_TYPE) {
-        set_stat(ST_INTER_MB);
-        if (s->use_skip_mb_code) {
-            if (get_bits1(&s->gb)) {
-                /* skip mb */
-                s->mb_intra = 0;
-                for(i=0;i<6;i++)
-                    s->block_last_index[i] = -1;
-                s->mv_dir = MV_DIR_FORWARD;
-                s->mv_type = MV_TYPE_16X16;
-                s->mv[0][0][0] = 0;
-                s->mv[0][0][1] = 0;
-                s->mb_skiped = 1;
-                *mb_type_ptr = MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
-
-                return 0;
-            }
-        }
-        
-        code = get_vlc2(&s->gb, mb_non_intra_vlc[DEFAULT_INTER_INDEX].table, MB_NON_INTRA_VLC_BITS, 3);
-        if (code < 0)
-            return -1;
-	//s->mb_intra = (code & 0x40) ? 0 : 1;
-	s->mb_intra = (~code & 0x40) >> 6;
-            
-        cbp = code & 0x3f;
-    } else {
-        set_stat(ST_INTRA_MB);
-        s->mb_intra = 1;
-        code = get_vlc2(&s->gb, mb_intra_vlc.table, MB_INTRA_VLC_BITS, 2);
-        if (code < 0)
-            return -1;
-        /* predict coded block pattern */
-        cbp = 0;
-        for(i=0;i<6;i++) {
-            int val = ((code >> (5 - i)) & 1);
-            if (i < 4) {
-                int pred = coded_block_pred(s, i, &coded_val);
-                val = val ^ pred;
-                *coded_val = val;
-            }
-            cbp |= val << (5 - i);
-        }
-    }
-
-    if (!s->mb_intra) {
-        int mx, my;
-//printf("P at %d %d\n", s->mb_x, s->mb_y);
-        if(s->per_mb_rl_table && cbp){
-            s->rl_table_index = decode012(&s->gb);
-            s->rl_chroma_table_index = s->rl_table_index;
-        }
-        set_stat(ST_MV);
-        h263_pred_motion(s, 0, 0, &mx, &my);
-        if (msmpeg4_decode_motion(s, &mx, &my) < 0)
-            return -1;
-        s->mv_dir = MV_DIR_FORWARD;
-        s->mv_type = MV_TYPE_16X16;
-        s->mv[0][0][0] = mx;
-        s->mv[0][0][1] = my;
-        *mb_type_ptr = MB_TYPE_L0 | MB_TYPE_16x16;
-    } else {
-//printf("I at %d %d %d %06X\n", s->mb_x, s->mb_y, ((cbp&3)? 1 : 0) +((cbp&0x3C)? 2 : 0), show_bits(&s->gb, 24));
-        set_stat(ST_INTRA_MB);
-        s->ac_pred = get_bits1(&s->gb);
-        *mb_type_ptr = MB_TYPE_INTRA;
-        if(s->inter_intra_pred){
-            s->h263_aic_dir= get_vlc2(&s->gb, inter_intra_vlc.table, INTER_INTRA_VLC_BITS, 1);
-//            printf("%d%d %d %d/", s->ac_pred, s->h263_aic_dir, s->mb_x, s->mb_y);
-        }
-        if(s->per_mb_rl_table && cbp){
-            s->rl_table_index = decode012(&s->gb);
-            s->rl_chroma_table_index = s->rl_table_index;
-        }
-    }
-
-    for (i = 0; i < 6; i++) {
-        if (msmpeg4_decode_block(s, block[i], i, (cbp >> (5 - i)) & 1, NULL) < 0)
-	{
-	    av_log(s->avctx, AV_LOG_ERROR, "\nerror while decoding block: %d x %d (%d)\n", s->mb_x, s->mb_y, i);
-	    return -1;
-	}
-    }
-    
-    return 0;
-}
-
 static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
 {
     int level, pred;
@@ -1921,9 +1923,9 @@ static int msmpeg4_decode_dc(MpegEncContext * s, int n, int *dir_ptr)
         level-=256;
     }else{  //FIXME optimize use unified tables & index
         if (n < 4) {
-            level = get_vlc2(&s->gb, dc_lum_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
+            level = get_vlc2(&s->gb, ff_msmp4_dc_luma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
         } else {
-            level = get_vlc2(&s->gb, dc_chroma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
+            level = get_vlc2(&s->gb, ff_msmp4_dc_chroma_vlc[s->dc_table_index].table, DC_VLC_BITS, 3);
         }
         if (level < 0){
             av_log(s->avctx, AV_LOG_ERROR, "illegal dc vlc\n");
diff --git a/src/libffmpeg/libavcodec/msmpeg4data.h b/src/libffmpeg/libavcodec/msmpeg4data.h
index 69568cbf6..bc4b454ac 100644
--- a/src/libffmpeg/libavcodec/msmpeg4data.h
+++ b/src/libffmpeg/libavcodec/msmpeg4data.h
@@ -4,7 +4,7 @@
  */
 
 /* intra picture macro block coded block pattern */
-static const uint16_t table_mb_intra[64][2] = {
+const uint16_t ff_msmp4_mb_i_table[64][2] = {
 { 0x1, 1 },{ 0x17, 6 },{ 0x9, 5 },{ 0x5, 5 },
 { 0x6, 5 },{ 0x47, 9 },{ 0x20, 7 },{ 0x10, 7 },
 { 0x2, 5 },{ 0x7c, 9 },{ 0x3a, 7 },{ 0x1d, 7 },
@@ -61,7 +61,7 @@ static const uint32_t table_mb_non_intra[128][2] = {
 
 /* dc table 0 */
 
-static const uint32_t table0_dc_lum[120][2] = {
+const uint32_t ff_table0_dc_lum[120][2] = {
 { 0x1, 1 },{ 0x1, 2 },{ 0x1, 4 },{ 0x1, 5 },
 { 0x5, 5 },{ 0x7, 5 },{ 0x8, 6 },{ 0xc, 6 },
 { 0x0, 7 },{ 0x2, 7 },{ 0x12, 7 },{ 0x1a, 7 },
@@ -94,7 +94,7 @@ static const uint32_t table0_dc_lum[120][2] = {
 { 0x6078c, 24 },{ 0x6078d, 24 },{ 0x6078e, 24 },{ 0x6078f, 24 },
 };
 
-static const uint32_t table0_dc_chroma[120][2] = {
+const uint32_t ff_table0_dc_chroma[120][2] = {
 { 0x0, 2 },{ 0x1, 2 },{ 0x5, 3 },{ 0x9, 4 },
 { 0xd, 4 },{ 0x11, 5 },{ 0x1d, 5 },{ 0x1f, 5 },
 { 0x21, 6 },{ 0x31, 6 },{ 0x38, 6 },{ 0x33, 6 },
@@ -129,7 +129,7 @@ static const uint32_t table0_dc_chroma[120][2] = {
 
 /* dc table 1 */
 
-static const uint32_t table1_dc_lum[120][2] = {
+const uint32_t ff_table1_dc_lum[120][2] = {
 { 0x2, 2 },{ 0x3, 2 },{ 0x3, 3 },{ 0x2, 4 },
 { 0x5, 4 },{ 0x1, 5 },{ 0x3, 5 },{ 0x8, 5 },
 { 0x0, 6 },{ 0x5, 6 },{ 0xd, 6 },{ 0xf, 6 },
@@ -162,7 +162,7 @@ static const uint32_t table1_dc_lum[120][2] = {
 { 0x1e6964, 26 },{ 0x1e6965, 26 },{ 0x1e6966, 26 },{ 0x1e6967, 26 },
 };
 
-static const uint32_t table1_dc_chroma[120][2] = {
+const uint32_t ff_table1_dc_chroma[120][2] = {
 { 0x0, 2 },{ 0x1, 2 },{ 0x4, 3 },{ 0x7, 3 },
 { 0xb, 4 },{ 0xd, 4 },{ 0x15, 5 },{ 0x28, 6 },
 { 0x30, 6 },{ 0x32, 6 },{ 0x52, 7 },{ 0x62, 7 },
diff --git a/src/libffmpeg/libavcodec/msrle.c b/src/libffmpeg/libavcodec/msrle.c
index b318faa77..d95e3f79b 100644
--- a/src/libffmpeg/libavcodec/msrle.c
+++ b/src/libffmpeg/libavcodec/msrle.c
@@ -254,10 +254,6 @@ static int msrle_decode_frame(AVCodecContext *avctx,
 {
     MsrleContext *s = (MsrleContext *)avctx->priv_data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     s->buf = buf;
     s->size = buf_size;
 
diff --git a/src/libffmpeg/libavcodec/msvideo1.c b/src/libffmpeg/libavcodec/msvideo1.c
index b88bdab5d..518df0e52 100644
--- a/src/libffmpeg/libavcodec/msvideo1.c
+++ b/src/libffmpeg/libavcodec/msvideo1.c
@@ -302,10 +302,6 @@ static int msvideo1_decode_frame(AVCodecContext *avctx,
 {
     Msvideo1Context *s = (Msvideo1Context *)avctx->priv_data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     s->buf = buf;
     s->size = buf_size;
 
diff --git a/src/libffmpeg/libavcodec/parser.c b/src/libffmpeg/libavcodec/parser.c
index ed386611a..4725d56c6 100644
--- a/src/libffmpeg/libavcodec/parser.c
+++ b/src/libffmpeg/libavcodec/parser.c
@@ -34,11 +34,16 @@ AVCodecParserContext *av_parser_init(int codec_id)
     AVCodecParserContext *s;
     AVCodecParser *parser;
     int ret;
+    
+    if(codec_id == CODEC_ID_NONE)
+        return NULL;
 
     for(parser = av_first_parser; parser != NULL; parser = parser->next) {
         if (parser->codec_ids[0] == codec_id ||
             parser->codec_ids[1] == codec_id ||
-            parser->codec_ids[2] == codec_id)
+            parser->codec_ids[2] == codec_id ||
+            parser->codec_ids[3] == codec_id ||
+            parser->codec_ids[4] == codec_id)
             goto found;
     }
     return NULL;
@@ -92,12 +97,14 @@ int av_parser_parse(AVCodecParserContext *s,
             s->fetch_timestamp=0;
             s->last_pts = pts;
             s->last_dts = dts;
+            s->cur_frame_pts[k] =
+            s->cur_frame_dts[k] = AV_NOPTS_VALUE;
         }
     }
 
     /* WARNING: the returned index can be negative */
     index = s->parser->parser_parse(s, avctx, poutbuf, poutbuf_size, buf, buf_size);
-//av_log(NULL, AV_LOG_DEBUG, "parser: in:%lld, %lld, out:%lld, %lld, in:%d out:%d %d\n", pts, dts, s->last_pts, s->last_dts, buf_size, *poutbuf_size, avctx->codec_id);
+//av_log(NULL, AV_LOG_DEBUG, "parser: in:%lld, %lld, out:%lld, %lld, in:%d out:%d id:%d\n", pts, dts, s->last_pts, s->last_dts, buf_size, *poutbuf_size, avctx->codec_id);
     /* update the file pointer */
     if (*poutbuf_size) {
         /* fill the data for the current frame */
@@ -183,7 +190,12 @@ int ff_combine_frame(ParseContext *pc, int next, uint8_t **buf, int *buf_size)
     for(; pc->overread>0; pc->overread--){
         pc->buffer[pc->index++]= pc->buffer[pc->overread_index++];
     }
-    
+
+    /* flush remaining if EOF */
+    if(!*buf_size && next == END_NOT_FOUND){
+        next= 0;
+    }
+
     pc->last_index= pc->index;
 
     /* copy into buffer end return */
@@ -279,8 +291,8 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
     int32_t start_code;
     int frame_rate_index, ext_type, bytes_left;
     int frame_rate_ext_n, frame_rate_ext_d;
-    int top_field_first, repeat_first_field, progressive_frame;
-    int horiz_size_ext, vert_size_ext;
+    int picture_structure, top_field_first, repeat_first_field, progressive_frame;
+    int horiz_size_ext, vert_size_ext, bit_rate_ext;
 
     s->repeat_pict = 0;
     buf_end = buf + buf_size;
@@ -294,12 +306,14 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
             }
             break;
         case SEQ_START_CODE:
-            if (bytes_left >= 4) {
-                pc->width = avctx->width = (buf[0] << 4) | (buf[1] >> 4);
-                pc->height = avctx->height = ((buf[1] & 0x0f) << 8) | buf[2];
+            if (bytes_left >= 7) {
+                pc->width  = (buf[0] << 4) | (buf[1] >> 4);
+                pc->height = ((buf[1] & 0x0f) << 8) | buf[2];
+                avcodec_set_dimensions(avctx, pc->width, pc->height);
                 frame_rate_index = buf[3] & 0xf;
                 pc->frame_rate = avctx->frame_rate = frame_rate_tab[frame_rate_index];
                 avctx->frame_rate_base = MPEG1_FRAME_RATE_BASE;
+                avctx->bit_rate = ((buf[4]<<10) | (buf[5]<<2) | (buf[6]>>6))*400;
                 avctx->codec_id = CODEC_ID_MPEG1VIDEO;
                 avctx->sub_id = 1;
             }
@@ -312,12 +326,16 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                     if (bytes_left >= 6) {
                         horiz_size_ext = ((buf[1] & 1) << 1) | (buf[2] >> 7);
                         vert_size_ext = (buf[2] >> 5) & 3;
+                        bit_rate_ext = ((buf[2] & 0x1F)<<7) | (buf[3]>>1);
                         frame_rate_ext_n = (buf[5] >> 5) & 3;
                         frame_rate_ext_d = (buf[5] & 0x1f);
                         pc->progressive_sequence = buf[1] & (1 << 3);
+                        avctx->has_b_frames= !(buf[5] >> 7);
 
-                        avctx->width = pc->width | (horiz_size_ext << 12);
-                        avctx->height = pc->height | (vert_size_ext << 12);
+                        pc->width  |=(horiz_size_ext << 12);
+                        pc->height |=( vert_size_ext << 12);
+                        avctx->bit_rate += (bit_rate_ext << 18) * 400;
+                        avcodec_set_dimensions(avctx, pc->width, pc->height);
                         avctx->frame_rate = pc->frame_rate * (frame_rate_ext_n + 1);
                         avctx->frame_rate_base = MPEG1_FRAME_RATE_BASE * (frame_rate_ext_d + 1);
                         avctx->codec_id = CODEC_ID_MPEG2VIDEO;
@@ -326,6 +344,7 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                     break;
                 case 0x8: /* picture coding extension */
                     if (bytes_left >= 5) {
+                        picture_structure = buf[2]&3;
                         top_field_first = buf[3] & (1 << 7);
                         repeat_first_field = buf[3] & (1 << 1);
                         progressive_frame = buf[4] & (1 << 7);
@@ -341,6 +360,11 @@ static void mpegvideo_extract_headers(AVCodecParserContext *s,
                                 s->repeat_pict = 1;
                             }
                         }
+                        
+                        /* the packet only represents half a frame 
+                           XXX,FIXME maybe find a different solution */
+                        if(picture_structure != 3)
+                            s->repeat_pict = -1;
                     }
                     break;
                 }
@@ -429,8 +453,7 @@ static int av_mpeg4_decode_header(AVCodecParserContext *s1,
     init_get_bits(gb, buf, 8 * buf_size);
     ret = ff_mpeg4_decode_picture_header(s, gb);
     if (s->width) {
-        avctx->width = s->width;
-        avctx->height = s->height;
+        avcodec_set_dimensions(avctx, s->width, s->height);
     }
     pc->first_picture = 0;
     return ret;
@@ -477,13 +500,16 @@ typedef struct MpegAudioParseContext {
     int frame_size;
     int free_format_frame_size;
     int free_format_next_header;
+    uint32_t header;
+    int header_count;
 } MpegAudioParseContext;
 
 #define MPA_HEADER_SIZE 4
 
 /* header + layer + bitrate + freq + lsf/mpeg25 */
+#undef SAME_HEADER_MASK /* mpegaudio.h defines different version */
 #define SAME_HEADER_MASK \
-   (0xffe00000 | (3 << 17) | (0xf << 12) | (3 << 10) | (3 << 19))
+   (0xffe00000 | (3 << 17) | (3 << 10) | (3 << 19))
 
 static int mpegaudio_parse_init(AVCodecParserContext *s1)
 {
@@ -498,7 +524,7 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
                            const uint8_t *buf, int buf_size)
 {
     MpegAudioParseContext *s = s1->priv_data;
-    int len, ret;
+    int len, ret, sr;
     uint32_t header;
     const uint8_t *buf_ptr;
 
@@ -532,11 +558,13 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
 	    }
 	    if ((s->inbuf_ptr - s->inbuf) >= MPA_HEADER_SIZE) {
             got_header:
+                sr= avctx->sample_rate;
 		header = (s->inbuf[0] << 24) | (s->inbuf[1] << 16) |
 		    (s->inbuf[2] << 8) | s->inbuf[3];
 
                 ret = mpa_decode_header(avctx, header);
                 if (ret < 0) {
+                    s->header_count= -2;
 		    /* no sync found : move by one byte (inefficient, but simple!) */
 		    memmove(s->inbuf, s->inbuf + 1, s->inbuf_ptr - s->inbuf - 1);
 		    s->inbuf_ptr--;
@@ -545,7 +573,12 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
                        to get a new bitrate */
                     s->free_format_frame_size = 0;
 		} else {
+                    if((header&SAME_HEADER_MASK) != (s->header&SAME_HEADER_MASK) && s->header)
+                        s->header_count= -3;
+                    s->header= header;
+                    s->header_count++;
                     s->frame_size = ret;
+                    
 #if 0
                     /* free format: prepare to compute frame size */
 		    if (decode_header(s, header) == 1) {
@@ -553,6 +586,8 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
                     }
 #endif
 		}
+                if(s->header_count <= 0)
+                    avctx->sample_rate= sr; //FIXME ugly
 	    }
         } else 
 #if 0
@@ -625,8 +660,10 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
         //    next_data:
         if (s->frame_size > 0 && 
             (s->inbuf_ptr - s->inbuf) >= s->frame_size) {
-            *poutbuf = s->inbuf;
-            *poutbuf_size = s->inbuf_ptr - s->inbuf;
+            if(s->header_count > 0){
+                *poutbuf = s->inbuf;
+                *poutbuf_size = s->inbuf_ptr - s->inbuf;
+            }
 	    s->inbuf_ptr = s->inbuf;
 	    s->frame_size = 0;
 	    break;
diff --git a/src/libffmpeg/libavcodec/pcm.c b/src/libffmpeg/libavcodec/pcm.c
index 4c999b430..8e57d11a1 100644
--- a/src/libffmpeg/libavcodec/pcm.c
+++ b/src/libffmpeg/libavcodec/pcm.c
@@ -127,6 +127,23 @@ static int pcm_encode_init(AVCodecContext *avctx)
         break;
     }
     
+    switch(avctx->codec->id) {
+    case CODEC_ID_PCM_S16LE:
+    case CODEC_ID_PCM_S16BE:
+    case CODEC_ID_PCM_U16LE:
+    case CODEC_ID_PCM_U16BE:
+        avctx->block_align = 2 * avctx->channels;
+        break;
+    case CODEC_ID_PCM_S8:
+    case CODEC_ID_PCM_U8:
+    case CODEC_ID_PCM_MULAW:
+    case CODEC_ID_PCM_ALAW:
+        avctx->block_align = avctx->channels;
+        break;
+    default:
+        break;
+    }
+
     avctx->coded_frame= avcodec_alloc_frame();
     avctx->coded_frame->key_frame= 1;
     
@@ -282,6 +299,9 @@ static int pcm_decode_frame(AVCodecContext *avctx,
     samples = data;
     src = buf;
 
+    if(buf_size > AVCODEC_MAX_AUDIO_FRAME_SIZE/2)
+        buf_size = AVCODEC_MAX_AUDIO_FRAME_SIZE/2;
+
     switch(avctx->codec->id) {
     case CODEC_ID_PCM_S16LE:
         n = buf_size >> 1;
diff --git a/src/libffmpeg/libavcodec/qdrw.c b/src/libffmpeg/libavcodec/qdrw.c
new file mode 100644
index 000000000..a12d45067
--- /dev/null
+++ b/src/libffmpeg/libavcodec/qdrw.c
@@ -0,0 +1,216 @@
+/*
+ * QuickDraw (qdrw) codec
+ * Copyright (c) 2004 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/**
+ * @file qdrw.c
+ * Apple QuickDraw codec.
+ */
+
+#include "avcodec.h"
+#include "mpegvideo.h"
+
+/** packet offset of the 32-bit big-endian color count */
+#define QDRAW_PALETTE_OFFSET 0x68
+/** bytes per color table entry: 2-byte index, then three 2-byte components */
+#define QDRAW_COLOR_ENTRY    8
+/** bytes skipped between the color table and the first packed line */
+#define QDRAW_PIXDATA_SKIP   18
+
+/** Decoder state kept between packets. */
+typedef struct QdrawContext{
+    AVCodecContext *avctx;
+    AVFrame pic;             ///< picture handed back to the caller
+    uint8_t palette[256*3];  ///< three bytes (R, G, B) per palette slot
+} QdrawContext;
+
+/**
+ * Decode one packet into an RGB24 picture.
+ *
+ * Layout as consumed below: at offset 0x68 a 32-bit big-endian value
+ * "colors", then colors + 1 color table entries of 8 bytes each, then
+ * 18 skipped bytes, then one packed line per picture row (a 16-bit size
+ * followed by that many bytes of run/copy codes).
+ *
+ * @param data      receives a copy of the decoded AVFrame
+ * @param data_size set to sizeof(AVFrame) on success
+ * @param buf       packet data
+ * @param buf_size  number of bytes available at buf
+ * @return buf_size on success, -1 if get_buffer() fails or the packet is
+ *         damaged or truncated
+ */
+static int decode_frame(AVCodecContext *avctx, 
+                        void *data, int *data_size,
+                        uint8_t *buf, int buf_size)
+{
+    QdrawContext * const a = avctx->priv_data;
+    AVFrame * const p= (AVFrame*)&a->pic;
+    const uint8_t * const buf_end = buf + buf_size;
+    uint8_t* outdata;
+    int colors;
+    int i;
+
+    if(p->data[0])
+        avctx->release_buffer(avctx, p);
+
+    p->reference= 0;
+    if(avctx->get_buffer(avctx, p) < 0){
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+    p->pict_type= I_TYPE;
+    p->key_frame= 1;
+
+    outdata = a->pic.data[0];
+
+    if(buf_size < QDRAW_PALETTE_OFFSET + 4) {
+        av_log(avctx, AV_LOG_ERROR, "Packet too small - %i bytes\n", buf_size);
+        return -1;
+    }
+    buf += QDRAW_PALETTE_OFFSET; /* jump to palette */
+    colors = BE_32(buf);
+    buf += 4;
+
+    if(colors < 0 || colors > 256) {
+        av_log(avctx, AV_LOG_ERROR, "Error color count - %i(0x%X)\n", colors, colors);
+        return -1;
+    }
+    /* the loop below consumes colors + 1 entries; make sure they and the
+       block skipped after them are really inside the packet */
+    if(buf_end - buf < (colors + 1) * QDRAW_COLOR_ENTRY + QDRAW_PIXDATA_SKIP) {
+        av_log(avctx, AV_LOG_ERROR, "Truncated color table\n");
+        return -1;
+    }
+
+    for (i = 0; i <= colors; i++) {
+        unsigned int idx;
+        idx = BE_16(buf); /* color index */
+        buf += 2;
+
+        if (idx > 255) {
+            av_log(avctx, AV_LOG_ERROR, "Palette index out of range: %u\n", idx);
+            buf += 6;
+            continue;
+        }
+        /* each component occupies two bytes; only the first one is kept */
+        a->palette[idx * 3 + 0] = *buf++;
+        buf++;
+        a->palette[idx * 3 + 1] = *buf++;
+        buf++;
+        a->palette[idx * 3 + 2] = *buf++;
+        buf++;
+    }
+
+    buf += QDRAW_PIXDATA_SKIP; /* skip unneeded data */
+    for (i = 0; i < avctx->height; i++) {
+        int size, left, code, pix, count, j;
+        int room = a->pic.linesize[0]; /* bytes still free in this row */
+        uint8_t *next;
+        uint8_t *out;
+
+        /* decode line */
+        out = outdata;
+        if (buf_end - buf < 2 || buf_end - (buf + 2) < BE_16(buf)) {
+            av_log(avctx, AV_LOG_ERROR, "Truncated data in line %i\n", i);
+            return -1;
+        }
+        size = BE_16(buf); /* size of packed line */
+        buf += 2;
+        left = size;
+        next = buf + size;
+        while (left > 0) {
+            code = *buf++;
+            if (code & 0x80 ) { /* run */
+                count = 257 - code;
+                /* a run needs the code byte plus one pixel byte */
+                if (left < 2 || count * 3 > room)
+                    break;
+                pix = *buf++;
+                for (j = 0; j < count; j++) {
+                    *out++ = a->palette[pix * 3 + 0];
+                    *out++ = a->palette[pix * 3 + 1];
+                    *out++ = a->palette[pix * 3 + 2];
+                }
+                left -= 2;
+            } else { /* copy */
+                count = code + 1;
+                /* a copy needs the code byte plus count pixel bytes */
+                if (left < 1 + count || count * 3 > room)
+                    break;
+                for (j = 0; j < count; j++) {
+                    pix = *buf++;
+                    *out++ = a->palette[pix * 3 + 0];
+                    *out++ = a->palette[pix * 3 + 1];
+                    *out++ = a->palette[pix * 3 + 2];
+                }
+                left -= 1 + count;
+            }
+            room -= count * 3;
+        }
+        buf = next;
+        outdata += a->pic.linesize[0];
+    }
+
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = a->pic;
+
+    return buf_size;
+}
+
+/**
+ * Validate the picture dimensions and select the RGB24 output format.
+ *
+ * @return 0 on success, -1 if the dimensions are rejected
+ */
+static int decode_init(AVCodecContext *avctx){
+    /* arguments are (ctx, width, height); failure is reported with a
+       negative value like the other error paths in this file */
+    if (avcodec_check_dimensions(avctx, avctx->width, avctx->height) < 0) {
+        return -1;
+    }
+
+    avctx->pix_fmt= PIX_FMT_RGB24;
+
+    return 0;
+}
+
+/**
+ * Hand the last picture buffer back through release_buffer() on close.
+ */
+static int decode_end(AVCodecContext *avctx){
+    QdrawContext * const a = avctx->priv_data;
+    AVFrame * const p= (AVFrame*)&a->pic;
+
+    if(p->data[0])
+        avctx->release_buffer(avctx, p);
+
+    return 0;
+}
+
+AVCodec qdraw_decoder = {
+    "qdraw",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_QDRAW,
+    sizeof(QdrawContext),
+    decode_init,
+    NULL,
+    decode_end,
+    decode_frame,
+    CODEC_CAP_DR1,
+};
diff --git a/src/libffmpeg/libavcodec/qpeg.c b/src/libffmpeg/libavcodec/qpeg.c
new file mode 100644
index 000000000..a2d7e4acc
--- /dev/null
+++ b/src/libffmpeg/libavcodec/qpeg.c
@@ -0,0 +1,401 @@
+/*
+ * QPEG codec
+ * Copyright (c) 2004 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/**
+ * @file qpeg.c
+ * QPEG codec.
+ */
+
+#include "avcodec.h"
+#include "mpegvideo.h"
+
+/** packet offset of the coded picture data; the byte before it is the frame type */
+#define QPEG_DATA_OFFSET 0x86
+
+/** Decoder state kept between packets. */
+typedef struct QpegContext{
+    AVCodecContext *avctx;
+    AVFrame pic;
+    uint8_t *refdata;   ///< width * height byte copy of the previous picture
+} QpegContext;
+
+/**
+ * Decode an intra frame.
+ *
+ * The stream is a sequence of run and copy codes; pixels are written left
+ * to right, starting at row height - 1 and moving towards row 0. Decoding
+ * stops at the end-of-picture code, when the input is used up, or when
+ * every row has been written.
+ *
+ * @param src    coded data
+ * @param dst    first row of the destination picture
+ * @param size   number of bytes available at src
+ * @param stride distance in bytes between two destination rows
+ */
+static void qpeg_decode_intra(uint8_t *src, uint8_t *dst, int size,
+                              int stride, int width, int height)
+{
+    int i;
+    int code;
+    int c0, c1;
+    int run, copy;
+    int filled = 0;
+    int rows_to_go = height;
+
+    dst = dst + (height - 1) * stride;
+
+    while(size > 0 && rows_to_go > 0) {
+        code = *src++;
+        size--;
+        run = copy = 0;
+        if(code == 0xFC) /* end-of-picture code */
+            break;
+        if(code >= 0xF8) { /* very long run */
+            if(size < 2)
+                break;
+            c0 = *src++;
+            c1 = *src++;
+            size -= 2;
+            run = ((code & 0x7) << 16) + (c0 << 8) + c1 + 2;
+        } else if (code >= 0xF0) { /* long run */
+            if(size < 1)
+                break;
+            c0 = *src++;
+            size--;
+            run = ((code & 0xF) << 8) + c0 + 2;
+        } else if (code >= 0xE0) { /* short run */
+            run = (code & 0x1F) + 2;
+        } else if (code >= 0xC0) { /* very long copy */
+            if(size < 2)
+                break;
+            c0 = *src++;
+            c1 = *src++;
+            size -= 2;
+            copy = ((code & 0x3F) << 16) + (c0 << 8) + c1 + 1;
+        } else if (code >= 0x80) { /* long copy */
+            if(size < 1)
+                break;
+            c0 = *src++;
+            size--;
+            copy = ((code & 0x7F) << 8) + c0 + 1;
+        } else { /* short copy */
+            copy = code + 1;
+        }
+
+        /* perform actual run or copy */
+        if(run) {
+            int p;
+
+            if(size < 1)
+                break;
+            p = *src++;
+            size--;
+            for(i = 0; i < run && rows_to_go > 0; i++) {
+                dst[filled++] = p;
+                if (filled >= width) {
+                    filled = 0;
+                    if(--rows_to_go > 0)
+                        dst -= stride;
+                }
+            }
+        } else {
+            /* never read more than the packet still holds */
+            if(copy > size)
+                copy = size;
+            for(i = 0; i < copy && rows_to_go > 0; i++) {
+                dst[filled++] = *src++;
+                if (filled >= width) {
+                    filled = 0;
+                    if(--rows_to_go > 0)
+                        dst -= stride;
+                }
+            }
+            size -= copy;
+        }
+    }
+}
+
+/** motion compensation block heights, indexed by the low nibble of the code */
+static const int qpeg_table_h[16] =
+ { 0x00, 0x20, 0x20, 0x20, 0x18, 0x10, 0x10, 0x20, 0x10, 0x08, 0x18, 0x08, 0x08, 0x18, 0x10, 0x04};
+/** motion compensation block widths, same indexing */
+static const int qpeg_table_w[16] =
+ { 0x00, 0x20, 0x18, 0x08, 0x18, 0x10, 0x20, 0x10, 0x08, 0x10, 0x20, 0x20, 0x08, 0x10, 0x18, 0x04};
+
+/**
+ * Decode a delta (inter) frame on top of the previous picture.
+ *
+ * dst is expected to still hold the previous picture on entry; it is
+ * first copied to refdata (width bytes per row, no padding) so that
+ * motion compensation reads pixels that have not been overwritten yet.
+ *
+ * @param delta   frame type byte; any non-zero value makes codes 0xF0..0xFF
+ *                motion compensation codes, blocks are only copied when it is 1
+ * @param ctable  lookup table applied to the literal codes 0x01..0x7F
+ * @param refdata scratch buffer of at least width * height bytes
+ */
+static void qpeg_decode_inter(uint8_t *src, uint8_t *dst, int size,
+                              int stride, int width, int height,
+                              int delta, uint8_t *ctable, uint8_t *refdata)
+{
+    int i, j;
+    int code;
+    int filled = 0;
+    const int orig_height = height;
+
+    /* copy prev frame */
+    for(i = 0; i < height; i++)
+        memcpy(refdata + (i * width), dst + (i * stride), width);
+
+    /* from here on height is the index of the row being written */
+    height--;
+    dst = dst + height * stride;
+
+    while(size > 0 && height >= 0) {
+        code = *src++;
+        size--;
+
+        if(delta) {
+            /* motion compensation */
+            while((code & 0xF0) == 0xF0) {
+                if(delta == 1) {
+                    int me_idx;
+                    int me_w, me_h, me_x, me_y;
+                    uint8_t *me_plane;
+                    int corr, val;
+
+                    /* get block size by index */
+                    me_idx = code & 0xF;
+                    me_w = qpeg_table_w[me_idx];
+                    me_h = qpeg_table_h[me_idx];
+
+                    /* extract motion vector */
+                    if(size < 1)
+                        return;
+                    corr = *src++;
+                    size--;
+
+                    val = corr >> 4;
+                    if(val > 7)
+                        val -= 16;
+                    me_x = val;
+
+                    val = corr & 0xF;
+                    if(val > 7)
+                        val -= 16;
+                    me_y = val;
+
+                    /* both the source block in refdata and the target
+                       block in dst must lie inside the picture */
+                    if(filled + me_x < 0 || filled + me_x + me_w > width ||
+                       height - me_y >= orig_height ||
+                       height - me_y - me_h + 1 < 0 ||
+                       filled + me_w > width || height - me_h + 1 < 0) {
+                        av_log(NULL, AV_LOG_ERROR, "Bogus motion vector (%i,%i), block size %ix%i at %i,%i\n",
+                               me_x, me_y, me_w, me_h, filled, height);
+                    } else {
+                        /* do motion compensation */
+                        me_plane = refdata + (filled + me_x) + (height - me_y) * width;
+                        for(j = 0; j < me_h; j++) {
+                            for(i = 0; i < me_w; i++)
+                                dst[filled + i - (j * stride)] = me_plane[i - (j * width)];
+                        }
+                    }
+                }
+                if(size < 1)
+                    return;
+                code = *src++;
+                size--;
+            }
+        }
+
+        if(code == 0xE0) /* end-of-picture code */
+            break;
+        if(code > 0xE0) { /* run code: 0xE1..0xFF */
+            int p;
+
+            code &= 0x1F;
+            if(size < 1)
+                break;
+            p = *src++;
+            size--;
+            for(i = 0; i <= code && height >= 0; i++) {
+                dst[filled++] = p;
+                if(filled >= width) {
+                    filled = 0;
+                    dst -= stride;
+                    height--;
+                }
+            }
+        } else if(code >= 0xC0) { /* copy code: 0xC0..0xDF */
+            code &= 0x1F;
+
+            if(size < code + 1)
+                break;
+            for(i = 0; i <= code && height >= 0; i++) {
+                dst[filled++] = *src++;
+                if(filled >= width) {
+                    filled = 0;
+                    dst -= stride;
+                    height--;
+                }
+            }
+            size -= code + 1;
+        } else if(code >= 0x80) { /* skip code: 0x80..0xBF */
+            int skip;
+
+            code &= 0x3F;
+            /* codes 0x80 and 0x81 are actually escape codes,
+               skip value minus constant is in the next byte */
+            if(code < 2) {
+                if(size < 1)
+                    break;
+                skip = (*src++) + (code ? 320 : 64);
+                size--; /* the escape byte counts as consumed input */
+            } else
+                skip = code;
+            filled += skip;
+            while(filled >= width && height >= 0) {
+                filled -= width;
+                dst -= stride;
+                height--;
+            }
+        } else {
+            /* zero code treated as one-pixel skip */
+            if(code)
+                dst[filled++] = ctable[code & 0x7F];
+            else
+                filled++;
+            if(filled >= width) {
+                filled = 0;
+                dst -= stride;
+                height--;
+            }
+        }
+    }
+}
+
+/**
+ * Decode one packet.
+ *
+ * Byte 0x85 of the packet selects the frame type: 0x10 is an intra frame,
+ * any other value is passed to the inter decoder as "delta". Coded data
+ * starts at offset 0x86; buf + 4 is handed to the inter decoder as its
+ * lookup table.
+ *
+ * @return buf_size on success, -1 on a short packet or get_buffer() failure
+ */
+static int decode_frame(AVCodecContext *avctx, 
+                        void *data, int *data_size,
+                        uint8_t *buf, int buf_size)
+{
+    QpegContext * const a = avctx->priv_data;
+    AVFrame * const p= (AVFrame*)&a->pic;
+    uint8_t* outdata;
+    int delta;
+
+    /* the frame type byte and the lookup table must be inside the packet */
+    if(buf_size < QPEG_DATA_OFFSET){
+        av_log(avctx, AV_LOG_ERROR, "Packet too small - %i bytes\n", buf_size);
+        return -1;
+    }
+
+    /* NOTE(review): inter frames read the previous picture from the buffer
+       returned by get_buffer() below; this presumably relies on the caller
+       handing back the same memory after release_buffer() - confirm */
+    if(p->data[0])
+        avctx->release_buffer(avctx, p);
+
+    p->reference= 0;
+    if(avctx->get_buffer(avctx, p) < 0){
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+    outdata = a->pic.data[0];
+    if(buf[QPEG_DATA_OFFSET - 1] == 0x10) {
+        qpeg_decode_intra(buf + QPEG_DATA_OFFSET, outdata, buf_size - QPEG_DATA_OFFSET, a->pic.linesize[0], avctx->width, avctx->height);
+    } else {
+        delta = buf[QPEG_DATA_OFFSET - 1];
+        qpeg_decode_inter(buf + QPEG_DATA_OFFSET, outdata, buf_size - QPEG_DATA_OFFSET, a->pic.linesize[0], avctx->width, avctx->height, delta, buf + 4, a->refdata);
+    }
+
+    /* make the palette available on the way out */
+    memcpy(a->pic.data[1], a->avctx->palctrl->palette, AVPALETTE_SIZE);
+    if (a->avctx->palctrl->palette_changed) {
+        a->pic.palette_has_changed = 1;
+        a->avctx->palctrl->palette_changed = 0;
+    }
+
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = a->pic;
+
+    return buf_size;
+}
+
+/**
+ * Select the PAL8 output format and allocate the reference buffer.
+ *
+ * @return 0 on success, -1 on rejected dimensions or allocation failure
+ */
+static int decode_init(AVCodecContext *avctx){
+    QpegContext * const a = avctx->priv_data;
+
+    /* validate before sizing the reference buffer from width * height */
+    if (avcodec_check_dimensions(avctx, avctx->width, avctx->height) < 0)
+        return -1;
+
+    a->avctx = avctx;
+    avctx->pix_fmt= PIX_FMT_PAL8;
+    avctx->has_b_frames = 0;
+    a->pic.data[0] = NULL;
+    a->refdata = av_malloc(avctx->width * avctx->height);
+    if (!a->refdata) {
+        av_log(avctx, AV_LOG_ERROR, "Cannot allocate reference buffer\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Release the last picture and free the reference buffer.
+ */
+static int decode_end(AVCodecContext *avctx){
+    QpegContext * const a = avctx->priv_data;
+    AVFrame * const p= (AVFrame*)&a->pic;
+
+    if(p->data[0])
+        avctx->release_buffer(avctx, p);
+
+    av_free(a->refdata);
+    return 0;
+}
+
+AVCodec qpeg_decoder = {
+    "qpeg",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_QPEG,
+    sizeof(QpegContext),
+    decode_init,
+    NULL,
+    decode_end,
+    decode_frame,
+    CODEC_CAP_DR1,
+};
diff --git a/src/libffmpeg/libavcodec/qtrle.c b/src/libffmpeg/libavcodec/qtrle.c
index 55fa98663..41e4120db 100644
--- a/src/libffmpeg/libavcodec/qtrle.c
+++ b/src/libffmpeg/libavcodec/qtrle.c
@@ -74,6 +74,92 @@ static void qtrle_decode_2bpp(QtrleContext *s)
 
 static void qtrle_decode_4bpp(QtrleContext *s)
 {
+    int stream_ptr;  /* read offset into s->buf */
+    int header;
+    int start_line;
+    int lines_to_change;
+    int rle_code;
+    int row_ptr, pixel_ptr;  /* offsets into rgb[]: start of the current row / next write position */
+    int row_inc = s->frame.linesize[0];
+    unsigned char pi1, pi2, pi3, pi4, pi5, pi6, pi7, pi8;  /* 8 palette indices */
+    unsigned char *rgb = s->frame.data[0];  /* output plane; receives one palette index per byte */
+    int pixel_limit = s->frame.linesize[0] * s->avctx->height;  /* not referenced by name below; presumably read by CHECK_PIXEL_PTR -- confirm */
+
+    /* check if this frame is even supposed to change */
+    if (s->size < 8)
+        return;
+
+    /* start after the chunk size */
+    stream_ptr = 4;
+
+    /* fetch the header */
+    CHECK_STREAM_PTR(2);  /* macro defined outside this hunk; presumably leaves the function when too few bytes remain */
+    header = BE_16(&s->buf[stream_ptr]);
+    stream_ptr += 2;
+
+    /* if a header is present, fetch additional decoding parameters */
+    if (header & 0x0008) {
+        CHECK_STREAM_PTR(8);
+        start_line = BE_16(&s->buf[stream_ptr]);
+        stream_ptr += 4;  /* 16-bit value plus two bytes that are skipped unread */
+        lines_to_change = BE_16(&s->buf[stream_ptr]);
+        stream_ptr += 4;  /* same layout: value plus two skipped bytes */
+    } else {
+        start_line = 0;
+        lines_to_change = s->avctx->height;
+    }
+
+    row_ptr = row_inc * start_line;
+    while (lines_to_change--) {
+        CHECK_STREAM_PTR(2);
+        pixel_ptr = row_ptr + (8 * (s->buf[stream_ptr++] - 1));  /* leading skip byte: (value - 1) groups of 8 pixels */
+
+        while ((rle_code = (signed char)s->buf[stream_ptr++]) != -1) {  /* -1 ends the line; NOTE(review): on later iterations this byte is read with no CHECK_STREAM_PTR directly before it */
+            if (rle_code == 0) {
+                /* there's another skip code in the stream */
+                CHECK_STREAM_PTR(1);
+                pixel_ptr += (8 * (s->buf[stream_ptr++] - 1));
+            } else if (rle_code < 0) {
+                /* decode the run length code */
+                rle_code = -rle_code;
+                /* get the next 4 bytes from the stream, split each into two 4-bit
+                 * palette indices, and output the 8 indices rle_code times */
+                CHECK_STREAM_PTR(4);
+                pi1 = ((s->buf[stream_ptr]) >> 4) & 0x0f;
+                pi2 = (s->buf[stream_ptr++]) & 0x0f;
+                pi3 = ((s->buf[stream_ptr]) >> 4) & 0x0f;
+                pi4 = (s->buf[stream_ptr++]) & 0x0f;
+                pi5 = ((s->buf[stream_ptr]) >> 4) & 0x0f;
+                pi6 = (s->buf[stream_ptr++]) & 0x0f;
+                pi7 = ((s->buf[stream_ptr]) >> 4) & 0x0f;
+                pi8 = (s->buf[stream_ptr++]) & 0x0f;
+
+                CHECK_PIXEL_PTR(rle_code * 8);
+
+                while (rle_code--) {
+                    rgb[pixel_ptr++] = pi1;
+                    rgb[pixel_ptr++] = pi2;
+                    rgb[pixel_ptr++] = pi3;
+                    rgb[pixel_ptr++] = pi4;
+                    rgb[pixel_ptr++] = pi5;
+                    rgb[pixel_ptr++] = pi6;
+                    rgb[pixel_ptr++] = pi7;
+                    rgb[pixel_ptr++] = pi8;
+                }
+            } else {
+                /* literal run: rle_code * 4 stream bytes follow, each unpacked to two pixels (high nibble first) */
+                rle_code *= 4;
+                CHECK_STREAM_PTR(rle_code);
+                CHECK_PIXEL_PTR(rle_code*2);
+
+                while (rle_code--) {
+                    rgb[pixel_ptr++] = ((s->buf[stream_ptr]) >> 4) & 0x0f;
+                    rgb[pixel_ptr++] = (s->buf[stream_ptr++]) & 0x0f;
+                }
+            }
+        }
+        row_ptr += row_inc;
+    }
 }
 
 static void qtrle_decode_8bpp(QtrleContext *s)
@@ -444,10 +530,6 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
 {
     QtrleContext *s = (QtrleContext *)avctx->priv_data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     s->buf = buf;
     s->size = buf_size;
 
@@ -473,6 +555,12 @@ static int qtrle_decode_frame(AVCodecContext *avctx,
     case 4:
     case 36:
         qtrle_decode_4bpp(s);
+        /* make the palette available on the way out */
+        memcpy(s->frame.data[1], s->avctx->palctrl->palette, AVPALETTE_SIZE);
+        if (s->avctx->palctrl->palette_changed) {
+            s->frame.palette_has_changed = 1;
+            s->avctx->palctrl->palette_changed = 0;
+        }
         break;
 
     case 8:
diff --git a/src/libffmpeg/libavcodec/ra144.c b/src/libffmpeg/libavcodec/ra144.c
index 65829b6a3..79cce2cef 100644
--- a/src/libffmpeg/libavcodec/ra144.c
+++ b/src/libffmpeg/libavcodec/ra144.c
@@ -130,7 +130,7 @@ static void do_voice(int *a1, int *a2)
 
 
 /* do quarter-block output */
-static void do_output_subblock(Real144_internal *glob, int x)
+static void do_output_subblock(Real144_internal *glob, unsigned int x)
 {
   int a,b,c,d,e,f,g;
 
diff --git a/src/libffmpeg/libavcodec/ra288.c b/src/libffmpeg/libavcodec/ra288.c
index 09ecc7aac..4cff3106e 100644
--- a/src/libffmpeg/libavcodec/ra288.c
+++ b/src/libffmpeg/libavcodec/ra288.c
@@ -47,7 +47,7 @@ static void colmult(float *tgt, float *m1, const float *m2, int n);
 
 
 /* initial decode */
-static void unpack(unsigned short *tgt, unsigned char *src, int len)
+static void unpack(unsigned short *tgt, unsigned char *src, unsigned int len)
 {
   int x,y,z;
   int n,temp;
diff --git a/src/libffmpeg/libavcodec/rangecoder.c b/src/libffmpeg/libavcodec/rangecoder.c
new file mode 100644
index 000000000..ba3022c45
--- /dev/null
+++ b/src/libffmpeg/libavcodec/rangecoder.c
@@ -0,0 +1,178 @@
+/*
+ * Range coder
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+ 
+/**
+ * @file rangecoder.c
+ * Range coder.
+ * based upon
+ *    "Range encoding: an algorithm for removing redundancy from a digitised
+ *                     message.
+ *     G. N. N. Martin                  Presented in March 1979 to the Video &
+ *                                      Data Recording Conference,
+ *     IBM UK Scientific Center         held in Southampton July 24-27 1979."
+ *
+ */
+
+#include <string.h>
+
+#include "avcodec.h"
+#include "common.h"
+#include "rangecoder.h"
+
+
+void ff_init_range_encoder(RangeCoder *c, uint8_t *buf, int buf_size){ /* resets the coder state and attaches it to buf[0 .. buf_size) */
+    c->bytestream_start= 
+    c->bytestream= buf;
+    c->bytestream_end= buf + buf_size;
+
+    c->low= 0;
+    c->range= 0xFF00;
+    c->outstanding_count= 0;
+    c->outstanding_byte= -1; /* -1: no byte is being held back yet (renorm_encoder() tests for < 0) */
+}
+
+void ff_init_range_decoder(RangeCoder *c, const uint8_t *buf, int buf_size){
+    ff_init_range_encoder(c, (uint8_t *)buf, buf_size); /* shared setup takes a non-const pointer; the decoder side (here, refill()) only reads through it */
+    /* prime 'low' with the first two stream bytes, most significant first. NOTE(review): nothing here checks buf_size >= 2 */
+    c->low =(*c->bytestream++)<<8;
+    c->low+= *c->bytestream++;
+}
+
+void ff_build_rac_states(RangeCoder *c, int factor, int max_p){ /* fills c->one_state and c->zero_state, the next-state tables used by put_rac()/get_rac() */
+    const int64_t one= 1LL<<32; /* fixed-point 1.0; p below uses the same scale, and factor is applied against it with a >>32 */
+    int64_t p;
+    int last_p8, p8, i;
+
+    memset(c->zero_state, 0, sizeof(c->zero_state));
+    memset(c-> one_state, 0, sizeof(c-> one_state));
+
+#if 0
+    for(i=1; i<256; i++){
+        if(c->one_state[i]) 
+            continue;
+        
+        p= (i*one + 128) >> 8;
+        last_p8= i;
+        for(;;){
+            p+= ((one-p)*factor + one/2) >> 32;
+            p8= (256*p + one/2) >> 32; //FIXME try without the one
+            if(p8 <= last_p8) p8= last_p8+1;
+            if(p8 > max_p) p8= max_p;
+            if(p8 < last_p8)
+                break;
+            c->one_state[last_p8]=     p8;
+            if(p8 == last_p8)
+                break;
+            last_p8= p8;
+        }
+    }
+#endif
+#if 1
+    last_p8= 0;
+    p= one/2; /* start the chain at probability 0.5 */
+    for(i=0; i<128; i++){
+        p8= (256*p + one/2) >> 32; //FIXME try without the one
+        if(p8 <= last_p8) p8= last_p8+1; /* force every step to move to a strictly higher state */
+        if(last_p8 && last_p8<256 && p8<=max_p)
+            c->one_state[last_p8]= p8;
+        
+        p+= ((one-p)*factor + one/2) >> 32; /* move p towards 1.0 by the fraction factor/2^32 */
+        last_p8= p8;
+    }
+#endif
+    for(i=256-max_p; i<=max_p; i++){ /* fill the states the chain above did not visit */
+        if(c->one_state[i]) 
+            continue;
+
+        p= (i*one + 128) >> 8;
+        p+= ((one-p)*factor + one/2) >> 32;
+        p8= (256*p + one/2) >> 32; //FIXME try without the one
+        if(p8 <= i) p8= i+1;
+        if(p8 > max_p) p8= max_p;
+        c->one_state[    i]=     p8;
+    }
+    /* mirror one_state into zero_state; start at 1 because i==0 would read one_state[256], one past the 256-entry table */
+    for(i=1; i<256; i++)
+        c->zero_state[i]= 256-c->one_state[256-i];
+#if 0
+    for(i=0; i<256; i++)
+        av_log(NULL, AV_LOG_DEBUG, "%3d %3d\n", i, c->one_state[i]);
+#endif
+}
+
+/**
+ * Finishes encoding: range is forced below 0x100 twice so that each renorm_encoder() call performs one renormalisation step.
+ * @return the number of bytes written
+ */
+int ff_rac_terminate(RangeCoder *c){
+    c->range=0xFF;
+    c->low +=0xFF;
+    renorm_encoder(c);
+    c->range=0xFF;
+    renorm_encoder(c);
+
+    assert(c->low   == 0);
+    assert(c->range >= 0x100);
+
+    return c->bytestream - c->bytestream_start; /* distance of the write pointer from the start of the output buffer */
+}
+
+#if 0 //selftest
+#define SIZE 10240
+int main(){ /* disabled round-trip check: encodes SIZE bits with put_rac(), decodes them with get_rac() and logs mismatches */
+    RangeCoder c;
+    uint8_t b[9*SIZE];
+    uint8_t r[9*SIZE];
+    int i;
+    uint8_t state[10]= {0};
+    
+    ff_init_range_encoder(&c, b, SIZE);
+    ff_build_rac_states(&c, 0.05*(1LL<<32), 128+64+32+16);
+    
+    memset(state, 128, sizeof(state));
+
+    for(i=0; i<SIZE; i++){
+        r[i]= random()%7;
+    }
+    
+  
+    for(i=0; i<SIZE; i++){
+START_TIMER
+        put_rac(&c, state, r[i]&1);
+STOP_TIMER("put_rac")
+    }
+
+    ff_rac_terminate(&c); /* the terminate function declared in rangecoder.h is ff_rac_terminate() */
+    
+    ff_init_range_decoder(&c, b, SIZE);
+    
+    memset(state, 128, sizeof(state)); /* decoder must start from the same adaptive state as the encoder */
+    
+    for(i=0; i<SIZE; i++){
+START_TIMER
+        if( (r[i]&1) != get_rac(&c, state) )
+            av_log(NULL, AV_LOG_DEBUG, "rac failure at %d\n", i);
+STOP_TIMER("get_rac")
+    }
+    
+    return 0;
+}
+
+#endif
diff --git a/src/libffmpeg/libavcodec/rangecoder.h b/src/libffmpeg/libavcodec/rangecoder.h
new file mode 100644
index 000000000..6fd7b43bf
--- /dev/null
+++ b/src/libffmpeg/libavcodec/rangecoder.h
@@ -0,0 +1,125 @@
+/*
+ * Range coder
+ * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+ 
+/**
+ * @file rangecoder.h
+ * Range coder.
+ */
+
+typedef struct RangeCoder{ /* state shared by the range encoder and decoder */
+    int low; /* encoder: low end of the current interval; decoder: value compared against range in get_rac() */
+    int range; /* width of the current interval; renormalised once it drops below 0x100 */
+    int outstanding_count; /* encoder: bytes deferred by renorm_encoder(), later written as 0xFF or 0x00 */
+    int outstanding_byte; /* encoder: byte held back before being written; -1 while none is held */
+    uint8_t zero_state[256]; /* next state after coding a 0 bit, indexed by current state */
+    uint8_t  one_state[256]; /* next state after coding a 1 bit, indexed by current state */
+    uint8_t *bytestream_start; /* first byte of the buffer */
+    uint8_t *bytestream; /* current read/write position */
+    uint8_t *bytestream_end; /* one past the last byte of the buffer */
+}RangeCoder;
+
+void ff_init_range_encoder(RangeCoder *c, uint8_t *buf, int buf_size); /* reset state and attach an output buffer */
+void ff_init_range_decoder(RangeCoder *c, const uint8_t *buf, int buf_size); /* attach an input buffer and load its first two bytes into low */
+int ff_rac_terminate(RangeCoder *c); /* finish encoding; returns the number of bytes written */
+void ff_build_rac_states(RangeCoder *c, int factor, int max_p); /* build the zero_state/one_state transition tables */
+
+static inline void renorm_encoder(RangeCoder *c){ /* shifts bytes out of low while range is below 0x100. NOTE(review): writes are not checked against bytestream_end */
+    //FIXME optimize
+    while(c->range < 0x100){
+        if(c->outstanding_byte < 0){ /* nothing held yet: just keep the first byte back */
+            c->outstanding_byte= c->low>>8;
+        }else if(c->low <= 0xFF00){ /* write the held byte unchanged, deferred bytes as 0xFF */
+            *c->bytestream++ = c->outstanding_byte;
+            for(;c->outstanding_count; c->outstanding_count--)
+                *c->bytestream++ = 0xFF;
+            c->outstanding_byte= c->low>>8;
+        }else if(c->low >= 0x10000){ /* low overflowed past 16 bits: write the held byte + 1, deferred bytes as 0x00 */
+            *c->bytestream++ = c->outstanding_byte + 1;
+            for(;c->outstanding_count; c->outstanding_count--)
+                *c->bytestream++ = 0x00;
+            c->outstanding_byte= (c->low>>8) & 0xFF;
+        }else{ /* 0xFF00 < low < 0x10000: neither case decided yet, defer one more byte */
+            c->outstanding_count++;
+        }
+        
+        c->low = (c->low & 0xFF)<<8; /* keep only the low 8 bits, moved up one byte */
+        c->range <<= 8;
+    }
+}
+
+static inline void put_rac(RangeCoder *c, uint8_t * const state, int bit){ /* encodes one bit using the adaptive state *state and updates that state */
+    int range1= (c->range * (*state)) >> 8; /* share of the interval given to a 1 bit: range * state / 256 */
+
+    assert(*state);
+    assert(range1 < c->range);
+    assert(range1 > 0);
+    if(!bit){
+        c->range -= range1; /* 0 bit: keep the lower part of the interval */
+        *state= c->zero_state[*state];
+    }else{
+        c->low += c->range - range1; /* 1 bit: move to the upper part of the interval */
+        c->range = range1;
+        *state= c->one_state[*state];
+    }
+    
+    renorm_encoder(c);
+}
+
+static inline void refill(RangeCoder *c){ /* decoder renormalisation: pulls in at most one input byte per call */
+    if(c->range < 0x100){
+        c->range <<= 8;
+        c->low <<= 8;
+        if(c->bytestream < c->bytestream_end) /* past the end of the buffer nothing is added, so zero bits are shifted in */
+            c->low+= c->bytestream[0];
+        c->bytestream++; /* advanced even when no byte was read */
+    }
+}
+
+static inline int get_rac(RangeCoder *c, uint8_t * const state){ /* decodes one bit using the adaptive state *state, updates that state, returns 0 or 1 */
+    int range1= (c->range * (*state)) >> 8; /* share of the interval given to a 1 bit, computed as in put_rac() */
+    int attribute_unused one_mask; /* only used by the disabled #else variant below */
+    
+    c->range -= range1;
+#if 1
+    if(c->low < c->range){ /* value lies in the lower part: bit is 0 */
+        *state= c->zero_state[*state];
+        refill(c);
+        return 0;
+    }else{ /* value lies in the upper part: bit is 1 */
+        c->low -= c->range;
+        *state= c->one_state[*state];
+        c->range = range1;
+        refill(c);
+        return 1;
+    }
+#else
+    one_mask= (c->range - c->low-1)>>31;
+    
+    c->low -= c->range & one_mask;
+    c->range += (range1 - c->range) & one_mask;
+    
+    *state= c->zero_state[(*state) + (256&one_mask)]; /* NOTE(review): indexes past zero_state[255]; appears to rely on one_state following it in the struct */
+    
+    refill(c);
+
+    return one_mask&1;
+#endif
+}
+
diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c
index 473645def..19641d453 100644
--- a/src/libffmpeg/libavcodec/ratecontrol.c
+++ b/src/libffmpeg/libavcodec/ratecontrol.c
@@ -38,7 +38,7 @@ static int init_pass2(MpegEncContext *s);
 static double get_qscale(MpegEncContext *s, RateControlEntry *rce, double rate_factor, int frame_num);
 
 void ff_write_pass1_stats(MpegEncContext *s){
-    sprintf(s->avctx->stats_out, "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d;\n",
+    snprintf(s->avctx->stats_out, 256, "in:%d out:%d type:%d q:%d itex:%d ptex:%d mv:%d misc:%d fcode:%d bcode:%d mc-var:%d var:%d icount:%d;\n",
             s->current_picture_ptr->display_picture_number, s->current_picture_ptr->coded_picture_number, s->pict_type, 
             s->current_picture.quality, s->i_tex_bits, s->p_tex_bits, s->mv_bits, s->misc_bits, 
             s->f_code, s->b_code, s->current_picture.mc_mb_var_sum, s->current_picture.mb_var_sum, s->i_count);
@@ -74,6 +74,8 @@ int ff_rate_control_init(MpegEncContext *s)
             p= strchr(p+1, ';');
         }
         i+= s->max_b_frames;
+        if(i<=0 || i>=INT_MAX / sizeof(RateControlEntry))
+            return -1;
         rcc->entry = (RateControlEntry*)av_mallocz(i*sizeof(RateControlEntry));
         rcc->num_entries= i;
         
@@ -499,13 +501,16 @@ static void adaptive_quantization(MpegEncContext *s, double q){
     const float temp_cplx_masking= s->avctx->temporal_cplx_masking;
     const float spatial_cplx_masking = s->avctx->spatial_cplx_masking;
     const float p_masking = s->avctx->p_masking;
+    const float border_masking = s->avctx->border_masking;
     float bits_sum= 0.0;
     float cplx_sum= 0.0;
     float cplx_tab[s->mb_num];
     float bits_tab[s->mb_num];
-    const int qmin= s->avctx->lmin;
-    const int qmax= s->avctx->lmax;
+    const int qmin= s->avctx->mb_lmin;
+    const int qmax= s->avctx->mb_lmax;
     Picture * const pic= &s->current_picture;
+    const int mb_width = s->mb_width;
+    const int mb_height = s->mb_height;
     
     for(i=0; i<s->mb_num; i++){
         const int mb_xy= s->mb_index2xy[i];
@@ -513,6 +518,10 @@ static void adaptive_quantization(MpegEncContext *s, double q){
         float spat_cplx= sqrt(pic->mb_var[mb_xy]);
         const int lumi= pic->mb_mean[mb_xy];
         float bits, cplx, factor;
+        int mb_x = mb_xy % s->mb_stride;
+        int mb_y = mb_xy / s->mb_stride;
+        int mb_distance;
+        float mb_factor = 0.0;
 #if 0        
         if(spat_cplx < q/3) spat_cplx= q/3; //FIXME finetune
         if(temp_cplx < q/3) temp_cplx= q/3; //FIXME finetune
@@ -533,6 +542,23 @@ static void adaptive_quantization(MpegEncContext *s, double q){
             factor*= (1.0 - (lumi-128)*(lumi-128)*lumi_masking);
         else
             factor*= (1.0 - (lumi-128)*(lumi-128)*dark_masking);
+
+        if(mb_x < mb_width/5){
+            mb_distance = mb_width/5 - mb_x;
+            mb_factor = (float)mb_distance / (float)(mb_width/5);
+        }else if(mb_x > 4*mb_width/5){
+            mb_distance = mb_x - 4*mb_width/5;
+            mb_factor = (float)mb_distance / (float)(mb_width/5);
+        }
+        if(mb_y < mb_height/5){
+            mb_distance = mb_height/5 - mb_y;
+            mb_factor = FFMAX(mb_factor, (float)mb_distance / (float)(mb_height/5));
+        }else if(mb_y > 4*mb_height/5){
+            mb_distance = mb_y - 4*mb_height/5;
+            mb_factor = FFMAX(mb_factor, (float)mb_distance / (float)(mb_height/5));
+        }
+
+        factor*= 1.0 - border_masking*mb_factor;
         
         if(factor<0.00001) factor= 0.00001;
         
diff --git a/src/libffmpeg/libavcodec/rational.c b/src/libffmpeg/libavcodec/rational.c
index ad085653a..7ccad9e38 100644
--- a/src/libffmpeg/libavcodec/rational.c
+++ b/src/libffmpeg/libavcodec/rational.c
@@ -31,21 +31,33 @@
 #include "avcodec.h"
 #include "rational.h"
 
+/**
+ * returns b*c; the 64-bit products are passed through av_reduce() with INT_MAX as the limit.
+ */
 AVRational av_mul_q(AVRational b, AVRational c){
     av_reduce(&b.num, &b.den, b.num * (int64_t)c.num, b.den * (int64_t)c.den, INT_MAX);
     return b;
 }
 
+/**
+ * returns b/c, computed as b multiplied by c with its num and den swapped.
+ */
 AVRational av_div_q(AVRational b, AVRational c){
     av_reduce(&b.num, &b.den, b.num * (int64_t)c.den, b.den * (int64_t)c.num, INT_MAX);
     return b;
 }
 
+/**
+ * returns b+c, computed over the common denominator b.den*c.den in 64-bit arithmetic.
+ */
 AVRational av_add_q(AVRational b, AVRational c){
     av_reduce(&b.num, &b.den, b.num * (int64_t)c.den + c.num * (int64_t)b.den, b.den * (int64_t)c.den, INT_MAX);
     return b;
 }
 
+/**
+ * returns b-c.
+ */
 AVRational av_sub_q(AVRational b, AVRational c){
     av_reduce(&b.num, &b.den, b.num * (int64_t)c.den - c.num * (int64_t)b.den, b.den * (int64_t)c.den, INT_MAX);
     return b;
diff --git a/src/libffmpeg/libavcodec/rational.h b/src/libffmpeg/libavcodec/rational.h
index d5fc77f1a..fcda759c4 100644
--- a/src/libffmpeg/libavcodec/rational.h
+++ b/src/libffmpeg/libavcodec/rational.h
@@ -27,19 +27,27 @@
 #ifndef RATIONAL_H
 #define RATIONAL_H
 
+/**
+ * Rational number num/den.
+ */
 typedef struct AVRational{
-    int num; 
-    int den;
+    int num; ///< numerator
+    int den; ///< denominator
 } AVRational;
 
+/**
+ * returns 0 if a==b, 1 if a>b and -1 if a<b.
+ */
 static inline int av_cmp_q(AVRational a, AVRational b){
     const int64_t tmp= a.num * (int64_t)b.den - b.num * (int64_t)a.den;
 
-    if     (tmp <  0) return -1;
-    else if(tmp == 0) return  0;
-    else              return  1;
+    if(tmp) return (tmp>>63)|1; /* sign of tmp as -1 or 1; relies on >> of a negative int64_t being an arithmetic shift (implementation-defined in C) */
+    else    return 0;
 }
 
+/**
+ * converts the given AVRational to a double.
+ */
 static inline double av_q2d(AVRational a){
     return a.num / (double) a.den;
 }
diff --git a/src/libffmpeg/libavcodec/raw.c b/src/libffmpeg/libavcodec/raw.c
index 8c554c41c..957a809d8 100644
--- a/src/libffmpeg/libavcodec/raw.c
+++ b/src/libffmpeg/libavcodec/raw.c
@@ -47,7 +47,7 @@ const PixelFormatTag pixelFormatTags[] = {
 
 
     { PIX_FMT_YUV422,  MKTAG('Y', '4', '2', '2') }, /* Packed formats */
-    { PIX_FMT_YUV422,  MKTAG('U', 'Y', 'V', 'Y') },
+    { PIX_FMT_UYVY422, MKTAG('U', 'Y', 'V', 'Y') },
     { PIX_FMT_GRAY8,   MKTAG('G', 'R', 'E', 'Y') },
 
     { -1, 0 },
@@ -64,7 +64,7 @@ static int findPixelFormat(unsigned int fourcc)
     return PIX_FMT_YUV420P;
 }
 
-static unsigned int findFourCC(int fmt)
+unsigned int avcodec_pix_fmt_to_codec_tag(enum PixelFormat fmt)
 {
     const PixelFormatTag * tags = pixelFormatTags;
     while (tags->pix_fmt >= 0) {
@@ -83,6 +83,14 @@ static int raw_init_decoder(AVCodecContext *avctx)
 
     if (avctx->codec_tag)
         avctx->pix_fmt = findPixelFormat(avctx->codec_tag);
+    else if (avctx->bits_per_sample){
+        switch(avctx->bits_per_sample){
+        case 15: avctx->pix_fmt= PIX_FMT_RGB555; break;
+        case 16: avctx->pix_fmt= PIX_FMT_RGB565; break;
+        case 24: avctx->pix_fmt= PIX_FMT_BGR24 ; break;
+        case 32: avctx->pix_fmt= PIX_FMT_RGBA32; break;
+        }
+    }
     
     context->length = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
     context->buffer = av_malloc(context->length);
@@ -98,6 +106,13 @@ static int raw_init_decoder(AVCodecContext *avctx)
     return 0;
 }
 
+static void flip(AVCodecContext *avctx, AVPicture * picture){ /* reverses the row order of plane 0 by rewriting its pointer and stride; no pixel data is copied */
+    if(!avctx->codec_tag && avctx->bits_per_sample && picture->linesize[1]==0){ /* only for untagged streams with a bit depth set and no stride on plane 1 */
+        picture->data[0] += picture->linesize[0] * (avctx->height-1); /* point at the last row */
+        picture->linesize[0] *= -1; /* negative stride: stepping by linesize now walks towards the first row */
+    }
+}
+
 static int raw_decode(AVCodecContext *avctx,
 			    void *data, int *data_size,
 			    uint8_t *buf, int buf_size)
@@ -110,6 +125,7 @@ static int raw_decode(AVCodecContext *avctx,
     /* Early out without copy if packet size == frame size */
     if (buf_size == context->length  &&  context->p == context->buffer) {
         avpicture_fill(picture, buf, avctx->pix_fmt, avctx->width, avctx->height);
+        flip(avctx, picture);        
         *data_size = sizeof(AVPicture);
         return buf_size;
     }
@@ -124,6 +140,7 @@ static int raw_decode(AVCodecContext *avctx,
     memcpy(context->p, buf, bytesNeeded);
     context->p = context->buffer;
     avpicture_fill(picture, context->buffer, avctx->pix_fmt, avctx->width, avctx->height);
+    flip(avctx, picture);        
     *data_size = sizeof(AVPicture);
     return bytesNeeded;
 }
@@ -143,7 +160,8 @@ static int raw_init_encoder(AVCodecContext *avctx)
     avctx->coded_frame = (AVFrame *)avctx->priv_data;
     avctx->coded_frame->pict_type = FF_I_TYPE;
     avctx->coded_frame->key_frame = 1;
-    avctx->codec_tag = findFourCC(avctx->pix_fmt);
+    if(!avctx->codec_tag)
+        avctx->codec_tag = avcodec_pix_fmt_to_codec_tag(avctx->pix_fmt);
     return 0;
 }
 
diff --git a/src/libffmpeg/libavcodec/rpza.c b/src/libffmpeg/libavcodec/rpza.c
index 2be26346a..317c240b9 100644
--- a/src/libffmpeg/libavcodec/rpza.c
+++ b/src/libffmpeg/libavcodec/rpza.c
@@ -248,10 +248,6 @@ static int rpza_decode_frame(AVCodecContext *avctx,
 {
     RpzaContext *s = (RpzaContext *)avctx->priv_data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     s->buf = buf;
     s->size = buf_size;
 
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 58c5db7f4..884be9c7c 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -258,6 +258,36 @@ void rv10_encode_picture_header(MpegEncContext *s, int picture_number)
     put_bits(&s->pb, 3, 0);	/* ignored */
 }
 
+void rv20_encode_picture_header(MpegEncContext *s, int picture_number){ /* writes the RV20 picture header bits to s->pb, then selects the DC scale tables */
+    put_bits(&s->pb, 2, s->pict_type); //I 0 vs. 1 ?
+    put_bits(&s->pb, 1, 0);	/* unknown bit */
+    put_bits(&s->pb, 5, s->qscale);
+        
+    put_bits(&s->pb, 8, picture_number&0xFF); //FIXME wrong, but correct is not known
+    s->mb_x= s->mb_y= 0; /* reset the macroblock position before ff_h263_encode_mba(); presumably that call writes the macroblock address -- confirm */
+    ff_h263_encode_mba(s);
+    
+    put_bits(&s->pb, 1, s->no_rounding);
+    
+    assert(s->f_code == 1);
+    assert(s->unrestricted_mv == 1);
+//    assert(s->h263_aic== (s->pict_type == I_TYPE));
+    assert(s->alt_inter_vlc == 0);
+    assert(s->umvplus == 0);
+    assert(s->modified_quant==1);
+    assert(s->loop_filter==1);
+
+    s->h263_aic= s->pict_type == I_TYPE; /* h263_aic is set for I pictures only */
+    if(s->h263_aic){
+        s->y_dc_scale_table= 
+        s->c_dc_scale_table= ff_aic_dc_scale_table;
+    }else{
+        s->y_dc_scale_table=
+        s->c_dc_scale_table= ff_mpeg1_dc_scale_table;
+    }
+}
+
+#if 0 /* unused, remove? */
 static int get_num(GetBitContext *gb)
 {
     int n, n1;
@@ -270,15 +300,15 @@ static int get_num(GetBitContext *gb)
         return (n << 16) | n1;
     }
 }
+#endif
 
 #endif //CONFIG_ENCODERS
 
 /* read RV 1.0 compatible frame header */
 static int rv10_decode_picture_header(MpegEncContext *s)
 {
-    int mb_count, pb_frame, marker, full_frame, unk;
+    int mb_count, pb_frame, marker, unk, mb_xy;
     
-    full_frame= s->avctx->slice_count==1;
 //printf("ff:%d\n", full_frame);
     marker = get_bits(&s->gb, 1);
 
@@ -321,7 +351,9 @@ static int rv10_decode_picture_header(MpegEncContext *s)
     }
     /* if multiple packets per frame are sent, the position at which
        to display the macro blocks is coded here */
-    if ((!full_frame) || show_bits(&s->gb, 12)==0) {
+
+    mb_xy= s->mb_x + s->mb_y*s->mb_width;
+    if(show_bits(&s->gb, 12)==0 || (mb_xy && mb_xy < s->mb_num)){
         s->mb_x = get_bits(&s->gb, 6);	/* mb_x */
         s->mb_y = get_bits(&s->gb, 6);	/* mb_y */
         mb_count = get_bits(&s->gb, 12);
@@ -342,6 +374,22 @@ static int rv20_decode_picture_header(MpegEncContext *s)
 {
     int seq, mb_pos, i;
     
+#if 0
+    GetBitContext gb= s->gb;
+    for(i=0; i<64; i++){
+        av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&gb));
+        if(i%4==3) av_log(s->avctx, AV_LOG_DEBUG, " ");
+    }
+    av_log(s->avctx, AV_LOG_DEBUG, "\n");
+#endif
+#if 0
+    for(i=0; i<s->avctx->extradata_size; i++){
+        av_log(s->avctx, AV_LOG_DEBUG, "%2X ", ((uint8_t*)s->avctx->extradata)[i]);
+        if(i%4==3) av_log(s->avctx, AV_LOG_DEBUG, " ");
+    }
+    av_log(s->avctx, AV_LOG_DEBUG, "\n");
+#endif
+    
     if(s->avctx->sub_id == 0x30202002 || s->avctx->sub_id == 0x30203002){
         if (get_bits(&s->gb, 3)){
             av_log(s->avctx, AV_LOG_ERROR, "unknown triplet set\n");
@@ -383,15 +431,23 @@ static int rv20_decode_picture_header(MpegEncContext *s)
     }
         
     if(s->avctx->has_b_frames){
+        int f=9;
+        int v= s->avctx->extradata_size >= 4 ? ((uint8_t*)s->avctx->extradata)[1] : 0;
+
         if (get_bits(&s->gb, 1)){
-//            av_log(s->avctx, AV_LOG_ERROR, "unknown bit3 set\n");
+            av_log(s->avctx, AV_LOG_ERROR, "unknown bit3 set\n");
 //            return -1;
         }
-        seq= get_bits(&s->gb, 15);
-        if (s->avctx->sub_id == 0x20201002 && get_bits(&s->gb, 1)){
-            av_log(s->avctx, AV_LOG_ERROR, "unknown bit4 set\n");
-//            return -1;
+        seq= get_bits(&s->gb, 14)<<1;
+
+        if(v>1 || (s->avctx->sub_id < 0x20201002 && v>0)){
+            f= get_bits(&s->gb, av_log2(v-1)+1);
         }
+        
+        if(s->avctx->debug & FF_DEBUG_PICT_INFO){
+            av_log(s->avctx, AV_LOG_DEBUG, "F %d/%d\n", f, v);
+        }
+
         mb_pos= get_bits(&s->gb, av_log2(s->mb_num-1)+1);
         s->mb_x= mb_pos % s->mb_width;
         s->mb_y= mb_pos / s->mb_width;
@@ -482,6 +538,7 @@ static int rv10_decode_init(AVCodecContext *avctx)
     case 0x20001000:
     case 0x20100001:
     case 0x20101001:
+    case 0x20103001:
         s->low_delay=1;
         break;
     case 0x20200002:
@@ -494,7 +551,11 @@ static int rv10_decode_init(AVCodecContext *avctx)
     default:
         av_log(s->avctx, AV_LOG_ERROR, "unknown header %X\n", avctx->sub_id);
     }
-//printf("ver:%X\n", avctx->sub_id);
+    
+    if(avctx->debug & FF_DEBUG_PICT_INFO){
+        av_log(avctx, AV_LOG_DEBUG, "ver:%X ver0:%X\n", avctx->sub_id, avctx->extradata_size >= 4 ? ((uint32_t*)avctx->extradata)[0] : -1);
+    }
+    
     if (MPV_common_init(s) < 0)
         return -1;
 
@@ -504,10 +565,10 @@ static int rv10_decode_init(AVCodecContext *avctx)
     if (!done) {
         init_vlc(&rv_dc_lum, DC_VLC_BITS, 256, 
                  rv_lum_bits, 1, 1,
-                 rv_lum_code, 2, 2);
+                 rv_lum_code, 2, 2, 1);
         init_vlc(&rv_dc_chrom, DC_VLC_BITS, 256, 
                  rv_chrom_bits, 1, 1,
-                 rv_chrom_code, 2, 2);
+                 rv_chrom_code, 2, 2, 1);
         done = 1;
     }
     
@@ -676,8 +737,8 @@ static int rv10_decode_frame(AVCodecContext *avctx,
             *pict= *(AVFrame*)&s->last_picture;
             ff_print_debug_info(s, pict);
         }
-        
-        *data_size = sizeof(AVFrame);
+        if(s->last_picture_ptr || s->low_delay)
+            *data_size = sizeof(AVFrame);
     }
 
     return buf_size;
@@ -704,6 +765,7 @@ AVCodec rv20_decoder = {
     NULL,
     rv10_decode_end,
     rv10_decode_frame,
-    CODEC_CAP_DR1
+    CODEC_CAP_DR1 | CODEC_CAP_DELAY,
+    .flush= ff_mpeg_flush,
 };
 
diff --git a/src/libffmpeg/libavcodec/shorten.c b/src/libffmpeg/libavcodec/shorten.c
new file mode 100644
index 000000000..b523a9250
--- /dev/null
+++ b/src/libffmpeg/libavcodec/shorten.c
@@ -0,0 +1,521 @@
+/*
+ * Shorten decoder
+ * Copyright (c) 2005 Jeff Muizelaar
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**
+ * @file shorten.c
+ * Shorten decoder
+ * @author Jeff Muizelaar
+ *
+ */
+
+#define DEBUG
+#include <limits.h>
+#include "avcodec.h"
+#include "bitstream.h"
+#include "golomb.h"
+
+#define MAX_CHANNELS 8
+#define MAX_BLOCKSIZE 65535
+
+#define OUT_BUFFER_SIZE 16384
+
+#define ULONGSIZE 2
+
+#define WAVE_FORMAT_PCM 0x0001
+
+#define DEFAULT_BLOCK_SIZE 256
+
+#define TYPESIZE 4
+#define CHANSIZE 0
+#define LPCQSIZE 2
+#define ENERGYSIZE 3
+#define BITSHIFTSIZE 2
+
+#define TYPE_S16HL 3
+#define TYPE_S16LH 5
+
+#define NWRAP 3
+#define NSKIPSIZE 1
+
+#define LPCQUANT 5
+#define V2LPCQOFFSET (1 << LPCQUANT)
+
+#define FNSIZE 2
+#define FN_DIFF0        0
+#define FN_DIFF1        1
+#define FN_DIFF2        2
+#define FN_DIFF3        3
+#define FN_QUIT         4
+#define FN_BLOCKSIZE    5
+#define FN_BITSHIFT     6
+#define FN_QLPC         7
+#define FN_ZERO         8
+#define FN_VERBATIM     9
+
+#define VERBATIM_CKSIZE_SIZE 5
+#define VERBATIM_BYTE_SIZE 8
+#define CANONICAL_HEADER_SIZE 44
+
+typedef struct ShortenContext {
+    AVCodecContext *avctx; /* owning codec context, stored by shorten_decode_init() */
+    GetBitContext gb; /* bit reader, re-initialised over the buffered input on every decode call */
+
+    int min_framesize, max_framesize; /* max_framesize: bytes that must be buffered before a decode step runs */
+    int channels; /* channel count read from the shorten stream header */
+
+    int32_t *decoded[MAX_CHANNELS]; /* per-channel sample blocks; each pointer sits nwrap entries into its allocation */
+    int32_t *offset[MAX_CHANNELS]; /* per-channel history of block means, FFMAX(1, nmean) entries each */
+    uint8_t *bitstream; /* buffer that incoming packets are appended to */
+    int bitstream_size; /* number of pending bytes in bitstream */
+    int bitstream_index; /* position of the first pending byte in bitstream */
+    int allocated_bitstream_size; /* capacity of bitstream, maintained through av_fast_realloc() */
+    int header_size; /* number of bytes stored in header[] */
+    uint8_t header[OUT_BUFFER_SIZE]; /* verbatim section from the start of the stream, parsed as a WAVE header */
+    int version; /* version byte that follows the 'ajkg' magic */
+    int cur_chan; /* channel the next audio command decodes into */
+    int bitshift; /* left shift applied to decoded samples, set by FN_BITSHIFT */
+    int nmean; /* number of block means averaged into the offset; stays -1 for version 0 */
+    int internal_ftype; /* sample type code from the header; TYPE_S16HL and TYPE_S16LH are accepted */
+    int nwrap; /* history samples kept in front of each block: FFMAX(NWRAP, maxnlpc) */
+    int blocksize; /* samples per channel per block; 0 until the stream header has been parsed */
+    int bitindex; /* bits to skip at the start of the buffered data (position left by the previous call) */
+    int32_t lpcqoffset; /* start value of the LPC sum; V2LPCQOFFSET for version > 1, else 0 */
+} ShortenContext;
+
+static int shorten_decode_init(AVCodecContext * avctx)
+{
+    /* Only links the private decoder state back to its codec context. */
+    ShortenContext *ctx = avctx->priv_data;
+    ctx->avctx = avctx;
+    return 0;
+}
+
+static void allocate_buffers(ShortenContext *s)
+{
+    int ch, k;
+    int32_t *block; /* NOTE(review): the av_realloc() results below are used unchecked */
+    for (ch = 0; ch < s->channels; ch++) {
+        s->offset[ch] = av_realloc(s->offset[ch], sizeof(int32_t)*FFMAX(1, s->nmean));
+        block = av_realloc(s->decoded[ch], sizeof(int32_t)*(s->blocksize + s->nwrap));
+        for (k = 0; k < s->nwrap; k++)
+            block[k] = 0;
+        /* publish the pointer past the nwrap zeroed slots, so indices -nwrap..-1 are valid */
+        s->decoded[ch] = block + s->nwrap;
+    }
+}
+
+
+static inline unsigned int get_uint(ShortenContext *s, int k)
+{
+    /* for any version other than 0, k is replaced by a value read from the stream */
+    const int param = s->version != 0 ? (int)get_ur_golomb_shorten(&s->gb, ULONGSIZE) : k;
+    return get_ur_golomb_shorten(&s->gb, param);
+}
+
+
+static void fix_bitshift(ShortenContext *s, int32_t *buffer)
+{
+    int i;
+    /* Scale one block in place. buffer addresses sample 0, so the block is buffer[0..blocksize-1]. */
+    if (s->bitshift != 0)
+        for (i = 0; i < s->blocksize; i++)
+            buffer[i] <<= s->bitshift;
+}
+
+
+static void init_offset(ShortenContext *s)
+{
+    /* Fill every channel's mean history with the start value for the sample type. */
+    const int nblock = FFMAX(1, s->nmean);
+    int32_t mean;
+    int chan, i;
+
+    if (s->internal_ftype == TYPE_S16HL || s->internal_ftype == TYPE_S16LH) {
+        /* both accepted sample types start from a mean of zero */
+        mean = 0;
+    } else {
+        /* NOTE(review): any other type code terminates the whole process */
+        av_log(s->avctx, AV_LOG_ERROR, "unknown audio type");
+        abort();
+    }
+
+    for (chan = 0; chan < s->channels; chan++) {
+        for (i = 0; i < nblock; i++)
+            s->offset[chan][i] = mean;
+    }
+}
+
+/* Read 32 bits from gb and return them with the byte order reversed by bswap_32(). */
+static inline int get_le32(GetBitContext *gb) {
+    return bswap_32(get_bits_long(gb, 32));
+}
+
+/* Read 16 bits from gb and return them with the two bytes exchanged by bswap_16(). */
+static inline short get_le16(GetBitContext *gb) {
+    return bswap_16(get_bits_long(gb, 16));
+}
+
+static int decode_wave_header(AVCodecContext *avctx, uint8_t *header, int header_size)
+{
+    /* Parse the RIFF/WAVE header held in header[0..header_size) and store the PCM
+     * parameters of its "fmt " chunk in avctx. Returns 0, or -1 if the header is
+     * malformed or describes an unsupported format. */
+    GetBitContext hb;
+    int len;
+    short wave_format;
+
+    init_get_bits(&hb, header, header_size*8);
+    if (get_le32(&hb) != MKTAG('R','I','F','F')) {
+        av_log(avctx, AV_LOG_ERROR, "missing RIFF tag\n");
+        return -1;
+    }
+    get_le32(&hb); /* RIFF chunk size: read only to advance past it */
+    if (get_le32(&hb) != MKTAG('W','A','V','E')) {
+        av_log(avctx, AV_LOG_ERROR, "missing WAVE tag\n");
+        return -1;
+    }
+    /* Skip chunks until "fmt ". A skipped chunk must leave at least 8 bytes for the
+     * next tag and size, so every later tag/size read stays inside the header. */
+    while (get_le32(&hb) != MKTAG('f','m','t',' ')) {
+        len = get_le32(&hb);
+        if (len < 0 || len > header_size - get_bits_count(&hb)/8 - 8) {
+            av_log(avctx, AV_LOG_ERROR, "no fmt chunk in wave header\n");
+            return -1;
+        }
+        skip_bits(&hb, 8*len);
+    }
+    len = get_le32(&hb);
+    /* the 16 fixed bytes of the fmt chunk must also lie inside the header */
+    if (len < 16 || header_size - get_bits_count(&hb)/8 < 16) {
+        av_log(avctx, AV_LOG_ERROR, "fmt chunk was too short\n");
+        return -1;
+    }
+
+    wave_format = get_le16(&hb);
+    if (wave_format != WAVE_FORMAT_PCM) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported wave format\n");
+        return -1;
+    }
+
+    avctx->channels = get_le16(&hb);
+    avctx->sample_rate = get_le32(&hb);
+    avctx->bit_rate = get_le32(&hb) * 8;
+    avctx->block_align = get_le16(&hb);
+    avctx->bits_per_sample = get_le16(&hb);
+
+    if (avctx->bits_per_sample != 16) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported number of bits per sample\n");
+        return -1;
+    }
+
+    len -= 16;
+    if (len > 0)
+        av_log(avctx, AV_LOG_INFO, "%d header bytes unparsed\n", len);
+    return 0;
+}
+
+static int16_t * interleave_buffer(int16_t *samples, int nchan, int blocksize, int32_t **buffer) {
+    int i, chan; /* writes blocksize frames of nchan samples each; returns the advanced output pointer */
+    for (i=0; i<blocksize; i++)
+        for (chan=0; chan < nchan; chan++) /* saturate to the int16_t range instead of wrapping on the store */
+            *samples++ = FFMAX(FFMIN(buffer[chan][i], 32767), -32768);
+    return samples;
+}
+
+static void decode_subframe_lpc(ShortenContext *s, int channel, int residual_size, int pred_order)
+{
+    /* Reads pred_order coefficients, then blocksize residuals, each added to its prediction. */
+    int32_t *out = s->decoded[channel];
+    int coeffs[pred_order];
+    int n, k;
+    for (k = 0; k < pred_order; k++)
+        coeffs[k] = get_sr_golomb_shorten(&s->gb, LPCQUANT);
+    for (n = 0; n < s->blocksize; n++) {
+        int acc = s->lpcqoffset; /* the prediction sum starts from lpcqoffset */
+        for (k = 1; k <= pred_order; k++)
+            acc += coeffs[k-1] * out[n-k];
+        out[n] = get_sr_golomb_shorten(&s->gb, residual_size) + (acc >> LPCQUANT);
+    }
+}
+
+
+static int shorten_decode_frame(AVCodecContext *avctx,
+        void *data, int *data_size, uint8_t *buf, int buf_size)
+{
+    /* Queues buf in the internal bitstream buffer; once max_framesize bytes are
+     * available it parses the stream header (while s->blocksize is 0) or runs one
+     * shorten command. Samples are written to data when the last channel of a block
+     * is done. Returns a byte count, or -1 on error. */
+    ShortenContext *s = avctx->priv_data;
+    int i, input_buf_size = 0;
+    int16_t *samples = data;
+    if(s->max_framesize == 0){
+        s->max_framesize= 1024; // should hopefully be enough for the first header
+        s->bitstream= av_fast_realloc(s->bitstream, &s->allocated_bitstream_size, s->max_framesize);
+    }
+
+    if(1 && s->max_framesize){//FIXME truncated
+        buf_size= FFMIN(buf_size, s->max_framesize - s->bitstream_size);
+        input_buf_size= buf_size;
+        if(s->bitstream_index + s->bitstream_size + buf_size > s->allocated_bitstream_size){
+            memmove(s->bitstream, &s->bitstream[s->bitstream_index], s->bitstream_size);
+            s->bitstream_index=0;
+        }
+        memcpy(&s->bitstream[s->bitstream_index + s->bitstream_size], buf, buf_size);
+        buf= &s->bitstream[s->bitstream_index];
+        buf_size += s->bitstream_size;
+        s->bitstream_size= buf_size;
+
+        if(buf_size < s->max_framesize)
+            return input_buf_size; /* keep buffering until max_framesize bytes are queued */
+    }
+    init_get_bits(&s->gb, buf, buf_size*8);
+    get_bits(&s->gb, s->bitindex);
+    if (!s->blocksize) {
+        int maxnlpc = 0, blocksize = DEFAULT_BLOCK_SIZE;
+        /* shorten signature */
+        if (get_bits_long(&s->gb, 32) != bswap_32(ff_get_fourcc("ajkg"))) {
+            av_log(s->avctx, AV_LOG_ERROR, "missing shorten magic 'ajkg'\n");
+            return -1;
+        }
+
+        s->lpcqoffset = 0;
+        s->nmean = -1;
+        s->version = get_bits(&s->gb, 8);
+        s->internal_ftype = get_uint(s, TYPESIZE);
+
+        s->channels = get_uint(s, CHANSIZE);
+        if (s->channels <= 0 || s->channels > MAX_CHANNELS) {
+            av_log(s->avctx, AV_LOG_ERROR, "unsupported number of channels: %d\n", s->channels);
+            s->channels = 0; /* no per-channel buffer exists yet; do not leave a bogus count behind */
+            return -1;
+        }
+
+        /* get blocksize if version > 0 */
+        if (s->version > 0) {
+            int skip_bytes;
+            blocksize = get_uint(s, av_log2(DEFAULT_BLOCK_SIZE));
+            maxnlpc = get_uint(s, LPCQSIZE);
+            s->nmean = get_uint(s, 0);
+            skip_bytes = get_uint(s, NSKIPSIZE);
+            for (i=0; i<skip_bytes; i++)
+                skip_bits(&s->gb, 8);
+        }
+        /* s->blocksize doubles as the "header parsed" flag: set it only once the value is usable */
+        if (blocksize <= 0 || blocksize > MAX_BLOCKSIZE) {
+            av_log(s->avctx, AV_LOG_ERROR, "invalid block size: %d\n", blocksize);
+            s->channels = 0;
+            return -1;
+        }
+        s->blocksize = blocksize;
+        s->nwrap = FFMAX(NWRAP, maxnlpc);
+        allocate_buffers(s);
+        init_offset(s);
+
+        if (s->version > 1)
+            s->lpcqoffset = V2LPCQOFFSET;
+
+        if (get_ur_golomb_shorten(&s->gb, FNSIZE) != FN_VERBATIM) {
+            av_log(s->avctx, AV_LOG_ERROR, "missing verbatim section at begining of stream\n");
+            return -1;
+        }
+
+        s->header_size = get_ur_golomb_shorten(&s->gb, VERBATIM_CKSIZE_SIZE);
+        if (s->header_size >= OUT_BUFFER_SIZE || s->header_size < CANONICAL_HEADER_SIZE) {
+            av_log(s->avctx, AV_LOG_ERROR, "header is wrong size: %d\n", s->header_size);
+            return -1;
+        }
+
+        for (i=0; i<s->header_size; i++)
+            s->header[i] = (char)get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
+        if (decode_wave_header(avctx, s->header, s->header_size) < 0)
+            return -1;
+
+        s->cur_chan = 0;
+        s->bitshift = 0;
+    } else {
+        int cmd;
+        int len;
+        cmd = get_ur_golomb_shorten(&s->gb, FNSIZE);
+        switch (cmd) {
+            case FN_ZERO:
+            case FN_DIFF0:
+            case FN_DIFF1:
+            case FN_DIFF2:
+            case FN_DIFF3:
+            case FN_QLPC:
+                {
+                    int residual_size = 0;
+                    int channel = s->cur_chan;
+                    int32_t coffset;
+                    if (cmd != FN_ZERO) {
+                        residual_size = get_ur_golomb_shorten(&s->gb, ENERGYSIZE);
+                        /* this is a hack as version 0 differed in definition of get_sr_golomb_shorten */
+                        if (s->version == 0)
+                            residual_size--;
+                    }
+
+                    if (s->nmean == 0)
+                        coffset = s->offset[channel][0];
+                    else {
+                        int32_t sum = (s->version < 2) ? 0 : s->nmean / 2;
+                        for (i=0; i<s->nmean; i++)
+                            sum += s->offset[channel][i];
+                        coffset = sum / s->nmean;
+                        if (s->version >= 2)
+                            coffset >>= FFMIN(1, s->bitshift);
+                    }
+                    switch (cmd) {
+                        case FN_ZERO:
+                            for (i=0; i<s->blocksize; i++)
+                                s->decoded[channel][i] = 0;
+                            break;
+                        case FN_DIFF0:
+                            for (i=0; i<s->blocksize; i++)
+                                s->decoded[channel][i] = get_sr_golomb_shorten(&s->gb, residual_size) + coffset;
+                            break;
+                        case FN_DIFF1:
+                            for (i=0; i<s->blocksize; i++)
+                                s->decoded[channel][i] = get_sr_golomb_shorten(&s->gb, residual_size) + s->decoded[channel][i - 1];
+                            break;
+                        case FN_DIFF2:
+                            for (i=0; i<s->blocksize; i++)
+                                s->decoded[channel][i] = get_sr_golomb_shorten(&s->gb, residual_size) + 2*s->decoded[channel][i-1]
+                                                                                                      -   s->decoded[channel][i-2];
+                            break;
+                        case FN_DIFF3:
+                            for (i=0; i<s->blocksize; i++)
+                                s->decoded[channel][i] = get_sr_golomb_shorten(&s->gb, residual_size) + 3*s->decoded[channel][i-1]
+                                                                                                      - 3*s->decoded[channel][i-2]
+                                                                                                      +   s->decoded[channel][i-3];
+                            break;
+                        case FN_QLPC:
+                            {
+                                int pred_order = get_ur_golomb_shorten(&s->gb, LPCQSIZE);
+                                if (pred_order < 0 || pred_order > s->nwrap) { /* only nwrap history samples exist below index 0 */
+                                    av_log(avctx, AV_LOG_ERROR, "invalid LPC order %d (history holds %d samples)\n", pred_order, s->nwrap);
+                                    return -1;
+                                }
+                                for (i=0; i<pred_order; i++)
+                                    s->decoded[channel][i - pred_order] -= coffset;
+                                decode_subframe_lpc(s, channel, residual_size, pred_order);
+                                if (coffset != 0)
+                                    for (i=0; i < s->blocksize; i++)
+                                        s->decoded[channel][i] += coffset;
+                            }
+                    }
+                    if (s->nmean > 0) {
+                        int32_t sum = (s->version < 2) ? 0 : s->blocksize / 2;
+                        for (i=0; i<s->blocksize; i++)
+                            sum += s->decoded[channel][i];
+
+                        for (i=1; i<s->nmean; i++)
+                            s->offset[channel][i-1] = s->offset[channel][i];
+
+                        if (s->version < 2)
+                            s->offset[channel][s->nmean - 1] = sum / s->blocksize;
+                        else
+                            s->offset[channel][s->nmean - 1] = (sum / s->blocksize) << s->bitshift;
+                    }
+                    for (i=-s->nwrap; i<0; i++)
+                        s->decoded[channel][i] = s->decoded[channel][i + s->blocksize];
+
+                    fix_bitshift(s, s->decoded[channel]);
+                    s->cur_chan++;
+                    if (s->cur_chan == s->channels) {
+                        samples = interleave_buffer(samples, s->channels, s->blocksize, s->decoded);
+                        s->cur_chan = 0;
+                        goto frame_done;
+                    }
+                    break;
+                }
+            case FN_VERBATIM:
+                len = get_ur_golomb_shorten(&s->gb, VERBATIM_CKSIZE_SIZE);
+                while (len--)
+                    get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE);
+                break;
+            case FN_BITSHIFT:
+                s->bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE);
+                break;
+            case FN_BLOCKSIZE: {
+                int blocksize = get_uint(s, av_log2(s->blocksize));
+                if (blocksize <= 0 || blocksize > s->blocksize) { /* sample buffers are never reallocated here */
+                    av_log(avctx, AV_LOG_ERROR, "unsupported block size change to %d\n", blocksize);
+                    return -1;
+                }
+                s->blocksize = blocksize;
+                break;
+            }
+            case FN_QUIT:
+                return buf_size;
+            default:
+                av_log(avctx, AV_LOG_ERROR, "unknown shorten function %d\n", cmd);
+                return -1;
+        }
+    }
+frame_done:
+    *data_size = (int8_t *)samples - (int8_t *)data;
+    s->bitindex = get_bits_count(&s->gb) - 8*((get_bits_count(&s->gb))/8);
+    i= (get_bits_count(&s->gb))/8;
+    if (i > buf_size) {
+        av_log(s->avctx, AV_LOG_ERROR, "overread: %d\n", i - buf_size);
+        s->bitstream_size=0;
+        s->bitstream_index=0;
+        return -1;
+    }
+    if (s->bitstream_size) {
+        s->bitstream_index += i;
+        s->bitstream_size  -= i;
+        return input_buf_size;
+    } else
+        return i;
+}
+
+static int shorten_decode_close(AVCodecContext *avctx)
+{
+    ShortenContext *s = avctx->priv_data;
+    int i; /* frees every per-channel buffer, then the input buffer */
+    /* s->channels can still hold a rejected header value, so stay inside the arrays */
+    for (i = 0; i < s->channels && i < MAX_CHANNELS; i++) {
+        s->decoded[i] -= s->nwrap; /* back to the address the allocation returned */
+        av_freep(&s->decoded[i]);
+        av_freep(&s->offset[i]);
+    }
+    av_freep(&s->bitstream);
+    return 0;
+}
+
+static void shorten_flush(AVCodecContext *avctx){
+    /* Forget all input still queued in the bitstream buffer. */
+    ShortenContext *ctx = avctx->priv_data;
+    ctx->bitstream_index = 0;
+    ctx->bitstream_size = 0;
+}
+
+AVCodec shorten_decoder = {
+    "shorten",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_SHORTEN,
+    sizeof(ShortenContext), /* presumably the size of the state reached through avctx->priv_data */
+    shorten_decode_init,
+    NULL, /* NOTE(review): presumably the encode slot, unused by a decoder; confirm against AVCodec */
+    shorten_decode_close,
+    shorten_decode_frame,
+    .flush= shorten_flush, /* resets the buffered-input bookkeeping */
+};
diff --git a/src/libffmpeg/libavcodec/smc.c b/src/libffmpeg/libavcodec/smc.c
index e937b03c8..dbb5adef1 100644
--- a/src/libffmpeg/libavcodec/smc.c
+++ b/src/libffmpeg/libavcodec/smc.c
@@ -125,7 +125,7 @@ static void smc_decode_stream(SmcContext *s)
             chunk_size, s->size);
 
     chunk_size = s->size;
-    total_blocks = (s->avctx->width * s->avctx->height) / (4 * 4);
+    total_blocks = ((s->avctx->width + 3) / 4) * ((s->avctx->height + 3) / 4);
 
     /* traverse through the blocks */
     while (total_blocks) {
@@ -448,10 +448,6 @@ static int smc_decode_frame(AVCodecContext *avctx,
 {
     SmcContext *s = (SmcContext *)avctx->priv_data;
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
     s->buf = buf;
     s->size = buf_size;
 
diff --git a/src/libffmpeg/libavcodec/snow.c b/src/libffmpeg/libavcodec/snow.c
new file mode 100644
index 000000000..9cfddfa95
--- /dev/null
+++ b/src/libffmpeg/libavcodec/snow.c
@@ -0,0 +1,3996 @@
+/*
+ * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "avcodec.h"
+#include "common.h"
+#include "dsputil.h"
+
+#include "rangecoder.h"
+#define MID_STATE 128
+
+#include "mpegvideo.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+#define MAX_DECOMPOSITIONS 8
+#define MAX_PLANES 4
+#define DWTELEM int
+#define QSHIFT 5
+#define QROOT (1<<QSHIFT)
+#define LOSSLESS_QLOG -128
+#define FRAC_BITS 8
+
+static const int8_t quant3[256]={
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
+};
+static const int8_t quant3b[256]={
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+};
+static const int8_t quant3bA[256]={
+ 0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+ 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
+};
+static const int8_t quant5[256]={
+ 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
+};
+static const int8_t quant7[256]={
+ 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
+};
+static const int8_t quant9[256]={
+ 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
+-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
+};
+static const int8_t quant11[256]={
+ 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
+-4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
+};
+static const int8_t quant13[256]={
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
+-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
+-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
+-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
+-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
+-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
+};
+
+#define LOG2_OBMC_MAX 6
+#define OBMC_MAX (1<<(LOG2_OBMC_MAX))
+#if 0 //64*cubic
+static const uint8_t obmc32[1024]={
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
+ 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
+ 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
+ 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
+ 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
+ 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
+ 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
+ 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
+ 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
+ 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
+ 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
+ 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
+ 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
+ 0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
+ 0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
+ 0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
+ 0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
+ 0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
+ 0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
+ 0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
+ 0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
+ 0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
+ 0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
+ 0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
+ 0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
+ 0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+//error:0.000022
+};
+static const uint8_t obmc16[256]={
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
+ 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
+ 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
+ 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
+ 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
+ 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
+ 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
+ 1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
+ 1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
+ 0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
+ 0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
+ 0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
+ 0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
+ 0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+//error:0.000033
+};
+#elif 1 // 64*linear
+static const uint8_t obmc32[1024]={
+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+ 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
+ 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,13,12,11,10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15,16,17,17,16,15,14,13,12,11,10, 8, 7, 6, 5, 4, 3, 2, 1,
+ 1, 2, 3, 5, 6, 8, 9,10,12,13,14,16,17,19,20,21,21,20,19,17,16,14,13,12,10, 9, 8, 6, 5, 3, 2, 1,
+ 1, 2, 4, 6, 7, 9,11,12,14,15,17,19,20,22,24,25,25,24,22,20,19,17,15,14,12,11, 9, 7, 6, 4, 2, 1,
+ 1, 3, 5, 7, 8,10,12,14,16,18,20,22,23,25,27,29,29,27,25,23,22,20,18,16,14,12,10, 8, 7, 5, 3, 1,
+ 1, 3, 5, 7,10,12,14,16,18,20,22,24,27,29,31,33,33,31,29,27,24,22,20,18,16,14,12,10, 7, 5, 3, 1,
+ 1, 4, 6, 8,11,13,15,18,20,23,25,27,30,32,34,37,37,34,32,30,27,25,23,20,18,15,13,11, 8, 6, 4, 1,
+ 1, 4, 7, 9,12,14,17,20,22,25,28,30,33,35,38,41,41,38,35,33,30,28,25,22,20,17,14,12, 9, 7, 4, 1,
+ 1, 4, 7,10,13,16,19,22,24,27,30,33,36,39,42,45,45,42,39,36,33,30,27,24,22,19,16,13,10, 7, 4, 1,
+ 2, 5, 8,11,14,17,20,23,27,30,33,36,39,42,45,48,48,45,42,39,36,33,30,27,23,20,17,14,11, 8, 5, 2,
+ 2, 5, 8,12,15,19,22,25,29,32,35,39,42,46,49,52,52,49,46,42,39,35,32,29,25,22,19,15,12, 8, 5, 2,
+ 2, 5, 9,13,16,20,24,27,31,34,38,42,45,49,53,56,56,53,49,45,42,38,34,31,27,24,20,16,13, 9, 5, 2,
+ 2, 6,10,14,17,21,25,29,33,37,41,45,48,52,56,60,60,56,52,48,45,41,37,33,29,25,21,17,14,10, 6, 2,
+ 2, 6,10,14,17,21,25,29,33,37,41,45,48,52,56,60,60,56,52,48,45,41,37,33,29,25,21,17,14,10, 6, 2,
+ 2, 5, 9,13,16,20,24,27,31,34,38,42,45,49,53,56,56,53,49,45,42,38,34,31,27,24,20,16,13, 9, 5, 2,
+ 2, 5, 8,12,15,19,22,25,29,32,35,39,42,46,49,52,52,49,46,42,39,35,32,29,25,22,19,15,12, 8, 5, 2,
+ 2, 5, 8,11,14,17,20,23,27,30,33,36,39,42,45,48,48,45,42,39,36,33,30,27,23,20,17,14,11, 8, 5, 2,
+ 1, 4, 7,10,13,16,19,22,24,27,30,33,36,39,42,45,45,42,39,36,33,30,27,24,22,19,16,13,10, 7, 4, 1,
+ 1, 4, 7, 9,12,14,17,20,22,25,28,30,33,35,38,41,41,38,35,33,30,28,25,22,20,17,14,12, 9, 7, 4, 1,
+ 1, 4, 6, 8,11,13,15,18,20,23,25,27,30,32,34,37,37,34,32,30,27,25,23,20,18,15,13,11, 8, 6, 4, 1,
+ 1, 3, 5, 7,10,12,14,16,18,20,22,24,27,29,31,33,33,31,29,27,24,22,20,18,16,14,12,10, 7, 5, 3, 1,
+ 1, 3, 5, 7, 8,10,12,14,16,18,20,22,23,25,27,29,29,27,25,23,22,20,18,16,14,12,10, 8, 7, 5, 3, 1,
+ 1, 2, 4, 6, 7, 9,11,12,14,15,17,19,20,22,24,25,25,24,22,20,19,17,15,14,12,11, 9, 7, 6, 4, 2, 1,
+ 1, 2, 3, 5, 6, 8, 9,10,12,13,14,16,17,19,20,21,21,20,19,17,16,14,13,12,10, 9, 8, 6, 5, 3, 2, 1,
+ 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15,16,17,17,16,15,14,13,12,11,10, 8, 7, 6, 5, 4, 3, 2, 1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,13,12,11,10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
+ 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+ //error:0.000020
+};
+static const uint8_t obmc16[256]={
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
+ 1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
+ 1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
+ 2, 5, 9,12,16,19,23,26,26,23,19,16,12, 9, 5, 2,
+ 2, 7,11,16,20,25,29,34,34,29,25,20,16,11, 7, 2,
+ 3, 8,14,19,25,30,36,41,41,36,30,25,19,14, 8, 3,
+ 3,10,16,23,29,36,42,49,49,42,36,29,23,16,10, 3,
+ 4,11,19,26,34,41,49,56,56,49,41,34,26,19,11, 4,
+ 4,11,19,26,34,41,49,56,56,49,41,34,26,19,11, 4,
+ 3,10,16,23,29,36,42,49,49,42,36,29,23,16,10, 3,
+ 3, 8,14,19,25,30,36,41,41,36,30,25,19,14, 8, 3,
+ 2, 7,11,16,20,25,29,34,34,29,25,20,16,11, 7, 2,
+ 2, 5, 9,12,16,19,23,26,26,23,19,16,12, 9, 5, 2,
+ 1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
+ 1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
+//error:0.000015
+};
+#else //64*cos
+static const uint8_t obmc32[1024]={
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
+ 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
+ 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
+ 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
+ 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
+ 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
+ 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
+ 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
+ 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
+ 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
+ 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
+ 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
+ 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
+ 0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
+ 0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
+ 0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
+ 0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
+ 0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
+ 0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
+ 0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
+ 0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
+ 0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
+ 0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
+ 0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
+ 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
+ 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+//error:0.000022
+};
+static const uint8_t obmc16[256]={
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
+ 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
+ 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
+ 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
+ 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
+ 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
+ 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
+ 0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
+ 1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
+ 1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
+ 0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
+ 0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
+ 0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
+ 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+//error:0.000022
+};
+#endif
+
+//linear *64
+static const uint8_t obmc8[64]={
+ 1, 3, 5, 7, 7, 5, 3, 1,
+ 3, 9,15,21,21,15, 9, 3,
+ 5,15,25,35,35,25,15, 5,
+ 7,21,35,49,49,35,21, 7,
+ 7,21,35,49,49,35,21, 7,
+ 5,15,25,35,35,25,15, 5,
+ 3, 9,15,21,21,15, 9, 3,
+ 1, 3, 5, 7, 7, 5, 3, 1,
+//error:0.000000
+};
+
+//linear *64
+static const uint8_t obmc4[16]={
+ 4,12,12, 4,
+12,36,36,12,
+12,36,36,12,
+ 4,12,12, 4,
+//error:0.000000
+};
+
+static const uint8_t *obmc_tab[4]={
+    obmc32, obmc16, obmc8, obmc4
+};
+/**
+ * Per-block record; SnowContext.block points at these.
+ * NOTE(review): the code that fills these fields is outside this chunk, so the
+ * notes marked "presumably" are inferred from the field names only.
+ */
+typedef struct BlockNode{
+    int16_t mx;       ///< presumably the horizontal motion vector component - confirm
+    int16_t my;       ///< presumably the vertical motion vector component - confirm
+    uint8_t color[3]; ///< three 8-bit values; presumably one per plane - confirm
+    uint8_t type;     ///< block type; BLOCK_INTRA below is the only value defined here
+//#define TYPE_SPLIT    1
+#define BLOCK_INTRA   1
+//#define TYPE_NOCOLOR  4
+    uint8_t level; //FIXME merge into type?
+}BlockNode;
+
+#define LOG2_MB_SIZE 4
+#define MB_SIZE (1<<LOG2_MB_SIZE)
+/**
+ * A signed 16-bit x value paired with an unsigned 16-bit coefficient.
+ * SubBand.x_coeff points at records of this type.
+ * NOTE(review): presumably x is a column position within a subband line - confirm.
+ */
+typedef struct x_and_coeff{
+    int16_t x;
+    uint16_t coeff;
+} x_and_coeff;
+/**
+ * Description of one subband; Plane.band holds these.
+ * NOTE(review): notes marked "presumably" are inferred from field names only.
+ */
+typedef struct SubBand{
+    int level;                                  ///< presumably the decomposition level of this band - confirm
+    int stride;
+    int width;
+    int height;
+    int qlog;                                   ///< log(qscale)/log[2^(1/6)]
+    DWTELEM *buf;
+    int buf_x_offset;
+    int buf_y_offset;
+    int stride_line; ///< Stride measured in lines, not pixels.
+    x_and_coeff * x_coeff;
+    struct SubBand *parent;                     ///< presumably the matching band one level coarser - confirm
+    uint8_t state[/*7*2*/ 7 + 512][32];         ///< rows of 32 bytes; presumably range-coder context sets as used by put_symbol()/get_symbol() - confirm
+}SubBand;
+/**
+ * One plane: its dimensions plus four SubBand entries for each of up to
+ * MAX_DECOMPOSITIONS decomposition levels.
+ */
+typedef struct Plane{
+    int width;
+    int height;
+    SubBand band[MAX_DECOMPOSITIONS][4];
+}Plane;
+
+/** Used to minimize the amount of memory used in order to optimize cache performance.
+ * A fixed pool of line buffers (data_stack) is handed out to line numbers on
+ * demand by slice_buffer_load_line() and returned by slice_buffer_release(). **/
+typedef struct {
+    DWTELEM * * line; ///< For use by idwt and predict_slices.
+    DWTELEM * * data_stack; ///< Used for internal purposes.
+    int data_stack_top; ///< index of the topmost free buffer in data_stack; max_allocated_lines-1 right after init
+    int line_count;     ///< number of entries in line[]
+    int line_width;     ///< number of DWTELEM elements in each pooled buffer
+    int data_count;     ///< number of pooled buffers allocated by slice_buffer_init()
+    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
+} slice_buffer;
+/**
+ * State of one Snow codec instance.
+ * NOTE(review): only the declarations are visible in this part of the file;
+ * notes marked "presumably" are inferred from names and types - confirm
+ * against the code that uses the fields.
+ */
+typedef struct SnowContext{
+//    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
+
+    AVCodecContext *avctx;
+    RangeCoder c;
+    DSPContext dsp;
+    AVFrame input_picture;
+    AVFrame current_picture;
+    AVFrame last_picture;
+    AVFrame mconly_picture;
+//     uint8_t q_context[16];
+    uint8_t header_state[32];
+    uint8_t block_state[128 + 32*128];
+    int keyframe;
+    int always_reset;
+    int version;
+    int spatial_decomposition_type;  ///< presumably the `type` passed to ff_spatial_dwt() - confirm
+    int temporal_decomposition_type;
+    int spatial_decomposition_count; ///< presumably the `decomposition_count` passed to ff_spatial_dwt() - confirm
+    int temporal_decomposition_count;
+    DWTELEM *spatial_dwt_buffer;
+    int colorspace_type;
+    int chroma_h_shift;
+    int chroma_v_shift;
+    int spatial_scalability;
+    int qlog;
+    int lambda;
+    int lambda2;
+    int mv_scale;
+    int qbias;
+#define QBIAS_SHIFT 3
+    int b_width;
+    int b_height;
+    int block_max_depth;
+    Plane plane[MAX_PLANES];
+    BlockNode *block;
+    slice_buffer sb;
+
+    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
+}SnowContext;
+/**
+ * Progress state of an incremental inverse transform: a small window of line
+ * pointers plus the current line number. The 5/3 init functions set b0, b1
+ * and y = -1; each 5/3 dy step advances y by 2.
+ */
+typedef struct {
+    DWTELEM *b0;
+    DWTELEM *b1;
+    DWTELEM *b2;
+    DWTELEM *b3;
+    int y;
+} dwt_compose_t;
+/**
+ * Yields the buffer already attached to line_num, or attaches one through
+ * slice_buffer_load_line() when the slot is still empty.
+ * Both macro arguments are expanded more than once, so they must not have
+ * side effects.
+ */
+#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
+//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
+
+/**
+ * Prepares a slice buffer: records the geometry, allocates a zeroed table of
+ * line_count line slots and a pool of max_allocated_lines buffers holding
+ * line_width elements each. All pooled buffers start out free.
+ * NOTE(review): allocation results are not checked here, as the function has
+ * no way to report failure to its caller.
+ */
+static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
+{
+    int slot;
+
+    buf->base_buffer = base_buffer;
+    buf->line_count  = line_count;
+    buf->line_width  = line_width;
+    buf->data_count  = max_allocated_lines;
+
+    /* every line slot starts out empty (NULL) */
+    buf->line       = av_mallocz(sizeof(*buf->line) * line_count);
+    buf->data_stack = av_malloc (sizeof(*buf->data_stack) * max_allocated_lines);
+
+    for (slot = 0; slot < max_allocated_lines; slot++)
+        buf->data_stack[slot] = av_malloc(sizeof(**buf->data_stack) * line_width);
+
+    /* the whole pool is available: the top of the stack is its last entry */
+    buf->data_stack_top = max_allocated_lines - 1;
+}
+
+/**
+ * Returns the buffer attached to `line`, first attaching the topmost free
+ * buffer of the pool if the slot is still empty.
+ * The returned memory is not initialised here and base_buffer is not read.
+ * At least one free buffer must remain in the pool (asserted).
+ */
+static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
+{
+    DWTELEM * buffer;
+
+    assert(buf->data_stack_top >= 0);
+    if (buf->line[line])
+        return buf->line[line];
+
+    /* pop one buffer off the free stack and bind it to this line */
+    buffer = buf->data_stack[buf->data_stack_top];
+    buf->data_stack_top--;
+    buf->line[line] = buffer;
+
+    return buffer;
+}
+
+/**
+ * Detaches the buffer bound to `line` and pushes it back onto the free stack.
+ * `line` must be within [0, line_count) and must currently have a buffer
+ * attached (both asserted).
+ */
+static void slice_buffer_release(slice_buffer * buf, int line)
+{
+    DWTELEM * buffer;
+
+    assert(line >= 0 && line < buf->line_count);
+    assert(buf->line[line]);
+
+    buffer = buf->line[line];
+    buf->data_stack_top++;
+    buf->data_stack[buf->data_stack_top] = buffer;
+    buf->line[line] = NULL;
+}
+
+/**
+ * Releases every line that currently has a buffer attached, returning all
+ * of those buffers to the free stack.
+ */
+static void slice_buffer_flush(slice_buffer * buf)
+{
+    int line;
+
+    for (line = 0; line < buf->line_count; line++)
+        if (buf->line[line])
+            slice_buffer_release(buf, line);
+}
+
+/**
+ * Frees everything slice_buffer_init() allocated: all pooled line buffers,
+ * the pool array and the line table. Attached lines are flushed back into
+ * the pool first so that every buffer is reachable through data_stack.
+ */
+static void slice_buffer_destroy(slice_buffer * buf)
+{
+    int i;
+
+    /* check the tables before they are dereferenced, not afterwards */
+    assert(buf->line);
+    assert(buf->data_stack);
+
+    slice_buffer_flush(buf);
+
+    for (i = buf->data_count - 1; i >= 0; i--)
+    {
+        assert(buf->data_stack[i]);
+        av_free(buf->data_stack[i]);
+    }
+    av_free(buf->data_stack);
+    av_free(buf->line);
+
+    /* do not leave dangling pointers behind in the caller's structure */
+    buf->data_stack = NULL;
+    buf->line = NULL;
+}
+
+#ifdef	__sgi
+// Avoid a name clash on SGI IRIX
+#undef	qexp
+#endif
+#define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
+static uint8_t qexp[QROOT];
+
+/**
+ * Reflects index v once at the borders of [0, m]: negative values map to -v,
+ * values above m map to 2*m-v, everything else is returned unchanged.
+ * Only a single reflection is applied.
+ */
+static inline int mirror(int v, int m){
+    if(v < 0)
+        return -v;
+    return v > m ? 2*m - v : v;
+}
+/**
+ * Writes the integer v through range coder c using the adaptive bit contexts
+ * in state[] (indices 0..31 are addressed):
+ *   state[0]       zero flag, coded as 1 when v == 0
+ *   state[1..10]   unary-coded exponent e = av_log2(abs(v)); unary positions
+ *                  with index 9 and above share state[10]
+ *   state[11..21]  sign bit, selected by min(e, 10); written only if is_signed
+ *   state[22..31]  bits of abs(v) below its leading one, high to low; bit
+ *                  positions 9 and above share state[31]
+ * The disabled #else branch is an alternative spelling of the same scheme.
+ */
+static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
+    int i;
+
+    if(v){
+        const int a= ABS(v);
+        const int e= av_log2(a);
+#if 1
+        const int el= FFMIN(e, 10);   
+        put_rac(c, state+0, 0);
+
+        for(i=0; i<el; i++){
+            put_rac(c, state+1+i, 1);  //1..10
+        }
+        for(; i<e; i++){
+            put_rac(c, state+1+9, 1);  //1..10
+        }
+        put_rac(c, state+1+FFMIN(i,9), 0);
+
+        for(i=e-1; i>=el; i--){
+            put_rac(c, state+22+9, (a>>i)&1); //22..31
+        }
+        for(; i>=0; i--){
+            put_rac(c, state+22+i, (a>>i)&1); //22..31
+        }
+
+        if(is_signed)
+            put_rac(c, state+11 + el, v < 0); //11..21
+#else
+        
+        put_rac(c, state+0, 0);
+        if(e<=9){
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+i, 1);  //1..10
+            }
+            put_rac(c, state+1+i, 0);
+
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+i, (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + e, v < 0); //11..21
+        }else{
+            for(i=0; i<e; i++){
+                put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
+            }
+            put_rac(c, state+1+FFMIN(i,9), 0);
+
+            for(i=e-1; i>=0; i--){
+                put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
+            }
+
+            if(is_signed)
+                put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
+        }
+#endif
+    }else{
+        put_rac(c, state+0, 1);
+    }
+}
+/**
+ * Reads one integer from range coder c using the state[] layout that
+ * put_symbol() writes: zero flag, unary exponent, magnitude bits, then an
+ * optional sign bit.
+ * Returns 0 when the zero flag is set; otherwise the magnitude, negated when
+ * is_signed is non-zero and the sign bit reads as 1.
+ * NOTE(review): the exponent loop places no upper bound on e, so a stream that
+ * keeps yielding 1 bits would overflow `a` - confirm whether the input is
+ * trusted at this level.
+ */
+static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
+    if(get_rac(c, state+0))
+        return 0;
+    else{
+        int i, e, a;
+        e= 0;
+        while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
+            e++;
+        }
+
+        a= 1; // implicit leading one of the magnitude
+        for(i=e-1; i>=0; i--){
+            a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
+        }
+
+        if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
+            return -a;
+        else
+            return a;
+    }
+}
+/**
+ * Writes the non-negative integer v with an adaptive escape-style code.
+ * Starting with step r = 1<<log2 (1 when log2 is negative), a 1 bit is emitted
+ * and r subtracted from v for as long as v >= r; log2 is incremented after
+ * each escape and r doubles once log2 is positive. A terminating 0 bit
+ * follows, then the remaining v as log2 plain bits, most significant first.
+ * Escape and terminator bits use context state[4+log2]; remainder bit i uses
+ * state[31-i]. log2 must be at least -4 (asserted).
+ */
+static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1;
+
+    assert(v>=0);
+    assert(log2>=-4);
+
+    while(v >= r){
+        put_rac(c, state+4+log2, 1);
+        v -= r;
+        log2++;
+        if(log2>0) r+=r;
+    }
+    put_rac(c, state+4+log2, 0);
+    
+    for(i=log2-1; i>=0; i--){
+        put_rac(c, state+31-i, (v>>i)&1);
+    }
+}
+/**
+ * Reads a value in the format produced by put_symbol2(): escape bits are
+ * consumed from context state[4+log2], each adding the current step r to the
+ * result and advancing log2 (r doubles once log2 is positive); afterwards
+ * log2 plain bits are read from state[31-i] and added in.
+ * The starting log2 must be at least -4 (asserted) and has to match the value
+ * the writer used.
+ * NOTE(review): the escape loop is bounded only by the bitstream contents -
+ * confirm that state+4+log2 cannot run past the caller's state array.
+ */
+static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
+    int i;
+    int r= log2>=0 ? 1<<log2 : 1;
+    int v=0;
+
+    assert(log2>=-4);
+
+    while(get_rac(c, state+4+log2)){
+        v+= r;
+        log2++;
+        if(log2>0) r+=r;
+    }
+    
+    for(i=log2-1; i>=0; i--){
+        v+= get_rac(c, state+31-i)<<i;
+    }
+
+    return v;
+}
+/**
+ * One lifting step over a strided sequence of samples:
+ *   dst[i] = src[i] +/- ((mul*(ref[i] + ref[i+1]) + add) >> shift)
+ * The term is subtracted when inverse is non-zero (see LIFT). At an end where
+ * only one neighbour exists, that single ref sample is counted twice.
+ * dst, src and ref are walked with dst_step, src_step and ref_step.
+ * width is the length of the complete signal (both halves together);
+ * highpass is 1 when the odd-indexed half is being updated, 0 for the even
+ * half, and decides which ends need the one-neighbour case.
+ * The LIFT macro defined inside this function is also used by lift5().
+ */
+static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
+    const int mirror_left= !highpass;
+    const int mirror_right= (width&1) ^ highpass;
+    const int w= (width>>1) - 1 + (highpass & width);
+    int i;
+
+#define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
+    if(mirror_left){
+        dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
+        dst += dst_step; // ref is not advanced: the main loop pairs ref[i] and ref[i+1] around the next output
+        src += src_step;
+    }
+    
+    for(i=0; i<w; i++){
+        dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
+    }
+    
+    if(mirror_right){
+        dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
+    }
+}
+/**
+ * Same traversal and edge handling as lift(), but with a fixed multiplier:
+ * the neighbour sum is multiplied by 3 and then grown by r>>4 and r>>8 before
+ * add and shift are applied. The mul parameter is not used.
+ * Relies on the LIFT macro defined inside lift().
+ * Note that `#define lift5 lift` further down in this file makes later
+ * references to lift5 resolve to lift() instead of this function.
+ */
+static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
+    const int mirror_left= !highpass;
+    const int mirror_right= (width&1) ^ highpass;
+    const int w= (width>>1) - 1 + (highpass & width);
+    int i;
+
+    if(mirror_left){
+        int r= 3*2*ref[0];
+        r += r>>4;
+        r += r>>8;
+        dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
+        dst += dst_step;
+        src += src_step;
+    }
+    
+    for(i=0; i<w; i++){
+        int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
+        r += r>>4;
+        r += r>>8;
+        dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
+    }
+    
+    if(mirror_right){
+        int r= 3*2*ref[w*ref_step];
+        r += r>>4;
+        r += r>>8;
+        dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
+    }
+}
+/**
+ * Lifting step with the same traversal and edge handling as lift(), but the
+ * update rule is LIFTS, applied to ref = mul*(neighbour sum) + add:
+ *   inverse: dst = src - ((ref - 4*src) >> shift)
+ *   forward: dst = (64*src + 4*ref + 8 + (5<<27)) / 80 - (1<<23)
+ * shift must be 4 (asserted). (5<<27)/80 equals 1<<23, so the added bias is
+ * taken out again after the division; presumably it keeps the dividend
+ * non-negative so that the integer division rounds the same way for all
+ * inputs - confirm.
+ */
+static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
+    const int mirror_left= !highpass;
+    const int mirror_right= (width&1) ^ highpass;
+    const int w= (width>>1) - 1 + (highpass & width);
+    int i;
+
+    assert(shift == 4);
+#define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
+    if(mirror_left){
+        dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
+        dst += dst_step;
+        src += src_step;
+    }
+    
+    for(i=0; i<w; i++){
+        dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
+    }
+    
+    if(mirror_right){
+        dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
+    }
+}
+
+/**
+ * Generic in-place lifting step on one row of `width` samples.
+ * For every second sample starting at index `start`, the weighted sum of the
+ * n samples at positions x-n+1, x-n+3, ..., x+n-1 (weights coeffs[0..n-1],
+ * positions reflected at both ends of the row) is rounded, shifted right by
+ * `shift` and added to dst[x], or subtracted when inverse is non-zero.
+ * With n == 0 coeffs is never read and the samples are left unchanged.
+ */
+static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
+    int x, i;
+    
+    for(x=start; x<width; x+=2){
+        int64_t sum=0;
+
+        for(i=0; i<n; i++){
+            int x2= x + 2*i - n + 1;
+            if     (x2<     0) x2= -x2;
+            else if(x2>=width) x2= 2*width-x2-2;
+            sum += coeffs[i]*(int64_t)dst[x2];
+        }
+        if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
+        else        dst[x] += (sum + (1<<shift)/2)>>shift;
+    }
+}
+/**
+ * Vertical counterpart of inplace_lift(): for every second row starting at
+ * row `start`, each of the `width` columns is updated from the n rows at
+ * y-n+1, y-n+3, ..., y+n-1 (reflected at the top and bottom edges), weighted
+ * by coeffs[], rounded and shifted right by `shift`. The result is added, or
+ * subtracted when inverse is non-zero. Rows are `stride` elements apart.
+ */
+static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
+    int x, y, i;
+    for(y=start; y<height; y+=2){
+        for(x=0; x<width; x++){
+            int64_t sum=0;
+    
+            for(i=0; i<n; i++){
+                int y2= y + 2*i - n + 1;
+                if     (y2<      0) y2= -y2;
+                else if(y2>=height) y2= 2*height-y2-2;
+                sum += coeffs[i]*(int64_t)dst[x + y2*stride];
+            }
+            if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
+            else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
+        }
+    }
+}
+
+#define SCALEX 1
+#define LX0 0
+#define LX1 1
+
+#if 0 // more accurate 9/7
+#define N1 2
+#define SHIFT1 14
+#define COEFFS1 (int[]){-25987,-25987}
+#define N2 2
+#define SHIFT2 19
+#define COEFFS2 (int[]){-27777,-27777}
+#define N3 2
+#define SHIFT3 15
+#define COEFFS3 (int[]){28931,28931}
+#define N4 2
+#define SHIFT4 15
+#define COEFFS4 (int[]){14533,14533}
+#elif 1 // 13/7 CRF
+#define N1 4
+#define SHIFT1 4
+#define COEFFS1 (int[]){1,-9,-9,1}
+#define N2 4
+#define SHIFT2 4
+#define COEFFS2 (int[]){-1,5,5,-1}
+#define N3 0
+#define SHIFT3 1
+#define COEFFS3 NULL
+#define N4 0
+#define SHIFT4 1
+#define COEFFS4 NULL
+#elif 1 // 3/5
+#define LX0 1
+#define LX1 0
+#define SCALEX 0.5
+#define N1 2
+#define SHIFT1 1
+#define COEFFS1 (int[]){1,1}
+#define N2 2
+#define SHIFT2 2
+#define COEFFS2 (int[]){-1,-1}
+#define N3 0
+#define SHIFT3 0
+#define COEFFS3 NULL
+#define N4 0
+#define SHIFT4 0
+#define COEFFS4 NULL
+#elif 1 // 11/5 
+#define N1 0
+#define SHIFT1 1
+#define COEFFS1 NULL
+#define N2 2
+#define SHIFT2 2
+#define COEFFS2 (int[]){-1,-1}
+#define N3 2
+#define SHIFT3 0
+#define COEFFS3 (int[]){-1,-1}
+#define N4 4
+#define SHIFT4 7
+#define COEFFS4 (int[]){-5,29,29,-5}
+#define SCALEX 4
+#elif 1 // 9/7 CDF
+#define N1 2
+#define SHIFT1 7
+#define COEFFS1 (int[]){-203,-203}
+#define N2 2
+#define SHIFT2 12
+#define COEFFS2 (int[]){-217,-217}
+#define N3 2
+#define SHIFT3 7
+#define COEFFS3 (int[]){113,113}
+#define N4 2
+#define SHIFT4 9
+#define COEFFS4 (int[]){227,227}
+#define SCALEX 1
+#elif 1 // 7/5 CDF
+#define N1 0
+#define SHIFT1 1
+#define COEFFS1 NULL
+#define N2 2
+#define SHIFT2 2
+#define COEFFS2 (int[]){-1,-1}
+#define N3 2
+#define SHIFT3 0
+#define COEFFS3 (int[]){-1,-1}
+#define N4 2
+#define SHIFT4 4
+#define COEFFS4 (int[]){3,3}
+#elif 1 // 9/7 MN
+#define N1 4
+#define SHIFT1 4
+#define COEFFS1 (int[]){1,-9,-9,1}
+#define N2 2
+#define SHIFT2 2
+#define COEFFS2 (int[]){1,1}
+#define N3 0
+#define SHIFT3 1
+#define COEFFS3 NULL
+#define N4 0
+#define SHIFT4 1
+#define COEFFS4 NULL
+#else // 13/7 CRF
+#define N1 4
+#define SHIFT1 4
+#define COEFFS1 (int[]){1,-9,-9,1}
+#define N2 4
+#define SHIFT2 4
+#define COEFFS2 (int[]){-1,5,5,-1}
+#define N3 0
+#define SHIFT3 1
+#define COEFFS3 NULL
+#define N4 0
+#define SHIFT4 1
+#define COEFFS4 NULL
+#endif
+/**
+ * Forward horizontal transform of one row for the configurable "X" wavelet.
+ * Runs the four lifting steps selected by COEFFSn/Nn/SHIFTn in place, then
+ * de-interleaves the row: even-indexed samples go to b[0..w2), odd-indexed
+ * samples to b[w2..width).
+ */
+static void horizontal_decomposeX(DWTELEM *b, int width){
+    DWTELEM temp[width];
+    const int width2= width>>1;
+    const int w2= (width+1)>>1;
+    int x;
+
+    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
+    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
+    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
+    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
+
+    for(x=0; x<width2; x++){
+        temp[x   ]= b[2*x    ];
+        temp[x+w2]= b[2*x + 1];
+    }
+    if(width&1)
+        temp[x   ]= b[2*x    ]; // odd width: one extra even-indexed sample
+    /* size the copy by the element type, not by int */
+    memcpy(b, temp, width*sizeof(DWTELEM));
+}
+
+/**
+ * Inverse of horizontal_decomposeX() for one row: re-interleaves the two
+ * halves stored at b[0..w2) and b[w2..width), then undoes the four lifting
+ * steps in reverse order.
+ */
+static void horizontal_composeX(DWTELEM *b, int width){
+    DWTELEM temp[width];
+    const int width2= width>>1;
+    const int w2= (width+1)>>1;
+    int x;
+
+    /* size the copy by the element type, not by int */
+    memcpy(temp, b, width*sizeof(DWTELEM));
+    for(x=0; x<width2; x++){
+        b[2*x    ]= temp[x   ];
+        b[2*x + 1]= temp[x+w2];
+    }
+    if(width&1)
+        b[2*x    ]= temp[x   ]; // odd width: one extra even-indexed sample
+
+    inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
+    inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
+    inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
+    inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
+}
+
+/**
+ * Forward 2D transform of one level for the configurable "X" wavelet:
+ * scales every sample by SCALEX, transforms each row horizontally, then runs
+ * the four vertical lifting steps over the whole block.
+ */
+static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
+    int row, col;
+
+    for(row=0; row<height; row++){
+        DWTELEM *line= buffer + row*stride;
+
+        for(col=0; col<width; col++)
+            line[col] *= SCALEX;
+    }
+
+    for(row=0; row<height; row++)
+        horizontal_decomposeX(buffer + row*stride, width);
+
+    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
+    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
+    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
+    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
+}
+
+/**
+ * Inverse 2D transform of one level for the configurable "X" wavelet:
+ * undoes the four vertical lifting steps in reverse order, inverts each row
+ * horizontally, then divides every sample by SCALEX.
+ */
+static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
+    int row, col;
+
+    inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
+    inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
+    inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
+    inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
+
+    for(row=0; row<height; row++)
+        horizontal_composeX(buffer + row*stride, width);
+
+    for(row=0; row<height; row++){
+        DWTELEM *line= buffer + row*stride;
+
+        for(col=0; col<width; col++)
+            line[col] /= SCALEX;
+    }
+}
+
+/**
+ * Forward horizontal 5/3 transform of one row, in place.
+ * The row is first de-interleaved into temp (even-indexed samples in
+ * temp[0..w2), odd-indexed samples in temp[w2..width)). Two lifting steps
+ * then write the result to b: the second half becomes the odd samples minus
+ * (sum of the two even neighbours)>>1, the first half the even samples plus
+ * (sum of the two new second-half neighbours + 2)>>2.
+ */
+static void horizontal_decompose53i(DWTELEM *b, int width){
+    DWTELEM temp[width];
+    const int width2= width>>1;
+    const int w2= (width+1)>>1;
+    int x;
+
+    for(x=0; x<width2; x++){
+        temp[x   ]= b[2*x    ];
+        temp[x+w2]= b[2*x + 1];
+    }
+    if(width&1)
+        temp[x   ]= b[2*x    ]; // odd width: one extra even-indexed sample
+
+    lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
+    lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
+}
+
+/**
+ * Forward vertical 5/3 step on row b1: subtracts half the sum of the rows
+ * above (b0) and below (b2), column by column.
+ */
+static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] -= (b0[col] + b2[col])>>1;
+        col++;
+    }
+}
+
+/**
+ * Forward vertical 5/3 step on row b1: adds a rounded quarter of the sum of
+ * the rows above (b0) and below (b2), column by column.
+ */
+static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] += (b0[col] + b2[col] + 2)>>2;
+        col++;
+    }
+}
+/**
+ * Forward 2D 5/3 transform of one level, in place.
+ * Walks the image two rows per iteration with a sliding window of row
+ * pointers b0..b3; row indices outside the image are reflected by mirror().
+ * Each iteration transforms the newly reached rows horizontally and then
+ * applies the two vertical lifting steps to the rows whose neighbours are
+ * ready. The pointer comparisons skip a step when reflection has folded the
+ * later row of a pair back in front of the earlier one.
+ */
+static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
+    int y;
+    DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
+    DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
+  
+    for(y=-2; y<height; y+=2){
+        DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
+        DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
+
+{START_TIMER
+        if(b1 <= b3)     horizontal_decompose53i(b2, width);
+        if(y+2 < height) horizontal_decompose53i(b3, width);
+STOP_TIMER("horizontal_decompose53i")}
+        
+{START_TIMER
+        if(b1 <= b3) vertical_decompose53iH0(b1, b2, b3, width);
+        if(b0 <= b2) vertical_decompose53iL0(b0, b1, b2, width);
+STOP_TIMER("vertical_decompose53i*")}
+        
+        b0=b2; // slide the window down by two rows
+        b1=b3;
+    }
+}
+
+#define liftS lift
+#define lift5 lift
+#if 1
+#define W_AM 3
+#define W_AO 0
+#define W_AS 1
+
+#undef liftS
+#define W_BM 1
+#define W_BO 8
+#define W_BS 4
+
+#define W_CM 1
+#define W_CO 0
+#define W_CS 0
+
+#define W_DM 3
+#define W_DO 4
+#define W_DS 3
+#elif 0
+#define W_AM 55
+#define W_AO 16
+#define W_AS 5
+
+#define W_BM 3
+#define W_BO 32
+#define W_BS 6
+
+#define W_CM 127
+#define W_CO 64
+#define W_CS 7
+
+#define W_DM 7
+#define W_DO 8
+#define W_DS 4
+#elif 0
+#define W_AM 97
+#define W_AO 32
+#define W_AS 6
+
+#define W_BM 63
+#define W_BO 512
+#define W_BS 10
+
+#define W_CM 13
+#define W_CO 8
+#define W_CS 4
+
+#define W_DM 15
+#define W_DO 16
+#define W_DS 5
+
+#else
+
+#define W_AM 203
+#define W_AO 64
+#define W_AS 7
+
+#define W_BM 217
+#define W_BO 2048
+#define W_BS 12
+
+#define W_CM 113
+#define W_CO 64
+#define W_CS 7
+
+#define W_DM 227
+#define W_DO 128
+#define W_DS 9
+#endif
+static void horizontal_decompose97i(DWTELEM *b, int width){ // forward horizontal 9/7 transform of one row, in place
+    DWTELEM temp[width]; // scratch row for the intermediate lifting results
+    const int w2= (width+1)>>1; // count of even-indexed samples; the second half of the output starts here
+
+    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0); // step A: odd samples of b (stride 2) -> temp[w2..]
+    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0); // step B: even samples of b -> temp[0..w2); liftS is #undef'd in the active config above, so this is the real liftS()
+    lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0); // step C: writes the second half of b; lift5 is #defined to lift above
+    lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0); // step D: writes the first half of b
+}
+
+
+/**
+ * Forward vertical 9/7 step "A" on row b1: subtracts the W_AM-weighted,
+ * W_AO-offset, W_AS-shifted sum of the rows above (b0) and below (b2).
+ */
+static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] -= (W_AM*(b0[col] + b2[col])+W_AO)>>W_AS;
+        col++;
+    }
+}
+/**
+ * Forward vertical 9/7 step "C" on row b1, using rows b0 and b2 as neighbours.
+ * lift5 is defined as a macro earlier in this file and no #undef of it is
+ * visible before this point, so the first branch (plain W_CM/W_CO/W_CS
+ * arithmetic) is the one compiled; the #else branch reproduces the
+ * arithmetic of the lift5() function.
+ */
+static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int i;
+    
+    for(i=0; i<width; i++){
+#ifdef lift5
+        b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
+#else
+        int r= 3*(b0[i] + b2[i]);
+        r+= r>>4;
+        r+= r>>8;
+        b1[i] += (r+W_CO)>>W_CS;
+#endif
+    }
+}
+/**
+ * Forward vertical 9/7 step "B" on row b1, using rows b0 and b2 as neighbours.
+ * liftS is #undef'd in the active W_* configuration earlier in this file, so
+ * the #else branch is the one compiled. It is the forward branch of the LIFTS
+ * macro with the active W_BM (1) and W_BO (8) already folded into the
+ * constants: 4*(-(b0+b2) + 8) + 8 gives the -4*(b0+b2) + 8*5 seen here.
+ */
+static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int i;
+    
+    for(i=0; i<width; i++){
+#ifdef liftS
+        b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
+#else
+        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
+#endif
+    }
+}
+
+/**
+ * Forward vertical 9/7 step "D" on row b1: adds the W_DM-weighted,
+ * W_DO-offset, W_DS-shifted sum of the rows above (b0) and below (b2).
+ */
+static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] += (W_DM*(b0[col] + b2[col])+W_DO)>>W_DS;
+        col++;
+    }
+}
+/**
+ * Forward 2D 9/7 transform of one level, in place.
+ * Same scheme as spatial_decompose53i() but with a six-row window b0..b5 and
+ * four vertical lifting steps, starting four rows above the image; row
+ * indices outside the image are reflected by mirror(). Each iteration
+ * transforms the newly reached rows horizontally and then applies the
+ * vertical steps to the rows whose neighbours are ready. The pointer
+ * comparisons skip a step when reflection has folded the later row of a pair
+ * back in front of the earlier one.
+ * Timing results are only reported for widths above 400.
+ */
+static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
+    int y;
+    DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
+    DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
+    DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
+    DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
+  
+    for(y=-4; y<height; y+=2){
+        DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
+        DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
+
+{START_TIMER
+        if(b3 <= b5)     horizontal_decompose97i(b4, width);
+        if(y+4 < height) horizontal_decompose97i(b5, width);
+if(width>400){
+STOP_TIMER("horizontal_decompose97i")
+}}
+        
+{START_TIMER
+        if(b3 <= b5) vertical_decompose97iH0(b3, b4, b5, width);
+        if(b2 <= b4) vertical_decompose97iL0(b2, b3, b4, width);
+        if(b1 <= b3) vertical_decompose97iH1(b1, b2, b3, width);
+        if(b0 <= b2) vertical_decompose97iL1(b0, b1, b2, width);
+
+if(width>400){
+STOP_TIMER("vertical_decompose97i")
+}}
+        
+        b0=b2; // slide the window down by two rows
+        b1=b3;
+        b2=b4;
+        b3=b5;
+    }
+}
+
+/**
+ * Forward 2D wavelet transform of `buffer`, in place.
+ * Runs decomposition_count levels; level n works on a (width>>n) by
+ * (height>>n) area whose rows are stride<<n elements apart.
+ * @param type 0 selects the 9/7 transform, 1 the 5/3 transform, 2 the
+ *             configurable "X" transform; any other value does nothing
+ */
+void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
+    int level= 0;
+
+    while(level < decomposition_count){
+        if     (type == 0) spatial_decompose97i(buffer, width>>level, height>>level, stride<<level);
+        else if(type == 1) spatial_decompose53i(buffer, width>>level, height>>level, stride<<level);
+        else if(type == 2) spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level);
+        level++;
+    }
+}
+
+/**
+ * Inverse horizontal 5/3 transform of one row, in place.
+ * On entry the two halves of the row are stored at b[0..w2) and
+ * b[w2..width). Both lifting steps of the forward transform are undone into
+ * temp (first half first, since the second step needs its result) and the
+ * two halves are then interleaved back into b.
+ */
+static void horizontal_compose53i(DWTELEM *b, int width){
+    DWTELEM temp[width];
+    const int width2= width>>1;
+    const int w2= (width+1)>>1;
+    int x;
+
+    lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
+    lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
+
+    for(x=0; x<width2; x++){
+        b[2*x    ]= temp[x   ];
+        b[2*x + 1]= temp[x+w2];
+    }
+    if(width&1)
+        b[2*x    ]= temp[x   ]; // odd width: one extra even-indexed sample
+}
+
+/**
+ * Inverse vertical 5/3 step on row b1: adds back half the sum of the rows
+ * above (b0) and below (b2); undoes vertical_decompose53iH0().
+ */
+static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] += (b0[col] + b2[col])>>1;
+        col++;
+    }
+}
+
+/**
+ * Inverse vertical 5/3 step on row b1: removes the rounded quarter of the
+ * sum of the rows above (b0) and below (b2); undoes vertical_decompose53iL0().
+ */
+static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] -= (b0[col] + b2[col] + 2)>>2;
+        col++;
+    }
+}
+
+/**
+ * Prepares the 5/3 composition state for a slice buffer: the row cursor
+ * starts at -1 and b0/b1 are the slice-buffer lines for the mirrored rows
+ * -2 and -1.
+ */
+static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
+    const int last_row= height-1;
+
+    cs->y = -1;
+    cs->b0 = slice_buffer_get_line(sb, mirror(-2, last_row) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, mirror(-1, last_row) * stride_line);
+}
+
+/**
+ * Prepares the 5/3 composition state for a flat buffer: the row cursor
+ * starts at -1 and b0/b1 point at the mirrored rows -2 and -1 of buffer.
+ */
+static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
+    const int last_row= height-1;
+
+    cs->y = -1;
+    cs->b0 = buffer + mirror(-2, last_row)*stride;
+    cs->b1 = buffer + mirror(-1, last_row)*stride;
+}
+
+/**
+ * Advances the slice-buffer 5/3 composition by two rows.
+ *
+ * b0/b1 are the rows carried over from the previous call (rows y-1 and y);
+ * b2/b3 are fetched for rows y+1 and y+2, all indices passed through
+ * mirror(). The vertical lifting steps run first, then the horizontal
+ * composition of the rows that are finished, and finally the window slides
+ * down by two rows.
+ */
+static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+    int y= cs->y;
+    int mirror0 = mirror(y-1, height-1);
+    int mirror1 = mirror(y  , height-1);
+    int mirror2 = mirror(y+1, height-1);
+    int mirror3 = mirror(y+2, height-1);
+    
+    DWTELEM *b0= cs->b0;
+    DWTELEM *b1= cs->b1;
+    DWTELEM *b2= slice_buffer_get_line(sb, mirror2 * stride_line);
+    DWTELEM *b3= slice_buffer_get_line(sb, mirror3 * stride_line);
+
+    /* NOTE(review): the mirrorN comparisons presumably skip a step whose
+     * outer rows were reflected at an image edge -- confirm against mirror() */
+{START_TIMER
+        if(mirror1 <= mirror3) vertical_compose53iL0(b1, b2, b3, width);
+        if(mirror0 <= mirror2) vertical_compose53iH0(b0, b1, b2, width);
+STOP_TIMER("vertical_compose53i*")}
+
+{START_TIMER
+        /* row y-1 only exists once the cursor has moved past the top edge */
+        if(y-1 >= 0) horizontal_compose53i(b0, width);
+        if(mirror0 <= mirror2) horizontal_compose53i(b1, width);
+STOP_TIMER("horizontal_compose53i")}
+
+    /* slide the two-row window down */
+    cs->b0 = b2;
+    cs->b1 = b3;
+    cs->y += 2;
+}
+
+/**
+ * Advances the flat-buffer 5/3 composition by two rows.
+ *
+ * Same sequence as spatial_compose53i_dy_buffered(), but rows are addressed
+ * directly inside buffer. The guards compare row pointers instead of
+ * mirrored indices; because every row is buffer + mirror(..)*stride, this
+ * gives the same ordering for a positive stride.
+ */
+static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
+    int y= cs->y;
+    DWTELEM *b0= cs->b0;
+    DWTELEM *b1= cs->b1;
+    DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
+    DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
+
+{START_TIMER
+        if(b1 <= b3) vertical_compose53iL0(b1, b2, b3, width);
+        if(b0 <= b2) vertical_compose53iH0(b0, b1, b2, width);
+STOP_TIMER("vertical_compose53i*")}
+
+{START_TIMER
+        /* row y-1 only exists once the cursor has moved past the top edge */
+        if(y-1 >= 0) horizontal_compose53i(b0, width);
+        if(b0 <= b2) horizontal_compose53i(b1, width);
+STOP_TIMER("horizontal_compose53i")}
+
+    /* slide the two-row window down */
+    cs->b0 = b2;
+    cs->b1 = b3;
+    cs->y += 2;
+}
+
+/**
+ * Whole-image 5/3 composition: initialises a composition state and keeps
+ * stepping it two rows at a time until its row cursor has passed height.
+ */
+static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
+    dwt_compose_t state;
+
+    for(spatial_compose53i_init(&state, buffer, height, stride); state.y <= height; )
+        spatial_compose53i_dy(&state, buffer, width, height, stride);
+}
+
+ 
+/**
+ * Horizontal 9/7 composition of one row of width coefficients, in place.
+ * Four lifting passes (D, C, B, A coefficient sets) alternate between the row
+ * and a temporary copy; the last two passes write with step 2, so the result
+ * ends up interleaved in b.
+ */
+static void horizontal_compose97i(DWTELEM *b, int width){
+    DWTELEM temp[width];
+    const int half= (width+1)>>1;
+    DWTELEM *b_hi   = b    + half;
+    DWTELEM *temp_hi= temp + half;
+
+    lift (temp   , b      , b_hi   , 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
+    lift5(temp_hi, b_hi   , temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
+    liftS(b      , temp   , temp_hi, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
+    lift (b+1    , temp_hi, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
+}
+
+/**
+ * Vertical 9/7 lifting step "A": b1 += (W_AM*(b0+b2) + W_AO) >> W_AS,
+ * element by element over one row.
+ */
+static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] += (W_AM*(b0[col] + b2[col])+W_AO)>>W_AS;
+        col++;
+    }
+}
+
+/**
+ * Vertical 9/7 lifting step "C" over one row. When lift5 is a macro the
+ * plain W_CM multiply is used; otherwise the multiplier is built from
+ * 3*sum plus two shifted copies of itself.
+ */
+static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+#ifdef lift5
+        b1[col] -= (W_CM*(b0[col] + b2[col])+W_CO)>>W_CS;
+#else
+        int acc= 3*(b0[col] + b2[col]);
+        acc+= acc>>4;
+        acc+= acc>>8;
+        b1[col] -= (acc+W_CO)>>W_CS;
+#endif
+        col++;
+    }
+}
+
+/**
+ * Vertical 9/7 lifting step "B" over one row. When liftS is not a macro the
+ * update additionally feeds 4*b1 into the rounded sum.
+ */
+static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+#ifdef liftS
+        b1[col] += (W_BM*(b0[col] + b2[col])+W_BO)>>W_BS;
+#else
+        b1[col] += (W_BM*(b0[col] + b2[col])+4*b1[col]+W_BO)>>W_BS;
+#endif
+        col++;
+    }
+}
+
+/**
+ * Vertical 9/7 lifting step "D": b1 -= (W_DM*(b0+b2) + W_DO) >> W_DS,
+ * element by element over one row.
+ */
+static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
+    int col= 0;
+
+    while(col < width){
+        b1[col] -= (W_DM*(b0[col] + b2[col])+W_DO)>>W_DS;
+        col++;
+    }
+}
+
+/**
+ * Fused vertical 9/7 composition over six consecutive rows. For every column
+ * the four lifting steps run in the order D (on b4), C (on b3), B (on b2),
+ * A (on b1), each one using the rows updated by the previous step.
+ */
+static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+    int x;
+
+    for(x=0; x<width; x++){
+        /* step D */
+        b4[x] -= (W_DM*(b3[x] + b5[x])+W_DO)>>W_DS;
+
+        /* step C */
+#ifdef lift5
+        b3[x] -= (W_CM*(b2[x] + b4[x])+W_CO)>>W_CS;
+#else
+        {
+            int acc= 3*(b2[x] + b4[x]);
+            acc+= acc>>4;
+            acc+= acc>>8;
+            b3[x] -= (acc+W_CO)>>W_CS;
+        }
+#endif
+
+        /* step B */
+#ifdef liftS
+        b2[x] += (W_BM*(b1[x] + b3[x])+W_BO)>>W_BS;
+#else
+        b2[x] += (W_BM*(b1[x] + b3[x])+4*b2[x]+W_BO)>>W_BS;
+#endif
+
+        /* step A */
+        b1[x] += (W_AM*(b0[x] + b2[x])+W_AO)>>W_AS;
+    }
+}
+
+/**
+ * Prepares the 9/7 composition state for a slice buffer: the row cursor
+ * starts at -3 and b0..b3 are the slice-buffer lines for the mirrored rows
+ * -4 .. -1.
+ */
+static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
+    const int last_row= height-1;
+
+    cs->y = -3;
+    cs->b0 = slice_buffer_get_line(sb, mirror(-4, last_row) * stride_line);
+    cs->b1 = slice_buffer_get_line(sb, mirror(-3, last_row) * stride_line);
+    cs->b2 = slice_buffer_get_line(sb, mirror(-2, last_row) * stride_line);
+    cs->b3 = slice_buffer_get_line(sb, mirror(-1, last_row) * stride_line);
+}
+
+/**
+ * Prepares the 9/7 composition state for a flat buffer: the row cursor
+ * starts at -3 and b0..b3 point at the mirrored rows -4 .. -1 of buffer.
+ */
+static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
+    const int last_row= height-1;
+
+    cs->y = -3;
+    cs->b0 = buffer + mirror(-4, last_row)*stride;
+    cs->b1 = buffer + mirror(-3, last_row)*stride;
+    cs->b2 = buffer + mirror(-2, last_row)*stride;
+    cs->b3 = buffer + mirror(-1, last_row)*stride;
+}
+
+/**
+ * Advances the slice-buffer 9/7 composition by two rows.
+ *
+ * b0..b3 are carried over from the previous call (rows y-1 .. y+2); b4/b5
+ * are fetched for rows y+3 and y+4, all indices passed through mirror().
+ * Away from the top and bottom edges the fused vertical_compose97i() is
+ * used; otherwise the four lifting steps run one by one, each guarded by a
+ * comparison of mirrored row indices. Afterwards the finished rows are
+ * composed horizontally and the window slides down by two rows.
+ */
+static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+    int y = cs->y;
+    
+    int mirror0 = mirror(y - 1, height - 1);
+    int mirror1 = mirror(y + 0, height - 1);
+    int mirror2 = mirror(y + 1, height - 1);
+    int mirror3 = mirror(y + 2, height - 1);
+    int mirror4 = mirror(y + 3, height - 1);
+    int mirror5 = mirror(y + 4, height - 1);
+    DWTELEM *b0= cs->b0;
+    DWTELEM *b1= cs->b1;
+    DWTELEM *b2= cs->b2;
+    DWTELEM *b3= cs->b3;
+    DWTELEM *b4= slice_buffer_get_line(sb, mirror4 * stride_line);
+    DWTELEM *b5= slice_buffer_get_line(sb, mirror5 * stride_line);
+        
+{START_TIMER
+    /* interior rows: all six rows are distinct, use the fused kernel */
+    if(y>0 && y+4<height){
+        vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+    }else{
+        /* NOTE(review): the guards presumably skip steps whose outer rows
+         * were reflected at an image edge -- confirm against mirror() */
+        if(mirror3 <= mirror5) vertical_compose97iL1(b3, b4, b5, width);
+        if(mirror2 <= mirror4) vertical_compose97iH1(b2, b3, b4, width);
+        if(mirror1 <= mirror3) vertical_compose97iL0(b1, b2, b3, width);
+        if(mirror0 <= mirror2) vertical_compose97iH0(b0, b1, b2, width);
+    }
+/* the timer result is only reported for rows wider than 400 */
+if(width>400){
+STOP_TIMER("vertical_compose97i")}}
+
+{START_TIMER
+        /* row y-1 only exists once the cursor has moved past the top edge */
+        if(y-1>=  0) horizontal_compose97i(b0, width);
+        if(mirror0 <= mirror2) horizontal_compose97i(b1, width);
+if(width>400 && mirror0 <= mirror2){
+STOP_TIMER("horizontal_compose97i")}}
+
+    /* slide the four-row window down by two rows */
+    cs->b0=b2;
+    cs->b1=b3;
+    cs->b2=b4;
+    cs->b3=b5;
+    cs->y += 2;
+}
+
+/**
+ * Advances the flat-buffer 9/7 composition by two rows.
+ *
+ * Same sequence as spatial_compose97i_dy_buffered() without the fused fast
+ * path: the four vertical lifting steps always run one by one. The guards
+ * compare row pointers; because every row is buffer + mirror(..)*stride this
+ * gives the same ordering as comparing mirrored indices for a positive
+ * stride.
+ */
+static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
+    int y = cs->y;
+    DWTELEM *b0= cs->b0;
+    DWTELEM *b1= cs->b1;
+    DWTELEM *b2= cs->b2;
+    DWTELEM *b3= cs->b3;
+    DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
+    DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
+
+        /* disabled by the trailing "&& 0": would add fixed offsets to b5 */
+        if(stride == width && y+4 < height && 0){ 
+            int x;
+            for(x=0; x<width/2; x++)
+                b5[x] += 64*2;
+            for(; x<width; x++)
+                b5[x] += 169*2;
+        }
+        
+{START_TIMER
+        if(b3 <= b5) vertical_compose97iL1(b3, b4, b5, width);
+        if(b2 <= b4) vertical_compose97iH1(b2, b3, b4, width);
+        if(b1 <= b3) vertical_compose97iL0(b1, b2, b3, width);
+        if(b0 <= b2) vertical_compose97iH0(b0, b1, b2, width);
+/* the timer result is only reported for rows wider than 400 */
+if(width>400){
+STOP_TIMER("vertical_compose97i")}}
+
+{START_TIMER
+        /* row y-1 only exists once the cursor has moved past the top edge */
+        if(y-1>=  0) horizontal_compose97i(b0, width);
+        if(b0 <= b2) horizontal_compose97i(b1, width);
+if(width>400 && b0 <= b2){
+STOP_TIMER("horizontal_compose97i")}}
+
+    /* slide the four-row window down by two rows */
+    cs->b0=b2;
+    cs->b1=b3;
+    cs->b2=b4;
+    cs->b3=b5;
+    cs->y += 2;
+}
+
+/**
+ * Whole-image 9/7 composition: initialises a composition state and keeps
+ * stepping it two rows at a time until its row cursor has passed height.
+ */
+static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
+    dwt_compose_t state;
+
+    for(spatial_compose97i_init(&state, buffer, height, stride); state.y <= height; )
+        spatial_compose97i_dy(&state, buffer, width, height, stride);
+}
+
+/**
+ * Initialises one composition state per decomposition level for the
+ * slice-buffer inverse transform, from the coarsest level down to level 0.
+ *
+ * @param cs   array with at least decomposition_count entries
+ * @param type 0 = 9/7, 1 = 5/3; type 2 has no slice-buffer variant and only
+ *             logs an error for each level
+ * width is not used here.
+ */
+void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
+    int level= decomposition_count;
+
+    while(--level >= 0){
+        if(type == 0)
+            spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level);
+        else if(type == 1)
+            spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level);
+        else if(type == 2) /* not slicified yet */
+            av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n");
+    }
+}
+
+/**
+ * Initialises one composition state per decomposition level for the
+ * flat-buffer inverse transform, from the coarsest level down to level 0.
+ *
+ * For type 2 there is no sliced implementation: spatial_composeX() is run
+ * right here on each level, and ff_spatial_idwt_slice() then does nothing.
+ */
+void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
+    int level= decomposition_count;
+
+    while(--level >= 0){
+        if(type == 0)
+            spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level);
+        else if(type == 1)
+            spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level);
+        else if(type == 2) /* not slicified yet */
+            spatial_composeX(buffer, width>>level, height>>level, stride<<level);
+    }
+}
+
+/**
+ * Advances the flat-buffer inverse transform far enough for output row y.
+ * Levels are visited from coarsest to finest; each level is stepped until
+ * its row cursor passes min((y>>level) + support, height>>level), where
+ * support is 3 for type 1 and 5 otherwise. Type 2 returns immediately.
+ */
+void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
+    const int support= (type==1) ? 3 : 5;
+    int level;
+
+    if(type==2)
+        return;
+
+    for(level=decomposition_count-1; level>=0; level--){
+        dwt_compose_t *level_cs= cs+level;
+        const int last= FFMIN((y>>level)+support, height>>level);
+
+        while(level_cs->y <= last){
+            if(type==0)
+                spatial_compose97i_dy(level_cs, buffer, width>>level, height>>level, stride<<level);
+            else if(type==1)
+                spatial_compose53i_dy(level_cs, buffer, width>>level, height>>level, stride<<level);
+        }
+    }
+}
+
+/**
+ * Slice-buffer counterpart of ff_spatial_idwt_slice(): advances every level,
+ * coarsest first, until its row cursor passes
+ * min((y>>level) + support, height>>level). support is 3 for type 1 and 5
+ * otherwise. Type 2 returns immediately.
+ */
+void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
+    const int support= (type==1) ? 3 : 5;
+    int level;
+
+    if(type==2)
+        return;
+
+    for(level=decomposition_count-1; level>=0; level--){
+        dwt_compose_t *level_cs= cs+level;
+        const int last= FFMIN((y>>level)+support, height>>level);
+
+        while(level_cs->y <= last){
+            if(type==0)
+                spatial_compose97i_dy_buffered(level_cs, slice_buf, width>>level, height>>level, stride_line<<level);
+            else if(type==1)
+                spatial_compose53i_dy_buffered(level_cs, slice_buf, width>>level, height>>level, stride_line<<level);
+        }
+    }
+}
+
+/**
+ * Full inverse spatial transform of buffer, in place.
+ * Types other than 2 go through the sliced implementation, requesting rows
+ * in steps of 4; type 2 runs spatial_composeX() once per level, coarsest
+ * level first.
+ */
+void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
+    if(type != 2){
+        dwt_compose_t cs[MAX_DECOMPOSITIONS];
+        int y= 0;
+
+        ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
+        while(y < height){
+            ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
+            y+= 4;
+        }
+    }else{
+        int level= decomposition_count;
+
+        while(--level >= 0)
+            spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
+    }
+}
+
+/**
+ * Range-codes one subband with zero-run coding in all-zero contexts.
+ *
+ * Two passes over the w x h coefficients of src:
+ *  1. collect, into runs[], the lengths of the zero runs among the positions
+ *     whose neighbourhood (left, top-left, top, top-right, parent) is all
+ *     zero;
+ *  2. write the run count and then, per coefficient, either a context-coded
+ *     "non-zero" flag (neighbourhood not all zero) or the run lengths,
+ *     followed by magnitude-1 and sign for every non-zero coefficient.
+ *
+ * @param parent coefficients of the parent band, or NULL
+ * @return 0 on success, -1 if fewer than w*40 bytes are left in the output
+ *         buffer at the start of a row
+ *
+ * NOTE(review): runs[] is a variable-length array of w*h ints on the stack.
+ * orientation is only referenced from commented-out code.
+ */
+static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
+    const int w= b->width;
+    const int h= b->height;
+    int x, y;
+
+    if(1){
+        int run=0;
+        int runs[w*h];
+        int run_index=0;
+        int max_index;
+                
+        /* pass 1: gather the zero-run lengths */
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height) 
+                        p= parent[px + py*2*stride];
+                }
+                /* only positions with an all-zero neighbourhood take part in runs */
+                if(!(/*ll|*/l|lt|t|rt|p)){
+                    if(v){
+                        runs[run_index++]= run;
+                        run=0;
+                    }else{
+                        run++;
+                    }
+                }
+            }
+        }
+        /* max_index = number of completed runs; the trailing run is stored
+         * too but, per the run_index <= max_index checks, never written */
+        max_index= run_index;
+        runs[run_index++]= run;
+        run_index=0;
+        run= runs[run_index++];
+
+        put_symbol2(&s->c, b->state[30], max_index, 0);
+        if(run_index <= max_index)
+            put_symbol2(&s->c, b->state[1], run, 3);
+        
+        /* pass 2: emit the symbols, recomputing the same neighbourhood */
+        for(y=0; y<h; y++){
+            if(s->c.bytestream_end - s->c.bytestream < w*40){
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
+            for(x=0; x<w; x++){
+                int v, p=0;
+                int /*ll=0, */l=0, lt=0, t=0, rt=0;
+                v= src[x + y*stride];
+
+                if(y){
+                    t= src[x + (y-1)*stride];
+                    if(x){
+                        lt= src[x - 1 + (y-1)*stride];
+                    }
+                    if(x + 1 < w){
+                        rt= src[x + 1 + (y-1)*stride];
+                    }
+                }
+                if(x){
+                    l= src[x - 1 + y*stride];
+                    /*if(x > 1){
+                        if(orientation==1) ll= src[y + (x-2)*stride];
+                        else               ll= src[x - 2 + y*stride];
+                    }*/
+                }
+                if(parent){
+                    int px= x>>1;
+                    int py= y>>1;
+                    if(px<b->parent->width && py<b->parent->height) 
+                        p= parent[px + py*2*stride];
+                }
+                if(/*ll|*/l|lt|t|rt|p){
+                    /* non-zero neighbourhood: code the zero/non-zero flag directly */
+                    int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
+
+                    put_rac(&s->c, &b->state[0][context], !!v);
+                }else{
+                    /* all-zero neighbourhood: covered by the run lengths */
+                    if(!run){
+                        run= runs[run_index++];
+
+                        if(run_index <= max_index)
+                            put_symbol2(&s->c, b->state[1], run, 3);
+                        assert(v);
+                    }else{
+                        run--;
+                        assert(!v);
+                    }
+                }
+                if(v){
+                    /* magnitude-1, then the sign in a context built from the
+                     * left and top neighbours */
+                    int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
+                    int l2= 2*ABS(l) + (l<0);
+                    int t2= 2*ABS(t) + (t<0);
+
+                    put_symbol2(&s->c, b->state[context + 2], ABS(v)-1, context-4);
+                    put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+ * Encodes one subband with the currently selected coefficient coder.
+ * Only the c0run coder is wired in; the qtree, z0run and dzr variants are
+ * not enabled.
+ *
+ * @return the result of encode_subband_c0run()
+ */
+static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
+    const int ret= encode_subband_c0run(s, b, src, parent, stride, orientation);
+
+    return ret;
+}
+
+/**
+ * Range-decodes one subband into the sparse list b->x_coeff.
+ *
+ * Each row is stored as a sequence of (x, coeff) entries for the non-zero
+ * coefficients, closed by an entry with x = w+1; one more end marker follows
+ * the last row. coeff is packed as 2*magnitude + sign bit. The previous
+ * row's list and, if parent is given, the parent band's list are walked in
+ * step to rebuild the neighbourhood used for context selection.
+ *
+ * orientation is not used here.
+ */
+static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+    
+    if(1){
+        int run, runs;
+        x_and_coeff *xc= b->x_coeff;
+        x_and_coeff *prev_xc= NULL;
+        x_and_coeff *prev2_xc= xc;
+        x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
+        x_and_coeff *prev_parent_xc= parent_xc;
+
+        /* number of runs, then the first run length (INT_MAX if there is none) */
+        runs= get_symbol2(&s->c, b->state[30], 0);
+        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+        else           run= INT_MAX;
+
+        for(y=0; y<h; y++){
+            int v=0;
+            int lt=0, t=0, rt=0;
+
+            /* prev_xc is only dereferenced for y > 0; it is NULL on the first row */
+            if(y && prev_xc->x == 0){
+                rt= prev_xc->coeff;
+            }
+            for(x=0; x<w; x++){
+                int p=0;
+                const int l= v;
+                
+                /* shift the top-row window one column to the right */
+                lt= t; t= rt;
+
+                if(y){
+                    if(prev_xc->x <= x)
+                        prev_xc++;
+                    if(prev_xc->x == x + 1)
+                        rt= prev_xc->coeff;
+                    else
+                        rt=0;
+                }
+                if(parent_xc){
+                    if(x>>1 > parent_xc->x){
+                        parent_xc++;
+                    }
+                    if(x>>1 == parent_xc->x){
+                        p= parent_xc->coeff;
+                    }
+                }
+                if(/*ll|*/l|lt|t|rt|p){
+                    /* neighbours are in packed form, so >>1 yields the magnitude
+                     * and t&~1 twice the magnitude */
+                    int context= av_log2(/*ABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
+
+                    v=get_rac(&s->c, &b->state[0][context]);
+                    if(v){
+                        v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
+                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
+                        
+                        xc->x=x;
+                        (xc++)->coeff= v;
+                    }
+                }else{
+                    if(!run){
+                        /* run exhausted: a non-zero coefficient follows */
+                        if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
+                        else           run= INT_MAX;
+                        v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
+                        v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
+                        
+                        xc->x=x;
+                        (xc++)->coeff= v;
+                    }else{
+                        int max_run;
+                        run--;
+                        v=0;
+
+                        /* NOTE(review): appears to skip ahead over columns that
+                         * are known to stay in the all-zero context, bounded by
+                         * the next entry of the previous row / parent row */
+                        if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
+                        else  max_run= FFMIN(run, w-x-1);
+                        if(parent_xc)
+                            max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
+                        x+= max_run;
+                        run-= max_run;
+                    }
+                }
+            }
+            (xc++)->x= w+1; //end marker
+            /* the row just written becomes the "previous row" of the next one */
+            prev_xc= prev2_xc;
+            prev2_xc= xc;
+            
+            if(parent_xc){
+                if(y&1){
+                    /* after an odd row: move past the parent row's end marker */
+                    while(parent_xc->x != parent->width+1)
+                        parent_xc++;
+                    parent_xc++;
+                    prev_parent_xc= parent_xc;
+                }else{
+                    /* after an even row: rewind to the start of the same parent row */
+                    parent_xc= prev_parent_xc;
+                }
+            }
+        }
+
+        (xc++)->x= w+1; //end marker
+    }
+}
+
+/**
+ * Dequantises rows start_y .. h-1 of subband b from its sparse coefficient
+ * list (b->x_coeff) into the slice buffer.
+ *
+ * Each target line is cleared and the listed coefficients are written at
+ * their x positions; a list entry with x >= w ends the row. The read
+ * position in the list is kept in save_state[0] between calls, so slices
+ * have to be decoded in increasing row order.
+ *
+ * @param save_state one-element array: list index to resume from (read when
+ *                   start_y != 0) and the index reached on return
+ */
+static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
+    const int w= b->width;
+    int x,y; /* this x is shadowed by the one declared inside the row loop */
+    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
+    int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int new_index = 0;
+    
+    START_TIMER
+
+    /* no scaling for the band stored directly in spatial_dwt_buffer, nor in
+     * lossless mode */
+    if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
+        qadd= 0;
+        qmul= 1<<QEXPSHIFT;
+    }
+
+    /* If we are on the second or later slice, restore our index. */
+    if (start_y != 0)
+        new_index = save_state[0];
+
+        
+    for(y=start_y; y<h; y++){
+        int x = 0;
+        int v;
+        DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
+        memset(line, 0, b->width*sizeof(DWTELEM));
+        v = b->x_coeff[new_index].coeff;
+        x = b->x_coeff[new_index++].x;
+        while(x < w)
+        {
+            /* v is packed as 2*magnitude + sign: scale the magnitude, then
+             * negate via (t^u)-u when the sign bit is set (u == -1) */
+            register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
+            register int u= -(v&1);
+            line[x] = (t^u) - u;
+
+            v = b->x_coeff[new_index].coeff;
+            x = b->x_coeff[new_index++].x;
+        }
+    }
+    if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
+        STOP_TIMER("decode_subband")
+    }
+        
+    /* Save our variables for the next slice. */
+    save_state[0] = new_index;
+        
+    return;
+}
+
+/**
+ * Sets every adaptive coder state to MID_STATE: the header and block states
+ * and the state tables of all subbands of the three planes. Level 0 has
+ * orientations 0..3, deeper levels only 1..3.
+ */
+static void reset_contexts(SnowContext *s){
+    int plane_index;
+
+    memset(s->header_state, MID_STATE, sizeof(s->header_state));
+    memset(s->block_state, MID_STATE, sizeof(s->block_state));
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        int level;
+
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            int orientation= level ? 1 : 0;
+
+            while(orientation<4){
+                memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
+                orientation++;
+            }
+        }
+    }
+}
+
+/**
+ * Computes the block grid size from the picture dimensions (rounded up to
+ * whole macroblocks of 1<<LOG2_MB_SIZE pixels) and allocates the zeroed
+ * BlockNode array, with 4^block_max_depth nodes per macroblock.
+ *
+ * @return 0 on success, -1 if the allocation failed (s->block is NULL then)
+ */
+static int alloc_blocks(SnowContext *s){
+    /* -((-n)>>k) is a ceiling division by 1<<k */
+    int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
+    int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
+
+    s->b_width = w;
+    s->b_height= h;
+
+    s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
+    if(!s->block)
+        return -1;
+    return 0;
+}
+
+/**
+ * Copies the whole range coder state from s to d, except that d keeps its
+ * own bytestream and bytestream_start pointers.
+ */
+static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
+    uint8_t * const saved_pos  = d->bytestream;
+    uint8_t * const saved_start= d->bytestream_start;
+
+    *d= *s;
+    d->bytestream_start= saved_start;
+    d->bytestream      = saved_pos;
+}
+
+//near copy & paste from dsputil, FIXME
+/**
+ * Sum of the pixels of a w x w square.
+ *
+ * @param pix       top-left pixel of the square
+ * @param line_size distance in bytes between the starts of two rows
+ * @param w         edge length of the square
+ */
+static int pix_sum(uint8_t * pix, int line_size, int w)
+{
+    int total = 0;
+    int row, col;
+
+    for (row = 0; row < w; row++) {
+        const uint8_t *line = pix + row * line_size;
+
+        for (col = 0; col < w; col++)
+            total += line[col];
+    }
+    return total;
+}
+
+//near copy & paste from dsputil, FIXME
+/**
+ * Sum over a w x w square of squareTbl[256 + pixel] for every pixel.
+ *
+ * @param pix       top-left pixel of the square
+ * @param line_size distance in bytes between the starts of two rows
+ * @param w         edge length of the square
+ */
+static int pix_norm1(uint8_t * pix, int line_size, int w)
+{
+    uint32_t *sq = squareTbl + 256;
+    int total = 0;
+    int row, col;
+
+    for (row = 0; row < w; row++) {
+        const uint8_t *line = pix + row * line_size;
+
+        for (col = 0; col < w; col++)
+            total += sq[line[col]];
+    }
+    return total;
+}
+
+/**
+ * Fills the square of finest-level BlockNodes covered by block (x,y) of the
+ * given level with one and the same node: colour (l, cb, cr), motion vector
+ * (mx, my), type and level. The square is 1<<(block_max_depth-level) nodes
+ * wide and high.
+ */
+static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int type){
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int block_w= 1<<rem_depth;
+    const int index= (x + y*w) << rem_depth;
+    BlockNode node;
+    int row, col;
+
+    node.level= level;
+    node.type= type;
+    node.mx= mx;
+    node.my= my;
+    node.color[0]= l;
+    node.color[1]= cb;
+    node.color[2]= cr;
+
+    for(row=0; row<block_w; row++){
+        BlockNode *line= &s->block[index + row*w];
+
+        for(col=0; col<block_w; col++)
+            line[col]= node;
+    }
+}
+
+/**
+ * Points the motion estimation context at the source planes and at the
+ * reference planes shifted to position (x,y). The luma offset is
+ * y*stride + x; both chroma planes use (y*uvstride + x)>>1.
+ * Only ref_index 0 is supported (asserted); ref2 is not used.
+ */
+static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
+    const int luma_offset  =  y*c->  stride + x;
+    const int chroma_offset= (y*c->uvstride + x)>>1;
+    int plane;
+
+    for(plane=0; plane<3; plane++){
+        c->src[0][plane]= src[plane];
+        c->ref[0][plane]= ref[plane] + (plane ? chroma_offset : luma_offset);
+    }
+    assert(!ref_index);
+}
+
+//FIXME copy&paste
+/* Shorthands for rows of a local predictor table named P (each row is an
+ * {x, y} pair); encode_q_branch() below fills P_LEFT, P_TOP, P_TOPRIGHT and
+ * P_MEDIAN before the motion search. */
+#define P_LEFT P[1]
+#define P_TOP P[2]
+#define P_TOPRIGHT P[3]
+#define P_MEDIAN P[4]
+#define P_MV1 P[9]
+#define FLAG_QPEL   1 //must be 1
+
+/**
+ * Encodes block (x,y) of quadtree level `level` and returns its score.
+ *
+ * On keyframes the block is simply marked intra with the predicted colour
+ * and motion vector and 0 is returned. Otherwise three alternatives are
+ * evaluated:
+ *  - inter: motion search against last_picture, coded into p_buffer;
+ *  - intra: mean colour of the block, coded into i_buffer;
+ *  - split: if level != block_max_depth, the four children are encoded
+ *    recursively straight into the real bitstream.
+ * If the split is cheaper than both others its output is kept. Otherwise the
+ * bitstream is rewound to where it was on entry and the cheaper of the
+ * intra/inter trial encodings is copied in, together with its coder and
+ * block_state.
+ *
+ * @return 0 on keyframes, else the score of the alternative that was kept
+ */
+static int encode_q_branch(SnowContext *s, int level, int x, int y){
+    uint8_t p_buffer[1024];
+    uint8_t i_buffer[1024];
+    uint8_t p_state[sizeof(s->block_state)];
+    uint8_t i_state[sizeof(s->block_state)];
+    RangeCoder pc, ic;
+    /* bitstream position on entry, needed to undo the children's output */
+    uint8_t *pbbak= s->c.bytestream;
+    uint8_t *pbbak_start= s->c.bytestream_start;
+    int score, score2, iscore, i_len, p_len, block_s, sum;
+    const int w= s->b_width  << s->block_max_depth;
+    const int h= s->b_height << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    const int block_w= 1<<(LOG2_MB_SIZE - level);
+    /* stand-in neighbour used where the real one is outside the picture */
+    static BlockNode null_block= { //FIXME add border maybe
+        .color= {128,128,128},
+        .mx= 0,
+        .my= 0,
+        .type= 0,
+        .level= 0,
+    };
+    int trx= (x+1)<<rem_depth;
+    int try= (y+1)<<rem_depth;
+    BlockNode *left  = x ? &s->block[index-1] : &null_block;
+    BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
+    BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
+    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    /* predictions: colour from the left block, motion vector as the median
+     * of left, top and top-right */
+    int pl = left->color[0];
+    int pcb= left->color[1];
+    int pcr= left->color[2];
+    int pmx= mid_pred(left->mx, top->mx, tr->mx);
+    int pmy= mid_pred(left->my, top->my, tr->my);
+    int mx=0, my=0;
+    int l,cr,cb, i;
+    const int stride= s->current_picture.linesize[0];
+    const int uvstride= s->current_picture.linesize[1];
+    const int instride= s->input_picture.linesize[0];
+    const int uvinstride= s->input_picture.linesize[1];
+    uint8_t *new_l = s->input_picture.data[0] + (x + y*  instride)*block_w;
+    uint8_t *new_cb= s->input_picture.data[1] + (x + y*uvinstride)*block_w/2;
+    uint8_t *new_cr= s->input_picture.data[2] + (x + y*uvinstride)*block_w/2;
+    /* NOTE(review): variable-length array on the stack, 3*stride*block_w bytes */
+    uint8_t current_mb[3][stride*block_w];
+    uint8_t *current_data[3]= {&current_mb[0][0], &current_mb[1][0], &current_mb[2][0]};
+    int P[10][2];
+    int16_t last_mv[3][2];
+    int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
+    const int shift= 1+qpel;
+    MotionEstContext *c= &s->m.me;
+    int mx_context= av_log2(2*ABS(left->mx - top->mx));
+    int my_context= av_log2(2*ABS(left->my - top->my));
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+
+    assert(sizeof(s->block_state) >= 256);
+    /* keyframes carry no block data: mark the block intra and stop */
+    if(s->keyframe){
+        set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
+        return 0;
+    }
+
+    /* copy the input block into current_mb, laid out with the strides of
+     * current_picture */
+    //FIXME optimize
+    for(i=0; i<block_w; i++)
+        memcpy(&current_mb[0][0] +   stride*i, new_l  +   instride*i, block_w);
+    for(i=0; i<block_w>>1; i++)
+        memcpy(&current_mb[1][0] + uvstride*i, new_cb + uvinstride*i, block_w>>1);
+    for(i=0; i<block_w>>1; i++)
+        memcpy(&current_mb[2][0] + uvstride*i, new_cr + uvinstride*i, block_w>>1);
+
+//    clip predictors / edge ?
+
+    /* motion vector candidates handed to the motion search */
+    P_LEFT[0]= left->mx;
+    P_LEFT[1]= left->my;
+    P_TOP [0]= top->mx;
+    P_TOP [1]= top->my;
+    P_TOPRIGHT[0]= tr->mx;
+    P_TOPRIGHT[1]= tr->my;
+    
+    last_mv[0][0]= s->block[index].mx;
+    last_mv[0][1]= s->block[index].my;
+    last_mv[1][0]= right->mx;
+    last_mv[1][1]= right->my;
+    last_mv[2][0]= bottom->mx;
+    last_mv[2][1]= bottom->my;
+    
+    s->m.mb_stride=2;
+    s->m.mb_x= 
+    s->m.mb_y= 0;
+    s->m.me.skip= 0;
+
+    init_ref(c, current_data, s->last_picture.data, NULL, block_w*x, block_w*y, 0);
+    
+    assert(s->m.me.  stride ==   stride);
+    assert(s->m.me.uvstride == uvstride);
+    
+    c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
+    c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
+    c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
+    c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
+    
+    /* search range relative to the block position */
+    c->xmin = - x*block_w - 16+2;
+    c->ymin = - y*block_w - 16+2;
+    c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
+    c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
+
+    /* clip the candidates to the search range */
+    if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
+    if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift); 
+    if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
+    if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
+    if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
+    if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
+    if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
+
+    P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
+    P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
+
+    /* first block row: predict from the left block only */
+    if (!y) {
+        c->pred_x= P_LEFT[0];
+        c->pred_y= P_LEFT[1];
+    } else {
+        c->pred_x = P_MEDIAN[0];
+        c->pred_y = P_MEDIAN[1];
+    }
+
+    score= ff_epzs_motion_search(&s->m, &mx, &my, P, 0, /*ref_index*/ 0, last_mv, 
+                             (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
+
+    assert(mx >= c->xmin);
+    assert(mx <= c->xmax);
+    assert(my >= c->ymin);
+    assert(my <= c->ymax);
+    
+    score= s->m.me.sub_motion_search(&s->m, &mx, &my, score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
+    score= ff_get_mb_score(&s->m, mx, my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
+    //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
+                             
+  //  subpel search
+    /* trial-encode the inter block into p_buffer with a copy of the coder */
+    pc= s->c;
+    pc.bytestream_start=
+    pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
+    memcpy(p_state, s->block_state, sizeof(s->block_state));
+
+    if(level!=s->block_max_depth)
+        put_rac(&pc, &p_state[4 + s_context], 1);
+    put_rac(&pc, &p_state[1 + left->type + top->type], 0);
+    put_symbol(&pc, &p_state[128 + 32*mx_context], mx - pmx, 1);
+    put_symbol(&pc, &p_state[128 + 32*my_context], my - pmy, 1);
+    p_len= pc.bytestream - pc.bytestream_start;
+    /* add the bit cost of the trial encoding, weighted by lambda2 */
+    score += (s->lambda2*(p_len*8
+              + (pc.outstanding_count - s->c.outstanding_count)*8
+              + (-av_log2(pc.range)    + av_log2(s->c.range))
+             ))>>FF_LAMBDA_SHIFT;
+
+    /* intra candidate: rounded mean per plane; iscore starts as the luma
+     * squared error around that mean (chroma terms are commented out) */
+    block_s= block_w*block_w;
+    sum = pix_sum(&current_mb[0][0], stride, block_w);
+    l= (sum + block_s/2)/block_s;
+    iscore = pix_norm1(&current_mb[0][0], stride, block_w) - 2*l*sum + l*l*block_s;
+    
+    block_s= block_w*block_w>>2;
+    sum = pix_sum(&current_mb[1][0], uvstride, block_w>>1);
+    cb= (sum + block_s/2)/block_s;
+//    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
+    sum = pix_sum(&current_mb[2][0], uvstride, block_w>>1);
+    cr= (sum + block_s/2)/block_s;
+//    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
+
+    /* trial-encode the intra block into i_buffer with a copy of the coder */
+    ic= s->c;
+    ic.bytestream_start=
+    ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
+    memcpy(i_state, s->block_state, sizeof(s->block_state));
+    if(level!=s->block_max_depth)
+        put_rac(&ic, &i_state[4 + s_context], 1);
+    put_rac(&ic, &i_state[1 + left->type + top->type], 1);
+    put_symbol(&ic, &i_state[32],  l-pl , 1);
+    put_symbol(&ic, &i_state[64], cb-pcb, 1);
+    put_symbol(&ic, &i_state[96], cr-pcr, 1);
+    i_len= ic.bytestream - ic.bytestream_start;
+    iscore += (s->lambda2*(i_len*8
+              + (ic.outstanding_count - s->c.outstanding_count)*8
+              + (-av_log2(ic.range)    + av_log2(s->c.range))
+             ))>>FF_LAMBDA_SHIFT;
+
+//    assert(score==256*256*256*64-1);
+    assert(iscore < 255*255*256 + s->lambda2*10);
+    assert(iscore >= 0);
+    assert(l>=0 && l<=255);
+    assert(pl>=0 && pl<=255);
+
+    /* top-level blocks also feed the scene change score */
+    if(level==0){
+        int varc= iscore >> 8;
+        int vard= score >> 8;
+        if (vard <= 64 || vard < varc)
+            c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
+        else
+            c->scene_change_score+= s->m.qscale;
+    }
+        
+    /* split candidate: the children write into the real bitstream */
+    if(level!=s->block_max_depth){
+        put_rac(&s->c, &s->block_state[4 + s_context], 0);
+        score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
+        score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
+        score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
+        score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
+    
+        if(score2 < score && score2 < iscore)
+            return score2;
+    }
+    
+    /* no split: rewind to the entry position and install the cheaper of the
+     * two trial encodings with its coder and block state */
+    if(iscore < score){
+        memcpy(pbbak, i_buffer, i_len);
+        s->c= ic;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + i_len;
+        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, BLOCK_INTRA);
+        memcpy(s->block_state, i_state, sizeof(s->block_state));
+        return iscore;
+    }else{
+        memcpy(pbbak, p_buffer, p_len);
+        s->c= pc;
+        s->c.bytestream_start= pbbak_start;
+        s->c.bytestream= pbbak + p_len;
+        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, 0);
+        memcpy(s->block_state, p_state, sizeof(s->block_state));
+        return score;
+    }
+}
+
+static void decode_q_branch(SnowContext *s, int level, int x, int y){ // decode the block at (x,y) of quadtree depth 'level'; recurses into the four children when the block is split
+    const int w= s->b_width << s->block_max_depth;
+    const int rem_depth= s->block_max_depth - level;
+    const int index= (x + y*w) << rem_depth;
+    static BlockNode null_block= { //FIXME add border maybe
+        .color= {128,128,128},
+        .mx= 0,
+        .my= 0,
+        .type= 0,
+        .level= 0,
+    };
+    int trx= (x+1)<<rem_depth;
+    BlockNode *left  = x ? &s->block[index-1] : &null_block; // null_block is used when x==0 (left) or y==0 (top)
+    BlockNode *top   = y ? &s->block[index-w] : &null_block;
+    BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
+    BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
+    int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    // keyframes: no symbol is read here, the block is set to null_block's values with type BLOCK_INTRA
+    if(s->keyframe){
+        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, BLOCK_INTRA);
+        return;
+    }
+    // leaf when at maximum depth or when get_rac() returns nonzero; otherwise the four child blocks are decoded
+    if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
+        int type;
+        int l = left->color[0]; // colour prediction: the left neighbour's values
+        int cb= left->color[1];
+        int cr= left->color[2];
+        int mx= mid_pred(left->mx, top->mx, tr->mx); // motion vector prediction: mid_pred() of left, top and top-right
+        int my= mid_pred(left->my, top->my, tr->my);
+        int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 0*av_log2(2*ABS(tr->mx - top->mx));
+        int my_context= av_log2(2*ABS(left->my - top->my)) + 0*av_log2(2*ABS(tr->my - top->my));
+        // the get_rac() result below selects BLOCK_INTRA (colour deltas follow) or type 0 (motion vector deltas follow)
+        type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
+
+        if(type){
+            l += get_symbol(&s->c, &s->block_state[32], 1); // decoded values are added to the prediction
+            cb+= get_symbol(&s->c, &s->block_state[64], 1);
+            cr+= get_symbol(&s->c, &s->block_state[96], 1);
+        }else{
+            mx+= get_symbol(&s->c, &s->block_state[128 + 32*mx_context], 1);
+            my+= get_symbol(&s->c, &s->block_state[128 + 32*my_context], 1);
+        }
+        set_blocks(s, level, x, y, l, cb, cr, mx, my, type);
+    }else{
+        decode_q_branch(s, level+1, 2*x+0, 2*y+0);
+        decode_q_branch(s, level+1, 2*x+1, 2*y+0);
+        decode_q_branch(s, level+1, 2*x+0, 2*y+1);
+        decode_q_branch(s, level+1, 2*x+1, 2*y+1);
+    }
+}
+
+static void encode_blocks(SnowContext *s){ // calls encode_q_branch() at level 0 for every block position, row by row
+    int x, y;
+    int w= s->b_width;
+    int h= s->b_height;
+
+    for(y=0; y<h; y++){
+        if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
+            av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+            return; // remaining rows are not encoded; the void return gives the caller no error indication
+        }
+        for(x=0; x<w; x++){
+            encode_q_branch(s, 0, x, y);
+        }
+    }
+}
+
+static void decode_blocks(SnowContext *s){ // calls decode_q_branch() at level 0 for every block position, row by row
+    int x, y;
+    int w= s->b_width;
+    int h= s->b_height;
+
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            decode_q_branch(s, 0, x, y);
+        }
+    }
+}
+
+static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){ // two-pass 6-tap interpolation: src -> tmp along x, then tmp -> dst along y
+    int x, y;
+START_TIMER
+    for(y=0; y < b_h+5; y++){ // b_h+5 rows are filtered because the second pass reads rows y..y+5 of tmp
+        for(x=0; x < b_w; x++){
+            int a0= src[x    ];
+            int a1= src[x + 1];
+            int a2= src[x + 2];
+            int a3= src[x + 3];
+            int a4= src[x + 4];
+            int a5= src[x + 5];
+//            int am= 9*(a1+a2) - (a0+a3);
+            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+//            int am= 18*(a2+a3) - 2*(a1+a4);
+//             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
+//             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
+
+//            if(b_w==16) am= 8*(a1+a2);
+            // dx<8 blends a2 with the filtered value am, otherwise am is blended with a3
+            if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
+            else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
+            
+            /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
+            if(am&(~255)) am= ~(am>>31); // out-of-range values: negative -> 0, too large -> 255 once stored as uint8_t (relies on arithmetic right shift)
+            
+            tmp[x] = am;
+
+/*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
+            else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
+            else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
+            else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
+        }
+        tmp += stride;
+        src += stride;
+    }
+    tmp -= (b_h+5)*stride; // rewind tmp to its first row
+    // second pass: the same filter applied down the columns of tmp, using dy
+    for(y=0; y < b_h; y++){
+        for(x=0; x < b_w; x++){
+            int a0= tmp[x + 0*stride];
+            int a1= tmp[x + 1*stride];
+            int a2= tmp[x + 2*stride];
+            int a3= tmp[x + 3*stride];
+            int a4= tmp[x + 4*stride];
+            int a5= tmp[x + 5*stride];
+            int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
+//            int am= 18*(a2+a3) - 2*(a1+a4);
+/*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
+            int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
+            
+//            if(b_w==16) am= 8*(a1+a2);
+
+            if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
+            else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
+
+            if(am&(~255)) am= ~(am>>31); // same clipping as in the first pass
+            
+            dst[x] = am;
+/*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
+            else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
+            else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
+            else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
+        }
+        dst += stride;
+        tmp += stride;
+    }
+STOP_TIMER("mc_block")
+}
+
+#define mca(dx,dy,b_w) /* defines mc_block_hpel<dx><dy><b_w>(): mc_block() on a b_w x b_w block with fixed dx,dy */\
+static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
+    uint8_t tmp[stride*(b_w+5)];\
+    assert(h==b_w); /* h is only checked here; b_w is what gets passed on as the block height */\
+    mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy); /* source starts 2 columns left and 2 rows up */\
+}
+/* instantiations: dx,dy in {0,8} for block widths 16 and 8 */
+mca( 0, 0,16)
+mca( 8, 0,16)
+mca( 0, 8,16)
+mca( 8, 8,16)
+mca( 0, 0,8)
+mca( 8, 0,8)
+mca( 0, 8,8)
+mca( 8, 8,8)
+
+static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
+    if(block->type){ // nonzero type: fill the b_w x b_h area with the block's colour for this plane
+        int x, y;
+        const int color= block->color[plane_index];
+        for(y=0; y < b_h; y++){
+            for(x=0; x < b_w; x++){
+                dst[x + y*stride]= color;
+            }
+        }
+    }else{ // type 0: predict from src displaced by the block's motion vector
+        const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale; // plane 0 uses twice the scale of the other planes
+        int mx= block->mx*scale;
+        int my= block->my*scale;
+        const int dx= mx&15; // low 4 bits: fractional part handed to the interpolation
+        const int dy= my&15;
+        sx += (mx>>4) - 2; // remaining bits: whole-sample offset; -2 leaves room for the filter taps
+        sy += (my>>4) - 2;
+        src += sx + sy*stride;
+        if(   (unsigned)sx >= w - b_w - 4
+           || (unsigned)sy >= h - b_h - 4){ // source area not within these bounds: build it with ff_emulated_edge_mc() in tmp
+            ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
+            src= tmp + MB_SIZE;
+        }
+        if((dx&3) || (dy&3) || b_w!=b_h || (b_w!=4 && b_w!=8 && b_w!=16)) // the dsp table is only used for square 4/8/16 blocks with dx,dy multiples of 4
+            mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
+        else
+            s->dsp.put_h264_qpel_pixels_tab[2-(b_w>>3)][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
+    }
+}
+
+static always_inline int same_block(BlockNode *a, BlockNode *b){ // 1 only if mx and my are equal and both types are 0, else 0
+    return !((a->mx - b->mx) | (a->my - b->my) | a->type | b->type);
+}
+
+//FIXME name cleanup (b_w, block_w, b_width stuff)
+static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
+    DWTELEM * dst = NULL;
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    BlockNode *lt= &s->block[b_x + b_y*b_stride];
+    BlockNode *rt= lt+1;
+    BlockNode *lb= lt+b_stride;
+    BlockNode *rb= lb+1;
+    uint8_t *block[4]; 
+    uint8_t tmp[src_stride*(b_h+5)]; //FIXME align
+    int x,y;
+    // Blends the predictions of blocks lt/rt/lb/rb with the obmc weights; add!=0 writes pixels to dst8, add==0 subtracts from the slice buffer lines.
+    if(b_x<0){ // at the edges of the block grid the missing neighbours are replaced by the existing ones
+        lt= rt;
+        lb= rb;
+    }else if(b_x + 1 >= b_width){
+        rt= lt;
+        rb= lb;
+    }
+    if(b_y<0){
+        lt= lb;
+        rt= rb;
+    }else if(b_y + 1 >= b_height){
+        lb= lt;
+        rb= rt;
+    }
+    // clip the rectangle to 0..w / 0..h; obmc is advanced when the left/top part is cut off
+    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
+        obmc -= src_x;
+        b_w += src_x;
+        src_x=0;
+    }else if(src_x + b_w > w){
+        b_w = w - src_x;
+    }
+    if(src_y<0){
+        obmc -= src_y*obmc_stride;
+        b_h += src_y;
+        src_y=0;
+    }else if(src_y + b_h> h){
+        b_h = h - src_y;
+    }
+    
+    if(b_w<=0 || b_h<=0) return;
+
+assert(src_stride > 7*MB_SIZE); // the prediction buffers below start at tmp + 3..6*MB_SIZE
+//    old_dst += src_x + src_y*dst_stride;
+    dst8+= src_x + src_y*src_stride;
+//    src += src_x + src_y*src_stride;
+    // up to four predictions are built; blocks for which same_block() holds share one buffer
+    block[0]= tmp+3*MB_SIZE;
+    pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);    
+
+    if(same_block(lt, rt)){
+        block[1]= block[0];
+    }else{
+        block[1]= tmp + 4*MB_SIZE;
+        pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+    }
+        
+    if(same_block(lt, lb)){
+        block[2]= block[0];
+    }else if(same_block(rt, lb)){
+        block[2]= block[1];
+    }else{
+        block[2]= tmp+5*MB_SIZE;
+        pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+    }
+
+    if(same_block(lt, rb) ){
+        block[3]= block[0];
+    }else if(same_block(rt, rb)){
+        block[3]= block[1];
+    }else if(same_block(lb, rb)){
+        block[3]= block[2];
+    }else{
+        block[3]= tmp+6*MB_SIZE;
+        pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+    }
+#if 0
+    for(y=0; y<b_h; y++){
+        for(x=0; x<b_w; x++){
+            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+    for(y=0; y<b_h; y++){
+        uint8_t *obmc2= obmc + (obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+    for(y=0; y<b_h; y++){
+        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+    for(y=0; y<b_h; y++){
+        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+#else
+{
+
+    START_TIMER
+    
+    int block_index = 0;
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        dst = slice_buffer_get_line(sb, src_y + y); // destination row comes from the slice buffer, not from old_dst
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride]
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+
+            v <<= 8 - LOG2_OBMC_MAX;
+            if(FRAC_BITS != 8){
+                v += 1<<(7 - FRAC_BITS);
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+//                v += old_dst[x + y*dst_stride];
+                v += dst[x + src_x];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                if(v&(~255)) v= ~(v>>31);
+                dst8[x + y*src_stride] = v;
+            }else{
+//                old_dst[x + y*dst_stride] -= v;
+                dst[x + src_x] -= v;
+            }
+        }
+    }
+        STOP_TIMER("Inner add y block")
+}
+#endif
+}
+
+//FIXME name cleanup (b_w, block_w, b_width stuff)
+static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
+    const int b_width = s->b_width  << s->block_max_depth;
+    const int b_height= s->b_height << s->block_max_depth;
+    const int b_stride= b_width;
+    BlockNode *lt= &s->block[b_x + b_y*b_stride];
+    BlockNode *rt= lt+1;
+    BlockNode *lb= lt+b_stride;
+    BlockNode *rb= lb+1;
+    uint8_t *block[4]; 
+    uint8_t tmp[src_stride*(b_h+5)]; //FIXME align
+    int x,y;
+    // Blends the predictions of blocks lt/rt/lb/rb with the obmc weights; add!=0 writes pixels to dst8, add==0 subtracts the prediction from dst.
+    if(b_x<0){ // at the edges of the block grid the missing neighbours are replaced by the existing ones
+        lt= rt;
+        lb= rb;
+    }else if(b_x + 1 >= b_width){
+        rt= lt;
+        rb= lb;
+    }
+    if(b_y<0){
+        lt= lb;
+        rt= rb;
+    }else if(b_y + 1 >= b_height){
+        lb= lt;
+        rb= rt;
+    }
+    // clip the rectangle to 0..w / 0..h; obmc is advanced when the left/top part is cut off
+    if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
+        obmc -= src_x;
+        b_w += src_x;
+        src_x=0;
+    }else if(src_x + b_w > w){
+        b_w = w - src_x;
+    }
+    if(src_y<0){
+        obmc -= src_y*obmc_stride;
+        b_h += src_y;
+        src_y=0;
+    }else if(src_y + b_h> h){
+        b_h = h - src_y;
+    }
+    
+    if(b_w<=0 || b_h<=0) return;
+
+assert(src_stride > 7*MB_SIZE); // the prediction buffers below start at tmp + 3..6*MB_SIZE
+    dst += src_x + src_y*dst_stride;
+    dst8+= src_x + src_y*src_stride;
+//    src += src_x + src_y*src_stride;
+    // up to four predictions are built; blocks for which same_block() holds share one buffer
+    block[0]= tmp+3*MB_SIZE;
+    pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);    
+
+    if(same_block(lt, rt)){
+        block[1]= block[0];
+    }else{
+        block[1]= tmp + 4*MB_SIZE;
+        pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+    }
+        
+    if(same_block(lt, lb)){
+        block[2]= block[0];
+    }else if(same_block(rt, lb)){
+        block[2]= block[1];
+    }else{
+        block[2]= tmp+5*MB_SIZE;
+        pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+    }
+
+    if(same_block(lt, rb) ){
+        block[3]= block[0];
+    }else if(same_block(rt, rb)){
+        block[3]= block[1];
+    }else if(same_block(lb, rb)){
+        block[3]= block[2];
+    }else{
+        block[3]= tmp+6*MB_SIZE;
+        pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+    }
+#if 0
+    for(y=0; y<b_h; y++){
+        for(x=0; x<b_w; x++){
+            int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+    for(y=0; y<b_h; y++){
+        uint8_t *obmc2= obmc + (obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+    for(y=0; y<b_h; y++){
+        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+    for(y=0; y<b_h; y++){
+        uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
+            if(add) dst[x + y*dst_stride] += v;
+            else    dst[x + y*dst_stride] -= v;
+        }
+    }
+#else
+    for(y=0; y<b_h; y++){
+        //FIXME ugly misuse of obmc_stride
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride]
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+            
+            v <<= 8 - LOG2_OBMC_MAX;
+            if(FRAC_BITS != 8){
+                v += 1<<(7 - FRAC_BITS);
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+                v += dst[x + y*dst_stride];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                if(v&(~255)) v= ~(v>>31);
+                dst8[x + y*src_stride] = v;
+            }else{
+                dst[x + y*dst_stride] -= v;
+            }
+        }
+    }
+#endif
+}
+
+static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size/2 : block_size; // planes other than 0 use half the block size
+    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
+    int obmc_stride= plane_index ? block_size : 2*block_size;
+    int ref_stride= s->current_picture.linesize[plane_index];
+    uint8_t *ref  = s->last_picture.data[plane_index];
+    uint8_t *dst8= s->current_picture.data[plane_index];
+    int w= p->width;
+    int h= p->height;
+    START_TIMER
+    // keyframe (or debug flag 512): no block prediction, only a constant 128 offset is applied to block row mb_y
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){ // add: slice buffer line + 128 -> rounded, clipped pixels in dst8
+            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
+            {
+//                DWTELEM * line = slice_buffer_get_line(sb, y);
+                DWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++)
+                {
+//                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{ // !add: subtract 128 (in FRAC_BITS fixed point) from the slice buffer line
+            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
+            {
+//                DWTELEM * line = slice_buffer_get_line(sb, y);
+                DWTELEM * line = sb->line[y];
+                for(x=0; x<w; x++)
+                {
+                    line[x] -= 128 << FRAC_BITS;
+//                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+    // otherwise one add_yblock_buffered() call per mb_x in 0..mb_w (inclusive), each rectangle shifted by half a block
+        for(mb_x=0; mb_x<=mb_w; mb_x++){
+            START_TIMER
+
+            add_yblock_buffered(s, sb, old_buffer, dst8, ref, obmc, 
+                       block_w*mb_x - block_w/2,
+                       block_w*mb_y - block_w/2,
+                       block_w, block_w,
+                       w, h,
+                       w, ref_stride, obmc_stride,
+                       mb_x - 1, mb_y - 1,
+                       add, plane_index);
+            
+            STOP_TIMER("add_yblock")
+        }
+    
+    STOP_TIMER("predict_slice")
+}
+
+static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
+    Plane *p= &s->plane[plane_index];
+    const int mb_w= s->b_width  << s->block_max_depth;
+    const int mb_h= s->b_height << s->block_max_depth;
+    int x, y, mb_x;
+    int block_size = MB_SIZE >> s->block_max_depth;
+    int block_w    = plane_index ? block_size/2 : block_size; // planes other than 0 use half the block size
+    const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
+    int obmc_stride= plane_index ? block_size : 2*block_size;
+    int ref_stride= s->current_picture.linesize[plane_index];
+    uint8_t *ref  = s->last_picture.data[plane_index];
+    uint8_t *dst8= s->current_picture.data[plane_index];
+    int w= p->width;
+    int h= p->height;
+    START_TIMER
+    // keyframe (or debug flag 512): no block prediction, only a constant 128 offset is applied to block row mb_y
+    if(s->keyframe || (s->avctx->debug&512)){
+        if(mb_y==mb_h)
+            return;
+
+        if(add){ // add: buf + 128 -> rounded, clipped pixels in dst8
+            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
+                    v >>= FRAC_BITS;
+                    if(v&(~255)) v= ~(v>>31);
+                    dst8[x + y*ref_stride]= v;
+                }
+            }
+        }else{ // !add: subtract 128 (in FRAC_BITS fixed point) from buf
+            for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
+                for(x=0; x<w; x++){
+                    buf[x + y*w]-= 128<<FRAC_BITS;
+                }
+            }
+        }
+
+        return;
+    }
+    // otherwise one add_yblock() call per mb_x in 0..mb_w (inclusive), each rectangle shifted by half a block
+        for(mb_x=0; mb_x<=mb_w; mb_x++){
+            START_TIMER
+
+            add_yblock(s, buf, dst8, ref, obmc, 
+                       block_w*mb_x - block_w/2,
+                       block_w*mb_y - block_w/2,
+                       block_w, block_w,
+                       w, h,
+                       w, ref_stride, obmc_stride,
+                       mb_x - 1, mb_y - 1,
+                       add, plane_index);
+            
+            STOP_TIMER("add_yblock")
+        }
+    
+    STOP_TIMER("predict_slice")
+}
+
+static always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){ // runs predict_slice() for every block row of the plane
+    const int mb_h= s->b_height << s->block_max_depth;
+    int mb_y;
+    for(mb_y=0; mb_y<=mb_h; mb_y++) // inclusive bound: mb_h+1 calls in total
+        predict_slice(s, buf, plane_index, add, mb_y);
+}
+
+static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){ // quantizes the w x h coefficients of subband b in place
+    const int level= b->level;
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    int x,y, thres1, thres2;
+    START_TIMER
+
+    if(s->qlog == LOSSLESS_QLOG) return; // lossless: coefficients are left untouched
+    // note the inversion: a nonzero 'bias' argument selects a rounding bias of 0, zero selects 3/8 of qmul
+    bias= bias ? 0 : (3*qmul)>>3;
+    thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
+    thres2= 2*thres1;
+    // coefficients inside [-thres1, thres1] (for thres1 >= 0) become 0; the others are divided by qmul with the sign kept
+    if(!bias){
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= src[x + y*stride];
+                
+                if((unsigned)(i+thres1) > thres2){
+                    if(i>=0){
+                        i<<= QEXPSHIFT;
+                        i/= qmul; //FIXME optimize
+                        src[x + y*stride]=  i;
+                    }else{
+                        i= -i;
+                        i<<= QEXPSHIFT;
+                        i/= qmul; //FIXME optimize
+                        src[x + y*stride]= -i;
+                    }
+                }else
+                    src[x + y*stride]= 0;
+            }
+        }
+    }else{ // same as above, with the rounding bias added before the division
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int i= src[x + y*stride]; 
+                
+                if((unsigned)(i+thres1) > thres2){
+                    if(i>=0){
+                        i<<= QEXPSHIFT;
+                        i= (i + bias) / qmul; //FIXME optimize
+                        src[x + y*stride]=  i;
+                    }else{
+                        i= -i;
+                        i<<= QEXPSHIFT;
+                        i= (i + bias) / qmul; //FIXME optimize
+                        src[x + y*stride]= -i;
+                    }
+                }else
+                    src[x + y*stride]= 0;
+            }
+        }
+    }
+    if(level+1 == s->spatial_decomposition_count){
+//        STOP_TIMER("quantize")
+    }
+}
+
+static void dequantize_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride){ // dequantizes subband b in place, row by row in the slice buffer (src/stride are not used)
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+    START_TIMER
+    
+    if(s->qlog == LOSSLESS_QLOG) return; // lossless: coefficients are left untouched
+    
+    for(y=0; y<h; y++){
+//        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
+        DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            int i= line[x];
+            if(i<0){ // the magnitude is scaled by qmul, biased by qadd and shifted; zeros stay zero
+                line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
+        STOP_TIMER("dquant")
+    }
+}
+
+static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){ // dequantizes the w x h coefficients of subband b in place in src
+    const int w= b->width;
+    const int h= b->height;
+    const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
+    const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+    const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
+    int x,y;
+    START_TIMER
+    
+    if(s->qlog == LOSSLESS_QLOG) return; // lossless: coefficients are left untouched
+    
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            int i= src[x + y*stride];
+            if(i<0){ // the magnitude is scaled by qmul, biased by qadd and shifted; zeros stay zero
+                src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
+            }else if(i>0){
+                src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
+            }
+        }
+    }
+    if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
+        STOP_TIMER("dquant")
+    }
+}
+
+static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){ // subtracts a prediction from each coefficient in place ('s' and 'inverse' are not used)
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+    // iterating from the last coefficient backwards means the left/upper neighbours read below are still unmodified
+    for(y=h-1; y>=0; y--){
+        for(x=w-1; x>=0; x--){
+            int i= x + y*stride;
+            
+            if(x){
+                if(use_median){ // mid_pred() of left, top and top-right; left only in the first row or last column
+                    if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
+                    else  src[i] -= src[i - 1];
+                }else{ // mid_pred() of left, top and left+top-topleft; left only in the first row
+                    if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
+                    else  src[i] -= src[i - 1];
+                }
+            }else{ // first column: predict from the coefficient above; (0,0) is left unchanged
+                if(y) src[i] -= src[i - stride];
+            }
+        }
+    }
+}
+
+static void correlate_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+    // Adds a prediction to every coefficient of subband b in place, row by row in the slice buffer sb.
+//    START_TIMER
+    // 's', 'src', 'stride' and 'inverse' are not used; rows are fetched with slice_buffer_get_line().
+    DWTELEM * line = NULL; // initialised so that "prev = line" in the first iteration does not read an indeterminate pointer
+    DWTELEM * prev;
+    
+    for(y=0; y<h; y++){
+        prev = line; // row above the current one; only dereferenced when y>0
+//        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
+        line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
+        for(x=0; x<w; x++){
+            if(x){
+                if(use_median){ // mid_pred() of left, top and top-right; left only in the first row or last column
+                    if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
+                    else  line[x] += line[x - 1];
+                }else{ // mid_pred() of left, top and left+top-topleft; left only in the first row
+                    if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
+                    else  line[x] += line[x - 1];
+                }
+            }else{ // first column: predict from the coefficient above; (0,0) is left unchanged
+                if(y) line[x] += prev[x];
+            }
+        }
+    }
+    
+//    STOP_TIMER("correlate")
+}
+
+static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){ // adds a prediction to each coefficient in place ('s' and 'inverse' are not used)
+    const int w= b->width;
+    const int h= b->height;
+    int x,y;
+    // forward order: the left/upper neighbours read below have already been updated by this loop
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x++){
+            int i= x + y*stride;
+            
+            if(x){
+                if(use_median){ // mid_pred() of left, top and top-right; left only in the first row or last column
+                    if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
+                    else  src[i] += src[i - 1];
+                }else{ // mid_pred() of left, top and left+top-topleft; left only in the first row
+                    if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
+                    else  src[i] += src[i - 1];
+                }
+            }else{ // first column: predict from the coefficient above; (0,0) is left unchanged
+                if(y) src[i] += src[i - stride];
+            }
+        }
+    }
+}
+
+static void encode_header(SnowContext *s){ // writes the frame header fields to the range coder s->c
+    int plane_index, level, orientation;
+    uint8_t kstate[32]; 
+    // the keyframe flag is coded with its own state, reset to MID_STATE on every call
+    memset(kstate, MID_STATE, sizeof(kstate));   
+
+    put_rac(&s->c, kstate, s->keyframe);
+    if(s->keyframe || s->always_reset)
+        reset_contexts(s);
+    if(s->keyframe){ // these fields are only written on keyframes
+        put_symbol(&s->c, s->header_state, s->version, 0);
+        put_rac(&s->c, s->header_state, s->always_reset);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
+        put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
+        put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
+        put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
+        put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
+        put_rac(&s->c, s->header_state, s->spatial_scalability);
+//        put_rac(&s->c, s->header_state, s->rate_scalability);
+        // per-band qlog values are written for planes 0 and 1 only; orientation 2 is skipped
+        for(plane_index=0; plane_index<2; plane_index++){
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1:0; orientation<4; orientation++){ // orientation 0 only exists at level 0
+                    if(orientation==2) continue;
+                    put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
+                }
+            }
+        }
+    }
+    put_symbol(&s->c, s->header_state, s->spatial_decomposition_type, 0); // the remaining fields are written for every frame
+    put_symbol(&s->c, s->header_state, s->qlog, 1); 
+    put_symbol(&s->c, s->header_state, s->mv_scale, 0); 
+    put_symbol(&s->c, s->header_state, s->qbias, 1);
+    put_symbol(&s->c, s->header_state, s->block_max_depth, 0);
+}
+
+static int decode_header(SnowContext *s){ //parses the frame header from s->c into s; returns 0 on success, -1 on an unsupported or invalid field
+    int plane_index, level, orientation, decomp_count;
+    uint8_t kstate[32];
+    memset(kstate, MID_STATE, sizeof(kstate)); //the keyframe flag is decoded with its own freshly initialised state
+    s->keyframe= get_rac(&s->c, kstate);
+    if(s->keyframe || s->always_reset) //always_reset is still the value left by the previous keyframe header at this point
+        reset_contexts(s);
+    if(s->keyframe){ //the fields inside this block are present on keyframes only
+        s->version= get_symbol(&s->c, s->header_state, 0);
+        if(s->version>0){
+            av_log(s->avctx, AV_LOG_ERROR, "version %d not supported\n", s->version);
+            return -1;
+        }
+        s->always_reset= get_rac(&s->c, s->header_state);
+        s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
+        s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
+        decomp_count= get_symbol(&s->c, s->header_state, 0);
+        if(decomp_count > MAX_DECOMPOSITIONS){ //decode_frame() sizes its per-level arrays with MAX_DECOMPOSITIONS
+            av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_count %d too large\n", decomp_count);
+            return -1; //the context keeps its previous count, so later loops over the levels stay in bounds
+        }
+        s->spatial_decomposition_count= decomp_count;
+        s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
+        s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
+        s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
+        s->spatial_scalability= get_rac(&s->c, s->header_state);
+//        s->rate_scalability= get_rac(&s->c, s->header_state);
+
+        for(plane_index=0; plane_index<3; plane_index++){
+            for(level=0; level<s->spatial_decomposition_count; level++){
+                for(orientation=level ? 1:0; orientation<4; orientation++){
+                    int q;
+                    if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog; //plane 2 is not coded: copy plane 1
+                    else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog; //orientation 2 is not coded: copy orientation 1
+                    else                    q= get_symbol(&s->c, s->header_state, 1);
+                    s->plane[plane_index].band[level][orientation].qlog= q;
+                }
+            }
+        }
+    }
+    s->spatial_decomposition_type= get_symbol(&s->c, s->header_state, 0); //the remaining fields are present in every frame
+    if(s->spatial_decomposition_type > 2){
+        av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported\n", s->spatial_decomposition_type);
+        return -1;
+    }
+    s->qlog= get_symbol(&s->c, s->header_state, 1);
+    s->mv_scale= get_symbol(&s->c, s->header_state, 0);
+    s->qbias= get_symbol(&s->c, s->header_state, 1);
+    s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
+    return 0;
+}
+
+static void init_qexp(){ //fills qexp[0..QROOT-1]: starts at 128 and grows by a factor of 2^(1/QROOT) from one entry to the next
+    const double ratio= pow(2, 1.0 / QROOT);
+    double cur= 128;
+    int idx= 0;
+    while(idx<QROOT){
+        qexp[idx++]= lrintf(cur);
+        cur *= ratio;
+    }
+}
+
+static int common_init(AVCodecContext *avctx){ //initialisation shared by encoder and decoder; returns 0, or -1 if a buffer cannot be allocated
+    SnowContext *s = avctx->priv_data;
+    int width, height;
+    int level, orientation, plane_index, dec;
+
+    s->avctx= avctx;
+        
+    dsputil_init(&s->dsp, avctx);
+    /* mcf: point both the rounding and the no-rounding qpel tables (sizes 0 and 1) at the H.264 qpel functions */
+#define mcf(dx,dy)\
+    s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
+    s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
+        s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
+    s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
+    s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
+        s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
+
+    mcf( 0, 0)
+    mcf( 4, 0)
+    mcf( 8, 0)
+    mcf(12, 0)
+    mcf( 0, 4)
+    mcf( 4, 4)
+    mcf( 8, 4)
+    mcf(12, 4)
+    mcf( 0, 8)
+    mcf( 4, 8)
+    mcf( 8, 8)
+    mcf(12, 8)
+    mcf( 0,12)
+    mcf( 4,12)
+    mcf( 8,12)
+    mcf(12,12)
+    /* mcfh: point the half-pel put tables at this file's mc_block_hpel<dx><dy>16 / ...8 functions */
+#define mcfh(dx,dy)\
+    s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
+    s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 16;\
+    s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
+    s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
+        mc_block_hpel ## dx ## dy ## 8;
+
+    mcfh(0, 0)
+    mcfh(8, 0)
+    mcfh(0, 8)
+    mcfh(8, 8)
+
+    if(!qexp[0]) //the shared qexp table is filled once, on first use
+        init_qexp();
+
+    dec= s->spatial_decomposition_count= 5;
+    s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
+    
+    s->chroma_h_shift= 1; //FIXME XXX
+    s->chroma_v_shift= 1;
+    
+//    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
+    
+    width= s->avctx->width;
+    height= s->avctx->height;
+
+    s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
+    if(!s->spatial_dwt_buffer) return -1; //every subband pointer below is derived from this buffer
+    s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
+    s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
+    
+    for(plane_index=0; plane_index<3; plane_index++){    
+        int w= s->avctx->width;
+        int h= s->avctx->height;
+
+        if(plane_index){ //planes 1 and 2 are subsampled by the chroma shifts
+            w>>= s->chroma_h_shift;
+            h>>= s->chroma_v_shift;
+        }
+        s->plane[plane_index].width = w;
+        s->plane[plane_index].height= h;
+//av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
+        for(level=s->spatial_decomposition_count-1; level>=0; level--){ //w and h are halved (rounding up) after each level
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){ //orientation 0 is set up at level 0 only
+                SubBand *b= &s->plane[plane_index].band[level][orientation];
+                
+                b->buf= s->spatial_dwt_buffer;
+                b->level= level;
+                b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
+                b->width = (w + !(orientation&1))>>1;
+                b->height= (h + !(orientation>1))>>1;
+                
+                b->stride_line = 1 << (s->spatial_decomposition_count - level);
+                b->buf_x_offset = 0;
+                b->buf_y_offset = 0;
+                
+                if(orientation&1){ //odd orientations start half a (rounded-up) width to the right
+                    b->buf += (w+1)>>1;
+                    b->buf_x_offset = (w+1)>>1;
+                }
+                if(orientation>1){ //orientations 2 and 3 start half a stride further on
+                    b->buf += b->stride>>1;
+                    b->buf_y_offset = b->stride_line >> 1;
+                }
+                if(level)
+                    b->parent= &s->plane[plane_index].band[level-1][orientation];
+                b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
+                if(!b->x_coeff) return -1; //earlier allocations stay in the context and are released by common_end()
+            }
+            w= (w+1)>>1;
+            h= (h+1)>>1;
+        }
+    }
+    
+    reset_contexts(s);
+/*    
+    width= s->width= avctx->width;
+    height= s->height= avctx->height;
+    
+    assert(width && height);
+*/
+    s->avctx->get_buffer(s->avctx, &s->mconly_picture); //result deliberately left unchecked, as before; decode_frame() reads this picture only when (debug&2048) is set
+    
+    return 0;
+}
+
+
+static void calculate_vissual_weight(SnowContext *s, Plane *p){ //sets b->qlog of every subband of p from the energy that a single coefficient of that band produces after the inverse DWT
+    int width = p->width;
+    int height= p->height;
+    int level, orientation, x, y;
+
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &p->band[level][orientation];
+            DWTELEM *buf= b->buf;
+            int64_t error=0;
+            
+            memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*width*height); //sized by the buffer's element type, matching its allocation in common_init()
+            buf[b->width/2 + b->height/2*b->stride]= 256*256; //one impulse in the middle of this band, everything else zero
+            ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            for(y=0; y<height; y++){ //sum of squares of the reconstructed plane
+                for(x=0; x<width; x++){
+                    int64_t d= s->spatial_dwt_buffer[x + y*width];
+                    error += d*d;
+                }
+            }
+
+            b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5); //logarithm to base 2^(1/QROOT) of 352256/sqrt(error), rounded
+//            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
+        }
+    }
+}
+
+static int encode_init(AVCodecContext *avctx) //encoder init callback; returns 0 on success, -1 when the encoder is refused or cannot be set up
+{
+    SnowContext *s = avctx->priv_data;
+    int plane_index;
+
+    if(avctx->strict_std_compliance >= 0){ //the encoder has to be enabled explicitly with a negative strictness level
+        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it wont be decodeable with future versions!!!\n"
+               "use vstrict=-1 / -strict -1 to use it anyway\n");
+        return -1;
+    }
+ 
+    if(common_init(avctx) < 0) return -1; //do not continue with a partially initialised context
+    alloc_blocks(s);
+ 
+    s->version=0;
+    
+    s->m.avctx   = avctx;
+    s->m.flags   = avctx->flags;
+    s->m.bit_rate= avctx->bit_rate;
+
+    s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
+    s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
+    if(!s->m.me.scratchpad || !s->m.me.map || !s->m.me.score_map) return -1; //these buffers are handed to the motion estimation code through s->m
+    h263_encode_init(&s->m); //mv_penalty
+
+    if(avctx->flags&CODEC_FLAG_PASS1){
+        if(!avctx->stats_out) //keep a buffer that is already attached to the context
+            avctx->stats_out = av_mallocz(256);
+    }
+    if(avctx->flags&CODEC_FLAG_PASS2){
+        if(ff_rate_control_init(&s->m) < 0)
+            return -1;
+    }
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        calculate_vissual_weight(s, &s->plane[plane_index]);
+    }
+    
+    avctx->coded_frame= &s->current_picture;
+    switch(avctx->pix_fmt){ //only the two uncommented formats are accepted
+//    case PIX_FMT_YUV444P:
+//    case PIX_FMT_YUV422P:
+    case PIX_FMT_YUV420P:
+    case PIX_FMT_GRAY8:
+//    case PIX_FMT_YUV411P:
+//    case PIX_FMT_YUV410P:
+        s->colorspace_type= 0;
+        break;
+/*    case PIX_FMT_RGBA32:
+        s->colorspace= 1;
+        break;*/
+    default:
+        av_log(avctx, AV_LOG_ERROR, "format not supported\n");
+        return -1;
+    }
+//    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
+    s->chroma_h_shift= 1;
+    s->chroma_v_shift= 1;
+    return 0;
+}
+
+static int frame_start(SnowContext *s){ //turns the current picture into last_picture and requests a buffer for the new current one; returns 0, or -1 if get_buffer() fails
+    const int w= s->avctx->width; //FIXME round up to x16 ?
+    const int h= s->avctx->height;
+    AVFrame *const cur= &s->current_picture;
+    AVFrame *const last= &s->last_picture;
+    AVFrame swap;
+
+    if(cur->data[0]){ //extend the borders of the picture that is about to become last_picture
+        draw_edges(cur->data[0], cur->linesize[0], w   , h   , EDGE_WIDTH  );
+        draw_edges(cur->data[1], cur->linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
+        draw_edges(cur->data[2], cur->linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
+    }
+
+    swap= *last;
+    *last= *cur;
+    *cur= swap;
+    cur->reference= 1;
+
+    if(s->avctx->get_buffer(s->avctx, cur) >= 0)
+        return 0;
+    av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+    return -1;
+}
+
+static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){ //encodes the AVFrame passed in data into buf; returns the result of ff_rac_terminate(), or -1 if no picture buffer could be obtained
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    AVFrame *pict = data;
+    const int width= s->avctx->width;
+    const int height= s->avctx->height;
+    int level, orientation, plane_index;
+
+    ff_init_range_encoder(c, buf, buf_size);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
+    
+    s->input_picture = *pict;
+
+    if(avctx->flags&CODEC_FLAG_PASS2){ //second pass: picture type and quality are taken from the rate control data
+        s->m.pict_type =
+        pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
+        s->keyframe= pict->pict_type==FF_I_TYPE;
+        s->m.picture_number= avctx->frame_number;
+        pict->quality= ff_rate_estimate_qscale(&s->m);
+    }else{ //otherwise every gop_size-th frame (or every frame if gop_size is 0) is a keyframe
+        s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
+        pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
+    }
+    
+    if(pict->quality){
+        s->qlog= rint(QROOT*log(pict->quality / (float)FF_QP2LAMBDA)/log(2));
+        //<64 >60
+        s->qlog += 61*QROOT/8;
+    }else{ //a quality of 0 selects the lossless setting
+        s->qlog= LOSSLESS_QLOG;
+    }
+
+    if(frame_start(s) < 0) return -1; //without a current picture buffer nothing below can be reconstructed
+    s->current_picture.key_frame= s->keyframe;
+
+    s->m.current_picture_ptr= &s->m.current_picture;
+    if(pict->pict_type == P_TYPE){ //fill in the fields of s->m before ff_init_me() is called
+        int block_width = (width +15)>>4;
+        int block_height= (height+15)>>4;
+        int stride= s->current_picture.linesize[0];
+        
+        assert(s->current_picture.data[0]);
+        assert(s->last_picture.data[0]);
+     
+        s->m.avctx= s->avctx;
+        s->m.current_picture.data[0]= s->current_picture.data[0];
+        s->m.   last_picture.data[0]= s->   last_picture.data[0];
+        s->m.    new_picture.data[0]= s->  input_picture.data[0];
+        s->m.   last_picture_ptr= &s->m.   last_picture;
+        s->m.linesize=
+        s->m.   last_picture.linesize[0]=
+        s->m.    new_picture.linesize[0]=
+        s->m.current_picture.linesize[0]= stride;
+        s->m.uvlinesize= s->current_picture.linesize[1];
+        s->m.width = width;
+        s->m.height= height;
+        s->m.mb_width = block_width;
+        s->m.mb_height= block_height;
+        s->m.mb_stride=   s->m.mb_width+1;
+        s->m.b8_stride= 2*s->m.mb_width+1;
+        s->m.f_code=1;
+        s->m.pict_type= pict->pict_type;
+        s->m.me_method= s->avctx->me_method;
+        s->m.me.scene_change_score=0;
+        s->m.flags= s->avctx->flags;
+        s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
+        s->m.out_format= FMT_H263;
+        s->m.unrestricted_mv= 1;
+
+        s->lambda = s->m.lambda= pict->quality * 3/2; //FIXME bug somewhere else
+        s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
+        s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
+
+        s->m.dsp= s->dsp; //move
+        ff_init_me(&s->m);
+    }
+    
+redo_frame: //re-entered once, as an I frame, when the scene change test in the plane loop fires
+        
+    s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
+
+    encode_header(s);
+    s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    encode_blocks(s);
+    s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
+      
+    for(plane_index=0; plane_index<3; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+//        int bits= put_bits_count(&s->c.pb);
+
+        //FIXME optimize
+     if(pict->data[plane_index]) //FIXME gray hack
+        for(y=0; y<h; y++){ //copy the input plane into the DWT buffer, scaled up by FRAC_BITS
+            for(x=0; x<w; x++){
+                s->spatial_dwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
+            }
+        }
+        predict_plane(s, s->spatial_dwt_buffer, plane_index, 0);
+        
+        if(   plane_index==0 
+           && pict->pict_type == P_TYPE 
+           && s->m.me.scene_change_score > s->avctx->scenechange_threshold){ //scene change: discard what was written and restart the frame as a keyframe
+            ff_init_range_encoder(c, buf, buf_size);
+            ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
+            pict->pict_type= FF_I_TYPE;
+            s->keyframe=1;
+            reset_contexts(s);
+            goto redo_frame;
+        }
+        
+        if(s->qlog == LOSSLESS_QLOG){ //lossless: drop the fractional bits (with rounding) before the transform
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    s->spatial_dwt_buffer[y*w + x]= (s->spatial_dwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
+                }
+            }
+        }
+ 
+        ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &p->band[level][orientation];
+                
+                quantize(s, b, b->buf, b->stride, s->qbias);
+                if(orientation==0) //only orientation 0 is decorrelated before coding
+                    decorrelate(s, b, b->buf, b->stride, pict->pict_type == P_TYPE, 0);
+                encode_subband(s, b, b->buf, b->parent ? b->parent->buf : NULL, b->stride, orientation);
+                assert(b->parent==NULL || b->parent->stride == b->stride*2);
+                if(orientation==0) //and restored afterwards, since the buffer is reused for the reconstruction below
+                    correlate(s, b, b->buf, b->stride, 1, 0);
+            }
+        }
+//        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
+
+        for(level=0; level<s->spatial_decomposition_count; level++){ //reconstruction: dequantize, inverse DWT, then predict_plane(..., 1)
+            for(orientation=level ? 1 : 0; orientation<4; orientation++){
+                SubBand *b= &p->band[level][orientation];
+
+                dequantize(s, b, b->buf, b->stride);
+            }
+        }
+
+        ff_spatial_idwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
+        if(s->qlog == LOSSLESS_QLOG){ //lossless: restore the FRAC_BITS scaling removed above
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    s->spatial_dwt_buffer[y*w + x]<<=FRAC_BITS;
+                }
+            }
+        }
+{START_TIMER
+        predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
+STOP_TIMER("pred-conv")}
+        if(s->avctx->flags&CODEC_FLAG_PSNR){ //sum of squared differences between reconstruction and input
+            int64_t error= 0;
+            
+    if(pict->data[plane_index]) //FIXME gray hack
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                    error += d*d;
+                }
+            }
+            s->avctx->error[plane_index] += error;
+            s->current_picture.error[plane_index] = error;
+        }
+    }
+
+    if(s->last_picture.data[0])
+        avctx->release_buffer(avctx, &s->last_picture);
+
+    s->current_picture.coded_picture_number = avctx->frame_number;
+    s->current_picture.pict_type = pict->pict_type;
+    s->current_picture.quality = pict->quality;
+    if(avctx->flags&CODEC_FLAG_PASS1){
+        s->m.p_tex_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits - s->m.mv_bits;
+        s->m.current_picture.display_picture_number =
+        s->m.current_picture.coded_picture_number = avctx->frame_number;
+        s->m.pict_type = pict->pict_type;
+        s->m.current_picture.quality = pict->quality;
+        ff_write_pass1_stats(&s->m);
+    }
+    if(avctx->flags&CODEC_FLAG_PASS2){
+        s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
+    }
+
+    emms_c();
+    
+    return ff_rac_terminate(c);
+}
+
+static void common_end(SnowContext *s){ //releases the buffers held by the context through av_freep()
+    int pl, lev, ori;
+
+    /* work buffers */
+    av_freep(&s->spatial_dwt_buffer);
+    av_freep(&s->m.me.scratchpad);
+    av_freep(&s->m.me.map);
+    av_freep(&s->m.me.score_map);
+    av_freep(&s->block);
+
+    /* per-subband x_coeff arrays; orientation 0 is visited at level 0 only */
+    for(pl=0; pl<3; pl++){
+        Plane *const p= &s->plane[pl];
+
+        lev= s->spatial_decomposition_count;
+        while(--lev >= 0){
+            for(ori= lev ? 1 : 0; ori<4; ori++)
+                av_freep(&p->band[lev][ori].x_coeff);
+        }
+    }
+}
+
+static int encode_end(AVCodecContext *avctx) //encoder close callback: frees the context buffers and the pass-1 stats buffer; always returns 0
+{
+    SnowContext *s = avctx->priv_data;
+
+    common_end(s);
+    av_freep(&avctx->stats_out); //also clears the pointer, so encode_init()'s "if(!avctx->stats_out)" test stays valid if the context is opened again
+
+    return 0;
+}
+
+static int decode_init(AVCodecContext *avctx) //decoder init callback; returns 0, or -1 if common_init() reports a failure
+{
+    SnowContext *s = avctx->priv_data;
+    int block_size;
+
+    if(common_init(avctx) < 0) return -1; //the slice buffer below is built on buffers set up by common_init()
+    
+    block_size = MB_SIZE >> s->block_max_depth; //only referenced by the disabled slice_buffer_init() call below
+    /* FIXME block_size * 2 is determined empirically. block_size * 1.5 is definitely needed, but I (Robert) cannot figure out why more than that is needed. Perhaps there is a bug, or perhaps I overlooked some demands that are placed on the buffer. */
+    /* FIXME The formula is WRONG. For height > 480, the buffer will overflow. */
+    /* FIXME For now, I will use a full frame of lines. Fortunately, this should not materially affect cache performance because lines are allocated using a stack, so if in fact only 50 out of 496 lines are needed at a time, the other 446 will sit allocated but never accessed. */
+//    slice_buffer_init(s->plane[0].sb, s->plane[0].height, (block_size * 2) + (s->spatial_decomposition_count * s->spatial_decomposition_count), s->plane[0].width, s->spatial_dwt_buffer);
+    slice_buffer_init(&s->sb, s->plane[0].height, s->plane[0].height, s->plane[0].width, s->spatial_dwt_buffer);
+    
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){ //decodes one frame from buf into the AVFrame at data; returns the number of bytes consumed, or -1 on a bad header or a failed buffer request
+    SnowContext *s = avctx->priv_data;
+    RangeCoder * const c= &s->c;
+    int bytes_read;
+    AVFrame *picture = data;
+    int level, orientation, plane_index;
+
+    ff_init_range_decoder(c, buf, buf_size);
+    ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
+    *data_size= 0; //no picture is reported unless the whole frame gets decoded
+    s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
+    if(decode_header(s) < 0) return -1; //do not decode with header fields that were rejected
+    if(!s->block) alloc_blocks(s);
+
+    if(frame_start(s) < 0) return -1; //no buffer for the current picture
+    //keyframe flag dupliaction mess FIXME
+    if(avctx->debug&FF_DEBUG_PICT_INFO)
+        av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
+    
+    decode_blocks(s);
+
+    for(plane_index=0; plane_index<3; plane_index++){
+        Plane *p= &s->plane[plane_index];
+        int w= p->width;
+        int h= p->height;
+        int x, y;
+        int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
+        SubBand * correlate_band;
+        
+if(s->avctx->debug&2048){ //debug mode: store the prediction-only picture in mconly_picture
+        memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
+        predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
+
+        for(y=0; y<h; y++){
+            for(x=0; x<w; x++){
+                int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
+                s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
+            }
+        }
+}
+
+{   START_TIMER
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &p->band[level][orientation];
+            unpack_coeffs(s, b, b->parent, orientation);
+        }
+    }
+    STOP_TIMER("unpack coeffs");
+}
+        
+        /* Handle level 0, orientation 0 specially. It is particularly resistant to slicing but fortunately quite small, so process it in one pass. */
+        correlate_band = &p->band[0][0];
+        decode_subband_slice_buffered(s, correlate_band, &s->sb, 0, correlate_band->height, decode_state[0][0]);
+        correlate_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, 1, 0);
+        dequantize_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride);
+
+{START_TIMER
+    const int mb_h= s->b_height << s->block_max_depth;
+    const int block_size = MB_SIZE >> s->block_max_depth;
+    const int block_w    = plane_index ? block_size/2 : block_size;
+    int mb_y;
+    dwt_compose_t cs[MAX_DECOMPOSITIONS];
+    int yd=0, yq=0;
+    int y;
+    int end_y;
+
+    ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
+    for(mb_y=0; mb_y<=mb_h; mb_y++){ //note the inclusive bound: one more slice than mb_h
+        
+        const int slice_starty = block_w*mb_y;
+        const int slice_h = block_w*(mb_y+1);
+
+        {        
+        START_TIMER
+        for(level=0; level<s->spatial_decomposition_count; level++){
+            for(orientation=level ? 1 : 1; orientation<4; orientation++){ //starts at 1 on every level: band[0][0] was decoded above
+                SubBand *b= &p->band[level][orientation];
+                int start_y;
+                int end_y;
+                int our_mb_start = mb_y;
+                int our_mb_end = (mb_y + 1);
+                start_y = FFMIN(b->height, (mb_y ? ((block_w * our_mb_start - 4) >> (s->spatial_decomposition_count - level)) + 5 : 0));
+                end_y = FFMIN(b->height, (((block_w * our_mb_end - 4) >> (s->spatial_decomposition_count - level)) + 5));
+                    
+                if (start_y != end_y)
+                    decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
+            }
+        }
+        STOP_TIMER("decode_subband_slice");
+        }
+        
+{   START_TIMER
+        for(; yd<slice_h; yd+=4){ //run the inverse DWT up to the end of this slice, 4 lines per call
+            ff_spatial_idwt_buffered_slice(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+        }
+    STOP_TIMER("idwt slice");}
+
+        
+        if(s->qlog == LOSSLESS_QLOG){ //lossless: scale the new lines up by FRAC_BITS
+            for(; yq<slice_h && yq<h; yq++){
+                DWTELEM * line = slice_buffer_get_line(&s->sb, yq);
+                for(x=0; x<w; x++){
+                    line[x] <<= FRAC_BITS;
+                }
+            }
+        }
+
+        predict_slice_buffered(s, &s->sb, s->spatial_dwt_buffer, plane_index, 1, mb_y);
+        
+        /* Nasty hack based empirically on how predict_slice_buffered() hits the buffer. */
+        /* FIXME If possible, make predict_slice fit into the slice. As of now, it works on some previous lines (up to slice_height / 2) if the condition on the next line is false. */
+        if (s->keyframe || (s->avctx->debug&512)){
+            y = FFMIN(p->height, slice_starty);
+            end_y = FFMIN(p->height, slice_h);
+        }
+        else{
+            y = FFMAX(0, FFMIN(p->height, slice_starty - (block_w >> 1)));
+            end_y = FFMAX(0, FFMIN(p->height, slice_h - (block_w >> 1)));
+        }
+        while(y < end_y) //hand the finished lines back to the slice buffer
+            slice_buffer_release(&s->sb, y++);
+    }
+    
+    slice_buffer_flush(&s->sb);
+    
+STOP_TIMER("idwt + predict_slices")}
+    }
+            
+    emms_c();
+
+    if(s->last_picture.data[0])
+        avctx->release_buffer(avctx, &s->last_picture);
+
+if(!(s->avctx->debug&2048))        
+    *picture= s->current_picture;
+else
+    *picture= s->mconly_picture;
+    
+    *data_size = sizeof(AVFrame);
+    
+    bytes_read= c->bytestream - c->bytestream_start;
+    if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
+
+    return bytes_read;
+}
+
+static int decode_end(AVCodecContext *avctx)
+{
+    /* Decoder close callback: destroys the slice buffer, then frees the
+     * common context buffers. Always returns 0. */
+    SnowContext *const ctx = avctx->priv_data;
+
+    slice_buffer_destroy(&ctx->sb);
+    common_end(ctx);
+    return 0;
+}
+
+AVCodec snow_decoder = { //decoder registration entry; the initialiser is positional, so the slot meanings noted below are inferred from the values -- confirm against the AVCodec declaration
+    "snow",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_SNOW,
+    sizeof(SnowContext), //presumably the size of the private context (avctx->priv_data)
+    decode_init,
+    NULL, //presumably the encode slot: snow_encoder has encode_frame in this position
+    decode_end,
+    decode_frame,
+    0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/, //no flags set; the two candidates are left commented out
+    NULL
+};
+
+#ifdef CONFIG_ENCODERS
+AVCodec snow_encoder = { //encoder registration entry, only built when CONFIG_ENCODERS is defined; positional initialiser, trailing members are left to default zero-initialisation
+    "snow",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_SNOW,
+    sizeof(SnowContext), //presumably the size of the private context (avctx->priv_data)
+    encode_init,
+    encode_frame,
+    encode_end,
+};
+#endif
+
+
+#if 0
+#undef malloc
+#undef free
+#undef printf
+
+int main(){ //self-test program, compiled out by the surrounding #if 0. NOTE(review): s is declared as a SnowContext object, yet several calls below use s-> , so this would not compile as written
+    int width=256;
+    int height=256;
+    int buffer[2][width*height];
+    SnowContext s;
+    int i;
+    s.spatial_decomposition_count=6;
+    s.spatial_decomposition_type=1;
+    
+    printf("testing 5/3 DWT\n"); //round trip: forward then inverse transform must reproduce the random input
+    for(i=0; i<width*height; i++)
+        buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
+    
+    ff_spatial_dwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+    ff_spatial_idwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+    
+    for(i=0; i<width*height; i++)
+        if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
+
+    printf("testing 9/7 DWT\n"); //same round trip with spatial_decomposition_type 0
+    s.spatial_decomposition_type=0;
+    for(i=0; i<width*height; i++)
+        buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
+    
+    ff_spatial_dwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+    ff_spatial_idwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+    
+    for(i=0; i<width*height; i++)
+        if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
+        
+    printf("testing AC coder\n"); //writes 512 signed symbols, then reads them back and compares
+    memset(s.header_state, 0, sizeof(s.header_state));
+    ff_init_range_encoder(&s.c, buffer[0], 256*256);
+    ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
+        
+    for(i=-256; i<256; i++){
+START_TIMER
+        put_symbol(&s.c, s.header_state, i*i*i/3*ABS(i), 1);
+STOP_TIMER("put_symbol")
+    }
+    ff_rac_terminate(&s.c);
+
+    memset(s.header_state, 0, sizeof(s.header_state));
+    ff_init_range_decoder(&s.c, buffer[0], 256*256);
+    ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
+    
+    for(i=-256; i<256; i++){
+        int j;
+START_TIMER
+        j= get_symbol(&s.c, s.header_state, 1);
+STOP_TIMER("get_symbol")
+        if(j!=i*i*i/3*ABS(i)) printf("fsck: %d != %d\n", i, j);
+    }
+{ //prints a visual_weight table: per subband, the norm of the inverse DWT of a single impulse, divided by the common gcd
+int level, orientation, x, y;
+int64_t errors[8][4];
+int64_t g=0;
+
+    memset(errors, 0, sizeof(errors));
+    s.spatial_decomposition_count=3;
+    s.spatial_decomposition_type=0;
+    for(level=0; level<s.spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            int w= width  >> (s.spatial_decomposition_count-level);
+            int h= height >> (s.spatial_decomposition_count-level);
+            int stride= width  << (s.spatial_decomposition_count-level);
+            DWTELEM *buf= buffer[0];
+            int64_t error=0;
+
+            if(orientation&1) buf+=w;
+            if(orientation>1) buf+=stride>>1;
+            
+            memset(buffer[0], 0, sizeof(int)*width*height);
+            buf[w/2 + h/2*stride]= 256*256;
+            ff_spatial_idwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= buffer[0][x + y*width];
+                    error += d*d;
+                    if(ABS(width/2-x)<9 && ABS(height/2-y)<9 && level==2) printf("%8lld ", d);
+                }
+                if(ABS(height/2-y)<9 && level==2) printf("\n");
+            }
+            error= (int)(sqrt(error)+0.5);
+            errors[level][orientation]= error;
+            if(g) g=ff_gcd(g, error);
+            else g= error;
+        }
+    }
+    printf("static int const visual_weight[][4]={\n");
+    for(level=0; level<s.spatial_decomposition_count; level++){
+        printf("  {");
+        for(orientation=0; orientation<4; orientation++){
+            printf("%8lld,", errors[level][orientation]/g);
+        }
+        printf("},\n");
+    }
+    printf("};\n");
+    { //dumps the centre of the buffer after transforming a fixed 2x2 pattern; error is accumulated but not used afterwards
+            int level=2;
+            int orientation=3;
+            int w= width  >> (s.spatial_decomposition_count-level);
+            int h= height >> (s.spatial_decomposition_count-level);
+            int stride= width  << (s.spatial_decomposition_count-level);
+            DWTELEM *buf= buffer[0];
+            int64_t error=0;
+
+            buf+=w;
+            buf+=stride>>1;
+            
+            memset(buffer[0], 0, sizeof(int)*width*height);
+#if 1
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int tab[4]={0,2,3,1};
+                    buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
+                }
+            }
+            ff_spatial_dwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+#else
+            for(y=0; y<h; y++){
+                for(x=0; x<w; x++){
+                    buf[x + y*stride  ]=169;
+                    buf[x + y*stride-w]=64;
+                }
+            }
+            ff_spatial_idwt(buffer[0], width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
+#endif
+            for(y=0; y<height; y++){
+                for(x=0; x<width; x++){
+                    int64_t d= buffer[0][x + y*width];
+                    error += d*d;
+                    if(ABS(width/2-x)<9 && ABS(height/2-y)<9) printf("%8lld ", d);
+                }
+                if(ABS(height/2-y)<9) printf("\n");
+            }
+    }
+
+}
+    return 0;
+}
+#endif
+
diff --git a/src/libffmpeg/libavcodec/sparc/dsputil_vis.c b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
index e5feff27f..53f38b2aa 100644
--- a/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
+++ b/src/libffmpeg/libavcodec/sparc/dsputil_vis.c
@@ -26,7 +26,7 @@
 
 #include "config.h"
 
-#if defined(ARCH_SPARC) && defined(ENABLE_VIS)
+#ifdef ARCH_SPARC
 
 #include <inttypes.h>
 #include <signal.h>
@@ -3986,21 +3986,6 @@ static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
 
 /* End of no rounding code */
 
-void get_pixels_vis(uint8_t *restrict dest, const uint8_t *_ref, int stride)
-{
-  int i;
-  uint8_t *ref = (uint8_t*)_ref;
-  ref = vis_alignaddr(ref);
-
-  for (i = 0; i < 8; i++)
-    {
-      vis_ld64(ref[0], TMP0);
-      vis_st64(TMP0, dest[0]);
-      dest += 8;
-      ref += stride;
-    }
-}
-
 static sigjmp_buf jmpbuf;
 static volatile sig_atomic_t canjump = 0;
  
@@ -4010,7 +3995,7 @@ static void sigill_handler (int sig)
         signal (sig, SIG_DFL);
         raise (sig);
     }
-                                                                                
+
     canjump = 0;
     siglongjmp (jmpbuf, 1);
 }
@@ -4032,23 +4017,23 @@ static int vis_level ()
  
     /* pdist %f0, %f0, %f0 */
     __asm__ __volatile__(".word\t0x81b007c0");
-                                                                                
+
     canjump = 0;
     accel |= ACCEL_SPARC_VIS;
-                                                                                
+
     if (sigsetjmp (jmpbuf, 1)) {
         signal (SIGILL, SIG_DFL);
         return accel;
     }
-                                                                                
+
     canjump = 1;
-                                                                                
+
     /* edge8n %g0, %g0, %g0 */
     __asm__ __volatile__(".word\t0x81b00020");
-                                                                                
+
     canjump = 0;
     accel |= ACCEL_SPARC_VIS2;
-                                                                                
+
     signal (SIGILL, SIG_DFL);
 
     return accel;
@@ -4061,7 +4046,6 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
   int accel = vis_level ();
 
   if (accel & ACCEL_SPARC_VIS) {
-      c->get_pixels = get_pixels_vis;
       c->put_pixels_tab[0][0] = MC_put_o_16_vis;
       c->put_pixels_tab[0][1] = MC_put_x_16_vis;
       c->put_pixels_tab[0][2] = MC_put_y_16_vis;
@@ -4104,4 +4088,4 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
   }
 }
 
-#endif  /* defined(ARCH_SPARC) && defined(ENABLE_VIS) */
+#endif  /* ARCH_SPARC */
diff --git a/src/libffmpeg/libavcodec/svq1.c b/src/libffmpeg/libavcodec/svq1.c
index 25bc44fd1..068dd51d3 100644
--- a/src/libffmpeg/libavcodec/svq1.c
+++ b/src/libffmpeg/libavcodec/svq1.c
@@ -600,6 +600,7 @@ static uint16_t svq1_packet_checksum (uint8_t *data, int length, int value) {
   return value;
 }
 
+#if 0 /* unused, remove? */
 static uint16_t svq1_component_checksum (uint16_t *pixels, int pitch,
                                          int width, int height, int value) {
   int x, y;
@@ -614,6 +615,7 @@ static uint16_t svq1_component_checksum (uint16_t *pixels, int pitch,
 
   return value;
 }
+#endif
 
 static void svq1_parse_string (GetBitContext *bitbuf, uint8_t *out) {
   uint8_t seed;
@@ -713,10 +715,6 @@ static int svq1_decode_frame(AVCodecContext *avctx,
   int		result, i, x, y, width, height;
   AVFrame *pict = data; 
 
-  if(buf==NULL && buf_size==0){
-      return 0;
-  }
-  
   /* initialize bit buffer */
   init_get_bits(&s->gb,buf,buf_size*8);
 
@@ -844,28 +842,28 @@ static int svq1_decode_init(AVCodecContext *avctx)
 
     init_vlc(&svq1_block_type, 2, 4,
         &svq1_block_type_vlc[0][1], 2, 1,
-        &svq1_block_type_vlc[0][0], 2, 1);
+        &svq1_block_type_vlc[0][0], 2, 1, 1);
 
     init_vlc(&svq1_motion_component, 7, 33,
         &mvtab[0][1], 2, 1,
-        &mvtab[0][0], 2, 1);
+        &mvtab[0][0], 2, 1, 1);
 
     for (i = 0; i < 6; i++) {
         init_vlc(&svq1_intra_multistage[i], 3, 8,
             &svq1_intra_multistage_vlc[i][0][1], 2, 1,
-            &svq1_intra_multistage_vlc[i][0][0], 2, 1);
+            &svq1_intra_multistage_vlc[i][0][0], 2, 1, 1);
         init_vlc(&svq1_inter_multistage[i], 3, 8,
             &svq1_inter_multistage_vlc[i][0][1], 2, 1,
-            &svq1_inter_multistage_vlc[i][0][0], 2, 1);
+            &svq1_inter_multistage_vlc[i][0][0], 2, 1, 1);
     }
 
     init_vlc(&svq1_intra_mean, 8, 256,
         &svq1_intra_mean_vlc[0][1], 4, 2,
-        &svq1_intra_mean_vlc[0][0], 4, 2);
+        &svq1_intra_mean_vlc[0][0], 4, 2, 1);
 
     init_vlc(&svq1_inter_mean, 9, 512,
         &svq1_inter_mean_vlc[0][1], 4, 2,
-        &svq1_inter_mean_vlc[0][0], 4, 2);
+        &svq1_inter_mean_vlc[0][0], 4, 2, 1);
 
     return 0;
 }
@@ -880,6 +878,8 @@ static int svq1_decode_end(AVCodecContext *avctx)
 
 static void svq1_write_header(SVQ1Context *s, int frame_type)
 {
+    int i;
+
     /* frame code */
     put_bits(&s->pb, 22, 0x20);
 
@@ -898,12 +898,22 @@ static void svq1_write_header(SVQ1Context *s, int frame_type)
         /* output 5 unknown bits (2 + 2 + 1) */
         put_bits(&s->pb, 5, 0);
 
-        /* forget about matching up resolutions, just use the free-form
-         * resolution code (7) for now */
-        put_bits(&s->pb, 3, 7);
-        put_bits(&s->pb, 12, s->frame_width);
-        put_bits(&s->pb, 12, s->frame_height);
-
+	for (i = 0; i < 7; i++)
+	{
+	    if ((svq1_frame_size_table[i].width == s->frame_width) &&
+		(svq1_frame_size_table[i].height == s->frame_height))
+	    {
+		put_bits(&s->pb, 3, i);
+		break;
+	    }
+	}
+	
+	if (i == 7)
+	{
+	    put_bits(&s->pb, 3, 7);
+    	    put_bits(&s->pb, 12, s->frame_width);
+    	    put_bits(&s->pb, 12, s->frame_height);
+	}
     }
 
     /* no checksum or extra data (next 2 bits get 0) */
@@ -1069,7 +1079,7 @@ static int encode_block(SVQ1Context *s, uint8_t *src, uint8_t *ref, uint8_t *dec
 
 #ifdef CONFIG_ENCODERS
 
-static void svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plane, unsigned char *ref_plane, unsigned char *decoded_plane,
+static int svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plane, unsigned char *ref_plane, unsigned char *decoded_plane,
     int width, int height, int src_stride, int stride)
 {
     int x, y;
@@ -1108,10 +1118,10 @@ static void svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plan
         s->m.me_method= s->avctx->me_method;
         
         if(!s->motion_val8[plane]){
-            s->motion_val8 [plane]= av_mallocz(s->m.b8_stride*block_height*2*2*sizeof(int16_t));
-            s->motion_val16[plane]= av_mallocz(s->m.mb_stride*block_height*2*sizeof(int16_t));
+            s->motion_val8 [plane]= av_mallocz((s->m.b8_stride*block_height*2 + 2)*2*sizeof(int16_t));
+            s->motion_val16[plane]= av_mallocz((s->m.mb_stride*(block_height + 2) + 1)*2*sizeof(int16_t));
         }
-        
+
         s->m.mb_type= s->mb_type;
         
         //dummies, to avoid segfaults
@@ -1120,8 +1130,8 @@ static void svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plan
         s->m.current_picture.mc_mb_var= (uint16_t*)s->dummy;
         s->m.current_picture.mb_type= s->dummy;
         
-        s->m.current_picture.motion_val[0]= s->motion_val8[plane];
-        s->m.p_mv_table= s->motion_val16[plane];
+        s->m.current_picture.motion_val[0]= s->motion_val8[plane] + 2;
+        s->m.p_mv_table= s->motion_val16[plane] + s->m.mb_stride + 1;
         s->m.dsp= s->dsp; //move
         ff_init_me(&s->m);
     
@@ -1176,6 +1186,11 @@ static void svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plan
             uint8_t *ref= ref_plane + offset;
             int score[4]={0,0,0,0}, best;
             uint8_t temp[16*stride];
+            
+            if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < 3000){ //FIXME check size
+                av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
+                return -1;
+            }
 
             s->m.mb_x= x;
             ff_init_block_index(&s->m);
@@ -1268,6 +1283,7 @@ static void svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plan
         }
         s->m.first_slice_line=0;
     }
+    return 0;
 }
 
 static int svq1_encode_init(AVCodecContext *avctx)
@@ -1287,6 +1303,7 @@ static int svq1_encode_init(AVCodecContext *avctx)
     s->c_block_height = (s->frame_height / 4 + 15) / 16;
 
     s->avctx= avctx;
+    s->m.avctx= avctx;
     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t)); 
     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
@@ -1294,11 +1311,6 @@ static int svq1_encode_init(AVCodecContext *avctx)
     s->dummy          = av_mallocz((s->y_block_width+1)*s->y_block_height*sizeof(int32_t));
     h263_encode_init(&s->m); //mv_penalty
     
-av_log(s->avctx, AV_LOG_INFO, " Hey: %d x %d, %d x %d, %d x %d\n",
-  s->frame_width, s->frame_height,
-  s->y_block_width, s->y_block_height,
-  s->c_block_width, s->c_block_height);
-
     return 0;
 }
 
@@ -1333,10 +1345,11 @@ static int svq1_encode_frame(AVCodecContext *avctx, unsigned char *buf,
 
     svq1_write_header(s, p->pict_type);
     for(i=0; i<3; i++){
-        svq1_encode_plane(s, i,
+        if(svq1_encode_plane(s, i,
             s->picture.data[i], s->last_picture.data[i], s->current_picture.data[i],
             s->frame_width / (i?4:1), s->frame_height / (i?4:1), 
-            s->picture.linesize[i], s->current_picture.linesize[i]);
+            s->picture.linesize[i], s->current_picture.linesize[i]) < 0)
+                return -1;
     }
 
 //    align_put_bits(&s->pb);
diff --git a/src/libffmpeg/libavcodec/svq3.c b/src/libffmpeg/libavcodec/svq3.c
index e064626fc..547679bf1 100644
--- a/src/libffmpeg/libavcodec/svq3.c
+++ b/src/libffmpeg/libavcodec/svq3.c
@@ -370,6 +370,7 @@ static inline int svq3_mc_dir (H264Context *h, int size, int mode, int dir, int
 	dx = svq3_get_se_golomb (&s->gb);
 
 	if (dx == INVALID_VLC || dy == INVALID_VLC) {
+          av_log(h->s.avctx, AV_LOG_ERROR, "invalid MV vlc\n");
 	  return -1;
 	}
       }
@@ -453,8 +454,11 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
 
       mb_type = MB_TYPE_SKIP;
     } else {
-      svq3_mc_dir (h, s->next_picture.mb_type[mb_xy], PREDICT_MODE, 0, 0);
-      svq3_mc_dir (h, s->next_picture.mb_type[mb_xy], PREDICT_MODE, 1, 1);
+      mb_type= FFMIN(s->next_picture.mb_type[mb_xy], 6);
+      if(svq3_mc_dir (h, mb_type, PREDICT_MODE, 0, 0) < 0)
+        return -1;
+      if(svq3_mc_dir (h, mb_type, PREDICT_MODE, 1, 1) < 0)
+        return -1;
 
       mb_type = MB_TYPE_16x16;
     }
@@ -512,17 +516,20 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
 
     /* decode motion vector(s) and form prediction(s) */
     if (s->pict_type == P_TYPE) {
-      svq3_mc_dir (h, (mb_type - 1), mode, 0, 0);
+      if(svq3_mc_dir (h, (mb_type - 1), mode, 0, 0) < 0)
+        return -1;
     } else {	/* B_TYPE */
       if (mb_type != 2) {
-	svq3_mc_dir (h, 0, mode, 0, 0);
+	if(svq3_mc_dir (h, 0, mode, 0, 0) < 0)
+          return -1;
       } else {
 	for (i=0; i < 4; i++) {
 	  memset (s->current_picture.motion_val[0][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
 	}
       }
       if (mb_type != 1) {
-	svq3_mc_dir (h, 0, mode, 1, (mb_type == 3));
+	if(svq3_mc_dir (h, 0, mode, 1, (mb_type == 3)) < 0)
+          return -1;
       } else {
 	for (i=0; i < 4; i++) {
 	  memset (s->current_picture.motion_val[1][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
@@ -558,8 +565,10 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
       for (i=0; i < 16; i+=2) {
 	vlc = svq3_get_ue_golomb (&s->gb);
 
-	if (vlc >= 25)
+	if (vlc >= 25){
+          av_log(h->s.avctx, AV_LOG_ERROR, "luma prediction:%d\n", vlc);
 	  return -1;
+        }
 
 	left	= &h->intra4x4_pred_mode_cache[scan8[i] - 1];
 	top	= &h->intra4x4_pred_mode_cache[scan8[i] - 8];
@@ -567,8 +576,10 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
 	left[1]	= svq3_pred_1[top[0] + 1][left[0] + 1][svq3_pred_0[vlc][0]];
 	left[2]	= svq3_pred_1[top[1] + 1][left[1] + 1][svq3_pred_0[vlc][1]];
 
-	if (left[1] == -1 || left[2] == -1)
+	if (left[1] == -1 || left[2] == -1){
+          av_log(h->s.avctx, AV_LOG_ERROR, "weird prediction\n");
 	  return -1;
+        }
       }
     } else {	/* mb_type == 33, DC_128_PRED block type */
       for (i=0; i < 4; i++) {
@@ -597,8 +608,10 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
     dir = i_mb_type_info[mb_type - 8].pred_mode;
     dir = (dir >> 1) ^ 3*(dir & 1) ^ 1;
 
-    if ((h->intra16x16_pred_mode = check_intra_pred_mode (h, dir)) == -1)
+    if ((h->intra16x16_pred_mode = check_intra_pred_mode (h, dir)) == -1){
+      av_log(h->s.avctx, AV_LOG_ERROR, "check_intra_pred_mode = -1\n");
       return -1;
+    }
 
     cbp = i_mb_type_info[mb_type - 8].cbp;
     mb_type = MB_TYPE_INTRA16x16;
@@ -623,20 +636,26 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
   }
 
   if (!IS_INTRA16x16(mb_type) && (!IS_SKIP(mb_type) || s->pict_type == B_TYPE)) {
-    if ((vlc = svq3_get_ue_golomb (&s->gb)) >= 48)
+    if ((vlc = svq3_get_ue_golomb (&s->gb)) >= 48){
+      av_log(h->s.avctx, AV_LOG_ERROR, "cbp_vlc=%d\n", vlc);
       return -1;
+    }
 
     cbp = IS_INTRA(mb_type) ? golomb_to_intra4x4_cbp[vlc] : golomb_to_inter_cbp[vlc];
   }
   if (IS_INTRA16x16(mb_type) || (s->pict_type != I_TYPE && s->adaptive_quant && cbp)) {
     s->qscale += svq3_get_se_golomb (&s->gb);
 
-    if (s->qscale > 31)
+    if (s->qscale > 31){
+      av_log(h->s.avctx, AV_LOG_ERROR, "qscale:%d\n", s->qscale);
       return -1;
+    }
   }
   if (IS_INTRA16x16(mb_type)) {
-    if (svq3_decode_block (&s->gb, h->mb, 0, 0))
+    if (svq3_decode_block (&s->gb, h->mb, 0, 0)){
+      av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding intra luma dc\n");
       return -1;
+    }
   }
 
   if (cbp) {
@@ -649,24 +668,30 @@ static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
 	  k = index ? ((j&1) + 2*(i&1) + 2*(j&2) + 4*(i&2)) : (4*i + j);
 	  h->non_zero_count_cache[ scan8[k] ] = 1;
 
-	  if (svq3_decode_block (&s->gb, &h->mb[16*k], index, type))
+	  if (svq3_decode_block (&s->gb, &h->mb[16*k], index, type)){
+            av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding block\n");
 	    return -1;
+          }
 	}
       }
     }
 
     if ((cbp & 0x30)) {
       for (i=0; i < 2; ++i) {
-	if (svq3_decode_block (&s->gb, &h->mb[16*(16 + 4*i)], 0, 3))
+	if (svq3_decode_block (&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){
+          av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n");
 	  return -1;
+        }
       }
 
       if ((cbp & 0x20)) {
 	for (i=0; i < 8; i++) {
 	  h->non_zero_count_cache[ scan8[16+i] ] = 1;
 
-	  if (svq3_decode_block (&s->gb, &h->mb[16*(16 + i)], 1, 1))
+	  if (svq3_decode_block (&s->gb, &h->mb[16*(16 + i)], 1, 1)){
+            av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
 	    return -1;
+          }
 	}
       }
     }
@@ -695,7 +720,7 @@ static int svq3_decode_slice_header (H264Context *h) {
   } else {
     int length = (header >> 5) & 3;
 
-    h->next_slice_index = s->gb.index + 8*show_bits (&s->gb, 8*length) + 8*length;
+    h->next_slice_index = get_bits_count(&s->gb) + 8*show_bits (&s->gb, 8*length) + 8*length;
 
     if (h->next_slice_index > s->gb.size_in_bits){
       av_log(h->s.avctx, AV_LOG_ERROR, "slice after bitstream end\n");
@@ -703,10 +728,10 @@ static int svq3_decode_slice_header (H264Context *h) {
     }
 
     s->gb.size_in_bits = h->next_slice_index - 8*(length - 1);
-    s->gb.index += 8;
+    skip_bits(&s->gb, 8);
 
     if (length > 0) {
-      memcpy ((uint8_t *) &s->gb.buffer[s->gb.index >> 3],
+      memcpy ((uint8_t *) &s->gb.buffer[get_bits_count(&s->gb) >> 3],
              &s->gb.buffer[s->gb.size_in_bits >> 3], (length - 1));
     }
   }
@@ -914,10 +939,10 @@ static int svq3_decode_frame (AVCodecContext *avctx,
   for (s->mb_y=0; s->mb_y < s->mb_height; s->mb_y++) {
     for (s->mb_x=0; s->mb_x < s->mb_width; s->mb_x++) {
 
-      if ( (s->gb.index + 7) >= s->gb.size_in_bits &&
-	  ((s->gb.index & 7) == 0 || show_bits (&s->gb, (-s->gb.index & 7)) == 0)) {
+      if ( (get_bits_count(&s->gb) + 7) >= s->gb.size_in_bits &&
+	  ((get_bits_count(&s->gb) & 7) == 0 || show_bits (&s->gb, (-get_bits_count(&s->gb) & 7)) == 0)) {
 
-	s->gb.index = h->next_slice_index;
+	skip_bits(&s->gb, h->next_slice_index - get_bits_count(&s->gb));
 	s->gb.size_in_bits = 8*buf_size;
 
 	if (svq3_decode_slice_header (h))
@@ -979,5 +1004,5 @@ AVCodec svq3_decoder = {
     NULL,
     decode_end,
     svq3_decode_frame,
-    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1,
+    CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1 | CODEC_CAP_DELAY,
 };
diff --git a/src/libffmpeg/libavcodec/truemotion1.c b/src/libffmpeg/libavcodec/truemotion1.c
index 5f32227fe..0c3bb09cf 100644
--- a/src/libffmpeg/libavcodec/truemotion1.c
+++ b/src/libffmpeg/libavcodec/truemotion1.c
@@ -24,7 +24,7 @@
  * Mike Melanson (melanson@pcisys.net)
  *
  * The TrueMotion v1 decoder presently only decodes 16-bit TM1 data and
- * outputs RGB555 data. 24-bit TM1 data is not supported yet.
+ * outputs RGB555 (or RGB565) data. 24-bit TM1 data is not supported yet.
  */
 
 #include <stdio.h>
@@ -43,12 +43,12 @@ typedef struct TrueMotion1Context {
     AVFrame frame;
     AVFrame prev_frame;
 
-    unsigned char *buf;
+    uint8_t *buf;
     int size;
 
-    unsigned char *mb_change_bits;
+    uint8_t *mb_change_bits;
     int mb_change_bits_row_size;
-    unsigned char *index_stream;
+    uint8_t *index_stream;
     int index_stream_size;
 
     int flags;
@@ -56,6 +56,8 @@ typedef struct TrueMotion1Context {
     
     uint32_t y_predictor_table[1024];
     uint32_t c_predictor_table[1024];
+    uint32_t fat_y_predictor_table[1024];
+    uint32_t fat_c_predictor_table[1024];
     
     int compression;
     int block_type;
@@ -109,12 +111,12 @@ struct frame_header {
 
 typedef struct comp_types {
     int algorithm;
-    int block_width;
-    int block_height;
+    int block_width; // vres
+    int block_height; // hres
     int block_type;
 } comp_types;
 
-/* { valid for metatype }, algorithm, num of deltas, horiz res, vert res */
+/* { valid for metatype }, algorithm, num of deltas, vert res, horiz res */
 static comp_types compression_types[17] = {
     { ALGO_NOP,    0, 0, 0 },
 
@@ -163,9 +165,9 @@ static void select_delta_tables(TrueMotion1Context *s, int delta_table_index)
 }
 
 #ifdef WORDS_BIGENDIAN
-static int make_ydt_entry(int p2, int p1, int16_t *ydt)
+static int make_ydt15_entry(int p2, int p1, int16_t *ydt)
 #else
-static int make_ydt_entry(int p1, int p2, int16_t *ydt)
+static int make_ydt15_entry(int p1, int p2, int16_t *ydt)
 #endif
 {
     int lo, hi;
@@ -178,9 +180,9 @@ static int make_ydt_entry(int p1, int p2, int16_t *ydt)
 }
 
 #ifdef WORDS_BIGENDIAN
-static int make_cdt_entry(int p2, int p1, int16_t *cdt)
+static int make_cdt15_entry(int p2, int p1, int16_t *cdt)
 #else
-static int make_cdt_entry(int p1, int p2, int16_t *cdt)
+static int make_cdt15_entry(int p1, int p2, int16_t *cdt)
 #endif
 {
     int r, b, lo;
@@ -191,7 +193,62 @@ static int make_cdt_entry(int p1, int p2, int16_t *cdt)
     return ((lo + (lo << 16)) << 1);
 }
 
-static void gen_vector_table(TrueMotion1Context *s, uint8_t *sel_vector_table)
+#ifdef WORDS_BIGENDIAN /* argument order is swapped on big-endian, so the caller's first index feeds the high half instead of the low half */
+static int make_ydt16_entry(int p2, int p1, int16_t *ydt)
+#else
+static int make_ydt16_entry(int p1, int p2, int16_t *ydt)
+#endif
+{ /* Build one packed luma predictor entry for 16-bit output from the delta-table indices p1 and p2. */
+    int lo, hi;
+    
+    lo = ydt[p1]; /* delta that ends up in the low 16 bits */
+    lo += (lo << 6) + (lo << 11); /* replicate the delta at bit offsets 0, 6 and 11 -- presumably the RGB565 channel positions; TODO confirm */
+    hi = ydt[p2]; /* delta that ends up in the high 16 bits */
+    hi += (hi << 6) + (hi << 11);
+    return ((lo + (hi << 16)) << 1); /* shifted up by one bit; gen_vector_table16 masks bit 0 and reuses it as a flag */
+}
+
+#ifdef WORDS_BIGENDIAN /* argument order is swapped on big-endian, exchanging which index is used for r and which for b */
+static int make_cdt16_entry(int p2, int p1, int16_t *cdt)
+#else
+static int make_cdt16_entry(int p1, int p2, int16_t *cdt)
+#endif
+{ /* Build one packed chroma predictor entry for 16-bit output from the delta-table indices p1 and p2. */
+    int r, b, lo;
+    
+    b = cdt[p2]; /* stays at bit 0 */
+    r = cdt[p1] << 11; /* moved up to bit 11 -- presumably the red field of RGB565; TODO confirm */
+    lo = b + r;
+    return ((lo + (lo << 16)) << 1); /* same 16-bit value in both halves, shifted up by one so bit 0 is free for the caller's flag */
+}
+
+#ifdef WORDS_BIGENDIAN /* argument order is swapped on big-endian, so the caller's first index feeds the upper byte position instead of the lower */
+static int make_ydt24_entry(int p2, int p1, int16_t *ydt)
+#else
+static int make_ydt24_entry(int p1, int p2, int16_t *ydt)
+#endif
+{ /* Build one packed luma predictor entry for the 24-bit path from the delta-table indices p1 and p2. */
+    int lo, hi;
+    
+    lo = ydt[p1]; /* delta placed at bit 0 */
+    hi = ydt[p2]; /* delta placed at bit 8 */
+    return ((lo + (hi << 8)) << 1); /* shifted up by one bit; gen_vector_table24 masks bit 0 and reuses it as a flag */
+}
+
+#ifdef WORDS_BIGENDIAN /* argument order is swapped on big-endian, exchanging which index is used for r and which for b */
+static int make_cdt24_entry(int p2, int p1, int16_t *cdt)
+#else
+static int make_cdt24_entry(int p1, int p2, int16_t *cdt)
+#endif
+{ /* Build one packed chroma predictor entry for the 24-bit path from the delta-table indices p1 and p2. */
+    int r, b;
+    
+    b = cdt[p2]; /* stays at bit 0 */
+    r = cdt[p1]<<16; /* moved up to bit 16 */
+    return ((b+r) << 1); /* shifted up by one bit; gen_vector_table24 masks bit 0 and reuses it as a flag */
+}
+
+static void gen_vector_table15(TrueMotion1Context *s, uint8_t *sel_vector_table)
 {
     int len, i, j;
     unsigned char delta_pair;
@@ -203,15 +260,63 @@ static void gen_vector_table(TrueMotion1Context *s, uint8_t *sel_vector_table)
         {
             delta_pair = *sel_vector_table++;
             s->y_predictor_table[i+j] = 0xfffffffe & 
-                make_ydt_entry(delta_pair >> 4, delta_pair & 0xf, s->ydt);
+                make_ydt15_entry(delta_pair >> 4, delta_pair & 0xf, s->ydt);
             s->c_predictor_table[i+j] = 0xfffffffe & 
-                make_cdt_entry(delta_pair >> 4, delta_pair & 0xf, s->cdt);
+                make_cdt15_entry(delta_pair >> 4, delta_pair & 0xf, s->cdt);
         }
         s->y_predictor_table[i+(j-1)] |= 1;
         s->c_predictor_table[i+(j-1)] |= 1;
     }
 }
 
+static void gen_vector_table16(TrueMotion1Context *s, uint8_t *sel_vector_table) /* fill s->y_predictor_table and s->c_predictor_table using the 16-bit entry builders */
+{
+    int len, i, j;
+    unsigned char delta_pair;
+    
+    for (i = 0; i < 1024; i += 4) /* table slots are consumed in groups of 4, one group per input record */
+    {
+        len = *sel_vector_table++ / 2; /* first byte of the record, halved, is the number of delta-pair bytes that follow */
+        for (j = 0; j < len; j++)
+        {
+            delta_pair = *sel_vector_table++; /* high nibble and low nibble are two separate delta-table indices */
+            s->y_predictor_table[i+j] = 0xfffffffe & 
+                make_ydt16_entry(delta_pair >> 4, delta_pair & 0xf, s->ydt);
+            s->c_predictor_table[i+j] = 0xfffffffe & 
+                make_cdt16_entry(delta_pair >> 4, delta_pair & 0xf, s->cdt);
+        }
+        s->y_predictor_table[i+(j-1)] |= 1; /* set bit 0 on the last entry written for this group */
+        s->c_predictor_table[i+(j-1)] |= 1; /* NOTE(review): a record with len == 0 would make this index i-1 -- assumes the vector tables never contain one; confirm */
+    }
+}
+
+static void gen_vector_table24(TrueMotion1Context *s, uint8_t *sel_vector_table) /* fill the regular and the "fat" Y/C predictor tables using the 24-bit entry builders */
+{
+    int len, i, j;
+    unsigned char delta_pair;
+    
+    for (i = 0; i < 1024; i += 4) /* table slots are consumed in groups of 4, one group per input record */
+    {
+        len = *sel_vector_table++ / 2; /* first byte of the record, halved, is the number of delta-pair bytes that follow */
+        for (j = 0; j < len; j++)
+        {
+            delta_pair = *sel_vector_table++; /* high nibble and low nibble are two separate delta-table indices */
+            s->y_predictor_table[i+j] = 0xfffffffe & 
+                make_ydt24_entry(delta_pair >> 4, delta_pair & 0xf, s->ydt);
+            s->c_predictor_table[i+j] = 0xfffffffe & 
+                make_cdt24_entry(delta_pair >> 4, delta_pair & 0xf, s->cdt);
+            s->fat_y_predictor_table[i+j] = 0xfffffffe & 
+                make_ydt24_entry(delta_pair >> 4, delta_pair & 0xf, s->fat_ydt);
+            s->fat_c_predictor_table[i+j] = 0xfffffffe & 
+                make_cdt24_entry(delta_pair >> 4, delta_pair & 0xf, s->fat_cdt);
+        }
+        s->y_predictor_table[i+(j-1)] |= 1; /* set bit 0 on the last entry written for this group, in all four tables */
+        s->c_predictor_table[i+(j-1)] |= 1;
+        s->fat_y_predictor_table[i+(j-1)] |= 1;
+        s->fat_c_predictor_table[i+(j-1)] |= 1; /* NOTE(review): a record with len == 0 would make these indices i-1 -- assumes the vector tables never contain one; confirm */
+    }
+}
+
 /* Returns the number of bytes consumed from the bytestream. Returns -1 if
  * there was an error while decoding the header */ 
 static int truemotion1_decode_header(TrueMotion1Context *s)
@@ -229,14 +334,15 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     header.header_size = ((s->buf[0] >> 5) | (s->buf[0] << 3)) & 0x7f;
     if (s->buf[0] < 0x10)
     {
-        av_log(s->avctx, AV_LOG_ERROR, "invalid header size\n");
+	av_log(s->avctx, AV_LOG_ERROR, "invalid header size (%d)\n", s->buf[0]);
         return -1;
     }
 
     /* unscramble the header bytes with a XOR operation */
     memset(header_buffer, 0, 128);
     for (i = 1; i < header.header_size; i++)
-    header_buffer[i - 1] = s->buf[i] ^ s->buf[i + 1];
+	header_buffer[i - 1] = s->buf[i] ^ s->buf[i + 1];
+
     header.compression = header_buffer[0];
     header.deltaset = header_buffer[1];
     header.vectable = header_buffer[2];
@@ -253,7 +359,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     {
         if (header.header_type > 3)
         {
-            av_log(s->avctx, AV_LOG_ERROR, "truemotion1: invalid header type\n");
+            av_log(s->avctx, AV_LOG_ERROR, "invalid header type (%d)\n", header.header_type);
             return -1;
         } else if ((header.header_type == 2) || (header.header_type == 3)) {
             s->flags = header.flags;
@@ -265,6 +371,7 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
         s->flags = FLAG_KEYFRAME;
     
     if (s->flags & FLAG_SPRITE) {
+	av_log(s->avctx, AV_LOG_INFO, "SPRITE frame found, please report the sample to the developers\n");
         s->w = header.width;
         s->h = header.height;
         s->x = header.xoffset;
@@ -274,7 +381,10 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
         s->h = header.ysize;
         if (header.header_type < 2) {
             if ((s->w < 213) && (s->h >= 176))
+	    {
                 s->flags |= FLAG_INTERPOLATED;
+	        av_log(s->avctx, AV_LOG_INFO, "INTERPOLATION selected, please report the sample to the developers\n");
+	    }
         }
     }
 
@@ -297,15 +407,22 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
             return -1;
         }
     }
+    
+    // FIXME: where to place this ?!?!
+    if (compression_types[header.compression].algorithm == ALGO_RGB24H)
+        s->avctx->pix_fmt = PIX_FMT_BGR24;
+    else
+	s->avctx->pix_fmt = PIX_FMT_RGB555; // RGB565 is supported as well
 
     if ((header.deltaset != s->last_deltaset) || (header.vectable != s->last_vectable))
     {
         if (compression_types[header.compression].algorithm == ALGO_RGB24H)
-        {
-            av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n");
-        }
+            gen_vector_table24(s, sel_vector_table);
         else
-            gen_vector_table(s, sel_vector_table);
+	if (s->avctx->pix_fmt == PIX_FMT_RGB555)
+            gen_vector_table15(s, sel_vector_table);
+	else
+            gen_vector_table16(s, sel_vector_table);
     }
 
     /* set up pointers to the other key data chunks */
@@ -327,6 +444,15 @@ static int truemotion1_decode_header(TrueMotion1Context *s)
     s->block_height = compression_types[header.compression].block_height;
     s->block_type = compression_types[header.compression].block_type;
 
+    if (s->avctx->debug & FF_DEBUG_PICT_INFO)
+	av_log(s->avctx, AV_LOG_INFO, "tables: %d / %d c:%d %dx%d t:%d %s%s%s%s\n",
+	    s->last_deltaset, s->last_vectable, s->compression, s->block_width,
+	    s->block_height, s->block_type,
+	    s->flags & FLAG_KEYFRAME ? " KEY" : "",
+	    s->flags & FLAG_INTERFRAME ? " INTER" : "",
+	    s->flags & FLAG_SPRITE ? " SPRITE" : "",
+	    s->flags & FLAG_INTERPOLATED ? " INTERPOL" : "");
+
     return header.header_size;    
 }
 
@@ -336,7 +462,12 @@ static int truemotion1_decode_init(AVCodecContext *avctx)
 
     s->avctx = avctx;
 
-    avctx->pix_fmt = PIX_FMT_RGB555;
+    // FIXME: it may change ?
+//    if (avctx->bits_per_sample == 24)
+//	avctx->pix_fmt = PIX_FMT_RGB24;
+//    else
+//	avctx->pix_fmt = PIX_FMT_RGB555;
+
     avctx->has_b_frames = 0;
     s->frame.data[0] = s->prev_frame.data[0] = NULL;
 
@@ -348,6 +479,32 @@ static int truemotion1_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
+/*
+Block decoding order:
+
+dxi: Y-Y
+dxic: Y-C-Y
+dxic2: Y-C-Y-C
+
+hres,vres,i,i%vres (0 <= i < 4)
+2x2 0: 0 dxic2
+2x2 1: 1 dxi
+2x2 2: 0 dxic2
+2x2 3: 1 dxi
+2x4 0: 0 dxic2
+2x4 1: 1 dxi
+2x4 2: 2 dxi
+2x4 3: 3 dxi
+4x2 0: 0 dxic
+4x2 1: 1 dxi
+4x2 2: 0 dxic
+4x2 3: 1 dxi
+4x4 0: 0 dxic
+4x4 1: 1 dxi
+4x4 2: 2 dxi
+4x4 3: 3 dxi
+*/
+
 #define GET_NEXT_INDEX() \
 {\
     if (index_stream_index >= s->index_stream_size) { \
@@ -374,6 +531,25 @@ static int truemotion1_decode_init(AVCodecContext *avctx)
     } else \
         index++;
 
+#define APPLY_C_PREDICTOR_24() /* 24-bit chroma step: accumulate a table delta into c_horiz_pred, with an escape into the fat table */ \
+    predictor_pair = s->c_predictor_table[index]; \
+    c_horiz_pred += (predictor_pair >> 1); /* bit 0 is a flag, the remaining bits are the delta */ \
+    if (predictor_pair & 1) { /* flag set: reload index via GET_NEXT_INDEX() */ \
+        GET_NEXT_INDEX() \
+        if (!index) { /* a reloaded index of 0 is an escape: the index after it selects a fat_c_predictor_table entry */ \
+            GET_NEXT_INDEX() \
+            predictor_pair = s->fat_c_predictor_table[index]; \
+            c_horiz_pred += (predictor_pair >> 1); \
+            if (predictor_pair & 1) \
+                GET_NEXT_INDEX() \
+            else \
+                index++; \
+        } \
+    } else /* flag clear: step to the next table entry */ \
+        index++; 
+//    c_last+coff = clast+c_horiz_pred;
+
+
 #define APPLY_Y_PREDICTOR() \
     predictor_pair = s->y_predictor_table[index]; \
     horiz_pred += (predictor_pair >> 1); \
@@ -391,6 +567,23 @@ static int truemotion1_decode_init(AVCodecContext *avctx)
     } else \
         index++;
 
+#define APPLY_Y_PREDICTOR_24() /* 24-bit luma step: accumulate a table delta into horiz_pred, with an escape into the fat table */ \
+    predictor_pair = s->y_predictor_table[index]; \
+    horiz_pred += (predictor_pair >> 1); /* bit 0 is a flag, the remaining bits are the delta */ \
+    if (predictor_pair & 1) { /* flag set: reload index via GET_NEXT_INDEX() */ \
+        GET_NEXT_INDEX() \
+        if (!index) { /* a reloaded index of 0 is an escape: the index after it selects a fat_y_predictor_table entry */ \
+            GET_NEXT_INDEX() \
+            predictor_pair = s->fat_y_predictor_table[index]; \
+            horiz_pred += (predictor_pair >> 1); \
+            if (predictor_pair & 1) \
+                GET_NEXT_INDEX() \
+            else \
+                index++; \
+        } \
+    } else /* flag clear: step to the next table entry */ \
+        index++;
+
 #define OUTPUT_PIXEL_PAIR() \
     *current_pixel_pair = *vert_pred + horiz_pred; \
     *vert_pred++ = *current_pixel_pair++; \
@@ -528,6 +721,149 @@ static void truemotion1_decode_16bit(TrueMotion1Context *s)
     }
 }
 
+static void truemotion1_decode_24bit(TrueMotion1Context *s)
+{   /* 24-bit counterpart of truemotion1_decode_16bit(): fills s->frame.data[0] line by line */
+    int y;
+    int pixels_left;  /* remaining pixels on this line */
+    unsigned int predictor_pair;
+    unsigned int horiz_pred;    /* running Y predictor, reset at the start of every line */
+    unsigned int c_horiz_pred;  /* running C predictor, reset at the start of every line */
+    unsigned int *vert_pred;
+    unsigned int *current_pixel_pair;
+    unsigned int *prev_pixel_pair;
+    unsigned char *current_line = s->frame.data[0];
+    unsigned char *prev_line = s->prev_frame.data[0];
+    int keyframe = s->flags & FLAG_KEYFRAME;
+
+    /* these variables are for managing the stream of macroblock change bits */
+    unsigned char *mb_change_bits = s->mb_change_bits;
+    unsigned char mb_change_byte;
+    unsigned char mb_change_byte_mask;
+    int mb_change_index;
+
+    /* these variables are for managing the main index stream */
+    int index_stream_index = 0;  /* yes, the index into the index stream */
+    int index;
+
+    /* clean out the line buffer */
+    memset(s->vert_pred, 0, s->avctx->width * sizeof(unsigned short));
+    /* NOTE(review): vert_pred is walked as unsigned int below; confirm the cleared size matches its allocation */
+    GET_NEXT_INDEX();
+
+    for (y = 0; y < s->avctx->height; y++) {
+        /* NOTE(review): c_horiz_pred is accumulated below, but every OUTPUT_PIXEL_PAIR_24_C() call is commented out, so it appears never to reach the picture - confirm */
+        /* re-init variables for the next line iteration */
+        horiz_pred = c_horiz_pred = 0;
+        current_pixel_pair = (unsigned int *)current_line;
+        prev_pixel_pair = (unsigned int *)prev_line;
+        vert_pred = s->vert_pred;
+        mb_change_index = 0;
+        mb_change_byte = mb_change_bits[mb_change_index++];
+        mb_change_byte_mask = 0x01;
+        pixels_left = s->avctx->width;
+
+        while (pixels_left > 0) {
+            /* decode this group when the frame is a keyframe or its change bit is clear; otherwise copy it from prev_frame */
+            if (keyframe || ((mb_change_byte & mb_change_byte_mask) == 0)) {
+
+                switch (y & 3) {
+                case 0:
+                    /* if macroblock width is 2, apply C-Y-C-Y; else 
+                     * apply C-Y-Y */
+                    if (s->block_width == 2) {
+                        APPLY_C_PREDICTOR_24();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                        APPLY_C_PREDICTOR_24();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                    } else {
+                        APPLY_C_PREDICTOR_24();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                    /* always apply 2 Y predictors on these iterations */
+                    APPLY_Y_PREDICTOR_24();
+                    OUTPUT_PIXEL_PAIR();
+                    APPLY_Y_PREDICTOR_24();
+                    OUTPUT_PIXEL_PAIR();
+                    break;
+
+                case 2:
+                    /* this iteration might be C-Y-C-Y, Y-Y, or C-Y-Y 
+                     * depending on the macroblock type */
+                    if (s->block_type == BLOCK_2x2) {
+                        APPLY_C_PREDICTOR_24();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                        APPLY_C_PREDICTOR_24();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                    } else if (s->block_type == BLOCK_4x2) {
+                        APPLY_C_PREDICTOR_24();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+//                        OUTPUT_PIXEL_PAIR_24_C();
+                    } else {
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+                        APPLY_Y_PREDICTOR_24();
+                        OUTPUT_PIXEL_PAIR();
+                    }
+                    break;
+                }
+
+            } else {
+
+                /* skip (copy) four pixels, but reassign the horizontal 
+                 * predictor */
+                *current_pixel_pair = *prev_pixel_pair++;
+                *vert_pred++ = *current_pixel_pair++;
+                *current_pixel_pair = *prev_pixel_pair++;
+                horiz_pred = *current_pixel_pair - *vert_pred;
+//		c_horiz_pred = *current_pixel_pair - *vert_pred;
+                *vert_pred++ = *current_pixel_pair++;
+                
+            }
+
+            if (!keyframe) { /* change bits are only consumed for non-keyframes */
+                mb_change_byte_mask <<= 1;
+
+                /* next byte */
+                if (!mb_change_byte_mask) { /* the 8-bit mask shifted out: all bits of this byte used */
+                    mb_change_byte = mb_change_bits[mb_change_index++];
+                    mb_change_byte_mask = 0x01;
+                }
+            }
+
+            pixels_left -= 4;
+        }
+
+        /* next change row */
+        if (((y + 1) & 3) == 0) /* one row of change bits covers four picture lines */
+            mb_change_bits += s->mb_change_bits_row_size;
+
+        current_line += s->frame.linesize[0];
+        prev_line += s->prev_frame.linesize[0];
+    }
+}
+
+
 static int truemotion1_decode_frame(AVCodecContext *avctx,
                                     void *data, int *data_size,
                                     uint8_t *buf, int buf_size)
@@ -537,26 +873,22 @@ static int truemotion1_decode_frame(AVCodecContext *avctx,
     s->buf = buf;
     s->size = buf_size;
 
+    if (truemotion1_decode_header(s) == -1)
+        return -1;
+
     s->frame.reference = 1;
     if (avctx->get_buffer(avctx, &s->frame) < 0) {
-        av_log(s->avctx, AV_LOG_ERROR, "truemotion1: get_buffer() failed\n");
+        av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         return -1;
     }
 
-    /* no supplementary picture */
-    if (buf_size == 0)
-        return 0;
-
-    if (truemotion1_decode_header(s) == -1)
-        return -1;
-
     /* check for a do-nothing frame and copy the previous frame */
     if (compression_types[s->compression].algorithm == ALGO_NOP)
     {
         memcpy(s->frame.data[0], s->prev_frame.data[0],
             s->frame.linesize[0] * s->avctx->height);
     } else if (compression_types[s->compression].algorithm == ALGO_RGB24H) {
-        av_log(s->avctx, AV_LOG_ERROR, "24bit compression not yet supported\n");
+        truemotion1_decode_24bit(s);
     } else {
         truemotion1_decode_16bit(s);
     }
diff --git a/src/libffmpeg/libavcodec/tscc.c b/src/libffmpeg/libavcodec/tscc.c
new file mode 100644
index 000000000..109404404
--- /dev/null
+++ b/src/libffmpeg/libavcodec/tscc.c
@@ -0,0 +1,330 @@
+/*
+ * TechSmith Camtasia decoder
+ * Copyright (c) 2004 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+/**
+ * @file tscc.c
+ * TechSmith Camtasia decoder
+ *
+ * Fourcc: TSCC
+ *
+ * Codec is very simple:
+ *  it codes picture (picture difference, really)
+ *  with algorithm almost identical to Windows RLE8,
+ *  only without padding and with greater pixel sizes,
+ *  then this coded picture is packed with ZLib
+ *
+ * Supports: BGR8,BGR555,BGR24 - only BGR8 and BGR555 tested
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "common.h"
+#include "avcodec.h"
+
+#ifdef CONFIG_ZLIB
+#include <zlib.h>
+#endif
+
+
+/*
+ * Decoder context
+ */
+typedef struct TsccContext {
+
+    AVCodecContext *avctx; // codec context this decoder belongs to (set in decode_init)
+    AVFrame pic;           // picture the current frame is decoded into
+
+    // Bits per pixel
+    int bpp;
+    // Size in bytes of the decompression buffer
+    unsigned int decomp_size;
+    // Decompression buffer
+    unsigned char* decomp_buf;
+    int height;            // copy of avctx->height taken in decode_init
+#ifdef CONFIG_ZLIB
+    z_stream zstream;      // inflate state, reset at the start of every frame
+#endif
+} CamtasiaContext;
+
+/*
+ * Decode the RLE data held in c->decomp_buf into the picture c->pic.
+ *
+ * The coding is almost identical to Windows BMP RLE8, enhanced to bigger
+ * color depths. Output starts on the last picture line; End-of-line and
+ * Skip codes step towards the first line.
+ *
+ * Every read is checked against the end of the input, so a truncated or
+ * corrupt stream cannot make the decoder read past the decompressed data.
+ *
+ * c:       decoder context; c->bpp is expected to be 8, 16, 24 or 32
+ * srcsize: number of valid bytes at the start of c->decomp_buf
+ *
+ * Returns 0 on an End-of-picture code, -1 if the stream moves above the
+ * first line or the pixel size is unsupported, and 1 (after logging) if
+ * the input ends without an End-of-picture code.
+ */
+static int decode_rle(CamtasiaContext *c, unsigned int srcsize)
+{
+    unsigned char *src = c->decomp_buf;
+    unsigned char *src_end = c->decomp_buf + srcsize; // first byte not to read
+    unsigned char *output, *output_end;
+    unsigned char pix[4]; // bytes of the pixel repeated by a run
+    const int bytes_pp = c->bpp / 8; // bytes per pixel
+    int p1, p2, line=c->height, pos=0, i, j, len;
+
+    if (bytes_pp < 1 || bytes_pp > 4)
+        return -1; // pix[] holds at most four bytes
+
+    // start on the last picture line; output_end is one past the last line
+    output = c->pic.data[0] + (c->height - 1) * c->pic.linesize[0];
+    output_end = c->pic.data[0] + (c->height) * c->pic.linesize[0];
+
+    while(src < src_end) {
+        p1 = *src++;
+        if(p1 == 0) { //Escape code
+            if(src >= src_end)
+                break; // escape without its code byte
+            p2 = *src++;
+            if(p2 == 0) { //End-of-line
+                // validate the line before forming a pointer to it
+                if (--line < 0)
+                    return -1;
+                output = c->pic.data[0] + line * c->pic.linesize[0];
+                pos = 0;
+                continue;
+            } else if(p2 == 1) { //End-of-picture
+                return 0;
+            } else if(p2 == 2) { //Skip
+                if(src_end - src < 2)
+                    break; // skip without both offsets
+                p1 = *src++; // horizontal offset in pixels
+                p2 = *src++; // number of lines to move
+                line -= p2;
+                if (line < 0)
+                    return -1;
+                pos += p1;
+                output = c->pic.data[0] + line * c->pic.linesize[0] + pos * bytes_pp;
+                continue;
+            }
+            // Copy data: p2 literal pixels follow
+            len = p2 * bytes_pp;
+            if(src_end - src < len)
+                break; // literal pixels run past the end of the input
+            if (output_end - output < len) {
+                src += len; // no room left in the picture: drop the pixels
+            } else {
+                for(i = 0; i < len; i++)
+                    *output++ = *src++;
+                pos += p2;
+            }
+            // RLE8 copy is actually padded - and runs are not!
+            // The pad byte is part of the stream even when the pixels were
+            // dropped, so it is skipped on both paths to stay in sync.
+            if(c->bpp == 8 && (p2 & 1) && src < src_end)
+                src++;
+        } else { //Run of pixels: p1 copies of the pixel that follows
+            if(src_end - src < bytes_pp)
+                break; // run without its pixel value
+            for(j = 0; j < bytes_pp; j++)
+                pix[j] = *src++; //original pixel
+            if (output_end - output < p1 * bytes_pp)
+                continue; // run does not fit in the picture: ignore it
+            for(i = 0; i < p1; i++) {
+                for(j = 0; j < bytes_pp; j++)
+                    *output++ = pix[j];
+            }
+            pos += p1;
+        }
+    }
+
+    // reached for truncated input as well as for a missing terminator
+    av_log(c->avctx, AV_LOG_ERROR, "Camtasia warning: no End-of-picture code\n");
+    return 1;
+}
+
+/*
+ * Decode a frame: inflate buf into c->decomp_buf, RLE-decode the result
+ * into a newly acquired picture and return that picture through data.
+ * Returns buf_size on success and -1 on failure.
+ */
+static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size)
+{
+    CamtasiaContext * const c = (CamtasiaContext *)avctx->priv_data;
+    unsigned char *encoded = (unsigned char *)buf;
+#ifdef CONFIG_ZLIB
+    int zret; // Zlib return code
+#endif
+    int len = buf_size;
+
+    // drop the picture handed out for the previous frame
+    if(c->pic.data[0])
+            avctx->release_buffer(avctx, &c->pic);
+
+    c->pic.reference = 1;
+    c->pic.buffer_hints = FF_BUFFER_HINTS_VALID;
+    if(avctx->get_buffer(avctx, &c->pic) < 0){
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+
+#ifdef CONFIG_ZLIB
+    zret = inflateReset(&(c->zstream));
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate reset error: %d\n", zret);
+        return -1;
+    }
+    // feed the whole packet in and inflate it into decomp_buf in one call
+    c->zstream.next_in = encoded;
+    c->zstream.avail_in = len;
+    c->zstream.next_out = c->decomp_buf;
+    c->zstream.avail_out = c->decomp_size;
+    zret = inflate(&(c->zstream), Z_FINISH);
+    // Z_DATA_ERROR means empty picture
+    if ((zret != Z_OK) && (zret != Z_STREAM_END) && (zret != Z_DATA_ERROR)) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate error: %d\n", zret);
+        return -1;
+    }
+
+    // avail_out is the space left in decomp_buf after inflate(), so the
+    // number of bytes actually produced is the buffer size minus avail_out
+    if(zret != Z_DATA_ERROR)
+        decode_rle(c, c->decomp_size - c->zstream.avail_out);
+    
+    /* make the palette available on the way out */
+    if (c->avctx->pix_fmt == PIX_FMT_PAL8) {
+        memcpy(c->pic.data[1], c->avctx->palctrl->palette, AVPALETTE_SIZE);
+        if (c->avctx->palctrl->palette_changed) {
+            c->pic.palette_has_changed = 1;
+            c->avctx->palctrl->palette_changed = 0;
+        }
+    }
+
+#else
+    av_log(avctx, AV_LOG_ERROR, "BUG! Zlib support not compiled in frame decoder.\n");
+    return -1;
+#endif
+
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = c->pic;
+
+    /* always report that the buffer was completely consumed */
+    return buf_size;
+}
+
+
+
+/*
+ * Init tscc decoder: check the dimensions, pick the pixel format for
+ * avctx->bits_per_sample, allocate the decompression buffer and set up
+ * the inflate stream. Returns 0 on success and -1 on failure.
+ */
+static int decode_init(AVCodecContext *avctx)
+{
+    CamtasiaContext * const c = (CamtasiaContext *)avctx->priv_data;
+#ifdef CONFIG_ZLIB
+    int zret; // Zlib return code
+#endif
+
+    c->avctx = avctx;
+    avctx->has_b_frames = 0;
+    c->pic.data[0] = NULL;
+    c->height = avctx->height;
+
+    // the width comes first in avcodec_check_dimensions()
+    if (avcodec_check_dimensions(avctx, avctx->width, avctx->height) < 0) {
+        return -1;
+    }
+
+#ifdef CONFIG_ZLIB
+    // Needed if zlib unused or init aborted before inflateInit
+    memset(&(c->zstream), 0, sizeof(z_stream)); 
+#else
+    av_log(avctx, AV_LOG_ERROR, "Zlib support not compiled.\n");
+    return -1;
+#endif
+    switch(avctx->bits_per_sample){
+    case  8: avctx->pix_fmt = PIX_FMT_PAL8; break;
+    case 16: avctx->pix_fmt = PIX_FMT_RGB555; break;
+    case 24: avctx->pix_fmt = PIX_FMT_BGR24; break;
+    case 32: avctx->pix_fmt = PIX_FMT_RGBA32; break;
+    default: av_log(avctx, AV_LOG_ERROR, "Camtasia error: unknown depth %i bpp\n", avctx->bits_per_sample);
+             return -1;             
+    }
+    c->bpp = avctx->bits_per_sample;
+    c->decomp_size = (avctx->width * c->bpp + (avctx->width + 254) / 255 + 2) * avctx->height + 2;//RLE in the 'best' case
+
+    /* Allocate decompression buffer */
+    if (c->decomp_size) {
+        if ((c->decomp_buf = av_malloc(c->decomp_size)) == NULL) {
+            av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
+            return -1;
+        }
+    }
+  
+#ifdef CONFIG_ZLIB
+    c->zstream.zalloc = Z_NULL;
+    c->zstream.zfree = Z_NULL;
+    c->zstream.opaque = Z_NULL;
+    zret = inflateInit(&(c->zstream));
+    if (zret != Z_OK) {
+        av_log(avctx, AV_LOG_ERROR, "Inflate init error: %d\n", zret);
+        av_freep(&c->decomp_buf); // free here in case the caller skips decode_end after a failed init
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+
+
+/*
+ * Uninit tscc decoder: shut down the inflate stream, give back the last
+ * picture if one is still held and free the decompression buffer.
+ * Always returns 0.
+ */
+static int decode_end(AVCodecContext *avctx)
+{
+    CamtasiaContext * const ctx = (CamtasiaContext *)avctx->priv_data;
+
+#ifdef CONFIG_ZLIB
+    inflateEnd(&ctx->zstream);
+#endif
+
+    if (ctx->pic.data[0] != NULL)
+        avctx->release_buffer(avctx, &ctx->pic);
+    av_freep(&ctx->decomp_buf);
+
+    return 0;
+}
+
+AVCodec tscc_decoder = {
+        "camtasia",
+        CODEC_TYPE_VIDEO,
+        CODEC_ID_TSCC,
+        sizeof(CamtasiaContext), // size of the private context the callbacks read from avctx->priv_data
+        decode_init,
+        NULL, // NOTE(review): presumably the encode callback slot, unused by a decoder - confirm against AVCodec
+        decode_end,
+        decode_frame,
+        CODEC_CAP_DR1,
+};
+
diff --git a/src/libffmpeg/libavcodec/ulti.c b/src/libffmpeg/libavcodec/ulti.c
new file mode 100755
index 000000000..d4a0c847a
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ulti.c
@@ -0,0 +1,428 @@
+/*
+ *
+ * Copyright (C) 2004 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * IBM Ultimotion Video Decoder
+ *
+ */
+
+/**
+ * @file ulti.c 
+ * IBM Ultimotion Video Decoder.
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "common.h"
+#include "avcodec.h"
+
+#include "ulti_cb.h"
+
+typedef struct UltimotionDecodeContext {
+    AVCodecContext *avctx;     /* codec context this decoder belongs to */
+    int width, height, blocks; /* picture size and (width / 8) * (height / 8), all set in ulti_decode_init() */
+    AVFrame frame;             /* picture the decoder draws into */
+    uint8_t *ulti_codebook;    /* luma codebook, four bytes per entry; points at the table from ulti_cb.h */
+} UltimotionDecodeContext;
+
+static int ulti_decode_init(AVCodecContext *avctx)
+{
+    UltimotionDecodeContext *ctx = avctx->priv_data;
+    const int block_cols = avctx->width / 8, block_rows = avctx->height / 8; /* whole 8x8 blocks only */
+
+    ctx->avctx = avctx;                 /* decoder state: owner, geometry, codebook */
+    ctx->width = avctx->width;
+    ctx->height = avctx->height;
+    ctx->blocks = block_cols * block_rows;
+    ctx->ulti_codebook = ulti_codebook;
+    avctx->pix_fmt = PIX_FMT_YUV410P;   /* settings reported back through avctx */
+    avctx->has_b_frames = 0;
+    avctx->coded_frame = (AVFrame*) &ctx->frame;
+    return 0;
+}
+
+static const int block_coords[8] = // 4x4 block coords in 8x8 superblock
+    { 0, 0, 0, 4, 4, 4, 4, 0};
+/* angle handed to ulti_grad() for each 2-bit angle index */
+static const int angle_by_index[4] = { 0, 2, 6, 12};
+
+/* Lookup tables for luma and chroma - used by ulti_convert_yuv() */
+static const uint8_t ulti_lumas[64] =
+    { 0x10, 0x13, 0x17, 0x1A, 0x1E, 0x21, 0x25, 0x28,
+      0x2C, 0x2F, 0x33, 0x36, 0x3A, 0x3D, 0x41, 0x44,
+      0x48, 0x4B, 0x4F, 0x52, 0x56, 0x59, 0x5C, 0x60,
+      0x63, 0x67, 0x6A, 0x6E, 0x71, 0x75, 0x78, 0x7C,
+      0x7F, 0x83, 0x86, 0x8A, 0x8D, 0x91, 0x94, 0x98,
+      0x9B, 0x9F, 0xA2, 0xA5, 0xA9, 0xAC, 0xB0, 0xB3,
+      0xB7, 0xBA, 0xBE, 0xC1, 0xC5, 0xC8, 0xCC, 0xCF,
+      0xD3, 0xD6, 0xDA, 0xDD, 0xE1, 0xE4, 0xE8, 0xEB};
+      
+static const uint8_t ulti_chromas[16] =
+    { 0x60, 0x67, 0x6D, 0x73, 0x7A, 0x80, 0x86, 0x8D,
+      0x93, 0x99, 0xA0, 0xA6, 0xAC, 0xB3, 0xB9, 0xC0};
+      
+/* Convert one 4x4 Ultimotion block to standard YUV and store it in frame
+ * at luma position (x, y). luma points at sixteen 6-bit Y samples, chroma
+ * packs two 4-bit chroma samples into one byte; all of them are translated
+ * through the ulti_lumas / ulti_chromas lookup tables. */
+static void ulti_convert_yuv(AVFrame *frame, int x, int y,
+			     uint8_t *luma,int chroma)
+{
+    const int chroma_x = x / 4; /* the chroma planes are addressed at a */
+    const int chroma_y = y / 4; /* quarter of the luma coordinates */
+    uint8_t *dst = frame->data[0] + x + y * frame->linesize[0];
+    int row, col;
+
+    /* one sample per block in each chroma plane: the high nibble goes */
+    /* to plane 1, the low nibble to plane 2 */
+    frame->data[1][chroma_x + chroma_y * frame->linesize[1]] = ulti_chromas[chroma >> 4];
+    frame->data[2][chroma_x + chroma_y * frame->linesize[2]] = ulti_chromas[chroma & 0xF];
+
+    /* luma[] is laid out row by row, four samples per row */
+    for (row = 0; row < 4; row++) {
+        for (col = 0; col < 4; col++)
+            dst[col] = ulti_lumas[luma[row * 4 + col]];
+        dst += frame->linesize[0]; //next row
+    }
+}
+
+/* Generate a two-level 4x4 block like in MS Video1.
+ * The bits of f0 describe the first eight samples and the bits of f1 the
+ * last eight, both read from the most significant bit down: a set bit
+ * selects Y1, a clear bit selects Y0. The block is then stored at (x, y)
+ * with the given chroma through ulti_convert_yuv(). */
+static void ulti_pattern(AVFrame *frame, int x, int y,
+			 int f0, int f1, int Y0, int Y1, int chroma)
+{
+    uint8_t Luma[16];
+    int n;
+
+    for (n = 0; n < 8; n++) {
+        const int bit = 0x80 >> n;
+        /* samples 0..7 come from f0 */
+        Luma[n]     = (f0 & bit) ? Y1 : Y0;
+        /* samples 8..15 come from f1 */
+        Luma[n + 8] = (f1 & bit) ? Y1 : Y0;
+    }
+
+    /* translate the samples and write them into the frame */
+    ulti_convert_yuv(frame, x, y, Luma, chroma);
+}
+
+/* fill a 4x4 block at (x, y) with a gradient of the four luma levels in Y; angle selects the layout */
+static void ulti_grad(AVFrame *frame, int x, int y, uint8_t *Y, int chroma, int angle)
+{
+    uint8_t Luma[16]; // the 4x4 block, row by row
+    if(angle & 8) { //reverse order (this swaps Y[] in place, so the caller's array is modified)
+	int t;
+	angle &= 0x7;
+	t = Y[0];
+	Y[0] = Y[3];
+	Y[3] = t;
+	t = Y[1];
+	Y[1] = Y[2];
+	Y[2] = t;
+    }
+    switch(angle){
+    case 0: // every row is Y[0] Y[1] Y[2] Y[3]
+	Luma[0]  = Y[0]; Luma[1]  = Y[1]; Luma[2]  = Y[2]; Luma[3]  = Y[3];
+	Luma[4]  = Y[0]; Luma[5]  = Y[1]; Luma[6]  = Y[2]; Luma[7]  = Y[3];
+	Luma[8]  = Y[0]; Luma[9]  = Y[1]; Luma[10] = Y[2]; Luma[11] = Y[3];
+	Luma[12] = Y[0]; Luma[13] = Y[1]; Luma[14] = Y[2]; Luma[15] = Y[3];	
+	break;
+    case 1:
+	Luma[0]  = Y[1]; Luma[1]  = Y[2]; Luma[2]  = Y[3]; Luma[3]  = Y[3];
+	Luma[4]  = Y[0]; Luma[5]  = Y[1]; Luma[6]  = Y[2]; Luma[7]  = Y[3];
+	Luma[8]  = Y[0]; Luma[9]  = Y[1]; Luma[10] = Y[2]; Luma[11] = Y[3];
+	Luma[12] = Y[0]; Luma[13] = Y[0]; Luma[14] = Y[1]; Luma[15] = Y[2];	
+	break;
+    case 2:
+	Luma[0]  = Y[1]; Luma[1]  = Y[2]; Luma[2]  = Y[3]; Luma[3]  = Y[3];
+	Luma[4]  = Y[1]; Luma[5]  = Y[2]; Luma[6]  = Y[2]; Luma[7]  = Y[3];
+	Luma[8]  = Y[0]; Luma[9]  = Y[1]; Luma[10] = Y[1]; Luma[11] = Y[2];
+	Luma[12] = Y[0]; Luma[13] = Y[0]; Luma[14] = Y[1]; Luma[15] = Y[2];	
+	break;
+    case 3:
+	Luma[0]  = Y[2]; Luma[1]  = Y[3]; Luma[2]  = Y[3]; Luma[3]  = Y[3];
+	Luma[4]  = Y[1]; Luma[5]  = Y[2]; Luma[6]  = Y[2]; Luma[7]  = Y[3];
+	Luma[8]  = Y[0]; Luma[9]  = Y[1]; Luma[10] = Y[1]; Luma[11] = Y[2];
+	Luma[12] = Y[0]; Luma[13] = Y[0]; Luma[14] = Y[0]; Luma[15] = Y[1];	
+	break;
+    case 4: // each row is flat: Y[3] in the first row down to Y[0] in the last
+	Luma[0]  = Y[3]; Luma[1]  = Y[3]; Luma[2]  = Y[3]; Luma[3]  = Y[3];
+	Luma[4]  = Y[2]; Luma[5]  = Y[2]; Luma[6]  = Y[2]; Luma[7]  = Y[2];
+	Luma[8]  = Y[1]; Luma[9]  = Y[1]; Luma[10] = Y[1]; Luma[11] = Y[1];
+	Luma[12] = Y[0]; Luma[13] = Y[0]; Luma[14] = Y[0]; Luma[15] = Y[0];	
+	break;
+    case 5:
+	Luma[0]  = Y[3]; Luma[1]  = Y[3]; Luma[2]  = Y[3]; Luma[3]  = Y[2];
+	Luma[4]  = Y[3]; Luma[5]  = Y[2]; Luma[6]  = Y[2]; Luma[7]  = Y[1];
+	Luma[8]  = Y[2]; Luma[9]  = Y[1]; Luma[10] = Y[1]; Luma[11] = Y[0];
+	Luma[12] = Y[1]; Luma[13] = Y[0]; Luma[14] = Y[0]; Luma[15] = Y[0];	
+	break;
+    case 6:
+	Luma[0]  = Y[3]; Luma[1]  = Y[3]; Luma[2]  = Y[2]; Luma[3]  = Y[2];
+	Luma[4]  = Y[3]; Luma[5]  = Y[2]; Luma[6]  = Y[1]; Luma[7]  = Y[1];
+	Luma[8]  = Y[2]; Luma[9]  = Y[2]; Luma[10] = Y[1]; Luma[11] = Y[0];
+	Luma[12] = Y[1]; Luma[13] = Y[1]; Luma[14] = Y[0]; Luma[15] = Y[0];	
+	break;
+    case 7:
+	Luma[0]  = Y[3]; Luma[1]  = Y[3]; Luma[2]  = Y[2]; Luma[3]  = Y[1];
+	Luma[4]  = Y[3]; Luma[5]  = Y[2]; Luma[6]  = Y[1]; Luma[7]  = Y[0];
+	Luma[8]  = Y[3]; Luma[9]  = Y[2]; Luma[10] = Y[1]; Luma[11] = Y[0];
+	Luma[12] = Y[2]; Luma[13] = Y[1]; Luma[14] = Y[0]; Luma[15] = Y[0];	
+	break;
+    default: // any other angle (the decoder passes 16 here): four flat 2x2 quadrants
+	Luma[0]  = Y[0]; Luma[1]  = Y[0]; Luma[2]  = Y[1]; Luma[3]  = Y[1];
+	Luma[4]  = Y[0]; Luma[5]  = Y[0]; Luma[6]  = Y[1]; Luma[7]  = Y[1];
+	Luma[8]  = Y[2]; Luma[9]  = Y[2]; Luma[10] = Y[3]; Luma[11] = Y[3];
+	Luma[12] = Y[2]; Luma[13] = Y[2]; Luma[14] = Y[3]; Luma[15] = Y[3];	
+	break;
+    }
+    
+    ulti_convert_yuv(frame, x, y, Luma, chroma);
+}
+
+static int ulti_decode_frame(AVCodecContext *avctx, 
+                             void *data, int *data_size,
+                             uint8_t *buf, int buf_size)
+{   /* decode one frame from buf into s->frame; returns buf_size, or -1 if no picture buffer could be obtained */
+    UltimotionDecodeContext *s=avctx->priv_data;
+    int modifier = 0; /* set by escape 0x70; selects how codes 2 and 3 are unpacked */
+    int uniq = 0;     /* set by escape 0x71; gives the next block per-subblock chroma */
+    int mode = 0;     /* toggled by escape 0x72; while set, every block has per-subblock chroma */
+    int blocks = 0;   /* 8x8 blocks handled so far */
+    int done = 0;
+    int x = 0, y = 0; /* position of the current 8x8 block */
+    int i;
+    int skip;
+    int tmp;
+    /* give back the picture handed out for the previous frame */
+    if(s->frame.data[0])
+        avctx->release_buffer(avctx, &s->frame);
+
+    s->frame.reference = 1;
+    s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE | FF_BUFFER_HINTS_REUSABLE;
+    if(avctx->get_buffer(avctx, &s->frame) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+    /* NOTE(review): none of the reads from buf below is checked against buf_size */
+    while(!done) {
+	int idx;
+	if(blocks >= s->blocks || y >= s->height)
+	    break;//all blocks decoded
+	
+	idx = *buf++;
+	if((idx & 0xF8) == 0x70) { // 0x70..0x77 are escape codes
+	    switch(idx) {
+	    case 0x70: //change modifier
+		modifier = *buf++;
+		if(modifier>1)
+		    av_log(avctx, AV_LOG_INFO, "warning: modifier must be 0 or 1, got %i\n", modifier);
+		break;
+	    case 0x71: // set uniq flag
+		uniq = 1;
+		break;
+	    case 0x72: //toggle mode
+		mode = !mode;
+		break;
+	    case 0x73: //end-of-frame
+		done = 1;
+		break;
+	    case 0x74: //skip some blocks
+		skip = *buf++;
+		if ((blocks + skip) >= s->blocks)
+		    break; // leaves only the switch: a skip reaching the end of the frame is ignored
+		blocks += skip;
+		x += skip * 8;
+		while(x >= s->width) {
+		    x -= s->width;
+		    y += 8;
+		}
+		break;
+	    default:
+		av_log(avctx, AV_LOG_INFO, "warning: unknown escape 0x%02X\n", idx);
+	    }	
+	} else { //handle one block
+	    int code;
+	    int cf; // 1: a chroma byte precedes every coded subblock, 0: at most one for the whole block
+	    int angle = 0;
+	    uint8_t Y[4]; // luma samples of block
+	    int tx = 0, ty = 0; //coords of subblock
+	    int chroma = 0;
+	    if (mode || uniq) {
+		uniq = 0;
+		cf = 1;
+		chroma = 0;
+	    } else {
+		cf = 0;
+		if (idx) // a block with all subblocks skipped carries no chroma byte
+		    chroma = *buf++;
+	    }
+	    for (i = 0; i < 4; i++) { // for every subblock
+		code = (idx >> (6 - i*2)) & 3; //extract 2 bits
+		if(!code) //skip subblock
+		    continue;
+		if(cf)
+		    chroma = *buf++;
+		tx = x + block_coords[i * 2];
+		ty = y + block_coords[(i * 2) + 1];
+		switch(code) {
+		case 1: // one byte: 2-bit angle index and a 6-bit luma level
+		    tmp = *buf++;
+		    
+		    angle = angle_by_index[(tmp >> 6) & 0x3];
+		    
+		    Y[0] = tmp & 0x3F;
+		    Y[1] = Y[0];
+		    
+		    if (angle) {
+			Y[2] = Y[0]+1;
+			if (Y[2] > 0x3F)
+			    Y[2] = 0x3F;
+			Y[3] = Y[2];			
+		    } else {
+			Y[2] = Y[0];
+			Y[3] = Y[0];
+		    }
+		    break;
+		    
+		case 2:
+		    if (modifier) { // unpack four luma samples
+			tmp = (*buf++) << 16;
+			tmp += (*buf++) << 8;
+			tmp += *buf++;
+			
+			Y[0] = (tmp >> 18) & 0x3F;
+			Y[1] = (tmp >> 12) & 0x3F;
+			Y[2] = (tmp >> 6) & 0x3F;
+			Y[3] = tmp & 0x3F;
+			angle = 16;
+		    } else { // retrieve luma samples from codebook
+			tmp = (*buf++) << 8;
+			tmp += (*buf++);
+			
+			angle = (tmp >> 12) & 0xF;
+			tmp &= 0xFFF;
+			tmp <<= 2; // four bytes per codebook entry
+			Y[0] = s->ulti_codebook[tmp];
+			Y[1] = s->ulti_codebook[tmp + 1];
+			Y[2] = s->ulti_codebook[tmp + 2];
+			Y[3] = s->ulti_codebook[tmp + 3];
+		    }
+		    break;
+		    
+		case 3: // this code draws its block itself, see the check after the switch
+		    if (modifier) { // all 16 luma samples
+			uint8_t Luma[16];
+			
+			tmp = (*buf++) << 16;
+			tmp += (*buf++) << 8;
+			tmp += *buf++;
+			Luma[0] = (tmp >> 18) & 0x3F;
+			Luma[1] = (tmp >> 12) & 0x3F;
+			Luma[2] = (tmp >> 6) & 0x3F;
+			Luma[3] = tmp & 0x3F;
+			
+			tmp = (*buf++) << 16;
+			tmp += (*buf++) << 8;
+			tmp += *buf++;
+			Luma[4] = (tmp >> 18) & 0x3F;
+			Luma[5] = (tmp >> 12) & 0x3F;
+			Luma[6] = (tmp >> 6) & 0x3F;
+			Luma[7] = tmp & 0x3F;
+			
+			tmp = (*buf++) << 16;
+			tmp += (*buf++) << 8;
+			tmp += *buf++;
+			Luma[8] = (tmp >> 18) & 0x3F;
+			Luma[9] = (tmp >> 12) & 0x3F;
+			Luma[10] = (tmp >> 6) & 0x3F;
+			Luma[11] = tmp & 0x3F;
+			
+			tmp = (*buf++) << 16;
+			tmp += (*buf++) << 8;
+			tmp += *buf++;
+			Luma[12] = (tmp >> 18) & 0x3F;
+			Luma[13] = (tmp >> 12) & 0x3F;
+			Luma[14] = (tmp >> 6) & 0x3F;
+			Luma[15] = tmp & 0x3F;
+			
+			ulti_convert_yuv(&s->frame, tx, ty, Luma, chroma);
+		    } else {
+			tmp = *buf++;
+			if(tmp & 0x80) { // gradient with four explicit luma levels
+			    angle = (tmp >> 4) & 0x7;
+			    tmp = (tmp << 8) + *buf++;
+			    Y[0] = (tmp >> 6) & 0x3F;
+			    Y[1] = tmp & 0x3F;
+			    Y[2] = (*buf++) & 0x3F;
+			    Y[3] = (*buf++) & 0x3F;
+			    ulti_grad(&s->frame, tx, ty, Y, chroma, angle); //draw block
+			} else { // some patterns
+			    int f0, f1;
+			    f0 = *buf++;
+			    f1 = tmp;
+			    Y[0] = (*buf++) & 0x3F;
+			    Y[1] = (*buf++) & 0x3F;
+			    ulti_pattern(&s->frame, tx, ty, f1, f0, Y[0], Y[1], chroma);
+			}
+		    }
+		    break;
+		}
+		if(code != 3)
+		    ulti_grad(&s->frame, tx, ty, Y, chroma, angle); // draw block
+	    }
+	    blocks++;
+    	    x += 8;
+	    if(x >= s->width) {
+		x = 0;
+		y += 8;
+	    }
+	}
+    }
+    
+    *data_size=sizeof(AVFrame);
+    *(AVFrame*)data= s->frame;
+
+    return buf_size;
+}
+/* Uninit: release the picture kept since the last decoded frame, if any. Always returns 0. */
+static int ulti_decode_end(AVCodecContext *avctx)
+{
+    UltimotionDecodeContext *s = avctx->priv_data;
+    if (s->frame.data[0]) /* same test ulti_decode_frame() uses before reusing the frame */
+        avctx->release_buffer(avctx, &s->frame);
+    return 0;
+}
+AVCodec ulti_decoder = {
+    "ultimotion",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_ULTI,
+    sizeof(UltimotionDecodeContext), /* size of the private context the callbacks read from avctx->priv_data */
+    ulti_decode_init,
+    NULL, /* NOTE(review): presumably the encode callback slot, unused by a decoder - confirm against AVCodec */
+    ulti_decode_end,
+    ulti_decode_frame,
+    CODEC_CAP_DR1,
+    NULL
+};
+
diff --git a/src/libffmpeg/libavcodec/ulti_cb.h b/src/libffmpeg/libavcodec/ulti_cb.h
new file mode 100755
index 000000000..d059439dc
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ulti_cb.h
@@ -0,0 +1,4098 @@
+static unsigned char ulti_codebook[16384]={
+    0x00, 0x01, 0x01, 0x02,
+    0x00, 0x01, 0x02, 0x03,
+    0x00, 0x02, 0x03, 0x04,
+    0x00, 0x01, 0x03, 0x04,
+    0x00, 0x01, 0x02, 0x04,
+    0x00, 0x02, 0x03, 0x05,
+    0x00, 0x02, 0x04, 0x05,
+    0x00, 0x01, 0x04, 0x05,
+    0x00, 0x01, 0x03, 0x05,
+    0x00, 0x02, 0x04, 0x06,
+    0x00, 0x03, 0x05, 0x06,
+    0x00, 0x01, 0x05, 0x06,
+    0x00, 0x01, 0x03, 0x06,
+    0x00, 0x06, 0x06, 0x06,
+    0x00, 0x00, 0x06, 0x06,
+    0x00, 0x00, 0x00, 0x06,
+    0x00, 0x03, 0x04, 0x07,
+    0x00, 0x03, 0x06, 0x07,
+    0x00, 0x01, 0x06, 0x07,
+    0x00, 0x01, 0x04, 0x07,
+    0x00, 0x03, 0x05, 0x08,
+    0x00, 0x04, 0x06, 0x08,
+    0x00, 0x02, 0x06, 0x08,
+    0x00, 0x02, 0x04, 0x08,
+    0x00, 0x08, 0x08, 0x08,
+    0x00, 0x00, 0x08, 0x08,
+    0x00, 0x00, 0x00, 0x08,
+    0x00, 0x04, 0x07, 0x0B,
+    0x00, 0x05, 0x09, 0x0B,
+    0x00, 0x02, 0x09, 0x0B,
+    0x00, 0x02, 0x06, 0x0B,
+    0x00, 0x0B, 0x0B, 0x0B,
+    0x00, 0x00, 0x0B, 0x0B,
+    0x00, 0x00, 0x00, 0x0B,
+    0x00, 0x05, 0x09, 0x0E,
+    0x00, 0x07, 0x0B, 0x0E,
+    0x00, 0x03, 0x0B, 0x0E,
+    0x00, 0x03, 0x07, 0x0E,
+    0x00, 0x0E, 0x0E, 0x0E,
+    0x00, 0x00, 0x0E, 0x0E,
+    0x00, 0x00, 0x00, 0x0E,
+    0x00, 0x06, 0x0B, 0x11,
+    0x00, 0x08, 0x0D, 0x11,
+    0x00, 0x04, 0x0D, 0x11,
+    0x00, 0x04, 0x09, 0x11,
+    0x00, 0x11, 0x11, 0x11,
+    0x00, 0x00, 0x11, 0x11,
+    0x00, 0x00, 0x00, 0x11,
+    0x00, 0x07, 0x0D, 0x14,
+    0x00, 0x0A, 0x0F, 0x14,
+    0x00, 0x05, 0x0F, 0x14,
+    0x00, 0x05, 0x0A, 0x14,
+    0x00, 0x14, 0x14, 0x14,
+    0x00, 0x00, 0x14, 0x14,
+    0x00, 0x00, 0x00, 0x14,
+    0x00, 0x0B, 0x12, 0x17,
+    0x00, 0x05, 0x12, 0x17,
+    0x00, 0x05, 0x0C, 0x17,
+    0x00, 0x17, 0x17, 0x17,
+    0x00, 0x00, 0x17, 0x17,
+    0x00, 0x00, 0x00, 0x17,
+    0x00, 0x0D, 0x14, 0x1A,
+    0x00, 0x06, 0x14, 0x1A,
+    0x00, 0x06, 0x0D, 0x1A,
+    0x00, 0x1A, 0x1A, 0x1A,
+    0x00, 0x00, 0x1A, 0x1A,
+    0x00, 0x00, 0x00, 0x1A,
+    0x00, 0x0E, 0x16, 0x1D,
+    0x00, 0x07, 0x16, 0x1D,
+    0x00, 0x07, 0x0F, 0x1D,
+    0x00, 0x1D, 0x1D, 0x1D,
+    0x00, 0x00, 0x1D, 0x1D,
+    0x00, 0x00, 0x00, 0x1D,
+    0x00, 0x10, 0x18, 0x20,
+    0x00, 0x08, 0x18, 0x20,
+    0x00, 0x08, 0x10, 0x20,
+    0x00, 0x20, 0x20, 0x20,
+    0x00, 0x00, 0x20, 0x20,
+    0x00, 0x00, 0x00, 0x20,
+    0x00, 0x23, 0x23, 0x23,
+    0x00, 0x00, 0x23, 0x23,
+    0x00, 0x00, 0x00, 0x23,
+    0x00, 0x12, 0x1B, 0x24,
+    0x00, 0x09, 0x1B, 0x24,
+    0x00, 0x09, 0x12, 0x24,
+    0x00, 0x28, 0x28, 0x28,
+    0x00, 0x00, 0x28, 0x28,
+    0x00, 0x00, 0x00, 0x28,
+    0x00, 0x2E, 0x2E, 0x2E,
+    0x00, 0x00, 0x2E, 0x2E,
+    0x00, 0x00, 0x00, 0x2E,
+    0x01, 0x02, 0x02, 0x03,
+    0x01, 0x02, 0x03, 0x04,
+    0x01, 0x03, 0x04, 0x05,
+    0x01, 0x02, 0x04, 0x05,
+    0x01, 0x02, 0x03, 0x05,
+    0x01, 0x03, 0x04, 0x06,
+    0x01, 0x03, 0x05, 0x06,
+    0x01, 0x02, 0x05, 0x06,
+    0x01, 0x02, 0x04, 0x06,
+    0x01, 0x03, 0x05, 0x07,
+    0x01, 0x04, 0x06, 0x07,
+    0x01, 0x02, 0x06, 0x07,
+    0x01, 0x02, 0x04, 0x07,
+    0x01, 0x07, 0x07, 0x07,
+    0x01, 0x01, 0x07, 0x07,
+    0x01, 0x01, 0x01, 0x07,
+    0x01, 0x04, 0x05, 0x08,
+    0x01, 0x04, 0x07, 0x08,
+    0x01, 0x02, 0x07, 0x08,
+    0x01, 0x02, 0x05, 0x08,
+    0x01, 0x04, 0x06, 0x09,
+    0x01, 0x05, 0x07, 0x09,
+    0x01, 0x03, 0x07, 0x09,
+    0x01, 0x03, 0x05, 0x09,
+    0x01, 0x09, 0x09, 0x09,
+    0x01, 0x01, 0x09, 0x09,
+    0x01, 0x01, 0x01, 0x09,
+    0x01, 0x05, 0x08, 0x0C,
+    0x01, 0x06, 0x0A, 0x0C,
+    0x01, 0x03, 0x0A, 0x0C,
+    0x01, 0x03, 0x07, 0x0C,
+    0x01, 0x0C, 0x0C, 0x0C,
+    0x01, 0x01, 0x0C, 0x0C,
+    0x01, 0x01, 0x01, 0x0C,
+    0x01, 0x06, 0x0A, 0x0F,
+    0x01, 0x08, 0x0C, 0x0F,
+    0x01, 0x04, 0x0C, 0x0F,
+    0x01, 0x04, 0x08, 0x0F,
+    0x01, 0x0F, 0x0F, 0x0F,
+    0x01, 0x01, 0x0F, 0x0F,
+    0x01, 0x01, 0x01, 0x0F,
+    0x01, 0x07, 0x0C, 0x12,
+    0x01, 0x09, 0x0E, 0x12,
+    0x01, 0x05, 0x0E, 0x12,
+    0x01, 0x05, 0x0A, 0x12,
+    0x01, 0x12, 0x12, 0x12,
+    0x01, 0x01, 0x12, 0x12,
+    0x01, 0x01, 0x01, 0x12,
+    0x01, 0x08, 0x0E, 0x15,
+    0x01, 0x0B, 0x10, 0x15,
+    0x01, 0x06, 0x10, 0x15,
+    0x01, 0x06, 0x0B, 0x15,
+    0x01, 0x15, 0x15, 0x15,
+    0x01, 0x01, 0x15, 0x15,
+    0x01, 0x01, 0x01, 0x15,
+    0x01, 0x0C, 0x13, 0x18,
+    0x01, 0x06, 0x13, 0x18,
+    0x01, 0x06, 0x0D, 0x18,
+    0x01, 0x18, 0x18, 0x18,
+    0x01, 0x01, 0x18, 0x18,
+    0x01, 0x01, 0x01, 0x18,
+    0x01, 0x0E, 0x15, 0x1B,
+    0x01, 0x07, 0x15, 0x1B,
+    0x01, 0x07, 0x0E, 0x1B,
+    0x01, 0x1B, 0x1B, 0x1B,
+    0x01, 0x01, 0x1B, 0x1B,
+    0x01, 0x01, 0x01, 0x1B,
+    0x01, 0x0F, 0x17, 0x1E,
+    0x01, 0x08, 0x17, 0x1E,
+    0x01, 0x08, 0x10, 0x1E,
+    0x01, 0x1E, 0x1E, 0x1E,
+    0x01, 0x01, 0x1E, 0x1E,
+    0x01, 0x01, 0x01, 0x1E,
+    0x01, 0x11, 0x19, 0x21,
+    0x01, 0x09, 0x19, 0x21,
+    0x01, 0x09, 0x11, 0x21,
+    0x01, 0x21, 0x21, 0x21,
+    0x01, 0x01, 0x21, 0x21,
+    0x01, 0x01, 0x01, 0x21,
+    0x01, 0x24, 0x24, 0x24,
+    0x01, 0x01, 0x24, 0x24,
+    0x01, 0x01, 0x01, 0x24,
+    0x01, 0x13, 0x1C, 0x25,
+    0x01, 0x0A, 0x1C, 0x25,
+    0x01, 0x0A, 0x13, 0x25,
+    0x01, 0x29, 0x29, 0x29,
+    0x01, 0x01, 0x29, 0x29,
+    0x01, 0x01, 0x01, 0x29,
+    0x01, 0x2F, 0x2F, 0x2F,
+    0x01, 0x01, 0x2F, 0x2F,
+    0x01, 0x01, 0x01, 0x2F,
+    0x02, 0x03, 0x03, 0x04,
+    0x02, 0x03, 0x04, 0x05,
+    0x02, 0x04, 0x05, 0x06,
+    0x02, 0x03, 0x05, 0x06,
+    0x02, 0x03, 0x04, 0x06,
+    0x02, 0x04, 0x05, 0x07,
+    0x02, 0x04, 0x06, 0x07,
+    0x02, 0x03, 0x06, 0x07,
+    0x02, 0x03, 0x05, 0x07,
+    0x02, 0x04, 0x06, 0x08,
+    0x02, 0x05, 0x07, 0x08,
+    0x02, 0x03, 0x07, 0x08,
+    0x02, 0x03, 0x05, 0x08,
+    0x02, 0x08, 0x08, 0x08,
+    0x02, 0x02, 0x08, 0x08,
+    0x02, 0x02, 0x02, 0x08,
+    0x02, 0x05, 0x06, 0x09,
+    0x02, 0x05, 0x08, 0x09,
+    0x02, 0x03, 0x08, 0x09,
+    0x02, 0x03, 0x06, 0x09,
+    0x02, 0x05, 0x07, 0x0A,
+    0x02, 0x06, 0x08, 0x0A,
+    0x02, 0x04, 0x08, 0x0A,
+    0x02, 0x04, 0x06, 0x0A,
+    0x02, 0x0A, 0x0A, 0x0A,
+    0x02, 0x02, 0x0A, 0x0A,
+    0x02, 0x02, 0x02, 0x0A,
+    0x02, 0x06, 0x09, 0x0D,
+    0x02, 0x07, 0x0B, 0x0D,
+    0x02, 0x04, 0x0B, 0x0D,
+    0x02, 0x04, 0x08, 0x0D,
+    0x02, 0x0D, 0x0D, 0x0D,
+    0x02, 0x02, 0x0D, 0x0D,
+    0x02, 0x02, 0x02, 0x0D,
+    0x02, 0x07, 0x0B, 0x10,
+    0x02, 0x09, 0x0D, 0x10,
+    0x02, 0x05, 0x0D, 0x10,
+    0x02, 0x05, 0x09, 0x10,
+    0x02, 0x10, 0x10, 0x10,
+    0x02, 0x02, 0x10, 0x10,
+    0x02, 0x02, 0x02, 0x10,
+    0x02, 0x08, 0x0D, 0x13,
+    0x02, 0x0A, 0x0F, 0x13,
+    0x02, 0x06, 0x0F, 0x13,
+    0x02, 0x06, 0x0B, 0x13,
+    0x02, 0x13, 0x13, 0x13,
+    0x02, 0x02, 0x13, 0x13,
+    0x02, 0x02, 0x02, 0x13,
+    0x02, 0x09, 0x0F, 0x16,
+    0x02, 0x0C, 0x11, 0x16,
+    0x02, 0x07, 0x11, 0x16,
+    0x02, 0x07, 0x0C, 0x16,
+    0x02, 0x16, 0x16, 0x16,
+    0x02, 0x02, 0x16, 0x16,
+    0x02, 0x02, 0x02, 0x16,
+    0x02, 0x0D, 0x14, 0x19,
+    0x02, 0x07, 0x14, 0x19,
+    0x02, 0x07, 0x0E, 0x19,
+    0x02, 0x19, 0x19, 0x19,
+    0x02, 0x02, 0x19, 0x19,
+    0x02, 0x02, 0x02, 0x19,
+    0x02, 0x0F, 0x16, 0x1C,
+    0x02, 0x08, 0x16, 0x1C,
+    0x02, 0x08, 0x0F, 0x1C,
+    0x02, 0x1C, 0x1C, 0x1C,
+    0x02, 0x02, 0x1C, 0x1C,
+    0x02, 0x02, 0x02, 0x1C,
+    0x02, 0x10, 0x18, 0x1F,
+    0x02, 0x09, 0x18, 0x1F,
+    0x02, 0x09, 0x11, 0x1F,
+    0x02, 0x1F, 0x1F, 0x1F,
+    0x02, 0x02, 0x1F, 0x1F,
+    0x02, 0x02, 0x02, 0x1F,
+    0x02, 0x12, 0x1A, 0x22,
+    0x02, 0x0A, 0x1A, 0x22,
+    0x02, 0x0A, 0x12, 0x22,
+    0x02, 0x22, 0x22, 0x22,
+    0x02, 0x02, 0x22, 0x22,
+    0x02, 0x02, 0x02, 0x22,
+    0x02, 0x25, 0x25, 0x25,
+    0x02, 0x02, 0x25, 0x25,
+    0x02, 0x02, 0x02, 0x25,
+    0x02, 0x14, 0x1D, 0x26,
+    0x02, 0x0B, 0x1D, 0x26,
+    0x02, 0x0B, 0x14, 0x26,
+    0x02, 0x2A, 0x2A, 0x2A,
+    0x02, 0x02, 0x2A, 0x2A,
+    0x02, 0x02, 0x02, 0x2A,
+    0x02, 0x30, 0x30, 0x30,
+    0x02, 0x02, 0x30, 0x30,
+    0x02, 0x02, 0x02, 0x30,
+    0x03, 0x04, 0x04, 0x05,
+    0x03, 0x04, 0x05, 0x06,
+    0x03, 0x05, 0x06, 0x07,
+    0x03, 0x04, 0x06, 0x07,
+    0x03, 0x04, 0x05, 0x07,
+    0x03, 0x05, 0x06, 0x08,
+    0x03, 0x05, 0x07, 0x08,
+    0x03, 0x04, 0x07, 0x08,
+    0x03, 0x04, 0x06, 0x08,
+    0x03, 0x05, 0x07, 0x09,
+    0x03, 0x06, 0x08, 0x09,
+    0x03, 0x04, 0x08, 0x09,
+    0x03, 0x04, 0x06, 0x09,
+    0x03, 0x09, 0x09, 0x09,
+    0x03, 0x03, 0x09, 0x09,
+    0x03, 0x03, 0x03, 0x09,
+    0x03, 0x06, 0x07, 0x0A,
+    0x03, 0x06, 0x09, 0x0A,
+    0x03, 0x04, 0x09, 0x0A,
+    0x03, 0x04, 0x07, 0x0A,
+    0x03, 0x06, 0x08, 0x0B,
+    0x03, 0x07, 0x09, 0x0B,
+    0x03, 0x05, 0x09, 0x0B,
+    0x03, 0x05, 0x07, 0x0B,
+    0x03, 0x0B, 0x0B, 0x0B,
+    0x03, 0x03, 0x0B, 0x0B,
+    0x03, 0x03, 0x03, 0x0B,
+    0x03, 0x07, 0x0A, 0x0E,
+    0x03, 0x08, 0x0C, 0x0E,
+    0x03, 0x05, 0x0C, 0x0E,
+    0x03, 0x05, 0x09, 0x0E,
+    0x03, 0x0E, 0x0E, 0x0E,
+    0x03, 0x03, 0x0E, 0x0E,
+    0x03, 0x03, 0x03, 0x0E,
+    0x03, 0x08, 0x0C, 0x11,
+    0x03, 0x0A, 0x0E, 0x11,
+    0x03, 0x06, 0x0E, 0x11,
+    0x03, 0x06, 0x0A, 0x11,
+    0x03, 0x11, 0x11, 0x11,
+    0x03, 0x03, 0x11, 0x11,
+    0x03, 0x03, 0x03, 0x11,
+    0x03, 0x09, 0x0E, 0x14,
+    0x03, 0x0B, 0x10, 0x14,
+    0x03, 0x07, 0x10, 0x14,
+    0x03, 0x07, 0x0C, 0x14,
+    0x03, 0x14, 0x14, 0x14,
+    0x03, 0x03, 0x14, 0x14,
+    0x03, 0x03, 0x03, 0x14,
+    0x03, 0x0A, 0x10, 0x17,
+    0x03, 0x0D, 0x12, 0x17,
+    0x03, 0x08, 0x12, 0x17,
+    0x03, 0x08, 0x0D, 0x17,
+    0x03, 0x17, 0x17, 0x17,
+    0x03, 0x03, 0x17, 0x17,
+    0x03, 0x03, 0x03, 0x17,
+    0x03, 0x0E, 0x15, 0x1A,
+    0x03, 0x08, 0x15, 0x1A,
+    0x03, 0x08, 0x0F, 0x1A,
+    0x03, 0x1A, 0x1A, 0x1A,
+    0x03, 0x03, 0x1A, 0x1A,
+    0x03, 0x03, 0x03, 0x1A,
+    0x03, 0x10, 0x17, 0x1D,
+    0x03, 0x09, 0x17, 0x1D,
+    0x03, 0x09, 0x10, 0x1D,
+    0x03, 0x1D, 0x1D, 0x1D,
+    0x03, 0x03, 0x1D, 0x1D,
+    0x03, 0x03, 0x03, 0x1D,
+    0x03, 0x11, 0x19, 0x20,
+    0x03, 0x0A, 0x19, 0x20,
+    0x03, 0x0A, 0x12, 0x20,
+    0x03, 0x20, 0x20, 0x20,
+    0x03, 0x03, 0x20, 0x20,
+    0x03, 0x03, 0x03, 0x20,
+    0x03, 0x13, 0x1B, 0x23,
+    0x03, 0x0B, 0x1B, 0x23,
+    0x03, 0x0B, 0x13, 0x23,
+    0x03, 0x23, 0x23, 0x23,
+    0x03, 0x03, 0x23, 0x23,
+    0x03, 0x03, 0x03, 0x23,
+    0x03, 0x26, 0x26, 0x26,
+    0x03, 0x03, 0x26, 0x26,
+    0x03, 0x03, 0x03, 0x26,
+    0x03, 0x15, 0x1E, 0x27,
+    0x03, 0x0C, 0x1E, 0x27,
+    0x03, 0x0C, 0x15, 0x27,
+    0x03, 0x2B, 0x2B, 0x2B,
+    0x03, 0x03, 0x2B, 0x2B,
+    0x03, 0x03, 0x03, 0x2B,
+    0x03, 0x31, 0x31, 0x31,
+    0x03, 0x03, 0x31, 0x31,
+    0x03, 0x03, 0x03, 0x31,
+    0x04, 0x05, 0x05, 0x06,
+    0x04, 0x05, 0x06, 0x07,
+    0x04, 0x06, 0x07, 0x08,
+    0x04, 0x05, 0x07, 0x08,
+    0x04, 0x05, 0x06, 0x08,
+    0x04, 0x06, 0x07, 0x09,
+    0x04, 0x06, 0x08, 0x09,
+    0x04, 0x05, 0x08, 0x09,
+    0x04, 0x05, 0x07, 0x09,
+    0x04, 0x06, 0x08, 0x0A,
+    0x04, 0x07, 0x09, 0x0A,
+    0x04, 0x05, 0x09, 0x0A,
+    0x04, 0x05, 0x07, 0x0A,
+    0x04, 0x0A, 0x0A, 0x0A,
+    0x04, 0x04, 0x0A, 0x0A,
+    0x04, 0x04, 0x04, 0x0A,
+    0x04, 0x07, 0x08, 0x0B,
+    0x04, 0x07, 0x0A, 0x0B,
+    0x04, 0x05, 0x0A, 0x0B,
+    0x04, 0x05, 0x08, 0x0B,
+    0x04, 0x07, 0x09, 0x0C,
+    0x04, 0x08, 0x0A, 0x0C,
+    0x04, 0x06, 0x0A, 0x0C,
+    0x04, 0x06, 0x08, 0x0C,
+    0x04, 0x0C, 0x0C, 0x0C,
+    0x04, 0x04, 0x0C, 0x0C,
+    0x04, 0x04, 0x04, 0x0C,
+    0x04, 0x08, 0x0B, 0x0F,
+    0x04, 0x09, 0x0D, 0x0F,
+    0x04, 0x06, 0x0D, 0x0F,
+    0x04, 0x06, 0x0A, 0x0F,
+    0x04, 0x0F, 0x0F, 0x0F,
+    0x04, 0x04, 0x0F, 0x0F,
+    0x04, 0x04, 0x04, 0x0F,
+    0x04, 0x09, 0x0D, 0x12,
+    0x04, 0x0B, 0x0F, 0x12,
+    0x04, 0x07, 0x0F, 0x12,
+    0x04, 0x07, 0x0B, 0x12,
+    0x04, 0x12, 0x12, 0x12,
+    0x04, 0x04, 0x12, 0x12,
+    0x04, 0x04, 0x04, 0x12,
+    0x04, 0x0A, 0x0F, 0x15,
+    0x04, 0x0C, 0x11, 0x15,
+    0x04, 0x08, 0x11, 0x15,
+    0x04, 0x08, 0x0D, 0x15,
+    0x04, 0x15, 0x15, 0x15,
+    0x04, 0x04, 0x15, 0x15,
+    0x04, 0x04, 0x04, 0x15,
+    0x04, 0x0B, 0x11, 0x18,
+    0x04, 0x0E, 0x13, 0x18,
+    0x04, 0x09, 0x13, 0x18,
+    0x04, 0x09, 0x0E, 0x18,
+    0x04, 0x18, 0x18, 0x18,
+    0x04, 0x04, 0x18, 0x18,
+    0x04, 0x04, 0x04, 0x18,
+    0x04, 0x0F, 0x16, 0x1B,
+    0x04, 0x09, 0x16, 0x1B,
+    0x04, 0x09, 0x10, 0x1B,
+    0x04, 0x1B, 0x1B, 0x1B,
+    0x04, 0x04, 0x1B, 0x1B,
+    0x04, 0x04, 0x04, 0x1B,
+    0x04, 0x11, 0x18, 0x1E,
+    0x04, 0x0A, 0x18, 0x1E,
+    0x04, 0x0A, 0x11, 0x1E,
+    0x04, 0x1E, 0x1E, 0x1E,
+    0x04, 0x04, 0x1E, 0x1E,
+    0x04, 0x04, 0x04, 0x1E,
+    0x04, 0x12, 0x1A, 0x21,
+    0x04, 0x0B, 0x1A, 0x21,
+    0x04, 0x0B, 0x13, 0x21,
+    0x04, 0x21, 0x21, 0x21,
+    0x04, 0x04, 0x21, 0x21,
+    0x04, 0x04, 0x04, 0x21,
+    0x04, 0x14, 0x1C, 0x24,
+    0x04, 0x0C, 0x1C, 0x24,
+    0x04, 0x0C, 0x14, 0x24,
+    0x04, 0x24, 0x24, 0x24,
+    0x04, 0x04, 0x24, 0x24,
+    0x04, 0x04, 0x04, 0x24,
+    0x04, 0x27, 0x27, 0x27,
+    0x04, 0x04, 0x27, 0x27,
+    0x04, 0x04, 0x04, 0x27,
+    0x04, 0x16, 0x1F, 0x28,
+    0x04, 0x0D, 0x1F, 0x28,
+    0x04, 0x0D, 0x16, 0x28,
+    0x04, 0x2C, 0x2C, 0x2C,
+    0x04, 0x04, 0x2C, 0x2C,
+    0x04, 0x04, 0x04, 0x2C,
+    0x04, 0x32, 0x32, 0x32,
+    0x04, 0x04, 0x32, 0x32,
+    0x04, 0x04, 0x04, 0x32,
+    0x05, 0x06, 0x06, 0x07,
+    0x05, 0x06, 0x07, 0x08,
+    0x05, 0x07, 0x08, 0x09,
+    0x05, 0x06, 0x08, 0x09,
+    0x05, 0x06, 0x07, 0x09,
+    0x05, 0x07, 0x08, 0x0A,
+    0x05, 0x07, 0x09, 0x0A,
+    0x05, 0x06, 0x09, 0x0A,
+    0x05, 0x06, 0x08, 0x0A,
+    0x05, 0x07, 0x09, 0x0B,
+    0x05, 0x08, 0x0A, 0x0B,
+    0x05, 0x06, 0x0A, 0x0B,
+    0x05, 0x06, 0x08, 0x0B,
+    0x05, 0x0B, 0x0B, 0x0B,
+    0x05, 0x05, 0x0B, 0x0B,
+    0x05, 0x05, 0x05, 0x0B,
+    0x05, 0x08, 0x09, 0x0C,
+    0x05, 0x08, 0x0B, 0x0C,
+    0x05, 0x06, 0x0B, 0x0C,
+    0x05, 0x06, 0x09, 0x0C,
+    0x05, 0x08, 0x0A, 0x0D,
+    0x05, 0x09, 0x0B, 0x0D,
+    0x05, 0x07, 0x0B, 0x0D,
+    0x05, 0x07, 0x09, 0x0D,
+    0x05, 0x0D, 0x0D, 0x0D,
+    0x05, 0x05, 0x0D, 0x0D,
+    0x05, 0x05, 0x05, 0x0D,
+    0x05, 0x09, 0x0C, 0x10,
+    0x05, 0x0A, 0x0E, 0x10,
+    0x05, 0x07, 0x0E, 0x10,
+    0x05, 0x07, 0x0B, 0x10,
+    0x05, 0x10, 0x10, 0x10,
+    0x05, 0x05, 0x10, 0x10,
+    0x05, 0x05, 0x05, 0x10,
+    0x05, 0x0A, 0x0E, 0x13,
+    0x05, 0x0C, 0x10, 0x13,
+    0x05, 0x08, 0x10, 0x13,
+    0x05, 0x08, 0x0C, 0x13,
+    0x05, 0x13, 0x13, 0x13,
+    0x05, 0x05, 0x13, 0x13,
+    0x05, 0x05, 0x05, 0x13,
+    0x05, 0x0B, 0x10, 0x16,
+    0x05, 0x0D, 0x12, 0x16,
+    0x05, 0x09, 0x12, 0x16,
+    0x05, 0x09, 0x0E, 0x16,
+    0x05, 0x16, 0x16, 0x16,
+    0x05, 0x05, 0x16, 0x16,
+    0x05, 0x05, 0x05, 0x16,
+    0x05, 0x0C, 0x12, 0x19,
+    0x05, 0x0F, 0x14, 0x19,
+    0x05, 0x0A, 0x14, 0x19,
+    0x05, 0x0A, 0x0F, 0x19,
+    0x05, 0x19, 0x19, 0x19,
+    0x05, 0x05, 0x19, 0x19,
+    0x05, 0x05, 0x05, 0x19,
+    0x05, 0x10, 0x17, 0x1C,
+    0x05, 0x0A, 0x17, 0x1C,
+    0x05, 0x0A, 0x11, 0x1C,
+    0x05, 0x1C, 0x1C, 0x1C,
+    0x05, 0x05, 0x1C, 0x1C,
+    0x05, 0x05, 0x05, 0x1C,
+    0x05, 0x12, 0x19, 0x1F,
+    0x05, 0x0B, 0x19, 0x1F,
+    0x05, 0x0B, 0x12, 0x1F,
+    0x05, 0x1F, 0x1F, 0x1F,
+    0x05, 0x05, 0x1F, 0x1F,
+    0x05, 0x05, 0x05, 0x1F,
+    0x05, 0x13, 0x1B, 0x22,
+    0x05, 0x0C, 0x1B, 0x22,
+    0x05, 0x0C, 0x14, 0x22,
+    0x05, 0x22, 0x22, 0x22,
+    0x05, 0x05, 0x22, 0x22,
+    0x05, 0x05, 0x05, 0x22,
+    0x05, 0x15, 0x1D, 0x25,
+    0x05, 0x0D, 0x1D, 0x25,
+    0x05, 0x0D, 0x15, 0x25,
+    0x05, 0x25, 0x25, 0x25,
+    0x05, 0x05, 0x25, 0x25,
+    0x05, 0x05, 0x05, 0x25,
+    0x05, 0x28, 0x28, 0x28,
+    0x05, 0x05, 0x28, 0x28,
+    0x05, 0x05, 0x05, 0x28,
+    0x05, 0x17, 0x20, 0x29,
+    0x05, 0x0E, 0x20, 0x29,
+    0x05, 0x0E, 0x17, 0x29,
+    0x05, 0x2D, 0x2D, 0x2D,
+    0x05, 0x05, 0x2D, 0x2D,
+    0x05, 0x05, 0x05, 0x2D,
+    0x05, 0x33, 0x33, 0x33,
+    0x05, 0x05, 0x33, 0x33,
+    0x05, 0x05, 0x05, 0x33,
+    0x06, 0x07, 0x07, 0x08,
+    0x06, 0x07, 0x08, 0x09,
+    0x06, 0x08, 0x09, 0x0A,
+    0x06, 0x07, 0x09, 0x0A,
+    0x06, 0x07, 0x08, 0x0A,
+    0x06, 0x08, 0x09, 0x0B,
+    0x06, 0x08, 0x0A, 0x0B,
+    0x06, 0x07, 0x0A, 0x0B,
+    0x06, 0x07, 0x09, 0x0B,
+    0x06, 0x08, 0x0A, 0x0C,
+    0x06, 0x09, 0x0B, 0x0C,
+    0x06, 0x07, 0x0B, 0x0C,
+    0x06, 0x07, 0x09, 0x0C,
+    0x06, 0x0C, 0x0C, 0x0C,
+    0x06, 0x06, 0x0C, 0x0C,
+    0x06, 0x06, 0x06, 0x0C,
+    0x06, 0x09, 0x0A, 0x0D,
+    0x06, 0x09, 0x0C, 0x0D,
+    0x06, 0x07, 0x0C, 0x0D,
+    0x06, 0x07, 0x0A, 0x0D,
+    0x06, 0x09, 0x0B, 0x0E,
+    0x06, 0x0A, 0x0C, 0x0E,
+    0x06, 0x08, 0x0C, 0x0E,
+    0x06, 0x08, 0x0A, 0x0E,
+    0x06, 0x0E, 0x0E, 0x0E,
+    0x06, 0x06, 0x0E, 0x0E,
+    0x06, 0x06, 0x06, 0x0E,
+    0x06, 0x0A, 0x0D, 0x11,
+    0x06, 0x0B, 0x0F, 0x11,
+    0x06, 0x08, 0x0F, 0x11,
+    0x06, 0x08, 0x0C, 0x11,
+    0x06, 0x11, 0x11, 0x11,
+    0x06, 0x06, 0x11, 0x11,
+    0x06, 0x06, 0x06, 0x11,
+    0x06, 0x0B, 0x0F, 0x14,
+    0x06, 0x0D, 0x11, 0x14,
+    0x06, 0x09, 0x11, 0x14,
+    0x06, 0x09, 0x0D, 0x14,
+    0x06, 0x14, 0x14, 0x14,
+    0x06, 0x06, 0x14, 0x14,
+    0x06, 0x06, 0x06, 0x14,
+    0x06, 0x0C, 0x11, 0x17,
+    0x06, 0x0E, 0x13, 0x17,
+    0x06, 0x0A, 0x13, 0x17,
+    0x06, 0x0A, 0x0F, 0x17,
+    0x06, 0x17, 0x17, 0x17,
+    0x06, 0x06, 0x17, 0x17,
+    0x06, 0x06, 0x06, 0x17,
+    0x06, 0x0D, 0x13, 0x1A,
+    0x06, 0x10, 0x15, 0x1A,
+    0x06, 0x0B, 0x15, 0x1A,
+    0x06, 0x0B, 0x10, 0x1A,
+    0x06, 0x1A, 0x1A, 0x1A,
+    0x06, 0x06, 0x1A, 0x1A,
+    0x06, 0x06, 0x06, 0x1A,
+    0x06, 0x11, 0x18, 0x1D,
+    0x06, 0x0B, 0x18, 0x1D,
+    0x06, 0x0B, 0x12, 0x1D,
+    0x06, 0x1D, 0x1D, 0x1D,
+    0x06, 0x06, 0x1D, 0x1D,
+    0x06, 0x06, 0x06, 0x1D,
+    0x06, 0x13, 0x1A, 0x20,
+    0x06, 0x0C, 0x1A, 0x20,
+    0x06, 0x0C, 0x13, 0x20,
+    0x06, 0x20, 0x20, 0x20,
+    0x06, 0x06, 0x20, 0x20,
+    0x06, 0x06, 0x06, 0x20,
+    0x06, 0x14, 0x1C, 0x23,
+    0x06, 0x0D, 0x1C, 0x23,
+    0x06, 0x0D, 0x15, 0x23,
+    0x06, 0x23, 0x23, 0x23,
+    0x06, 0x06, 0x23, 0x23,
+    0x06, 0x06, 0x06, 0x23,
+    0x06, 0x16, 0x1E, 0x26,
+    0x06, 0x0E, 0x1E, 0x26,
+    0x06, 0x0E, 0x16, 0x26,
+    0x06, 0x26, 0x26, 0x26,
+    0x06, 0x06, 0x26, 0x26,
+    0x06, 0x06, 0x06, 0x26,
+    0x06, 0x29, 0x29, 0x29,
+    0x06, 0x06, 0x29, 0x29,
+    0x06, 0x06, 0x06, 0x29,
+    0x06, 0x18, 0x21, 0x2A,
+    0x06, 0x0F, 0x21, 0x2A,
+    0x06, 0x0F, 0x18, 0x2A,
+    0x06, 0x2E, 0x2E, 0x2E,
+    0x06, 0x06, 0x2E, 0x2E,
+    0x06, 0x06, 0x06, 0x2E,
+    0x06, 0x34, 0x34, 0x34,
+    0x06, 0x06, 0x34, 0x34,
+    0x06, 0x06, 0x06, 0x34,
+    0x07, 0x08, 0x08, 0x09,
+    0x07, 0x08, 0x09, 0x0A,
+    0x07, 0x09, 0x0A, 0x0B,
+    0x07, 0x08, 0x0A, 0x0B,
+    0x07, 0x08, 0x09, 0x0B,
+    0x07, 0x09, 0x0A, 0x0C,
+    0x07, 0x09, 0x0B, 0x0C,
+    0x07, 0x08, 0x0B, 0x0C,
+    0x07, 0x08, 0x0A, 0x0C,
+    0x07, 0x09, 0x0B, 0x0D,
+    0x07, 0x0A, 0x0C, 0x0D,
+    0x07, 0x08, 0x0C, 0x0D,
+    0x07, 0x08, 0x0A, 0x0D,
+    0x07, 0x0D, 0x0D, 0x0D,
+    0x07, 0x07, 0x0D, 0x0D,
+    0x07, 0x07, 0x07, 0x0D,
+    0x07, 0x0A, 0x0B, 0x0E,
+    0x07, 0x0A, 0x0D, 0x0E,
+    0x07, 0x08, 0x0D, 0x0E,
+    0x07, 0x08, 0x0B, 0x0E,
+    0x07, 0x0A, 0x0C, 0x0F,
+    0x07, 0x0B, 0x0D, 0x0F,
+    0x07, 0x09, 0x0D, 0x0F,
+    0x07, 0x09, 0x0B, 0x0F,
+    0x07, 0x0F, 0x0F, 0x0F,
+    0x07, 0x07, 0x0F, 0x0F,
+    0x07, 0x07, 0x07, 0x0F,
+    0x07, 0x0B, 0x0E, 0x12,
+    0x07, 0x0C, 0x10, 0x12,
+    0x07, 0x09, 0x10, 0x12,
+    0x07, 0x09, 0x0D, 0x12,
+    0x07, 0x12, 0x12, 0x12,
+    0x07, 0x07, 0x12, 0x12,
+    0x07, 0x07, 0x07, 0x12,
+    0x07, 0x0C, 0x10, 0x15,
+    0x07, 0x0E, 0x12, 0x15,
+    0x07, 0x0A, 0x12, 0x15,
+    0x07, 0x0A, 0x0E, 0x15,
+    0x07, 0x15, 0x15, 0x15,
+    0x07, 0x07, 0x15, 0x15,
+    0x07, 0x07, 0x07, 0x15,
+    0x07, 0x0D, 0x12, 0x18,
+    0x07, 0x0F, 0x14, 0x18,
+    0x07, 0x0B, 0x14, 0x18,
+    0x07, 0x0B, 0x10, 0x18,
+    0x07, 0x18, 0x18, 0x18,
+    0x07, 0x07, 0x18, 0x18,
+    0x07, 0x07, 0x07, 0x18,
+    0x07, 0x0E, 0x14, 0x1B,
+    0x07, 0x11, 0x16, 0x1B,
+    0x07, 0x0C, 0x16, 0x1B,
+    0x07, 0x0C, 0x11, 0x1B,
+    0x07, 0x1B, 0x1B, 0x1B,
+    0x07, 0x07, 0x1B, 0x1B,
+    0x07, 0x07, 0x07, 0x1B,
+    0x07, 0x12, 0x19, 0x1E,
+    0x07, 0x0C, 0x19, 0x1E,
+    0x07, 0x0C, 0x13, 0x1E,
+    0x07, 0x1E, 0x1E, 0x1E,
+    0x07, 0x07, 0x1E, 0x1E,
+    0x07, 0x07, 0x07, 0x1E,
+    0x07, 0x14, 0x1B, 0x21,
+    0x07, 0x0D, 0x1B, 0x21,
+    0x07, 0x0D, 0x14, 0x21,
+    0x07, 0x21, 0x21, 0x21,
+    0x07, 0x07, 0x21, 0x21,
+    0x07, 0x07, 0x07, 0x21,
+    0x07, 0x15, 0x1D, 0x24,
+    0x07, 0x0E, 0x1D, 0x24,
+    0x07, 0x0E, 0x16, 0x24,
+    0x07, 0x24, 0x24, 0x24,
+    0x07, 0x07, 0x24, 0x24,
+    0x07, 0x07, 0x07, 0x24,
+    0x07, 0x17, 0x1F, 0x27,
+    0x07, 0x0F, 0x1F, 0x27,
+    0x07, 0x0F, 0x17, 0x27,
+    0x07, 0x27, 0x27, 0x27,
+    0x07, 0x07, 0x27, 0x27,
+    0x07, 0x07, 0x07, 0x27,
+    0x07, 0x2A, 0x2A, 0x2A,
+    0x07, 0x07, 0x2A, 0x2A,
+    0x07, 0x07, 0x07, 0x2A,
+    0x07, 0x19, 0x22, 0x2B,
+    0x07, 0x10, 0x22, 0x2B,
+    0x07, 0x10, 0x19, 0x2B,
+    0x07, 0x2F, 0x2F, 0x2F,
+    0x07, 0x07, 0x2F, 0x2F,
+    0x07, 0x07, 0x07, 0x2F,
+    0x07, 0x35, 0x35, 0x35,
+    0x07, 0x07, 0x35, 0x35,
+    0x07, 0x07, 0x07, 0x35,
+    0x08, 0x09, 0x09, 0x0A,
+    0x08, 0x09, 0x0A, 0x0B,
+    0x08, 0x0A, 0x0B, 0x0C,
+    0x08, 0x09, 0x0B, 0x0C,
+    0x08, 0x09, 0x0A, 0x0C,
+    0x08, 0x0A, 0x0B, 0x0D,
+    0x08, 0x0A, 0x0C, 0x0D,
+    0x08, 0x09, 0x0C, 0x0D,
+    0x08, 0x09, 0x0B, 0x0D,
+    0x08, 0x0A, 0x0C, 0x0E,
+    0x08, 0x0B, 0x0D, 0x0E,
+    0x08, 0x09, 0x0D, 0x0E,
+    0x08, 0x09, 0x0B, 0x0E,
+    0x08, 0x0E, 0x0E, 0x0E,
+    0x08, 0x08, 0x0E, 0x0E,
+    0x08, 0x08, 0x08, 0x0E,
+    0x08, 0x0B, 0x0C, 0x0F,
+    0x08, 0x0B, 0x0E, 0x0F,
+    0x08, 0x09, 0x0E, 0x0F,
+    0x08, 0x09, 0x0C, 0x0F,
+    0x08, 0x0B, 0x0D, 0x10,
+    0x08, 0x0C, 0x0E, 0x10,
+    0x08, 0x0A, 0x0E, 0x10,
+    0x08, 0x0A, 0x0C, 0x10,
+    0x08, 0x10, 0x10, 0x10,
+    0x08, 0x08, 0x10, 0x10,
+    0x08, 0x08, 0x08, 0x10,
+    0x08, 0x0C, 0x0F, 0x13,
+    0x08, 0x0D, 0x11, 0x13,
+    0x08, 0x0A, 0x11, 0x13,
+    0x08, 0x0A, 0x0E, 0x13,
+    0x08, 0x13, 0x13, 0x13,
+    0x08, 0x08, 0x13, 0x13,
+    0x08, 0x08, 0x08, 0x13,
+    0x08, 0x0D, 0x11, 0x16,
+    0x08, 0x0F, 0x13, 0x16,
+    0x08, 0x0B, 0x13, 0x16,
+    0x08, 0x0B, 0x0F, 0x16,
+    0x08, 0x16, 0x16, 0x16,
+    0x08, 0x08, 0x16, 0x16,
+    0x08, 0x08, 0x08, 0x16,
+    0x08, 0x0E, 0x13, 0x19,
+    0x08, 0x10, 0x15, 0x19,
+    0x08, 0x0C, 0x15, 0x19,
+    0x08, 0x0C, 0x11, 0x19,
+    0x08, 0x19, 0x19, 0x19,
+    0x08, 0x08, 0x19, 0x19,
+    0x08, 0x08, 0x08, 0x19,
+    0x08, 0x0F, 0x15, 0x1C,
+    0x08, 0x12, 0x17, 0x1C,
+    0x08, 0x0D, 0x17, 0x1C,
+    0x08, 0x0D, 0x12, 0x1C,
+    0x08, 0x1C, 0x1C, 0x1C,
+    0x08, 0x08, 0x1C, 0x1C,
+    0x08, 0x08, 0x08, 0x1C,
+    0x08, 0x13, 0x1A, 0x1F,
+    0x08, 0x0D, 0x1A, 0x1F,
+    0x08, 0x0D, 0x14, 0x1F,
+    0x08, 0x1F, 0x1F, 0x1F,
+    0x08, 0x08, 0x1F, 0x1F,
+    0x08, 0x08, 0x08, 0x1F,
+    0x08, 0x15, 0x1C, 0x22,
+    0x08, 0x0E, 0x1C, 0x22,
+    0x08, 0x0E, 0x15, 0x22,
+    0x08, 0x22, 0x22, 0x22,
+    0x08, 0x08, 0x22, 0x22,
+    0x08, 0x08, 0x08, 0x22,
+    0x08, 0x16, 0x1E, 0x25,
+    0x08, 0x0F, 0x1E, 0x25,
+    0x08, 0x0F, 0x17, 0x25,
+    0x08, 0x25, 0x25, 0x25,
+    0x08, 0x08, 0x25, 0x25,
+    0x08, 0x08, 0x08, 0x25,
+    0x08, 0x18, 0x20, 0x28,
+    0x08, 0x10, 0x20, 0x28,
+    0x08, 0x10, 0x18, 0x28,
+    0x08, 0x28, 0x28, 0x28,
+    0x08, 0x08, 0x28, 0x28,
+    0x08, 0x08, 0x08, 0x28,
+    0x08, 0x2B, 0x2B, 0x2B,
+    0x08, 0x08, 0x2B, 0x2B,
+    0x08, 0x08, 0x08, 0x2B,
+    0x08, 0x1A, 0x23, 0x2C,
+    0x08, 0x11, 0x23, 0x2C,
+    0x08, 0x11, 0x1A, 0x2C,
+    0x08, 0x30, 0x30, 0x30,
+    0x08, 0x08, 0x30, 0x30,
+    0x08, 0x08, 0x08, 0x30,
+    0x08, 0x36, 0x36, 0x36,
+    0x08, 0x08, 0x36, 0x36,
+    0x08, 0x08, 0x08, 0x36,
+    0x09, 0x0A, 0x0A, 0x0B,
+    0x09, 0x0A, 0x0B, 0x0C,
+    0x09, 0x0B, 0x0C, 0x0D,
+    0x09, 0x0A, 0x0C, 0x0D,
+    0x09, 0x0A, 0x0B, 0x0D,
+    0x09, 0x0B, 0x0C, 0x0E,
+    0x09, 0x0B, 0x0D, 0x0E,
+    0x09, 0x0A, 0x0D, 0x0E,
+    0x09, 0x0A, 0x0C, 0x0E,
+    0x09, 0x0B, 0x0D, 0x0F,
+    0x09, 0x0C, 0x0E, 0x0F,
+    0x09, 0x0A, 0x0E, 0x0F,
+    0x09, 0x0A, 0x0C, 0x0F,
+    0x09, 0x0F, 0x0F, 0x0F,
+    0x09, 0x09, 0x0F, 0x0F,
+    0x09, 0x09, 0x09, 0x0F,
+    0x09, 0x0C, 0x0D, 0x10,
+    0x09, 0x0C, 0x0F, 0x10,
+    0x09, 0x0A, 0x0F, 0x10,
+    0x09, 0x0A, 0x0D, 0x10,
+    0x09, 0x0C, 0x0E, 0x11,
+    0x09, 0x0D, 0x0F, 0x11,
+    0x09, 0x0B, 0x0F, 0x11,
+    0x09, 0x0B, 0x0D, 0x11,
+    0x09, 0x11, 0x11, 0x11,
+    0x09, 0x09, 0x11, 0x11,
+    0x09, 0x09, 0x09, 0x11,
+    0x09, 0x0D, 0x10, 0x14,
+    0x09, 0x0E, 0x12, 0x14,
+    0x09, 0x0B, 0x12, 0x14,
+    0x09, 0x0B, 0x0F, 0x14,
+    0x09, 0x14, 0x14, 0x14,
+    0x09, 0x09, 0x14, 0x14,
+    0x09, 0x09, 0x09, 0x14,
+    0x09, 0x0E, 0x12, 0x17,
+    0x09, 0x10, 0x14, 0x17,
+    0x09, 0x0C, 0x14, 0x17,
+    0x09, 0x0C, 0x10, 0x17,
+    0x09, 0x17, 0x17, 0x17,
+    0x09, 0x09, 0x17, 0x17,
+    0x09, 0x09, 0x09, 0x17,
+    0x09, 0x0F, 0x14, 0x1A,
+    0x09, 0x11, 0x16, 0x1A,
+    0x09, 0x0D, 0x16, 0x1A,
+    0x09, 0x0D, 0x12, 0x1A,
+    0x09, 0x1A, 0x1A, 0x1A,
+    0x09, 0x09, 0x1A, 0x1A,
+    0x09, 0x09, 0x09, 0x1A,
+    0x09, 0x10, 0x16, 0x1D,
+    0x09, 0x13, 0x18, 0x1D,
+    0x09, 0x0E, 0x18, 0x1D,
+    0x09, 0x0E, 0x13, 0x1D,
+    0x09, 0x1D, 0x1D, 0x1D,
+    0x09, 0x09, 0x1D, 0x1D,
+    0x09, 0x09, 0x09, 0x1D,
+    0x09, 0x14, 0x1B, 0x20,
+    0x09, 0x0E, 0x1B, 0x20,
+    0x09, 0x0E, 0x15, 0x20,
+    0x09, 0x20, 0x20, 0x20,
+    0x09, 0x09, 0x20, 0x20,
+    0x09, 0x09, 0x09, 0x20,
+    0x09, 0x16, 0x1D, 0x23,
+    0x09, 0x0F, 0x1D, 0x23,
+    0x09, 0x0F, 0x16, 0x23,
+    0x09, 0x23, 0x23, 0x23,
+    0x09, 0x09, 0x23, 0x23,
+    0x09, 0x09, 0x09, 0x23,
+    0x09, 0x17, 0x1F, 0x26,
+    0x09, 0x10, 0x1F, 0x26,
+    0x09, 0x10, 0x18, 0x26,
+    0x09, 0x26, 0x26, 0x26,
+    0x09, 0x09, 0x26, 0x26,
+    0x09, 0x09, 0x09, 0x26,
+    0x09, 0x19, 0x21, 0x29,
+    0x09, 0x11, 0x21, 0x29,
+    0x09, 0x11, 0x19, 0x29,
+    0x09, 0x29, 0x29, 0x29,
+    0x09, 0x09, 0x29, 0x29,
+    0x09, 0x09, 0x09, 0x29,
+    0x09, 0x2C, 0x2C, 0x2C,
+    0x09, 0x09, 0x2C, 0x2C,
+    0x09, 0x09, 0x09, 0x2C,
+    0x09, 0x1B, 0x24, 0x2D,
+    0x09, 0x12, 0x24, 0x2D,
+    0x09, 0x12, 0x1B, 0x2D,
+    0x09, 0x31, 0x31, 0x31,
+    0x09, 0x09, 0x31, 0x31,
+    0x09, 0x09, 0x09, 0x31,
+    0x09, 0x37, 0x37, 0x37,
+    0x09, 0x09, 0x37, 0x37,
+    0x09, 0x09, 0x09, 0x37,
+    0x0A, 0x0B, 0x0B, 0x0C,
+    0x0A, 0x0B, 0x0C, 0x0D,
+    0x0A, 0x0C, 0x0D, 0x0E,
+    0x0A, 0x0B, 0x0D, 0x0E,
+    0x0A, 0x0B, 0x0C, 0x0E,
+    0x0A, 0x0C, 0x0D, 0x0F,
+    0x0A, 0x0C, 0x0E, 0x0F,
+    0x0A, 0x0B, 0x0E, 0x0F,
+    0x0A, 0x0B, 0x0D, 0x0F,
+    0x0A, 0x0C, 0x0E, 0x10,
+    0x0A, 0x0D, 0x0F, 0x10,
+    0x0A, 0x0B, 0x0F, 0x10,
+    0x0A, 0x0B, 0x0D, 0x10,
+    0x0A, 0x10, 0x10, 0x10,
+    0x0A, 0x0A, 0x10, 0x10,
+    0x0A, 0x0A, 0x0A, 0x10,
+    0x0A, 0x0D, 0x0E, 0x11,
+    0x0A, 0x0D, 0x10, 0x11,
+    0x0A, 0x0B, 0x10, 0x11,
+    0x0A, 0x0B, 0x0E, 0x11,
+    0x0A, 0x0D, 0x0F, 0x12,
+    0x0A, 0x0E, 0x10, 0x12,
+    0x0A, 0x0C, 0x10, 0x12,
+    0x0A, 0x0C, 0x0E, 0x12,
+    0x0A, 0x12, 0x12, 0x12,
+    0x0A, 0x0A, 0x12, 0x12,
+    0x0A, 0x0A, 0x0A, 0x12,
+    0x0A, 0x0E, 0x11, 0x15,
+    0x0A, 0x0F, 0x13, 0x15,
+    0x0A, 0x0C, 0x13, 0x15,
+    0x0A, 0x0C, 0x10, 0x15,
+    0x0A, 0x15, 0x15, 0x15,
+    0x0A, 0x0A, 0x15, 0x15,
+    0x0A, 0x0A, 0x0A, 0x15,
+    0x0A, 0x0F, 0x13, 0x18,
+    0x0A, 0x11, 0x15, 0x18,
+    0x0A, 0x0D, 0x15, 0x18,
+    0x0A, 0x0D, 0x11, 0x18,
+    0x0A, 0x18, 0x18, 0x18,
+    0x0A, 0x0A, 0x18, 0x18,
+    0x0A, 0x0A, 0x0A, 0x18,
+    0x0A, 0x10, 0x15, 0x1B,
+    0x0A, 0x12, 0x17, 0x1B,
+    0x0A, 0x0E, 0x17, 0x1B,
+    0x0A, 0x0E, 0x13, 0x1B,
+    0x0A, 0x1B, 0x1B, 0x1B,
+    0x0A, 0x0A, 0x1B, 0x1B,
+    0x0A, 0x0A, 0x0A, 0x1B,
+    0x0A, 0x11, 0x17, 0x1E,
+    0x0A, 0x14, 0x19, 0x1E,
+    0x0A, 0x0F, 0x19, 0x1E,
+    0x0A, 0x0F, 0x14, 0x1E,
+    0x0A, 0x1E, 0x1E, 0x1E,
+    0x0A, 0x0A, 0x1E, 0x1E,
+    0x0A, 0x0A, 0x0A, 0x1E,
+    0x0A, 0x15, 0x1C, 0x21,
+    0x0A, 0x0F, 0x1C, 0x21,
+    0x0A, 0x0F, 0x16, 0x21,
+    0x0A, 0x21, 0x21, 0x21,
+    0x0A, 0x0A, 0x21, 0x21,
+    0x0A, 0x0A, 0x0A, 0x21,
+    0x0A, 0x17, 0x1E, 0x24,
+    0x0A, 0x10, 0x1E, 0x24,
+    0x0A, 0x10, 0x17, 0x24,
+    0x0A, 0x24, 0x24, 0x24,
+    0x0A, 0x0A, 0x24, 0x24,
+    0x0A, 0x0A, 0x0A, 0x24,
+    0x0A, 0x18, 0x20, 0x27,
+    0x0A, 0x11, 0x20, 0x27,
+    0x0A, 0x11, 0x19, 0x27,
+    0x0A, 0x27, 0x27, 0x27,
+    0x0A, 0x0A, 0x27, 0x27,
+    0x0A, 0x0A, 0x0A, 0x27,
+    0x0A, 0x1A, 0x22, 0x2A,
+    0x0A, 0x12, 0x22, 0x2A,
+    0x0A, 0x12, 0x1A, 0x2A,
+    0x0A, 0x2A, 0x2A, 0x2A,
+    0x0A, 0x0A, 0x2A, 0x2A,
+    0x0A, 0x0A, 0x0A, 0x2A,
+    0x0A, 0x2D, 0x2D, 0x2D,
+    0x0A, 0x0A, 0x2D, 0x2D,
+    0x0A, 0x0A, 0x0A, 0x2D,
+    0x0A, 0x1C, 0x25, 0x2E,
+    0x0A, 0x13, 0x25, 0x2E,
+    0x0A, 0x13, 0x1C, 0x2E,
+    0x0A, 0x32, 0x32, 0x32,
+    0x0A, 0x0A, 0x32, 0x32,
+    0x0A, 0x0A, 0x0A, 0x32,
+    0x0A, 0x38, 0x38, 0x38,
+    0x0A, 0x0A, 0x38, 0x38,
+    0x0A, 0x0A, 0x0A, 0x38,
+    0x0B, 0x0C, 0x0C, 0x0D,
+    0x0B, 0x0C, 0x0D, 0x0E,
+    0x0B, 0x0D, 0x0E, 0x0F,
+    0x0B, 0x0C, 0x0E, 0x0F,
+    0x0B, 0x0C, 0x0D, 0x0F,
+    0x0B, 0x0D, 0x0E, 0x10,
+    0x0B, 0x0D, 0x0F, 0x10,
+    0x0B, 0x0C, 0x0F, 0x10,
+    0x0B, 0x0C, 0x0E, 0x10,
+    0x0B, 0x0D, 0x0F, 0x11,
+    0x0B, 0x0E, 0x10, 0x11,
+    0x0B, 0x0C, 0x10, 0x11,
+    0x0B, 0x0C, 0x0E, 0x11,
+    0x0B, 0x11, 0x11, 0x11,
+    0x0B, 0x0B, 0x11, 0x11,
+    0x0B, 0x0B, 0x0B, 0x11,
+    0x0B, 0x0E, 0x0F, 0x12,
+    0x0B, 0x0E, 0x11, 0x12,
+    0x0B, 0x0C, 0x11, 0x12,
+    0x0B, 0x0C, 0x0F, 0x12,
+    0x0B, 0x0E, 0x10, 0x13,
+    0x0B, 0x0F, 0x11, 0x13,
+    0x0B, 0x0D, 0x11, 0x13,
+    0x0B, 0x0D, 0x0F, 0x13,
+    0x0B, 0x13, 0x13, 0x13,
+    0x0B, 0x0B, 0x13, 0x13,
+    0x0B, 0x0B, 0x0B, 0x13,
+    0x0B, 0x0F, 0x12, 0x16,
+    0x0B, 0x10, 0x14, 0x16,
+    0x0B, 0x0D, 0x14, 0x16,
+    0x0B, 0x0D, 0x11, 0x16,
+    0x0B, 0x16, 0x16, 0x16,
+    0x0B, 0x0B, 0x16, 0x16,
+    0x0B, 0x0B, 0x0B, 0x16,
+    0x0B, 0x10, 0x14, 0x19,
+    0x0B, 0x12, 0x16, 0x19,
+    0x0B, 0x0E, 0x16, 0x19,
+    0x0B, 0x0E, 0x12, 0x19,
+    0x0B, 0x19, 0x19, 0x19,
+    0x0B, 0x0B, 0x19, 0x19,
+    0x0B, 0x0B, 0x0B, 0x19,
+    0x0B, 0x11, 0x16, 0x1C,
+    0x0B, 0x13, 0x18, 0x1C,
+    0x0B, 0x0F, 0x18, 0x1C,
+    0x0B, 0x0F, 0x14, 0x1C,
+    0x0B, 0x1C, 0x1C, 0x1C,
+    0x0B, 0x0B, 0x1C, 0x1C,
+    0x0B, 0x0B, 0x0B, 0x1C,
+    0x0B, 0x12, 0x18, 0x1F,
+    0x0B, 0x15, 0x1A, 0x1F,
+    0x0B, 0x10, 0x1A, 0x1F,
+    0x0B, 0x10, 0x15, 0x1F,
+    0x0B, 0x1F, 0x1F, 0x1F,
+    0x0B, 0x0B, 0x1F, 0x1F,
+    0x0B, 0x0B, 0x0B, 0x1F,
+    0x0B, 0x16, 0x1D, 0x22,
+    0x0B, 0x10, 0x1D, 0x22,
+    0x0B, 0x10, 0x17, 0x22,
+    0x0B, 0x22, 0x22, 0x22,
+    0x0B, 0x0B, 0x22, 0x22,
+    0x0B, 0x0B, 0x0B, 0x22,
+    0x0B, 0x18, 0x1F, 0x25,
+    0x0B, 0x11, 0x1F, 0x25,
+    0x0B, 0x11, 0x18, 0x25,
+    0x0B, 0x25, 0x25, 0x25,
+    0x0B, 0x0B, 0x25, 0x25,
+    0x0B, 0x0B, 0x0B, 0x25,
+    0x0B, 0x19, 0x21, 0x28,
+    0x0B, 0x12, 0x21, 0x28,
+    0x0B, 0x12, 0x1A, 0x28,
+    0x0B, 0x28, 0x28, 0x28,
+    0x0B, 0x0B, 0x28, 0x28,
+    0x0B, 0x0B, 0x0B, 0x28,
+    0x0B, 0x1B, 0x23, 0x2B,
+    0x0B, 0x13, 0x23, 0x2B,
+    0x0B, 0x13, 0x1B, 0x2B,
+    0x0B, 0x2B, 0x2B, 0x2B,
+    0x0B, 0x0B, 0x2B, 0x2B,
+    0x0B, 0x0B, 0x0B, 0x2B,
+    0x0B, 0x2E, 0x2E, 0x2E,
+    0x0B, 0x0B, 0x2E, 0x2E,
+    0x0B, 0x0B, 0x0B, 0x2E,
+    0x0B, 0x1D, 0x26, 0x2F,
+    0x0B, 0x14, 0x26, 0x2F,
+    0x0B, 0x14, 0x1D, 0x2F,
+    0x0B, 0x33, 0x33, 0x33,
+    0x0B, 0x0B, 0x33, 0x33,
+    0x0B, 0x0B, 0x0B, 0x33,
+    0x0B, 0x39, 0x39, 0x39,
+    0x0B, 0x0B, 0x39, 0x39,
+    0x0B, 0x0B, 0x0B, 0x39,
+    0x0C, 0x0D, 0x0D, 0x0E,
+    0x0C, 0x0D, 0x0E, 0x0F,
+    0x0C, 0x0E, 0x0F, 0x10,
+    0x0C, 0x0D, 0x0F, 0x10,
+    0x0C, 0x0D, 0x0E, 0x10,
+    0x0C, 0x0E, 0x0F, 0x11,
+    0x0C, 0x0E, 0x10, 0x11,
+    0x0C, 0x0D, 0x10, 0x11,
+    0x0C, 0x0D, 0x0F, 0x11,
+    0x0C, 0x0E, 0x10, 0x12,
+    0x0C, 0x0F, 0x11, 0x12,
+    0x0C, 0x0D, 0x11, 0x12,
+    0x0C, 0x0D, 0x0F, 0x12,
+    0x0C, 0x12, 0x12, 0x12,
+    0x0C, 0x0C, 0x12, 0x12,
+    0x0C, 0x0C, 0x0C, 0x12,
+    0x0C, 0x0F, 0x10, 0x13,
+    0x0C, 0x0F, 0x12, 0x13,
+    0x0C, 0x0D, 0x12, 0x13,
+    0x0C, 0x0D, 0x10, 0x13,
+    0x0C, 0x0F, 0x11, 0x14,
+    0x0C, 0x10, 0x12, 0x14,
+    0x0C, 0x0E, 0x12, 0x14,
+    0x0C, 0x0E, 0x10, 0x14,
+    0x0C, 0x14, 0x14, 0x14,
+    0x0C, 0x0C, 0x14, 0x14,
+    0x0C, 0x0C, 0x0C, 0x14,
+    0x0C, 0x10, 0x13, 0x17,
+    0x0C, 0x11, 0x15, 0x17,
+    0x0C, 0x0E, 0x15, 0x17,
+    0x0C, 0x0E, 0x12, 0x17,
+    0x0C, 0x17, 0x17, 0x17,
+    0x0C, 0x0C, 0x17, 0x17,
+    0x0C, 0x0C, 0x0C, 0x17,
+    0x0C, 0x11, 0x15, 0x1A,
+    0x0C, 0x13, 0x17, 0x1A,
+    0x0C, 0x0F, 0x17, 0x1A,
+    0x0C, 0x0F, 0x13, 0x1A,
+    0x0C, 0x1A, 0x1A, 0x1A,
+    0x0C, 0x0C, 0x1A, 0x1A,
+    0x0C, 0x0C, 0x0C, 0x1A,
+    0x0C, 0x12, 0x17, 0x1D,
+    0x0C, 0x14, 0x19, 0x1D,
+    0x0C, 0x10, 0x19, 0x1D,
+    0x0C, 0x10, 0x15, 0x1D,
+    0x0C, 0x1D, 0x1D, 0x1D,
+    0x0C, 0x0C, 0x1D, 0x1D,
+    0x0C, 0x0C, 0x0C, 0x1D,
+    0x0C, 0x13, 0x19, 0x20,
+    0x0C, 0x16, 0x1B, 0x20,
+    0x0C, 0x11, 0x1B, 0x20,
+    0x0C, 0x11, 0x16, 0x20,
+    0x0C, 0x20, 0x20, 0x20,
+    0x0C, 0x0C, 0x20, 0x20,
+    0x0C, 0x0C, 0x0C, 0x20,
+    0x0C, 0x17, 0x1E, 0x23,
+    0x0C, 0x11, 0x1E, 0x23,
+    0x0C, 0x11, 0x18, 0x23,
+    0x0C, 0x23, 0x23, 0x23,
+    0x0C, 0x0C, 0x23, 0x23,
+    0x0C, 0x0C, 0x0C, 0x23,
+    0x0C, 0x19, 0x20, 0x26,
+    0x0C, 0x12, 0x20, 0x26,
+    0x0C, 0x12, 0x19, 0x26,
+    0x0C, 0x26, 0x26, 0x26,
+    0x0C, 0x0C, 0x26, 0x26,
+    0x0C, 0x0C, 0x0C, 0x26,
+    0x0C, 0x1A, 0x22, 0x29,
+    0x0C, 0x13, 0x22, 0x29,
+    0x0C, 0x13, 0x1B, 0x29,
+    0x0C, 0x29, 0x29, 0x29,
+    0x0C, 0x0C, 0x29, 0x29,
+    0x0C, 0x0C, 0x0C, 0x29,
+    0x0C, 0x1C, 0x24, 0x2C,
+    0x0C, 0x14, 0x24, 0x2C,
+    0x0C, 0x14, 0x1C, 0x2C,
+    0x0C, 0x2C, 0x2C, 0x2C,
+    0x0C, 0x0C, 0x2C, 0x2C,
+    0x0C, 0x0C, 0x0C, 0x2C,
+    0x0C, 0x2F, 0x2F, 0x2F,
+    0x0C, 0x0C, 0x2F, 0x2F,
+    0x0C, 0x0C, 0x0C, 0x2F,
+    0x0C, 0x1E, 0x27, 0x30,
+    0x0C, 0x15, 0x27, 0x30,
+    0x0C, 0x15, 0x1E, 0x30,
+    0x0C, 0x34, 0x34, 0x34,
+    0x0C, 0x0C, 0x34, 0x34,
+    0x0C, 0x0C, 0x0C, 0x34,
+    0x0C, 0x3A, 0x3A, 0x3A,
+    0x0C, 0x0C, 0x3A, 0x3A,
+    0x0C, 0x0C, 0x0C, 0x3A,
+    0x0D, 0x0E, 0x0E, 0x0F,
+    0x0D, 0x0E, 0x0F, 0x10,
+    0x0D, 0x0F, 0x10, 0x11,
+    0x0D, 0x0E, 0x10, 0x11,
+    0x0D, 0x0E, 0x0F, 0x11,
+    0x0D, 0x0F, 0x10, 0x12,
+    0x0D, 0x0F, 0x11, 0x12,
+    0x0D, 0x0E, 0x11, 0x12,
+    0x0D, 0x0E, 0x10, 0x12,
+    0x0D, 0x0F, 0x11, 0x13,
+    0x0D, 0x10, 0x12, 0x13,
+    0x0D, 0x0E, 0x12, 0x13,
+    0x0D, 0x0E, 0x10, 0x13,
+    0x0D, 0x13, 0x13, 0x13,
+    0x0D, 0x0D, 0x13, 0x13,
+    0x0D, 0x0D, 0x0D, 0x13,
+    0x0D, 0x10, 0x11, 0x14,
+    0x0D, 0x10, 0x13, 0x14,
+    0x0D, 0x0E, 0x13, 0x14,
+    0x0D, 0x0E, 0x11, 0x14,
+    0x0D, 0x10, 0x12, 0x15,
+    0x0D, 0x11, 0x13, 0x15,
+    0x0D, 0x0F, 0x13, 0x15,
+    0x0D, 0x0F, 0x11, 0x15,
+    0x0D, 0x15, 0x15, 0x15,
+    0x0D, 0x0D, 0x15, 0x15,
+    0x0D, 0x0D, 0x0D, 0x15,
+    0x0D, 0x11, 0x14, 0x18,
+    0x0D, 0x12, 0x16, 0x18,
+    0x0D, 0x0F, 0x16, 0x18,
+    0x0D, 0x0F, 0x13, 0x18,
+    0x0D, 0x18, 0x18, 0x18,
+    0x0D, 0x0D, 0x18, 0x18,
+    0x0D, 0x0D, 0x0D, 0x18,
+    0x0D, 0x12, 0x16, 0x1B,
+    0x0D, 0x14, 0x18, 0x1B,
+    0x0D, 0x10, 0x18, 0x1B,
+    0x0D, 0x10, 0x14, 0x1B,
+    0x0D, 0x1B, 0x1B, 0x1B,
+    0x0D, 0x0D, 0x1B, 0x1B,
+    0x0D, 0x0D, 0x0D, 0x1B,
+    0x0D, 0x13, 0x18, 0x1E,
+    0x0D, 0x15, 0x1A, 0x1E,
+    0x0D, 0x11, 0x1A, 0x1E,
+    0x0D, 0x11, 0x16, 0x1E,
+    0x0D, 0x1E, 0x1E, 0x1E,
+    0x0D, 0x0D, 0x1E, 0x1E,
+    0x0D, 0x0D, 0x0D, 0x1E,
+    0x0D, 0x14, 0x1A, 0x21,
+    0x0D, 0x17, 0x1C, 0x21,
+    0x0D, 0x12, 0x1C, 0x21,
+    0x0D, 0x12, 0x17, 0x21,
+    0x0D, 0x21, 0x21, 0x21,
+    0x0D, 0x0D, 0x21, 0x21,
+    0x0D, 0x0D, 0x0D, 0x21,
+    0x0D, 0x18, 0x1F, 0x24,
+    0x0D, 0x12, 0x1F, 0x24,
+    0x0D, 0x12, 0x19, 0x24,
+    0x0D, 0x24, 0x24, 0x24,
+    0x0D, 0x0D, 0x24, 0x24,
+    0x0D, 0x0D, 0x0D, 0x24,
+    0x0D, 0x1A, 0x21, 0x27,
+    0x0D, 0x13, 0x21, 0x27,
+    0x0D, 0x13, 0x1A, 0x27,
+    0x0D, 0x27, 0x27, 0x27,
+    0x0D, 0x0D, 0x27, 0x27,
+    0x0D, 0x0D, 0x0D, 0x27,
+    0x0D, 0x1B, 0x23, 0x2A,
+    0x0D, 0x14, 0x23, 0x2A,
+    0x0D, 0x14, 0x1C, 0x2A,
+    0x0D, 0x2A, 0x2A, 0x2A,
+    0x0D, 0x0D, 0x2A, 0x2A,
+    0x0D, 0x0D, 0x0D, 0x2A,
+    0x0D, 0x1D, 0x25, 0x2D,
+    0x0D, 0x15, 0x25, 0x2D,
+    0x0D, 0x15, 0x1D, 0x2D,
+    0x0D, 0x2D, 0x2D, 0x2D,
+    0x0D, 0x0D, 0x2D, 0x2D,
+    0x0D, 0x0D, 0x0D, 0x2D,
+    0x0D, 0x30, 0x30, 0x30,
+    0x0D, 0x0D, 0x30, 0x30,
+    0x0D, 0x0D, 0x0D, 0x30,
+    0x0D, 0x1F, 0x28, 0x31,
+    0x0D, 0x16, 0x28, 0x31,
+    0x0D, 0x16, 0x1F, 0x31,
+    0x0D, 0x35, 0x35, 0x35,
+    0x0D, 0x0D, 0x35, 0x35,
+    0x0D, 0x0D, 0x0D, 0x35,
+    0x0D, 0x3B, 0x3B, 0x3B,
+    0x0D, 0x0D, 0x3B, 0x3B,
+    0x0D, 0x0D, 0x0D, 0x3B,
+    0x0E, 0x0F, 0x0F, 0x10,
+    0x0E, 0x0F, 0x10, 0x11,
+    0x0E, 0x10, 0x11, 0x12,
+    0x0E, 0x0F, 0x11, 0x12,
+    0x0E, 0x0F, 0x10, 0x12,
+    0x0E, 0x10, 0x11, 0x13,
+    0x0E, 0x10, 0x12, 0x13,
+    0x0E, 0x0F, 0x12, 0x13,
+    0x0E, 0x0F, 0x11, 0x13,
+    0x0E, 0x10, 0x12, 0x14,
+    0x0E, 0x11, 0x13, 0x14,
+    0x0E, 0x0F, 0x13, 0x14,
+    0x0E, 0x0F, 0x11, 0x14,
+    0x0E, 0x14, 0x14, 0x14,
+    0x0E, 0x0E, 0x14, 0x14,
+    0x0E, 0x0E, 0x0E, 0x14,
+    0x0E, 0x11, 0x12, 0x15,
+    0x0E, 0x11, 0x14, 0x15,
+    0x0E, 0x0F, 0x14, 0x15,
+    0x0E, 0x0F, 0x12, 0x15,
+    0x0E, 0x11, 0x13, 0x16,
+    0x0E, 0x12, 0x14, 0x16,
+    0x0E, 0x10, 0x14, 0x16,
+    0x0E, 0x10, 0x12, 0x16,
+    0x0E, 0x16, 0x16, 0x16,
+    0x0E, 0x0E, 0x16, 0x16,
+    0x0E, 0x0E, 0x0E, 0x16,
+    0x0E, 0x12, 0x15, 0x19,
+    0x0E, 0x13, 0x17, 0x19,
+    0x0E, 0x10, 0x17, 0x19,
+    0x0E, 0x10, 0x14, 0x19,
+    0x0E, 0x19, 0x19, 0x19,
+    0x0E, 0x0E, 0x19, 0x19,
+    0x0E, 0x0E, 0x0E, 0x19,
+    0x0E, 0x13, 0x17, 0x1C,
+    0x0E, 0x15, 0x19, 0x1C,
+    0x0E, 0x11, 0x19, 0x1C,
+    0x0E, 0x11, 0x15, 0x1C,
+    0x0E, 0x1C, 0x1C, 0x1C,
+    0x0E, 0x0E, 0x1C, 0x1C,
+    0x0E, 0x0E, 0x0E, 0x1C,
+    0x0E, 0x14, 0x19, 0x1F,
+    0x0E, 0x16, 0x1B, 0x1F,
+    0x0E, 0x12, 0x1B, 0x1F,
+    0x0E, 0x12, 0x17, 0x1F,
+    0x0E, 0x1F, 0x1F, 0x1F,
+    0x0E, 0x0E, 0x1F, 0x1F,
+    0x0E, 0x0E, 0x0E, 0x1F,
+    0x0E, 0x15, 0x1B, 0x22,
+    0x0E, 0x18, 0x1D, 0x22,
+    0x0E, 0x13, 0x1D, 0x22,
+    0x0E, 0x13, 0x18, 0x22,
+    0x0E, 0x22, 0x22, 0x22,
+    0x0E, 0x0E, 0x22, 0x22,
+    0x0E, 0x0E, 0x0E, 0x22,
+    0x0E, 0x19, 0x20, 0x25,
+    0x0E, 0x13, 0x20, 0x25,
+    0x0E, 0x13, 0x1A, 0x25,
+    0x0E, 0x25, 0x25, 0x25,
+    0x0E, 0x0E, 0x25, 0x25,
+    0x0E, 0x0E, 0x0E, 0x25,
+    0x0E, 0x1B, 0x22, 0x28,
+    0x0E, 0x14, 0x22, 0x28,
+    0x0E, 0x14, 0x1B, 0x28,
+    0x0E, 0x28, 0x28, 0x28,
+    0x0E, 0x0E, 0x28, 0x28,
+    0x0E, 0x0E, 0x0E, 0x28,
+    0x0E, 0x1C, 0x24, 0x2B,
+    0x0E, 0x15, 0x24, 0x2B,
+    0x0E, 0x15, 0x1D, 0x2B,
+    0x0E, 0x2B, 0x2B, 0x2B,
+    0x0E, 0x0E, 0x2B, 0x2B,
+    0x0E, 0x0E, 0x0E, 0x2B,
+    0x0E, 0x1E, 0x26, 0x2E,
+    0x0E, 0x16, 0x26, 0x2E,
+    0x0E, 0x16, 0x1E, 0x2E,
+    0x0E, 0x2E, 0x2E, 0x2E,
+    0x0E, 0x0E, 0x2E, 0x2E,
+    0x0E, 0x0E, 0x0E, 0x2E,
+    0x0E, 0x31, 0x31, 0x31,
+    0x0E, 0x0E, 0x31, 0x31,
+    0x0E, 0x0E, 0x0E, 0x31,
+    0x0E, 0x20, 0x29, 0x32,
+    0x0E, 0x17, 0x29, 0x32,
+    0x0E, 0x17, 0x20, 0x32,
+    0x0E, 0x36, 0x36, 0x36,
+    0x0E, 0x0E, 0x36, 0x36,
+    0x0E, 0x0E, 0x0E, 0x36,
+    0x0E, 0x3C, 0x3C, 0x3C,
+    0x0E, 0x0E, 0x3C, 0x3C,
+    0x0E, 0x0E, 0x0E, 0x3C,
+    0x0F, 0x10, 0x10, 0x11,
+    0x0F, 0x10, 0x11, 0x12,
+    0x0F, 0x11, 0x12, 0x13,
+    0x0F, 0x10, 0x12, 0x13,
+    0x0F, 0x10, 0x11, 0x13,
+    0x0F, 0x11, 0x12, 0x14,
+    0x0F, 0x11, 0x13, 0x14,
+    0x0F, 0x10, 0x13, 0x14,
+    0x0F, 0x10, 0x12, 0x14,
+    0x0F, 0x11, 0x13, 0x15,
+    0x0F, 0x12, 0x14, 0x15,
+    0x0F, 0x10, 0x14, 0x15,
+    0x0F, 0x10, 0x12, 0x15,
+    0x0F, 0x15, 0x15, 0x15,
+    0x0F, 0x0F, 0x15, 0x15,
+    0x0F, 0x0F, 0x0F, 0x15,
+    0x0F, 0x12, 0x13, 0x16,
+    0x0F, 0x12, 0x15, 0x16,
+    0x0F, 0x10, 0x15, 0x16,
+    0x0F, 0x10, 0x13, 0x16,
+    0x0F, 0x12, 0x14, 0x17,
+    0x0F, 0x13, 0x15, 0x17,
+    0x0F, 0x11, 0x15, 0x17,
+    0x0F, 0x11, 0x13, 0x17,
+    0x0F, 0x17, 0x17, 0x17,
+    0x0F, 0x0F, 0x17, 0x17,
+    0x0F, 0x0F, 0x0F, 0x17,
+    0x0F, 0x13, 0x16, 0x1A,
+    0x0F, 0x14, 0x18, 0x1A,
+    0x0F, 0x11, 0x18, 0x1A,
+    0x0F, 0x11, 0x15, 0x1A,
+    0x0F, 0x1A, 0x1A, 0x1A,
+    0x0F, 0x0F, 0x1A, 0x1A,
+    0x0F, 0x0F, 0x0F, 0x1A,
+    0x0F, 0x14, 0x18, 0x1D,
+    0x0F, 0x16, 0x1A, 0x1D,
+    0x0F, 0x12, 0x1A, 0x1D,
+    0x0F, 0x12, 0x16, 0x1D,
+    0x0F, 0x1D, 0x1D, 0x1D,
+    0x0F, 0x0F, 0x1D, 0x1D,
+    0x0F, 0x0F, 0x0F, 0x1D,
+    0x0F, 0x15, 0x1A, 0x20,
+    0x0F, 0x17, 0x1C, 0x20,
+    0x0F, 0x13, 0x1C, 0x20,
+    0x0F, 0x13, 0x18, 0x20,
+    0x0F, 0x20, 0x20, 0x20,
+    0x0F, 0x0F, 0x20, 0x20,
+    0x0F, 0x0F, 0x0F, 0x20,
+    0x0F, 0x16, 0x1C, 0x23,
+    0x0F, 0x19, 0x1E, 0x23,
+    0x0F, 0x14, 0x1E, 0x23,
+    0x0F, 0x14, 0x19, 0x23,
+    0x0F, 0x23, 0x23, 0x23,
+    0x0F, 0x0F, 0x23, 0x23,
+    0x0F, 0x0F, 0x0F, 0x23,
+    0x0F, 0x1A, 0x21, 0x26,
+    0x0F, 0x14, 0x21, 0x26,
+    0x0F, 0x14, 0x1B, 0x26,
+    0x0F, 0x26, 0x26, 0x26,
+    0x0F, 0x0F, 0x26, 0x26,
+    0x0F, 0x0F, 0x0F, 0x26,
+    0x0F, 0x1C, 0x23, 0x29,
+    0x0F, 0x15, 0x23, 0x29,
+    0x0F, 0x15, 0x1C, 0x29,
+    0x0F, 0x29, 0x29, 0x29,
+    0x0F, 0x0F, 0x29, 0x29,
+    0x0F, 0x0F, 0x0F, 0x29,
+    0x0F, 0x1D, 0x25, 0x2C,
+    0x0F, 0x16, 0x25, 0x2C,
+    0x0F, 0x16, 0x1E, 0x2C,
+    0x0F, 0x2C, 0x2C, 0x2C,
+    0x0F, 0x0F, 0x2C, 0x2C,
+    0x0F, 0x0F, 0x0F, 0x2C,
+    0x0F, 0x1F, 0x27, 0x2F,
+    0x0F, 0x17, 0x27, 0x2F,
+    0x0F, 0x17, 0x1F, 0x2F,
+    0x0F, 0x2F, 0x2F, 0x2F,
+    0x0F, 0x0F, 0x2F, 0x2F,
+    0x0F, 0x0F, 0x0F, 0x2F,
+    0x0F, 0x32, 0x32, 0x32,
+    0x0F, 0x0F, 0x32, 0x32,
+    0x0F, 0x0F, 0x0F, 0x32,
+    0x0F, 0x21, 0x2A, 0x33,
+    0x0F, 0x18, 0x2A, 0x33,
+    0x0F, 0x18, 0x21, 0x33,
+    0x0F, 0x37, 0x37, 0x37,
+    0x0F, 0x0F, 0x37, 0x37,
+    0x0F, 0x0F, 0x0F, 0x37,
+    0x0F, 0x3D, 0x3D, 0x3D,
+    0x0F, 0x0F, 0x3D, 0x3D,
+    0x0F, 0x0F, 0x0F, 0x3D,
+    0x10, 0x11, 0x11, 0x12,
+    0x10, 0x11, 0x12, 0x13,
+    0x10, 0x12, 0x13, 0x14,
+    0x10, 0x11, 0x13, 0x14,
+    0x10, 0x11, 0x12, 0x14,
+    0x10, 0x12, 0x13, 0x15,
+    0x10, 0x12, 0x14, 0x15,
+    0x10, 0x11, 0x14, 0x15,
+    0x10, 0x11, 0x13, 0x15,
+    0x10, 0x12, 0x14, 0x16,
+    0x10, 0x13, 0x15, 0x16,
+    0x10, 0x11, 0x15, 0x16,
+    0x10, 0x11, 0x13, 0x16,
+    0x10, 0x16, 0x16, 0x16,
+    0x10, 0x10, 0x16, 0x16,
+    0x10, 0x10, 0x10, 0x16,
+    0x10, 0x13, 0x14, 0x17,
+    0x10, 0x13, 0x16, 0x17,
+    0x10, 0x11, 0x16, 0x17,
+    0x10, 0x11, 0x14, 0x17,
+    0x10, 0x13, 0x15, 0x18,
+    0x10, 0x14, 0x16, 0x18,
+    0x10, 0x12, 0x16, 0x18,
+    0x10, 0x12, 0x14, 0x18,
+    0x10, 0x18, 0x18, 0x18,
+    0x10, 0x10, 0x18, 0x18,
+    0x10, 0x10, 0x10, 0x18,
+    0x10, 0x14, 0x17, 0x1B,
+    0x10, 0x15, 0x19, 0x1B,
+    0x10, 0x12, 0x19, 0x1B,
+    0x10, 0x12, 0x16, 0x1B,
+    0x10, 0x1B, 0x1B, 0x1B,
+    0x10, 0x10, 0x1B, 0x1B,
+    0x10, 0x10, 0x10, 0x1B,
+    0x10, 0x15, 0x19, 0x1E,
+    0x10, 0x17, 0x1B, 0x1E,
+    0x10, 0x13, 0x1B, 0x1E,
+    0x10, 0x13, 0x17, 0x1E,
+    0x10, 0x1E, 0x1E, 0x1E,
+    0x10, 0x10, 0x1E, 0x1E,
+    0x10, 0x10, 0x10, 0x1E,
+    0x10, 0x16, 0x1B, 0x21,
+    0x10, 0x18, 0x1D, 0x21,
+    0x10, 0x14, 0x1D, 0x21,
+    0x10, 0x14, 0x19, 0x21,
+    0x10, 0x21, 0x21, 0x21,
+    0x10, 0x10, 0x21, 0x21,
+    0x10, 0x10, 0x10, 0x21,
+    0x10, 0x17, 0x1D, 0x24,
+    0x10, 0x1A, 0x1F, 0x24,
+    0x10, 0x15, 0x1F, 0x24,
+    0x10, 0x15, 0x1A, 0x24,
+    0x10, 0x24, 0x24, 0x24,
+    0x10, 0x10, 0x24, 0x24,
+    0x10, 0x10, 0x10, 0x24,
+    0x10, 0x1B, 0x22, 0x27,
+    0x10, 0x15, 0x22, 0x27,
+    0x10, 0x15, 0x1C, 0x27,
+    0x10, 0x27, 0x27, 0x27,
+    0x10, 0x10, 0x27, 0x27,
+    0x10, 0x10, 0x10, 0x27,
+    0x10, 0x1D, 0x24, 0x2A,
+    0x10, 0x16, 0x24, 0x2A,
+    0x10, 0x16, 0x1D, 0x2A,
+    0x10, 0x2A, 0x2A, 0x2A,
+    0x10, 0x10, 0x2A, 0x2A,
+    0x10, 0x10, 0x10, 0x2A,
+    0x10, 0x1E, 0x26, 0x2D,
+    0x10, 0x17, 0x26, 0x2D,
+    0x10, 0x17, 0x1F, 0x2D,
+    0x10, 0x2D, 0x2D, 0x2D,
+    0x10, 0x10, 0x2D, 0x2D,
+    0x10, 0x10, 0x10, 0x2D,
+    0x10, 0x20, 0x28, 0x30,
+    0x10, 0x18, 0x28, 0x30,
+    0x10, 0x18, 0x20, 0x30,
+    0x10, 0x30, 0x30, 0x30,
+    0x10, 0x10, 0x30, 0x30,
+    0x10, 0x10, 0x10, 0x30,
+    0x10, 0x33, 0x33, 0x33,
+    0x10, 0x10, 0x33, 0x33,
+    0x10, 0x10, 0x10, 0x33,
+    0x10, 0x22, 0x2B, 0x34,
+    0x10, 0x19, 0x2B, 0x34,
+    0x10, 0x19, 0x22, 0x34,
+    0x10, 0x38, 0x38, 0x38,
+    0x10, 0x10, 0x38, 0x38,
+    0x10, 0x10, 0x10, 0x38,
+    0x10, 0x3E, 0x3E, 0x3E,
+    0x10, 0x10, 0x3E, 0x3E,
+    0x10, 0x10, 0x10, 0x3E,
+    0x11, 0x12, 0x12, 0x13,
+    0x11, 0x12, 0x13, 0x14,
+    0x11, 0x13, 0x14, 0x15,
+    0x11, 0x12, 0x14, 0x15,
+    0x11, 0x12, 0x13, 0x15,
+    0x11, 0x13, 0x14, 0x16,
+    0x11, 0x13, 0x15, 0x16,
+    0x11, 0x12, 0x15, 0x16,
+    0x11, 0x12, 0x14, 0x16,
+    0x11, 0x13, 0x15, 0x17,
+    0x11, 0x14, 0x16, 0x17,
+    0x11, 0x12, 0x16, 0x17,
+    0x11, 0x12, 0x14, 0x17,
+    0x11, 0x17, 0x17, 0x17,
+    0x11, 0x11, 0x17, 0x17,
+    0x11, 0x11, 0x11, 0x17,
+    0x11, 0x14, 0x15, 0x18,
+    0x11, 0x14, 0x17, 0x18,
+    0x11, 0x12, 0x17, 0x18,
+    0x11, 0x12, 0x15, 0x18,
+    0x11, 0x14, 0x16, 0x19,
+    0x11, 0x15, 0x17, 0x19,
+    0x11, 0x13, 0x17, 0x19,
+    0x11, 0x13, 0x15, 0x19,
+    0x11, 0x19, 0x19, 0x19,
+    0x11, 0x11, 0x19, 0x19,
+    0x11, 0x11, 0x11, 0x19,
+    0x11, 0x15, 0x18, 0x1C,
+    0x11, 0x16, 0x1A, 0x1C,
+    0x11, 0x13, 0x1A, 0x1C,
+    0x11, 0x13, 0x17, 0x1C,
+    0x11, 0x1C, 0x1C, 0x1C,
+    0x11, 0x11, 0x1C, 0x1C,
+    0x11, 0x11, 0x11, 0x1C,
+    0x11, 0x16, 0x1A, 0x1F,
+    0x11, 0x18, 0x1C, 0x1F,
+    0x11, 0x14, 0x1C, 0x1F,
+    0x11, 0x14, 0x18, 0x1F,
+    0x11, 0x1F, 0x1F, 0x1F,
+    0x11, 0x11, 0x1F, 0x1F,
+    0x11, 0x11, 0x11, 0x1F,
+    0x11, 0x17, 0x1C, 0x22,
+    0x11, 0x19, 0x1E, 0x22,
+    0x11, 0x15, 0x1E, 0x22,
+    0x11, 0x15, 0x1A, 0x22,
+    0x11, 0x22, 0x22, 0x22,
+    0x11, 0x11, 0x22, 0x22,
+    0x11, 0x11, 0x11, 0x22,
+    0x11, 0x18, 0x1E, 0x25,
+    0x11, 0x1B, 0x20, 0x25,
+    0x11, 0x16, 0x20, 0x25,
+    0x11, 0x16, 0x1B, 0x25,
+    0x11, 0x25, 0x25, 0x25,
+    0x11, 0x11, 0x25, 0x25,
+    0x11, 0x11, 0x11, 0x25,
+    0x11, 0x1C, 0x23, 0x28,
+    0x11, 0x16, 0x23, 0x28,
+    0x11, 0x16, 0x1D, 0x28,
+    0x11, 0x28, 0x28, 0x28,
+    0x11, 0x11, 0x28, 0x28,
+    0x11, 0x11, 0x11, 0x28,
+    0x11, 0x1E, 0x25, 0x2B,
+    0x11, 0x17, 0x25, 0x2B,
+    0x11, 0x17, 0x1E, 0x2B,
+    0x11, 0x2B, 0x2B, 0x2B,
+    0x11, 0x11, 0x2B, 0x2B,
+    0x11, 0x11, 0x11, 0x2B,
+    0x11, 0x1F, 0x27, 0x2E,
+    0x11, 0x18, 0x27, 0x2E,
+    0x11, 0x18, 0x20, 0x2E,
+    0x11, 0x2E, 0x2E, 0x2E,
+    0x11, 0x11, 0x2E, 0x2E,
+    0x11, 0x11, 0x11, 0x2E,
+    0x11, 0x21, 0x29, 0x31,
+    0x11, 0x19, 0x29, 0x31,
+    0x11, 0x19, 0x21, 0x31,
+    0x11, 0x31, 0x31, 0x31,
+    0x11, 0x11, 0x31, 0x31,
+    0x11, 0x11, 0x11, 0x31,
+    0x11, 0x34, 0x34, 0x34,
+    0x11, 0x11, 0x34, 0x34,
+    0x11, 0x11, 0x11, 0x34,
+    0x11, 0x23, 0x2C, 0x35,
+    0x11, 0x1A, 0x2C, 0x35,
+    0x11, 0x1A, 0x23, 0x35,
+    0x11, 0x39, 0x39, 0x39,
+    0x11, 0x11, 0x39, 0x39,
+    0x11, 0x11, 0x11, 0x39,
+    0x11, 0x3F, 0x3F, 0x3F,
+    0x11, 0x11, 0x3F, 0x3F,
+    0x11, 0x11, 0x11, 0x3F,
+    0x12, 0x13, 0x13, 0x14,
+    0x12, 0x13, 0x14, 0x15,
+    0x12, 0x14, 0x15, 0x16,
+    0x12, 0x13, 0x15, 0x16,
+    0x12, 0x13, 0x14, 0x16,
+    0x12, 0x14, 0x15, 0x17,
+    0x12, 0x14, 0x16, 0x17,
+    0x12, 0x13, 0x16, 0x17,
+    0x12, 0x13, 0x15, 0x17,
+    0x12, 0x14, 0x16, 0x18,
+    0x12, 0x15, 0x17, 0x18,
+    0x12, 0x13, 0x17, 0x18,
+    0x12, 0x13, 0x15, 0x18,
+    0x12, 0x18, 0x18, 0x18,
+    0x12, 0x12, 0x18, 0x18,
+    0x12, 0x12, 0x12, 0x18,
+    0x12, 0x15, 0x16, 0x19,
+    0x12, 0x15, 0x18, 0x19,
+    0x12, 0x13, 0x18, 0x19,
+    0x12, 0x13, 0x16, 0x19,
+    0x12, 0x15, 0x17, 0x1A,
+    0x12, 0x16, 0x18, 0x1A,
+    0x12, 0x14, 0x18, 0x1A,
+    0x12, 0x14, 0x16, 0x1A,
+    0x12, 0x1A, 0x1A, 0x1A,
+    0x12, 0x12, 0x1A, 0x1A,
+    0x12, 0x12, 0x12, 0x1A,
+    0x12, 0x16, 0x19, 0x1D,
+    0x12, 0x17, 0x1B, 0x1D,
+    0x12, 0x14, 0x1B, 0x1D,
+    0x12, 0x14, 0x18, 0x1D,
+    0x12, 0x1D, 0x1D, 0x1D,
+    0x12, 0x12, 0x1D, 0x1D,
+    0x12, 0x12, 0x12, 0x1D,
+    0x12, 0x17, 0x1B, 0x20,
+    0x12, 0x19, 0x1D, 0x20,
+    0x12, 0x15, 0x1D, 0x20,
+    0x12, 0x15, 0x19, 0x20,
+    0x12, 0x20, 0x20, 0x20,
+    0x12, 0x12, 0x20, 0x20,
+    0x12, 0x12, 0x12, 0x20,
+    0x12, 0x18, 0x1D, 0x23,
+    0x12, 0x1A, 0x1F, 0x23,
+    0x12, 0x16, 0x1F, 0x23,
+    0x12, 0x16, 0x1B, 0x23,
+    0x12, 0x23, 0x23, 0x23,
+    0x12, 0x12, 0x23, 0x23,
+    0x12, 0x12, 0x12, 0x23,
+    0x12, 0x19, 0x1F, 0x26,
+    0x12, 0x1C, 0x21, 0x26,
+    0x12, 0x17, 0x21, 0x26,
+    0x12, 0x17, 0x1C, 0x26,
+    0x12, 0x26, 0x26, 0x26,
+    0x12, 0x12, 0x26, 0x26,
+    0x12, 0x12, 0x12, 0x26,
+    0x12, 0x1D, 0x24, 0x29,
+    0x12, 0x17, 0x24, 0x29,
+    0x12, 0x17, 0x1E, 0x29,
+    0x12, 0x29, 0x29, 0x29,
+    0x12, 0x12, 0x29, 0x29,
+    0x12, 0x12, 0x12, 0x29,
+    0x12, 0x1F, 0x26, 0x2C,
+    0x12, 0x18, 0x26, 0x2C,
+    0x12, 0x18, 0x1F, 0x2C,
+    0x12, 0x2C, 0x2C, 0x2C,
+    0x12, 0x12, 0x2C, 0x2C,
+    0x12, 0x12, 0x12, 0x2C,
+    0x12, 0x20, 0x28, 0x2F,
+    0x12, 0x19, 0x28, 0x2F,
+    0x12, 0x19, 0x21, 0x2F,
+    0x12, 0x2F, 0x2F, 0x2F,
+    0x12, 0x12, 0x2F, 0x2F,
+    0x12, 0x12, 0x12, 0x2F,
+    0x12, 0x22, 0x2A, 0x32,
+    0x12, 0x1A, 0x2A, 0x32,
+    0x12, 0x1A, 0x22, 0x32,
+    0x12, 0x32, 0x32, 0x32,
+    0x12, 0x12, 0x32, 0x32,
+    0x12, 0x12, 0x12, 0x32,
+    0x12, 0x35, 0x35, 0x35,
+    0x12, 0x12, 0x35, 0x35,
+    0x12, 0x12, 0x12, 0x35,
+    0x12, 0x24, 0x2D, 0x36,
+    0x12, 0x1B, 0x2D, 0x36,
+    0x12, 0x1B, 0x24, 0x36,
+    0x12, 0x3A, 0x3A, 0x3A,
+    0x12, 0x12, 0x3A, 0x3A,
+    0x12, 0x12, 0x12, 0x3A,
+    0x13, 0x14, 0x14, 0x15,
+    0x13, 0x14, 0x15, 0x16,
+    0x13, 0x15, 0x16, 0x17,
+    0x13, 0x14, 0x16, 0x17,
+    0x13, 0x14, 0x15, 0x17,
+    0x13, 0x15, 0x16, 0x18,
+    0x13, 0x15, 0x17, 0x18,
+    0x13, 0x14, 0x17, 0x18,
+    0x13, 0x14, 0x16, 0x18,
+    0x13, 0x15, 0x17, 0x19,
+    0x13, 0x16, 0x18, 0x19,
+    0x13, 0x14, 0x18, 0x19,
+    0x13, 0x14, 0x16, 0x19,
+    0x13, 0x19, 0x19, 0x19,
+    0x13, 0x13, 0x19, 0x19,
+    0x13, 0x13, 0x13, 0x19,
+    0x13, 0x16, 0x17, 0x1A,
+    0x13, 0x16, 0x19, 0x1A,
+    0x13, 0x14, 0x19, 0x1A,
+    0x13, 0x14, 0x17, 0x1A,
+    0x13, 0x16, 0x18, 0x1B,
+    0x13, 0x17, 0x19, 0x1B,
+    0x13, 0x15, 0x19, 0x1B,
+    0x13, 0x15, 0x17, 0x1B,
+    0x13, 0x1B, 0x1B, 0x1B,
+    0x13, 0x13, 0x1B, 0x1B,
+    0x13, 0x13, 0x13, 0x1B,
+    0x13, 0x17, 0x1A, 0x1E,
+    0x13, 0x18, 0x1C, 0x1E,
+    0x13, 0x15, 0x1C, 0x1E,
+    0x13, 0x15, 0x19, 0x1E,
+    0x13, 0x1E, 0x1E, 0x1E,
+    0x13, 0x13, 0x1E, 0x1E,
+    0x13, 0x13, 0x13, 0x1E,
+    0x13, 0x18, 0x1C, 0x21,
+    0x13, 0x1A, 0x1E, 0x21,
+    0x13, 0x16, 0x1E, 0x21,
+    0x13, 0x16, 0x1A, 0x21,
+    0x13, 0x21, 0x21, 0x21,
+    0x13, 0x13, 0x21, 0x21,
+    0x13, 0x13, 0x13, 0x21,
+    0x13, 0x19, 0x1E, 0x24,
+    0x13, 0x1B, 0x20, 0x24,
+    0x13, 0x17, 0x20, 0x24,
+    0x13, 0x17, 0x1C, 0x24,
+    0x13, 0x24, 0x24, 0x24,
+    0x13, 0x13, 0x24, 0x24,
+    0x13, 0x13, 0x13, 0x24,
+    0x13, 0x1A, 0x20, 0x27,
+    0x13, 0x1D, 0x22, 0x27,
+    0x13, 0x18, 0x22, 0x27,
+    0x13, 0x18, 0x1D, 0x27,
+    0x13, 0x27, 0x27, 0x27,
+    0x13, 0x13, 0x27, 0x27,
+    0x13, 0x13, 0x13, 0x27,
+    0x13, 0x1E, 0x25, 0x2A,
+    0x13, 0x18, 0x25, 0x2A,
+    0x13, 0x18, 0x1F, 0x2A,
+    0x13, 0x2A, 0x2A, 0x2A,
+    0x13, 0x13, 0x2A, 0x2A,
+    0x13, 0x13, 0x13, 0x2A,
+    0x13, 0x20, 0x27, 0x2D,
+    0x13, 0x19, 0x27, 0x2D,
+    0x13, 0x19, 0x20, 0x2D,
+    0x13, 0x2D, 0x2D, 0x2D,
+    0x13, 0x13, 0x2D, 0x2D,
+    0x13, 0x13, 0x13, 0x2D,
+    0x13, 0x21, 0x29, 0x30,
+    0x13, 0x1A, 0x29, 0x30,
+    0x13, 0x1A, 0x22, 0x30,
+    0x13, 0x30, 0x30, 0x30,
+    0x13, 0x13, 0x30, 0x30,
+    0x13, 0x13, 0x13, 0x30,
+    0x13, 0x23, 0x2B, 0x33,
+    0x13, 0x1B, 0x2B, 0x33,
+    0x13, 0x1B, 0x23, 0x33,
+    0x13, 0x33, 0x33, 0x33,
+    0x13, 0x13, 0x33, 0x33,
+    0x13, 0x13, 0x13, 0x33,
+    0x13, 0x36, 0x36, 0x36,
+    0x13, 0x13, 0x36, 0x36,
+    0x13, 0x13, 0x13, 0x36,
+    0x13, 0x25, 0x2E, 0x37,
+    0x13, 0x1C, 0x2E, 0x37,
+    0x13, 0x1C, 0x25, 0x37,
+    0x13, 0x3B, 0x3B, 0x3B,
+    0x13, 0x13, 0x3B, 0x3B,
+    0x13, 0x13, 0x13, 0x3B,
+    0x14, 0x15, 0x15, 0x16,
+    0x14, 0x15, 0x16, 0x17,
+    0x14, 0x16, 0x17, 0x18,
+    0x14, 0x15, 0x17, 0x18,
+    0x14, 0x15, 0x16, 0x18,
+    0x14, 0x16, 0x17, 0x19,
+    0x14, 0x16, 0x18, 0x19,
+    0x14, 0x15, 0x18, 0x19,
+    0x14, 0x15, 0x17, 0x19,
+    0x14, 0x16, 0x18, 0x1A,
+    0x14, 0x17, 0x19, 0x1A,
+    0x14, 0x15, 0x19, 0x1A,
+    0x14, 0x15, 0x17, 0x1A,
+    0x14, 0x1A, 0x1A, 0x1A,
+    0x14, 0x14, 0x1A, 0x1A,
+    0x14, 0x14, 0x14, 0x1A,
+    0x14, 0x17, 0x18, 0x1B,
+    0x14, 0x17, 0x1A, 0x1B,
+    0x14, 0x15, 0x1A, 0x1B,
+    0x14, 0x15, 0x18, 0x1B,
+    0x14, 0x17, 0x19, 0x1C,
+    0x14, 0x18, 0x1A, 0x1C,
+    0x14, 0x16, 0x1A, 0x1C,
+    0x14, 0x16, 0x18, 0x1C,
+    0x14, 0x1C, 0x1C, 0x1C,
+    0x14, 0x14, 0x1C, 0x1C,
+    0x14, 0x14, 0x14, 0x1C,
+    0x14, 0x18, 0x1B, 0x1F,
+    0x14, 0x19, 0x1D, 0x1F,
+    0x14, 0x16, 0x1D, 0x1F,
+    0x14, 0x16, 0x1A, 0x1F,
+    0x14, 0x1F, 0x1F, 0x1F,
+    0x14, 0x14, 0x1F, 0x1F,
+    0x14, 0x14, 0x14, 0x1F,
+    0x14, 0x19, 0x1D, 0x22,
+    0x14, 0x1B, 0x1F, 0x22,
+    0x14, 0x17, 0x1F, 0x22,
+    0x14, 0x17, 0x1B, 0x22,
+    0x14, 0x22, 0x22, 0x22,
+    0x14, 0x14, 0x22, 0x22,
+    0x14, 0x14, 0x14, 0x22,
+    0x14, 0x1A, 0x1F, 0x25,
+    0x14, 0x1C, 0x21, 0x25,
+    0x14, 0x18, 0x21, 0x25,
+    0x14, 0x18, 0x1D, 0x25,
+    0x14, 0x25, 0x25, 0x25,
+    0x14, 0x14, 0x25, 0x25,
+    0x14, 0x14, 0x14, 0x25,
+    0x14, 0x1B, 0x21, 0x28,
+    0x14, 0x1E, 0x23, 0x28,
+    0x14, 0x19, 0x23, 0x28,
+    0x14, 0x19, 0x1E, 0x28,
+    0x14, 0x28, 0x28, 0x28,
+    0x14, 0x14, 0x28, 0x28,
+    0x14, 0x14, 0x14, 0x28,
+    0x14, 0x1F, 0x26, 0x2B,
+    0x14, 0x19, 0x26, 0x2B,
+    0x14, 0x19, 0x20, 0x2B,
+    0x14, 0x2B, 0x2B, 0x2B,
+    0x14, 0x14, 0x2B, 0x2B,
+    0x14, 0x14, 0x14, 0x2B,
+    0x14, 0x21, 0x28, 0x2E,
+    0x14, 0x1A, 0x28, 0x2E,
+    0x14, 0x1A, 0x21, 0x2E,
+    0x14, 0x2E, 0x2E, 0x2E,
+    0x14, 0x14, 0x2E, 0x2E,
+    0x14, 0x14, 0x14, 0x2E,
+    0x14, 0x22, 0x2A, 0x31,
+    0x14, 0x1B, 0x2A, 0x31,
+    0x14, 0x1B, 0x23, 0x31,
+    0x14, 0x31, 0x31, 0x31,
+    0x14, 0x14, 0x31, 0x31,
+    0x14, 0x14, 0x14, 0x31,
+    0x14, 0x24, 0x2C, 0x34,
+    0x14, 0x1C, 0x2C, 0x34,
+    0x14, 0x1C, 0x24, 0x34,
+    0x14, 0x34, 0x34, 0x34,
+    0x14, 0x14, 0x34, 0x34,
+    0x14, 0x14, 0x14, 0x34,
+    0x14, 0x37, 0x37, 0x37,
+    0x14, 0x14, 0x37, 0x37,
+    0x14, 0x14, 0x14, 0x37,
+    0x14, 0x26, 0x2F, 0x38,
+    0x14, 0x1D, 0x2F, 0x38,
+    0x14, 0x1D, 0x26, 0x38,
+    0x14, 0x3C, 0x3C, 0x3C,
+    0x14, 0x14, 0x3C, 0x3C,
+    0x14, 0x14, 0x14, 0x3C,
+    0x15, 0x16, 0x16, 0x17,
+    0x15, 0x16, 0x17, 0x18,
+    0x15, 0x17, 0x18, 0x19,
+    0x15, 0x16, 0x18, 0x19,
+    0x15, 0x16, 0x17, 0x19,
+    0x15, 0x17, 0x18, 0x1A,
+    0x15, 0x17, 0x19, 0x1A,
+    0x15, 0x16, 0x19, 0x1A,
+    0x15, 0x16, 0x18, 0x1A,
+    0x15, 0x17, 0x19, 0x1B,
+    0x15, 0x18, 0x1A, 0x1B,
+    0x15, 0x16, 0x1A, 0x1B,
+    0x15, 0x16, 0x18, 0x1B,
+    0x15, 0x1B, 0x1B, 0x1B,
+    0x15, 0x15, 0x1B, 0x1B,
+    0x15, 0x15, 0x15, 0x1B,
+    0x15, 0x18, 0x19, 0x1C,
+    0x15, 0x18, 0x1B, 0x1C,
+    0x15, 0x16, 0x1B, 0x1C,
+    0x15, 0x16, 0x19, 0x1C,
+    0x15, 0x18, 0x1A, 0x1D,
+    0x15, 0x19, 0x1B, 0x1D,
+    0x15, 0x17, 0x1B, 0x1D,
+    0x15, 0x17, 0x19, 0x1D,
+    0x15, 0x1D, 0x1D, 0x1D,
+    0x15, 0x15, 0x1D, 0x1D,
+    0x15, 0x15, 0x15, 0x1D,
+    0x15, 0x19, 0x1C, 0x20,
+    0x15, 0x1A, 0x1E, 0x20,
+    0x15, 0x17, 0x1E, 0x20,
+    0x15, 0x17, 0x1B, 0x20,
+    0x15, 0x20, 0x20, 0x20,
+    0x15, 0x15, 0x20, 0x20,
+    0x15, 0x15, 0x15, 0x20,
+    0x15, 0x1A, 0x1E, 0x23,
+    0x15, 0x1C, 0x20, 0x23,
+    0x15, 0x18, 0x20, 0x23,
+    0x15, 0x18, 0x1C, 0x23,
+    0x15, 0x23, 0x23, 0x23,
+    0x15, 0x15, 0x23, 0x23,
+    0x15, 0x15, 0x15, 0x23,
+    0x15, 0x1B, 0x20, 0x26,
+    0x15, 0x1D, 0x22, 0x26,
+    0x15, 0x19, 0x22, 0x26,
+    0x15, 0x19, 0x1E, 0x26,
+    0x15, 0x26, 0x26, 0x26,
+    0x15, 0x15, 0x26, 0x26,
+    0x15, 0x15, 0x15, 0x26,
+    0x15, 0x1C, 0x22, 0x29,
+    0x15, 0x1F, 0x24, 0x29,
+    0x15, 0x1A, 0x24, 0x29,
+    0x15, 0x1A, 0x1F, 0x29,
+    0x15, 0x29, 0x29, 0x29,
+    0x15, 0x15, 0x29, 0x29,
+    0x15, 0x15, 0x15, 0x29,
+    0x15, 0x20, 0x27, 0x2C,
+    0x15, 0x1A, 0x27, 0x2C,
+    0x15, 0x1A, 0x21, 0x2C,
+    0x15, 0x2C, 0x2C, 0x2C,
+    0x15, 0x15, 0x2C, 0x2C,
+    0x15, 0x15, 0x15, 0x2C,
+    0x15, 0x22, 0x29, 0x2F,
+    0x15, 0x1B, 0x29, 0x2F,
+    0x15, 0x1B, 0x22, 0x2F,
+    0x15, 0x2F, 0x2F, 0x2F,
+    0x15, 0x15, 0x2F, 0x2F,
+    0x15, 0x15, 0x15, 0x2F,
+    0x15, 0x23, 0x2B, 0x32,
+    0x15, 0x1C, 0x2B, 0x32,
+    0x15, 0x1C, 0x24, 0x32,
+    0x15, 0x32, 0x32, 0x32,
+    0x15, 0x15, 0x32, 0x32,
+    0x15, 0x15, 0x15, 0x32,
+    0x15, 0x25, 0x2D, 0x35,
+    0x15, 0x1D, 0x2D, 0x35,
+    0x15, 0x1D, 0x25, 0x35,
+    0x15, 0x35, 0x35, 0x35,
+    0x15, 0x15, 0x35, 0x35,
+    0x15, 0x15, 0x15, 0x35,
+    0x15, 0x38, 0x38, 0x38,
+    0x15, 0x15, 0x38, 0x38,
+    0x15, 0x15, 0x15, 0x38,
+    0x15, 0x27, 0x30, 0x39,
+    0x15, 0x1E, 0x30, 0x39,
+    0x15, 0x1E, 0x27, 0x39,
+    0x15, 0x3D, 0x3D, 0x3D,
+    0x15, 0x15, 0x3D, 0x3D,
+    0x15, 0x15, 0x15, 0x3D,
+    0x16, 0x17, 0x17, 0x18,
+    0x16, 0x17, 0x18, 0x19,
+    0x16, 0x18, 0x19, 0x1A,
+    0x16, 0x17, 0x19, 0x1A,
+    0x16, 0x17, 0x18, 0x1A,
+    0x16, 0x18, 0x19, 0x1B,
+    0x16, 0x18, 0x1A, 0x1B,
+    0x16, 0x17, 0x1A, 0x1B,
+    0x16, 0x17, 0x19, 0x1B,
+    0x16, 0x18, 0x1A, 0x1C,
+    0x16, 0x19, 0x1B, 0x1C,
+    0x16, 0x17, 0x1B, 0x1C,
+    0x16, 0x17, 0x19, 0x1C,
+    0x16, 0x1C, 0x1C, 0x1C,
+    0x16, 0x16, 0x1C, 0x1C,
+    0x16, 0x16, 0x16, 0x1C,
+    0x16, 0x19, 0x1A, 0x1D,
+    0x16, 0x19, 0x1C, 0x1D,
+    0x16, 0x17, 0x1C, 0x1D,
+    0x16, 0x17, 0x1A, 0x1D,
+    0x16, 0x19, 0x1B, 0x1E,
+    0x16, 0x1A, 0x1C, 0x1E,
+    0x16, 0x18, 0x1C, 0x1E,
+    0x16, 0x18, 0x1A, 0x1E,
+    0x16, 0x1E, 0x1E, 0x1E,
+    0x16, 0x16, 0x1E, 0x1E,
+    0x16, 0x16, 0x16, 0x1E,
+    0x16, 0x1A, 0x1D, 0x21,
+    0x16, 0x1B, 0x1F, 0x21,
+    0x16, 0x18, 0x1F, 0x21,
+    0x16, 0x18, 0x1C, 0x21,
+    0x16, 0x21, 0x21, 0x21,
+    0x16, 0x16, 0x21, 0x21,
+    0x16, 0x16, 0x16, 0x21,
+    0x16, 0x1B, 0x1F, 0x24,
+    0x16, 0x1D, 0x21, 0x24,
+    0x16, 0x19, 0x21, 0x24,
+    0x16, 0x19, 0x1D, 0x24,
+    0x16, 0x24, 0x24, 0x24,
+    0x16, 0x16, 0x24, 0x24,
+    0x16, 0x16, 0x16, 0x24,
+    0x16, 0x1C, 0x21, 0x27,
+    0x16, 0x1E, 0x23, 0x27,
+    0x16, 0x1A, 0x23, 0x27,
+    0x16, 0x1A, 0x1F, 0x27,
+    0x16, 0x27, 0x27, 0x27,
+    0x16, 0x16, 0x27, 0x27,
+    0x16, 0x16, 0x16, 0x27,
+    0x16, 0x1D, 0x23, 0x2A,
+    0x16, 0x20, 0x25, 0x2A,
+    0x16, 0x1B, 0x25, 0x2A,
+    0x16, 0x1B, 0x20, 0x2A,
+    0x16, 0x2A, 0x2A, 0x2A,
+    0x16, 0x16, 0x2A, 0x2A,
+    0x16, 0x16, 0x16, 0x2A,
+    0x16, 0x21, 0x28, 0x2D,
+    0x16, 0x1B, 0x28, 0x2D,
+    0x16, 0x1B, 0x22, 0x2D,
+    0x16, 0x2D, 0x2D, 0x2D,
+    0x16, 0x16, 0x2D, 0x2D,
+    0x16, 0x16, 0x16, 0x2D,
+    0x16, 0x23, 0x2A, 0x30,
+    0x16, 0x1C, 0x2A, 0x30,
+    0x16, 0x1C, 0x23, 0x30,
+    0x16, 0x30, 0x30, 0x30,
+    0x16, 0x16, 0x30, 0x30,
+    0x16, 0x16, 0x16, 0x30,
+    0x16, 0x24, 0x2C, 0x33,
+    0x16, 0x1D, 0x2C, 0x33,
+    0x16, 0x1D, 0x25, 0x33,
+    0x16, 0x33, 0x33, 0x33,
+    0x16, 0x16, 0x33, 0x33,
+    0x16, 0x16, 0x16, 0x33,
+    0x16, 0x26, 0x2E, 0x36,
+    0x16, 0x1E, 0x2E, 0x36,
+    0x16, 0x1E, 0x26, 0x36,
+    0x16, 0x36, 0x36, 0x36,
+    0x16, 0x16, 0x36, 0x36,
+    0x16, 0x16, 0x16, 0x36,
+    0x16, 0x39, 0x39, 0x39,
+    0x16, 0x16, 0x39, 0x39,
+    0x16, 0x16, 0x16, 0x39,
+    0x16, 0x28, 0x31, 0x3A,
+    0x16, 0x1F, 0x31, 0x3A,
+    0x16, 0x1F, 0x28, 0x3A,
+    0x16, 0x3E, 0x3E, 0x3E,
+    0x16, 0x16, 0x3E, 0x3E,
+    0x16, 0x16, 0x16, 0x3E,
+    0x17, 0x18, 0x18, 0x19,
+    0x17, 0x18, 0x19, 0x1A,
+    0x17, 0x19, 0x1A, 0x1B,
+    0x17, 0x18, 0x1A, 0x1B,
+    0x17, 0x18, 0x19, 0x1B,
+    0x17, 0x19, 0x1A, 0x1C,
+    0x17, 0x19, 0x1B, 0x1C,
+    0x17, 0x18, 0x1B, 0x1C,
+    0x17, 0x18, 0x1A, 0x1C,
+    0x17, 0x19, 0x1B, 0x1D,
+    0x17, 0x1A, 0x1C, 0x1D,
+    0x17, 0x18, 0x1C, 0x1D,
+    0x17, 0x18, 0x1A, 0x1D,
+    0x17, 0x1D, 0x1D, 0x1D,
+    0x17, 0x17, 0x1D, 0x1D,
+    0x17, 0x17, 0x17, 0x1D,
+    0x17, 0x1A, 0x1B, 0x1E,
+    0x17, 0x1A, 0x1D, 0x1E,
+    0x17, 0x18, 0x1D, 0x1E,
+    0x17, 0x18, 0x1B, 0x1E,
+    0x17, 0x1A, 0x1C, 0x1F,
+    0x17, 0x1B, 0x1D, 0x1F,
+    0x17, 0x19, 0x1D, 0x1F,
+    0x17, 0x19, 0x1B, 0x1F,
+    0x17, 0x1F, 0x1F, 0x1F,
+    0x17, 0x17, 0x1F, 0x1F,
+    0x17, 0x17, 0x17, 0x1F,
+    0x17, 0x1B, 0x1E, 0x22,
+    0x17, 0x1C, 0x20, 0x22,
+    0x17, 0x19, 0x20, 0x22,
+    0x17, 0x19, 0x1D, 0x22,
+    0x17, 0x22, 0x22, 0x22,
+    0x17, 0x17, 0x22, 0x22,
+    0x17, 0x17, 0x17, 0x22,
+    0x17, 0x1C, 0x20, 0x25,
+    0x17, 0x1E, 0x22, 0x25,
+    0x17, 0x1A, 0x22, 0x25,
+    0x17, 0x1A, 0x1E, 0x25,
+    0x17, 0x25, 0x25, 0x25,
+    0x17, 0x17, 0x25, 0x25,
+    0x17, 0x17, 0x17, 0x25,
+    0x17, 0x1D, 0x22, 0x28,
+    0x17, 0x1F, 0x24, 0x28,
+    0x17, 0x1B, 0x24, 0x28,
+    0x17, 0x1B, 0x20, 0x28,
+    0x17, 0x28, 0x28, 0x28,
+    0x17, 0x17, 0x28, 0x28,
+    0x17, 0x17, 0x17, 0x28,
+    0x17, 0x1E, 0x24, 0x2B,
+    0x17, 0x21, 0x26, 0x2B,
+    0x17, 0x1C, 0x26, 0x2B,
+    0x17, 0x1C, 0x21, 0x2B,
+    0x17, 0x2B, 0x2B, 0x2B,
+    0x17, 0x17, 0x2B, 0x2B,
+    0x17, 0x17, 0x17, 0x2B,
+    0x17, 0x22, 0x29, 0x2E,
+    0x17, 0x1C, 0x29, 0x2E,
+    0x17, 0x1C, 0x23, 0x2E,
+    0x17, 0x2E, 0x2E, 0x2E,
+    0x17, 0x17, 0x2E, 0x2E,
+    0x17, 0x17, 0x17, 0x2E,
+    0x17, 0x24, 0x2B, 0x31,
+    0x17, 0x1D, 0x2B, 0x31,
+    0x17, 0x1D, 0x24, 0x31,
+    0x17, 0x31, 0x31, 0x31,
+    0x17, 0x17, 0x31, 0x31,
+    0x17, 0x17, 0x17, 0x31,
+    0x17, 0x25, 0x2D, 0x34,
+    0x17, 0x1E, 0x2D, 0x34,
+    0x17, 0x1E, 0x26, 0x34,
+    0x17, 0x34, 0x34, 0x34,
+    0x17, 0x17, 0x34, 0x34,
+    0x17, 0x17, 0x17, 0x34,
+    0x17, 0x27, 0x2F, 0x37,
+    0x17, 0x1F, 0x2F, 0x37,
+    0x17, 0x1F, 0x27, 0x37,
+    0x17, 0x37, 0x37, 0x37,
+    0x17, 0x17, 0x37, 0x37,
+    0x17, 0x17, 0x17, 0x37,
+    0x17, 0x3A, 0x3A, 0x3A,
+    0x17, 0x17, 0x3A, 0x3A,
+    0x17, 0x17, 0x17, 0x3A,
+    0x17, 0x29, 0x32, 0x3B,
+    0x17, 0x20, 0x32, 0x3B,
+    0x17, 0x20, 0x29, 0x3B,
+    0x17, 0x3F, 0x3F, 0x3F,
+    0x17, 0x17, 0x3F, 0x3F,
+    0x17, 0x17, 0x17, 0x3F,
+    0x18, 0x19, 0x19, 0x1A,
+    0x18, 0x19, 0x1A, 0x1B,
+    0x18, 0x1A, 0x1B, 0x1C,
+    0x18, 0x19, 0x1B, 0x1C,
+    0x18, 0x19, 0x1A, 0x1C,
+    0x18, 0x1A, 0x1B, 0x1D,
+    0x18, 0x1A, 0x1C, 0x1D,
+    0x18, 0x19, 0x1C, 0x1D,
+    0x18, 0x19, 0x1B, 0x1D,
+    0x18, 0x1A, 0x1C, 0x1E,
+    0x18, 0x1B, 0x1D, 0x1E,
+    0x18, 0x19, 0x1D, 0x1E,
+    0x18, 0x19, 0x1B, 0x1E,
+    0x18, 0x1E, 0x1E, 0x1E,
+    0x18, 0x18, 0x1E, 0x1E,
+    0x18, 0x18, 0x18, 0x1E,
+    0x18, 0x1B, 0x1C, 0x1F,
+    0x18, 0x1B, 0x1E, 0x1F,
+    0x18, 0x19, 0x1E, 0x1F,
+    0x18, 0x19, 0x1C, 0x1F,
+    0x18, 0x1B, 0x1D, 0x20,
+    0x18, 0x1C, 0x1E, 0x20,
+    0x18, 0x1A, 0x1E, 0x20,
+    0x18, 0x1A, 0x1C, 0x20,
+    0x18, 0x20, 0x20, 0x20,
+    0x18, 0x18, 0x20, 0x20,
+    0x18, 0x18, 0x18, 0x20,
+    0x18, 0x1C, 0x1F, 0x23,
+    0x18, 0x1D, 0x21, 0x23,
+    0x18, 0x1A, 0x21, 0x23,
+    0x18, 0x1A, 0x1E, 0x23,
+    0x18, 0x23, 0x23, 0x23,
+    0x18, 0x18, 0x23, 0x23,
+    0x18, 0x18, 0x18, 0x23,
+    0x18, 0x1D, 0x21, 0x26,
+    0x18, 0x1F, 0x23, 0x26,
+    0x18, 0x1B, 0x23, 0x26,
+    0x18, 0x1B, 0x1F, 0x26,
+    0x18, 0x26, 0x26, 0x26,
+    0x18, 0x18, 0x26, 0x26,
+    0x18, 0x18, 0x18, 0x26,
+    0x18, 0x1E, 0x23, 0x29,
+    0x18, 0x20, 0x25, 0x29,
+    0x18, 0x1C, 0x25, 0x29,
+    0x18, 0x1C, 0x21, 0x29,
+    0x18, 0x29, 0x29, 0x29,
+    0x18, 0x18, 0x29, 0x29,
+    0x18, 0x18, 0x18, 0x29,
+    0x18, 0x1F, 0x25, 0x2C,
+    0x18, 0x22, 0x27, 0x2C,
+    0x18, 0x1D, 0x27, 0x2C,
+    0x18, 0x1D, 0x22, 0x2C,
+    0x18, 0x2C, 0x2C, 0x2C,
+    0x18, 0x18, 0x2C, 0x2C,
+    0x18, 0x18, 0x18, 0x2C,
+    0x18, 0x23, 0x2A, 0x2F,
+    0x18, 0x1D, 0x2A, 0x2F,
+    0x18, 0x1D, 0x24, 0x2F,
+    0x18, 0x2F, 0x2F, 0x2F,
+    0x18, 0x18, 0x2F, 0x2F,
+    0x18, 0x18, 0x18, 0x2F,
+    0x18, 0x25, 0x2C, 0x32,
+    0x18, 0x1E, 0x2C, 0x32,
+    0x18, 0x1E, 0x25, 0x32,
+    0x18, 0x32, 0x32, 0x32,
+    0x18, 0x18, 0x32, 0x32,
+    0x18, 0x18, 0x18, 0x32,
+    0x18, 0x26, 0x2E, 0x35,
+    0x18, 0x1F, 0x2E, 0x35,
+    0x18, 0x1F, 0x27, 0x35,
+    0x18, 0x35, 0x35, 0x35,
+    0x18, 0x18, 0x35, 0x35,
+    0x18, 0x18, 0x18, 0x35,
+    0x18, 0x28, 0x30, 0x38,
+    0x18, 0x20, 0x30, 0x38,
+    0x18, 0x20, 0x28, 0x38,
+    0x18, 0x38, 0x38, 0x38,
+    0x18, 0x18, 0x38, 0x38,
+    0x18, 0x18, 0x18, 0x38,
+    0x18, 0x3B, 0x3B, 0x3B,
+    0x18, 0x18, 0x3B, 0x3B,
+    0x18, 0x18, 0x18, 0x3B,
+    0x18, 0x2A, 0x33, 0x3C,
+    0x18, 0x21, 0x33, 0x3C,
+    0x18, 0x21, 0x2A, 0x3C,
+    0x19, 0x1A, 0x1A, 0x1B,
+    0x19, 0x1A, 0x1B, 0x1C,
+    0x19, 0x1B, 0x1C, 0x1D,
+    0x19, 0x1A, 0x1C, 0x1D,
+    0x19, 0x1A, 0x1B, 0x1D,
+    0x19, 0x1B, 0x1C, 0x1E,
+    0x19, 0x1B, 0x1D, 0x1E,
+    0x19, 0x1A, 0x1D, 0x1E,
+    0x19, 0x1A, 0x1C, 0x1E,
+    0x19, 0x1B, 0x1D, 0x1F,
+    0x19, 0x1C, 0x1E, 0x1F,
+    0x19, 0x1A, 0x1E, 0x1F,
+    0x19, 0x1A, 0x1C, 0x1F,
+    0x19, 0x1F, 0x1F, 0x1F,
+    0x19, 0x19, 0x1F, 0x1F,
+    0x19, 0x19, 0x19, 0x1F,
+    0x19, 0x1C, 0x1D, 0x20,
+    0x19, 0x1C, 0x1F, 0x20,
+    0x19, 0x1A, 0x1F, 0x20,
+    0x19, 0x1A, 0x1D, 0x20,
+    0x19, 0x1C, 0x1E, 0x21,
+    0x19, 0x1D, 0x1F, 0x21,
+    0x19, 0x1B, 0x1F, 0x21,
+    0x19, 0x1B, 0x1D, 0x21,
+    0x19, 0x21, 0x21, 0x21,
+    0x19, 0x19, 0x21, 0x21,
+    0x19, 0x19, 0x19, 0x21,
+    0x19, 0x1D, 0x20, 0x24,
+    0x19, 0x1E, 0x22, 0x24,
+    0x19, 0x1B, 0x22, 0x24,
+    0x19, 0x1B, 0x1F, 0x24,
+    0x19, 0x24, 0x24, 0x24,
+    0x19, 0x19, 0x24, 0x24,
+    0x19, 0x19, 0x19, 0x24,
+    0x19, 0x1E, 0x22, 0x27,
+    0x19, 0x20, 0x24, 0x27,
+    0x19, 0x1C, 0x24, 0x27,
+    0x19, 0x1C, 0x20, 0x27,
+    0x19, 0x27, 0x27, 0x27,
+    0x19, 0x19, 0x27, 0x27,
+    0x19, 0x19, 0x19, 0x27,
+    0x19, 0x1F, 0x24, 0x2A,
+    0x19, 0x21, 0x26, 0x2A,
+    0x19, 0x1D, 0x26, 0x2A,
+    0x19, 0x1D, 0x22, 0x2A,
+    0x19, 0x2A, 0x2A, 0x2A,
+    0x19, 0x19, 0x2A, 0x2A,
+    0x19, 0x19, 0x19, 0x2A,
+    0x19, 0x20, 0x26, 0x2D,
+    0x19, 0x23, 0x28, 0x2D,
+    0x19, 0x1E, 0x28, 0x2D,
+    0x19, 0x1E, 0x23, 0x2D,
+    0x19, 0x2D, 0x2D, 0x2D,
+    0x19, 0x19, 0x2D, 0x2D,
+    0x19, 0x19, 0x19, 0x2D,
+    0x19, 0x24, 0x2B, 0x30,
+    0x19, 0x1E, 0x2B, 0x30,
+    0x19, 0x1E, 0x25, 0x30,
+    0x19, 0x30, 0x30, 0x30,
+    0x19, 0x19, 0x30, 0x30,
+    0x19, 0x19, 0x19, 0x30,
+    0x19, 0x26, 0x2D, 0x33,
+    0x19, 0x1F, 0x2D, 0x33,
+    0x19, 0x1F, 0x26, 0x33,
+    0x19, 0x33, 0x33, 0x33,
+    0x19, 0x19, 0x33, 0x33,
+    0x19, 0x19, 0x19, 0x33,
+    0x19, 0x27, 0x2F, 0x36,
+    0x19, 0x20, 0x2F, 0x36,
+    0x19, 0x20, 0x28, 0x36,
+    0x19, 0x36, 0x36, 0x36,
+    0x19, 0x19, 0x36, 0x36,
+    0x19, 0x19, 0x19, 0x36,
+    0x19, 0x29, 0x31, 0x39,
+    0x19, 0x21, 0x31, 0x39,
+    0x19, 0x21, 0x29, 0x39,
+    0x19, 0x39, 0x39, 0x39,
+    0x19, 0x19, 0x39, 0x39,
+    0x19, 0x19, 0x19, 0x39,
+    0x19, 0x3C, 0x3C, 0x3C,
+    0x19, 0x19, 0x3C, 0x3C,
+    0x19, 0x19, 0x19, 0x3C,
+    0x19, 0x2B, 0x34, 0x3D,
+    0x19, 0x22, 0x34, 0x3D,
+    0x19, 0x22, 0x2B, 0x3D,
+    0x1A, 0x1B, 0x1B, 0x1C,
+    0x1A, 0x1B, 0x1C, 0x1D,
+    0x1A, 0x1C, 0x1D, 0x1E,
+    0x1A, 0x1B, 0x1D, 0x1E,
+    0x1A, 0x1B, 0x1C, 0x1E,
+    0x1A, 0x1C, 0x1D, 0x1F,
+    0x1A, 0x1C, 0x1E, 0x1F,
+    0x1A, 0x1B, 0x1E, 0x1F,
+    0x1A, 0x1B, 0x1D, 0x1F,
+    0x1A, 0x1C, 0x1E, 0x20,
+    0x1A, 0x1D, 0x1F, 0x20,
+    0x1A, 0x1B, 0x1F, 0x20,
+    0x1A, 0x1B, 0x1D, 0x20,
+    0x1A, 0x20, 0x20, 0x20,
+    0x1A, 0x1A, 0x20, 0x20,
+    0x1A, 0x1A, 0x1A, 0x20,
+    0x1A, 0x1D, 0x1E, 0x21,
+    0x1A, 0x1D, 0x20, 0x21,
+    0x1A, 0x1B, 0x20, 0x21,
+    0x1A, 0x1B, 0x1E, 0x21,
+    0x1A, 0x1D, 0x1F, 0x22,
+    0x1A, 0x1E, 0x20, 0x22,
+    0x1A, 0x1C, 0x20, 0x22,
+    0x1A, 0x1C, 0x1E, 0x22,
+    0x1A, 0x22, 0x22, 0x22,
+    0x1A, 0x1A, 0x22, 0x22,
+    0x1A, 0x1A, 0x1A, 0x22,
+    0x1A, 0x1E, 0x21, 0x25,
+    0x1A, 0x1F, 0x23, 0x25,
+    0x1A, 0x1C, 0x23, 0x25,
+    0x1A, 0x1C, 0x20, 0x25,
+    0x1A, 0x25, 0x25, 0x25,
+    0x1A, 0x1A, 0x25, 0x25,
+    0x1A, 0x1A, 0x1A, 0x25,
+    0x1A, 0x1F, 0x23, 0x28,
+    0x1A, 0x21, 0x25, 0x28,
+    0x1A, 0x1D, 0x25, 0x28,
+    0x1A, 0x1D, 0x21, 0x28,
+    0x1A, 0x28, 0x28, 0x28,
+    0x1A, 0x1A, 0x28, 0x28,
+    0x1A, 0x1A, 0x1A, 0x28,
+    0x1A, 0x20, 0x25, 0x2B,
+    0x1A, 0x22, 0x27, 0x2B,
+    0x1A, 0x1E, 0x27, 0x2B,
+    0x1A, 0x1E, 0x23, 0x2B,
+    0x1A, 0x2B, 0x2B, 0x2B,
+    0x1A, 0x1A, 0x2B, 0x2B,
+    0x1A, 0x1A, 0x1A, 0x2B,
+    0x1A, 0x21, 0x27, 0x2E,
+    0x1A, 0x24, 0x29, 0x2E,
+    0x1A, 0x1F, 0x29, 0x2E,
+    0x1A, 0x1F, 0x24, 0x2E,
+    0x1A, 0x2E, 0x2E, 0x2E,
+    0x1A, 0x1A, 0x2E, 0x2E,
+    0x1A, 0x1A, 0x1A, 0x2E,
+    0x1A, 0x25, 0x2C, 0x31,
+    0x1A, 0x1F, 0x2C, 0x31,
+    0x1A, 0x1F, 0x26, 0x31,
+    0x1A, 0x31, 0x31, 0x31,
+    0x1A, 0x1A, 0x31, 0x31,
+    0x1A, 0x1A, 0x1A, 0x31,
+    0x1A, 0x27, 0x2E, 0x34,
+    0x1A, 0x20, 0x2E, 0x34,
+    0x1A, 0x20, 0x27, 0x34,
+    0x1A, 0x34, 0x34, 0x34,
+    0x1A, 0x1A, 0x34, 0x34,
+    0x1A, 0x1A, 0x1A, 0x34,
+    0x1A, 0x28, 0x30, 0x37,
+    0x1A, 0x21, 0x30, 0x37,
+    0x1A, 0x21, 0x29, 0x37,
+    0x1A, 0x37, 0x37, 0x37,
+    0x1A, 0x1A, 0x37, 0x37,
+    0x1A, 0x1A, 0x1A, 0x37,
+    0x1A, 0x2A, 0x32, 0x3A,
+    0x1A, 0x22, 0x32, 0x3A,
+    0x1A, 0x22, 0x2A, 0x3A,
+    0x1A, 0x3A, 0x3A, 0x3A,
+    0x1A, 0x1A, 0x3A, 0x3A,
+    0x1A, 0x1A, 0x1A, 0x3A,
+    0x1A, 0x3D, 0x3D, 0x3D,
+    0x1A, 0x1A, 0x3D, 0x3D,
+    0x1A, 0x1A, 0x1A, 0x3D,
+    0x1A, 0x2C, 0x35, 0x3E,
+    0x1A, 0x23, 0x35, 0x3E,
+    0x1A, 0x23, 0x2C, 0x3E,
+    0x1B, 0x1C, 0x1C, 0x1D,
+    0x1B, 0x1C, 0x1D, 0x1E,
+    0x1B, 0x1D, 0x1E, 0x1F,
+    0x1B, 0x1C, 0x1E, 0x1F,
+    0x1B, 0x1C, 0x1D, 0x1F,
+    0x1B, 0x1D, 0x1E, 0x20,
+    0x1B, 0x1D, 0x1F, 0x20,
+    0x1B, 0x1C, 0x1F, 0x20,
+    0x1B, 0x1C, 0x1E, 0x20,
+    0x1B, 0x1D, 0x1F, 0x21,
+    0x1B, 0x1E, 0x20, 0x21,
+    0x1B, 0x1C, 0x20, 0x21,
+    0x1B, 0x1C, 0x1E, 0x21,
+    0x1B, 0x21, 0x21, 0x21,
+    0x1B, 0x1B, 0x21, 0x21,
+    0x1B, 0x1B, 0x1B, 0x21,
+    0x1B, 0x1E, 0x1F, 0x22,
+    0x1B, 0x1E, 0x21, 0x22,
+    0x1B, 0x1C, 0x21, 0x22,
+    0x1B, 0x1C, 0x1F, 0x22,
+    0x1B, 0x1E, 0x20, 0x23,
+    0x1B, 0x1F, 0x21, 0x23,
+    0x1B, 0x1D, 0x21, 0x23,
+    0x1B, 0x1D, 0x1F, 0x23,
+    0x1B, 0x23, 0x23, 0x23,
+    0x1B, 0x1B, 0x23, 0x23,
+    0x1B, 0x1B, 0x1B, 0x23,
+    0x1B, 0x1F, 0x22, 0x26,
+    0x1B, 0x20, 0x24, 0x26,
+    0x1B, 0x1D, 0x24, 0x26,
+    0x1B, 0x1D, 0x21, 0x26,
+    0x1B, 0x26, 0x26, 0x26,
+    0x1B, 0x1B, 0x26, 0x26,
+    0x1B, 0x1B, 0x1B, 0x26,
+    0x1B, 0x20, 0x24, 0x29,
+    0x1B, 0x22, 0x26, 0x29,
+    0x1B, 0x1E, 0x26, 0x29,
+    0x1B, 0x1E, 0x22, 0x29,
+    0x1B, 0x29, 0x29, 0x29,
+    0x1B, 0x1B, 0x29, 0x29,
+    0x1B, 0x1B, 0x1B, 0x29,
+    0x1B, 0x21, 0x26, 0x2C,
+    0x1B, 0x23, 0x28, 0x2C,
+    0x1B, 0x1F, 0x28, 0x2C,
+    0x1B, 0x1F, 0x24, 0x2C,
+    0x1B, 0x2C, 0x2C, 0x2C,
+    0x1B, 0x1B, 0x2C, 0x2C,
+    0x1B, 0x1B, 0x1B, 0x2C,
+    0x1B, 0x22, 0x28, 0x2F,
+    0x1B, 0x25, 0x2A, 0x2F,
+    0x1B, 0x20, 0x2A, 0x2F,
+    0x1B, 0x20, 0x25, 0x2F,
+    0x1B, 0x2F, 0x2F, 0x2F,
+    0x1B, 0x1B, 0x2F, 0x2F,
+    0x1B, 0x1B, 0x1B, 0x2F,
+    0x1B, 0x26, 0x2D, 0x32,
+    0x1B, 0x20, 0x2D, 0x32,
+    0x1B, 0x20, 0x27, 0x32,
+    0x1B, 0x32, 0x32, 0x32,
+    0x1B, 0x1B, 0x32, 0x32,
+    0x1B, 0x1B, 0x1B, 0x32,
+    0x1B, 0x28, 0x2F, 0x35,
+    0x1B, 0x21, 0x2F, 0x35,
+    0x1B, 0x21, 0x28, 0x35,
+    0x1B, 0x35, 0x35, 0x35,
+    0x1B, 0x1B, 0x35, 0x35,
+    0x1B, 0x1B, 0x1B, 0x35,
+    0x1B, 0x29, 0x31, 0x38,
+    0x1B, 0x22, 0x31, 0x38,
+    0x1B, 0x22, 0x2A, 0x38,
+    0x1B, 0x38, 0x38, 0x38,
+    0x1B, 0x1B, 0x38, 0x38,
+    0x1B, 0x1B, 0x1B, 0x38,
+    0x1B, 0x2B, 0x33, 0x3B,
+    0x1B, 0x23, 0x33, 0x3B,
+    0x1B, 0x23, 0x2B, 0x3B,
+    0x1B, 0x3B, 0x3B, 0x3B,
+    0x1B, 0x1B, 0x3B, 0x3B,
+    0x1B, 0x1B, 0x1B, 0x3B,
+    0x1B, 0x3E, 0x3E, 0x3E,
+    0x1B, 0x1B, 0x3E, 0x3E,
+    0x1B, 0x1B, 0x1B, 0x3E,
+    0x1B, 0x2D, 0x36, 0x3F,
+    0x1B, 0x24, 0x36, 0x3F,
+    0x1B, 0x24, 0x2D, 0x3F,
+    0x1C, 0x1D, 0x1D, 0x1E,
+    0x1C, 0x1D, 0x1E, 0x1F,
+    0x1C, 0x1E, 0x1F, 0x20,
+    0x1C, 0x1D, 0x1F, 0x20,
+    0x1C, 0x1D, 0x1E, 0x20,
+    0x1C, 0x1E, 0x1F, 0x21,
+    0x1C, 0x1E, 0x20, 0x21,
+    0x1C, 0x1D, 0x20, 0x21,
+    0x1C, 0x1D, 0x1F, 0x21,
+    0x1C, 0x1E, 0x20, 0x22,
+    0x1C, 0x1F, 0x21, 0x22,
+    0x1C, 0x1D, 0x21, 0x22,
+    0x1C, 0x1D, 0x1F, 0x22,
+    0x1C, 0x22, 0x22, 0x22,
+    0x1C, 0x1C, 0x22, 0x22,
+    0x1C, 0x1C, 0x1C, 0x22,
+    0x1C, 0x1F, 0x20, 0x23,
+    0x1C, 0x1F, 0x22, 0x23,
+    0x1C, 0x1D, 0x22, 0x23,
+    0x1C, 0x1D, 0x20, 0x23,
+    0x1C, 0x1F, 0x21, 0x24,
+    0x1C, 0x20, 0x22, 0x24,
+    0x1C, 0x1E, 0x22, 0x24,
+    0x1C, 0x1E, 0x20, 0x24,
+    0x1C, 0x24, 0x24, 0x24,
+    0x1C, 0x1C, 0x24, 0x24,
+    0x1C, 0x1C, 0x1C, 0x24,
+    0x1C, 0x20, 0x23, 0x27,
+    0x1C, 0x21, 0x25, 0x27,
+    0x1C, 0x1E, 0x25, 0x27,
+    0x1C, 0x1E, 0x22, 0x27,
+    0x1C, 0x27, 0x27, 0x27,
+    0x1C, 0x1C, 0x27, 0x27,
+    0x1C, 0x1C, 0x1C, 0x27,
+    0x1C, 0x21, 0x25, 0x2A,
+    0x1C, 0x23, 0x27, 0x2A,
+    0x1C, 0x1F, 0x27, 0x2A,
+    0x1C, 0x1F, 0x23, 0x2A,
+    0x1C, 0x2A, 0x2A, 0x2A,
+    0x1C, 0x1C, 0x2A, 0x2A,
+    0x1C, 0x1C, 0x1C, 0x2A,
+    0x1C, 0x22, 0x27, 0x2D,
+    0x1C, 0x24, 0x29, 0x2D,
+    0x1C, 0x20, 0x29, 0x2D,
+    0x1C, 0x20, 0x25, 0x2D,
+    0x1C, 0x2D, 0x2D, 0x2D,
+    0x1C, 0x1C, 0x2D, 0x2D,
+    0x1C, 0x1C, 0x1C, 0x2D,
+    0x1C, 0x23, 0x29, 0x30,
+    0x1C, 0x26, 0x2B, 0x30,
+    0x1C, 0x21, 0x2B, 0x30,
+    0x1C, 0x21, 0x26, 0x30,
+    0x1C, 0x30, 0x30, 0x30,
+    0x1C, 0x1C, 0x30, 0x30,
+    0x1C, 0x1C, 0x1C, 0x30,
+    0x1C, 0x27, 0x2E, 0x33,
+    0x1C, 0x21, 0x2E, 0x33,
+    0x1C, 0x21, 0x28, 0x33,
+    0x1C, 0x33, 0x33, 0x33,
+    0x1C, 0x1C, 0x33, 0x33,
+    0x1C, 0x1C, 0x1C, 0x33,
+    0x1C, 0x29, 0x30, 0x36,
+    0x1C, 0x22, 0x30, 0x36,
+    0x1C, 0x22, 0x29, 0x36,
+    0x1C, 0x36, 0x36, 0x36,
+    0x1C, 0x1C, 0x36, 0x36,
+    0x1C, 0x1C, 0x1C, 0x36,
+    0x1C, 0x2A, 0x32, 0x39,
+    0x1C, 0x23, 0x32, 0x39,
+    0x1C, 0x23, 0x2B, 0x39,
+    0x1C, 0x39, 0x39, 0x39,
+    0x1C, 0x1C, 0x39, 0x39,
+    0x1C, 0x1C, 0x1C, 0x39,
+    0x1C, 0x2C, 0x34, 0x3C,
+    0x1C, 0x24, 0x34, 0x3C,
+    0x1C, 0x24, 0x2C, 0x3C,
+    0x1C, 0x3C, 0x3C, 0x3C,
+    0x1C, 0x1C, 0x3C, 0x3C,
+    0x1C, 0x1C, 0x1C, 0x3C,
+    0x1C, 0x3F, 0x3F, 0x3F,
+    0x1C, 0x1C, 0x3F, 0x3F,
+    0x1C, 0x1C, 0x1C, 0x3F,
+    0x1D, 0x1E, 0x1E, 0x1F,
+    0x1D, 0x1E, 0x1F, 0x20,
+    0x1D, 0x1F, 0x20, 0x21,
+    0x1D, 0x1E, 0x20, 0x21,
+    0x1D, 0x1E, 0x1F, 0x21,
+    0x1D, 0x1F, 0x20, 0x22,
+    0x1D, 0x1F, 0x21, 0x22,
+    0x1D, 0x1E, 0x21, 0x22,
+    0x1D, 0x1E, 0x20, 0x22,
+    0x1D, 0x1F, 0x21, 0x23,
+    0x1D, 0x20, 0x22, 0x23,
+    0x1D, 0x1E, 0x22, 0x23,
+    0x1D, 0x1E, 0x20, 0x23,
+    0x1D, 0x23, 0x23, 0x23,
+    0x1D, 0x1D, 0x23, 0x23,
+    0x1D, 0x1D, 0x1D, 0x23,
+    0x1D, 0x20, 0x21, 0x24,
+    0x1D, 0x20, 0x23, 0x24,
+    0x1D, 0x1E, 0x23, 0x24,
+    0x1D, 0x1E, 0x21, 0x24,
+    0x1D, 0x20, 0x22, 0x25,
+    0x1D, 0x21, 0x23, 0x25,
+    0x1D, 0x1F, 0x23, 0x25,
+    0x1D, 0x1F, 0x21, 0x25,
+    0x1D, 0x25, 0x25, 0x25,
+    0x1D, 0x1D, 0x25, 0x25,
+    0x1D, 0x1D, 0x1D, 0x25,
+    0x1D, 0x21, 0x24, 0x28,
+    0x1D, 0x22, 0x26, 0x28,
+    0x1D, 0x1F, 0x26, 0x28,
+    0x1D, 0x1F, 0x23, 0x28,
+    0x1D, 0x28, 0x28, 0x28,
+    0x1D, 0x1D, 0x28, 0x28,
+    0x1D, 0x1D, 0x1D, 0x28,
+    0x1D, 0x22, 0x26, 0x2B,
+    0x1D, 0x24, 0x28, 0x2B,
+    0x1D, 0x20, 0x28, 0x2B,
+    0x1D, 0x20, 0x24, 0x2B,
+    0x1D, 0x2B, 0x2B, 0x2B,
+    0x1D, 0x1D, 0x2B, 0x2B,
+    0x1D, 0x1D, 0x1D, 0x2B,
+    0x1D, 0x23, 0x28, 0x2E,
+    0x1D, 0x25, 0x2A, 0x2E,
+    0x1D, 0x21, 0x2A, 0x2E,
+    0x1D, 0x21, 0x26, 0x2E,
+    0x1D, 0x2E, 0x2E, 0x2E,
+    0x1D, 0x1D, 0x2E, 0x2E,
+    0x1D, 0x1D, 0x1D, 0x2E,
+    0x1D, 0x24, 0x2A, 0x31,
+    0x1D, 0x27, 0x2C, 0x31,
+    0x1D, 0x22, 0x2C, 0x31,
+    0x1D, 0x22, 0x27, 0x31,
+    0x1D, 0x31, 0x31, 0x31,
+    0x1D, 0x1D, 0x31, 0x31,
+    0x1D, 0x1D, 0x1D, 0x31,
+    0x1D, 0x28, 0x2F, 0x34,
+    0x1D, 0x22, 0x2F, 0x34,
+    0x1D, 0x22, 0x29, 0x34,
+    0x1D, 0x34, 0x34, 0x34,
+    0x1D, 0x1D, 0x34, 0x34,
+    0x1D, 0x1D, 0x1D, 0x34,
+    0x1D, 0x2A, 0x31, 0x37,
+    0x1D, 0x23, 0x31, 0x37,
+    0x1D, 0x23, 0x2A, 0x37,
+    0x1D, 0x37, 0x37, 0x37,
+    0x1D, 0x1D, 0x37, 0x37,
+    0x1D, 0x1D, 0x1D, 0x37,
+    0x1D, 0x2B, 0x33, 0x3A,
+    0x1D, 0x24, 0x33, 0x3A,
+    0x1D, 0x24, 0x2C, 0x3A,
+    0x1D, 0x3A, 0x3A, 0x3A,
+    0x1D, 0x1D, 0x3A, 0x3A,
+    0x1D, 0x1D, 0x1D, 0x3A,
+    0x1D, 0x2D, 0x35, 0x3D,
+    0x1D, 0x25, 0x35, 0x3D,
+    0x1D, 0x25, 0x2D, 0x3D,
+    0x1D, 0x3D, 0x3D, 0x3D,
+    0x1D, 0x1D, 0x3D, 0x3D,
+    0x1D, 0x1D, 0x1D, 0x3D,
+    0x1E, 0x1F, 0x1F, 0x20,
+    0x1E, 0x1F, 0x20, 0x21,
+    0x1E, 0x20, 0x21, 0x22,
+    0x1E, 0x1F, 0x21, 0x22,
+    0x1E, 0x1F, 0x20, 0x22,
+    0x1E, 0x20, 0x21, 0x23,
+    0x1E, 0x20, 0x22, 0x23,
+    0x1E, 0x1F, 0x22, 0x23,
+    0x1E, 0x1F, 0x21, 0x23,
+    0x1E, 0x20, 0x22, 0x24,
+    0x1E, 0x21, 0x23, 0x24,
+    0x1E, 0x1F, 0x23, 0x24,
+    0x1E, 0x1F, 0x21, 0x24,
+    0x1E, 0x24, 0x24, 0x24,
+    0x1E, 0x1E, 0x24, 0x24,
+    0x1E, 0x1E, 0x1E, 0x24,
+    0x1E, 0x21, 0x22, 0x25,
+    0x1E, 0x21, 0x24, 0x25,
+    0x1E, 0x1F, 0x24, 0x25,
+    0x1E, 0x1F, 0x22, 0x25,
+    0x1E, 0x21, 0x23, 0x26,
+    0x1E, 0x22, 0x24, 0x26,
+    0x1E, 0x20, 0x24, 0x26,
+    0x1E, 0x20, 0x22, 0x26,
+    0x1E, 0x26, 0x26, 0x26,
+    0x1E, 0x1E, 0x26, 0x26,
+    0x1E, 0x1E, 0x1E, 0x26,
+    0x1E, 0x22, 0x25, 0x29,
+    0x1E, 0x23, 0x27, 0x29,
+    0x1E, 0x20, 0x27, 0x29,
+    0x1E, 0x20, 0x24, 0x29,
+    0x1E, 0x29, 0x29, 0x29,
+    0x1E, 0x1E, 0x29, 0x29,
+    0x1E, 0x1E, 0x1E, 0x29,
+    0x1E, 0x23, 0x27, 0x2C,
+    0x1E, 0x25, 0x29, 0x2C,
+    0x1E, 0x21, 0x29, 0x2C,
+    0x1E, 0x21, 0x25, 0x2C,
+    0x1E, 0x2C, 0x2C, 0x2C,
+    0x1E, 0x1E, 0x2C, 0x2C,
+    0x1E, 0x1E, 0x1E, 0x2C,
+    0x1E, 0x24, 0x29, 0x2F,
+    0x1E, 0x26, 0x2B, 0x2F,
+    0x1E, 0x22, 0x2B, 0x2F,
+    0x1E, 0x22, 0x27, 0x2F,
+    0x1E, 0x2F, 0x2F, 0x2F,
+    0x1E, 0x1E, 0x2F, 0x2F,
+    0x1E, 0x1E, 0x1E, 0x2F,
+    0x1E, 0x25, 0x2B, 0x32,
+    0x1E, 0x28, 0x2D, 0x32,
+    0x1E, 0x23, 0x2D, 0x32,
+    0x1E, 0x23, 0x28, 0x32,
+    0x1E, 0x32, 0x32, 0x32,
+    0x1E, 0x1E, 0x32, 0x32,
+    0x1E, 0x1E, 0x1E, 0x32,
+    0x1E, 0x29, 0x30, 0x35,
+    0x1E, 0x23, 0x30, 0x35,
+    0x1E, 0x23, 0x2A, 0x35,
+    0x1E, 0x35, 0x35, 0x35,
+    0x1E, 0x1E, 0x35, 0x35,
+    0x1E, 0x1E, 0x1E, 0x35,
+    0x1E, 0x2B, 0x32, 0x38,
+    0x1E, 0x24, 0x32, 0x38,
+    0x1E, 0x24, 0x2B, 0x38,
+    0x1E, 0x38, 0x38, 0x38,
+    0x1E, 0x1E, 0x38, 0x38,
+    0x1E, 0x1E, 0x1E, 0x38,
+    0x1E, 0x2C, 0x34, 0x3B,
+    0x1E, 0x25, 0x34, 0x3B,
+    0x1E, 0x25, 0x2D, 0x3B,
+    0x1E, 0x3B, 0x3B, 0x3B,
+    0x1E, 0x1E, 0x3B, 0x3B,
+    0x1E, 0x1E, 0x1E, 0x3B,
+    0x1E, 0x2E, 0x36, 0x3E,
+    0x1E, 0x26, 0x36, 0x3E,
+    0x1E, 0x26, 0x2E, 0x3E,
+    0x1E, 0x3E, 0x3E, 0x3E,
+    0x1E, 0x1E, 0x3E, 0x3E,
+    0x1E, 0x1E, 0x1E, 0x3E,
+    0x1F, 0x20, 0x20, 0x21,
+    0x1F, 0x20, 0x21, 0x22,
+    0x1F, 0x21, 0x22, 0x23,
+    0x1F, 0x20, 0x22, 0x23,
+    0x1F, 0x20, 0x21, 0x23,
+    0x1F, 0x21, 0x22, 0x24,
+    0x1F, 0x21, 0x23, 0x24,
+    0x1F, 0x20, 0x23, 0x24,
+    0x1F, 0x20, 0x22, 0x24,
+    0x1F, 0x21, 0x23, 0x25,
+    0x1F, 0x22, 0x24, 0x25,
+    0x1F, 0x20, 0x24, 0x25,
+    0x1F, 0x20, 0x22, 0x25,
+    0x1F, 0x25, 0x25, 0x25,
+    0x1F, 0x1F, 0x25, 0x25,
+    0x1F, 0x1F, 0x1F, 0x25,
+    0x1F, 0x22, 0x23, 0x26,
+    0x1F, 0x22, 0x25, 0x26,
+    0x1F, 0x20, 0x25, 0x26,
+    0x1F, 0x20, 0x23, 0x26,
+    0x1F, 0x22, 0x24, 0x27,
+    0x1F, 0x23, 0x25, 0x27,
+    0x1F, 0x21, 0x25, 0x27,
+    0x1F, 0x21, 0x23, 0x27,
+    0x1F, 0x27, 0x27, 0x27,
+    0x1F, 0x1F, 0x27, 0x27,
+    0x1F, 0x1F, 0x1F, 0x27,
+    0x1F, 0x23, 0x26, 0x2A,
+    0x1F, 0x24, 0x28, 0x2A,
+    0x1F, 0x21, 0x28, 0x2A,
+    0x1F, 0x21, 0x25, 0x2A,
+    0x1F, 0x2A, 0x2A, 0x2A,
+    0x1F, 0x1F, 0x2A, 0x2A,
+    0x1F, 0x1F, 0x1F, 0x2A,
+    0x1F, 0x24, 0x28, 0x2D,
+    0x1F, 0x26, 0x2A, 0x2D,
+    0x1F, 0x22, 0x2A, 0x2D,
+    0x1F, 0x22, 0x26, 0x2D,
+    0x1F, 0x2D, 0x2D, 0x2D,
+    0x1F, 0x1F, 0x2D, 0x2D,
+    0x1F, 0x1F, 0x1F, 0x2D,
+    0x1F, 0x25, 0x2A, 0x30,
+    0x1F, 0x27, 0x2C, 0x30,
+    0x1F, 0x23, 0x2C, 0x30,
+    0x1F, 0x23, 0x28, 0x30,
+    0x1F, 0x30, 0x30, 0x30,
+    0x1F, 0x1F, 0x30, 0x30,
+    0x1F, 0x1F, 0x1F, 0x30,
+    0x1F, 0x26, 0x2C, 0x33,
+    0x1F, 0x29, 0x2E, 0x33,
+    0x1F, 0x24, 0x2E, 0x33,
+    0x1F, 0x24, 0x29, 0x33,
+    0x1F, 0x33, 0x33, 0x33,
+    0x1F, 0x1F, 0x33, 0x33,
+    0x1F, 0x1F, 0x1F, 0x33,
+    0x1F, 0x2A, 0x31, 0x36,
+    0x1F, 0x24, 0x31, 0x36,
+    0x1F, 0x24, 0x2B, 0x36,
+    0x1F, 0x36, 0x36, 0x36,
+    0x1F, 0x1F, 0x36, 0x36,
+    0x1F, 0x1F, 0x1F, 0x36,
+    0x1F, 0x2C, 0x33, 0x39,
+    0x1F, 0x25, 0x33, 0x39,
+    0x1F, 0x25, 0x2C, 0x39,
+    0x1F, 0x39, 0x39, 0x39,
+    0x1F, 0x1F, 0x39, 0x39,
+    0x1F, 0x1F, 0x1F, 0x39,
+    0x1F, 0x2D, 0x35, 0x3C,
+    0x1F, 0x26, 0x35, 0x3C,
+    0x1F, 0x26, 0x2E, 0x3C,
+    0x1F, 0x3C, 0x3C, 0x3C,
+    0x1F, 0x1F, 0x3C, 0x3C,
+    0x1F, 0x1F, 0x1F, 0x3C,
+    0x1F, 0x2F, 0x37, 0x3F,
+    0x1F, 0x27, 0x37, 0x3F,
+    0x1F, 0x27, 0x2F, 0x3F,
+    0x1F, 0x3F, 0x3F, 0x3F,
+    0x1F, 0x1F, 0x3F, 0x3F,
+    0x1F, 0x1F, 0x1F, 0x3F,
+    0x20, 0x21, 0x21, 0x22,
+    0x20, 0x21, 0x22, 0x23,
+    0x20, 0x22, 0x23, 0x24,
+    0x20, 0x21, 0x23, 0x24,
+    0x20, 0x21, 0x22, 0x24,
+    0x20, 0x22, 0x23, 0x25,
+    0x20, 0x22, 0x24, 0x25,
+    0x20, 0x21, 0x24, 0x25,
+    0x20, 0x21, 0x23, 0x25,
+    0x20, 0x22, 0x24, 0x26,
+    0x20, 0x23, 0x25, 0x26,
+    0x20, 0x21, 0x25, 0x26,
+    0x20, 0x21, 0x23, 0x26,
+    0x20, 0x26, 0x26, 0x26,
+    0x20, 0x20, 0x26, 0x26,
+    0x20, 0x20, 0x20, 0x26,
+    0x20, 0x23, 0x24, 0x27,
+    0x20, 0x23, 0x26, 0x27,
+    0x20, 0x21, 0x26, 0x27,
+    0x20, 0x21, 0x24, 0x27,
+    0x20, 0x23, 0x25, 0x28,
+    0x20, 0x24, 0x26, 0x28,
+    0x20, 0x22, 0x26, 0x28,
+    0x20, 0x22, 0x24, 0x28,
+    0x20, 0x28, 0x28, 0x28,
+    0x20, 0x20, 0x28, 0x28,
+    0x20, 0x20, 0x20, 0x28,
+    0x20, 0x24, 0x27, 0x2B,
+    0x20, 0x25, 0x29, 0x2B,
+    0x20, 0x22, 0x29, 0x2B,
+    0x20, 0x22, 0x26, 0x2B,
+    0x20, 0x2B, 0x2B, 0x2B,
+    0x20, 0x20, 0x2B, 0x2B,
+    0x20, 0x20, 0x20, 0x2B,
+    0x20, 0x25, 0x29, 0x2E,
+    0x20, 0x27, 0x2B, 0x2E,
+    0x20, 0x23, 0x2B, 0x2E,
+    0x20, 0x23, 0x27, 0x2E,
+    0x20, 0x2E, 0x2E, 0x2E,
+    0x20, 0x20, 0x2E, 0x2E,
+    0x20, 0x20, 0x20, 0x2E,
+    0x20, 0x26, 0x2B, 0x31,
+    0x20, 0x28, 0x2D, 0x31,
+    0x20, 0x24, 0x2D, 0x31,
+    0x20, 0x24, 0x29, 0x31,
+    0x20, 0x31, 0x31, 0x31,
+    0x20, 0x20, 0x31, 0x31,
+    0x20, 0x20, 0x20, 0x31,
+    0x20, 0x27, 0x2D, 0x34,
+    0x20, 0x2A, 0x2F, 0x34,
+    0x20, 0x25, 0x2F, 0x34,
+    0x20, 0x25, 0x2A, 0x34,
+    0x20, 0x34, 0x34, 0x34,
+    0x20, 0x20, 0x34, 0x34,
+    0x20, 0x20, 0x20, 0x34,
+    0x20, 0x2B, 0x32, 0x37,
+    0x20, 0x25, 0x32, 0x37,
+    0x20, 0x25, 0x2C, 0x37,
+    0x20, 0x37, 0x37, 0x37,
+    0x20, 0x20, 0x37, 0x37,
+    0x20, 0x20, 0x20, 0x37,
+    0x20, 0x2D, 0x34, 0x3A,
+    0x20, 0x26, 0x34, 0x3A,
+    0x20, 0x26, 0x2D, 0x3A,
+    0x20, 0x3A, 0x3A, 0x3A,
+    0x20, 0x20, 0x3A, 0x3A,
+    0x20, 0x20, 0x20, 0x3A,
+    0x20, 0x2E, 0x36, 0x3D,
+    0x20, 0x27, 0x36, 0x3D,
+    0x20, 0x27, 0x2F, 0x3D,
+    0x20, 0x3D, 0x3D, 0x3D,
+    0x20, 0x20, 0x3D, 0x3D,
+    0x20, 0x20, 0x20, 0x3D,
+    0x21, 0x22, 0x22, 0x23,
+    0x21, 0x22, 0x23, 0x24,
+    0x21, 0x23, 0x24, 0x25,
+    0x21, 0x22, 0x24, 0x25,
+    0x21, 0x22, 0x23, 0x25,
+    0x21, 0x23, 0x24, 0x26,
+    0x21, 0x23, 0x25, 0x26,
+    0x21, 0x22, 0x25, 0x26,
+    0x21, 0x22, 0x24, 0x26,
+    0x21, 0x23, 0x25, 0x27,
+    0x21, 0x24, 0x26, 0x27,
+    0x21, 0x22, 0x26, 0x27,
+    0x21, 0x22, 0x24, 0x27,
+    0x21, 0x27, 0x27, 0x27,
+    0x21, 0x21, 0x27, 0x27,
+    0x21, 0x21, 0x21, 0x27,
+    0x21, 0x24, 0x25, 0x28,
+    0x21, 0x24, 0x27, 0x28,
+    0x21, 0x22, 0x27, 0x28,
+    0x21, 0x22, 0x25, 0x28,
+    0x21, 0x24, 0x26, 0x29,
+    0x21, 0x25, 0x27, 0x29,
+    0x21, 0x23, 0x27, 0x29,
+    0x21, 0x23, 0x25, 0x29,
+    0x21, 0x29, 0x29, 0x29,
+    0x21, 0x21, 0x29, 0x29,
+    0x21, 0x21, 0x21, 0x29,
+    0x21, 0x25, 0x28, 0x2C,
+    0x21, 0x26, 0x2A, 0x2C,
+    0x21, 0x23, 0x2A, 0x2C,
+    0x21, 0x23, 0x27, 0x2C,
+    0x21, 0x2C, 0x2C, 0x2C,
+    0x21, 0x21, 0x2C, 0x2C,
+    0x21, 0x21, 0x21, 0x2C,
+    0x21, 0x26, 0x2A, 0x2F,
+    0x21, 0x28, 0x2C, 0x2F,
+    0x21, 0x24, 0x2C, 0x2F,
+    0x21, 0x24, 0x28, 0x2F,
+    0x21, 0x2F, 0x2F, 0x2F,
+    0x21, 0x21, 0x2F, 0x2F,
+    0x21, 0x21, 0x21, 0x2F,
+    0x21, 0x27, 0x2C, 0x32,
+    0x21, 0x29, 0x2E, 0x32,
+    0x21, 0x25, 0x2E, 0x32,
+    0x21, 0x25, 0x2A, 0x32,
+    0x21, 0x32, 0x32, 0x32,
+    0x21, 0x21, 0x32, 0x32,
+    0x21, 0x21, 0x21, 0x32,
+    0x21, 0x28, 0x2E, 0x35,
+    0x21, 0x2B, 0x30, 0x35,
+    0x21, 0x26, 0x30, 0x35,
+    0x21, 0x26, 0x2B, 0x35,
+    0x21, 0x35, 0x35, 0x35,
+    0x21, 0x21, 0x35, 0x35,
+    0x21, 0x21, 0x21, 0x35,
+    0x21, 0x2C, 0x33, 0x38,
+    0x21, 0x26, 0x33, 0x38,
+    0x21, 0x26, 0x2D, 0x38,
+    0x21, 0x38, 0x38, 0x38,
+    0x21, 0x21, 0x38, 0x38,
+    0x21, 0x21, 0x21, 0x38,
+    0x21, 0x2E, 0x35, 0x3B,
+    0x21, 0x27, 0x35, 0x3B,
+    0x21, 0x27, 0x2E, 0x3B,
+    0x21, 0x3B, 0x3B, 0x3B,
+    0x21, 0x21, 0x3B, 0x3B,
+    0x21, 0x21, 0x21, 0x3B,
+    0x21, 0x2F, 0x37, 0x3E,
+    0x21, 0x28, 0x37, 0x3E,
+    0x21, 0x28, 0x30, 0x3E,
+    0x21, 0x3E, 0x3E, 0x3E,
+    0x21, 0x21, 0x3E, 0x3E,
+    0x21, 0x21, 0x21, 0x3E,
+    0x22, 0x23, 0x23, 0x24,
+    0x22, 0x23, 0x24, 0x25,
+    0x22, 0x24, 0x25, 0x26,
+    0x22, 0x23, 0x25, 0x26,
+    0x22, 0x23, 0x24, 0x26,
+    0x22, 0x24, 0x25, 0x27,
+    0x22, 0x24, 0x26, 0x27,
+    0x22, 0x23, 0x26, 0x27,
+    0x22, 0x23, 0x25, 0x27,
+    0x22, 0x24, 0x26, 0x28,
+    0x22, 0x25, 0x27, 0x28,
+    0x22, 0x23, 0x27, 0x28,
+    0x22, 0x23, 0x25, 0x28,
+    0x22, 0x28, 0x28, 0x28,
+    0x22, 0x22, 0x28, 0x28,
+    0x22, 0x22, 0x22, 0x28,
+    0x22, 0x25, 0x26, 0x29,
+    0x22, 0x25, 0x28, 0x29,
+    0x22, 0x23, 0x28, 0x29,
+    0x22, 0x23, 0x26, 0x29,
+    0x22, 0x25, 0x27, 0x2A,
+    0x22, 0x26, 0x28, 0x2A,
+    0x22, 0x24, 0x28, 0x2A,
+    0x22, 0x24, 0x26, 0x2A,
+    0x22, 0x2A, 0x2A, 0x2A,
+    0x22, 0x22, 0x2A, 0x2A,
+    0x22, 0x22, 0x22, 0x2A,
+    0x22, 0x26, 0x29, 0x2D,
+    0x22, 0x27, 0x2B, 0x2D,
+    0x22, 0x24, 0x2B, 0x2D,
+    0x22, 0x24, 0x28, 0x2D,
+    0x22, 0x2D, 0x2D, 0x2D,
+    0x22, 0x22, 0x2D, 0x2D,
+    0x22, 0x22, 0x22, 0x2D,
+    0x22, 0x27, 0x2B, 0x30,
+    0x22, 0x29, 0x2D, 0x30,
+    0x22, 0x25, 0x2D, 0x30,
+    0x22, 0x25, 0x29, 0x30,
+    0x22, 0x30, 0x30, 0x30,
+    0x22, 0x22, 0x30, 0x30,
+    0x22, 0x22, 0x22, 0x30,
+    0x22, 0x28, 0x2D, 0x33,
+    0x22, 0x2A, 0x2F, 0x33,
+    0x22, 0x26, 0x2F, 0x33,
+    0x22, 0x26, 0x2B, 0x33,
+    0x22, 0x33, 0x33, 0x33,
+    0x22, 0x22, 0x33, 0x33,
+    0x22, 0x22, 0x22, 0x33,
+    0x22, 0x29, 0x2F, 0x36,
+    0x22, 0x2C, 0x31, 0x36,
+    0x22, 0x27, 0x31, 0x36,
+    0x22, 0x27, 0x2C, 0x36,
+    0x22, 0x36, 0x36, 0x36,
+    0x22, 0x22, 0x36, 0x36,
+    0x22, 0x22, 0x22, 0x36,
+    0x22, 0x2D, 0x34, 0x39,
+    0x22, 0x27, 0x34, 0x39,
+    0x22, 0x27, 0x2E, 0x39,
+    0x22, 0x39, 0x39, 0x39,
+    0x22, 0x22, 0x39, 0x39,
+    0x22, 0x22, 0x22, 0x39,
+    0x22, 0x2F, 0x36, 0x3C,
+    0x22, 0x28, 0x36, 0x3C,
+    0x22, 0x28, 0x2F, 0x3C,
+    0x22, 0x3C, 0x3C, 0x3C,
+    0x22, 0x22, 0x3C, 0x3C,
+    0x22, 0x22, 0x22, 0x3C,
+    0x22, 0x30, 0x38, 0x3F,
+    0x22, 0x29, 0x38, 0x3F,
+    0x22, 0x29, 0x31, 0x3F,
+    0x22, 0x3F, 0x3F, 0x3F,
+    0x22, 0x22, 0x3F, 0x3F,
+    0x22, 0x22, 0x22, 0x3F,
+    0x23, 0x24, 0x24, 0x25,
+    0x23, 0x24, 0x25, 0x26,
+    0x23, 0x25, 0x26, 0x27,
+    0x23, 0x24, 0x26, 0x27,
+    0x23, 0x24, 0x25, 0x27,
+    0x23, 0x25, 0x26, 0x28,
+    0x23, 0x25, 0x27, 0x28,
+    0x23, 0x24, 0x27, 0x28,
+    0x23, 0x24, 0x26, 0x28,
+    0x23, 0x25, 0x27, 0x29,
+    0x23, 0x26, 0x28, 0x29,
+    0x23, 0x24, 0x28, 0x29,
+    0x23, 0x24, 0x26, 0x29,
+    0x23, 0x29, 0x29, 0x29,
+    0x23, 0x23, 0x29, 0x29,
+    0x23, 0x23, 0x23, 0x29,
+    0x23, 0x26, 0x27, 0x2A,
+    0x23, 0x26, 0x29, 0x2A,
+    0x23, 0x24, 0x29, 0x2A,
+    0x23, 0x24, 0x27, 0x2A,
+    0x23, 0x26, 0x28, 0x2B,
+    0x23, 0x27, 0x29, 0x2B,
+    0x23, 0x25, 0x29, 0x2B,
+    0x23, 0x25, 0x27, 0x2B,
+    0x23, 0x2B, 0x2B, 0x2B,
+    0x23, 0x23, 0x2B, 0x2B,
+    0x23, 0x23, 0x23, 0x2B,
+    0x23, 0x27, 0x2A, 0x2E,
+    0x23, 0x28, 0x2C, 0x2E,
+    0x23, 0x25, 0x2C, 0x2E,
+    0x23, 0x25, 0x29, 0x2E,
+    0x23, 0x2E, 0x2E, 0x2E,
+    0x23, 0x23, 0x2E, 0x2E,
+    0x23, 0x23, 0x23, 0x2E,
+    0x23, 0x28, 0x2C, 0x31,
+    0x23, 0x2A, 0x2E, 0x31,
+    0x23, 0x26, 0x2E, 0x31,
+    0x23, 0x26, 0x2A, 0x31,
+    0x23, 0x31, 0x31, 0x31,
+    0x23, 0x23, 0x31, 0x31,
+    0x23, 0x23, 0x23, 0x31,
+    0x23, 0x29, 0x2E, 0x34,
+    0x23, 0x2B, 0x30, 0x34,
+    0x23, 0x27, 0x30, 0x34,
+    0x23, 0x27, 0x2C, 0x34,
+    0x23, 0x34, 0x34, 0x34,
+    0x23, 0x23, 0x34, 0x34,
+    0x23, 0x23, 0x23, 0x34,
+    0x23, 0x2A, 0x30, 0x37,
+    0x23, 0x2D, 0x32, 0x37,
+    0x23, 0x28, 0x32, 0x37,
+    0x23, 0x28, 0x2D, 0x37,
+    0x23, 0x37, 0x37, 0x37,
+    0x23, 0x23, 0x37, 0x37,
+    0x23, 0x23, 0x23, 0x37,
+    0x23, 0x2E, 0x35, 0x3A,
+    0x23, 0x28, 0x35, 0x3A,
+    0x23, 0x28, 0x2F, 0x3A,
+    0x23, 0x3A, 0x3A, 0x3A,
+    0x23, 0x23, 0x3A, 0x3A,
+    0x23, 0x23, 0x23, 0x3A,
+    0x23, 0x30, 0x37, 0x3D,
+    0x23, 0x29, 0x37, 0x3D,
+    0x23, 0x29, 0x30, 0x3D,
+    0x23, 0x3D, 0x3D, 0x3D,
+    0x23, 0x23, 0x3D, 0x3D,
+    0x23, 0x23, 0x23, 0x3D,
+    0x24, 0x25, 0x25, 0x26,
+    0x24, 0x25, 0x26, 0x27,
+    0x24, 0x26, 0x27, 0x28,
+    0x24, 0x25, 0x27, 0x28,
+    0x24, 0x25, 0x26, 0x28,
+    0x24, 0x26, 0x27, 0x29,
+    0x24, 0x26, 0x28, 0x29,
+    0x24, 0x25, 0x28, 0x29,
+    0x24, 0x25, 0x27, 0x29,
+    0x24, 0x26, 0x28, 0x2A,
+    0x24, 0x27, 0x29, 0x2A,
+    0x24, 0x25, 0x29, 0x2A,
+    0x24, 0x25, 0x27, 0x2A,
+    0x24, 0x2A, 0x2A, 0x2A,
+    0x24, 0x24, 0x2A, 0x2A,
+    0x24, 0x24, 0x24, 0x2A,
+    0x24, 0x27, 0x28, 0x2B,
+    0x24, 0x27, 0x2A, 0x2B,
+    0x24, 0x25, 0x2A, 0x2B,
+    0x24, 0x25, 0x28, 0x2B,
+    0x24, 0x27, 0x29, 0x2C,
+    0x24, 0x28, 0x2A, 0x2C,
+    0x24, 0x26, 0x2A, 0x2C,
+    0x24, 0x26, 0x28, 0x2C,
+    0x24, 0x2C, 0x2C, 0x2C,
+    0x24, 0x24, 0x2C, 0x2C,
+    0x24, 0x24, 0x24, 0x2C,
+    0x24, 0x28, 0x2B, 0x2F,
+    0x24, 0x29, 0x2D, 0x2F,
+    0x24, 0x26, 0x2D, 0x2F,
+    0x24, 0x26, 0x2A, 0x2F,
+    0x24, 0x2F, 0x2F, 0x2F,
+    0x24, 0x24, 0x2F, 0x2F,
+    0x24, 0x24, 0x24, 0x2F,
+    0x24, 0x29, 0x2D, 0x32,
+    0x24, 0x2B, 0x2F, 0x32,
+    0x24, 0x27, 0x2F, 0x32,
+    0x24, 0x27, 0x2B, 0x32,
+    0x24, 0x32, 0x32, 0x32,
+    0x24, 0x24, 0x32, 0x32,
+    0x24, 0x24, 0x24, 0x32,
+    0x24, 0x2A, 0x2F, 0x35,
+    0x24, 0x2C, 0x31, 0x35,
+    0x24, 0x28, 0x31, 0x35,
+    0x24, 0x28, 0x2D, 0x35,
+    0x24, 0x35, 0x35, 0x35,
+    0x24, 0x24, 0x35, 0x35,
+    0x24, 0x24, 0x24, 0x35,
+    0x24, 0x2B, 0x31, 0x38,
+    0x24, 0x2E, 0x33, 0x38,
+    0x24, 0x29, 0x33, 0x38,
+    0x24, 0x29, 0x2E, 0x38,
+    0x24, 0x38, 0x38, 0x38,
+    0x24, 0x24, 0x38, 0x38,
+    0x24, 0x24, 0x24, 0x38,
+    0x24, 0x2F, 0x36, 0x3B,
+    0x24, 0x29, 0x36, 0x3B,
+    0x24, 0x29, 0x30, 0x3B,
+    0x24, 0x3B, 0x3B, 0x3B,
+    0x24, 0x24, 0x3B, 0x3B,
+    0x24, 0x24, 0x24, 0x3B,
+    0x24, 0x31, 0x38, 0x3E,
+    0x24, 0x2A, 0x38, 0x3E,
+    0x24, 0x2A, 0x31, 0x3E,
+    0x24, 0x3E, 0x3E, 0x3E,
+    0x24, 0x24, 0x3E, 0x3E,
+    0x24, 0x24, 0x24, 0x3E,
+    0x25, 0x26, 0x26, 0x27,
+    0x25, 0x26, 0x27, 0x28,
+    0x25, 0x27, 0x28, 0x29,
+    0x25, 0x26, 0x28, 0x29,
+    0x25, 0x26, 0x27, 0x29,
+    0x25, 0x27, 0x28, 0x2A,
+    0x25, 0x27, 0x29, 0x2A,
+    0x25, 0x26, 0x29, 0x2A,
+    0x25, 0x26, 0x28, 0x2A,
+    0x25, 0x27, 0x29, 0x2B,
+    0x25, 0x28, 0x2A, 0x2B,
+    0x25, 0x26, 0x2A, 0x2B,
+    0x25, 0x26, 0x28, 0x2B,
+    0x25, 0x2B, 0x2B, 0x2B,
+    0x25, 0x25, 0x2B, 0x2B,
+    0x25, 0x25, 0x25, 0x2B,
+    0x25, 0x28, 0x29, 0x2C,
+    0x25, 0x28, 0x2B, 0x2C,
+    0x25, 0x26, 0x2B, 0x2C,
+    0x25, 0x26, 0x29, 0x2C,
+    0x25, 0x28, 0x2A, 0x2D,
+    0x25, 0x29, 0x2B, 0x2D,
+    0x25, 0x27, 0x2B, 0x2D,
+    0x25, 0x27, 0x29, 0x2D,
+    0x25, 0x2D, 0x2D, 0x2D,
+    0x25, 0x25, 0x2D, 0x2D,
+    0x25, 0x25, 0x25, 0x2D,
+    0x25, 0x29, 0x2C, 0x30,
+    0x25, 0x2A, 0x2E, 0x30,
+    0x25, 0x27, 0x2E, 0x30,
+    0x25, 0x27, 0x2B, 0x30,
+    0x25, 0x30, 0x30, 0x30,
+    0x25, 0x25, 0x30, 0x30,
+    0x25, 0x25, 0x25, 0x30,
+    0x25, 0x2A, 0x2E, 0x33,
+    0x25, 0x2C, 0x30, 0x33,
+    0x25, 0x28, 0x30, 0x33,
+    0x25, 0x28, 0x2C, 0x33,
+    0x25, 0x33, 0x33, 0x33,
+    0x25, 0x25, 0x33, 0x33,
+    0x25, 0x25, 0x25, 0x33,
+    0x25, 0x2B, 0x30, 0x36,
+    0x25, 0x2D, 0x32, 0x36,
+    0x25, 0x29, 0x32, 0x36,
+    0x25, 0x29, 0x2E, 0x36,
+    0x25, 0x36, 0x36, 0x36,
+    0x25, 0x25, 0x36, 0x36,
+    0x25, 0x25, 0x25, 0x36,
+    0x25, 0x2C, 0x32, 0x39,
+    0x25, 0x2F, 0x34, 0x39,
+    0x25, 0x2A, 0x34, 0x39,
+    0x25, 0x2A, 0x2F, 0x39,
+    0x25, 0x39, 0x39, 0x39,
+    0x25, 0x25, 0x39, 0x39,
+    0x25, 0x25, 0x25, 0x39,
+    0x25, 0x30, 0x37, 0x3C,
+    0x25, 0x2A, 0x37, 0x3C,
+    0x25, 0x2A, 0x31, 0x3C,
+    0x25, 0x3C, 0x3C, 0x3C,
+    0x25, 0x25, 0x3C, 0x3C,
+    0x25, 0x25, 0x25, 0x3C,
+    0x25, 0x32, 0x39, 0x3F,
+    0x25, 0x2B, 0x39, 0x3F,
+    0x25, 0x2B, 0x32, 0x3F,
+    0x25, 0x3F, 0x3F, 0x3F,
+    0x25, 0x25, 0x3F, 0x3F,
+    0x25, 0x25, 0x25, 0x3F,
+    0x26, 0x27, 0x27, 0x28,
+    0x26, 0x27, 0x28, 0x29,
+    0x26, 0x28, 0x29, 0x2A,
+    0x26, 0x27, 0x29, 0x2A,
+    0x26, 0x27, 0x28, 0x2A,
+    0x26, 0x28, 0x29, 0x2B,
+    0x26, 0x28, 0x2A, 0x2B,
+    0x26, 0x27, 0x2A, 0x2B,
+    0x26, 0x27, 0x29, 0x2B,
+    0x26, 0x28, 0x2A, 0x2C,
+    0x26, 0x29, 0x2B, 0x2C,
+    0x26, 0x27, 0x2B, 0x2C,
+    0x26, 0x27, 0x29, 0x2C,
+    0x26, 0x2C, 0x2C, 0x2C,
+    0x26, 0x26, 0x2C, 0x2C,
+    0x26, 0x26, 0x26, 0x2C,
+    0x26, 0x29, 0x2A, 0x2D,
+    0x26, 0x29, 0x2C, 0x2D,
+    0x26, 0x27, 0x2C, 0x2D,
+    0x26, 0x27, 0x2A, 0x2D,
+    0x26, 0x29, 0x2B, 0x2E,
+    0x26, 0x2A, 0x2C, 0x2E,
+    0x26, 0x28, 0x2C, 0x2E,
+    0x26, 0x28, 0x2A, 0x2E,
+    0x26, 0x2E, 0x2E, 0x2E,
+    0x26, 0x26, 0x2E, 0x2E,
+    0x26, 0x26, 0x26, 0x2E,
+    0x26, 0x2A, 0x2D, 0x31,
+    0x26, 0x2B, 0x2F, 0x31,
+    0x26, 0x28, 0x2F, 0x31,
+    0x26, 0x28, 0x2C, 0x31,
+    0x26, 0x31, 0x31, 0x31,
+    0x26, 0x26, 0x31, 0x31,
+    0x26, 0x26, 0x26, 0x31,
+    0x26, 0x2B, 0x2F, 0x34,
+    0x26, 0x2D, 0x31, 0x34,
+    0x26, 0x29, 0x31, 0x34,
+    0x26, 0x29, 0x2D, 0x34,
+    0x26, 0x34, 0x34, 0x34,
+    0x26, 0x26, 0x34, 0x34,
+    0x26, 0x26, 0x26, 0x34,
+    0x26, 0x2C, 0x31, 0x37,
+    0x26, 0x2E, 0x33, 0x37,
+    0x26, 0x2A, 0x33, 0x37,
+    0x26, 0x2A, 0x2F, 0x37,
+    0x26, 0x37, 0x37, 0x37,
+    0x26, 0x26, 0x37, 0x37,
+    0x26, 0x26, 0x26, 0x37,
+    0x26, 0x2D, 0x33, 0x3A,
+    0x26, 0x30, 0x35, 0x3A,
+    0x26, 0x2B, 0x35, 0x3A,
+    0x26, 0x2B, 0x30, 0x3A,
+    0x26, 0x3A, 0x3A, 0x3A,
+    0x26, 0x26, 0x3A, 0x3A,
+    0x26, 0x26, 0x26, 0x3A,
+    0x26, 0x31, 0x38, 0x3D,
+    0x26, 0x2B, 0x38, 0x3D,
+    0x26, 0x2B, 0x32, 0x3D,
+    0x26, 0x3D, 0x3D, 0x3D,
+    0x26, 0x26, 0x3D, 0x3D,
+    0x26, 0x26, 0x26, 0x3D,
+    0x27, 0x28, 0x28, 0x29,
+    0x27, 0x28, 0x29, 0x2A,
+    0x27, 0x29, 0x2A, 0x2B,
+    0x27, 0x28, 0x2A, 0x2B,
+    0x27, 0x28, 0x29, 0x2B,
+    0x27, 0x29, 0x2A, 0x2C,
+    0x27, 0x29, 0x2B, 0x2C,
+    0x27, 0x28, 0x2B, 0x2C,
+    0x27, 0x28, 0x2A, 0x2C,
+    0x27, 0x29, 0x2B, 0x2D,
+    0x27, 0x2A, 0x2C, 0x2D,
+    0x27, 0x28, 0x2C, 0x2D,
+    0x27, 0x28, 0x2A, 0x2D,
+    0x27, 0x2D, 0x2D, 0x2D,
+    0x27, 0x27, 0x2D, 0x2D,
+    0x27, 0x27, 0x27, 0x2D,
+    0x27, 0x2A, 0x2B, 0x2E,
+    0x27, 0x2A, 0x2D, 0x2E,
+    0x27, 0x28, 0x2D, 0x2E,
+    0x27, 0x28, 0x2B, 0x2E,
+    0x27, 0x2A, 0x2C, 0x2F,
+    0x27, 0x2B, 0x2D, 0x2F,
+    0x27, 0x29, 0x2D, 0x2F,
+    0x27, 0x29, 0x2B, 0x2F,
+    0x27, 0x2F, 0x2F, 0x2F,
+    0x27, 0x27, 0x2F, 0x2F,
+    0x27, 0x27, 0x27, 0x2F,
+    0x27, 0x2B, 0x2E, 0x32,
+    0x27, 0x2C, 0x30, 0x32,
+    0x27, 0x29, 0x30, 0x32,
+    0x27, 0x29, 0x2D, 0x32,
+    0x27, 0x32, 0x32, 0x32,
+    0x27, 0x27, 0x32, 0x32,
+    0x27, 0x27, 0x27, 0x32,
+    0x27, 0x2C, 0x30, 0x35,
+    0x27, 0x2E, 0x32, 0x35,
+    0x27, 0x2A, 0x32, 0x35,
+    0x27, 0x2A, 0x2E, 0x35,
+    0x27, 0x35, 0x35, 0x35,
+    0x27, 0x27, 0x35, 0x35,
+    0x27, 0x27, 0x27, 0x35,
+    0x27, 0x2D, 0x32, 0x38,
+    0x27, 0x2F, 0x34, 0x38,
+    0x27, 0x2B, 0x34, 0x38,
+    0x27, 0x2B, 0x30, 0x38,
+    0x27, 0x38, 0x38, 0x38,
+    0x27, 0x27, 0x38, 0x38,
+    0x27, 0x27, 0x27, 0x38,
+    0x27, 0x2E, 0x34, 0x3B,
+    0x27, 0x31, 0x36, 0x3B,
+    0x27, 0x2C, 0x36, 0x3B,
+    0x27, 0x2C, 0x31, 0x3B,
+    0x27, 0x3B, 0x3B, 0x3B,
+    0x27, 0x27, 0x3B, 0x3B,
+    0x27, 0x27, 0x27, 0x3B,
+    0x27, 0x32, 0x39, 0x3E,
+    0x27, 0x2C, 0x39, 0x3E,
+    0x27, 0x2C, 0x33, 0x3E,
+    0x27, 0x3E, 0x3E, 0x3E,
+    0x27, 0x27, 0x3E, 0x3E,
+    0x27, 0x27, 0x27, 0x3E,
+    0x28, 0x29, 0x29, 0x2A,
+    0x28, 0x29, 0x2A, 0x2B,
+    0x28, 0x2A, 0x2B, 0x2C,
+    0x28, 0x29, 0x2B, 0x2C,
+    0x28, 0x29, 0x2A, 0x2C,
+    0x28, 0x2A, 0x2B, 0x2D,
+    0x28, 0x2A, 0x2C, 0x2D,
+    0x28, 0x29, 0x2C, 0x2D,
+    0x28, 0x29, 0x2B, 0x2D,
+    0x28, 0x2A, 0x2C, 0x2E,
+    0x28, 0x2B, 0x2D, 0x2E,
+    0x28, 0x29, 0x2D, 0x2E,
+    0x28, 0x29, 0x2B, 0x2E,
+    0x28, 0x2E, 0x2E, 0x2E,
+    0x28, 0x28, 0x2E, 0x2E,
+    0x28, 0x28, 0x28, 0x2E,
+    0x28, 0x2B, 0x2C, 0x2F,
+    0x28, 0x2B, 0x2E, 0x2F,
+    0x28, 0x29, 0x2E, 0x2F,
+    0x28, 0x29, 0x2C, 0x2F,
+    0x28, 0x2B, 0x2D, 0x30,
+    0x28, 0x2C, 0x2E, 0x30,
+    0x28, 0x2A, 0x2E, 0x30,
+    0x28, 0x2A, 0x2C, 0x30,
+    0x28, 0x30, 0x30, 0x30,
+    0x28, 0x28, 0x30, 0x30,
+    0x28, 0x28, 0x28, 0x30,
+    0x28, 0x2C, 0x2F, 0x33,
+    0x28, 0x2D, 0x31, 0x33,
+    0x28, 0x2A, 0x31, 0x33,
+    0x28, 0x2A, 0x2E, 0x33,
+    0x28, 0x33, 0x33, 0x33,
+    0x28, 0x28, 0x33, 0x33,
+    0x28, 0x28, 0x28, 0x33,
+    0x28, 0x2D, 0x31, 0x36,
+    0x28, 0x2F, 0x33, 0x36,
+    0x28, 0x2B, 0x33, 0x36,
+    0x28, 0x2B, 0x2F, 0x36,
+    0x28, 0x36, 0x36, 0x36,
+    0x28, 0x28, 0x36, 0x36,
+    0x28, 0x28, 0x28, 0x36,
+    0x28, 0x2E, 0x33, 0x39,
+    0x28, 0x30, 0x35, 0x39,
+    0x28, 0x2C, 0x35, 0x39,
+    0x28, 0x2C, 0x31, 0x39,
+    0x28, 0x39, 0x39, 0x39,
+    0x28, 0x28, 0x39, 0x39,
+    0x28, 0x28, 0x28, 0x39,
+    0x28, 0x2F, 0x35, 0x3C,
+    0x28, 0x32, 0x37, 0x3C,
+    0x28, 0x2D, 0x37, 0x3C,
+    0x28, 0x2D, 0x32, 0x3C,
+    0x28, 0x3C, 0x3C, 0x3C,
+    0x28, 0x28, 0x3C, 0x3C,
+    0x28, 0x28, 0x28, 0x3C,
+    0x28, 0x33, 0x3A, 0x3F,
+    0x28, 0x2D, 0x3A, 0x3F,
+    0x28, 0x2D, 0x34, 0x3F,
+    0x28, 0x3F, 0x3F, 0x3F,
+    0x28, 0x28, 0x3F, 0x3F,
+    0x28, 0x28, 0x28, 0x3F,
+    0x29, 0x2A, 0x2A, 0x2B,
+    0x29, 0x2A, 0x2B, 0x2C,
+    0x29, 0x2B, 0x2C, 0x2D,
+    0x29, 0x2A, 0x2C, 0x2D,
+    0x29, 0x2A, 0x2B, 0x2D,
+    0x29, 0x2B, 0x2C, 0x2E,
+    0x29, 0x2B, 0x2D, 0x2E,
+    0x29, 0x2A, 0x2D, 0x2E,
+    0x29, 0x2A, 0x2C, 0x2E,
+    0x29, 0x2B, 0x2D, 0x2F,
+    0x29, 0x2C, 0x2E, 0x2F,
+    0x29, 0x2A, 0x2E, 0x2F,
+    0x29, 0x2A, 0x2C, 0x2F,
+    0x29, 0x2F, 0x2F, 0x2F,
+    0x29, 0x29, 0x2F, 0x2F,
+    0x29, 0x29, 0x29, 0x2F,
+    0x29, 0x2C, 0x2D, 0x30,
+    0x29, 0x2C, 0x2F, 0x30,
+    0x29, 0x2A, 0x2F, 0x30,
+    0x29, 0x2A, 0x2D, 0x30,
+    0x29, 0x2C, 0x2E, 0x31,
+    0x29, 0x2D, 0x2F, 0x31,
+    0x29, 0x2B, 0x2F, 0x31,
+    0x29, 0x2B, 0x2D, 0x31,
+    0x29, 0x31, 0x31, 0x31,
+    0x29, 0x29, 0x31, 0x31,
+    0x29, 0x29, 0x29, 0x31,
+    0x29, 0x2D, 0x30, 0x34,
+    0x29, 0x2E, 0x32, 0x34,
+    0x29, 0x2B, 0x32, 0x34,
+    0x29, 0x2B, 0x2F, 0x34,
+    0x29, 0x34, 0x34, 0x34,
+    0x29, 0x29, 0x34, 0x34,
+    0x29, 0x29, 0x29, 0x34,
+    0x29, 0x2E, 0x32, 0x37,
+    0x29, 0x30, 0x34, 0x37,
+    0x29, 0x2C, 0x34, 0x37,
+    0x29, 0x2C, 0x30, 0x37,
+    0x29, 0x37, 0x37, 0x37,
+    0x29, 0x29, 0x37, 0x37,
+    0x29, 0x29, 0x29, 0x37,
+    0x29, 0x2F, 0x34, 0x3A,
+    0x29, 0x31, 0x36, 0x3A,
+    0x29, 0x2D, 0x36, 0x3A,
+    0x29, 0x2D, 0x32, 0x3A,
+    0x29, 0x3A, 0x3A, 0x3A,
+    0x29, 0x29, 0x3A, 0x3A,
+    0x29, 0x29, 0x29, 0x3A,
+    0x29, 0x30, 0x36, 0x3D,
+    0x29, 0x33, 0x38, 0x3D,
+    0x29, 0x2E, 0x38, 0x3D,
+    0x29, 0x2E, 0x33, 0x3D,
+    0x29, 0x3D, 0x3D, 0x3D,
+    0x29, 0x29, 0x3D, 0x3D,
+    0x29, 0x29, 0x29, 0x3D,
+    0x2A, 0x2B, 0x2B, 0x2C,
+    0x2A, 0x2B, 0x2C, 0x2D,
+    0x2A, 0x2C, 0x2D, 0x2E,
+    0x2A, 0x2B, 0x2D, 0x2E,
+    0x2A, 0x2B, 0x2C, 0x2E,
+    0x2A, 0x2C, 0x2D, 0x2F,
+    0x2A, 0x2C, 0x2E, 0x2F,
+    0x2A, 0x2B, 0x2E, 0x2F,
+    0x2A, 0x2B, 0x2D, 0x2F,
+    0x2A, 0x2C, 0x2E, 0x30,
+    0x2A, 0x2D, 0x2F, 0x30,
+    0x2A, 0x2B, 0x2F, 0x30,
+    0x2A, 0x2B, 0x2D, 0x30,
+    0x2A, 0x30, 0x30, 0x30,
+    0x2A, 0x2A, 0x30, 0x30,
+    0x2A, 0x2A, 0x2A, 0x30,
+    0x2A, 0x2D, 0x2E, 0x31,
+    0x2A, 0x2D, 0x30, 0x31,
+    0x2A, 0x2B, 0x30, 0x31,
+    0x2A, 0x2B, 0x2E, 0x31,
+    0x2A, 0x2D, 0x2F, 0x32,
+    0x2A, 0x2E, 0x30, 0x32,
+    0x2A, 0x2C, 0x30, 0x32,
+    0x2A, 0x2C, 0x2E, 0x32,
+    0x2A, 0x32, 0x32, 0x32,
+    0x2A, 0x2A, 0x32, 0x32,
+    0x2A, 0x2A, 0x2A, 0x32,
+    0x2A, 0x2E, 0x31, 0x35,
+    0x2A, 0x2F, 0x33, 0x35,
+    0x2A, 0x2C, 0x33, 0x35,
+    0x2A, 0x2C, 0x30, 0x35,
+    0x2A, 0x35, 0x35, 0x35,
+    0x2A, 0x2A, 0x35, 0x35,
+    0x2A, 0x2A, 0x2A, 0x35,
+    0x2A, 0x2F, 0x33, 0x38,
+    0x2A, 0x31, 0x35, 0x38,
+    0x2A, 0x2D, 0x35, 0x38,
+    0x2A, 0x2D, 0x31, 0x38,
+    0x2A, 0x38, 0x38, 0x38,
+    0x2A, 0x2A, 0x38, 0x38,
+    0x2A, 0x2A, 0x2A, 0x38,
+    0x2A, 0x30, 0x35, 0x3B,
+    0x2A, 0x32, 0x37, 0x3B,
+    0x2A, 0x2E, 0x37, 0x3B,
+    0x2A, 0x2E, 0x33, 0x3B,
+    0x2A, 0x3B, 0x3B, 0x3B,
+    0x2A, 0x2A, 0x3B, 0x3B,
+    0x2A, 0x2A, 0x2A, 0x3B,
+    0x2A, 0x31, 0x37, 0x3E,
+    0x2A, 0x34, 0x39, 0x3E,
+    0x2A, 0x2F, 0x39, 0x3E,
+    0x2A, 0x2F, 0x34, 0x3E,
+    0x2A, 0x3E, 0x3E, 0x3E,
+    0x2A, 0x2A, 0x3E, 0x3E,
+    0x2A, 0x2A, 0x2A, 0x3E,
+    0x2B, 0x2C, 0x2C, 0x2D,
+    0x2B, 0x2C, 0x2D, 0x2E,
+    0x2B, 0x2D, 0x2E, 0x2F,
+    0x2B, 0x2C, 0x2E, 0x2F,
+    0x2B, 0x2C, 0x2D, 0x2F,
+    0x2B, 0x2D, 0x2E, 0x30,
+    0x2B, 0x2D, 0x2F, 0x30,
+    0x2B, 0x2C, 0x2F, 0x30,
+    0x2B, 0x2C, 0x2E, 0x30,
+    0x2B, 0x2D, 0x2F, 0x31,
+    0x2B, 0x2E, 0x30, 0x31,
+    0x2B, 0x2C, 0x30, 0x31,
+    0x2B, 0x2C, 0x2E, 0x31,
+    0x2B, 0x31, 0x31, 0x31,
+    0x2B, 0x2B, 0x31, 0x31,
+    0x2B, 0x2B, 0x2B, 0x31,
+    0x2B, 0x2E, 0x2F, 0x32,
+    0x2B, 0x2E, 0x31, 0x32,
+    0x2B, 0x2C, 0x31, 0x32,
+    0x2B, 0x2C, 0x2F, 0x32,
+    0x2B, 0x2E, 0x30, 0x33,
+    0x2B, 0x2F, 0x31, 0x33,
+    0x2B, 0x2D, 0x31, 0x33,
+    0x2B, 0x2D, 0x2F, 0x33,
+    0x2B, 0x33, 0x33, 0x33,
+    0x2B, 0x2B, 0x33, 0x33,
+    0x2B, 0x2B, 0x2B, 0x33,
+    0x2B, 0x2F, 0x32, 0x36,
+    0x2B, 0x30, 0x34, 0x36,
+    0x2B, 0x2D, 0x34, 0x36,
+    0x2B, 0x2D, 0x31, 0x36,
+    0x2B, 0x36, 0x36, 0x36,
+    0x2B, 0x2B, 0x36, 0x36,
+    0x2B, 0x2B, 0x2B, 0x36,
+    0x2B, 0x30, 0x34, 0x39,
+    0x2B, 0x32, 0x36, 0x39,
+    0x2B, 0x2E, 0x36, 0x39,
+    0x2B, 0x2E, 0x32, 0x39,
+    0x2B, 0x39, 0x39, 0x39,
+    0x2B, 0x2B, 0x39, 0x39,
+    0x2B, 0x2B, 0x2B, 0x39,
+    0x2B, 0x31, 0x36, 0x3C,
+    0x2B, 0x33, 0x38, 0x3C,
+    0x2B, 0x2F, 0x38, 0x3C,
+    0x2B, 0x2F, 0x34, 0x3C,
+    0x2B, 0x3C, 0x3C, 0x3C,
+    0x2B, 0x2B, 0x3C, 0x3C,
+    0x2B, 0x2B, 0x2B, 0x3C,
+    0x2B, 0x32, 0x38, 0x3F,
+    0x2B, 0x35, 0x3A, 0x3F,
+    0x2B, 0x30, 0x3A, 0x3F,
+    0x2B, 0x30, 0x35, 0x3F,
+    0x2B, 0x3F, 0x3F, 0x3F,
+    0x2B, 0x2B, 0x3F, 0x3F,
+    0x2B, 0x2B, 0x2B, 0x3F,
+    0x2C, 0x2D, 0x2D, 0x2E,
+    0x2C, 0x2D, 0x2E, 0x2F,
+    0x2C, 0x2E, 0x2F, 0x30,
+    0x2C, 0x2D, 0x2F, 0x30,
+    0x2C, 0x2D, 0x2E, 0x30,
+    0x2C, 0x2E, 0x2F, 0x31,
+    0x2C, 0x2E, 0x30, 0x31,
+    0x2C, 0x2D, 0x30, 0x31,
+    0x2C, 0x2D, 0x2F, 0x31,
+    0x2C, 0x2E, 0x30, 0x32,
+    0x2C, 0x2F, 0x31, 0x32,
+    0x2C, 0x2D, 0x31, 0x32,
+    0x2C, 0x2D, 0x2F, 0x32,
+    0x2C, 0x32, 0x32, 0x32,
+    0x2C, 0x2C, 0x32, 0x32,
+    0x2C, 0x2C, 0x2C, 0x32,
+    0x2C, 0x2F, 0x30, 0x33,
+    0x2C, 0x2F, 0x32, 0x33,
+    0x2C, 0x2D, 0x32, 0x33,
+    0x2C, 0x2D, 0x30, 0x33,
+    0x2C, 0x2F, 0x31, 0x34,
+    0x2C, 0x30, 0x32, 0x34,
+    0x2C, 0x2E, 0x32, 0x34,
+    0x2C, 0x2E, 0x30, 0x34,
+    0x2C, 0x34, 0x34, 0x34,
+    0x2C, 0x2C, 0x34, 0x34,
+    0x2C, 0x2C, 0x2C, 0x34,
+    0x2C, 0x30, 0x33, 0x37,
+    0x2C, 0x31, 0x35, 0x37,
+    0x2C, 0x2E, 0x35, 0x37,
+    0x2C, 0x2E, 0x32, 0x37,
+    0x2C, 0x37, 0x37, 0x37,
+    0x2C, 0x2C, 0x37, 0x37,
+    0x2C, 0x2C, 0x2C, 0x37,
+    0x2C, 0x31, 0x35, 0x3A,
+    0x2C, 0x33, 0x37, 0x3A,
+    0x2C, 0x2F, 0x37, 0x3A,
+    0x2C, 0x2F, 0x33, 0x3A,
+    0x2C, 0x3A, 0x3A, 0x3A,
+    0x2C, 0x2C, 0x3A, 0x3A,
+    0x2C, 0x2C, 0x2C, 0x3A,
+    0x2C, 0x32, 0x37, 0x3D,
+    0x2C, 0x34, 0x39, 0x3D,
+    0x2C, 0x30, 0x39, 0x3D,
+    0x2C, 0x30, 0x35, 0x3D,
+    0x2C, 0x3D, 0x3D, 0x3D,
+    0x2C, 0x2C, 0x3D, 0x3D,
+    0x2C, 0x2C, 0x2C, 0x3D,
+    0x2D, 0x2E, 0x2E, 0x2F,
+    0x2D, 0x2E, 0x2F, 0x30,
+    0x2D, 0x2F, 0x30, 0x31,
+    0x2D, 0x2E, 0x30, 0x31,
+    0x2D, 0x2E, 0x2F, 0x31,
+    0x2D, 0x2F, 0x30, 0x32,
+    0x2D, 0x2F, 0x31, 0x32,
+    0x2D, 0x2E, 0x31, 0x32,
+    0x2D, 0x2E, 0x30, 0x32,
+    0x2D, 0x2F, 0x31, 0x33,
+    0x2D, 0x30, 0x32, 0x33,
+    0x2D, 0x2E, 0x32, 0x33,
+    0x2D, 0x2E, 0x30, 0x33,
+    0x2D, 0x33, 0x33, 0x33,
+    0x2D, 0x2D, 0x33, 0x33,
+    0x2D, 0x2D, 0x2D, 0x33,
+    0x2D, 0x30, 0x31, 0x34,
+    0x2D, 0x30, 0x33, 0x34,
+    0x2D, 0x2E, 0x33, 0x34,
+    0x2D, 0x2E, 0x31, 0x34,
+    0x2D, 0x30, 0x32, 0x35,
+    0x2D, 0x31, 0x33, 0x35,
+    0x2D, 0x2F, 0x33, 0x35,
+    0x2D, 0x2F, 0x31, 0x35,
+    0x2D, 0x35, 0x35, 0x35,
+    0x2D, 0x2D, 0x35, 0x35,
+    0x2D, 0x2D, 0x2D, 0x35,
+    0x2D, 0x31, 0x34, 0x38,
+    0x2D, 0x32, 0x36, 0x38,
+    0x2D, 0x2F, 0x36, 0x38,
+    0x2D, 0x2F, 0x33, 0x38,
+    0x2D, 0x38, 0x38, 0x38,
+    0x2D, 0x2D, 0x38, 0x38,
+    0x2D, 0x2D, 0x2D, 0x38,
+    0x2D, 0x32, 0x36, 0x3B,
+    0x2D, 0x34, 0x38, 0x3B,
+    0x2D, 0x30, 0x38, 0x3B,
+    0x2D, 0x30, 0x34, 0x3B,
+    0x2D, 0x3B, 0x3B, 0x3B,
+    0x2D, 0x2D, 0x3B, 0x3B,
+    0x2D, 0x2D, 0x2D, 0x3B,
+    0x2D, 0x33, 0x38, 0x3E,
+    0x2D, 0x35, 0x3A, 0x3E,
+    0x2D, 0x31, 0x3A, 0x3E,
+    0x2D, 0x31, 0x36, 0x3E,
+    0x2D, 0x3E, 0x3E, 0x3E,
+    0x2D, 0x2D, 0x3E, 0x3E,
+    0x2D, 0x2D, 0x2D, 0x3E,
+    0x2E, 0x2F, 0x2F, 0x30,
+    0x2E, 0x2F, 0x30, 0x31,
+    0x2E, 0x30, 0x31, 0x32,
+    0x2E, 0x2F, 0x31, 0x32,
+    0x2E, 0x2F, 0x30, 0x32,
+    0x2E, 0x30, 0x31, 0x33,
+    0x2E, 0x30, 0x32, 0x33,
+    0x2E, 0x2F, 0x32, 0x33,
+    0x2E, 0x2F, 0x31, 0x33,
+    0x2E, 0x30, 0x32, 0x34,
+    0x2E, 0x31, 0x33, 0x34,
+    0x2E, 0x2F, 0x33, 0x34,
+    0x2E, 0x2F, 0x31, 0x34,
+    0x2E, 0x34, 0x34, 0x34,
+    0x2E, 0x2E, 0x34, 0x34,
+    0x2E, 0x2E, 0x2E, 0x34,
+    0x2E, 0x31, 0x32, 0x35,
+    0x2E, 0x31, 0x34, 0x35,
+    0x2E, 0x2F, 0x34, 0x35,
+    0x2E, 0x2F, 0x32, 0x35,
+    0x2E, 0x31, 0x33, 0x36,
+    0x2E, 0x32, 0x34, 0x36,
+    0x2E, 0x30, 0x34, 0x36,
+    0x2E, 0x30, 0x32, 0x36,
+    0x2E, 0x36, 0x36, 0x36,
+    0x2E, 0x2E, 0x36, 0x36,
+    0x2E, 0x2E, 0x2E, 0x36,
+    0x2E, 0x32, 0x35, 0x39,
+    0x2E, 0x33, 0x37, 0x39,
+    0x2E, 0x30, 0x37, 0x39,
+    0x2E, 0x30, 0x34, 0x39,
+    0x2E, 0x39, 0x39, 0x39,
+    0x2E, 0x2E, 0x39, 0x39,
+    0x2E, 0x2E, 0x2E, 0x39,
+    0x2E, 0x33, 0x37, 0x3C,
+    0x2E, 0x35, 0x39, 0x3C,
+    0x2E, 0x31, 0x39, 0x3C,
+    0x2E, 0x31, 0x35, 0x3C,
+    0x2E, 0x3C, 0x3C, 0x3C,
+    0x2E, 0x2E, 0x3C, 0x3C,
+    0x2E, 0x2E, 0x2E, 0x3C,
+    0x2E, 0x34, 0x39, 0x3F,
+    0x2E, 0x36, 0x3B, 0x3F,
+    0x2E, 0x32, 0x3B, 0x3F,
+    0x2E, 0x32, 0x37, 0x3F,
+    0x2E, 0x3F, 0x3F, 0x3F,
+    0x2E, 0x2E, 0x3F, 0x3F,
+    0x2E, 0x2E, 0x2E, 0x3F,
+    0x2F, 0x30, 0x30, 0x31,
+    0x2F, 0x30, 0x31, 0x32,
+    0x2F, 0x31, 0x32, 0x33,
+    0x2F, 0x30, 0x32, 0x33,
+    0x2F, 0x30, 0x31, 0x33,
+    0x2F, 0x31, 0x32, 0x34,
+    0x2F, 0x31, 0x33, 0x34,
+    0x2F, 0x30, 0x33, 0x34,
+    0x2F, 0x30, 0x32, 0x34,
+    0x2F, 0x31, 0x33, 0x35,
+    0x2F, 0x32, 0x34, 0x35,
+    0x2F, 0x30, 0x34, 0x35,
+    0x2F, 0x30, 0x32, 0x35,
+    0x2F, 0x35, 0x35, 0x35,
+    0x2F, 0x2F, 0x35, 0x35,
+    0x2F, 0x2F, 0x2F, 0x35,
+    0x2F, 0x32, 0x33, 0x36,
+    0x2F, 0x32, 0x35, 0x36,
+    0x2F, 0x30, 0x35, 0x36,
+    0x2F, 0x30, 0x33, 0x36,
+    0x2F, 0x32, 0x34, 0x37,
+    0x2F, 0x33, 0x35, 0x37,
+    0x2F, 0x31, 0x35, 0x37,
+    0x2F, 0x31, 0x33, 0x37,
+    0x2F, 0x37, 0x37, 0x37,
+    0x2F, 0x2F, 0x37, 0x37,
+    0x2F, 0x2F, 0x2F, 0x37,
+    0x2F, 0x33, 0x36, 0x3A,
+    0x2F, 0x34, 0x38, 0x3A,
+    0x2F, 0x31, 0x38, 0x3A,
+    0x2F, 0x31, 0x35, 0x3A,
+    0x2F, 0x3A, 0x3A, 0x3A,
+    0x2F, 0x2F, 0x3A, 0x3A,
+    0x2F, 0x2F, 0x2F, 0x3A,
+    0x2F, 0x34, 0x38, 0x3D,
+    0x2F, 0x36, 0x3A, 0x3D,
+    0x2F, 0x32, 0x3A, 0x3D,
+    0x2F, 0x32, 0x36, 0x3D,
+    0x2F, 0x3D, 0x3D, 0x3D,
+    0x2F, 0x2F, 0x3D, 0x3D,
+    0x2F, 0x2F, 0x2F, 0x3D,
+    0x30, 0x31, 0x31, 0x32,
+    0x30, 0x31, 0x32, 0x33,
+    0x30, 0x32, 0x33, 0x34,
+    0x30, 0x31, 0x33, 0x34,
+    0x30, 0x31, 0x32, 0x34,
+    0x30, 0x32, 0x33, 0x35,
+    0x30, 0x32, 0x34, 0x35,
+    0x30, 0x31, 0x34, 0x35,
+    0x30, 0x31, 0x33, 0x35,
+    0x30, 0x32, 0x34, 0x36,
+    0x30, 0x33, 0x35, 0x36,
+    0x30, 0x31, 0x35, 0x36,
+    0x30, 0x31, 0x33, 0x36,
+    0x30, 0x36, 0x36, 0x36,
+    0x30, 0x30, 0x36, 0x36,
+    0x30, 0x30, 0x30, 0x36,
+    0x30, 0x33, 0x34, 0x37,
+    0x30, 0x33, 0x36, 0x37,
+    0x30, 0x31, 0x36, 0x37,
+    0x30, 0x31, 0x34, 0x37,
+    0x30, 0x33, 0x35, 0x38,
+    0x30, 0x34, 0x36, 0x38,
+    0x30, 0x32, 0x36, 0x38,
+    0x30, 0x32, 0x34, 0x38,
+    0x30, 0x38, 0x38, 0x38,
+    0x30, 0x30, 0x38, 0x38,
+    0x30, 0x30, 0x30, 0x38,
+    0x30, 0x34, 0x37, 0x3B,
+    0x30, 0x35, 0x39, 0x3B,
+    0x30, 0x32, 0x39, 0x3B,
+    0x30, 0x32, 0x36, 0x3B,
+    0x30, 0x3B, 0x3B, 0x3B,
+    0x30, 0x30, 0x3B, 0x3B,
+    0x30, 0x30, 0x30, 0x3B,
+    0x30, 0x35, 0x39, 0x3E,
+    0x30, 0x37, 0x3B, 0x3E,
+    0x30, 0x33, 0x3B, 0x3E,
+    0x30, 0x33, 0x37, 0x3E,
+    0x30, 0x3E, 0x3E, 0x3E,
+    0x30, 0x30, 0x3E, 0x3E,
+    0x30, 0x30, 0x30, 0x3E,
+    0x31, 0x32, 0x32, 0x33,
+    0x31, 0x32, 0x33, 0x34,
+    0x31, 0x33, 0x34, 0x35,
+    0x31, 0x32, 0x34, 0x35,
+    0x31, 0x32, 0x33, 0x35,
+    0x31, 0x33, 0x34, 0x36,
+    0x31, 0x33, 0x35, 0x36,
+    0x31, 0x32, 0x35, 0x36,
+    0x31, 0x32, 0x34, 0x36,
+    0x31, 0x33, 0x35, 0x37,
+    0x31, 0x34, 0x36, 0x37,
+    0x31, 0x32, 0x36, 0x37,
+    0x31, 0x32, 0x34, 0x37,
+    0x31, 0x37, 0x37, 0x37,
+    0x31, 0x31, 0x37, 0x37,
+    0x31, 0x31, 0x31, 0x37,
+    0x31, 0x34, 0x35, 0x38,
+    0x31, 0x34, 0x37, 0x38,
+    0x31, 0x32, 0x37, 0x38,
+    0x31, 0x32, 0x35, 0x38,
+    0x31, 0x34, 0x36, 0x39,
+    0x31, 0x35, 0x37, 0x39,
+    0x31, 0x33, 0x37, 0x39,
+    0x31, 0x33, 0x35, 0x39,
+    0x31, 0x39, 0x39, 0x39,
+    0x31, 0x31, 0x39, 0x39,
+    0x31, 0x31, 0x31, 0x39,
+    0x31, 0x35, 0x38, 0x3C,
+    0x31, 0x36, 0x3A, 0x3C,
+    0x31, 0x33, 0x3A, 0x3C,
+    0x31, 0x33, 0x37, 0x3C,
+    0x31, 0x3C, 0x3C, 0x3C,
+    0x31, 0x31, 0x3C, 0x3C,
+    0x31, 0x31, 0x31, 0x3C,
+    0x31, 0x36, 0x3A, 0x3F,
+    0x31, 0x38, 0x3C, 0x3F,
+    0x31, 0x34, 0x3C, 0x3F,
+    0x31, 0x34, 0x38, 0x3F,
+    0x31, 0x3F, 0x3F, 0x3F,
+    0x31, 0x31, 0x3F, 0x3F,
+    0x31, 0x31, 0x31, 0x3F,
+    0x32, 0x33, 0x33, 0x34,
+    0x32, 0x33, 0x34, 0x35,
+    0x32, 0x34, 0x35, 0x36,
+    0x32, 0x33, 0x35, 0x36,
+    0x32, 0x33, 0x34, 0x36,
+    0x32, 0x34, 0x35, 0x37,
+    0x32, 0x34, 0x36, 0x37,
+    0x32, 0x33, 0x36, 0x37,
+    0x32, 0x33, 0x35, 0x37,
+    0x32, 0x34, 0x36, 0x38,
+    0x32, 0x35, 0x37, 0x38,
+    0x32, 0x33, 0x37, 0x38,
+    0x32, 0x33, 0x35, 0x38,
+    0x32, 0x38, 0x38, 0x38,
+    0x32, 0x32, 0x38, 0x38,
+    0x32, 0x32, 0x32, 0x38,
+    0x32, 0x35, 0x36, 0x39,
+    0x32, 0x35, 0x38, 0x39,
+    0x32, 0x33, 0x38, 0x39,
+    0x32, 0x33, 0x36, 0x39,
+    0x32, 0x35, 0x37, 0x3A,
+    0x32, 0x36, 0x38, 0x3A,
+    0x32, 0x34, 0x38, 0x3A,
+    0x32, 0x34, 0x36, 0x3A,
+    0x32, 0x3A, 0x3A, 0x3A,
+    0x32, 0x32, 0x3A, 0x3A,
+    0x32, 0x32, 0x32, 0x3A,
+    0x32, 0x36, 0x39, 0x3D,
+    0x32, 0x37, 0x3B, 0x3D,
+    0x32, 0x34, 0x3B, 0x3D,
+    0x32, 0x34, 0x38, 0x3D,
+    0x32, 0x3D, 0x3D, 0x3D,
+    0x32, 0x32, 0x3D, 0x3D,
+    0x32, 0x32, 0x32, 0x3D,
+    0x33, 0x34, 0x34, 0x35,
+    0x33, 0x34, 0x35, 0x36,
+    0x33, 0x35, 0x36, 0x37,
+    0x33, 0x34, 0x36, 0x37,
+    0x33, 0x34, 0x35, 0x37,
+    0x33, 0x35, 0x36, 0x38,
+    0x33, 0x35, 0x37, 0x38,
+    0x33, 0x34, 0x37, 0x38,
+    0x33, 0x34, 0x36, 0x38,
+    0x33, 0x35, 0x37, 0x39,
+    0x33, 0x36, 0x38, 0x39,
+    0x33, 0x34, 0x38, 0x39,
+    0x33, 0x34, 0x36, 0x39,
+    0x33, 0x39, 0x39, 0x39,
+    0x33, 0x33, 0x39, 0x39,
+    0x33, 0x33, 0x33, 0x39,
+    0x33, 0x36, 0x37, 0x3A,
+    0x33, 0x36, 0x39, 0x3A,
+    0x33, 0x34, 0x39, 0x3A,
+    0x33, 0x34, 0x37, 0x3A,
+    0x33, 0x36, 0x38, 0x3B,
+    0x33, 0x37, 0x39, 0x3B,
+    0x33, 0x35, 0x39, 0x3B,
+    0x33, 0x35, 0x37, 0x3B,
+    0x33, 0x3B, 0x3B, 0x3B,
+    0x33, 0x33, 0x3B, 0x3B,
+    0x33, 0x33, 0x33, 0x3B,
+    0x33, 0x37, 0x3A, 0x3E,
+    0x33, 0x38, 0x3C, 0x3E,
+    0x33, 0x35, 0x3C, 0x3E,
+    0x33, 0x35, 0x39, 0x3E,
+    0x33, 0x3E, 0x3E, 0x3E,
+    0x33, 0x33, 0x3E, 0x3E,
+    0x33, 0x33, 0x33, 0x3E,
+    0x34, 0x35, 0x35, 0x36,
+    0x34, 0x35, 0x36, 0x37,
+    0x34, 0x36, 0x37, 0x38,
+    0x34, 0x35, 0x37, 0x38,
+    0x34, 0x35, 0x36, 0x38,
+    0x34, 0x36, 0x37, 0x39,
+    0x34, 0x36, 0x38, 0x39,
+    0x34, 0x35, 0x38, 0x39,
+    0x34, 0x35, 0x37, 0x39,
+    0x34, 0x36, 0x38, 0x3A,
+    0x34, 0x37, 0x39, 0x3A,
+    0x34, 0x35, 0x39, 0x3A,
+    0x34, 0x35, 0x37, 0x3A,
+    0x34, 0x3A, 0x3A, 0x3A,
+    0x34, 0x34, 0x3A, 0x3A,
+    0x34, 0x34, 0x34, 0x3A,
+    0x34, 0x37, 0x38, 0x3B,
+    0x34, 0x37, 0x3A, 0x3B,
+    0x34, 0x35, 0x3A, 0x3B,
+    0x34, 0x35, 0x38, 0x3B,
+    0x34, 0x37, 0x39, 0x3C,
+    0x34, 0x38, 0x3A, 0x3C,
+    0x34, 0x36, 0x3A, 0x3C,
+    0x34, 0x36, 0x38, 0x3C,
+    0x34, 0x3C, 0x3C, 0x3C,
+    0x34, 0x34, 0x3C, 0x3C,
+    0x34, 0x34, 0x34, 0x3C,
+    0x34, 0x38, 0x3B, 0x3F,
+    0x34, 0x39, 0x3D, 0x3F,
+    0x34, 0x36, 0x3D, 0x3F,
+    0x34, 0x36, 0x3A, 0x3F,
+    0x34, 0x3F, 0x3F, 0x3F,
+    0x34, 0x34, 0x3F, 0x3F,
+    0x34, 0x34, 0x34, 0x3F,
+    0x35, 0x36, 0x36, 0x37,
+    0x35, 0x36, 0x37, 0x38,
+    0x35, 0x37, 0x38, 0x39,
+    0x35, 0x36, 0x38, 0x39,
+    0x35, 0x36, 0x37, 0x39,
+    0x35, 0x37, 0x38, 0x3A,
+    0x35, 0x37, 0x39, 0x3A,
+    0x35, 0x36, 0x39, 0x3A,
+    0x35, 0x36, 0x38, 0x3A,
+    0x35, 0x37, 0x39, 0x3B,
+    0x35, 0x38, 0x3A, 0x3B,
+    0x35, 0x36, 0x3A, 0x3B,
+    0x35, 0x36, 0x38, 0x3B,
+    0x35, 0x3B, 0x3B, 0x3B,
+    0x35, 0x35, 0x3B, 0x3B,
+    0x35, 0x35, 0x35, 0x3B,
+    0x35, 0x38, 0x39, 0x3C,
+    0x35, 0x38, 0x3B, 0x3C,
+    0x35, 0x36, 0x3B, 0x3C,
+    0x35, 0x36, 0x39, 0x3C,
+    0x35, 0x38, 0x3A, 0x3D,
+    0x35, 0x39, 0x3B, 0x3D,
+    0x35, 0x37, 0x3B, 0x3D,
+    0x35, 0x37, 0x39, 0x3D,
+    0x35, 0x3D, 0x3D, 0x3D,
+    0x35, 0x35, 0x3D, 0x3D,
+    0x35, 0x35, 0x35, 0x3D,
+    0x36, 0x37, 0x37, 0x38,
+    0x36, 0x37, 0x38, 0x39,
+    0x36, 0x38, 0x39, 0x3A,
+    0x36, 0x37, 0x39, 0x3A,
+    0x36, 0x37, 0x38, 0x3A,
+    0x36, 0x38, 0x39, 0x3B,
+    0x36, 0x38, 0x3A, 0x3B,
+    0x36, 0x37, 0x3A, 0x3B,
+    0x36, 0x37, 0x39, 0x3B,
+    0x36, 0x38, 0x3A, 0x3C,
+    0x36, 0x39, 0x3B, 0x3C,
+    0x36, 0x37, 0x3B, 0x3C,
+    0x36, 0x37, 0x39, 0x3C,
+    0x36, 0x3C, 0x3C, 0x3C,
+    0x36, 0x36, 0x3C, 0x3C,
+    0x36, 0x36, 0x36, 0x3C,
+    0x36, 0x39, 0x3A, 0x3D,
+    0x36, 0x39, 0x3C, 0x3D,
+    0x36, 0x37, 0x3C, 0x3D,
+    0x36, 0x37, 0x3A, 0x3D,
+    0x36, 0x39, 0x3B, 0x3E,
+    0x36, 0x3A, 0x3C, 0x3E,
+    0x36, 0x38, 0x3C, 0x3E,
+    0x36, 0x38, 0x3A, 0x3E,
+    0x36, 0x3E, 0x3E, 0x3E,
+    0x36, 0x36, 0x3E, 0x3E,
+    0x36, 0x36, 0x36, 0x3E,
+    0x37, 0x38, 0x38, 0x39,
+    0x37, 0x38, 0x39, 0x3A,
+    0x37, 0x39, 0x3A, 0x3B,
+    0x37, 0x38, 0x3A, 0x3B,
+    0x37, 0x38, 0x39, 0x3B,
+    0x37, 0x39, 0x3A, 0x3C,
+    0x37, 0x39, 0x3B, 0x3C,
+    0x37, 0x38, 0x3B, 0x3C,
+    0x37, 0x38, 0x3A, 0x3C,
+    0x37, 0x39, 0x3B, 0x3D,
+    0x37, 0x3A, 0x3C, 0x3D,
+    0x37, 0x38, 0x3C, 0x3D,
+    0x37, 0x38, 0x3A, 0x3D,
+    0x37, 0x3D, 0x3D, 0x3D,
+    0x37, 0x37, 0x3D, 0x3D,
+    0x37, 0x37, 0x37, 0x3D,
+    0x37, 0x3A, 0x3B, 0x3E,
+    0x37, 0x3A, 0x3D, 0x3E,
+    0x37, 0x38, 0x3D, 0x3E,
+    0x37, 0x38, 0x3B, 0x3E,
+    0x37, 0x3A, 0x3C, 0x3F,
+    0x37, 0x3B, 0x3D, 0x3F,
+    0x37, 0x39, 0x3D, 0x3F,
+    0x37, 0x39, 0x3B, 0x3F,
+    0x37, 0x3F, 0x3F, 0x3F,
+    0x37, 0x37, 0x3F, 0x3F,
+    0x37, 0x37, 0x37, 0x3F,
+    0x38, 0x39, 0x39, 0x3A,
+    0x38, 0x39, 0x3A, 0x3B,
+    0x38, 0x3A, 0x3B, 0x3C,
+    0x38, 0x39, 0x3B, 0x3C,
+    0x38, 0x39, 0x3A, 0x3C,
+    0x38, 0x3A, 0x3B, 0x3D,
+    0x38, 0x3A, 0x3C, 0x3D,
+    0x38, 0x39, 0x3C, 0x3D,
+    0x38, 0x39, 0x3B, 0x3D,
+    0x38, 0x3A, 0x3C, 0x3E,
+    0x38, 0x3B, 0x3D, 0x3E,
+    0x38, 0x39, 0x3D, 0x3E,
+    0x38, 0x39, 0x3B, 0x3E,
+    0x38, 0x3E, 0x3E, 0x3E,
+    0x38, 0x38, 0x3E, 0x3E,
+    0x38, 0x38, 0x38, 0x3E,
+    0x38, 0x3B, 0x3C, 0x3F,
+    0x38, 0x3B, 0x3E, 0x3F,
+    0x38, 0x39, 0x3E, 0x3F,
+    0x38, 0x39, 0x3C, 0x3F,
+    0x39, 0x3A, 0x3A, 0x3B,
+    0x39, 0x3A, 0x3B, 0x3C,
+    0x39, 0x3B, 0x3C, 0x3D,
+    0x39, 0x3A, 0x3C, 0x3D,
+    0x39, 0x3A, 0x3B, 0x3D,
+    0x39, 0x3B, 0x3C, 0x3E,
+    0x39, 0x3B, 0x3D, 0x3E,
+    0x39, 0x3A, 0x3D, 0x3E,
+    0x39, 0x3A, 0x3C, 0x3E,
+    0x39, 0x3B, 0x3D, 0x3F,
+    0x39, 0x3C, 0x3E, 0x3F,
+    0x39, 0x3A, 0x3E, 0x3F,
+    0x39, 0x3A, 0x3C, 0x3F,
+    0x39, 0x3F, 0x3F, 0x3F,
+    0x39, 0x39, 0x3F, 0x3F,
+    0x39, 0x39, 0x39, 0x3F,
+    0x3A, 0x3B, 0x3B, 0x3C,
+    0x3A, 0x3B, 0x3C, 0x3D,
+    0x3A, 0x3C, 0x3D, 0x3E,
+    0x3A, 0x3B, 0x3D, 0x3E,
+    0x3A, 0x3B, 0x3C, 0x3E,
+    0x3A, 0x3C, 0x3D, 0x3F,
+    0x3A, 0x3C, 0x3E, 0x3F,
+    0x3A, 0x3B, 0x3E, 0x3F,
+    0x3A, 0x3B, 0x3D, 0x3F,
+    0x3B, 0x3C, 0x3C, 0x3D,
+    0x3B, 0x3C, 0x3D, 0x3E,
+    0x3B, 0x3D, 0x3E, 0x3F,
+    0x3B, 0x3C, 0x3E, 0x3F,
+    0x3B, 0x3C, 0x3D, 0x3F,
+    0x3C, 0x3D, 0x3D, 0x3E,
+    0x3C, 0x3D, 0x3E, 0x3F,
+    0x3D, 0x3E, 0x3E, 0x3F
+};
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index a45d57de0..afff2f98a 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -31,7 +31,25 @@
 #include <stdarg.h>
 #include <limits.h>
 
-static void avcodec_default_free_buffers(AVCodecContext *s);
+const uint8_t ff_sqrt_tab[128]={ /* lookup table: ff_sqrt_tab[i] == floor(sqrt(i)) for i in 0..127 */
+        0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+        9, 9, 9, 9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11
+};
+
+const uint8_t ff_log2_tab[256]={ /* lookup table: ff_log2_tab[i] == floor(log2(i)) for i in 1..255; entry 0 is stored as 0 */
+        0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+        5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+void avcodec_default_free_buffers(AVCodecContext *s);
 
 void *av_mallocz(unsigned int size)
 {
@@ -64,7 +82,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size)
     if(min_size < *size) 
         return ptr;
     
-    *size= 17*min_size/16 + 32;
+    *size= FFMAX(17*min_size/16 + 32, min_size);
 
     return av_realloc(ptr, *size);
 }
@@ -83,6 +101,8 @@ void *av_mallocz_static(unsigned int size)
 
     if(ptr){ 
         array_static =av_fast_realloc(array_static, &allocated_static, sizeof(void*)*(last_static+1));
+        if(!array_static)
+            return NULL;
         array_static[last_static++] = ptr;
     }
 
@@ -90,6 +110,26 @@ void *av_mallocz_static(unsigned int size)
 }
 
 /**
+ * same as above, but does realloc
+ */
+
+void *av_realloc_static(void *ptr, unsigned int size)
+{ /* resizes a block tracked in array_static; returns the new block, or NULL on failure or when ptr is not tracked */
+    int i;
+    if(!ptr)
+      return av_mallocz_static(size); /* no old block: behaves like a fresh static allocation */
+    /* Look for the old ptr */
+    for(i = 0; i < last_static; i++) {
+        if(array_static[i] == ptr) {
+            void *tmp = av_realloc(ptr, size); /* never assign straight into the slot: a NULL result would drop the old block */
+            if(tmp || !size) array_static[i] = tmp; /* failed resize (NULL, size != 0): old block stays registered for av_free_static() */
+            return tmp;
+        }
+    }
+    return NULL; /* ptr was not allocated through the static registry */
+}
+
+/**
  * free all static arrays and reset pointers to 0.
  */
 void av_free_static(void)
@@ -112,7 +152,7 @@ void av_freep(void *arg)
 }
 
 /* encoder management */
-AVCodec *first_avcodec;
+AVCodec *first_avcodec = NULL;
 
 void register_avcodec(AVCodec *format)
 {
@@ -123,6 +163,13 @@ void register_avcodec(AVCodec *format)
     format->next = NULL;
 }
 
+void avcodec_set_dimensions(AVCodecContext *s, int width, int height){ /* stores the coded size and derives the lowres-scaled width/height */
+    s->coded_width = width;
+    s->coded_height= height;
+    s->width = -((-width )>>s->lowres); /* negate, shift, negate: with an arithmetic right shift this is ceil(width / 2^lowres) */
+    s->height= -((-height)>>s->lowres); /* same rounding-up division for the height */
+}
+
 typedef struct InternalBuffer{
     int last_pic_num;
     uint8_t *base[4];
@@ -141,6 +188,7 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){
     switch(s->pix_fmt){
     case PIX_FMT_YUV420P:
     case PIX_FMT_YUV422:
+    case PIX_FMT_UYVY422:
     case PIX_FMT_YUV422P:
     case PIX_FMT_YUV444P:
     case PIX_FMT_GRAY8:
@@ -151,6 +199,7 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){
         h_align= 16;
         break;
     case PIX_FMT_YUV411P:
+    case PIX_FMT_UYVY411:
         w_align=32;
         h_align=8;
         break;
@@ -159,6 +208,22 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){
             w_align=64;
             h_align=64;
         }
+    case PIX_FMT_RGB555:
+        if(s->codec_id == CODEC_ID_RPZA){
+            w_align=4;
+            h_align=4;
+        }
+    case PIX_FMT_PAL8:
+        if(s->codec_id == CODEC_ID_SMC){
+            w_align=4;
+            h_align=4;
+        }
+        break;
+    case PIX_FMT_BGR24:
+        if((s->codec_id == CODEC_ID_MSZH) || (s->codec_id == CODEC_ID_ZLIB)){
+            w_align=4;
+            h_align=4;
+        }
         break;
     default:
         w_align= 1;
@@ -170,16 +235,27 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){
     *height= ALIGN(*height, h_align);
 }
 
+int avcodec_check_dimensions(void *av_log_ctx, unsigned int w, unsigned int h){ /* returns 0 if w x h is acceptable, else logs an error and returns -1 */
+    if((int)w>0 && (int)h>0 && (w+128)*(uint64_t)(h+128) < INT_MAX/4) /* both positive as int; padded area, multiplied in 64 bits, below INT_MAX/4 */
+        return 0;
+    
+    av_log(av_log_ctx, AV_LOG_ERROR, "picture size invalid (%ux%u)\n", w, h);
+    return -1;
+}
+
 int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
     int i;
     int w= s->width;
     int h= s->height;
     InternalBuffer *buf;
     int *picture_number;
-    
+
     assert(pic->data[0]==NULL);
     assert(INTERNAL_BUFFER_SIZE > s->internal_buffer_count);
 
+    if(avcodec_check_dimensions(s,w,h))
+        return -1;
+
     if(s->internal_buffer==NULL){
         s->internal_buffer= av_mallocz(INTERNAL_BUFFER_SIZE*sizeof(InternalBuffer));
     }
@@ -200,7 +276,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
         buf->last_pic_num= *picture_number;
     }else{
         int h_chroma_shift, v_chroma_shift;
-        int s_align, pixel_size;
+        int pixel_size;
         
         avcodec_get_chroma_sub_sample(s->pix_fmt, &h_chroma_shift, &v_chroma_shift);
         
@@ -208,6 +284,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
         case PIX_FMT_RGB555:
         case PIX_FMT_RGB565:
         case PIX_FMT_YUV422:
+        case PIX_FMT_UYVY422:
             pixel_size=2;
             break;
         case PIX_FMT_RGB24:
@@ -222,11 +299,6 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
         }
 
         avcodec_align_dimensions(s, &w, &h);
-#if defined(ARCH_POWERPC) || defined(HAVE_MMI) //FIXME some cleaner check
-        s_align= 16;
-#else
-        s_align= 8;
-#endif
             
         if(!(s->flags&CODEC_FLAG_EMU_EDGE)){
             w+= EDGE_WIDTH*2;
@@ -240,16 +312,16 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
             const int v_shift= i==0 ? 0 : v_chroma_shift;
 
             //FIXME next ensures that linesize= 2^x uvlinesize, thats needed because some MC code assumes it
-            buf->linesize[i]= ALIGN(pixel_size*w>>h_shift, s_align<<(h_chroma_shift-h_shift)); 
+            buf->linesize[i]= ALIGN(pixel_size*w>>h_shift, STRIDE_ALIGN<<(h_chroma_shift-h_shift)); 
 
-            buf->base[i]= av_mallocz((buf->linesize[i]*h>>v_shift)+16); //FIXME 16
+            buf->base[i]= av_malloc((buf->linesize[i]*h>>v_shift)+16); //FIXME 16
             if(buf->base[i]==NULL) return -1;
             memset(buf->base[i], 128, buf->linesize[i]*h>>v_shift);
         
             if(s->flags&CODEC_FLAG_EMU_EDGE)
                 buf->data[i] = buf->base[i];
             else
-                buf->data[i] = buf->base[i] + ALIGN((buf->linesize[i]*EDGE_WIDTH>>v_shift) + (EDGE_WIDTH>>h_shift), s_align);
+                buf->data[i] = buf->base[i] + ALIGN((buf->linesize[i]*EDGE_WIDTH>>v_shift) + (EDGE_WIDTH>>h_shift), STRIDE_ALIGN);
         }
         pic->age= 256*256*256*64;
     }
@@ -358,8 +430,8 @@ void avcodec_get_context_defaults(AVCodecContext *s){
     s->bit_rate_tolerance= s->bit_rate*10;
     s->qmin= 2;
     s->qmax= 31;
-    s->mb_qmin= 2;
-    s->mb_qmax= 31;
+    s->mb_lmin= FF_QP2LAMBDA * 2;
+    s->mb_lmax= FF_QP2LAMBDA * 31;
     s->rc_eq= "tex^qComp";
     s->qcompress= 0.5;
     s->max_qdiff= 3;
@@ -384,6 +456,9 @@ void avcodec_get_context_defaults(AVCodecContext *s){
     s->lmax= FF_QP2LAMBDA * s->qmax;
     s->sample_aspect_ratio= (AVRational){0,1};
     s->ildct_cmp= FF_CMP_VSAD;
+    s->profile= FF_PROFILE_UNKNOWN;
+    s->level= FF_LEVEL_UNKNOWN;
+    s->me_penalty_compensation= 256;
     
     s->intra_quant_bias= FF_DEFAULT_QUANT_BIAS;
     s->inter_quant_bias= FF_DEFAULT_QUANT_BIAS;
@@ -409,6 +484,7 @@ void avcodec_get_frame_defaults(AVFrame *pic){
     memset(pic, 0, sizeof(AVFrame));
 
     pic->pts= AV_NOPTS_VALUE;
+    pic->key_frame= 1;
 }
 
 /**
@@ -442,6 +518,17 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec)
     } else {
         avctx->priv_data = NULL;
     }
+
+    if(avctx->coded_width && avctx->coded_height)
+        avcodec_set_dimensions(avctx, avctx->coded_width, avctx->coded_height);
+    else if(avctx->width && avctx->height)
+        avcodec_set_dimensions(avctx, avctx->width, avctx->height);
+
+    if((avctx->coded_width||avctx->coded_height) && avcodec_check_dimensions(avctx,avctx->coded_width,avctx->coded_height)){
+        av_freep(&avctx->priv_data);
+        return -1;
+    }
+
     ret = avctx->codec->init(avctx);
     if (ret < 0) {
         av_freep(&avctx->priv_data);
@@ -453,24 +540,35 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec)
 int avcodec_encode_audio(AVCodecContext *avctx, uint8_t *buf, int buf_size, 
                          const short *samples)
 {
-    int ret;
-
-    ret = avctx->codec->encode(avctx, buf, buf_size, (void *)samples);
-    avctx->frame_number++;
-    return ret;
+    if(buf_size < FF_MIN_BUFFER_SIZE && 0){
+        av_log(avctx, AV_LOG_ERROR, "buffer smaller then minimum size\n");
+        return -1;
+    }
+    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || samples){
+        int ret = avctx->codec->encode(avctx, buf, buf_size, (void *)samples);
+        avctx->frame_number++;
+        return ret;
+    }else
+        return 0;
 }
 
 int avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf, int buf_size, 
                          const AVFrame *pict)
 {
-    int ret;
-
-    ret = avctx->codec->encode(avctx, buf, buf_size, (void *)pict);
+    if(buf_size < FF_MIN_BUFFER_SIZE){
+        av_log(avctx, AV_LOG_ERROR, "buffer smaller then minimum size\n");
+        return -1;
+    }
+    if(avcodec_check_dimensions(avctx,avctx->width,avctx->height))
+        return -1;
+    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || pict){
+        int ret = avctx->codec->encode(avctx, buf, buf_size, (void *)pict);
+        avctx->frame_number++;
+        emms_c(); //needed to avoid a emms_c() call before every return;
     
-    emms_c(); //needed to avoid a emms_c() call before every return;
-
-    avctx->frame_number++;
-    return ret;
+        return ret;
+    }else
+        return 0;
 }
 
 /** 
@@ -489,13 +587,19 @@ int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
     int ret;
     
     *got_picture_ptr= 0;
-    ret = avctx->codec->decode(avctx, picture, got_picture_ptr, 
-                               buf, buf_size);
+    if((avctx->coded_width||avctx->coded_height) && avcodec_check_dimensions(avctx,avctx->coded_width,avctx->coded_height))
+        return -1;
+    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || buf_size){
+        ret = avctx->codec->decode(avctx, picture, got_picture_ptr, 
+                                buf, buf_size);
 
-    emms_c(); //needed to avoid a emms_c() call before every return;
+        emms_c(); //needed to avoid a emms_c() call before every return;
     
-    if (*got_picture_ptr)                           
-        avctx->frame_number++;
+        if (*got_picture_ptr)                           
+            avctx->frame_number++;
+    }else
+        ret= 0;
+
     return ret;
 }
 
@@ -574,18 +678,6 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
     return NULL;
 }
 
-static AVCodec *avcodec_find(enum CodecID id)
-{
-    AVCodec *p;
-    p = first_avcodec;
-    while (p) {
-        if (p->id == id)
-            return p;
-        p = p->next;
-    }
-    return NULL;
-}
-
 void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
 {
     const char *codec_name;
@@ -664,7 +756,7 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
                 strcpy(channels_str, "5:1");
                 break;
             default:
-                sprintf(channels_str, "%d channels", enc->channels);
+                snprintf(channels_str, sizeof(channels_str), "%d channels", enc->channels);
                 break;
         }
         if (enc->sample_rate) {
@@ -698,7 +790,8 @@ void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode)
         bitrate = enc->bit_rate;
         break;
     default:
-        av_abort();
+        snprintf(buf, buf_size, "Invalid Codec type %d", enc->codec_type);
+        return;
     }
     if (encode) {
         if (enc->flags & CODEC_FLAG_PASS1)
@@ -745,7 +838,7 @@ void avcodec_flush_buffers(AVCodecContext *avctx)
         avctx->codec->flush(avctx);
 }
 
-static void avcodec_default_free_buffers(AVCodecContext *s){
+void avcodec_default_free_buffers(AVCodecContext *s){
     int i, j;
 
     if(s->internal_buffer==NULL) return;
@@ -775,70 +868,70 @@ char av_get_pict_type_char(int pict_type){
 }
 
 int av_reduce(int *dst_nom, int *dst_den, int64_t nom, int64_t den, int64_t max){
-    int exact=1, sign=0;
-    int64_t gcd;
-
-    assert(den != 0);
-
-    if(den < 0)
-        return av_reduce(dst_nom, dst_den, -nom, -den, max);
-    
-    sign= nom < 0;
-    nom= ABS(nom);
-    
-    gcd = ff_gcd(nom, den);
-    nom /= gcd;
-    den /= gcd;
-    
-    if(nom > max || den > max){
-        AVRational a0={0,1}, a1={1,0};
-        exact=0;
-
-        for(;;){
-            int64_t x= nom / den;
-            int64_t a2n= x*a1.num + a0.num;
-            int64_t a2d= x*a1.den + a0.den;
-
-            if(a2n > max || a2d > max) break;
-
-            nom %= den;
-        
-            a0= a1;
-            a1= (AVRational){a2n, a2d};
-            if(nom==0) break;
-            x= nom; nom=den; den=x;
-        }
-        nom= a1.num;
-        den= a1.den;
+    AVRational a0={0,1}, a1={1,0};
+    int sign= (nom<0) ^ (den<0);
+    int64_t gcd= ff_gcd(ABS(nom), ABS(den));
+
+    nom = ABS(nom)/gcd;
+    den = ABS(den)/gcd;
+    if(nom<=max && den<=max){
+        a1= (AVRational){nom, den};
+        den=0;
     }
     
-    assert(ff_gcd(nom, den) == 1);
+    while(den){
+        int64_t x       = nom / den;
+        int64_t next_den= nom - den*x;
+        int64_t a2n= x*a1.num + a0.num;
+        int64_t a2d= x*a1.den + a0.den;
+
+        if(a2n > max || a2d > max) break;
+
+        a0= a1;
+        a1= (AVRational){a2n, a2d};
+        nom= den;
+        den= next_den;
+    }
+    assert(ff_gcd(a1.num, a1.den) == 1);
     
-    *dst_nom = sign ? -nom : nom;
-    *dst_den = den;
+    *dst_nom = sign ? -a1.num : a1.num;
+    *dst_den = a1.den;
     
-    return exact;
+    return den==0;
 }
 
-int64_t av_rescale(int64_t a, int64_t b, int64_t c){
-    AVInteger ai, ci;
+int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding rnd){
+    AVInteger ai;
+    int64_t r=0;
     assert(c > 0);
     assert(b >=0);
+    assert(rnd >=0 && rnd<=5 && rnd!=4);
     
-    if(a<0) return -av_rescale(-a, b, c);
+    if(a<0 && a != INT64_MIN) return -av_rescale_rnd(-a, b, c, rnd ^ ((rnd>>1)&1)); 
     
+    if(rnd==AV_ROUND_NEAR_INF) r= c/2;
+    else if(rnd&1)             r= c-1;
+
     if(b<=INT_MAX && c<=INT_MAX){
         if(a<=INT_MAX)
-            return (a * b + c/2)/c;
+            return (a * b + r)/c;
         else
-            return a/c*b + (a%c*b + c/2)/c;
+            return a/c*b + (a%c*b + r)/c;
     }
     
     ai= av_mul_i(av_int2i(a), av_int2i(b));
-    ci= av_int2i(c);
-    ai= av_add_i(ai, av_shr_i(ci,1));
+    ai= av_add_i(ai, av_int2i(r));
     
-    return av_i2int(av_div_i(ai, ci));
+    return av_i2int(av_div_i(ai, av_int2i(c)));
+}
+
+int64_t av_rescale(int64_t a, int64_t b, int64_t c){
+    return av_rescale_rnd(a, b, c, AV_ROUND_NEAR_INF);
+}
+
+int64_t ff_gcd(int64_t a, int64_t b){
+    while(b){ int64_t rem= a%b; a= b; b= rem; } /* Euclid's algorithm, one step per iteration */
+    return a; /* gcd(a, 0) is a */
 }
 
 /* av_log API */
@@ -892,7 +985,7 @@ void av_log_set_callback(void (*callback)(void*, int, const char*, va_list))
     av_log_callback = callback;
 }
 
-#if !defined(HAVE_PTHREADS) && !defined(HAVE_W32THREADS)
+#if !defined(HAVE_THREADS)
 int avcodec_thread_init(AVCodecContext *s, int thread_count){
     return -1;
 }
diff --git a/src/libffmpeg/libavcodec/vcr1.c b/src/libffmpeg/libavcodec/vcr1.c
index 9a706af31..442ad9136 100644
--- a/src/libffmpeg/libavcodec/vcr1.c
+++ b/src/libffmpeg/libavcodec/vcr1.c
@@ -45,11 +45,6 @@ static int decode_frame(AVCodecContext *avctx,
     uint8_t *bytestream= buf;
     int i, x, y;
 
-    /* special case for last picture */
-    if (buf_size == 0) {
-        return 0;
-    }
-
     if(p->data[0])
         avctx->release_buffer(avctx, p);
 
@@ -156,12 +151,14 @@ static int decode_init(AVCodecContext *avctx){
     return 0;
 }
 
+#if 0
 static int encode_init(AVCodecContext *avctx){
  
     common_init(avctx);
     
     return 0;
 }
+#endif
 
 AVCodec vcr1_decoder = {
     "vcr1",
diff --git a/src/libffmpeg/libavcodec/vmdav.c b/src/libffmpeg/libavcodec/vmdav.c
index c11f80af6..4305f81fd 100644
--- a/src/libffmpeg/libavcodec/vmdav.c
+++ b/src/libffmpeg/libavcodec/vmdav.c
@@ -494,13 +494,10 @@ memset(data, 0x00, s->block_align * 2);
 bytes_decoded = s->block_align * 2;
             }
         } else {
-            if (s->bits == 16) {
-            } else {
-                /* copy the data but convert it to signed */
-                for (i = 0; i < s->block_align; i++)
-                    data[i * 2 + 1] = buf[i] + 0x80;
-                bytes_decoded = s->block_align * 2;
-            }
+            /* copy the data but convert it to signed */
+            for (i = 0; i < s->block_align; i++)
+                data[i * 2 + 1] = buf[i] + 0x80;
+            bytes_decoded = s->block_align * 2;
         }
     }
 
diff --git a/src/libffmpeg/libavcodec/vp3.c b/src/libffmpeg/libavcodec/vp3.c
index 5b3f1b926..659d6913b 100644
--- a/src/libffmpeg/libavcodec/vp3.c
+++ b/src/libffmpeg/libavcodec/vp3.c
@@ -2093,6 +2093,9 @@ static void render_fragments(Vp3DecodeContext *s,
         upper_motion_limit = 7 * s->current_frame.linesize[2];
         lower_motion_limit = height * s->current_frame.linesize[2] + width - 8;
     }
+    
+    if(ABS(stride) > 2048)
+        return; //various tables are fixed size
 
     /* for each fragment row... */
     for (y = 0; y < height; y += 8) {
@@ -2216,7 +2219,143 @@ av_log(s->avctx, AV_LOG_ERROR, " help! got beefy vector! (%X, %X)\n", motion_x,
     }
 
     emms_c();
+}
+
+#define SATURATE_U8(x) (((x) < 0) ? 0 : (((x) > 255) ? 255 : (x))) /* clamp to 0..255; x is evaluated more than once */
+
+static void horizontal_filter(unsigned char *first_pixel, int stride,
+    int *bounding_values)
+{
+    int i;
+    int filter_value;
+
+    for (i = 0; i < 8; i++, first_pixel += stride) {
+        filter_value = 
+            (first_pixel[-2] * 1) - 
+            (first_pixel[-1] * 3) +
+            (first_pixel[ 0] * 3) -
+            (first_pixel[ 1] * 1);
+        filter_value = bounding_values[(filter_value + 4) >> 3];
+        first_pixel[-1] = SATURATE_U8(first_pixel[-1] + filter_value);
+        first_pixel[ 0] = SATURATE_U8(first_pixel[ 0] - filter_value);
+    }
+}
+
+static void vertical_filter(unsigned char *first_pixel, int stride,
+    int *bounding_values)
+{
+    int i;
+    int filter_value;
+
+    for (i = 0; i < 8; i++, first_pixel++) {
+        filter_value = 
+            (first_pixel[-(2 * stride)] * 1) - 
+            (first_pixel[-(1 * stride)] * 3) +
+            (first_pixel[ (0         )] * 3) -
+            (first_pixel[ (1 * stride)] * 1);
+        filter_value = bounding_values[(filter_value + 4) >> 3];
+        first_pixel[-(1 * stride)] = SATURATE_U8(first_pixel[-(1 * stride)] + filter_value);
+        first_pixel[0] = SATURATE_U8(first_pixel[0] - filter_value);
+    }
+}
+
+/* Run the VP3 loop filter over fragment edges in all three planes of the current frame. */
+static void apply_loop_filter(Vp3DecodeContext *s)
+{
+    int x, y, plane, width, height;
+    int fragment, stride;
+    unsigned char *plane_data;
+    int bounding_values_array[256];
+    int *bounding_values = bounding_values_array + 127; /* zero-centred view: valid indices are -127..128 */
+    int filter_limit;
+
+    /* find the right loop limit value */
+    for (x = 63; x >= 0; x--) {
+        if (vp31_ac_scale_factor[x] >= s->quality_index)
+            break;
+    }
+    filter_limit = vp31_filter_limit_values[x];
+
+    /* set up the bounding values; the table is indexed with negative offsets as well */
+    memset(bounding_values_array, 0, sizeof(bounding_values_array));
+    for (x = 0; x < filter_limit; x++) {
+        bounding_values[-x - filter_limit] = -filter_limit + x;
+        bounding_values[-x] = -x;
+        bounding_values[x] = x;
+        bounding_values[x + filter_limit] = filter_limit - x;
+    }
+
+    for (plane = 0; plane < 3; plane++) {
+
+        if (plane == 0) {
+            /* Y plane parameters */
+            fragment = 0;
+            width = s->fragment_width;
+            height = s->fragment_height;
+            stride = s->current_frame.linesize[0];
+            plane_data = s->current_frame.data[0];
+        } else if (plane == 1) {
+            /* U plane parameters */
+            fragment = s->u_fragment_start;
+            width = s->fragment_width / 2;
+            height = s->fragment_height / 2;
+            stride = s->current_frame.linesize[1];
+            plane_data = s->current_frame.data[1];
+        } else {
+            /* V plane parameters */
+            fragment = s->v_fragment_start;
+            width = s->fragment_width / 2;
+            height = s->fragment_height / 2;
+            stride = s->current_frame.linesize[2];
+            plane_data = s->current_frame.data[2];
+        }
+
+        for (y = 0; y < height; y++) {
+
+            for (x = 0; x < width; x++) {
+
+                /* do not perform left edge filter for left columns frags */
+                if ((x > 0) &&
+                    (s->all_fragments[fragment].coding_method != MODE_COPY)) {
+                    horizontal_filter(
+                        plane_data + s->all_fragments[fragment].first_pixel, 
+                        stride, bounding_values);
+                }
+
+                /* do not perform top edge filter for top row fragments */
+                if ((y > 0) &&
+                    (s->all_fragments[fragment].coding_method != MODE_COPY)) {
+                    vertical_filter(
+                        plane_data + s->all_fragments[fragment].first_pixel, 
+                        stride, bounding_values);
+                }
+
+                /* do not perform right edge filter for right column
+                 * fragments or if right fragment neighbor is also coded
+                 * in this frame (it will be filtered in next iteration) */
+                if ((x < width - 1) &&
+                    (s->all_fragments[fragment].coding_method != MODE_COPY) &&
+                    (s->all_fragments[fragment + 1].coding_method == MODE_COPY)) {
+                    horizontal_filter(
+                        plane_data + s->all_fragments[fragment + 1].first_pixel, 
+                        stride, bounding_values);
+                }
 
+                /* do not perform bottom edge filter for bottom row
+                 * fragments or if bottom fragment neighbor is also coded
+                 * in this frame (it will be filtered in the next row) */
+                if ((y < height - 1) &&
+                    (s->all_fragments[fragment].coding_method != MODE_COPY) &&
+                    (s->all_fragments[fragment + width].coding_method == MODE_COPY)) {
+                    vertical_filter(
+                        plane_data + s->all_fragments[fragment + width].first_pixel, 
+                        stride, bounding_values);
+                }
+
+                fragment++;
+            }
+        }
+    }
 }
 
 /* 
@@ -2420,27 +2559,27 @@ static int vp3_decode_init(AVCodecContext *avctx)
         /* DC histograms */
         init_vlc(&s->dc_vlc[i], 5, 32,
             &dc_bias[i][0][1], 4, 2,
-            &dc_bias[i][0][0], 4, 2);
+            &dc_bias[i][0][0], 4, 2, 0);
 
         /* group 1 AC histograms */
         init_vlc(&s->ac_vlc_1[i], 5, 32,
             &ac_bias_0[i][0][1], 4, 2,
-            &ac_bias_0[i][0][0], 4, 2);
+            &ac_bias_0[i][0][0], 4, 2, 0);
 
         /* group 2 AC histograms */
         init_vlc(&s->ac_vlc_2[i], 5, 32,
             &ac_bias_1[i][0][1], 4, 2,
-            &ac_bias_1[i][0][0], 4, 2);
+            &ac_bias_1[i][0][0], 4, 2, 0);
 
         /* group 3 AC histograms */
         init_vlc(&s->ac_vlc_3[i], 5, 32,
             &ac_bias_2[i][0][1], 4, 2,
-            &ac_bias_2[i][0][0], 4, 2);
+            &ac_bias_2[i][0][0], 4, 2, 0);
 
         /* group 4 AC histograms */
         init_vlc(&s->ac_vlc_4[i], 5, 32,
             &ac_bias_3[i][0][1], 4, 2,
-            &ac_bias_3[i][0][0], 4, 2);
+            &ac_bias_3[i][0][0], 4, 2, 0);
     }
 
     /* build quantization zigzag table */
@@ -2598,6 +2737,7 @@ if (!s->keyframe) {
 
     reverse_dc_prediction(s, 0, s->fragment_width, s->fragment_height);
     render_fragments(s, 0, s->width, s->height, 0);
+//    apply_loop_filter(s);
 
     if ((avctx->flags & CODEC_FLAG_GRAY) == 0) {
         reverse_dc_prediction(s, s->u_fragment_start,
@@ -2681,6 +2821,11 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
     s->width = get_bits(&gb, 16) << 4;
     s->height = get_bits(&gb, 16) << 4;
     
+    if(avcodec_check_dimensions(avctx, s->width, s->height)){
+        s->width= s->height= 0;
+        return -1;
+    }
+    
     skip_bits(&gb, 24); /* frame width */
     skip_bits(&gb, 24); /* frame height */
 
@@ -2719,16 +2864,16 @@ static int theora_decode_comments(AVCodecContext *avctx, GetBitContext gb)
 {
     int nb_comments, i, tmp;
 
-    tmp = get_bits(&gb, 32);
+    tmp = get_bits_long(&gb, 32);
     tmp = be2me_32(tmp);
     while(tmp--)
 	    skip_bits(&gb, 8);
 
-    nb_comments = get_bits(&gb, 32);
+    nb_comments = get_bits_long(&gb, 32);
     nb_comments = be2me_32(nb_comments);
     for (i = 0; i < nb_comments; i++)
     {
-	tmp = get_bits(&gb, 32);
+	tmp = get_bits_long(&gb, 32);
 	tmp = be2me_32(tmp);
 	while(tmp--)
 	    skip_bits(&gb, 8);
@@ -2774,13 +2919,20 @@ static int theora_decode_init(AVCodecContext *avctx)
     Vp3DecodeContext *s = avctx->priv_data;
     GetBitContext gb;
     int ptype;
+    uint8_t *p= avctx->extradata;
+    int op_bytes, i;
     
     s->theora = 1;
 
     if (!avctx->extradata_size)
 	return -1;
 
-    init_get_bits(&gb, avctx->extradata, avctx->extradata_size);
+  for(i=0;i<3;i++) {
+    op_bytes = *(p++)<<8;
+    op_bytes += *(p++);
+
+    init_get_bits(&gb, p, op_bytes);
+    p += op_bytes;
 
     ptype = get_bits(&gb, 8);
     debug_vp3("Theora headerpacket type: %x\n", ptype);
@@ -2803,6 +2955,7 @@ static int theora_decode_init(AVCodecContext *avctx)
 	    theora_decode_tables(avctx, gb);
 	    break;
     }
+  }
 
     return 0;
 }
diff --git a/src/libffmpeg/libavcodec/vp3data.h b/src/libffmpeg/libavcodec/vp3data.h
index 1dd511fa0..85a233716 100644
--- a/src/libffmpeg/libavcodec/vp3data.h
+++ b/src/libffmpeg/libavcodec/vp3data.h
@@ -61,6 +61,17 @@ static const uint32_t vp31_ac_scale_factor[64] =
    21,   19,   18,   17,   15,   13,  12,  10
 };
 
+static const uint32_t vp31_filter_limit_values[64] = 
+{  30, 25, 20, 20, 15, 15, 14, 14,
+   13, 13, 12, 12, 11, 11, 10, 10,
+    9,  9,  8,  8,  7,  7,  7,  7,
+    6,  6,  6,  6,  5,  5,  5,  5,
+    4,  4,  4,  4,  3,  3,  3,  3,
+    2,  2,  2,  2,  2,  2,  2,  2,
+    0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0
+};
+
 /* table used to convert natural order <-> zigzag order */
 static const int dezigzag_index[64] =
 {   0,  1,  8,  16,  9,  2,  3, 10,
diff --git a/src/libffmpeg/libavcodec/vqavideo.c b/src/libffmpeg/libavcodec/vqavideo.c
index ea86fb108..fb0871e18 100644
--- a/src/libffmpeg/libavcodec/vqavideo.c
+++ b/src/libffmpeg/libavcodec/vqavideo.c
@@ -151,6 +151,10 @@ static int vqa_decode_init(AVCodecContext *avctx)
     s->vqa_version = vqa_header[0];
     s->width = LE_16(&vqa_header[6]);
     s->height = LE_16(&vqa_header[8]);
+    if(avcodec_check_dimensions(avctx, s->width, s->height)){
+        s->width= s->height= 0;
+        return -1;
+    }
     s->vector_width = vqa_header[10];
     s->vector_height = vqa_header[11];
     s->partial_count = s->partial_countdown = vqa_header[13];
diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c
index cf2db1494..e0788375f 100644
--- a/src/libffmpeg/libavcodec/wmadec.c
+++ b/src/libffmpeg/libavcodec/wmadec.c
@@ -32,6 +32,7 @@
  */
 
 #include "avcodec.h"
+#include "bitstream.h"
 #include "dsputil.h"
 
 /* size of blocks */
@@ -182,7 +183,7 @@ static void init_coef_vlc(VLC *vlc,
     const uint16_t *p;
     int i, l, j, level;
 
-    init_vlc(vlc, 9, n, table_bits, 1, 1, table_codes, 4, 4);
+    init_vlc(vlc, 9, n, table_bits, 1, 1, table_codes, 4, 4, 0);
 
     run_table = av_malloc(n * sizeof(uint16_t));
     level_table = av_malloc(n * sizeof(uint16_t));
@@ -208,7 +209,8 @@ static int wma_decode_init(AVCodecContext * avctx)
     int i, flags1, flags2;
     float *window;
     uint8_t *extradata;
-    float bps1, high_freq, bps;
+    float bps1, high_freq;
+    volatile float bps;
     int sample_rate1;
     int coef_vlc_table;
     
@@ -492,13 +494,13 @@ static int wma_decode_init(AVCodecContext * avctx)
 #endif
         init_vlc(&s->hgain_vlc, 9, sizeof(hgain_huffbits), 
                  hgain_huffbits, 1, 1,
-                 hgain_huffcodes, 2, 2);
+                 hgain_huffcodes, 2, 2, 0);
     }
 
     if (s->use_exp_vlc) {
         init_vlc(&s->exp_vlc, 9, sizeof(scale_huffbits), 
                  scale_huffbits, 1, 1,
-                 scale_huffcodes, 4, 4);
+                 scale_huffcodes, 4, 4, 0);
     } else {
         wma_lsp_to_curve_init(s, s->frame_len);
     }
@@ -702,7 +704,12 @@ static int wma_decode_block(WMADecodeContext *s)
     int n, v, a, ch, code, bsize;
     int coef_nb_bits, total_gain, parse_exponents;
     float window[BLOCK_MAX_SIZE * 2];
+// XXX: FIXME!! there's a bug somewhere which makes this mandatory under altivec
+#ifdef HAVE_ALTIVEC
+    volatile int nb_coefs[MAX_CHANNELS] __attribute__((aligned(16)));
+#else
     int nb_coefs[MAX_CHANNELS];
+#endif
     float mdct_norm;
 
 #ifdef TRACE
diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c
index 13a112d1f..b6376d80b 100644
--- a/src/libffmpeg/libavcodec/wmv2.c
+++ b/src/libffmpeg/libavcodec/wmv2.c
@@ -101,6 +101,7 @@ static int wmv2_encode_init(AVCodecContext *avctx){
     return 0;
 }
 
+#if 0 /* unused, remove? */
 static int wmv2_encode_end(AVCodecContext *avctx){
     
     if(MPV_encode_end(avctx) < 0)
@@ -111,6 +112,7 @@ static int wmv2_encode_end(AVCodecContext *avctx){
     
     return 0;
 }
+#endif
 
 int ff_wmv2_encode_picture_header(MpegEncContext * s, int picture_number)
 {
@@ -244,7 +246,7 @@ void ff_wmv2_encode_mb(MpegEncContext * s,
         if (s->pict_type == I_TYPE) {
             set_stat(ST_INTRA_MB);
             put_bits(&s->pb, 
-                     table_mb_intra[coded_cbp][1], table_mb_intra[coded_cbp][0]);
+                     ff_msmp4_mb_i_table[coded_cbp][1], ff_msmp4_mb_i_table[coded_cbp][0]);
         } else {
             put_bits(&s->pb, 
                      wmv2_inter_table[w->cbp_table_index][cbp][1], 
@@ -734,7 +736,7 @@ static int wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         cbp = code & 0x3f;
     } else {
         s->mb_intra = 1;
-        code = get_vlc2(&s->gb, mb_intra_vlc.table, MB_INTRA_VLC_BITS, 2);
+        code = get_vlc2(&s->gb, ff_msmp4_mb_i_vlc.table, MB_INTRA_VLC_BITS, 2);
         if (code < 0){
             av_log(s->avctx, AV_LOG_ERROR, "II-cbp illegal at %d %d\n", s->mb_x, s->mb_y);
             return -1;
diff --git a/src/libffmpeg/libavcodec/wnv1.c b/src/libffmpeg/libavcodec/wnv1.c
new file mode 100644
index 000000000..292c7e042
--- /dev/null
+++ b/src/libffmpeg/libavcodec/wnv1.c
@@ -0,0 +1,144 @@
+/*
+ * Winnov WNV1 codec
+ * Copyright (c) 2005 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+ 
+/**
+ * @file wnv1.c
+ * Winnov WNV1 codec.
+ */
+ 
+#include "avcodec.h"
+#include "common.h"
+#include "bitstream.h"
+
+
+typedef struct WNV1Context{
+    AVCodecContext *avctx;
+    AVFrame pic;
+
+    int shift;
+    GetBitContext gb;
+} WNV1Context;
+
+static const uint16_t code_tab[16][2]={ /* {code, length in bits} pairs handed to init_vlc() */
+{0x1FD,9}, {0xFD,8}, {0x7D,7}, {0x3D,6}, {0x1D,5}, {0x0D,4}, {0x005,3},
+{0x000,1}, 
+{0x004,3}, {0x0C,4}, {0x1C,5}, {0x3C,6}, {0x7C,7}, {0xFC,8}, {0x1FC,9}, {0xFF,8}
+};
+
+#define CODE_VLC_BITS 9
+static VLC code_vlc;
+
+/* returns modified base_value */
+static inline int wnv1_get_code(WNV1Context *w, int base_value)
+{
+    int v = get_vlc2(&w->gb, code_vlc.table, CODE_VLC_BITS, 1);
+
+    if(v==15)
+        return ff_reverse[ get_bits(&w->gb, 8 - w->shift) ];
+    else
+        return base_value + ((v - 7)<<w->shift);
+}
+
+/* Decode one WNV1 packet: 8 header bytes followed by a bit-reversed VLC bitstream. */
+static int decode_frame(AVCodecContext *avctx, 
+                        void *data, int *data_size,
+                        uint8_t *buf, int buf_size)
+{
+    WNV1Context * const l = avctx->priv_data;
+    AVFrame * const p= (AVFrame*)&l->pic;
+    unsigned char *Y,*U,*V;
+    int i, j;
+    int prev_y = 0, prev_u = 0, prev_v = 0;
+
+    if(buf_size <= 8){ /* buf[2] and the bitstream at buf+8 are read below */
+        av_log(avctx, AV_LOG_ERROR, "packet size %d is too small\n", buf_size);
+        return -1;
+    }
+    if(p->data[0])
+        avctx->release_buffer(avctx, p);
+
+    p->reference = 0;
+    if(avctx->get_buffer(avctx, p) < 0){
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+    p->key_frame = 1;
+
+    for(i=8; i<buf_size; i++)
+        buf[i]= ff_reverse[ buf[i] ]; //FIXME ensure that the buffer is modifiable or use a temp one
+    init_get_bits(&l->gb, buf+8, (buf_size-8)*8);
+
+    if (buf[2] >> 4 == 6)
+        l->shift = 2;
+    else {
+        l->shift = 8 - (buf[2] >> 4);
+        if (l->shift > 4 || l->shift < 1) { /* clamp unknown header values into 1..4 */
+            av_log(avctx, AV_LOG_ERROR, "Unknown WNV1 frame header value %i, please upload file for study\n", buf[2] >> 4);
+            l->shift = l->shift > 4 ? 4 : 1;
+        }
+    }
+    
+    Y = p->data[0];
+    U = p->data[1];
+    V = p->data[2];
+    for (j = 0; j < avctx->height; j++) {
+        for (i = 0; i < avctx->width / 2; i++) {
+            Y[i * 2] = wnv1_get_code(l, prev_y);
+            prev_u = U[i] = wnv1_get_code(l, prev_u);
+            prev_y = Y[(i * 2) + 1] = wnv1_get_code(l, Y[i * 2]);
+            prev_v = V[i] = wnv1_get_code(l, prev_v);
+        }
+        Y += p->linesize[0];
+        U += p->linesize[1];
+        V += p->linesize[2];
+    }
+
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = l->pic;
+    
+    return buf_size;
+}
+
+static int decode_init(AVCodecContext *avctx){
+    WNV1Context * const l = avctx->priv_data;
+
+    l->avctx = avctx;
+    avctx->pix_fmt = PIX_FMT_YUV422P;
+
+    if(!code_vlc.table){
+        init_vlc(&code_vlc, CODE_VLC_BITS, 16,
+                    &code_tab[0][1], 4, 2,
+                    &code_tab[0][0], 4, 2, 1);
+    }
+
+    return 0;
+}
+
+AVCodec wnv1_decoder = {
+    "wnv1",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_WNV1,
+    sizeof(WNV1Context),
+    decode_init,
+    NULL,
+    NULL,
+    decode_frame,
+    CODEC_CAP_DR1,
+};
diff --git a/src/libffmpeg/libavcodec/ws-snd1.c b/src/libffmpeg/libavcodec/ws-snd1.c
new file mode 100644
index 000000000..5ac4c61bd
--- /dev/null
+++ b/src/libffmpeg/libavcodec/ws-snd1.c
@@ -0,0 +1,145 @@
+/*
+ * Westwood SNDx codecs
+ * Copyright (c) 2005 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "avcodec.h"
+
+/**
+ * @file ws-snd1.c
+ * Westwood SNDx codecs.
+ *
+ * Reference documents about VQA format and its audio codecs
+ * can be found here:
+ * http://www.multimedia.cx
+ */
+
+typedef struct {
+} WSSNDContext;
+
+static const int8_t ws_adpcm_2bit[] = { -2, -1, 0, 1}; /* sample deltas; int8_t because plain char may be unsigned */
+static const int8_t ws_adpcm_4bit[] = {
+    -9, -8, -6, -5, -4, -3, -2, -1,
+     0,  1,  2,  3,  4,  5,  6,  8 };
+
+#define CLIP8(a) do { if ((a) > 127) (a) = 127; else if ((a) < -128) (a) = -128; } while (0) /* clamp a in place to -128..127 */
+
+static int ws_snd_decode_init(AVCodecContext * avctx)
+{
+//    WSSNDContext *c = avctx->priv_data;
+    
+    return 0;
+}
+
+/* Decode one SND1 chunk into 16-bit samples stored at data.
+ * Chunk layout as read here: LE16 output sample count, LE16 input size, then the payload.
+ * NOTE(review): payload reads are still not bounds-checked against buf_size. */
+static int ws_snd_decode_frame(AVCodecContext *avctx,
+                void *data, int *data_size,
+                uint8_t *buf, int buf_size)
+{
+    int in_size, out_size;
+    int sample = 0;
+    int i;
+    short *samples = data;
+    
+    if (buf_size < 4) /* cannot hold the two 16-bit size fields; empty input is not an error */
+        return buf_size ? -1 : 0;
+
+    out_size = LE_16(&buf[0]);
+    *data_size = out_size * 2;
+    in_size = LE_16(&buf[2]);
+    buf += 4;
+    
+    if (in_size == out_size) {
+        for (i = 0; i < out_size; i++)
+            *samples++ = (*buf++ - 0x80) << 8;
+        return buf_size;
+    }
+    
+    while (out_size > 0) {
+        int code;
+        uint8_t count;
+        code = (*buf) >> 6;
+        count = (*buf) & 0x3F;
+        buf++;
+        switch(code) {
+        case 0: /* ADPCM 2-bit */
+            for (count++; count > 0; count--) {
+                code = *buf++;
+                sample += ws_adpcm_2bit[code & 0x3];
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                sample += ws_adpcm_2bit[(code >> 2) & 0x3];
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                sample += ws_adpcm_2bit[(code >> 4) & 0x3];
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                sample += ws_adpcm_2bit[(code >> 6) & 0x3];
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                out_size -= 4;
+            }
+            break;
+        case 1: /* ADPCM 4-bit */
+            for (count++; count > 0; count--) {
+                code = *buf++;
+                sample += ws_adpcm_4bit[code & 0xF];
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                sample += ws_adpcm_4bit[code >> 4];
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                out_size -= 2;
+            }
+            break;
+        case 2: /* no compression */
+            if (count & 0x20) { /* big delta */
+                /* low 5 bits of count are a signed delta; sign-extend without relying on char signedness */
+                sample += ((count & 0x1F) ^ 0x10) - 0x10;
+                CLIP8(sample);
+                *samples++ = sample << 8;
+                out_size--;
+            } else { /* copy */
+                for (count++; count > 0; count--) {
+                    *samples++ = (*buf++ - 0x80) << 8;
+                    out_size--;
+                }
+                sample = buf[-1] - 0x80;
+            }
+            break;
+        default: /* run */
+            for(count++; count > 0; count--) {
+                *samples++ = sample << 8;
+                out_size--;
+            }
+        }
+    }
+    
+    return buf_size;
+}
+
+AVCodec ws_snd1_decoder = {
+    "ws_snd1",
+    CODEC_TYPE_AUDIO,
+    CODEC_ID_WESTWOOD_SND1,
+    sizeof(WSSNDContext),
+    ws_snd_decode_init,
+    NULL,
+    NULL,
+    ws_snd_decode_frame,
+};
diff --git a/src/libffmpeg/libavcodec/xan.c b/src/libffmpeg/libavcodec/xan.c
index f98a06bca..377a71ef2 100644
--- a/src/libffmpeg/libavcodec/xan.c
+++ b/src/libffmpeg/libavcodec/xan.c
@@ -132,6 +132,9 @@ static int xan_decode_init(AVCodecContext *avctx)
         v_b_table[i] = V_B * i;
     }
 
+    if(avcodec_check_dimensions(avctx, avctx->width, avctx->height))
+        return -1;
+    
     s->buffer1 = av_malloc(avctx->width * avctx->height);
     s->buffer2 = av_malloc(avctx->width * avctx->height);
     if (!s->buffer1 || !s->buffer2)
diff --git a/src/libffmpeg/libavcodec/xl.c b/src/libffmpeg/libavcodec/xl.c
new file mode 100644
index 000000000..2ba48eb27
--- /dev/null
+++ b/src/libffmpeg/libavcodec/xl.c
@@ -0,0 +1,138 @@
+/*
+ * Miro VideoXL codec
+ * Copyright (c) 2004 Konstantin Shishkov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+ 
+/**
+ * @file xl.c
+ * Miro VideoXL codec.
+ */
+ 
+#include "avcodec.h"
+#include "mpegvideo.h"
+
+typedef struct VideoXLContext{ /* decoder state, reached through avctx->priv_data */
+    AVCodecContext *avctx; /* never read or written by the code in this file */
+    AVFrame pic;           /* picture decode_frame() fills and copies to its caller */
+} VideoXLContext;
+
+const int xl_table[32] = { /* step added per 5-bit code in decode_frame() */
+   0,   1,   2,   3,   4,   5,   6,   7,
+   8,   9,  12,  15,  20,  25,  34,  46,
+  64,  82,  94, 103, 108, 113, 116, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127}; /* NOTE(review): external linkage; make static if no other file uses it */
+
+/**
+ * Decode one VideoXL packet (one input byte per pixel) into a->pic.
+ * @return buf_size; -1 on bad dimensions, short packet or get_buffer() failure
+ */
+static int decode_frame(AVCodecContext *avctx,
+                        void *data, int *data_size,
+                        uint8_t *buf, int buf_size)
+{
+    VideoXLContext * const a = avctx->priv_data;
+    AVFrame * const p= &a->pic;
+    uint8_t *Y, *U, *V;
+    int i, j, stride;
+    uint32_t val;
+    int y0, y1, y2, y3 = 0, c0 = 0, c1 = 0;
+
+    /* Rows are read as whole 4-pixel groups, so width must be a multiple
+     * of 4; the division bounds the reads without overflowing width*height. */
+    if(avctx->width <= 0 || avctx->height <= 0 || (avctx->width & 3) ||
+       buf_size / avctx->width < avctx->height){
+        av_log(avctx, AV_LOG_ERROR, "bad dimensions or truncated packet\n");
+        return -1;
+    }
+
+    if(p->data[0])
+        avctx->release_buffer(avctx, p);
+    p->reference = 0;
+    if(avctx->get_buffer(avctx, p) < 0){
+        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
+        return -1;
+    }
+    p->pict_type= I_TYPE;
+    p->key_frame= 1;
+    Y = p->data[0];
+    U = p->data[1];
+    V = p->data[2];
+    stride = avctx->width - 4;
+    for (i = 0; i < avctx->height; i++) {
+        buf += stride; /* a row's 4-byte groups are stored last-first */
+        for (j = 0; j < avctx->width; j += 4) {
+            /* value is stored in LE dword with word swapped */
+            val = LE_32(buf);
+            buf -= 4;
+            val = ((val >> 16) & 0xFFFF) | ((val & 0xFFFF) << 16);
+            if(!j) /* row start: absolute value, later groups add deltas */
+                y0 = (val & 0x1F) << 2;
+            else
+                y0 = y3 + xl_table[val & 0x1F];
+            val >>= 5;
+            y1 = y0 + xl_table[val & 0x1F];
+            val >>= 5;
+            y2 = y1 + xl_table[val & 0x1F];
+            val >>= 6; /* align to word */
+            y3 = y2 + xl_table[val & 0x1F];
+            val >>= 5;
+            if(!j)
+                c0 = (val & 0x1F) << 2;
+            else
+                c0 += xl_table[val & 0x1F];
+            val >>= 5;
+            if(!j)
+                c1 = (val & 0x1F) << 2;
+            else
+                c1 += xl_table[val & 0x1F];
+            Y[j + 0] = y0 << 1;
+            Y[j + 1] = y1 << 1;
+            Y[j + 2] = y2 << 1;
+            Y[j + 3] = y3 << 1;
+            U[j >> 2] = c0 << 1;
+            V[j >> 2] = c1 << 1;
+        }
+        buf += avctx->width + 4; /* from 4 bytes before this row to the next row */
+        Y += p->linesize[0];
+        U += p->linesize[1];
+        V += p->linesize[2];
+    }
+    *data_size = sizeof(AVFrame);
+    *(AVFrame*)data = a->pic;
+    return buf_size;
+}
+
+/* Init callback: its only action is selecting PIX_FMT_YUV411P as output format. */
+static int decode_init(AVCodecContext *avctx)
+{
+    avctx->pix_fmt = PIX_FMT_YUV411P;
+
+    return 0;
+}
+
+AVCodec xl_decoder = { /* positional initializer: order follows AVCodec's fields */
+    "xl",
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_VIXL,
+    sizeof(VideoXLContext),
+    decode_init,
+    NULL, /* NOTE(review): presumably the encode slot - confirm in avcodec.h */
+    NULL, /* NOTE(review): presumably the close slot - confirm in avcodec.h */
+    decode_frame,
+    CODEC_CAP_DR1, /* decode_frame() obtains its picture via avctx->get_buffer() */
+};