75 files changed, 6816 insertions, 3345 deletions
diff --git a/src/libffmpeg/libavcodec/4xm.c b/src/libffmpeg/libavcodec/4xm.c
index 3ca2338d2..a986f151e 100644
--- a/src/libffmpeg/libavcodec/4xm.c
+++ b/src/libffmpeg/libavcodec/4xm.c
@@ -606,7 +606,7 @@ static int decode_frame(AVCodecContext *avctx,
     int i, frame_4cc, frame_size;
 
     frame_4cc= get32(buf);
-    if(buf_size != get32(buf+4)+8){
+    if(buf_size != get32(buf+4)+8 || buf_size < 20){
         av_log(f->avctx, AV_LOG_ERROR, "size mismatch %d %d\n", buf_size, get32(buf+4));
     }
 
@@ -634,6 +634,10 @@ static int decode_frame(AVCodecContext *avctx,
         cfrm= &f->cfrm[i];
 
         cfrm->data= av_fast_realloc(cfrm->data, &cfrm->allocated_size, cfrm->size + data_size + FF_INPUT_BUFFER_PADDING_SIZE);
+        if(!cfrm->data){ //explicit check needed as memcpy below might not catch a NULL
+            av_log(f->avctx, AV_LOG_ERROR, "realloc falure");
+            return -1;
+        }
 
         memcpy(cfrm->data + cfrm->size, buf+20, data_size);
         cfrm->size += data_size;
diff --git a/src/libffmpeg/libavcodec/Makefile.am b/src/libffmpeg/libavcodec/Makefile.am
index 687f37f3e..d94646f59 100644
--- a/src/libffmpeg/libavcodec/Makefile.am
+++ b/src/libffmpeg/libavcodec/Makefile.am
@@ -9,7 +9,7 @@ EXTRA_DIST = motion_est_template.c \
 # we need to compile everything in debug mode, including the encoders,
 # otherwise we get unresolved symbols, because some unsatisfied function calls
 # are not optimized away with debug optimization
-AM_CFLAGS = `test "$(CFLAGS)" = "$(DEBUG_CFLAGS)" && echo -DCONFIG_ENCODERS` -fno-strict-aliasing
+AM_CFLAGS = `test "$(CFLAGS)" = "$(DEBUG_CFLAGS)" && echo -DCONFIG_ENCODERS` -fno-strict-aliasing -DCONFIG_VC1_DECODER
 AM_CPPFLAGS = $(ZLIB_CPPFLAGS) $(LIBFFMPEG_CPPFLAGS) \
 	-I$(top_srcdir)/src/libffmpeg/libavutil
 ASFLAGS =
@@ -23,9 +23,13 @@ libavcodec_la_SOURCES = \
 	adpcm.c \
 	alac.c \
 	asv1.c \
+	avs.c \
 	bitstream.c \
 	cabac.c \
+	cavs.c \
 	cinepak.c \
+	cook.c \
+	cscd.c \
 	cyuv.c \
 	dpcm.c \
 	dsputil.c \
@@ -34,6 +38,7 @@ libavcodec_la_SOURCES = \
 	eval.c \
 	faandct.c \
 	flac.c \
+	flashsv.c \
 	flicvideo.c \
 	fraps.c \
 	fft.c \
@@ -53,12 +58,15 @@ libavcodec_la_SOURCES = \
 	jfdctfst.c \
 	jfdctint.c \
 	jrevdct.c \
+	kmvc.c \
 	lcl.c \
 	loco.c \
+	lzo.c \
 	mdct.c \
 	mace.c \
 	mem.c \
 	mjpeg.c \
+	mmvideo.c \
 	motion_est.c \
 	mpeg12.c \
 	mpegaudiodec.c \
@@ -66,6 +74,7 @@ libavcodec_la_SOURCES = \
 	msmpeg4.c \
 	msrle.c \
 	msvideo1.c \
+	nuv.c \
 	parser.c \
 	pcm.c \
 	qdm2.c \
@@ -79,17 +88,23 @@ libavcodec_la_SOURCES = \
 	resample2.c \
 	roqvideo.c \
 	rpza.c \
+	rtjpeg.c \
 	rv10.c \
 	shorten.c \
 	simple_idct.c \
+	smacker.c \
 	smc.c \
 	snow.c \
 	svq1.c \
 	tscc.c \
 	truemotion1.c \
 	truemotion2.c \
+	truespeech.c \
+	tta.c \
 	ulti.c \
 	utils.c \
+	vc1.c \
+	vc1dsp.c \
 	vcr1.c \
 	vmdav.c \
 	vorbis.c \
@@ -99,7 +114,8 @@ libavcodec_la_SOURCES = \
 	wmadec.c \
 	wnv1.c \
 	xan.c \
-	xl.c
+	xl.c \
+	zmbv.c 
 
 libavcodec_la_LDFLAGS = \
 	$(top_builddir)/src/libffmpeg/libavcodec/armv4l/libavcodec_armv4l.la \
@@ -114,6 +130,8 @@ noinst_HEADERS = \
 	avcodec.h \
 	bitstream.h \
 	cabac.h \
+	cavsdata.h \
+	cookdata.h \
 	dsputil.h \
 	dvdata.h \
 	faandct.h \
@@ -142,7 +160,10 @@ noinst_HEADERS = \
 	svq1_cb.h \
 	svq1_vlc.h \
 	truemotion1data.h \
+	truespeech_data.h \
 	ulti_cb.h \
 	vorbis.h \
+	vc1acdata.h \
+	vc1data.h \
 	vp3data.h \
 	wmadata.h
diff --git a/src/libffmpeg/libavcodec/adpcm.c b/src/libffmpeg/libavcodec/adpcm.c
index ed3106aa0..796cd267c 100644
--- a/src/libffmpeg/libavcodec/adpcm.c
+++ b/src/libffmpeg/libavcodec/adpcm.c
@@ -203,49 +203,11 @@ static int adpcm_encode_close(AVCodecContext *avctx)
 
 static inline unsigned char adpcm_ima_compress_sample(ADPCMChannelStatus *c, short sample)
 {
-    int step_index;
-    unsigned char nibble;
-
-    int sign = 0; /* sign bit of the nibble (MSB) */
-    int delta, predicted_delta;
-
-    delta = sample - c->prev_sample;
-
-    if (delta < 0) {
-        sign = 1;
-        delta = -delta;
-    }
-
-    step_index = c->step_index;
-
-    /* nibble = 4 * delta / step_table[step_index]; */
-    nibble = (delta << 2) / step_table[step_index];
-
-    if (nibble > 7)
-        nibble = 7;
-
-    step_index += index_table[nibble];
-    if (step_index < 0)
-        step_index = 0;
-    if (step_index > 88)
-        step_index = 88;
-
-    /* what the decoder will find */
-    predicted_delta = ((step_table[step_index] * nibble) / 4) + (step_table[step_index] / 8);
-
-    if (sign)
-        c->prev_sample -= predicted_delta;
-    else
-        c->prev_sample += predicted_delta;
-
+    int delta = sample - c->prev_sample;
+    int nibble = FFMIN(7, abs(delta)*4/step_table[c->step_index]) + (delta<0)*8;
+    c->prev_sample = c->prev_sample + ((step_table[c->step_index] * yamaha_difflookup[nibble]) / 8);
     CLAMP_TO_SHORT(c->prev_sample);
-
-
-    nibble += sign << 3; /* sign * 8 */
-
-    /* save back */
-    c->step_index = step_index;
-
+    c->step_index = clip(c->step_index + index_table[nibble], 0, 88);
     return nibble;
 }
 
@@ -276,27 +238,194 @@ static inline unsigned char adpcm_ms_compress_sample(ADPCMChannelStatus *c, shor
 
 static inline unsigned char adpcm_yamaha_compress_sample(ADPCMChannelStatus *c, short sample)
 {
-    int i1 = 0, j1;
+    int nibble, delta;
 
     if(!c->step) {
         c->predictor = 0;
         c->step = 127;
     }
-    j1 = sample - c->predictor;
 
-    j1 = (j1 * 8) / c->step;
-    i1 = abs(j1) / 2;
-    if (i1 > 7)
-        i1 = 7;
-    if (j1 < 0)
-        i1 += 8;
+    delta = sample - c->predictor;
+
+    nibble = FFMIN(7, abs(delta)*4/c->step) + (delta<0)*8;
 
-    c->predictor = c->predictor + ((c->step * yamaha_difflookup[i1]) / 8);
+    c->predictor = c->predictor + ((c->step * yamaha_difflookup[nibble]) / 8);
     CLAMP_TO_SHORT(c->predictor);
-    c->step = (c->step * yamaha_indexscale[i1]) >> 8;
+    c->step = (c->step * yamaha_indexscale[nibble]) >> 8;
     c->step = clip(c->step, 127, 24567);
 
-    return i1;
+    return nibble;
+}
+
+typedef struct TrellisPath {
+    int nibble;
+    int prev;
+} TrellisPath;
+
+typedef struct TrellisNode {
+    uint32_t ssd;
+    int path;
+    int sample1;
+    int sample2;
+    int step;
+} TrellisNode;
+
+static void adpcm_compress_trellis(AVCodecContext *avctx, const short *samples,
+                                   uint8_t *dst, ADPCMChannelStatus *c, int n)
+{
+#define FREEZE_INTERVAL 128
+    //FIXME 6% faster if frontier is a compile-time constant
+    const int frontier = 1 << avctx->trellis;
+    const int stride = avctx->channels;
+    const int version = avctx->codec->id;
+    const int max_paths = frontier*FREEZE_INTERVAL;
+    TrellisPath paths[max_paths], *p;
+    TrellisNode node_buf[2][frontier];
+    TrellisNode *nodep_buf[2][frontier];
+    TrellisNode **nodes = nodep_buf[0]; // nodes[] is always sorted by .ssd
+    TrellisNode **nodes_next = nodep_buf[1];
+    int pathn = 0, froze = -1, i, j, k;
+
+    assert(!(max_paths&(max_paths-1)));
+
+    memset(nodep_buf, 0, sizeof(nodep_buf));
+    nodes[0] = &node_buf[1][0];
+    nodes[0]->ssd = 0;
+    nodes[0]->path = 0;
+    nodes[0]->step = c->step_index;
+    nodes[0]->sample1 = c->sample1;
+    nodes[0]->sample2 = c->sample2;
+    if(version == CODEC_ID_ADPCM_IMA_WAV)
+        nodes[0]->sample1 = c->prev_sample;
+    if(version == CODEC_ID_ADPCM_MS)
+        nodes[0]->step = c->idelta;
+    if(version == CODEC_ID_ADPCM_YAMAHA) {
+        if(c->step == 0) {
+            nodes[0]->step = 127;
+            nodes[0]->sample1 = 0;
+        } else {
+            nodes[0]->step = c->step;
+            nodes[0]->sample1 = c->predictor;
+        }
+    }
+
+    for(i=0; i<n; i++) {
+        TrellisNode *t = node_buf[i&1];
+        TrellisNode **u;
+        int sample = samples[i*stride];
+        memset(nodes_next, 0, frontier*sizeof(TrellisNode*));
+        for(j=0; j<frontier && nodes[j]; j++) {
+            // higher j have higher ssd already, so they're unlikely to use a suboptimal next sample too
+            const int range = (j < frontier/2) ? 1 : 0;
+            const int step = nodes[j]->step;
+            int nidx;
+            if(version == CODEC_ID_ADPCM_MS) {
+                const int predictor = ((nodes[j]->sample1 * c->coeff1) + (nodes[j]->sample2 * c->coeff2)) / 256;
+                const int div = (sample - predictor) / step;
+                const int nmin = clip(div-range, -8, 6);
+                const int nmax = clip(div+range, -7, 7);
+                for(nidx=nmin; nidx<=nmax; nidx++) {
+                    const int nibble = nidx & 0xf;
+                    int dec_sample = predictor + nidx * step;
+#define STORE_NODE(NAME, STEP_INDEX)\
+                    int d;\
+                    uint32_t ssd;\
+                    CLAMP_TO_SHORT(dec_sample);\
+                    d = sample - dec_sample;\
+                    ssd = nodes[j]->ssd + d*d;\
+                    if(nodes_next[frontier-1] && ssd >= nodes_next[frontier-1]->ssd)\
+                        continue;\
+                    /* Collapse any two states with the same previous sample value. \
+                     * One could also distinguish states by step and by 2nd to last
+                     * sample, but the effects of that are negligible. */\
+                    for(k=0; k<frontier && nodes_next[k]; k++) {\
+                        if(dec_sample == nodes_next[k]->sample1) {\
+                            assert(ssd >= nodes_next[k]->ssd);\
+                            goto next_##NAME;\
+                        }\
+                    }\
+                    for(k=0; k<frontier; k++) {\
+                        if(!nodes_next[k] || ssd < nodes_next[k]->ssd) {\
+                            TrellisNode *u = nodes_next[frontier-1];\
+                            if(!u) {\
+                                assert(pathn < max_paths);\
+                                u = t++;\
+                                u->path = pathn++;\
+                            }\
+                            u->ssd = ssd;\
+                            u->step = STEP_INDEX;\
+                            u->sample2 = nodes[j]->sample1;\
+                            u->sample1 = dec_sample;\
+                            paths[u->path].nibble = nibble;\
+                            paths[u->path].prev = nodes[j]->path;\
+                            memmove(&nodes_next[k+1], &nodes_next[k], (frontier-k-1)*sizeof(TrellisNode*));\
+                            nodes_next[k] = u;\
+                            break;\
+                        }\
+                    }\
+                    next_##NAME:;
+                    STORE_NODE(ms, FFMAX(16, (AdaptationTable[nibble] * step) >> 8));
+                }
+            } else if(version == CODEC_ID_ADPCM_IMA_WAV) {
+#define LOOP_NODES(NAME, STEP_TABLE, STEP_INDEX)\
+                const int predictor = nodes[j]->sample1;\
+                const int div = (sample - predictor) * 4 / STEP_TABLE;\
+                int nmin = clip(div-range, -7, 6);\
+                int nmax = clip(div+range, -6, 7);\
+                if(nmin<=0) nmin--; /* distinguish -0 from +0 */\
+                if(nmax<0) nmax--;\
+                for(nidx=nmin; nidx<=nmax; nidx++) {\
+                    const int nibble = nidx<0 ? 7-nidx : nidx;\
+                    int dec_sample = predictor + (STEP_TABLE * yamaha_difflookup[nibble]) / 8;\
+                    STORE_NODE(NAME, STEP_INDEX);\
+                }
+                LOOP_NODES(ima, step_table[step], clip(step + index_table[nibble], 0, 88));
+            } else { //CODEC_ID_ADPCM_YAMAHA
+                LOOP_NODES(yamaha, step, clip((step * yamaha_indexscale[nibble]) >> 8, 127, 24567));
+#undef LOOP_NODES
+#undef STORE_NODE
+            }
+        }
+
+        u = nodes;
+        nodes = nodes_next;
+        nodes_next = u;
+
+        // prevent overflow
+        if(nodes[0]->ssd > (1<<28)) {
+            for(j=1; j<frontier && nodes[j]; j++)
+                nodes[j]->ssd -= nodes[0]->ssd;
+            nodes[0]->ssd = 0;
+        }
+
+        // merge old paths to save memory
+        if(i == froze + FREEZE_INTERVAL) {
+            p = &paths[nodes[0]->path];
+            for(k=i; k>froze; k--) {
+                dst[k] = p->nibble;
+                p = &paths[p->prev];
+            }
+            froze = i;
+            pathn = 0;
+            // other nodes might use paths that don't coincide with the frozen one.
+            // checking which nodes do so is too slow, so just kill them all.
+            // this also slightly improves quality, but I don't know why.
+            memset(nodes+1, 0, (frontier-1)*sizeof(TrellisNode*));
+        }
+    }
+
+    p = &paths[nodes[0]->path];
+    for(i=n-1; i>froze; i--) {
+        dst[i] = p->nibble;
+        p = &paths[p->prev];
+    }
+
+    c->predictor = nodes[0]->sample1;
+    c->sample1 = nodes[0]->sample1;
+    c->sample2 = nodes[0]->sample2;
+    c->step_index = nodes[0]->step;
+    c->step = nodes[0]->step;
+    c->idelta = nodes[0]->step;
 }
 
 static int adpcm_encode_frame(AVCodecContext *avctx,
@@ -335,6 +464,24 @@ static int adpcm_encode_frame(AVCodecContext *avctx,
             }
 
             /* stereo: 4 bytes (8 samples) for left, 4 bytes for right, 4 bytes left, ... */
+            if(avctx->trellis > 0) {
+                uint8_t buf[2][n*8];
+                adpcm_compress_trellis(avctx, samples, buf[0], &c->status[0], n*8);
+                if(avctx->channels == 2)
+                    adpcm_compress_trellis(avctx, samples+1, buf[1], &c->status[1], n*8);
+                for(i=0; i<n; i++) {
+                    *dst++ = buf[0][8*i+0] | (buf[0][8*i+1] << 4);
+                    *dst++ = buf[0][8*i+2] | (buf[0][8*i+3] << 4);
+                    *dst++ = buf[0][8*i+4] | (buf[0][8*i+5] << 4);
+                    *dst++ = buf[0][8*i+6] | (buf[0][8*i+7] << 4);
+                    if (avctx->channels == 2) {
+                        *dst++ = buf[1][8*i+0] | (buf[1][8*i+1] << 4);
+                        *dst++ = buf[1][8*i+2] | (buf[1][8*i+3] << 4);
+                        *dst++ = buf[1][8*i+4] | (buf[1][8*i+5] << 4);
+                        *dst++ = buf[1][8*i+6] | (buf[1][8*i+7] << 4);
+                    }
+                }
+            } else
             for (; n>0; n--) {
                 *dst = adpcm_ima_compress_sample(&c->status[0], samples[0]) & 0x0F;
                 *dst |= (adpcm_ima_compress_sample(&c->status[0], samples[avctx->channels]) << 4) & 0xF0;
@@ -394,6 +541,21 @@ static int adpcm_encode_frame(AVCodecContext *avctx,
             *dst++ = c->status[i].sample2 >> 8;
         }
 
+        if(avctx->trellis > 0) {
+            int n = avctx->block_align - 7*avctx->channels;
+            uint8_t buf[2][n];
+            if(avctx->channels == 1) {
+                n *= 2;
+                adpcm_compress_trellis(avctx, samples, buf[0], &c->status[0], n);
+                for(i=0; i<n; i+=2)
+                    *dst++ = (buf[0][i] << 4) | buf[0][i+1];
+            } else {
+                adpcm_compress_trellis(avctx, samples, buf[0], &c->status[0], n);
+                adpcm_compress_trellis(avctx, samples+1, buf[1], &c->status[1], n);
+                for(i=0; i<n; i++)
+                    *dst++ = (buf[0][i] << 4) | buf[1][i];
+            }
+        } else
         for(i=7*avctx->channels; i<avctx->block_align; i++) {
             int nibble;
             nibble = adpcm_ms_compress_sample(&c->status[ 0], *samples++)<<4;
@@ -403,6 +565,20 @@ static int adpcm_encode_frame(AVCodecContext *avctx,
         break;
     case CODEC_ID_ADPCM_YAMAHA:
         n = avctx->frame_size / 2;
+        if(avctx->trellis > 0) {
+            uint8_t buf[2][n*2];
+            n *= 2;
+            if(avctx->channels == 1) {
+                adpcm_compress_trellis(avctx, samples, buf[0], &c->status[0], n);
+                for(i=0; i<n; i+=2)
+                    *dst++ = buf[0][i] | (buf[0][i+1] << 4);
+            } else {
+                adpcm_compress_trellis(avctx, samples, buf[0], &c->status[0], n);
+                adpcm_compress_trellis(avctx, samples+1, buf[1], &c->status[1], n);
+                for(i=0; i<n; i++)
+                    *dst++ = buf[0][i] | (buf[1][i] << 4);
+            }
+        } else
         for (; n>0; n--) {
             for(i = 0; i < avctx->channels; i++) {
                 int nibble;
@@ -514,6 +690,34 @@ static inline short adpcm_ct_expand_nibble(ADPCMChannelStatus *c, char nibble)
     return (short)predictor;
 }
 
+static inline short adpcm_sbpro_expand_nibble(ADPCMChannelStatus *c, char nibble, int size, int shift)
+{
+    int sign, delta, diff;
+
+    sign = nibble & (1<<(size-1));
+    delta = nibble & ((1<<(size-1))-1);
+    diff = delta << (7 + c->step + shift);
+
+    if (sign)
+        c->predictor -= diff;
+    else
+        c->predictor += diff;
+
+    /* clamp result */
+    if (c->predictor > 16256)
+        c->predictor = 16256;
+    else if (c->predictor < -16384)
+        c->predictor = -16384;
+
+    /* calculate new step */
+    if (delta >= (2*size - 3) && c->step < 3)
+        c->step++;
+    else if (delta == 0 && c->step > 0)
+        c->step--;
+
+    return (short) c->predictor;
+}
+
 static inline short adpcm_yamaha_expand_nibble(ADPCMChannelStatus *c, unsigned char nibble)
 {
     if(!c->step) {
@@ -644,7 +848,7 @@ static int adpcm_decode_frame(AVCodecContext *avctx,
     samples = data;
     src = buf;
 
-    st = avctx->channels == 2;
+    st = avctx->channels == 2 ? 1 : 0;
 
     switch(avctx->codec->id) {
     case CODEC_ID_ADPCM_IMA_QT:
@@ -666,8 +870,10 @@ static int adpcm_decode_frame(AVCodecContext *avctx,
 
         cs->step_index = (*src++) & 0x7F;
 
-        if (cs->step_index > 88) av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n", cs->step_index);
-        if (cs->step_index > 88) cs->step_index = 88;
+        if (cs->step_index > 88){
+            av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n", cs->step_index);
+            cs->step_index = 88;
+        }
 
         cs->step = step_table[cs->step_index];
 
@@ -693,35 +899,32 @@ static int adpcm_decode_frame(AVCodecContext *avctx,
         if (avctx->block_align != 0 && buf_size > avctx->block_align)
             buf_size = avctx->block_align;
 
+//        samples_per_block= (block_align-4*chanels)*8 / (bits_per_sample * chanels) + 1;
+
         for(i=0; i<avctx->channels; i++){
             cs = &(c->status[i]);
-            cs->predictor = *src++;
-            cs->predictor |= (*src++) << 8;
-            if(cs->predictor & 0x8000)
-                cs->predictor -= 0x10000;
-            CLAMP_TO_SHORT(cs->predictor);
+            cs->predictor = (int16_t)(src[0] + (src[1]<<8));
+            src+=2;
 
         // XXX: is this correct ??: *samples++ = cs->predictor;
 
             cs->step_index = *src++;
-            if (cs->step_index < 0) cs->step_index = 0;
-            if (cs->step_index > 88) cs->step_index = 88;
-            if (*src++) av_log(avctx, AV_LOG_ERROR, "unused byte should be null !!\n"); /* unused */
+            if (cs->step_index > 88){
+                av_log(avctx, AV_LOG_ERROR, "ERROR: step_index = %i\n", cs->step_index);
+                cs->step_index = 88;
+            }
+            if (*src++) av_log(avctx, AV_LOG_ERROR, "unused byte should be null but is %d!!\n", src[-1]); /* unused */
         }
 
-        for(m=4; src < (buf + buf_size);) {
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], src[0] & 0x0F, 3);
-            if (st)
-                *samples++ = adpcm_ima_expand_nibble(&c->status[1], src[4] & 0x0F, 3);
-            *samples++ = adpcm_ima_expand_nibble(&c->status[0], (src[0] >> 4) & 0x0F, 3);
-            if (st) {
-                *samples++ = adpcm_ima_expand_nibble(&c->status[1], (src[4] >> 4) & 0x0F, 3);
-                if (!--m) {
-                    m=4;
-                    src+=4;
-                }
+        while(src < buf + buf_size){
+            for(m=0; m<4; m++){
+                for(i=0; i<=st; i++)
+                    *samples++ = adpcm_ima_expand_nibble(&c->status[i], src[4*i] & 0x0F, 3);
+                for(i=0; i<=st; i++)
+                    *samples++ = adpcm_ima_expand_nibble(&c->status[i], src[4*i] >> 4  , 3);
+                src++;
             }
-            src++;
+            src += 4*st;
         }
         break;
     case CODEC_ID_ADPCM_4XM:
@@ -973,6 +1176,48 @@ static int adpcm_decode_frame(AVCodecContext *avctx,
             src++;
         }
         break;
+    case CODEC_ID_ADPCM_SBPRO_4:
+    case CODEC_ID_ADPCM_SBPRO_3:
+    case CODEC_ID_ADPCM_SBPRO_2:
+        if (!c->status[0].step_index) {
+            /* the first byte is a raw sample */
+            *samples++ = 128 * (*src++ - 0x80);
+            if (st)
+              *samples++ = 128 * (*src++ - 0x80);
+            c->status[0].step_index = 1;
+        }
+        if (avctx->codec->id == CODEC_ID_ADPCM_SBPRO_4) {
+            while (src < buf + buf_size) {
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
+                    (src[0] >> 4) & 0x0F, 4, 0);
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[st],
+                    src[0] & 0x0F, 4, 0);
+                src++;
+            }
+        } else if (avctx->codec->id == CODEC_ID_ADPCM_SBPRO_3) {
+            while (src < buf + buf_size) {
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
+                    (src[0] >> 5) & 0x07, 3, 0);
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
+                    (src[0] >> 2) & 0x07, 3, 0);
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
+                    src[0] & 0x03, 2, 0);
+                src++;
+            }
+        } else {
+            while (src < buf + buf_size) {
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
+                    (src[0] >> 6) & 0x03, 2, 2);
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[st],
+                    (src[0] >> 4) & 0x03, 2, 2);
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[0],
+                    (src[0] >> 2) & 0x03, 2, 2);
+                *samples++ = adpcm_sbpro_expand_nibble(&c->status[st],
+                    src[0] & 0x03, 2, 2);
+                src++;
+            }
+        }
+        break;
     case CODEC_ID_ADPCM_SWF:
     {
         GetBitContext gb;
@@ -1112,10 +1357,12 @@ ADPCM_CODEC(CODEC_ID_ADPCM_IMA_SMJPEG, adpcm_ima_smjpeg);
 ADPCM_CODEC(CODEC_ID_ADPCM_MS, adpcm_ms);
 ADPCM_CODEC(CODEC_ID_ADPCM_4XM, adpcm_4xm);
 ADPCM_CODEC(CODEC_ID_ADPCM_XA, adpcm_xa);
-ADPCM_CODEC(CODEC_ID_ADPCM_ADX, adpcm_adx);
 ADPCM_CODEC(CODEC_ID_ADPCM_EA, adpcm_ea);
 ADPCM_CODEC(CODEC_ID_ADPCM_CT, adpcm_ct);
 ADPCM_CODEC(CODEC_ID_ADPCM_SWF, adpcm_swf);
 ADPCM_CODEC(CODEC_ID_ADPCM_YAMAHA, adpcm_yamaha);
+ADPCM_CODEC(CODEC_ID_ADPCM_SBPRO_4, adpcm_sbpro_4);
+ADPCM_CODEC(CODEC_ID_ADPCM_SBPRO_3, adpcm_sbpro_3);
+ADPCM_CODEC(CODEC_ID_ADPCM_SBPRO_2, adpcm_sbpro_2);
 
 #undef ADPCM_CODEC
diff --git a/src/libffmpeg/libavcodec/adx.c b/src/libffmpeg/libavcodec/adx.c
index c841e4eb8..c8c785590 100644
--- a/src/libffmpeg/libavcodec/adx.c
+++ b/src/libffmpeg/libavcodec/adx.c
@@ -267,7 +267,7 @@ static uint32_t read_long(const unsigned char *p)
     return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
 }
 
-int is_adx(const unsigned char *buf,size_t bufsize)
+static int is_adx(const unsigned char *buf,size_t bufsize)
 {
     int    offset;
 
@@ -385,8 +385,8 @@ static int adx_decode_frame(AVCodecContext *avctx,
 }
 
 #ifdef CONFIG_ENCODERS
-AVCodec adx_adpcm_encoder = {
-    "adx_adpcm",
+AVCodec adpcm_adx_encoder = {
+    "adpcm_adx",
     CODEC_TYPE_AUDIO,
     CODEC_ID_ADPCM_ADX,
     sizeof(ADXContext),
@@ -397,8 +397,8 @@ AVCodec adx_adpcm_encoder = {
 };
 #endif //CONFIG_ENCODERS
 
-AVCodec adx_adpcm_decoder = {
-    "adx_adpcm",
+AVCodec adpcm_adx_decoder = {
+    "adpcm_adx",
     CODEC_TYPE_AUDIO,
     CODEC_ID_ADPCM_ADX,
     sizeof(ADXContext),
diff --git a/src/libffmpeg/libavcodec/alac.c b/src/libffmpeg/libavcodec/alac.c
index 21457ab23..8bd75e5d9 100644
--- a/src/libffmpeg/libavcodec/alac.c
+++ b/src/libffmpeg/libavcodec/alac.c
@@ -32,6 +32,22 @@
  *  bytes 0-3   atom size (0x24), big-endian
  *  bytes 4-7   atom type ('alac', not the 'alac' tag from start of stsd)
  *  bytes 8-35  data bytes needed by decoder
+ *
+ * Extradata:
+ * 32bit  size
+ * 32bit  tag (=alac)
+ * 32bit  zero?
+ * 32bit  max sample per frame
+ *  8bit  ?? (zero?)
+ *  8bit  sample size
+ *  8bit  history mult
+ *  8bit  initial history
+ *  8bit  kmodifier
+ *  8bit  channels?
+ * 16bit  ??
+ * 32bit  max coded frame size
+ * 32bit  bitrate?
+ * 32bit  samplerate
  */
 
 
@@ -84,7 +100,7 @@ static void allocate_buffers(ALACContext *alac)
     alac->outputsamples_buffer_b = av_malloc(alac->setinfo_max_samples_per_frame * 4);
 }
 
-static void alac_set_info(ALACContext *alac)
+static int alac_set_info(ALACContext *alac)
 {
     unsigned char *ptr = alac->avctx->extradata;
 
@@ -92,6 +108,10 @@ static void alac_set_info(ALACContext *alac)
     ptr += 4; /* alac */
     ptr += 4; /* 0 ? */
 
+    if(BE_32(ptr) >= UINT_MAX/4){
+        av_log(alac->avctx, AV_LOG_ERROR, "setinfo_max_samples_per_frame too large\n");
+        return -1;
+    }
     alac->setinfo_max_samples_per_frame = BE_32(ptr); /* buffer size / 2 ? */
     ptr += 4;
     alac->setinfo_7a = *ptr++;
@@ -99,17 +119,19 @@ static void alac_set_info(ALACContext *alac)
     alac->setinfo_rice_historymult = *ptr++;
     alac->setinfo_rice_initialhistory = *ptr++;
     alac->setinfo_rice_kmodifier = *ptr++;
-    alac->setinfo_7f = *ptr++;
+    alac->setinfo_7f = *ptr++; // channels?
     alac->setinfo_80 = BE_16(ptr);
     ptr += 2;
-    alac->setinfo_82 = BE_32(ptr);
+    alac->setinfo_82 = BE_32(ptr); // max coded frame size
     ptr += 4;
-    alac->setinfo_86 = BE_32(ptr);
+    alac->setinfo_86 = BE_32(ptr); // bitrate ?
     ptr += 4;
-    alac->setinfo_8a_rate = BE_32(ptr);
+    alac->setinfo_8a_rate = BE_32(ptr); // samplerate
     ptr += 4;
 
     allocate_buffers(alac);
+
+    return 0;
 }
 
 /* hideously inefficient. could use a bitmask search,
@@ -385,7 +407,7 @@ static void predictor_decompress_fir_adapt(int32_t *error_buffer,
     }
 }
 
-void deinterlace_16(int32_t *buffer_a, int32_t *buffer_b,
+static void deinterlace_16(int32_t *buffer_a, int32_t *buffer_b,
                     int16_t *buffer_out,
                     int numchannels, int numsamples,
                     uint8_t interlacing_shift,
@@ -444,7 +466,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
     /* initialize from the extradata */
     if (!alac->context_initialized) {
         if (alac->avctx->extradata_size != ALAC_EXTRADATA_SIZE) {
-            av_log(NULL, AV_LOG_ERROR, "alac: expected %d extradata bytes\n",
+            av_log(avctx, AV_LOG_ERROR, "alac: expected %d extradata bytes\n",
                 ALAC_EXTRADATA_SIZE);
             return input_buffer_size;
         }
@@ -500,7 +522,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
             int prediction_quantitization;
             int i;
 
-            /* skip 16 bits, not sure what they are. seem to be used in
+            /* FIXME: skip 16 bits, not sure what they are. seem to be used in
              * two channel case */
             get_bits(&alac->gb, 8);
             get_bits(&alac->gb, 8);
@@ -520,7 +542,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
                 /* these bytes seem to have something to do with
                  * > 2 channel files.
                  */
-                av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented, unhandling of wasted_bytes\n");
+                av_log(avctx, AV_LOG_ERROR, "FIXME: unimplemented, unhandling of wasted_bytes\n");
             }
 
             bastardized_rice_decompress(alac,
@@ -542,7 +564,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
                                                predictor_coef_num,
                                                prediction_quantitization);
             } else {
-                av_log(NULL, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type);
+                av_log(avctx, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type);
                 /* i think the only other prediction type (or perhaps this is just a
                  * boolean?) runs adaptive fir twice.. like:
                  * predictor_decompress_fir_adapt(predictor_error, tempout, ...)
@@ -586,7 +608,6 @@ static int alac_decode_frame(AVCodecContext *avctx,
             int i;
             for (i = 0; i < outputsamples; i++) {
                 int16_t sample = alac->outputsamples_buffer_a[i];
-                sample = be2me_16(sample);
                 ((int16_t*)outbuffer)[i * alac->numchannels] = sample;
             }
             break;
@@ -594,7 +615,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
         case 20:
         case 24:
         case 32:
-            av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented sample size %i\n", alac->setinfo_sample_size);
+            av_log(avctx, AV_LOG_ERROR, "FIXME: unimplemented sample size %i\n", alac->setinfo_sample_size);
             break;
         default:
             break;
@@ -679,7 +700,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
             /*********************/
             if (wasted_bytes) {
               /* see mono case */
-                av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented, unhandling of wasted_bytes\n");
+                av_log(avctx, AV_LOG_ERROR, "FIXME: unimplemented, unhandling of wasted_bytes\n");
             }
 
             /* channel 1 */
@@ -703,7 +724,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
                                                prediction_quantitization_a);
             } else {
               /* see mono case */
-                av_log(NULL, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type_a);
+                av_log(avctx, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type_a);
             }
 
             /* channel 2 */
@@ -726,7 +747,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
                                                predictor_coef_num_b,
                                                prediction_quantitization_b);
             } else {
-                av_log(NULL, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type_b);
+                av_log(avctx, AV_LOG_ERROR, "FIXME: unhandled prediction type: %i\n", prediction_type_b);
             }
         } else {
          /* not compressed, easy case */
@@ -782,7 +803,7 @@ static int alac_decode_frame(AVCodecContext *avctx,
         case 20:
         case 24:
         case 32:
-            av_log(NULL, AV_LOG_ERROR, "FIXME: unimplemented sample size %i\n", alac->setinfo_sample_size);
+            av_log(avctx, AV_LOG_ERROR, "FIXME: unimplemented sample size %i\n", alac->setinfo_sample_size);
             break;
         default:
             break;
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
index d555b874c..29ba9dc02 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
@@ -2,19 +2,19 @@
  * Alpha optimized DSP utils
  * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /*
diff --git a/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S b/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S
index 276d310ef..e043f4371 100644
--- a/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S
+++ b/src/libffmpeg/libavcodec/alpha/motion_est_mvi_asm.S
@@ -2,19 +2,19 @@
  * Alpha optimized DSP utils
  * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful,
+ * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "regdef.h"
diff --git a/src/libffmpeg/libavcodec/asv1.c b/src/libffmpeg/libavcodec/asv1.c
index 1cb15d812..3cfb76e65 100644
--- a/src/libffmpeg/libavcodec/asv1.c
+++ b/src/libffmpeg/libavcodec/asv1.c
@@ -289,6 +289,7 @@ static inline void asv2_encode_block(ASV1Context *a, DCTELEM block[64]){
         if( (block[index + 1] = (block[index + 1]*a->q_intra_matrix[index + 1] + (1<<15))>>16) ) ccp |= 2;
         if( (block[index + 9] = (block[index + 9]*a->q_intra_matrix[index + 9] + (1<<15))>>16) ) ccp |= 1;
 
+        assert(i || ccp<8);
         if(i) put_bits(&a->pb, ac_ccp_tab[ccp][1], ac_ccp_tab[ccp][0]);
         else  put_bits(&a->pb, dc_ccp_tab[ccp][1], dc_ccp_tab[ccp][0]);
 
diff --git a/src/libffmpeg/libavcodec/avcodec.h b/src/libffmpeg/libavcodec/avcodec.h
index 430504dc3..9be5dcf6e 100644
--- a/src/libffmpeg/libavcodec/avcodec.h
+++ b/src/libffmpeg/libavcodec/avcodec.h
@@ -14,15 +14,11 @@ extern "C" {
 #include "avutil.h"
 #include <sys/types.h> /* size_t */
 
-//FIXME the following 2 really dont belong in here
-#define FFMPEG_VERSION_INT      0x000409
-#define FFMPEG_VERSION          "CVS"
-
 #define AV_STRINGIFY(s)         AV_TOSTRING(s)
 #define AV_TOSTRING(s) #s
 
-#define LIBAVCODEC_VERSION_INT  ((51<<16)+(1<<8)+0)
-#define LIBAVCODEC_VERSION      51.1.0
+#define LIBAVCODEC_VERSION_INT  ((51<<16)+(11<<8)+0)
+#define LIBAVCODEC_VERSION      51.11.0
 #define LIBAVCODEC_BUILD        LIBAVCODEC_VERSION_INT
 
 #define LIBAVCODEC_IDENT        "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
@@ -112,7 +108,7 @@ enum CodecID {
     CODEC_ID_FFVHUFF,
     CODEC_ID_RV30,
     CODEC_ID_RV40,
-    CODEC_ID_VC9,
+    CODEC_ID_VC1,
     CODEC_ID_WMV3,
     CODEC_ID_LOCO,
     CODEC_ID_WNV1,
@@ -122,6 +118,14 @@ enum CodecID {
     CODEC_ID_TRUEMOTION2,
     CODEC_ID_BMP,
     CODEC_ID_CSCD,
+    CODEC_ID_MMVIDEO,
+    CODEC_ID_ZMBV,
+    CODEC_ID_AVS,
+    CODEC_ID_SMACKVIDEO,
+    CODEC_ID_NUV,
+    CODEC_ID_KMVC,
+    CODEC_ID_FLASHSV,
+    CODEC_ID_CAVS,
 
     /* various pcm "codecs" */
     CODEC_ID_PCM_S16LE= 0x10000,
@@ -158,6 +162,9 @@ enum CodecID {
     CODEC_ID_ADPCM_CT,
     CODEC_ID_ADPCM_SWF,
     CODEC_ID_ADPCM_YAMAHA,
+    CODEC_ID_ADPCM_SBPRO_4,
+    CODEC_ID_ADPCM_SBPRO_3,
+    CODEC_ID_ADPCM_SBPRO_2,
 
     /* AMR */
     CODEC_ID_AMR_NB= 0x12000,
@@ -198,8 +205,8 @@ enum CodecID {
     CODEC_ID_QDM2,
     CODEC_ID_COOK,
     CODEC_ID_TRUESPEECH,
-
-    CODEC_ID_OGGTHEORA= 0x16000,
+    CODEC_ID_TTA,
+    CODEC_ID_SMACKAUDIO,
 
     /* subtitle codecs */
     CODEC_ID_DVD_SUBTITLE= 0x17000,
@@ -220,61 +227,19 @@ enum CodecType {
     CODEC_TYPE_SUBTITLE,
 };
 
-/**
- * Pixel format. Notes:
- *
- * PIX_FMT_RGBA32 is handled in an endian-specific manner. A RGBA
- * color is put together as:
- *  (A << 24) | (R << 16) | (G << 8) | B
- * This is stored as BGRA on little endian CPU architectures and ARGB on
- * big endian CPUs.
- *
- * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized
- * image data is stored in AVFrame.data[0]. The palette is transported in
- * AVFrame.data[1] and, is 1024 bytes long (256 4-byte entries) and is
- * formatted the same as in PIX_FMT_RGBA32 described above (i.e., it is
- * also endian-specific). Note also that the individual RGB palette
- * components stored in AVFrame.data[1] should be in the range 0..255.
- * This is important as many custom PAL8 video codecs that were designed
- * to run on the IBM VGA graphics adapter use 6-bit palette components.
- */
-enum PixelFormat {
-    PIX_FMT_NONE= -1,
-    PIX_FMT_YUV420P,   ///< Planar YUV 4:2:0 (1 Cr & Cb sample per 2x2 Y samples)
-    PIX_FMT_YUV422,    ///< Packed pixel, Y0 Cb Y1 Cr
-    PIX_FMT_RGB24,     ///< Packed pixel, 3 bytes per pixel, RGBRGB...
-    PIX_FMT_BGR24,     ///< Packed pixel, 3 bytes per pixel, BGRBGR...
-    PIX_FMT_YUV422P,   ///< Planar YUV 4:2:2 (1 Cr & Cb sample per 2x1 Y samples)
-    PIX_FMT_YUV444P,   ///< Planar YUV 4:4:4 (1 Cr & Cb sample per 1x1 Y samples)
-    PIX_FMT_RGBA32,    ///< Packed pixel, 4 bytes per pixel, BGRABGRA..., stored in cpu endianness
-    PIX_FMT_YUV410P,   ///< Planar YUV 4:1:0 (1 Cr & Cb sample per 4x4 Y samples)
-    PIX_FMT_YUV411P,   ///< Planar YUV 4:1:1 (1 Cr & Cb sample per 4x1 Y samples)
-    PIX_FMT_RGB565,    ///< always stored in cpu endianness
-    PIX_FMT_RGB555,    ///< always stored in cpu endianness, most significant bit to 1
-    PIX_FMT_GRAY8,
-    PIX_FMT_MONOWHITE, ///< 0 is white
-    PIX_FMT_MONOBLACK, ///< 0 is black
-    PIX_FMT_PAL8,      ///< 8 bit with RGBA palette
-    PIX_FMT_YUVJ420P,  ///< Planar YUV 4:2:0 full scale (jpeg)
-    PIX_FMT_YUVJ422P,  ///< Planar YUV 4:2:2 full scale (jpeg)
-    PIX_FMT_YUVJ444P,  ///< Planar YUV 4:4:4 full scale (jpeg)
-    PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing(xvmc_render.h)
-    PIX_FMT_XVMC_MPEG2_IDCT,
-    PIX_FMT_UYVY422,   ///< Packed pixel, Cb Y0 Cr Y1
-    PIX_FMT_UYVY411,   ///< Packed pixel, Cb Y0 Y1 Cr Y2 Y3
-    PIX_FMT_NB,
-};
-
 /* currently unused, may be used if 24/32 bits samples ever supported */
+/* all in native endian */
 enum SampleFormat {
-    SAMPLE_FMT_S16 = 0,         ///< signed 16 bits
+    SAMPLE_FMT_NONE = -1,
+    SAMPLE_FMT_U8,              ///< unsigned 8 bits
+    SAMPLE_FMT_S16,             ///< signed 16 bits
+    SAMPLE_FMT_S24,             ///< signed 24 bits
     SAMPLE_FMT_S32,             ///< signed 32 bits
     SAMPLE_FMT_FLT,             ///< float
-    SAMPLE_FMT_DBL,             ///< double
 };
 
 /* in bytes */
-#define AVCODEC_MAX_AUDIO_FRAME_SIZE 131072
+#define AVCODEC_MAX_AUDIO_FRAME_SIZE 192000 // 1 second of 48khz 32bit audio
 
 /**
  * Required number of additionally allocated bytes at the end of the input bitstream for decoding.
@@ -321,10 +286,7 @@ typedef struct RcOverride{
     float quality_factor;
 } RcOverride;
 
-/* only for ME compatiblity with old apps */
-extern int motion_estimation_method;
-
-#define FF_MAX_B_FRAMES 8
+#define FF_MAX_B_FRAMES 16
 
 /* encoding support
    these flags can be passed in AVCodecContext.flags before initing
@@ -367,7 +329,7 @@ extern int motion_estimation_method;
 #define CODEC_FLAG_H263P_SLICE_STRUCT 0x10000000
 #define CODEC_FLAG_INTERLACED_ME  0x20000000 ///< interlaced motion estimation
 #define CODEC_FLAG_SVCD_SCAN_OFFSET 0x40000000 ///< will reserve space for SVCD scan offset user data
-#define CODEC_FLAG_CLOSED_GOP     0x80000000
+#define CODEC_FLAG_CLOSED_GOP     ((int)0x80000000)
 #define CODEC_FLAG2_FAST          0x00000001 ///< allow non spec compliant speedup tricks
 #define CODEC_FLAG2_STRICT_GOP    0x00000002 ///< strictly enforce GOP size
 #define CODEC_FLAG2_NO_OUTPUT     0x00000004 ///< skip bitstream encoding
@@ -379,6 +341,8 @@ extern int motion_estimation_method;
 #define CODEC_FLAG2_FASTPSKIP     0x00000100 ///< H.264 fast pskip
 #define CODEC_FLAG2_AUD           0x00000200 ///< H.264 access unit delimiters
 #define CODEC_FLAG2_BRDO          0x00000400 ///< b-frame rate-distortion optimization
+#define CODEC_FLAG2_INTRA_VLC     0x00000800 ///< use MPEG-2 intra VLC table
+#define CODEC_FLAG2_MEMC_ONLY     0x00001000 ///< only do ME/MC (I frames -> ref, P frame -> ME+MC)
 
 /* Unsupported options :
  *              Syntax Arithmetic coding (SAC)
@@ -404,6 +368,11 @@ extern int motion_estimation_method;
  * if this is not set, the codec is guranteed to never be feeded with NULL data
  */
 #define CODEC_CAP_DELAY           0x0020
+/**
+ * Codec can be fed a final frame with a smaller size.
+ * This can be used to prevent truncation of the last audio samples.
+ */
+#define CODEC_CAP_SMALL_LAST_FRAME 0x0040
 
 //the following defines may change, don't expect compatibility if you use them
 #define MB_TYPE_INTRA4x4   0x0001
@@ -697,19 +666,6 @@ typedef struct AVFrame {
 #define DEFAULT_FRAME_RATE_BASE 1001000
 
 /**
- * Used by av_log
- */
-typedef struct AVCLASS AVClass;
-struct AVCLASS {
-    const char* class_name;
-    const char* (*item_name)(void*); /* actually passing a pointer to an AVCodecContext
-                                        or AVFormatContext, which begin with an AVClass.
-                                        Needed because av_log is in libavcodec and has no visibility
-                                        of AVIn/OutputFormat */
-    struct AVOption *option;
-};
-
-/**
  * main external api structure.
  */
 typedef struct AVCodecContext {
@@ -1243,6 +1199,7 @@ typedef struct AVCodecContext {
 #define FF_IDCT_VP3          12
 #define FF_IDCT_IPP          13
 #define FF_IDCT_XVIDMMX      14
+#define FF_IDCT_CAVS         15
 
     /**
      * slice count.
@@ -2005,6 +1962,73 @@ typedef struct AVCodecContext {
      * - decoding: unused
      */
     int scenechange_factor;
+
+    /**
+     *
+     * note: value depends upon the compare functin used for fullpel ME
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int mv0_threshold;
+
+    /**
+     * adjusts sensitivity of b_frame_strategy 1
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int b_sensitivity;
+
+    /**
+     * - encoding: set by user.
+     * - decoding: unused
+     */
+    int compression_level;
+#define FF_COMPRESSION_DEFAULT -1
+
+    /**
+     * sets whether to use LPC mode - used by FLAC encoder
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int use_lpc;
+
+    /**
+     * LPC coefficient precision - used by FLAC encoder
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int lpc_coeff_precision;
+
+    /**
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int min_prediction_order;
+
+    /**
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int max_prediction_order;
+
+    /**
+     * search method for selecting prediction order
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int prediction_order_method;
+
+    /**
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int min_partition_order;
+
+    /**
+     * - encoding: set by user.
+     * - decoding: unused.
+     */
+    int max_partition_order;
 } AVCodecContext;
 
 /**
@@ -2083,8 +2107,8 @@ extern AVCodec ac3_encoder;
 extern AVCodec mp2_encoder;
 extern AVCodec mp3lame_encoder;
 extern AVCodec oggvorbis_encoder;
-extern AVCodec oggtheora_encoder;
 extern AVCodec faac_encoder;
+extern AVCodec flac_encoder;
 extern AVCodec xvid_encoder;
 extern AVCodec mpeg1video_encoder;
 extern AVCodec mpeg2video_encoder;
@@ -2133,7 +2157,7 @@ extern AVCodec msmpeg4v2_decoder;
 extern AVCodec msmpeg4v3_decoder;
 extern AVCodec wmv1_decoder;
 extern AVCodec wmv2_decoder;
-extern AVCodec vc9_decoder;
+extern AVCodec vc1_decoder;
 extern AVCodec wmv3_decoder;
 extern AVCodec mpeg1video_decoder;
 extern AVCodec mpeg2video_decoder;
@@ -2161,12 +2185,12 @@ extern AVCodec mp3on4_decoder;
 extern AVCodec qdm2_decoder;
 extern AVCodec cook_decoder;
 extern AVCodec truespeech_decoder;
+extern AVCodec tta_decoder;
 extern AVCodec mace3_decoder;
 extern AVCodec mace6_decoder;
 extern AVCodec huffyuv_decoder;
 extern AVCodec ffvhuff_decoder;
 extern AVCodec oggvorbis_decoder;
-extern AVCodec oggtheora_decoder;
 extern AVCodec cyuv_decoder;
 extern AVCodec h264_decoder;
 extern AVCodec indeo3_decoder;
@@ -2215,6 +2239,7 @@ extern AVCodec qtrle_decoder;
 extern AVCodec flac_decoder;
 extern AVCodec tscc_decoder;
 extern AVCodec cscd_decoder;
+extern AVCodec nuv_decoder;
 extern AVCodec ulti_decoder;
 extern AVCodec qdraw_decoder;
 extern AVCodec xl_decoder;
@@ -2231,6 +2256,14 @@ extern AVCodec fraps_decoder;
 extern AVCodec libgsm_encoder;
 extern AVCodec libgsm_decoder;
 extern AVCodec bmp_decoder;
+extern AVCodec mmvideo_decoder;
+extern AVCodec zmbv_decoder;
+extern AVCodec avs_decoder;
+extern AVCodec smacker_decoder;
+extern AVCodec smackaud_decoder;
+extern AVCodec kmvc_decoder;
+extern AVCodec flashsv_decoder;
+extern AVCodec cavs_decoder;
 
 /* pcm codecs */
 #define PCM_CODEC(id, name) \
@@ -2272,6 +2305,9 @@ PCM_CODEC(CODEC_ID_ADPCM_G726, adpcm_g726);
 PCM_CODEC(CODEC_ID_ADPCM_CT, adpcm_ct);
 PCM_CODEC(CODEC_ID_ADPCM_SWF, adpcm_swf);
 PCM_CODEC(CODEC_ID_ADPCM_YAMAHA, adpcm_yamaha);
+PCM_CODEC(CODEC_ID_ADPCM_SBPRO_4, adpcm_sbpro_4);
+PCM_CODEC(CODEC_ID_ADPCM_SBPRO_3, adpcm_sbpro_3);
+PCM_CODEC(CODEC_ID_ADPCM_SBPRO_2, adpcm_sbpro_2);
 
 #undef PCM_CODEC
 
@@ -2454,6 +2490,10 @@ void avcodec_default_free_buffers(AVCodecContext *s);
  */
 char av_get_pict_type_char(int pict_type);
 
+/**
+ * returns codec bits per sample
+ */
+int av_get_bits_per_sample(enum CodecID codec_id);
 
 /* frame parsing */
 typedef struct AVCodecParserContext {
@@ -2514,6 +2554,7 @@ void av_parser_close(AVCodecParserContext *s);
 
 extern AVCodecParser mpegvideo_parser;
 extern AVCodecParser mpeg4video_parser;
+extern AVCodecParser cavsvideo_parser;
 extern AVCodecParser h261_parser;
 extern AVCodecParser h263_parser;
 extern AVCodecParser h264_parser;
@@ -2523,12 +2564,44 @@ extern AVCodecParser mpegaudio_parser;
 extern AVCodecParser ac3_parser;
 extern AVCodecParser dvdsub_parser;
 extern AVCodecParser dvbsub_parser;
+extern AVCodecParser aac_parser;
+
+
+typedef struct AVBitStreamFilterContext {
+    void *priv_data;
+    struct AVBitStreamFilter *filter;
+    AVCodecParserContext *parser;
+    struct AVBitStreamFilterContext *next;
+} AVBitStreamFilterContext;
+
+
+typedef struct AVBitStreamFilter {
+    const char *name;
+    int priv_data_size;
+    int (*filter)(AVBitStreamFilterContext *bsfc,
+                  AVCodecContext *avctx, const char *args,
+                  uint8_t **poutbuf, int *poutbuf_size,
+                  const uint8_t *buf, int buf_size, int keyframe);
+    struct AVBitStreamFilter *next;
+} AVBitStreamFilter;
+
+extern AVBitStreamFilter *av_first_bitstream_filter;
+
+void av_register_bitstream_filter(AVBitStreamFilter *bsf);
+AVBitStreamFilterContext *av_bitstream_filter_init(const char *name);
+int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
+                               AVCodecContext *avctx, const char *args,
+                               uint8_t **poutbuf, int *poutbuf_size,
+                               const uint8_t *buf, int buf_size, int keyframe);
+void av_bitstream_filter_close(AVBitStreamFilterContext *bsf);
+
+extern AVBitStreamFilter dump_extradata_bsf;
+extern AVBitStreamFilter remove_extradata_bsf;
+extern AVBitStreamFilter noise_bsf;
+
 
 /* memory */
-void *av_malloc(unsigned int size);
 void *av_mallocz(unsigned int size);
-void *av_realloc(void *ptr, unsigned int size);
-void av_free(void *ptr);
 char *av_strdup(const char *s);
 void av_freep(void *ptr);
 void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
@@ -2538,31 +2611,14 @@ void av_free_static(void);
 void *av_mallocz_static(unsigned int size);
 void *av_realloc_static(void *ptr, unsigned int size);
 
-/* add by bero : in adx.c */
-int is_adx(const unsigned char *buf,size_t bufsize);
-
 void img_copy(AVPicture *dst, const AVPicture *src,
               int pix_fmt, int width, int height);
 
-/* av_log API */
-
-#include <stdarg.h>
-
-#define AV_LOG_QUIET -1
-#define AV_LOG_ERROR 0
-#define AV_LOG_INFO 1
-#define AV_LOG_DEBUG 2
-
-#ifdef __GNUC__
-extern void av_log(void*, int level, const char *fmt, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
-#else
-extern void av_log(void*, int level, const char *fmt, ...);
-#endif
+int img_crop(AVPicture *dst, const AVPicture *src,
+             int pix_fmt, int top_band, int left_band);
 
-extern void av_vlog(void*, int level, const char *fmt, va_list);
-extern int av_log_get_level(void);
-extern void av_log_set_level(int);
-extern void av_log_set_callback(void (*)(void*, int, const char*, va_list));
+int img_pad(AVPicture *dst, const AVPicture *src, int height, int width, int pix_fmt,
+            int padtop, int padbottom, int padleft, int padright, int *color);
 
 /* endian macros */
 #if !defined(BE_16) || !defined(BE_32) || !defined(LE_16) || !defined(LE_32)
diff --git a/src/libffmpeg/libavcodec/bitstream.h b/src/libffmpeg/libavcodec/bitstream.h
index 4a3d55d19..10db64d33 100644
--- a/src/libffmpeg/libavcodec/bitstream.h
+++ b/src/libffmpeg/libavcodec/bitstream.h
@@ -135,31 +135,40 @@ typedef struct RL_VLC_ELEM {
     uint8_t run;
 } RL_VLC_ELEM;
 
-#if defined(ARCH_SPARC) || defined(ARCH_ARMV4L)
+#if defined(ARCH_SPARC) || defined(ARCH_ARMV4L) || defined(ARCH_MIPS)
 #define UNALIGNED_STORES_ARE_BAD
 #endif
 
 /* used to avoid missaligned exceptions on some archs (alpha, ...) */
 #if defined(ARCH_X86) || defined(ARCH_X86_64)
+#    define unaligned16(a) (*(const uint16_t*)(a))
 #    define unaligned32(a) (*(const uint32_t*)(a))
+#    define unaligned64(a) (*(const uint64_t*)(a))
 #else
 #    ifdef __GNUC__
-static inline uint32_t unaligned32(const void *v) {
-    struct Unaligned {
-        uint32_t i;
-    } __attribute__((packed));
-
-    return ((const struct Unaligned *) v)->i;
+#    define unaligned(x)                                \
+static inline uint##x##_t unaligned##x(const void *v) { \
+    struct Unaligned {                                  \
+        uint##x##_t i;                                  \
+    } __attribute__((packed));                          \
+                                                        \
+    return ((const struct Unaligned *) v)->i;           \
 }
 #    elif defined(__DECC)
-static inline uint32_t unaligned32(const void *v) {
-    return *(const __unaligned uint32_t *) v;
+#    define unaligned(x)                                        \
+static inline uint##x##_t unaligned##x##(const void *v) {       \
+    return *(const __unaligned uint##x##_t *) v;                \
 }
 #    else
-static inline uint32_t unaligned32(const void *v) {
-    return *(const uint32_t *) v;
+#    define unaligned(x)                                        \
+static inline uint##x##_t unaligned##x##(const void *v) {       \
+    return *(const uint##x##_t *) v;                            \
 }
 #    endif
+unaligned(16)
+unaligned(32)
+unaligned(64)
+#undef unaligned
 #endif //!ARCH_X86
 
 #ifndef ALT_BITSTREAM_WRITER
@@ -168,9 +177,6 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     unsigned int bit_buf;
     int bit_left;
 
-#ifdef STATS
-    st_out_bit_counts[st_current_index] += n;
-#endif
     //    printf("put_bits=%d %x\n", n, value);
     assert(n == 32 || value < (1U << n));
 
@@ -574,21 +580,15 @@ static inline int get_bits_count(GetBitContext *s){
  * @author BERO
  */
 static inline int get_xbits(GetBitContext *s, int n){
-    register int tmp;
+    register int sign;
     register int32_t cache;
     OPEN_READER(re, s)
     UPDATE_CACHE(re, s)
     cache = GET_CACHE(re,s);
-    if ((int32_t)cache<0) { //MSB=1
-        tmp = NEG_USR32(cache,n);
-    } else {
-    //   tmp = (-1<<n) | NEG_USR32(cache,n) + 1; mpeg12.c algo
-    //   tmp = - (NEG_USR32(cache,n) ^ ((1 << n) - 1)); h263.c algo
-        tmp = - NEG_USR32(~cache,n);
-    }
+    sign=(~cache)>>31;
     LAST_SKIP_BITS(re, s, n)
     CLOSE_READER(re, s)
-    return tmp;
+    return (NEG_USR32(sign ^ cache, n) ^ sign) - sign;
 }
 
 static inline int get_sbits(GetBitContext *s, int n){
diff --git a/src/libffmpeg/libavcodec/cabac.h b/src/libffmpeg/libavcodec/cabac.h
index 2e4ec7083..e79774157 100644
--- a/src/libffmpeg/libavcodec/cabac.h
+++ b/src/libffmpeg/libavcodec/cabac.h
@@ -24,7 +24,7 @@
  */
 
 
-#undef NDEBUG
+//#undef NDEBUG
 #include <assert.h>
 
 #define CABAC_BITS 8
diff --git a/src/libffmpeg/libavcodec/dpcm.c b/src/libffmpeg/libavcodec/dpcm.c
index c920cb403..df9da9489 100644
--- a/src/libffmpeg/libavcodec/dpcm.c
+++ b/src/libffmpeg/libavcodec/dpcm.c
@@ -41,7 +41,7 @@ typedef struct DPCMContext {
     int channels;
     short roq_square_array[256];
     long sample[2];//for SOL_DPCM
-    int *sol_table;//for SOL_DPCM
+    const int *sol_table;//for SOL_DPCM
 } DPCMContext;
 
 #define SATURATE_S16(x)  if (x < -32768) x = -32768; \
@@ -84,15 +84,15 @@ static int interplay_delta_table[] = {
 
 };
 
-static int sol_table_old[16] =
+static const int sol_table_old[16] =
     { 0x0,  0x1,  0x2 , 0x3,  0x6,  0xA,  0xF, 0x15,
     -0x15, -0xF, -0xA, -0x6, -0x3, -0x2, -0x1, 0x0};
 
-static int sol_table_new[16] =
+static const int sol_table_new[16] =
     { 0x0,  0x1,  0x2,  0x3,  0x6,  0xA,  0xF,  0x15,
       0x0, -0x1, -0x2, -0x3, -0x6, -0xA, -0xF, -0x15};
 
-static int sol_table_16[128] = {
+static const int sol_table_16[128] = {
     0x000, 0x008, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x080,
     0x090, 0x0A0, 0x0B0, 0x0C0, 0x0D0, 0x0E0, 0x0F0, 0x100, 0x110, 0x120,
     0x130, 0x140, 0x150, 0x160, 0x170, 0x180, 0x190, 0x1A0, 0x1B0, 0x1C0,
diff --git a/src/libffmpeg/libavcodec/dsputil.c b/src/libffmpeg/libavcodec/dsputil.c
index 3931c3978..9b79b8659 100644
--- a/src/libffmpeg/libavcodec/dsputil.c
+++ b/src/libffmpeg/libavcodec/dsputil.c
@@ -30,6 +30,7 @@
 #include "mpegvideo.h"
 #include "simple_idct.h"
 #include "faandct.h"
+#include "snow.h"
 
 /* snow.c */
 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
@@ -291,35 +292,34 @@ static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 }
 
 
+#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
-#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
     int s, i, j;
     const int dec_count= w==8 ? 3 : 4;
-    int tmp[16*16];
-#if 0
+    int tmp[32*32];
     int level, ori;
     static const int scale[2][2][4][4]={
       {
         {
-            //8x8 dec=3
+            // 9/7 8x8 dec=3
             {268, 239, 239, 213},
             {  0, 224, 224, 152},
             {  0, 135, 135, 110},
         },{
-            //16x16 dec=4
+            // 9/7 16x16 or 32x32 dec=4
             {344, 310, 310, 280},
             {  0, 320, 320, 228},
             {  0, 175, 175, 136},
             {  0, 129, 129, 102},
         }
       },{
-        {//FIXME 5/3
-            //8x8 dec=3
+        {
+            // 5/3 8x8 dec=3
             {275, 245, 245, 218},
             {  0, 230, 230, 156},
             {  0, 138, 138, 113},
         },{
-            //16x16 dec=4
+            // 5/3 16x16 or 32x32 dec=4
             {352, 317, 317, 286},
             {  0, 328, 328, 233},
             {  0, 180, 180, 140},
@@ -327,29 +327,28 @@ static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
         }
       }
     };
-#endif
 
     for (i = 0; i < h; i++) {
         for (j = 0; j < w; j+=4) {
-            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
-            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
-            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
-            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
+            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
+            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
+            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
+            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
         }
         pix1 += line_size;
         pix2 += line_size;
     }
 
-    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
+    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 
     s=0;
-#if 0
+    assert(w==h);
     for(level=0; level<dec_count; level++){
         for(ori= level ? 1 : 0; ori<4; ori++){
-            int sx= (ori&1) ? 1<<level: 0;
-            int stride= 16<<(dec_count-level);
+            int size= w>>(dec_count-level);
+            int sx= (ori&1) ? size : 0;
+            int stride= 32<<(dec_count-level);
             int sy= (ori&2) ? stride>>1 : 0;
-            int size= 1<<level;
 
             for(i=0; i<size; i++){
                 for(j=0; j<size; j++){
@@ -359,21 +358,8 @@ static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
             }
         }
     }
-#endif
-    for (i = 0; i < h; i++) {
-        for (j = 0; j < w; j+=4) {
-            s+= ABS(tmp[16*i+j+0]);
-            s+= ABS(tmp[16*i+j+1]);
-            s+= ABS(tmp[16*i+j+2]);
-            s+= ABS(tmp[16*i+j+3]);
-        }
-    }
     assert(s>=0);
-
-    return s>>2;
-#else
-    return 0;
-#endif
+    return s>>9;
 }
 
 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
@@ -392,6 +378,15 @@ static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 }
 
+int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
+    return w_c(v, pix1, pix2, line_size, 32, h, 1);
+}
+
+int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
+    return w_c(v, pix1, pix2, line_size, 32, h, 0);
+}
+#endif
+
 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 {
     int i;
@@ -1145,7 +1140,7 @@ static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y
     }
 }
 
-static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 {
     int y, vx, vy;
@@ -2575,6 +2570,33 @@ static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int
     }
 }
 
+#ifdef CONFIG_CAVS_DECODER
+/* AVS specific */
+void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
+
+void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+    put_pixels8_c(dst, src, stride, 8);
+}
+void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+    avg_pixels8_c(dst, src, stride, 8);
+}
+void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+    put_pixels16_c(dst, src, stride, 16);
+}
+void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+    avg_pixels16_c(dst, src, stride, 16);
+}
+#endif /* CONFIG_CAVS_DECODER */
+
+#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
+/* VC-1 specific */
+void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
+
+void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
+    put_pixels8_c(dst, src, stride, 8);
+}
+#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
+
 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
     uint8_t *cm = cropTbl + MAX_NEG_CROP;
     int i;
@@ -3217,12 +3239,14 @@ void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
         case FF_CMP_NSSE:
             cmp[i]= c->nsse[i];
             break;
+#ifdef CONFIG_SNOW_ENCODER
         case FF_CMP_W53:
             cmp[i]= c->w53[i];
             break;
         case FF_CMP_W97:
             cmp[i]= c->w97[i];
             break;
+#endif
         default:
             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
         }
@@ -3774,6 +3798,8 @@ static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
 }
 
+static void just_return() { return; }
+
 /* init static data */
 void dsputil_static_init(void)
 {
@@ -3853,6 +3879,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
 
     c->h264_idct_add= ff_h264_idct_add_c;
     c->h264_idct8_add= ff_h264_idct8_add_c;
+    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
+    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
 
     c->get_pixels = get_pixels_c;
     c->diff_pixels = diff_pixels_c;
@@ -3862,7 +3890,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->add_pixels8 = add_pixels8_c;
     c->add_pixels4 = add_pixels4_c;
     c->gmc1 = gmc1_c;
-    c->gmc = gmc_c;
+    c->gmc = ff_gmc_c;
     c->clear_blocks = clear_blocks_c;
     c->pix_sum = pix_sum_c;
     c->pix_norm1 = pix_norm1_c;
@@ -3988,6 +4016,13 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
 
+#ifdef CONFIG_CAVS_DECODER
+    ff_cavsdsp_init(c,avctx);
+#endif
+#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
+    ff_vc1dsp_init(c,avctx);
+#endif
+
     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
@@ -4022,10 +4057,12 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vsse[4]= vsse_intra16_c;
     c->nsse[0]= nsse16_c;
     c->nsse[1]= nsse8_c;
+#ifdef CONFIG_SNOW_ENCODER
     c->w53[0]= w53_16_c;
     c->w53[1]= w53_8_c;
     c->w97[0]= w97_16_c;
     c->w97[1]= w97_8_c;
+#endif
 
     c->add_bytes= add_bytes_c;
     c->diff_bytes= diff_bytes_c;
@@ -4047,6 +4084,19 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->try_8x8basis= try_8x8basis_c;
     c->add_8x8basis= add_8x8basis_c;
 
+#ifdef CONFIG_SNOW_ENCODER
+    c->vertical_compose97i = ff_snow_vertical_compose97i;
+    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
+    c->inner_add_yblock = ff_snow_inner_add_yblock;
+#endif
+
+    c->shrink[0]= ff_img_copy_plane;
+    c->shrink[1]= ff_shrink22;
+    c->shrink[2]= ff_shrink44;
+    c->shrink[3]= ff_shrink88;
+
+    c->prefetch= just_return;
+
 #ifdef HAVE_MMX
     dsputil_init_mmx(c, avctx);
 #endif
diff --git a/src/libffmpeg/libavcodec/dsputil.h b/src/libffmpeg/libavcodec/dsputil.h
index dc3bc01e8..df7830564 100644
--- a/src/libffmpeg/libavcodec/dsputil.h
+++ b/src/libffmpeg/libavcodec/dsputil.h
@@ -38,6 +38,7 @@
 //#define DEBUG
 /* dct code */
 typedef short DCTELEM;
+typedef int DWTELEM;
 
 void fdct_ifast (DCTELEM *data);
 void fdct_ifast248 (DCTELEM *data);
@@ -55,6 +56,8 @@ void ff_fdct_sse2(DCTELEM *block);
 
 void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
 
@@ -76,6 +79,15 @@ void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 
+/* 1/2^n downscaling functions from imgconvert.c */
+void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+
+void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
+
 /* minimum alignment rules ;)
 if u notice errors in the align stuff, need more alignment for some asm code for some cpu
 or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ...
@@ -134,6 +146,9 @@ static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
 
 
+// for snow slices
+typedef struct slice_buffer_s slice_buffer;
+
 /**
  * DSPContext.
  */
@@ -262,6 +277,15 @@ typedef struct DSPContext {
     h264_weight_func weight_h264_pixels_tab[10];
     h264_biweight_func biweight_h264_pixels_tab[10];
 
+    /* AVS specific */
+    qpel_mc_func put_cavs_qpel_pixels_tab[2][16];
+    qpel_mc_func avg_cavs_qpel_pixels_tab[2][16];
+    void (*cavs_filter_lv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
+    void (*cavs_filter_lh)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
+    void (*cavs_filter_cv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
+    void (*cavs_filter_ch)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
+    void (*cavs_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
+
     me_cmp_func pix_abs[2][4];
 
     /* huffyuv specific */
@@ -333,6 +357,29 @@ typedef struct DSPContext {
 
     void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
     void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
+
+    /* snow wavelet */
+    void (*vertical_compose97i)(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+    void (*horizontal_compose97i)(DWTELEM *b, int width);
+    void (*inner_add_yblock)(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+    void (*prefetch)(void *mem, int stride, int h);
+
+    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+
+    /* vc1 functions */
+    void (*vc1_inv_trans_8x8)(DCTELEM *b);
+    void (*vc1_inv_trans_8x4)(DCTELEM *b, int n);
+    void (*vc1_inv_trans_4x8)(DCTELEM *b, int n);
+    void (*vc1_inv_trans_4x4)(DCTELEM *b, int n);
+    void (*vc1_v_overlap)(uint8_t* src, int stride, int rnd);
+    void (*vc1_h_overlap)(uint8_t* src, int stride, int rnd);
+    /* put 8x8 block with bicubic interpolation and quarterpel precision
+     * last argument is actually round value instead of height
+     */
+    op_pixels_func put_vc1_mspel_pixels_tab[16];
 } DSPContext;
 
 void dsputil_static_init(void);
@@ -409,6 +456,7 @@ int mm_support(void);
 #define MM_SSE    0x0008 /* SSE functions */
 #define MM_SSE2   0x0010 /* PIV SSE2 functions */
 #define MM_3DNOWEXT  0x0020 /* AMD 3DNowExt */
+#define MM_SSE3   0x0040 /* Prescott SSE3 functions */
 
 extern int mm_flags;
 
@@ -531,6 +579,7 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
 #define LD32(a) (*((uint32_t*)(a)))
 #define LD64(a) (*((uint64_t*)(a)))
 
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
 #define ST32(a, b) *((uint32_t*)(a)) = (b)
 
 #endif /* !__GNUC__ */
@@ -563,6 +612,8 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse);
 void ff_fft_permute(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 
 static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
diff --git a/src/libffmpeg/libavcodec/dv.c b/src/libffmpeg/libavcodec/dv.c
index 08611a900..c39d70c54 100644
--- a/src/libffmpeg/libavcodec/dv.c
+++ b/src/libffmpeg/libavcodec/dv.c
@@ -6,6 +6,9 @@
  * DV encoder
  * Copyright (c) 2003 Roman Shaposhnik.
  *
+ * 50 Mbps (DVCPRO50) support
+ * Copyright (c) 2006 Daniel Maas <dmaas@maasdigital.com>
+ *
  * Many thanks to Dan Dennedy <dan@dennedy.org> for providing wealth
  * of DV technical info.
  *
@@ -51,8 +54,12 @@ typedef struct DVVideoContext {
     void (*idct_put[2])(uint8_t *dest, int line_size, DCTELEM *block);
 } DVVideoContext;
 
-/* MultiThreading - applies to entire DV codec, not just the avcontext */
-uint8_t** dv_anchor;
+/* MultiThreading - dv_anchor applies to entire DV codec, not just the avcontext */
+/* one element is needed for each video segment in a DV frame */
+/* at most there are 2 DIF channels * 12 DIF sequences * 27 video segments (PAL 50Mbps) */
+#define DV_ANCHOR_SIZE (2*12*27)
+
+static void* dv_anchor[DV_ANCHOR_SIZE];
 
 #define TEX_VLC_BITS 9
 
@@ -118,11 +125,7 @@ static int dvvideo_init(AVCodecContext *avctx)
             return -ENOMEM;
 
         /* dv_anchor lets each thread know its Id */
-        dv_anchor = av_malloc(12*27*sizeof(void*));
-        if (!dv_anchor) {
-            return -ENOMEM;
-        }
-        for (i=0; i<12*27; i++)
+        for (i=0; i<DV_ANCHOR_SIZE; i++)
             dv_anchor[i] = (void*)(size_t)i;
 
         /* it's faster to include sign bit in a generic VLC parsing scheme */
@@ -238,9 +241,6 @@ static int dvvideo_init(AVCodecContext *avctx)
     /* XXX: do it only for constant case */
     dv_build_unquantize_tables(s, dsp.idct_permutation);
 
-    /* FIXME: I really don't think this should be here */
-    if (dv_codec_profile(avctx))
-        avctx->pix_fmt = dv_codec_profile(avctx)->pix_fmt;
     avctx->coded_frame = &s->picture;
     s->avctx= avctx;
 
@@ -253,6 +253,7 @@ static int dvvideo_init(AVCodecContext *avctx)
 typedef struct BlockInfo {
     const uint8_t *shift_table;
     const uint8_t *scan_table;
+    const int *iweight_table;
     uint8_t pos; /* position in block */
     uint8_t dct_mode;
     uint8_t partial_bit_count;
@@ -295,6 +296,7 @@ static void dv_decode_ac(GetBitContext *gb, BlockInfo *mb, DCTELEM *block)
     int last_index = get_bits_size(gb);
     const uint8_t *scan_table = mb->scan_table;
     const uint8_t *shift_table = mb->shift_table;
+    const int *iweight_table = mb->iweight_table;
     int pos = mb->pos;
     int partial_bit_count = mb->partial_bit_count;
     int level, pos1, run, vlc_len, index;
@@ -342,9 +344,13 @@ static void dv_decode_ac(GetBitContext *gb, BlockInfo *mb, DCTELEM *block)
         if (pos >= 64)
             break;
 
-        assert(level);
         pos1 = scan_table[pos];
-        block[pos1] = level << shift_table[pos1];
+        level <<= shift_table[pos1];
+
+        /* unweigh, round, and shift down */
+        level = (level*iweight_table[pos] + (1 << (dv_iweight_bits-1))) >> dv_iweight_bits;
+
+        block[pos1] = level;
 
         UPDATE_CACHE(re, gb);
     }
@@ -410,6 +416,7 @@ static inline void dv_decode_video_segment(DVVideoContext *s,
             dct_mode = get_bits1(&gb);
             mb->dct_mode = dct_mode;
             mb->scan_table = s->dv_zigzag[dct_mode];
+            mb->iweight_table = dct_mode ? dv_iweight_248 : dv_iweight_88;
             class1 = get_bits(&gb, 2);
             mb->shift_table = s->dv_idct_shift[class1 == 3][dct_mode]
                 [quant + dv_quant_offset[class1]];
@@ -488,45 +495,63 @@ static inline void dv_decode_video_segment(DVVideoContext *s,
         v = *mb_pos_ptr++;
         mb_x = v & 0xff;
         mb_y = v >> 8;
-        y_ptr = s->picture.data[0] + ((mb_y * s->picture.linesize[0] + mb_x)<<log2_blocksize);
-        if (s->sys->pix_fmt == PIX_FMT_YUV411P)
+        if (s->sys->pix_fmt == PIX_FMT_YUV422P) {
+            y_ptr = s->picture.data[0] + ((mb_y * s->picture.linesize[0] + (mb_x>>1))<<log2_blocksize);
             c_offset = ((mb_y * s->picture.linesize[1] + (mb_x >> 2))<<log2_blocksize);
-        else
-            c_offset = (((mb_y >> 1) * s->picture.linesize[1] + (mb_x >> 1))<<log2_blocksize);
+        } else { /* 4:1:1 or 4:2:0 */
+            y_ptr = s->picture.data[0] + ((mb_y * s->picture.linesize[0] + mb_x)<<log2_blocksize);
+            if (s->sys->pix_fmt == PIX_FMT_YUV411P)
+                c_offset = ((mb_y * s->picture.linesize[1] + (mb_x >> 2))<<log2_blocksize);
+            else /* 4:2:0 */
+                c_offset = (((mb_y >> 1) * s->picture.linesize[1] + (mb_x >> 1))<<log2_blocksize);
+        }
         for(j = 0;j < 6; j++) {
             idct_put = s->idct_put[mb->dct_mode && log2_blocksize==3];
-            if (j < 4) {
-                if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x < (704 / 8)) {
-                    /* NOTE: at end of line, the macroblock is handled as 420 */
-                    idct_put(y_ptr + (j<<log2_blocksize), s->picture.linesize[0], block);
-                } else {
-                    idct_put(y_ptr + (((j & 1) + (j >> 1) * s->picture.linesize[0])<<log2_blocksize),
+            if (s->sys->pix_fmt == PIX_FMT_YUV422P) { /* 4:2:2 */
+                if (j == 0 || j == 2) {
+                    /* Y0 Y1 */
+                    idct_put(y_ptr + ((j >> 1)<<log2_blocksize),
                              s->picture.linesize[0], block);
+                } else if(j > 3) {
+                    /* Cr Cb */
+                    idct_put(s->picture.data[6 - j] + c_offset,
+                             s->picture.linesize[6 - j], block);
                 }
-            } else {
-                if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x >= (704 / 8)) {
-                    uint64_t aligned_pixels[64/8];
-                    uint8_t *pixels= (uint8_t*)aligned_pixels;
-                    uint8_t *c_ptr, *c_ptr1, *ptr, *ptr1;
-                    int x, y, linesize;
-                    /* NOTE: at end of line, the macroblock is handled as 420 */
-                    idct_put(pixels, 8, block);
-                    linesize = s->picture.linesize[6 - j];
-                    c_ptr = s->picture.data[6 - j] + c_offset;
-                    ptr = pixels;
-                    for(y = 0;y < (1<<log2_blocksize); y++) {
-                        ptr1= ptr + (1<<(log2_blocksize-1));
-                        c_ptr1 = c_ptr + (linesize<<log2_blocksize);
-                        for(x=0; x < (1<<(log2_blocksize-1)); x++){
-                            c_ptr[x]= ptr[x]; c_ptr1[x]= ptr1[x];
-                        }
-                        c_ptr += linesize;
-                        ptr += 8;
+                /* note: j=1 and j=3 are "dummy" blocks in 4:2:2 */
+            } else { /* 4:1:1 or 4:2:0 */
+                if (j < 4) {
+                    if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x < (704 / 8)) {
+                        /* NOTE: at end of line, the macroblock is handled as 420 */
+                        idct_put(y_ptr + (j<<log2_blocksize), s->picture.linesize[0], block);
+                    } else {
+                        idct_put(y_ptr + (((j & 1) + (j >> 1) * s->picture.linesize[0])<<log2_blocksize),
+                                 s->picture.linesize[0], block);
                     }
                 } else {
-                    /* don't ask me why they inverted Cb and Cr ! */
-                    idct_put(s->picture.data[6 - j] + c_offset,
-                             s->picture.linesize[6 - j], block);
+                    if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x >= (704 / 8)) {
+                        uint64_t aligned_pixels[64/8];
+                        uint8_t *pixels= (uint8_t*)aligned_pixels;
+                        uint8_t *c_ptr, *c_ptr1, *ptr, *ptr1;
+                        int x, y, linesize;
+                        /* NOTE: at end of line, the macroblock is handled as 420 */
+                        idct_put(pixels, 8, block);
+                        linesize = s->picture.linesize[6 - j];
+                        c_ptr = s->picture.data[6 - j] + c_offset;
+                        ptr = pixels;
+                        for(y = 0;y < (1<<log2_blocksize); y++) {
+                            ptr1= ptr + (1<<(log2_blocksize-1));
+                            c_ptr1 = c_ptr + (linesize<<log2_blocksize);
+                            for(x=0; x < (1<<(log2_blocksize-1)); x++){
+                                c_ptr[x]= ptr[x]; c_ptr1[x]= ptr1[x];
+                            }
+                            c_ptr += linesize;
+                            ptr += 8;
+                        }
+                    } else {
+                        /* don't ask me why they inverted Cb and Cr ! */
+                        idct_put(s->picture.data[6 - j] + c_offset,
+                                 s->picture.linesize[6 - j], block);
+                    }
                 }
             }
             block += 64;
@@ -648,11 +673,25 @@ static always_inline PutBitContext* dv_encode_ac(EncBlockInfo* bi, PutBitContext
 }
 
 static always_inline void dv_set_class_number(DCTELEM* blk, EncBlockInfo* bi,
-                                              const uint8_t* zigzag_scan, int bias)
+                                              const uint8_t* zigzag_scan, const int *weight, int bias)
 {
     int i, area;
+    /* We offer two different methods for class number assignment: the
+       method suggested in SMPTE 314M Table 22, and an improved
+       method. The SMPTE method is very conservative; it assigns class
+       3 (i.e. severe quantization) to any block where the largest AC
+       component is greater than 36. ffmpeg's DV encoder tracks AC bit
+       consumption precisely, so there is no need to bias most blocks
+       towards strongly lossy compression. Instead, we assign class 2
+       to most blocks, and use class 3 only when strictly necessary
+       (for blocks whose largest AC component exceeds 255). */
+
+#if 0 /* SMPTE spec method */
     static const int classes[] = {12, 24, 36, 0xffff};
-    int max=12;
+#else /* improved ffmpeg method */
+    static const int classes[] = {-1, -1, 255, 0xffff};
+#endif
+    int max=classes[0];
     int prev=0;
 
     bi->mb[0] = blk[0];
@@ -665,7 +704,11 @@ static always_inline void dv_set_class_number(DCTELEM* blk, EncBlockInfo* bi,
 
           if (level+15 > 30U) {
               bi->sign[i] = (level>>31)&1;
-              bi->mb[i] = level= ABS(level)>>4;
+              /* weigh it and and shift down into range, adding for rounding */
+              /* the extra division by a factor of 2^4 reverses the 8x expansion of the DCT
+                 AND the 2x doubling of the weights */
+              level = (ABS(level) * weight[i] + (1<<(dv_weight_bits+3))) >> (dv_weight_bits+4);
+              bi->mb[i] = level;
               if(level>max) max= level;
               bi->bit_size[area] += dv_rl2vlc_size(i - prev  - 1, level);
               bi->next[prev]= i;
@@ -728,9 +771,10 @@ static always_inline int dv_guess_dct_mode(DCTELEM *blk) {
 static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos)
 {
     int size[5];
-    int i, j, k, a, prev;
+    int i, j, k, a, prev, a2;
     EncBlockInfo* b;
 
+    size[0] = size[1] = size[2] = size[3] = size[4] = 1<<24;
     do {
        b = blks;
        for (i=0; i<5; i++) {
@@ -745,12 +789,23 @@ static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos)
                     b->bit_size[a] = 1; // 4 areas 4 bits for EOB :)
                     b->area_q[a]++;
                     prev= b->prev[a];
+                    assert(b->next[prev] >= mb_area_start[a+1] || b->mb[prev]);
                     for (k= b->next[prev] ; k<mb_area_start[a+1]; k= b->next[k]) {
                        b->mb[k] >>= 1;
                        if (b->mb[k]) {
                            b->bit_size[a] += dv_rl2vlc_size(k - prev - 1, b->mb[k]);
                            prev= k;
                        } else {
+                           if(b->next[k] >= mb_area_start[a+1] && b->next[k]<64){
+                                for(a2=a+1; b->next[k] >= mb_area_start[a2+1]; a2++)
+                                    b->prev[a2] = prev;
+                                assert(a2<4);
+                                assert(b->mb[b->next[k]]);
+                                b->bit_size[a2] += dv_rl2vlc_size(b->next[k] - prev - 1, b->mb[b->next[k]])
+                                                  -dv_rl2vlc_size(b->next[k] -    k - 1, b->mb[b->next[k]]);
+                                assert(b->prev[a2]==k && (a2+1 >= 4 || b->prev[a2+1]!=k));
+                                b->prev[a2] = prev;
+                           }
                            b->next[prev] = b->next[k];
                        }
                     }
@@ -759,16 +814,29 @@ static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos)
                 size[i] += b->bit_size[a];
              }
           }
+          if(vs_total_ac_bits >= size[0] + size[1] + size[2] + size[3] + size[4])
+                return;
        }
-    } while ((vs_total_ac_bits < size[0] + size[1] + size[2] + size[3] + size[4]) &&
-             (qnos[0]|qnos[1]|qnos[2]|qnos[3]|qnos[4]));
+    } while (qnos[0]|qnos[1]|qnos[2]|qnos[3]|qnos[4]);
+
+
+    for(a=2; a==2 || vs_total_ac_bits < size[0]; a+=a){
+        b = blks;
+        size[0] = 5*6*4; //EOB
+        for (j=0; j<6*5; j++, b++) {
+            prev= b->prev[0];
+            for (k= b->next[prev]; k<64; k= b->next[k]) {
+                if(b->mb[k] < a && b->mb[k] > -a){
+                    b->next[prev] = b->next[k];
+                }else{
+                    size[0] += dv_rl2vlc_size(k - prev - 1, b->mb[k]);
+                    prev= k;
+                }
+            }
+        }
+    }
 }
 
-/*
- * This is a very rough initial implementaion. The performance is
- * horrible and the weighting is missing. But it's missing from the
- * decoding step also -- so at least we're on the same page with decoder ;-)
- */
 static inline void dv_encode_video_segment(DVVideoContext *s,
                                            uint8_t *dif,
                                            const uint16_t *mb_pos_ptr)
@@ -795,28 +863,52 @@ static inline void dv_encode_video_segment(DVVideoContext *s,
         v = *mb_pos_ptr++;
         mb_x = v & 0xff;
         mb_y = v >> 8;
-        y_ptr = s->picture.data[0] + (mb_y * s->picture.linesize[0] * 8) + (mb_x * 8);
-        c_offset = (s->sys->pix_fmt == PIX_FMT_YUV411P) ?
-                   ((mb_y * s->picture.linesize[1] * 8) + ((mb_x >> 2) * 8)) :
-                   (((mb_y >> 1) * s->picture.linesize[1] * 8) + ((mb_x >> 1) * 8));
+        if (s->sys->pix_fmt == PIX_FMT_YUV422P) {
+            y_ptr = s->picture.data[0] + (mb_y * s->picture.linesize[0] * 8) + (mb_x * 4);
+        } else { /* 4:1:1 */
+            y_ptr = s->picture.data[0] + (mb_y * s->picture.linesize[0] * 8) + (mb_x * 8);
+        }
+        if (s->sys->pix_fmt == PIX_FMT_YUV420P) {
+            c_offset = (((mb_y >> 1) * s->picture.linesize[1] * 8) + ((mb_x >> 1) * 8));
+        } else { /* 4:2:2 or 4:1:1 */
+            c_offset = ((mb_y * s->picture.linesize[1] * 8) + ((mb_x >> 2) * 8));
+        }
         do_edge_wrap = 0;
         qnos[mb_index] = 15; /* No quantization */
         ptr = dif + mb_index*80 + 4;
         for(j = 0;j < 6; j++) {
-            if (j < 4) {  /* Four Y blocks */
-                /* NOTE: at end of line, the macroblock is handled as 420 */
-                if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x < (704 / 8)) {
-                    data = y_ptr + (j * 8);
+            int dummy = 0;
+            if (s->sys->pix_fmt == PIX_FMT_YUV422P) { /* 4:2:2 */
+                if (j == 0 || j == 2) {
+                    /* Y0 Y1 */
+                    data = y_ptr + ((j>>1) * 8);
+                    linesize = s->picture.linesize[0];
+                } else if (j > 3) {
+                    /* Cr Cb */
+                    data = s->picture.data[6 - j] + c_offset;
+                    linesize = s->picture.linesize[6 - j];
                 } else {
-                    data = y_ptr + ((j & 1) * 8) + ((j >> 1) * 8 * s->picture.linesize[0]);
+                    /* j=1 and j=3 are "dummy" blocks, used for AC data only */
+                    data = 0;
+                    linesize = 0;
+                    dummy = 1;
+                }
+            } else { /* 4:1:1 or 4:2:0 */
+                if (j < 4) {  /* Four Y blocks */
+                    /* NOTE: at end of line, the macroblock is handled as 420 */
+                    if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x < (704 / 8)) {
+                        data = y_ptr + (j * 8);
+                    } else {
+                        data = y_ptr + ((j & 1) * 8) + ((j >> 1) * 8 * s->picture.linesize[0]);
+                    }
+                    linesize = s->picture.linesize[0];
+                } else {      /* Cr and Cb blocks */
+                    /* don't ask Fabrice why they inverted Cb and Cr ! */
+                    data = s->picture.data[6 - j] + c_offset;
+                    linesize = s->picture.linesize[6 - j];
+                    if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x >= (704 / 8))
+                        do_edge_wrap = 1;
                 }
-                linesize = s->picture.linesize[0];
-            } else {      /* Cr and Cb blocks */
-                /* don't ask Fabrice why they inverted Cb and Cr ! */
-                data = s->picture.data[6 - j] + c_offset;
-                linesize = s->picture.linesize[6 - j];
-                if (s->sys->pix_fmt == PIX_FMT_YUV411P && mb_x >= (704 / 8))
-                    do_edge_wrap = 1;
             }
 
             /* Everything is set up -- now just copy data -> DCT block */
@@ -831,7 +923,8 @@ static inline void dv_encode_video_segment(DVVideoContext *s,
                    b += 8;
                 }
             } else {             /* Simple copy: 8x8 -> 8x8 */
-                s->get_pixels(block, data, linesize);
+                if (!dummy)
+                    s->get_pixels(block, data, linesize);
             }
 
             if(s->avctx->flags & CODEC_FLAG_INTERLACED_DCT)
@@ -843,10 +936,18 @@ static inline void dv_encode_video_segment(DVVideoContext *s,
             enc_blk->partial_bit_buffer = 0;
             enc_blk->cur_ac = 0;
 
-            s->fdct[enc_blk->dct_mode](block);
+            if (dummy) {
+                /* We rely on the fact that encoding all zeros leads to an immediate EOB,
+                   which is precisely what the spec calls for in the "dummy" blocks. */
+                memset(block, 0, sizeof(block));
+            } else {
+                s->fdct[enc_blk->dct_mode](block);
+            }
 
             dv_set_class_number(block, enc_blk,
-                                enc_blk->dct_mode ? ff_zigzag248_direct : ff_zigzag_direct, j/4);
+                                enc_blk->dct_mode ? ff_zigzag248_direct : ff_zigzag_direct,
+                                enc_blk->dct_mode ? dv_weight_248 : dv_weight_88,
+                                j/4);
 
             init_put_bits(pb, ptr, block_sizes[j]/8);
             put_bits(pb, 9, (uint16_t)(((enc_blk->mb[0] >> 3) - 1024 + 2) >> 2));
@@ -886,6 +987,8 @@ static inline void dv_encode_video_segment(DVVideoContext *s,
     for (j=0; j<5*6; j++) {
        if (enc_blks[j].partial_bit_count)
            pb=dv_encode_ac(&enc_blks[j], pb, &pbs[6*5]);
+       if (enc_blks[j].partial_bit_count)
+            av_log(NULL, AV_LOG_ERROR, "ac bitstream overflow\n");
     }
 
     for (j=0; j<5*6; j++)
@@ -896,7 +999,17 @@ static int dv_decode_mt(AVCodecContext *avctx, void* sl)
 {
     DVVideoContext *s = avctx->priv_data;
     int slice = (size_t)sl;
-    dv_decode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80],
+
+    /* which DIF channel is this? */
+    int chan = slice / (s->sys->difseg_size * 27);
+
+    /* slice within the DIF channel */
+    int chan_slice = slice % (s->sys->difseg_size * 27);
+
+    /* byte offset of this channel's data */
+    int chan_offset = chan * s->sys->difseg_size * 150 * 80;
+
+    dv_decode_video_segment(s, &s->buf[((chan_slice/27)*6+(chan_slice/3)+chan_slice*5+7)*80 + chan_offset],
                             &s->sys->video_place[slice*5]);
     return 0;
 }
@@ -905,13 +1018,23 @@ static int dv_encode_mt(AVCodecContext *avctx, void* sl)
 {
     DVVideoContext *s = avctx->priv_data;
     int slice = (size_t)sl;
-    dv_encode_video_segment(s, &s->buf[((slice/27)*6+(slice/3)+slice*5+7)*80],
+
+    /* which DIF channel is this? */
+    int chan = slice / (s->sys->difseg_size * 27);
+
+    /* slice within the DIF channel */
+    int chan_slice = slice % (s->sys->difseg_size * 27);
+
+    /* byte offset of this channel's data */
+    int chan_offset = chan * s->sys->difseg_size * 150 * 80;
+
+    dv_encode_video_segment(s, &s->buf[((chan_slice/27)*6+(chan_slice/3)+chan_slice*5+7)*80 + chan_offset],
                             &s->sys->video_place[slice*5]);
     return 0;
 }
 
 /* NOTE: exactly one frame must be given (120000 bytes for NTSC,
-   144000 bytes for PAL) */
+   144000 bytes for PAL - or twice those for 50Mbps) */
 static int dvvideo_decode_frame(AVCodecContext *avctx,
                                  void *data, int *data_size,
                                  uint8_t *buf, int buf_size)
@@ -939,7 +1062,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
 
     s->buf = buf;
     avctx->execute(avctx, dv_decode_mt, (void**)&dv_anchor[0], NULL,
-                   s->sys->difseg_size * 27);
+                   s->sys->n_difchan * s->sys->difseg_size * 27);
 
     emms_c();
 
@@ -968,9 +1091,23 @@ static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size,
 
     s->buf = buf;
     c->execute(c, dv_encode_mt, (void**)&dv_anchor[0], NULL,
-               s->sys->difseg_size * 27);
+               s->sys->n_difchan * s->sys->difseg_size * 27);
 
     emms_c();
+
+    /* Fill in just enough of the header for dv_frame_profile() to
+       return the correct result, so that the frame can be decoded
+       correctly. The rest of the metadata is filled in by the dvvideo
+       avformat. (this should probably change so that encode_frame()
+       fills in ALL of the metadata - e.g. for Quicktime-wrapped DV
+       streams) */
+
+    /* NTSC/PAL format */
+    buf[3] = s->sys->dsf ? 0x80 : 0x00;
+
+    /* 25Mbps or 50Mbps */
+    buf[80*5 + 48 + 3] = (s->sys->pix_fmt == PIX_FMT_YUV422P) ? 0x4 : 0x0;
+
     return s->sys->frame_size;
 }
 
diff --git a/src/libffmpeg/libavcodec/dvdata.h b/src/libffmpeg/libavcodec/dvdata.h
index f817ead2a..a3d42d66c 100644
--- a/src/libffmpeg/libavcodec/dvdata.h
+++ b/src/libffmpeg/libavcodec/dvdata.h
@@ -31,7 +31,8 @@
 typedef struct DVprofile {
     int              dsf;                 /* value of the dsf in the DV header */
     int              frame_size;          /* total size of one frame in bytes */
-    int              difseg_size;         /* number of DIF segments */
+    int              difseg_size;         /* number of DIF segments per DIF channel */
+    int              n_difchan;           /* number of DIF channels per frame */
     int              frame_rate;
     int              frame_rate_base;
     int              ltc_divisor;         /* FPS from the LTS standpoint */
@@ -1256,6 +1257,1251 @@ static const uint16_t dv_place_411[1350] = {
  0x0834, 0x2320, 0x2f44, 0x3810, 0x1658,
 };
 
+/* 4:2:2 macroblock placement tables created by dvtables.py */
+
+/* 2 channels per frame, 10 DIF sequences per channel,
+   27 video segments per DIF sequence, 5 macroblocks per video segment */
+static const uint16_t dv_place_422_525[2*10*27*5] = {
+ 0x0c48, 0x2424, 0x306c, 0x0000, 0x1890,
+ 0x0d48, 0x2524, 0x316c, 0x0100, 0x1990,
+ 0x0e48, 0x2624, 0x326c, 0x0200, 0x1a90,
+ 0x0e4c, 0x2628, 0x3270, 0x0204, 0x1a94,
+ 0x0d4c, 0x2528, 0x3170, 0x0104, 0x1994,
+ 0x0c4c, 0x2428, 0x3070, 0x0004, 0x1894,
+ 0x0c50, 0x242c, 0x3074, 0x0008, 0x1898,
+ 0x0d50, 0x252c, 0x3174, 0x0108, 0x1998,
+ 0x0e50, 0x262c, 0x3274, 0x0208, 0x1a98,
+ 0x0e54, 0x2630, 0x3278, 0x020c, 0x1a9c,
+ 0x0d54, 0x2530, 0x3178, 0x010c, 0x199c,
+ 0x0c54, 0x2430, 0x3078, 0x000c, 0x189c,
+ 0x0c58, 0x2434, 0x307c, 0x0010, 0x18a0,
+ 0x0d58, 0x2534, 0x317c, 0x0110, 0x19a0,
+ 0x0e58, 0x2634, 0x327c, 0x0210, 0x1aa0,
+ 0x0e5c, 0x2638, 0x3280, 0x0214, 0x1aa4,
+ 0x0d5c, 0x2538, 0x3180, 0x0114, 0x19a4,
+ 0x0c5c, 0x2438, 0x3080, 0x0014, 0x18a4,
+ 0x0c60, 0x243c, 0x3084, 0x0018, 0x18a8,
+ 0x0d60, 0x253c, 0x3184, 0x0118, 0x19a8,
+ 0x0e60, 0x263c, 0x3284, 0x0218, 0x1aa8,
+ 0x0e64, 0x2640, 0x3288, 0x021c, 0x1aac,
+ 0x0d64, 0x2540, 0x3188, 0x011c, 0x19ac,
+ 0x0c64, 0x2440, 0x3088, 0x001c, 0x18ac,
+ 0x0c68, 0x2444, 0x308c, 0x0020, 0x18b0,
+ 0x0d68, 0x2544, 0x318c, 0x0120, 0x19b0,
+ 0x0e68, 0x2644, 0x328c, 0x0220, 0x1ab0,
+ 0x1248, 0x2a24, 0x366c, 0x0600, 0x1e90,
+ 0x1348, 0x2b24, 0x376c, 0x0700, 0x1f90,
+ 0x1448, 0x2c24, 0x386c, 0x0800, 0x2090,
+ 0x144c, 0x2c28, 0x3870, 0x0804, 0x2094,
+ 0x134c, 0x2b28, 0x3770, 0x0704, 0x1f94,
+ 0x124c, 0x2a28, 0x3670, 0x0604, 0x1e94,
+ 0x1250, 0x2a2c, 0x3674, 0x0608, 0x1e98,
+ 0x1350, 0x2b2c, 0x3774, 0x0708, 0x1f98,
+ 0x1450, 0x2c2c, 0x3874, 0x0808, 0x2098,
+ 0x1454, 0x2c30, 0x3878, 0x080c, 0x209c,
+ 0x1354, 0x2b30, 0x3778, 0x070c, 0x1f9c,
+ 0x1254, 0x2a30, 0x3678, 0x060c, 0x1e9c,
+ 0x1258, 0x2a34, 0x367c, 0x0610, 0x1ea0,
+ 0x1358, 0x2b34, 0x377c, 0x0710, 0x1fa0,
+ 0x1458, 0x2c34, 0x387c, 0x0810, 0x20a0,
+ 0x145c, 0x2c38, 0x3880, 0x0814, 0x20a4,
+ 0x135c, 0x2b38, 0x3780, 0x0714, 0x1fa4,
+ 0x125c, 0x2a38, 0x3680, 0x0614, 0x1ea4,
+ 0x1260, 0x2a3c, 0x3684, 0x0618, 0x1ea8,
+ 0x1360, 0x2b3c, 0x3784, 0x0718, 0x1fa8,
+ 0x1460, 0x2c3c, 0x3884, 0x0818, 0x20a8,
+ 0x1464, 0x2c40, 0x3888, 0x081c, 0x20ac,
+ 0x1364, 0x2b40, 0x3788, 0x071c, 0x1fac,
+ 0x1264, 0x2a40, 0x3688, 0x061c, 0x1eac,
+ 0x1268, 0x2a44, 0x368c, 0x0620, 0x1eb0,
+ 0x1368, 0x2b44, 0x378c, 0x0720, 0x1fb0,
+ 0x1468, 0x2c44, 0x388c, 0x0820, 0x20b0,
+ 0x1848, 0x3024, 0x006c, 0x0c00, 0x2490,
+ 0x1948, 0x3124, 0x016c, 0x0d00, 0x2590,
+ 0x1a48, 0x3224, 0x026c, 0x0e00, 0x2690,
+ 0x1a4c, 0x3228, 0x0270, 0x0e04, 0x2694,
+ 0x194c, 0x3128, 0x0170, 0x0d04, 0x2594,
+ 0x184c, 0x3028, 0x0070, 0x0c04, 0x2494,
+ 0x1850, 0x302c, 0x0074, 0x0c08, 0x2498,
+ 0x1950, 0x312c, 0x0174, 0x0d08, 0x2598,
+ 0x1a50, 0x322c, 0x0274, 0x0e08, 0x2698,
+ 0x1a54, 0x3230, 0x0278, 0x0e0c, 0x269c,
+ 0x1954, 0x3130, 0x0178, 0x0d0c, 0x259c,
+ 0x1854, 0x3030, 0x0078, 0x0c0c, 0x249c,
+ 0x1858, 0x3034, 0x007c, 0x0c10, 0x24a0,
+ 0x1958, 0x3134, 0x017c, 0x0d10, 0x25a0,
+ 0x1a58, 0x3234, 0x027c, 0x0e10, 0x26a0,
+ 0x1a5c, 0x3238, 0x0280, 0x0e14, 0x26a4,
+ 0x195c, 0x3138, 0x0180, 0x0d14, 0x25a4,
+ 0x185c, 0x3038, 0x0080, 0x0c14, 0x24a4,
+ 0x1860, 0x303c, 0x0084, 0x0c18, 0x24a8,
+ 0x1960, 0x313c, 0x0184, 0x0d18, 0x25a8,
+ 0x1a60, 0x323c, 0x0284, 0x0e18, 0x26a8,
+ 0x1a64, 0x3240, 0x0288, 0x0e1c, 0x26ac,
+ 0x1964, 0x3140, 0x0188, 0x0d1c, 0x25ac,
+ 0x1864, 0x3040, 0x0088, 0x0c1c, 0x24ac,
+ 0x1868, 0x3044, 0x008c, 0x0c20, 0x24b0,
+ 0x1968, 0x3144, 0x018c, 0x0d20, 0x25b0,
+ 0x1a68, 0x3244, 0x028c, 0x0e20, 0x26b0,
+ 0x1e48, 0x3624, 0x066c, 0x1200, 0x2a90,
+ 0x1f48, 0x3724, 0x076c, 0x1300, 0x2b90,
+ 0x2048, 0x3824, 0x086c, 0x1400, 0x2c90,
+ 0x204c, 0x3828, 0x0870, 0x1404, 0x2c94,
+ 0x1f4c, 0x3728, 0x0770, 0x1304, 0x2b94,
+ 0x1e4c, 0x3628, 0x0670, 0x1204, 0x2a94,
+ 0x1e50, 0x362c, 0x0674, 0x1208, 0x2a98,
+ 0x1f50, 0x372c, 0x0774, 0x1308, 0x2b98,
+ 0x2050, 0x382c, 0x0874, 0x1408, 0x2c98,
+ 0x2054, 0x3830, 0x0878, 0x140c, 0x2c9c,
+ 0x1f54, 0x3730, 0x0778, 0x130c, 0x2b9c,
+ 0x1e54, 0x3630, 0x0678, 0x120c, 0x2a9c,
+ 0x1e58, 0x3634, 0x067c, 0x1210, 0x2aa0,
+ 0x1f58, 0x3734, 0x077c, 0x1310, 0x2ba0,
+ 0x2058, 0x3834, 0x087c, 0x1410, 0x2ca0,
+ 0x205c, 0x3838, 0x0880, 0x1414, 0x2ca4,
+ 0x1f5c, 0x3738, 0x0780, 0x1314, 0x2ba4,
+ 0x1e5c, 0x3638, 0x0680, 0x1214, 0x2aa4,
+ 0x1e60, 0x363c, 0x0684, 0x1218, 0x2aa8,
+ 0x1f60, 0x373c, 0x0784, 0x1318, 0x2ba8,
+ 0x2060, 0x383c, 0x0884, 0x1418, 0x2ca8,
+ 0x2064, 0x3840, 0x0888, 0x141c, 0x2cac,
+ 0x1f64, 0x3740, 0x0788, 0x131c, 0x2bac,
+ 0x1e64, 0x3640, 0x0688, 0x121c, 0x2aac,
+ 0x1e68, 0x3644, 0x068c, 0x1220, 0x2ab0,
+ 0x1f68, 0x3744, 0x078c, 0x1320, 0x2bb0,
+ 0x2068, 0x3844, 0x088c, 0x1420, 0x2cb0,
+ 0x2448, 0x0024, 0x0c6c, 0x1800, 0x3090,
+ 0x2548, 0x0124, 0x0d6c, 0x1900, 0x3190,
+ 0x2648, 0x0224, 0x0e6c, 0x1a00, 0x3290,
+ 0x264c, 0x0228, 0x0e70, 0x1a04, 0x3294,
+ 0x254c, 0x0128, 0x0d70, 0x1904, 0x3194,
+ 0x244c, 0x0028, 0x0c70, 0x1804, 0x3094,
+ 0x2450, 0x002c, 0x0c74, 0x1808, 0x3098,
+ 0x2550, 0x012c, 0x0d74, 0x1908, 0x3198,
+ 0x2650, 0x022c, 0x0e74, 0x1a08, 0x3298,
+ 0x2654, 0x0230, 0x0e78, 0x1a0c, 0x329c,
+ 0x2554, 0x0130, 0x0d78, 0x190c, 0x319c,
+ 0x2454, 0x0030, 0x0c78, 0x180c, 0x309c,
+ 0x2458, 0x0034, 0x0c7c, 0x1810, 0x30a0,
+ 0x2558, 0x0134, 0x0d7c, 0x1910, 0x31a0,
+ 0x2658, 0x0234, 0x0e7c, 0x1a10, 0x32a0,
+ 0x265c, 0x0238, 0x0e80, 0x1a14, 0x32a4,
+ 0x255c, 0x0138, 0x0d80, 0x1914, 0x31a4,
+ 0x245c, 0x0038, 0x0c80, 0x1814, 0x30a4,
+ 0x2460, 0x003c, 0x0c84, 0x1818, 0x30a8,
+ 0x2560, 0x013c, 0x0d84, 0x1918, 0x31a8,
+ 0x2660, 0x023c, 0x0e84, 0x1a18, 0x32a8,
+ 0x2664, 0x0240, 0x0e88, 0x1a1c, 0x32ac,
+ 0x2564, 0x0140, 0x0d88, 0x191c, 0x31ac,
+ 0x2464, 0x0040, 0x0c88, 0x181c, 0x30ac,
+ 0x2468, 0x0044, 0x0c8c, 0x1820, 0x30b0,
+ 0x2568, 0x0144, 0x0d8c, 0x1920, 0x31b0,
+ 0x2668, 0x0244, 0x0e8c, 0x1a20, 0x32b0,
+ 0x2a48, 0x0624, 0x126c, 0x1e00, 0x3690,
+ 0x2b48, 0x0724, 0x136c, 0x1f00, 0x3790,
+ 0x2c48, 0x0824, 0x146c, 0x2000, 0x3890,
+ 0x2c4c, 0x0828, 0x1470, 0x2004, 0x3894,
+ 0x2b4c, 0x0728, 0x1370, 0x1f04, 0x3794,
+ 0x2a4c, 0x0628, 0x1270, 0x1e04, 0x3694,
+ 0x2a50, 0x062c, 0x1274, 0x1e08, 0x3698,
+ 0x2b50, 0x072c, 0x1374, 0x1f08, 0x3798,
+ 0x2c50, 0x082c, 0x1474, 0x2008, 0x3898,
+ 0x2c54, 0x0830, 0x1478, 0x200c, 0x389c,
+ 0x2b54, 0x0730, 0x1378, 0x1f0c, 0x379c,
+ 0x2a54, 0x0630, 0x1278, 0x1e0c, 0x369c,
+ 0x2a58, 0x0634, 0x127c, 0x1e10, 0x36a0,
+ 0x2b58, 0x0734, 0x137c, 0x1f10, 0x37a0,
+ 0x2c58, 0x0834, 0x147c, 0x2010, 0x38a0,
+ 0x2c5c, 0x0838, 0x1480, 0x2014, 0x38a4,
+ 0x2b5c, 0x0738, 0x1380, 0x1f14, 0x37a4,
+ 0x2a5c, 0x0638, 0x1280, 0x1e14, 0x36a4,
+ 0x2a60, 0x063c, 0x1284, 0x1e18, 0x36a8,
+ 0x2b60, 0x073c, 0x1384, 0x1f18, 0x37a8,
+ 0x2c60, 0x083c, 0x1484, 0x2018, 0x38a8,
+ 0x2c64, 0x0840, 0x1488, 0x201c, 0x38ac,
+ 0x2b64, 0x0740, 0x1388, 0x1f1c, 0x37ac,
+ 0x2a64, 0x0640, 0x1288, 0x1e1c, 0x36ac,
+ 0x2a68, 0x0644, 0x128c, 0x1e20, 0x36b0,
+ 0x2b68, 0x0744, 0x138c, 0x1f20, 0x37b0,
+ 0x2c68, 0x0844, 0x148c, 0x2020, 0x38b0,
+ 0x3048, 0x0c24, 0x186c, 0x2400, 0x0090,
+ 0x3148, 0x0d24, 0x196c, 0x2500, 0x0190,
+ 0x3248, 0x0e24, 0x1a6c, 0x2600, 0x0290,
+ 0x324c, 0x0e28, 0x1a70, 0x2604, 0x0294,
+ 0x314c, 0x0d28, 0x1970, 0x2504, 0x0194,
+ 0x304c, 0x0c28, 0x1870, 0x2404, 0x0094,
+ 0x3050, 0x0c2c, 0x1874, 0x2408, 0x0098,
+ 0x3150, 0x0d2c, 0x1974, 0x2508, 0x0198,
+ 0x3250, 0x0e2c, 0x1a74, 0x2608, 0x0298,
+ 0x3254, 0x0e30, 0x1a78, 0x260c, 0x029c,
+ 0x3154, 0x0d30, 0x1978, 0x250c, 0x019c,
+ 0x3054, 0x0c30, 0x1878, 0x240c, 0x009c,
+ 0x3058, 0x0c34, 0x187c, 0x2410, 0x00a0,
+ 0x3158, 0x0d34, 0x197c, 0x2510, 0x01a0,
+ 0x3258, 0x0e34, 0x1a7c, 0x2610, 0x02a0,
+ 0x325c, 0x0e38, 0x1a80, 0x2614, 0x02a4,
+ 0x315c, 0x0d38, 0x1980, 0x2514, 0x01a4,
+ 0x305c, 0x0c38, 0x1880, 0x2414, 0x00a4,
+ 0x3060, 0x0c3c, 0x1884, 0x2418, 0x00a8,
+ 0x3160, 0x0d3c, 0x1984, 0x2518, 0x01a8,
+ 0x3260, 0x0e3c, 0x1a84, 0x2618, 0x02a8,
+ 0x3264, 0x0e40, 0x1a88, 0x261c, 0x02ac,
+ 0x3164, 0x0d40, 0x1988, 0x251c, 0x01ac,
+ 0x3064, 0x0c40, 0x1888, 0x241c, 0x00ac,
+ 0x3068, 0x0c44, 0x188c, 0x2420, 0x00b0,
+ 0x3168, 0x0d44, 0x198c, 0x2520, 0x01b0,
+ 0x3268, 0x0e44, 0x1a8c, 0x2620, 0x02b0,
+ 0x3648, 0x1224, 0x1e6c, 0x2a00, 0x0690,
+ 0x3748, 0x1324, 0x1f6c, 0x2b00, 0x0790,
+ 0x3848, 0x1424, 0x206c, 0x2c00, 0x0890,
+ 0x384c, 0x1428, 0x2070, 0x2c04, 0x0894,
+ 0x374c, 0x1328, 0x1f70, 0x2b04, 0x0794,
+ 0x364c, 0x1228, 0x1e70, 0x2a04, 0x0694,
+ 0x3650, 0x122c, 0x1e74, 0x2a08, 0x0698,
+ 0x3750, 0x132c, 0x1f74, 0x2b08, 0x0798,
+ 0x3850, 0x142c, 0x2074, 0x2c08, 0x0898,
+ 0x3854, 0x1430, 0x2078, 0x2c0c, 0x089c,
+ 0x3754, 0x1330, 0x1f78, 0x2b0c, 0x079c,
+ 0x3654, 0x1230, 0x1e78, 0x2a0c, 0x069c,
+ 0x3658, 0x1234, 0x1e7c, 0x2a10, 0x06a0,
+ 0x3758, 0x1334, 0x1f7c, 0x2b10, 0x07a0,
+ 0x3858, 0x1434, 0x207c, 0x2c10, 0x08a0,
+ 0x385c, 0x1438, 0x2080, 0x2c14, 0x08a4,
+ 0x375c, 0x1338, 0x1f80, 0x2b14, 0x07a4,
+ 0x365c, 0x1238, 0x1e80, 0x2a14, 0x06a4,
+ 0x3660, 0x123c, 0x1e84, 0x2a18, 0x06a8,
+ 0x3760, 0x133c, 0x1f84, 0x2b18, 0x07a8,
+ 0x3860, 0x143c, 0x2084, 0x2c18, 0x08a8,
+ 0x3864, 0x1440, 0x2088, 0x2c1c, 0x08ac,
+ 0x3764, 0x1340, 0x1f88, 0x2b1c, 0x07ac,
+ 0x3664, 0x1240, 0x1e88, 0x2a1c, 0x06ac,
+ 0x3668, 0x1244, 0x1e8c, 0x2a20, 0x06b0,
+ 0x3768, 0x1344, 0x1f8c, 0x2b20, 0x07b0,
+ 0x3868, 0x1444, 0x208c, 0x2c20, 0x08b0,
+ 0x0048, 0x1824, 0x246c, 0x3000, 0x0c90,
+ 0x0148, 0x1924, 0x256c, 0x3100, 0x0d90,
+ 0x0248, 0x1a24, 0x266c, 0x3200, 0x0e90,
+ 0x024c, 0x1a28, 0x2670, 0x3204, 0x0e94,
+ 0x014c, 0x1928, 0x2570, 0x3104, 0x0d94,
+ 0x004c, 0x1828, 0x2470, 0x3004, 0x0c94,
+ 0x0050, 0x182c, 0x2474, 0x3008, 0x0c98,
+ 0x0150, 0x192c, 0x2574, 0x3108, 0x0d98,
+ 0x0250, 0x1a2c, 0x2674, 0x3208, 0x0e98,
+ 0x0254, 0x1a30, 0x2678, 0x320c, 0x0e9c,
+ 0x0154, 0x1930, 0x2578, 0x310c, 0x0d9c,
+ 0x0054, 0x1830, 0x2478, 0x300c, 0x0c9c,
+ 0x0058, 0x1834, 0x247c, 0x3010, 0x0ca0,
+ 0x0158, 0x1934, 0x257c, 0x3110, 0x0da0,
+ 0x0258, 0x1a34, 0x267c, 0x3210, 0x0ea0,
+ 0x025c, 0x1a38, 0x2680, 0x3214, 0x0ea4,
+ 0x015c, 0x1938, 0x2580, 0x3114, 0x0da4,
+ 0x005c, 0x1838, 0x2480, 0x3014, 0x0ca4,
+ 0x0060, 0x183c, 0x2484, 0x3018, 0x0ca8,
+ 0x0160, 0x193c, 0x2584, 0x3118, 0x0da8,
+ 0x0260, 0x1a3c, 0x2684, 0x3218, 0x0ea8,
+ 0x0264, 0x1a40, 0x2688, 0x321c, 0x0eac,
+ 0x0164, 0x1940, 0x2588, 0x311c, 0x0dac,
+ 0x0064, 0x1840, 0x2488, 0x301c, 0x0cac,
+ 0x0068, 0x1844, 0x248c, 0x3020, 0x0cb0,
+ 0x0168, 0x1944, 0x258c, 0x3120, 0x0db0,
+ 0x0268, 0x1a44, 0x268c, 0x3220, 0x0eb0,
+ 0x0648, 0x1e24, 0x2a6c, 0x3600, 0x1290,
+ 0x0748, 0x1f24, 0x2b6c, 0x3700, 0x1390,
+ 0x0848, 0x2024, 0x2c6c, 0x3800, 0x1490,
+ 0x084c, 0x2028, 0x2c70, 0x3804, 0x1494,
+ 0x074c, 0x1f28, 0x2b70, 0x3704, 0x1394,
+ 0x064c, 0x1e28, 0x2a70, 0x3604, 0x1294,
+ 0x0650, 0x1e2c, 0x2a74, 0x3608, 0x1298,
+ 0x0750, 0x1f2c, 0x2b74, 0x3708, 0x1398,
+ 0x0850, 0x202c, 0x2c74, 0x3808, 0x1498,
+ 0x0854, 0x2030, 0x2c78, 0x380c, 0x149c,
+ 0x0754, 0x1f30, 0x2b78, 0x370c, 0x139c,
+ 0x0654, 0x1e30, 0x2a78, 0x360c, 0x129c,
+ 0x0658, 0x1e34, 0x2a7c, 0x3610, 0x12a0,
+ 0x0758, 0x1f34, 0x2b7c, 0x3710, 0x13a0,
+ 0x0858, 0x2034, 0x2c7c, 0x3810, 0x14a0,
+ 0x085c, 0x2038, 0x2c80, 0x3814, 0x14a4,
+ 0x075c, 0x1f38, 0x2b80, 0x3714, 0x13a4,
+ 0x065c, 0x1e38, 0x2a80, 0x3614, 0x12a4,
+ 0x0660, 0x1e3c, 0x2a84, 0x3618, 0x12a8,
+ 0x0760, 0x1f3c, 0x2b84, 0x3718, 0x13a8,
+ 0x0860, 0x203c, 0x2c84, 0x3818, 0x14a8,
+ 0x0864, 0x2040, 0x2c88, 0x381c, 0x14ac,
+ 0x0764, 0x1f40, 0x2b88, 0x371c, 0x13ac,
+ 0x0664, 0x1e40, 0x2a88, 0x361c, 0x12ac,
+ 0x0668, 0x1e44, 0x2a8c, 0x3620, 0x12b0,
+ 0x0768, 0x1f44, 0x2b8c, 0x3720, 0x13b0,
+ 0x0868, 0x2044, 0x2c8c, 0x3820, 0x14b0,
+ 0x0f48, 0x2724, 0x336c, 0x0300, 0x1b90,
+ 0x1048, 0x2824, 0x346c, 0x0400, 0x1c90,
+ 0x1148, 0x2924, 0x356c, 0x0500, 0x1d90,
+ 0x114c, 0x2928, 0x3570, 0x0504, 0x1d94,
+ 0x104c, 0x2828, 0x3470, 0x0404, 0x1c94,
+ 0x0f4c, 0x2728, 0x3370, 0x0304, 0x1b94,
+ 0x0f50, 0x272c, 0x3374, 0x0308, 0x1b98,
+ 0x1050, 0x282c, 0x3474, 0x0408, 0x1c98,
+ 0x1150, 0x292c, 0x3574, 0x0508, 0x1d98,
+ 0x1154, 0x2930, 0x3578, 0x050c, 0x1d9c,
+ 0x1054, 0x2830, 0x3478, 0x040c, 0x1c9c,
+ 0x0f54, 0x2730, 0x3378, 0x030c, 0x1b9c,
+ 0x0f58, 0x2734, 0x337c, 0x0310, 0x1ba0,
+ 0x1058, 0x2834, 0x347c, 0x0410, 0x1ca0,
+ 0x1158, 0x2934, 0x357c, 0x0510, 0x1da0,
+ 0x115c, 0x2938, 0x3580, 0x0514, 0x1da4,
+ 0x105c, 0x2838, 0x3480, 0x0414, 0x1ca4,
+ 0x0f5c, 0x2738, 0x3380, 0x0314, 0x1ba4,
+ 0x0f60, 0x273c, 0x3384, 0x0318, 0x1ba8,
+ 0x1060, 0x283c, 0x3484, 0x0418, 0x1ca8,
+ 0x1160, 0x293c, 0x3584, 0x0518, 0x1da8,
+ 0x1164, 0x2940, 0x3588, 0x051c, 0x1dac,
+ 0x1064, 0x2840, 0x3488, 0x041c, 0x1cac,
+ 0x0f64, 0x2740, 0x3388, 0x031c, 0x1bac,
+ 0x0f68, 0x2744, 0x338c, 0x0320, 0x1bb0,
+ 0x1068, 0x2844, 0x348c, 0x0420, 0x1cb0,
+ 0x1168, 0x2944, 0x358c, 0x0520, 0x1db0,
+ 0x1548, 0x2d24, 0x396c, 0x0900, 0x2190,
+ 0x1648, 0x2e24, 0x3a6c, 0x0a00, 0x2290,
+ 0x1748, 0x2f24, 0x3b6c, 0x0b00, 0x2390,
+ 0x174c, 0x2f28, 0x3b70, 0x0b04, 0x2394,
+ 0x164c, 0x2e28, 0x3a70, 0x0a04, 0x2294,
+ 0x154c, 0x2d28, 0x3970, 0x0904, 0x2194,
+ 0x1550, 0x2d2c, 0x3974, 0x0908, 0x2198,
+ 0x1650, 0x2e2c, 0x3a74, 0x0a08, 0x2298,
+ 0x1750, 0x2f2c, 0x3b74, 0x0b08, 0x2398,
+ 0x1754, 0x2f30, 0x3b78, 0x0b0c, 0x239c,
+ 0x1654, 0x2e30, 0x3a78, 0x0a0c, 0x229c,
+ 0x1554, 0x2d30, 0x3978, 0x090c, 0x219c,
+ 0x1558, 0x2d34, 0x397c, 0x0910, 0x21a0,
+ 0x1658, 0x2e34, 0x3a7c, 0x0a10, 0x22a0,
+ 0x1758, 0x2f34, 0x3b7c, 0x0b10, 0x23a0,
+ 0x175c, 0x2f38, 0x3b80, 0x0b14, 0x23a4,
+ 0x165c, 0x2e38, 0x3a80, 0x0a14, 0x22a4,
+ 0x155c, 0x2d38, 0x3980, 0x0914, 0x21a4,
+ 0x1560, 0x2d3c, 0x3984, 0x0918, 0x21a8,
+ 0x1660, 0x2e3c, 0x3a84, 0x0a18, 0x22a8,
+ 0x1760, 0x2f3c, 0x3b84, 0x0b18, 0x23a8,
+ 0x1764, 0x2f40, 0x3b88, 0x0b1c, 0x23ac,
+ 0x1664, 0x2e40, 0x3a88, 0x0a1c, 0x22ac,
+ 0x1564, 0x2d40, 0x3988, 0x091c, 0x21ac,
+ 0x1568, 0x2d44, 0x398c, 0x0920, 0x21b0,
+ 0x1668, 0x2e44, 0x3a8c, 0x0a20, 0x22b0,
+ 0x1768, 0x2f44, 0x3b8c, 0x0b20, 0x23b0,
+ 0x1b48, 0x3324, 0x036c, 0x0f00, 0x2790,
+ 0x1c48, 0x3424, 0x046c, 0x1000, 0x2890,
+ 0x1d48, 0x3524, 0x056c, 0x1100, 0x2990,
+ 0x1d4c, 0x3528, 0x0570, 0x1104, 0x2994,
+ 0x1c4c, 0x3428, 0x0470, 0x1004, 0x2894,
+ 0x1b4c, 0x3328, 0x0370, 0x0f04, 0x2794,
+ 0x1b50, 0x332c, 0x0374, 0x0f08, 0x2798,
+ 0x1c50, 0x342c, 0x0474, 0x1008, 0x2898,
+ 0x1d50, 0x352c, 0x0574, 0x1108, 0x2998,
+ 0x1d54, 0x3530, 0x0578, 0x110c, 0x299c,
+ 0x1c54, 0x3430, 0x0478, 0x100c, 0x289c,
+ 0x1b54, 0x3330, 0x0378, 0x0f0c, 0x279c,
+ 0x1b58, 0x3334, 0x037c, 0x0f10, 0x27a0,
+ 0x1c58, 0x3434, 0x047c, 0x1010, 0x28a0,
+ 0x1d58, 0x3534, 0x057c, 0x1110, 0x29a0,
+ 0x1d5c, 0x3538, 0x0580, 0x1114, 0x29a4,
+ 0x1c5c, 0x3438, 0x0480, 0x1014, 0x28a4,
+ 0x1b5c, 0x3338, 0x0380, 0x0f14, 0x27a4,
+ 0x1b60, 0x333c, 0x0384, 0x0f18, 0x27a8,
+ 0x1c60, 0x343c, 0x0484, 0x1018, 0x28a8,
+ 0x1d60, 0x353c, 0x0584, 0x1118, 0x29a8,
+ 0x1d64, 0x3540, 0x0588, 0x111c, 0x29ac,
+ 0x1c64, 0x3440, 0x0488, 0x101c, 0x28ac,
+ 0x1b64, 0x3340, 0x0388, 0x0f1c, 0x27ac,
+ 0x1b68, 0x3344, 0x038c, 0x0f20, 0x27b0,
+ 0x1c68, 0x3444, 0x048c, 0x1020, 0x28b0,
+ 0x1d68, 0x3544, 0x058c, 0x1120, 0x29b0,
+ 0x2148, 0x3924, 0x096c, 0x1500, 0x2d90,
+ 0x2248, 0x3a24, 0x0a6c, 0x1600, 0x2e90,
+ 0x2348, 0x3b24, 0x0b6c, 0x1700, 0x2f90,
+ 0x234c, 0x3b28, 0x0b70, 0x1704, 0x2f94,
+ 0x224c, 0x3a28, 0x0a70, 0x1604, 0x2e94,
+ 0x214c, 0x3928, 0x0970, 0x1504, 0x2d94,
+ 0x2150, 0x392c, 0x0974, 0x1508, 0x2d98,
+ 0x2250, 0x3a2c, 0x0a74, 0x1608, 0x2e98,
+ 0x2350, 0x3b2c, 0x0b74, 0x1708, 0x2f98,
+ 0x2354, 0x3b30, 0x0b78, 0x170c, 0x2f9c,
+ 0x2254, 0x3a30, 0x0a78, 0x160c, 0x2e9c,
+ 0x2154, 0x3930, 0x0978, 0x150c, 0x2d9c,
+ 0x2158, 0x3934, 0x097c, 0x1510, 0x2da0,
+ 0x2258, 0x3a34, 0x0a7c, 0x1610, 0x2ea0,
+ 0x2358, 0x3b34, 0x0b7c, 0x1710, 0x2fa0,
+ 0x235c, 0x3b38, 0x0b80, 0x1714, 0x2fa4,
+ 0x225c, 0x3a38, 0x0a80, 0x1614, 0x2ea4,
+ 0x215c, 0x3938, 0x0980, 0x1514, 0x2da4,
+ 0x2160, 0x393c, 0x0984, 0x1518, 0x2da8,
+ 0x2260, 0x3a3c, 0x0a84, 0x1618, 0x2ea8,
+ 0x2360, 0x3b3c, 0x0b84, 0x1718, 0x2fa8,
+ 0x2364, 0x3b40, 0x0b88, 0x171c, 0x2fac,
+ 0x2264, 0x3a40, 0x0a88, 0x161c, 0x2eac,
+ 0x2164, 0x3940, 0x0988, 0x151c, 0x2dac,
+ 0x2168, 0x3944, 0x098c, 0x1520, 0x2db0,
+ 0x2268, 0x3a44, 0x0a8c, 0x1620, 0x2eb0,
+ 0x2368, 0x3b44, 0x0b8c, 0x1720, 0x2fb0,
+ 0x2748, 0x0324, 0x0f6c, 0x1b00, 0x3390,
+ 0x2848, 0x0424, 0x106c, 0x1c00, 0x3490,
+ 0x2948, 0x0524, 0x116c, 0x1d00, 0x3590,
+ 0x294c, 0x0528, 0x1170, 0x1d04, 0x3594,
+ 0x284c, 0x0428, 0x1070, 0x1c04, 0x3494,
+ 0x274c, 0x0328, 0x0f70, 0x1b04, 0x3394,
+ 0x2750, 0x032c, 0x0f74, 0x1b08, 0x3398,
+ 0x2850, 0x042c, 0x1074, 0x1c08, 0x3498,
+ 0x2950, 0x052c, 0x1174, 0x1d08, 0x3598,
+ 0x2954, 0x0530, 0x1178, 0x1d0c, 0x359c,
+ 0x2854, 0x0430, 0x1078, 0x1c0c, 0x349c,
+ 0x2754, 0x0330, 0x0f78, 0x1b0c, 0x339c,
+ 0x2758, 0x0334, 0x0f7c, 0x1b10, 0x33a0,
+ 0x2858, 0x0434, 0x107c, 0x1c10, 0x34a0,
+ 0x2958, 0x0534, 0x117c, 0x1d10, 0x35a0,
+ 0x295c, 0x0538, 0x1180, 0x1d14, 0x35a4,
+ 0x285c, 0x0438, 0x1080, 0x1c14, 0x34a4,
+ 0x275c, 0x0338, 0x0f80, 0x1b14, 0x33a4,
+ 0x2760, 0x033c, 0x0f84, 0x1b18, 0x33a8,
+ 0x2860, 0x043c, 0x1084, 0x1c18, 0x34a8,
+ 0x2960, 0x053c, 0x1184, 0x1d18, 0x35a8,
+ 0x2964, 0x0540, 0x1188, 0x1d1c, 0x35ac,
+ 0x2864, 0x0440, 0x1088, 0x1c1c, 0x34ac,
+ 0x2764, 0x0340, 0x0f88, 0x1b1c, 0x33ac,
+ 0x2768, 0x0344, 0x0f8c, 0x1b20, 0x33b0,
+ 0x2868, 0x0444, 0x108c, 0x1c20, 0x34b0,
+ 0x2968, 0x0544, 0x118c, 0x1d20, 0x35b0,
+ 0x2d48, 0x0924, 0x156c, 0x2100, 0x3990,
+ 0x2e48, 0x0a24, 0x166c, 0x2200, 0x3a90,
+ 0x2f48, 0x0b24, 0x176c, 0x2300, 0x3b90,
+ 0x2f4c, 0x0b28, 0x1770, 0x2304, 0x3b94,
+ 0x2e4c, 0x0a28, 0x1670, 0x2204, 0x3a94,
+ 0x2d4c, 0x0928, 0x1570, 0x2104, 0x3994,
+ 0x2d50, 0x092c, 0x1574, 0x2108, 0x3998,
+ 0x2e50, 0x0a2c, 0x1674, 0x2208, 0x3a98,
+ 0x2f50, 0x0b2c, 0x1774, 0x2308, 0x3b98,
+ 0x2f54, 0x0b30, 0x1778, 0x230c, 0x3b9c,
+ 0x2e54, 0x0a30, 0x1678, 0x220c, 0x3a9c,
+ 0x2d54, 0x0930, 0x1578, 0x210c, 0x399c,
+ 0x2d58, 0x0934, 0x157c, 0x2110, 0x39a0,
+ 0x2e58, 0x0a34, 0x167c, 0x2210, 0x3aa0,
+ 0x2f58, 0x0b34, 0x177c, 0x2310, 0x3ba0,
+ 0x2f5c, 0x0b38, 0x1780, 0x2314, 0x3ba4,
+ 0x2e5c, 0x0a38, 0x1680, 0x2214, 0x3aa4,
+ 0x2d5c, 0x0938, 0x1580, 0x2114, 0x39a4,
+ 0x2d60, 0x093c, 0x1584, 0x2118, 0x39a8,
+ 0x2e60, 0x0a3c, 0x1684, 0x2218, 0x3aa8,
+ 0x2f60, 0x0b3c, 0x1784, 0x2318, 0x3ba8,
+ 0x2f64, 0x0b40, 0x1788, 0x231c, 0x3bac,
+ 0x2e64, 0x0a40, 0x1688, 0x221c, 0x3aac,
+ 0x2d64, 0x0940, 0x1588, 0x211c, 0x39ac,
+ 0x2d68, 0x0944, 0x158c, 0x2120, 0x39b0,
+ 0x2e68, 0x0a44, 0x168c, 0x2220, 0x3ab0,
+ 0x2f68, 0x0b44, 0x178c, 0x2320, 0x3bb0,
+ 0x3348, 0x0f24, 0x1b6c, 0x2700, 0x0390,
+ 0x3448, 0x1024, 0x1c6c, 0x2800, 0x0490,
+ 0x3548, 0x1124, 0x1d6c, 0x2900, 0x0590,
+ 0x354c, 0x1128, 0x1d70, 0x2904, 0x0594,
+ 0x344c, 0x1028, 0x1c70, 0x2804, 0x0494,
+ 0x334c, 0x0f28, 0x1b70, 0x2704, 0x0394,
+ 0x3350, 0x0f2c, 0x1b74, 0x2708, 0x0398,
+ 0x3450, 0x102c, 0x1c74, 0x2808, 0x0498,
+ 0x3550, 0x112c, 0x1d74, 0x2908, 0x0598,
+ 0x3554, 0x1130, 0x1d78, 0x290c, 0x059c,
+ 0x3454, 0x1030, 0x1c78, 0x280c, 0x049c,
+ 0x3354, 0x0f30, 0x1b78, 0x270c, 0x039c,
+ 0x3358, 0x0f34, 0x1b7c, 0x2710, 0x03a0,
+ 0x3458, 0x1034, 0x1c7c, 0x2810, 0x04a0,
+ 0x3558, 0x1134, 0x1d7c, 0x2910, 0x05a0,
+ 0x355c, 0x1138, 0x1d80, 0x2914, 0x05a4,
+ 0x345c, 0x1038, 0x1c80, 0x2814, 0x04a4,
+ 0x335c, 0x0f38, 0x1b80, 0x2714, 0x03a4,
+ 0x3360, 0x0f3c, 0x1b84, 0x2718, 0x03a8,
+ 0x3460, 0x103c, 0x1c84, 0x2818, 0x04a8,
+ 0x3560, 0x113c, 0x1d84, 0x2918, 0x05a8,
+ 0x3564, 0x1140, 0x1d88, 0x291c, 0x05ac,
+ 0x3464, 0x1040, 0x1c88, 0x281c, 0x04ac,
+ 0x3364, 0x0f40, 0x1b88, 0x271c, 0x03ac,
+ 0x3368, 0x0f44, 0x1b8c, 0x2720, 0x03b0,
+ 0x3468, 0x1044, 0x1c8c, 0x2820, 0x04b0,
+ 0x3568, 0x1144, 0x1d8c, 0x2920, 0x05b0,
+ 0x3948, 0x1524, 0x216c, 0x2d00, 0x0990,
+ 0x3a48, 0x1624, 0x226c, 0x2e00, 0x0a90,
+ 0x3b48, 0x1724, 0x236c, 0x2f00, 0x0b90,
+ 0x3b4c, 0x1728, 0x2370, 0x2f04, 0x0b94,
+ 0x3a4c, 0x1628, 0x2270, 0x2e04, 0x0a94,
+ 0x394c, 0x1528, 0x2170, 0x2d04, 0x0994,
+ 0x3950, 0x152c, 0x2174, 0x2d08, 0x0998,
+ 0x3a50, 0x162c, 0x2274, 0x2e08, 0x0a98,
+ 0x3b50, 0x172c, 0x2374, 0x2f08, 0x0b98,
+ 0x3b54, 0x1730, 0x2378, 0x2f0c, 0x0b9c,
+ 0x3a54, 0x1630, 0x2278, 0x2e0c, 0x0a9c,
+ 0x3954, 0x1530, 0x2178, 0x2d0c, 0x099c,
+ 0x3958, 0x1534, 0x217c, 0x2d10, 0x09a0,
+ 0x3a58, 0x1634, 0x227c, 0x2e10, 0x0aa0,
+ 0x3b58, 0x1734, 0x237c, 0x2f10, 0x0ba0,
+ 0x3b5c, 0x1738, 0x2380, 0x2f14, 0x0ba4,
+ 0x3a5c, 0x1638, 0x2280, 0x2e14, 0x0aa4,
+ 0x395c, 0x1538, 0x2180, 0x2d14, 0x09a4,
+ 0x3960, 0x153c, 0x2184, 0x2d18, 0x09a8,
+ 0x3a60, 0x163c, 0x2284, 0x2e18, 0x0aa8,
+ 0x3b60, 0x173c, 0x2384, 0x2f18, 0x0ba8,
+ 0x3b64, 0x1740, 0x2388, 0x2f1c, 0x0bac,
+ 0x3a64, 0x1640, 0x2288, 0x2e1c, 0x0aac,
+ 0x3964, 0x1540, 0x2188, 0x2d1c, 0x09ac,
+ 0x3968, 0x1544, 0x218c, 0x2d20, 0x09b0,
+ 0x3a68, 0x1644, 0x228c, 0x2e20, 0x0ab0,
+ 0x3b68, 0x1744, 0x238c, 0x2f20, 0x0bb0,
+ 0x0348, 0x1b24, 0x276c, 0x3300, 0x0f90,
+ 0x0448, 0x1c24, 0x286c, 0x3400, 0x1090,
+ 0x0548, 0x1d24, 0x296c, 0x3500, 0x1190,
+ 0x054c, 0x1d28, 0x2970, 0x3504, 0x1194,
+ 0x044c, 0x1c28, 0x2870, 0x3404, 0x1094,
+ 0x034c, 0x1b28, 0x2770, 0x3304, 0x0f94,
+ 0x0350, 0x1b2c, 0x2774, 0x3308, 0x0f98,
+ 0x0450, 0x1c2c, 0x2874, 0x3408, 0x1098,
+ 0x0550, 0x1d2c, 0x2974, 0x3508, 0x1198,
+ 0x0554, 0x1d30, 0x2978, 0x350c, 0x119c,
+ 0x0454, 0x1c30, 0x2878, 0x340c, 0x109c,
+ 0x0354, 0x1b30, 0x2778, 0x330c, 0x0f9c,
+ 0x0358, 0x1b34, 0x277c, 0x3310, 0x0fa0,
+ 0x0458, 0x1c34, 0x287c, 0x3410, 0x10a0,
+ 0x0558, 0x1d34, 0x297c, 0x3510, 0x11a0,
+ 0x055c, 0x1d38, 0x2980, 0x3514, 0x11a4,
+ 0x045c, 0x1c38, 0x2880, 0x3414, 0x10a4,
+ 0x035c, 0x1b38, 0x2780, 0x3314, 0x0fa4,
+ 0x0360, 0x1b3c, 0x2784, 0x3318, 0x0fa8,
+ 0x0460, 0x1c3c, 0x2884, 0x3418, 0x10a8,
+ 0x0560, 0x1d3c, 0x2984, 0x3518, 0x11a8,
+ 0x0564, 0x1d40, 0x2988, 0x351c, 0x11ac,
+ 0x0464, 0x1c40, 0x2888, 0x341c, 0x10ac,
+ 0x0364, 0x1b40, 0x2788, 0x331c, 0x0fac,
+ 0x0368, 0x1b44, 0x278c, 0x3320, 0x0fb0,
+ 0x0468, 0x1c44, 0x288c, 0x3420, 0x10b0,
+ 0x0568, 0x1d44, 0x298c, 0x3520, 0x11b0,
+ 0x0948, 0x2124, 0x2d6c, 0x3900, 0x1590,
+ 0x0a48, 0x2224, 0x2e6c, 0x3a00, 0x1690,
+ 0x0b48, 0x2324, 0x2f6c, 0x3b00, 0x1790,
+ 0x0b4c, 0x2328, 0x2f70, 0x3b04, 0x1794,
+ 0x0a4c, 0x2228, 0x2e70, 0x3a04, 0x1694,
+ 0x094c, 0x2128, 0x2d70, 0x3904, 0x1594,
+ 0x0950, 0x212c, 0x2d74, 0x3908, 0x1598,
+ 0x0a50, 0x222c, 0x2e74, 0x3a08, 0x1698,
+ 0x0b50, 0x232c, 0x2f74, 0x3b08, 0x1798,
+ 0x0b54, 0x2330, 0x2f78, 0x3b0c, 0x179c,
+ 0x0a54, 0x2230, 0x2e78, 0x3a0c, 0x169c,
+ 0x0954, 0x2130, 0x2d78, 0x390c, 0x159c,
+ 0x0958, 0x2134, 0x2d7c, 0x3910, 0x15a0,
+ 0x0a58, 0x2234, 0x2e7c, 0x3a10, 0x16a0,
+ 0x0b58, 0x2334, 0x2f7c, 0x3b10, 0x17a0,
+ 0x0b5c, 0x2338, 0x2f80, 0x3b14, 0x17a4,
+ 0x0a5c, 0x2238, 0x2e80, 0x3a14, 0x16a4,
+ 0x095c, 0x2138, 0x2d80, 0x3914, 0x15a4,
+ 0x0960, 0x213c, 0x2d84, 0x3918, 0x15a8,
+ 0x0a60, 0x223c, 0x2e84, 0x3a18, 0x16a8,
+ 0x0b60, 0x233c, 0x2f84, 0x3b18, 0x17a8,
+ 0x0b64, 0x2340, 0x2f88, 0x3b1c, 0x17ac,
+ 0x0a64, 0x2240, 0x2e88, 0x3a1c, 0x16ac,
+ 0x0964, 0x2140, 0x2d88, 0x391c, 0x15ac,
+ 0x0968, 0x2144, 0x2d8c, 0x3920, 0x15b0,
+ 0x0a68, 0x2244, 0x2e8c, 0x3a20, 0x16b0,
+ 0x0b68, 0x2344, 0x2f8c, 0x3b20, 0x17b0,
+};
+
+/* 2 channels per frame, 12 DIF sequences per channel,
+   27 video segments per DIF sequence, 5 macroblocks per video segment */
+static const uint16_t dv_place_422_625[2*12*27*5] = {
+ 0x0c48, 0x2424, 0x306c, 0x0000, 0x1890,
+ 0x0d48, 0x2524, 0x316c, 0x0100, 0x1990,
+ 0x0e48, 0x2624, 0x326c, 0x0200, 0x1a90,
+ 0x0e4c, 0x2628, 0x3270, 0x0204, 0x1a94,
+ 0x0d4c, 0x2528, 0x3170, 0x0104, 0x1994,
+ 0x0c4c, 0x2428, 0x3070, 0x0004, 0x1894,
+ 0x0c50, 0x242c, 0x3074, 0x0008, 0x1898,
+ 0x0d50, 0x252c, 0x3174, 0x0108, 0x1998,
+ 0x0e50, 0x262c, 0x3274, 0x0208, 0x1a98,
+ 0x0e54, 0x2630, 0x3278, 0x020c, 0x1a9c,
+ 0x0d54, 0x2530, 0x3178, 0x010c, 0x199c,
+ 0x0c54, 0x2430, 0x3078, 0x000c, 0x189c,
+ 0x0c58, 0x2434, 0x307c, 0x0010, 0x18a0,
+ 0x0d58, 0x2534, 0x317c, 0x0110, 0x19a0,
+ 0x0e58, 0x2634, 0x327c, 0x0210, 0x1aa0,
+ 0x0e5c, 0x2638, 0x3280, 0x0214, 0x1aa4,
+ 0x0d5c, 0x2538, 0x3180, 0x0114, 0x19a4,
+ 0x0c5c, 0x2438, 0x3080, 0x0014, 0x18a4,
+ 0x0c60, 0x243c, 0x3084, 0x0018, 0x18a8,
+ 0x0d60, 0x253c, 0x3184, 0x0118, 0x19a8,
+ 0x0e60, 0x263c, 0x3284, 0x0218, 0x1aa8,
+ 0x0e64, 0x2640, 0x3288, 0x021c, 0x1aac,
+ 0x0d64, 0x2540, 0x3188, 0x011c, 0x19ac,
+ 0x0c64, 0x2440, 0x3088, 0x001c, 0x18ac,
+ 0x0c68, 0x2444, 0x308c, 0x0020, 0x18b0,
+ 0x0d68, 0x2544, 0x318c, 0x0120, 0x19b0,
+ 0x0e68, 0x2644, 0x328c, 0x0220, 0x1ab0,
+ 0x1248, 0x2a24, 0x366c, 0x0600, 0x1e90,
+ 0x1348, 0x2b24, 0x376c, 0x0700, 0x1f90,
+ 0x1448, 0x2c24, 0x386c, 0x0800, 0x2090,
+ 0x144c, 0x2c28, 0x3870, 0x0804, 0x2094,
+ 0x134c, 0x2b28, 0x3770, 0x0704, 0x1f94,
+ 0x124c, 0x2a28, 0x3670, 0x0604, 0x1e94,
+ 0x1250, 0x2a2c, 0x3674, 0x0608, 0x1e98,
+ 0x1350, 0x2b2c, 0x3774, 0x0708, 0x1f98,
+ 0x1450, 0x2c2c, 0x3874, 0x0808, 0x2098,
+ 0x1454, 0x2c30, 0x3878, 0x080c, 0x209c,
+ 0x1354, 0x2b30, 0x3778, 0x070c, 0x1f9c,
+ 0x1254, 0x2a30, 0x3678, 0x060c, 0x1e9c,
+ 0x1258, 0x2a34, 0x367c, 0x0610, 0x1ea0,
+ 0x1358, 0x2b34, 0x377c, 0x0710, 0x1fa0,
+ 0x1458, 0x2c34, 0x387c, 0x0810, 0x20a0,
+ 0x145c, 0x2c38, 0x3880, 0x0814, 0x20a4,
+ 0x135c, 0x2b38, 0x3780, 0x0714, 0x1fa4,
+ 0x125c, 0x2a38, 0x3680, 0x0614, 0x1ea4,
+ 0x1260, 0x2a3c, 0x3684, 0x0618, 0x1ea8,
+ 0x1360, 0x2b3c, 0x3784, 0x0718, 0x1fa8,
+ 0x1460, 0x2c3c, 0x3884, 0x0818, 0x20a8,
+ 0x1464, 0x2c40, 0x3888, 0x081c, 0x20ac,
+ 0x1364, 0x2b40, 0x3788, 0x071c, 0x1fac,
+ 0x1264, 0x2a40, 0x3688, 0x061c, 0x1eac,
+ 0x1268, 0x2a44, 0x368c, 0x0620, 0x1eb0,
+ 0x1368, 0x2b44, 0x378c, 0x0720, 0x1fb0,
+ 0x1468, 0x2c44, 0x388c, 0x0820, 0x20b0,
+ 0x1848, 0x3024, 0x3c6c, 0x0c00, 0x2490,
+ 0x1948, 0x3124, 0x3d6c, 0x0d00, 0x2590,
+ 0x1a48, 0x3224, 0x3e6c, 0x0e00, 0x2690,
+ 0x1a4c, 0x3228, 0x3e70, 0x0e04, 0x2694,
+ 0x194c, 0x3128, 0x3d70, 0x0d04, 0x2594,
+ 0x184c, 0x3028, 0x3c70, 0x0c04, 0x2494,
+ 0x1850, 0x302c, 0x3c74, 0x0c08, 0x2498,
+ 0x1950, 0x312c, 0x3d74, 0x0d08, 0x2598,
+ 0x1a50, 0x322c, 0x3e74, 0x0e08, 0x2698,
+ 0x1a54, 0x3230, 0x3e78, 0x0e0c, 0x269c,
+ 0x1954, 0x3130, 0x3d78, 0x0d0c, 0x259c,
+ 0x1854, 0x3030, 0x3c78, 0x0c0c, 0x249c,
+ 0x1858, 0x3034, 0x3c7c, 0x0c10, 0x24a0,
+ 0x1958, 0x3134, 0x3d7c, 0x0d10, 0x25a0,
+ 0x1a58, 0x3234, 0x3e7c, 0x0e10, 0x26a0,
+ 0x1a5c, 0x3238, 0x3e80, 0x0e14, 0x26a4,
+ 0x195c, 0x3138, 0x3d80, 0x0d14, 0x25a4,
+ 0x185c, 0x3038, 0x3c80, 0x0c14, 0x24a4,
+ 0x1860, 0x303c, 0x3c84, 0x0c18, 0x24a8,
+ 0x1960, 0x313c, 0x3d84, 0x0d18, 0x25a8,
+ 0x1a60, 0x323c, 0x3e84, 0x0e18, 0x26a8,
+ 0x1a64, 0x3240, 0x3e88, 0x0e1c, 0x26ac,
+ 0x1964, 0x3140, 0x3d88, 0x0d1c, 0x25ac,
+ 0x1864, 0x3040, 0x3c88, 0x0c1c, 0x24ac,
+ 0x1868, 0x3044, 0x3c8c, 0x0c20, 0x24b0,
+ 0x1968, 0x3144, 0x3d8c, 0x0d20, 0x25b0,
+ 0x1a68, 0x3244, 0x3e8c, 0x0e20, 0x26b0,
+ 0x1e48, 0x3624, 0x426c, 0x1200, 0x2a90,
+ 0x1f48, 0x3724, 0x436c, 0x1300, 0x2b90,
+ 0x2048, 0x3824, 0x446c, 0x1400, 0x2c90,
+ 0x204c, 0x3828, 0x4470, 0x1404, 0x2c94,
+ 0x1f4c, 0x3728, 0x4370, 0x1304, 0x2b94,
+ 0x1e4c, 0x3628, 0x4270, 0x1204, 0x2a94,
+ 0x1e50, 0x362c, 0x4274, 0x1208, 0x2a98,
+ 0x1f50, 0x372c, 0x4374, 0x1308, 0x2b98,
+ 0x2050, 0x382c, 0x4474, 0x1408, 0x2c98,
+ 0x2054, 0x3830, 0x4478, 0x140c, 0x2c9c,
+ 0x1f54, 0x3730, 0x4378, 0x130c, 0x2b9c,
+ 0x1e54, 0x3630, 0x4278, 0x120c, 0x2a9c,
+ 0x1e58, 0x3634, 0x427c, 0x1210, 0x2aa0,
+ 0x1f58, 0x3734, 0x437c, 0x1310, 0x2ba0,
+ 0x2058, 0x3834, 0x447c, 0x1410, 0x2ca0,
+ 0x205c, 0x3838, 0x4480, 0x1414, 0x2ca4,
+ 0x1f5c, 0x3738, 0x4380, 0x1314, 0x2ba4,
+ 0x1e5c, 0x3638, 0x4280, 0x1214, 0x2aa4,
+ 0x1e60, 0x363c, 0x4284, 0x1218, 0x2aa8,
+ 0x1f60, 0x373c, 0x4384, 0x1318, 0x2ba8,
+ 0x2060, 0x383c, 0x4484, 0x1418, 0x2ca8,
+ 0x2064, 0x3840, 0x4488, 0x141c, 0x2cac,
+ 0x1f64, 0x3740, 0x4388, 0x131c, 0x2bac,
+ 0x1e64, 0x3640, 0x4288, 0x121c, 0x2aac,
+ 0x1e68, 0x3644, 0x428c, 0x1220, 0x2ab0,
+ 0x1f68, 0x3744, 0x438c, 0x1320, 0x2bb0,
+ 0x2068, 0x3844, 0x448c, 0x1420, 0x2cb0,
+ 0x2448, 0x3c24, 0x006c, 0x1800, 0x3090,
+ 0x2548, 0x3d24, 0x016c, 0x1900, 0x3190,
+ 0x2648, 0x3e24, 0x026c, 0x1a00, 0x3290,
+ 0x264c, 0x3e28, 0x0270, 0x1a04, 0x3294,
+ 0x254c, 0x3d28, 0x0170, 0x1904, 0x3194,
+ 0x244c, 0x3c28, 0x0070, 0x1804, 0x3094,
+ 0x2450, 0x3c2c, 0x0074, 0x1808, 0x3098,
+ 0x2550, 0x3d2c, 0x0174, 0x1908, 0x3198,
+ 0x2650, 0x3e2c, 0x0274, 0x1a08, 0x3298,
+ 0x2654, 0x3e30, 0x0278, 0x1a0c, 0x329c,
+ 0x2554, 0x3d30, 0x0178, 0x190c, 0x319c,
+ 0x2454, 0x3c30, 0x0078, 0x180c, 0x309c,
+ 0x2458, 0x3c34, 0x007c, 0x1810, 0x30a0,
+ 0x2558, 0x3d34, 0x017c, 0x1910, 0x31a0,
+ 0x2658, 0x3e34, 0x027c, 0x1a10, 0x32a0,
+ 0x265c, 0x3e38, 0x0280, 0x1a14, 0x32a4,
+ 0x255c, 0x3d38, 0x0180, 0x1914, 0x31a4,
+ 0x245c, 0x3c38, 0x0080, 0x1814, 0x30a4,
+ 0x2460, 0x3c3c, 0x0084, 0x1818, 0x30a8,
+ 0x2560, 0x3d3c, 0x0184, 0x1918, 0x31a8,
+ 0x2660, 0x3e3c, 0x0284, 0x1a18, 0x32a8,
+ 0x2664, 0x3e40, 0x0288, 0x1a1c, 0x32ac,
+ 0x2564, 0x3d40, 0x0188, 0x191c, 0x31ac,
+ 0x2464, 0x3c40, 0x0088, 0x181c, 0x30ac,
+ 0x2468, 0x3c44, 0x008c, 0x1820, 0x30b0,
+ 0x2568, 0x3d44, 0x018c, 0x1920, 0x31b0,
+ 0x2668, 0x3e44, 0x028c, 0x1a20, 0x32b0,
+ 0x2a48, 0x4224, 0x066c, 0x1e00, 0x3690,
+ 0x2b48, 0x4324, 0x076c, 0x1f00, 0x3790,
+ 0x2c48, 0x4424, 0x086c, 0x2000, 0x3890,
+ 0x2c4c, 0x4428, 0x0870, 0x2004, 0x3894,
+ 0x2b4c, 0x4328, 0x0770, 0x1f04, 0x3794,
+ 0x2a4c, 0x4228, 0x0670, 0x1e04, 0x3694,
+ 0x2a50, 0x422c, 0x0674, 0x1e08, 0x3698,
+ 0x2b50, 0x432c, 0x0774, 0x1f08, 0x3798,
+ 0x2c50, 0x442c, 0x0874, 0x2008, 0x3898,
+ 0x2c54, 0x4430, 0x0878, 0x200c, 0x389c,
+ 0x2b54, 0x4330, 0x0778, 0x1f0c, 0x379c,
+ 0x2a54, 0x4230, 0x0678, 0x1e0c, 0x369c,
+ 0x2a58, 0x4234, 0x067c, 0x1e10, 0x36a0,
+ 0x2b58, 0x4334, 0x077c, 0x1f10, 0x37a0,
+ 0x2c58, 0x4434, 0x087c, 0x2010, 0x38a0,
+ 0x2c5c, 0x4438, 0x0880, 0x2014, 0x38a4,
+ 0x2b5c, 0x4338, 0x0780, 0x1f14, 0x37a4,
+ 0x2a5c, 0x4238, 0x0680, 0x1e14, 0x36a4,
+ 0x2a60, 0x423c, 0x0684, 0x1e18, 0x36a8,
+ 0x2b60, 0x433c, 0x0784, 0x1f18, 0x37a8,
+ 0x2c60, 0x443c, 0x0884, 0x2018, 0x38a8,
+ 0x2c64, 0x4440, 0x0888, 0x201c, 0x38ac,
+ 0x2b64, 0x4340, 0x0788, 0x1f1c, 0x37ac,
+ 0x2a64, 0x4240, 0x0688, 0x1e1c, 0x36ac,
+ 0x2a68, 0x4244, 0x068c, 0x1e20, 0x36b0,
+ 0x2b68, 0x4344, 0x078c, 0x1f20, 0x37b0,
+ 0x2c68, 0x4444, 0x088c, 0x2020, 0x38b0,
+ 0x3048, 0x0024, 0x0c6c, 0x2400, 0x3c90,
+ 0x3148, 0x0124, 0x0d6c, 0x2500, 0x3d90,
+ 0x3248, 0x0224, 0x0e6c, 0x2600, 0x3e90,
+ 0x324c, 0x0228, 0x0e70, 0x2604, 0x3e94,
+ 0x314c, 0x0128, 0x0d70, 0x2504, 0x3d94,
+ 0x304c, 0x0028, 0x0c70, 0x2404, 0x3c94,
+ 0x3050, 0x002c, 0x0c74, 0x2408, 0x3c98,
+ 0x3150, 0x012c, 0x0d74, 0x2508, 0x3d98,
+ 0x3250, 0x022c, 0x0e74, 0x2608, 0x3e98,
+ 0x3254, 0x0230, 0x0e78, 0x260c, 0x3e9c,
+ 0x3154, 0x0130, 0x0d78, 0x250c, 0x3d9c,
+ 0x3054, 0x0030, 0x0c78, 0x240c, 0x3c9c,
+ 0x3058, 0x0034, 0x0c7c, 0x2410, 0x3ca0,
+ 0x3158, 0x0134, 0x0d7c, 0x2510, 0x3da0,
+ 0x3258, 0x0234, 0x0e7c, 0x2610, 0x3ea0,
+ 0x325c, 0x0238, 0x0e80, 0x2614, 0x3ea4,
+ 0x315c, 0x0138, 0x0d80, 0x2514, 0x3da4,
+ 0x305c, 0x0038, 0x0c80, 0x2414, 0x3ca4,
+ 0x3060, 0x003c, 0x0c84, 0x2418, 0x3ca8,
+ 0x3160, 0x013c, 0x0d84, 0x2518, 0x3da8,
+ 0x3260, 0x023c, 0x0e84, 0x2618, 0x3ea8,
+ 0x3264, 0x0240, 0x0e88, 0x261c, 0x3eac,
+ 0x3164, 0x0140, 0x0d88, 0x251c, 0x3dac,
+ 0x3064, 0x0040, 0x0c88, 0x241c, 0x3cac,
+ 0x3068, 0x0044, 0x0c8c, 0x2420, 0x3cb0,
+ 0x3168, 0x0144, 0x0d8c, 0x2520, 0x3db0,
+ 0x3268, 0x0244, 0x0e8c, 0x2620, 0x3eb0,
+ 0x3648, 0x0624, 0x126c, 0x2a00, 0x4290,
+ 0x3748, 0x0724, 0x136c, 0x2b00, 0x4390,
+ 0x3848, 0x0824, 0x146c, 0x2c00, 0x4490,
+ 0x384c, 0x0828, 0x1470, 0x2c04, 0x4494,
+ 0x374c, 0x0728, 0x1370, 0x2b04, 0x4394,
+ 0x364c, 0x0628, 0x1270, 0x2a04, 0x4294,
+ 0x3650, 0x062c, 0x1274, 0x2a08, 0x4298,
+ 0x3750, 0x072c, 0x1374, 0x2b08, 0x4398,
+ 0x3850, 0x082c, 0x1474, 0x2c08, 0x4498,
+ 0x3854, 0x0830, 0x1478, 0x2c0c, 0x449c,
+ 0x3754, 0x0730, 0x1378, 0x2b0c, 0x439c,
+ 0x3654, 0x0630, 0x1278, 0x2a0c, 0x429c,
+ 0x3658, 0x0634, 0x127c, 0x2a10, 0x42a0,
+ 0x3758, 0x0734, 0x137c, 0x2b10, 0x43a0,
+ 0x3858, 0x0834, 0x147c, 0x2c10, 0x44a0,
+ 0x385c, 0x0838, 0x1480, 0x2c14, 0x44a4,
+ 0x375c, 0x0738, 0x1380, 0x2b14, 0x43a4,
+ 0x365c, 0x0638, 0x1280, 0x2a14, 0x42a4,
+ 0x3660, 0x063c, 0x1284, 0x2a18, 0x42a8,
+ 0x3760, 0x073c, 0x1384, 0x2b18, 0x43a8,
+ 0x3860, 0x083c, 0x1484, 0x2c18, 0x44a8,
+ 0x3864, 0x0840, 0x1488, 0x2c1c, 0x44ac,
+ 0x3764, 0x0740, 0x1388, 0x2b1c, 0x43ac,
+ 0x3664, 0x0640, 0x1288, 0x2a1c, 0x42ac,
+ 0x3668, 0x0644, 0x128c, 0x2a20, 0x42b0,
+ 0x3768, 0x0744, 0x138c, 0x2b20, 0x43b0,
+ 0x3868, 0x0844, 0x148c, 0x2c20, 0x44b0,
+ 0x3c48, 0x0c24, 0x186c, 0x3000, 0x0090,
+ 0x3d48, 0x0d24, 0x196c, 0x3100, 0x0190,
+ 0x3e48, 0x0e24, 0x1a6c, 0x3200, 0x0290,
+ 0x3e4c, 0x0e28, 0x1a70, 0x3204, 0x0294,
+ 0x3d4c, 0x0d28, 0x1970, 0x3104, 0x0194,
+ 0x3c4c, 0x0c28, 0x1870, 0x3004, 0x0094,
+ 0x3c50, 0x0c2c, 0x1874, 0x3008, 0x0098,
+ 0x3d50, 0x0d2c, 0x1974, 0x3108, 0x0198,
+ 0x3e50, 0x0e2c, 0x1a74, 0x3208, 0x0298,
+ 0x3e54, 0x0e30, 0x1a78, 0x320c, 0x029c,
+ 0x3d54, 0x0d30, 0x1978, 0x310c, 0x019c,
+ 0x3c54, 0x0c30, 0x1878, 0x300c, 0x009c,
+ 0x3c58, 0x0c34, 0x187c, 0x3010, 0x00a0,
+ 0x3d58, 0x0d34, 0x197c, 0x3110, 0x01a0,
+ 0x3e58, 0x0e34, 0x1a7c, 0x3210, 0x02a0,
+ 0x3e5c, 0x0e38, 0x1a80, 0x3214, 0x02a4,
+ 0x3d5c, 0x0d38, 0x1980, 0x3114, 0x01a4,
+ 0x3c5c, 0x0c38, 0x1880, 0x3014, 0x00a4,
+ 0x3c60, 0x0c3c, 0x1884, 0x3018, 0x00a8,
+ 0x3d60, 0x0d3c, 0x1984, 0x3118, 0x01a8,
+ 0x3e60, 0x0e3c, 0x1a84, 0x3218, 0x02a8,
+ 0x3e64, 0x0e40, 0x1a88, 0x321c, 0x02ac,
+ 0x3d64, 0x0d40, 0x1988, 0x311c, 0x01ac,
+ 0x3c64, 0x0c40, 0x1888, 0x301c, 0x00ac,
+ 0x3c68, 0x0c44, 0x188c, 0x3020, 0x00b0,
+ 0x3d68, 0x0d44, 0x198c, 0x3120, 0x01b0,
+ 0x3e68, 0x0e44, 0x1a8c, 0x3220, 0x02b0,
+ 0x4248, 0x1224, 0x1e6c, 0x3600, 0x0690,
+ 0x4348, 0x1324, 0x1f6c, 0x3700, 0x0790,
+ 0x4448, 0x1424, 0x206c, 0x3800, 0x0890,
+ 0x444c, 0x1428, 0x2070, 0x3804, 0x0894,
+ 0x434c, 0x1328, 0x1f70, 0x3704, 0x0794,
+ 0x424c, 0x1228, 0x1e70, 0x3604, 0x0694,
+ 0x4250, 0x122c, 0x1e74, 0x3608, 0x0698,
+ 0x4350, 0x132c, 0x1f74, 0x3708, 0x0798,
+ 0x4450, 0x142c, 0x2074, 0x3808, 0x0898,
+ 0x4454, 0x1430, 0x2078, 0x380c, 0x089c,
+ 0x4354, 0x1330, 0x1f78, 0x370c, 0x079c,
+ 0x4254, 0x1230, 0x1e78, 0x360c, 0x069c,
+ 0x4258, 0x1234, 0x1e7c, 0x3610, 0x06a0,
+ 0x4358, 0x1334, 0x1f7c, 0x3710, 0x07a0,
+ 0x4458, 0x1434, 0x207c, 0x3810, 0x08a0,
+ 0x445c, 0x1438, 0x2080, 0x3814, 0x08a4,
+ 0x435c, 0x1338, 0x1f80, 0x3714, 0x07a4,
+ 0x425c, 0x1238, 0x1e80, 0x3614, 0x06a4,
+ 0x4260, 0x123c, 0x1e84, 0x3618, 0x06a8,
+ 0x4360, 0x133c, 0x1f84, 0x3718, 0x07a8,
+ 0x4460, 0x143c, 0x2084, 0x3818, 0x08a8,
+ 0x4464, 0x1440, 0x2088, 0x381c, 0x08ac,
+ 0x4364, 0x1340, 0x1f88, 0x371c, 0x07ac,
+ 0x4264, 0x1240, 0x1e88, 0x361c, 0x06ac,
+ 0x4268, 0x1244, 0x1e8c, 0x3620, 0x06b0,
+ 0x4368, 0x1344, 0x1f8c, 0x3720, 0x07b0,
+ 0x4468, 0x1444, 0x208c, 0x3820, 0x08b0,
+ 0x0048, 0x1824, 0x246c, 0x3c00, 0x0c90,
+ 0x0148, 0x1924, 0x256c, 0x3d00, 0x0d90,
+ 0x0248, 0x1a24, 0x266c, 0x3e00, 0x0e90,
+ 0x024c, 0x1a28, 0x2670, 0x3e04, 0x0e94,
+ 0x014c, 0x1928, 0x2570, 0x3d04, 0x0d94,
+ 0x004c, 0x1828, 0x2470, 0x3c04, 0x0c94,
+ 0x0050, 0x182c, 0x2474, 0x3c08, 0x0c98,
+ 0x0150, 0x192c, 0x2574, 0x3d08, 0x0d98,
+ 0x0250, 0x1a2c, 0x2674, 0x3e08, 0x0e98,
+ 0x0254, 0x1a30, 0x2678, 0x3e0c, 0x0e9c,
+ 0x0154, 0x1930, 0x2578, 0x3d0c, 0x0d9c,
+ 0x0054, 0x1830, 0x2478, 0x3c0c, 0x0c9c,
+ 0x0058, 0x1834, 0x247c, 0x3c10, 0x0ca0,
+ 0x0158, 0x1934, 0x257c, 0x3d10, 0x0da0,
+ 0x0258, 0x1a34, 0x267c, 0x3e10, 0x0ea0,
+ 0x025c, 0x1a38, 0x2680, 0x3e14, 0x0ea4,
+ 0x015c, 0x1938, 0x2580, 0x3d14, 0x0da4,
+ 0x005c, 0x1838, 0x2480, 0x3c14, 0x0ca4,
+ 0x0060, 0x183c, 0x2484, 0x3c18, 0x0ca8,
+ 0x0160, 0x193c, 0x2584, 0x3d18, 0x0da8,
+ 0x0260, 0x1a3c, 0x2684, 0x3e18, 0x0ea8,
+ 0x0264, 0x1a40, 0x2688, 0x3e1c, 0x0eac,
+ 0x0164, 0x1940, 0x2588, 0x3d1c, 0x0dac,
+ 0x0064, 0x1840, 0x2488, 0x3c1c, 0x0cac,
+ 0x0068, 0x1844, 0x248c, 0x3c20, 0x0cb0,
+ 0x0168, 0x1944, 0x258c, 0x3d20, 0x0db0,
+ 0x0268, 0x1a44, 0x268c, 0x3e20, 0x0eb0,
+ 0x0648, 0x1e24, 0x2a6c, 0x4200, 0x1290,
+ 0x0748, 0x1f24, 0x2b6c, 0x4300, 0x1390,
+ 0x0848, 0x2024, 0x2c6c, 0x4400, 0x1490,
+ 0x084c, 0x2028, 0x2c70, 0x4404, 0x1494,
+ 0x074c, 0x1f28, 0x2b70, 0x4304, 0x1394,
+ 0x064c, 0x1e28, 0x2a70, 0x4204, 0x1294,
+ 0x0650, 0x1e2c, 0x2a74, 0x4208, 0x1298,
+ 0x0750, 0x1f2c, 0x2b74, 0x4308, 0x1398,
+ 0x0850, 0x202c, 0x2c74, 0x4408, 0x1498,
+ 0x0854, 0x2030, 0x2c78, 0x440c, 0x149c,
+ 0x0754, 0x1f30, 0x2b78, 0x430c, 0x139c,
+ 0x0654, 0x1e30, 0x2a78, 0x420c, 0x129c,
+ 0x0658, 0x1e34, 0x2a7c, 0x4210, 0x12a0,
+ 0x0758, 0x1f34, 0x2b7c, 0x4310, 0x13a0,
+ 0x0858, 0x2034, 0x2c7c, 0x4410, 0x14a0,
+ 0x085c, 0x2038, 0x2c80, 0x4414, 0x14a4,
+ 0x075c, 0x1f38, 0x2b80, 0x4314, 0x13a4,
+ 0x065c, 0x1e38, 0x2a80, 0x4214, 0x12a4,
+ 0x0660, 0x1e3c, 0x2a84, 0x4218, 0x12a8,
+ 0x0760, 0x1f3c, 0x2b84, 0x4318, 0x13a8,
+ 0x0860, 0x203c, 0x2c84, 0x4418, 0x14a8,
+ 0x0864, 0x2040, 0x2c88, 0x441c, 0x14ac,
+ 0x0764, 0x1f40, 0x2b88, 0x431c, 0x13ac,
+ 0x0664, 0x1e40, 0x2a88, 0x421c, 0x12ac,
+ 0x0668, 0x1e44, 0x2a8c, 0x4220, 0x12b0,
+ 0x0768, 0x1f44, 0x2b8c, 0x4320, 0x13b0,
+ 0x0868, 0x2044, 0x2c8c, 0x4420, 0x14b0,
+ 0x0f48, 0x2724, 0x336c, 0x0300, 0x1b90,
+ 0x1048, 0x2824, 0x346c, 0x0400, 0x1c90,
+ 0x1148, 0x2924, 0x356c, 0x0500, 0x1d90,
+ 0x114c, 0x2928, 0x3570, 0x0504, 0x1d94,
+ 0x104c, 0x2828, 0x3470, 0x0404, 0x1c94,
+ 0x0f4c, 0x2728, 0x3370, 0x0304, 0x1b94,
+ 0x0f50, 0x272c, 0x3374, 0x0308, 0x1b98,
+ 0x1050, 0x282c, 0x3474, 0x0408, 0x1c98,
+ 0x1150, 0x292c, 0x3574, 0x0508, 0x1d98,
+ 0x1154, 0x2930, 0x3578, 0x050c, 0x1d9c,
+ 0x1054, 0x2830, 0x3478, 0x040c, 0x1c9c,
+ 0x0f54, 0x2730, 0x3378, 0x030c, 0x1b9c,
+ 0x0f58, 0x2734, 0x337c, 0x0310, 0x1ba0,
+ 0x1058, 0x2834, 0x347c, 0x0410, 0x1ca0,
+ 0x1158, 0x2934, 0x357c, 0x0510, 0x1da0,
+ 0x115c, 0x2938, 0x3580, 0x0514, 0x1da4,
+ 0x105c, 0x2838, 0x3480, 0x0414, 0x1ca4,
+ 0x0f5c, 0x2738, 0x3380, 0x0314, 0x1ba4,
+ 0x0f60, 0x273c, 0x3384, 0x0318, 0x1ba8,
+ 0x1060, 0x283c, 0x3484, 0x0418, 0x1ca8,
+ 0x1160, 0x293c, 0x3584, 0x0518, 0x1da8,
+ 0x1164, 0x2940, 0x3588, 0x051c, 0x1dac,
+ 0x1064, 0x2840, 0x3488, 0x041c, 0x1cac,
+ 0x0f64, 0x2740, 0x3388, 0x031c, 0x1bac,
+ 0x0f68, 0x2744, 0x338c, 0x0320, 0x1bb0,
+ 0x1068, 0x2844, 0x348c, 0x0420, 0x1cb0,
+ 0x1168, 0x2944, 0x358c, 0x0520, 0x1db0,
+ 0x1548, 0x2d24, 0x396c, 0x0900, 0x2190,
+ 0x1648, 0x2e24, 0x3a6c, 0x0a00, 0x2290,
+ 0x1748, 0x2f24, 0x3b6c, 0x0b00, 0x2390,
+ 0x174c, 0x2f28, 0x3b70, 0x0b04, 0x2394,
+ 0x164c, 0x2e28, 0x3a70, 0x0a04, 0x2294,
+ 0x154c, 0x2d28, 0x3970, 0x0904, 0x2194,
+ 0x1550, 0x2d2c, 0x3974, 0x0908, 0x2198,
+ 0x1650, 0x2e2c, 0x3a74, 0x0a08, 0x2298,
+ 0x1750, 0x2f2c, 0x3b74, 0x0b08, 0x2398,
+ 0x1754, 0x2f30, 0x3b78, 0x0b0c, 0x239c,
+ 0x1654, 0x2e30, 0x3a78, 0x0a0c, 0x229c,
+ 0x1554, 0x2d30, 0x3978, 0x090c, 0x219c,
+ 0x1558, 0x2d34, 0x397c, 0x0910, 0x21a0,
+ 0x1658, 0x2e34, 0x3a7c, 0x0a10, 0x22a0,
+ 0x1758, 0x2f34, 0x3b7c, 0x0b10, 0x23a0,
+ 0x175c, 0x2f38, 0x3b80, 0x0b14, 0x23a4,
+ 0x165c, 0x2e38, 0x3a80, 0x0a14, 0x22a4,
+ 0x155c, 0x2d38, 0x3980, 0x0914, 0x21a4,
+ 0x1560, 0x2d3c, 0x3984, 0x0918, 0x21a8,
+ 0x1660, 0x2e3c, 0x3a84, 0x0a18, 0x22a8,
+ 0x1760, 0x2f3c, 0x3b84, 0x0b18, 0x23a8,
+ 0x1764, 0x2f40, 0x3b88, 0x0b1c, 0x23ac,
+ 0x1664, 0x2e40, 0x3a88, 0x0a1c, 0x22ac,
+ 0x1564, 0x2d40, 0x3988, 0x091c, 0x21ac,
+ 0x1568, 0x2d44, 0x398c, 0x0920, 0x21b0,
+ 0x1668, 0x2e44, 0x3a8c, 0x0a20, 0x22b0,
+ 0x1768, 0x2f44, 0x3b8c, 0x0b20, 0x23b0,
+ 0x1b48, 0x3324, 0x3f6c, 0x0f00, 0x2790,
+ 0x1c48, 0x3424, 0x406c, 0x1000, 0x2890,
+ 0x1d48, 0x3524, 0x416c, 0x1100, 0x2990,
+ 0x1d4c, 0x3528, 0x4170, 0x1104, 0x2994,
+ 0x1c4c, 0x3428, 0x4070, 0x1004, 0x2894,
+ 0x1b4c, 0x3328, 0x3f70, 0x0f04, 0x2794,
+ 0x1b50, 0x332c, 0x3f74, 0x0f08, 0x2798,
+ 0x1c50, 0x342c, 0x4074, 0x1008, 0x2898,
+ 0x1d50, 0x352c, 0x4174, 0x1108, 0x2998,
+ 0x1d54, 0x3530, 0x4178, 0x110c, 0x299c,
+ 0x1c54, 0x3430, 0x4078, 0x100c, 0x289c,
+ 0x1b54, 0x3330, 0x3f78, 0x0f0c, 0x279c,
+ 0x1b58, 0x3334, 0x3f7c, 0x0f10, 0x27a0,
+ 0x1c58, 0x3434, 0x407c, 0x1010, 0x28a0,
+ 0x1d58, 0x3534, 0x417c, 0x1110, 0x29a0,
+ 0x1d5c, 0x3538, 0x4180, 0x1114, 0x29a4,
+ 0x1c5c, 0x3438, 0x4080, 0x1014, 0x28a4,
+ 0x1b5c, 0x3338, 0x3f80, 0x0f14, 0x27a4,
+ 0x1b60, 0x333c, 0x3f84, 0x0f18, 0x27a8,
+ 0x1c60, 0x343c, 0x4084, 0x1018, 0x28a8,
+ 0x1d60, 0x353c, 0x4184, 0x1118, 0x29a8,
+ 0x1d64, 0x3540, 0x4188, 0x111c, 0x29ac,
+ 0x1c64, 0x3440, 0x4088, 0x101c, 0x28ac,
+ 0x1b64, 0x3340, 0x3f88, 0x0f1c, 0x27ac,
+ 0x1b68, 0x3344, 0x3f8c, 0x0f20, 0x27b0,
+ 0x1c68, 0x3444, 0x408c, 0x1020, 0x28b0,
+ 0x1d68, 0x3544, 0x418c, 0x1120, 0x29b0,
+ 0x2148, 0x3924, 0x456c, 0x1500, 0x2d90,
+ 0x2248, 0x3a24, 0x466c, 0x1600, 0x2e90,
+ 0x2348, 0x3b24, 0x476c, 0x1700, 0x2f90,
+ 0x234c, 0x3b28, 0x4770, 0x1704, 0x2f94,
+ 0x224c, 0x3a28, 0x4670, 0x1604, 0x2e94,
+ 0x214c, 0x3928, 0x4570, 0x1504, 0x2d94,
+ 0x2150, 0x392c, 0x4574, 0x1508, 0x2d98,
+ 0x2250, 0x3a2c, 0x4674, 0x1608, 0x2e98,
+ 0x2350, 0x3b2c, 0x4774, 0x1708, 0x2f98,
+ 0x2354, 0x3b30, 0x4778, 0x170c, 0x2f9c,
+ 0x2254, 0x3a30, 0x4678, 0x160c, 0x2e9c,
+ 0x2154, 0x3930, 0x4578, 0x150c, 0x2d9c,
+ 0x2158, 0x3934, 0x457c, 0x1510, 0x2da0,
+ 0x2258, 0x3a34, 0x467c, 0x1610, 0x2ea0,
+ 0x2358, 0x3b34, 0x477c, 0x1710, 0x2fa0,
+ 0x235c, 0x3b38, 0x4780, 0x1714, 0x2fa4,
+ 0x225c, 0x3a38, 0x4680, 0x1614, 0x2ea4,
+ 0x215c, 0x3938, 0x4580, 0x1514, 0x2da4,
+ 0x2160, 0x393c, 0x4584, 0x1518, 0x2da8,
+ 0x2260, 0x3a3c, 0x4684, 0x1618, 0x2ea8,
+ 0x2360, 0x3b3c, 0x4784, 0x1718, 0x2fa8,
+ 0x2364, 0x3b40, 0x4788, 0x171c, 0x2fac,
+ 0x2264, 0x3a40, 0x4688, 0x161c, 0x2eac,
+ 0x2164, 0x3940, 0x4588, 0x151c, 0x2dac,
+ 0x2168, 0x3944, 0x458c, 0x1520, 0x2db0,
+ 0x2268, 0x3a44, 0x468c, 0x1620, 0x2eb0,
+ 0x2368, 0x3b44, 0x478c, 0x1720, 0x2fb0,
+ 0x2748, 0x3f24, 0x036c, 0x1b00, 0x3390,
+ 0x2848, 0x4024, 0x046c, 0x1c00, 0x3490,
+ 0x2948, 0x4124, 0x056c, 0x1d00, 0x3590,
+ 0x294c, 0x4128, 0x0570, 0x1d04, 0x3594,
+ 0x284c, 0x4028, 0x0470, 0x1c04, 0x3494,
+ 0x274c, 0x3f28, 0x0370, 0x1b04, 0x3394,
+ 0x2750, 0x3f2c, 0x0374, 0x1b08, 0x3398,
+ 0x2850, 0x402c, 0x0474, 0x1c08, 0x3498,
+ 0x2950, 0x412c, 0x0574, 0x1d08, 0x3598,
+ 0x2954, 0x4130, 0x0578, 0x1d0c, 0x359c,
+ 0x2854, 0x4030, 0x0478, 0x1c0c, 0x349c,
+ 0x2754, 0x3f30, 0x0378, 0x1b0c, 0x339c,
+ 0x2758, 0x3f34, 0x037c, 0x1b10, 0x33a0,
+ 0x2858, 0x4034, 0x047c, 0x1c10, 0x34a0,
+ 0x2958, 0x4134, 0x057c, 0x1d10, 0x35a0,
+ 0x295c, 0x4138, 0x0580, 0x1d14, 0x35a4,
+ 0x285c, 0x4038, 0x0480, 0x1c14, 0x34a4,
+ 0x275c, 0x3f38, 0x0380, 0x1b14, 0x33a4,
+ 0x2760, 0x3f3c, 0x0384, 0x1b18, 0x33a8,
+ 0x2860, 0x403c, 0x0484, 0x1c18, 0x34a8,
+ 0x2960, 0x413c, 0x0584, 0x1d18, 0x35a8,
+ 0x2964, 0x4140, 0x0588, 0x1d1c, 0x35ac,
+ 0x2864, 0x4040, 0x0488, 0x1c1c, 0x34ac,
+ 0x2764, 0x3f40, 0x0388, 0x1b1c, 0x33ac,
+ 0x2768, 0x3f44, 0x038c, 0x1b20, 0x33b0,
+ 0x2868, 0x4044, 0x048c, 0x1c20, 0x34b0,
+ 0x2968, 0x4144, 0x058c, 0x1d20, 0x35b0,
+ 0x2d48, 0x4524, 0x096c, 0x2100, 0x3990,
+ 0x2e48, 0x4624, 0x0a6c, 0x2200, 0x3a90,
+ 0x2f48, 0x4724, 0x0b6c, 0x2300, 0x3b90,
+ 0x2f4c, 0x4728, 0x0b70, 0x2304, 0x3b94,
+ 0x2e4c, 0x4628, 0x0a70, 0x2204, 0x3a94,
+ 0x2d4c, 0x4528, 0x0970, 0x2104, 0x3994,
+ 0x2d50, 0x452c, 0x0974, 0x2108, 0x3998,
+ 0x2e50, 0x462c, 0x0a74, 0x2208, 0x3a98,
+ 0x2f50, 0x472c, 0x0b74, 0x2308, 0x3b98,
+ 0x2f54, 0x4730, 0x0b78, 0x230c, 0x3b9c,
+ 0x2e54, 0x4630, 0x0a78, 0x220c, 0x3a9c,
+ 0x2d54, 0x4530, 0x0978, 0x210c, 0x399c,
+ 0x2d58, 0x4534, 0x097c, 0x2110, 0x39a0,
+ 0x2e58, 0x4634, 0x0a7c, 0x2210, 0x3aa0,
+ 0x2f58, 0x4734, 0x0b7c, 0x2310, 0x3ba0,
+ 0x2f5c, 0x4738, 0x0b80, 0x2314, 0x3ba4,
+ 0x2e5c, 0x4638, 0x0a80, 0x2214, 0x3aa4,
+ 0x2d5c, 0x4538, 0x0980, 0x2114, 0x39a4,
+ 0x2d60, 0x453c, 0x0984, 0x2118, 0x39a8,
+ 0x2e60, 0x463c, 0x0a84, 0x2218, 0x3aa8,
+ 0x2f60, 0x473c, 0x0b84, 0x2318, 0x3ba8,
+ 0x2f64, 0x4740, 0x0b88, 0x231c, 0x3bac,
+ 0x2e64, 0x4640, 0x0a88, 0x221c, 0x3aac,
+ 0x2d64, 0x4540, 0x0988, 0x211c, 0x39ac,
+ 0x2d68, 0x4544, 0x098c, 0x2120, 0x39b0,
+ 0x2e68, 0x4644, 0x0a8c, 0x2220, 0x3ab0,
+ 0x2f68, 0x4744, 0x0b8c, 0x2320, 0x3bb0,
+ 0x3348, 0x0324, 0x0f6c, 0x2700, 0x3f90,
+ 0x3448, 0x0424, 0x106c, 0x2800, 0x4090,
+ 0x3548, 0x0524, 0x116c, 0x2900, 0x4190,
+ 0x354c, 0x0528, 0x1170, 0x2904, 0x4194,
+ 0x344c, 0x0428, 0x1070, 0x2804, 0x4094,
+ 0x334c, 0x0328, 0x0f70, 0x2704, 0x3f94,
+ 0x3350, 0x032c, 0x0f74, 0x2708, 0x3f98,
+ 0x3450, 0x042c, 0x1074, 0x2808, 0x4098,
+ 0x3550, 0x052c, 0x1174, 0x2908, 0x4198,
+ 0x3554, 0x0530, 0x1178, 0x290c, 0x419c,
+ 0x3454, 0x0430, 0x1078, 0x280c, 0x409c,
+ 0x3354, 0x0330, 0x0f78, 0x270c, 0x3f9c,
+ 0x3358, 0x0334, 0x0f7c, 0x2710, 0x3fa0,
+ 0x3458, 0x0434, 0x107c, 0x2810, 0x40a0,
+ 0x3558, 0x0534, 0x117c, 0x2910, 0x41a0,
+ 0x355c, 0x0538, 0x1180, 0x2914, 0x41a4,
+ 0x345c, 0x0438, 0x1080, 0x2814, 0x40a4,
+ 0x335c, 0x0338, 0x0f80, 0x2714, 0x3fa4,
+ 0x3360, 0x033c, 0x0f84, 0x2718, 0x3fa8,
+ 0x3460, 0x043c, 0x1084, 0x2818, 0x40a8,
+ 0x3560, 0x053c, 0x1184, 0x2918, 0x41a8,
+ 0x3564, 0x0540, 0x1188, 0x291c, 0x41ac,
+ 0x3464, 0x0440, 0x1088, 0x281c, 0x40ac,
+ 0x3364, 0x0340, 0x0f88, 0x271c, 0x3fac,
+ 0x3368, 0x0344, 0x0f8c, 0x2720, 0x3fb0,
+ 0x3468, 0x0444, 0x108c, 0x2820, 0x40b0,
+ 0x3568, 0x0544, 0x118c, 0x2920, 0x41b0,
+ 0x3948, 0x0924, 0x156c, 0x2d00, 0x4590,
+ 0x3a48, 0x0a24, 0x166c, 0x2e00, 0x4690,
+ 0x3b48, 0x0b24, 0x176c, 0x2f00, 0x4790,
+ 0x3b4c, 0x0b28, 0x1770, 0x2f04, 0x4794,
+ 0x3a4c, 0x0a28, 0x1670, 0x2e04, 0x4694,
+ 0x394c, 0x0928, 0x1570, 0x2d04, 0x4594,
+ 0x3950, 0x092c, 0x1574, 0x2d08, 0x4598,
+ 0x3a50, 0x0a2c, 0x1674, 0x2e08, 0x4698,
+ 0x3b50, 0x0b2c, 0x1774, 0x2f08, 0x4798,
+ 0x3b54, 0x0b30, 0x1778, 0x2f0c, 0x479c,
+ 0x3a54, 0x0a30, 0x1678, 0x2e0c, 0x469c,
+ 0x3954, 0x0930, 0x1578, 0x2d0c, 0x459c,
+ 0x3958, 0x0934, 0x157c, 0x2d10, 0x45a0,
+ 0x3a58, 0x0a34, 0x167c, 0x2e10, 0x46a0,
+ 0x3b58, 0x0b34, 0x177c, 0x2f10, 0x47a0,
+ 0x3b5c, 0x0b38, 0x1780, 0x2f14, 0x47a4,
+ 0x3a5c, 0x0a38, 0x1680, 0x2e14, 0x46a4,
+ 0x395c, 0x0938, 0x1580, 0x2d14, 0x45a4,
+ 0x3960, 0x093c, 0x1584, 0x2d18, 0x45a8,
+ 0x3a60, 0x0a3c, 0x1684, 0x2e18, 0x46a8,
+ 0x3b60, 0x0b3c, 0x1784, 0x2f18, 0x47a8,
+ 0x3b64, 0x0b40, 0x1788, 0x2f1c, 0x47ac,
+ 0x3a64, 0x0a40, 0x1688, 0x2e1c, 0x46ac,
+ 0x3964, 0x0940, 0x1588, 0x2d1c, 0x45ac,
+ 0x3968, 0x0944, 0x158c, 0x2d20, 0x45b0,
+ 0x3a68, 0x0a44, 0x168c, 0x2e20, 0x46b0,
+ 0x3b68, 0x0b44, 0x178c, 0x2f20, 0x47b0,
+ 0x3f48, 0x0f24, 0x1b6c, 0x3300, 0x0390,
+ 0x4048, 0x1024, 0x1c6c, 0x3400, 0x0490,
+ 0x4148, 0x1124, 0x1d6c, 0x3500, 0x0590,
+ 0x414c, 0x1128, 0x1d70, 0x3504, 0x0594,
+ 0x404c, 0x1028, 0x1c70, 0x3404, 0x0494,
+ 0x3f4c, 0x0f28, 0x1b70, 0x3304, 0x0394,
+ 0x3f50, 0x0f2c, 0x1b74, 0x3308, 0x0398,
+ 0x4050, 0x102c, 0x1c74, 0x3408, 0x0498,
+ 0x4150, 0x112c, 0x1d74, 0x3508, 0x0598,
+ 0x4154, 0x1130, 0x1d78, 0x350c, 0x059c,
+ 0x4054, 0x1030, 0x1c78, 0x340c, 0x049c,
+ 0x3f54, 0x0f30, 0x1b78, 0x330c, 0x039c,
+ 0x3f58, 0x0f34, 0x1b7c, 0x3310, 0x03a0,
+ 0x4058, 0x1034, 0x1c7c, 0x3410, 0x04a0,
+ 0x4158, 0x1134, 0x1d7c, 0x3510, 0x05a0,
+ 0x415c, 0x1138, 0x1d80, 0x3514, 0x05a4,
+ 0x405c, 0x1038, 0x1c80, 0x3414, 0x04a4,
+ 0x3f5c, 0x0f38, 0x1b80, 0x3314, 0x03a4,
+ 0x3f60, 0x0f3c, 0x1b84, 0x3318, 0x03a8,
+ 0x4060, 0x103c, 0x1c84, 0x3418, 0x04a8,
+ 0x4160, 0x113c, 0x1d84, 0x3518, 0x05a8,
+ 0x4164, 0x1140, 0x1d88, 0x351c, 0x05ac,
+ 0x4064, 0x1040, 0x1c88, 0x341c, 0x04ac,
+ 0x3f64, 0x0f40, 0x1b88, 0x331c, 0x03ac,
+ 0x3f68, 0x0f44, 0x1b8c, 0x3320, 0x03b0,
+ 0x4068, 0x1044, 0x1c8c, 0x3420, 0x04b0,
+ 0x4168, 0x1144, 0x1d8c, 0x3520, 0x05b0,
+ 0x4548, 0x1524, 0x216c, 0x3900, 0x0990,
+ 0x4648, 0x1624, 0x226c, 0x3a00, 0x0a90,
+ 0x4748, 0x1724, 0x236c, 0x3b00, 0x0b90,
+ 0x474c, 0x1728, 0x2370, 0x3b04, 0x0b94,
+ 0x464c, 0x1628, 0x2270, 0x3a04, 0x0a94,
+ 0x454c, 0x1528, 0x2170, 0x3904, 0x0994,
+ 0x4550, 0x152c, 0x2174, 0x3908, 0x0998,
+ 0x4650, 0x162c, 0x2274, 0x3a08, 0x0a98,
+ 0x4750, 0x172c, 0x2374, 0x3b08, 0x0b98,
+ 0x4754, 0x1730, 0x2378, 0x3b0c, 0x0b9c,
+ 0x4654, 0x1630, 0x2278, 0x3a0c, 0x0a9c,
+ 0x4554, 0x1530, 0x2178, 0x390c, 0x099c,
+ 0x4558, 0x1534, 0x217c, 0x3910, 0x09a0,
+ 0x4658, 0x1634, 0x227c, 0x3a10, 0x0aa0,
+ 0x4758, 0x1734, 0x237c, 0x3b10, 0x0ba0,
+ 0x475c, 0x1738, 0x2380, 0x3b14, 0x0ba4,
+ 0x465c, 0x1638, 0x2280, 0x3a14, 0x0aa4,
+ 0x455c, 0x1538, 0x2180, 0x3914, 0x09a4,
+ 0x4560, 0x153c, 0x2184, 0x3918, 0x09a8,
+ 0x4660, 0x163c, 0x2284, 0x3a18, 0x0aa8,
+ 0x4760, 0x173c, 0x2384, 0x3b18, 0x0ba8,
+ 0x4764, 0x1740, 0x2388, 0x3b1c, 0x0bac,
+ 0x4664, 0x1640, 0x2288, 0x3a1c, 0x0aac,
+ 0x4564, 0x1540, 0x2188, 0x391c, 0x09ac,
+ 0x4568, 0x1544, 0x218c, 0x3920, 0x09b0,
+ 0x4668, 0x1644, 0x228c, 0x3a20, 0x0ab0,
+ 0x4768, 0x1744, 0x238c, 0x3b20, 0x0bb0,
+ 0x0348, 0x1b24, 0x276c, 0x3f00, 0x0f90,
+ 0x0448, 0x1c24, 0x286c, 0x4000, 0x1090,
+ 0x0548, 0x1d24, 0x296c, 0x4100, 0x1190,
+ 0x054c, 0x1d28, 0x2970, 0x4104, 0x1194,
+ 0x044c, 0x1c28, 0x2870, 0x4004, 0x1094,
+ 0x034c, 0x1b28, 0x2770, 0x3f04, 0x0f94,
+ 0x0350, 0x1b2c, 0x2774, 0x3f08, 0x0f98,
+ 0x0450, 0x1c2c, 0x2874, 0x4008, 0x1098,
+ 0x0550, 0x1d2c, 0x2974, 0x4108, 0x1198,
+ 0x0554, 0x1d30, 0x2978, 0x410c, 0x119c,
+ 0x0454, 0x1c30, 0x2878, 0x400c, 0x109c,
+ 0x0354, 0x1b30, 0x2778, 0x3f0c, 0x0f9c,
+ 0x0358, 0x1b34, 0x277c, 0x3f10, 0x0fa0,
+ 0x0458, 0x1c34, 0x287c, 0x4010, 0x10a0,
+ 0x0558, 0x1d34, 0x297c, 0x4110, 0x11a0,
+ 0x055c, 0x1d38, 0x2980, 0x4114, 0x11a4,
+ 0x045c, 0x1c38, 0x2880, 0x4014, 0x10a4,
+ 0x035c, 0x1b38, 0x2780, 0x3f14, 0x0fa4,
+ 0x0360, 0x1b3c, 0x2784, 0x3f18, 0x0fa8,
+ 0x0460, 0x1c3c, 0x2884, 0x4018, 0x10a8,
+ 0x0560, 0x1d3c, 0x2984, 0x4118, 0x11a8,
+ 0x0564, 0x1d40, 0x2988, 0x411c, 0x11ac,
+ 0x0464, 0x1c40, 0x2888, 0x401c, 0x10ac,
+ 0x0364, 0x1b40, 0x2788, 0x3f1c, 0x0fac,
+ 0x0368, 0x1b44, 0x278c, 0x3f20, 0x0fb0,
+ 0x0468, 0x1c44, 0x288c, 0x4020, 0x10b0,
+ 0x0568, 0x1d44, 0x298c, 0x4120, 0x11b0,
+ 0x0948, 0x2124, 0x2d6c, 0x4500, 0x1590,
+ 0x0a48, 0x2224, 0x2e6c, 0x4600, 0x1690,
+ 0x0b48, 0x2324, 0x2f6c, 0x4700, 0x1790,
+ 0x0b4c, 0x2328, 0x2f70, 0x4704, 0x1794,
+ 0x0a4c, 0x2228, 0x2e70, 0x4604, 0x1694,
+ 0x094c, 0x2128, 0x2d70, 0x4504, 0x1594,
+ 0x0950, 0x212c, 0x2d74, 0x4508, 0x1598,
+ 0x0a50, 0x222c, 0x2e74, 0x4608, 0x1698,
+ 0x0b50, 0x232c, 0x2f74, 0x4708, 0x1798,
+ 0x0b54, 0x2330, 0x2f78, 0x470c, 0x179c,
+ 0x0a54, 0x2230, 0x2e78, 0x460c, 0x169c,
+ 0x0954, 0x2130, 0x2d78, 0x450c, 0x159c,
+ 0x0958, 0x2134, 0x2d7c, 0x4510, 0x15a0,
+ 0x0a58, 0x2234, 0x2e7c, 0x4610, 0x16a0,
+ 0x0b58, 0x2334, 0x2f7c, 0x4710, 0x17a0,
+ 0x0b5c, 0x2338, 0x2f80, 0x4714, 0x17a4,
+ 0x0a5c, 0x2238, 0x2e80, 0x4614, 0x16a4,
+ 0x095c, 0x2138, 0x2d80, 0x4514, 0x15a4,
+ 0x0960, 0x213c, 0x2d84, 0x4518, 0x15a8,
+ 0x0a60, 0x223c, 0x2e84, 0x4618, 0x16a8,
+ 0x0b60, 0x233c, 0x2f84, 0x4718, 0x17a8,
+ 0x0b64, 0x2340, 0x2f88, 0x471c, 0x17ac,
+ 0x0a64, 0x2240, 0x2e88, 0x461c, 0x16ac,
+ 0x0964, 0x2140, 0x2d88, 0x451c, 0x15ac,
+ 0x0968, 0x2144, 0x2d8c, 0x4520, 0x15b0,
+ 0x0a68, 0x2244, 0x2e8c, 0x4620, 0x16b0,
+ 0x0b68, 0x2344, 0x2f8c, 0x4720, 0x17b0,
+};
+
+/* DV25/50 DCT coefficient weights and inverse weights */
+/* created by dvtables.py */
+static const int dv_weight_bits = 18;
+static const int dv_weight_88[64] = {
+ 131072, 257107, 257107, 242189, 252167, 242189, 235923, 237536,
+ 237536, 235923, 229376, 231390, 223754, 231390, 229376, 222935,
+ 224969, 217965, 217965, 224969, 222935, 200636, 218652, 211916,
+ 212325, 211916, 218652, 200636, 188995, 196781, 205965, 206433,
+ 206433, 205965, 196781, 188995, 185364, 185364, 200636, 200704,
+ 200636, 185364, 185364, 174609, 180568, 195068, 195068, 180568,
+ 174609, 170091, 175557, 189591, 175557, 170091, 165371, 170627,
+ 170627, 165371, 160727, 153560, 160727, 144651, 144651, 136258,
+};
+static const int dv_weight_248[64] = {
+ 131072, 242189, 257107, 237536, 229376, 200636, 242189, 223754,
+ 224969, 196781, 262144, 242189, 229376, 200636, 257107, 237536,
+ 211916, 185364, 235923, 217965, 229376, 211916, 206433, 180568,
+ 242189, 223754, 224969, 196781, 211916, 185364, 235923, 217965,
+ 200704, 175557, 222935, 205965, 200636, 185364, 195068, 170627,
+ 229376, 211916, 206433, 180568, 200704, 175557, 222935, 205965,
+ 175557, 153560, 188995, 174609, 165371, 144651, 200636, 185364,
+ 195068, 170627, 175557, 153560, 188995, 174609, 165371, 144651,
+};
+static const int dv_iweight_bits = 14;
+static const int dv_iweight_88[64] = {
+ 32768, 16710, 16710, 17735, 17015, 17735, 18197, 18079,
+ 18079, 18197, 18725, 18559, 19196, 18559, 18725, 19284,
+ 19108, 19692, 19692, 19108, 19284, 21400, 19645, 20262,
+ 20214, 20262, 19645, 21400, 22733, 21845, 20867, 20815,
+ 20815, 20867, 21845, 22733, 23173, 23173, 21400, 21400,
+ 21400, 23173, 23173, 24600, 23764, 22017, 22017, 23764,
+ 24600, 25267, 24457, 22672, 24457, 25267, 25971, 25191,
+ 25191, 25971, 26715, 27962, 26715, 29642, 29642, 31536,
+};
+static const int dv_iweight_248[64] = {
+ 32768, 17735, 16710, 18079, 18725, 21400, 17735, 19196,
+ 19108, 21845, 16384, 17735, 18725, 21400, 16710, 18079,
+ 20262, 23173, 18197, 19692, 18725, 20262, 20815, 23764,
+ 17735, 19196, 19108, 21845, 20262, 23173, 18197, 19692,
+ 21400, 24457, 19284, 20867, 21400, 23173, 22017, 25191,
+ 18725, 20262, 20815, 23764, 21400, 24457, 19284, 20867,
+ 24457, 27962, 22733, 24600, 25971, 29642, 21400, 23173,
+ 22017, 25191, 24457, 27962, 22733, 24600, 25971, 29642,
+};
+
 static const uint16_t dv_audio_shuffle525[10][9] = {
   {  0, 30, 60, 20, 50, 80, 10, 40, 70 }, /* 1st channel */
   {  6, 36, 66, 26, 56, 86, 16, 46, 76 },
@@ -1294,6 +2540,7 @@ static const DVprofile dv_profiles[] = {
     { .dsf = 0,
       .frame_size = 120000,        /* IEC 61834, SMPTE-314M - 525/60 (NTSC) */
       .difseg_size = 10,
+      .n_difchan = 1,
       .frame_rate = 30000,
       .ltc_divisor = 30,
       .frame_rate_base = 1001,
@@ -1304,12 +2551,13 @@ static const DVprofile dv_profiles[] = {
       .pix_fmt = PIX_FMT_YUV411P,
       .audio_stride = 90,
       .audio_min_samples = { 1580, 1452, 1053 }, /* for 48, 44.1 and 32Khz */
-      .audio_samples_dist = { 1602, 1601, 1602, 1601, 1602 },
+      .audio_samples_dist = { 1600, 1602, 1602, 1602, 1602 }, /* per SMPTE-314M */
       .audio_shuffle = dv_audio_shuffle525,
     },
     { .dsf = 1,
       .frame_size = 144000,        /* IEC 61834 - 625/50 (PAL) */
       .difseg_size = 12,
+      .n_difchan = 1,
       .frame_rate = 25,
       .frame_rate_base = 1,
       .ltc_divisor = 25,
@@ -1326,6 +2574,7 @@ static const DVprofile dv_profiles[] = {
     { .dsf = 1,
       .frame_size = 144000,        /* SMPTE-314M - 625/50 (PAL) */
       .difseg_size = 12,
+      .n_difchan = 1,
       .frame_rate = 25,
       .frame_rate_base = 1,
       .ltc_divisor = 25,
@@ -1338,29 +2587,79 @@ static const DVprofile dv_profiles[] = {
       .audio_min_samples = { 1896, 1742, 1264 }, /* for 48, 44.1 and 32Khz */
       .audio_samples_dist = { 1920, 1920, 1920, 1920, 1920 },
       .audio_shuffle = dv_audio_shuffle625,
-     }
+    },
+    { .dsf = 0,
+      .frame_size = 240000,        /* SMPTE-314M - 525/60 (NTSC) 50 Mbps */
+      .difseg_size = 10,           /* also known as "DVCPRO50" */
+      .n_difchan = 2,
+      .frame_rate = 30000,
+      .ltc_divisor = 30,
+      .frame_rate_base = 1001,
+      .height = 480,
+      .width = 720,
+      .sar = {{10, 11}, {40, 33}},
+      .video_place = dv_place_422_525,
+      .pix_fmt = PIX_FMT_YUV422P,
+      .audio_stride = 90,
+      .audio_min_samples = { 1580, 1452, 1053 }, /* for 48, 44.1 and 32Khz */
+      .audio_samples_dist = { 1600, 1602, 1602, 1602, 1602 }, /* per SMPTE-314M */
+      .audio_shuffle = dv_audio_shuffle525,
+    },
+    { .dsf = 1,
+      .frame_size = 288000,        /* SMPTE-314M - 625/50 (PAL) 50 Mbps */
+      .difseg_size = 12,           /* also known as "DVCPRO50" */
+      .n_difchan = 2,
+      .frame_rate = 25,
+      .frame_rate_base = 1,
+      .ltc_divisor = 25,
+      .height = 576,
+      .width = 720,
+      .sar = {{59, 54}, {118, 81}},
+      .video_place = dv_place_422_625,
+      .pix_fmt = PIX_FMT_YUV422P,
+      .audio_stride = 108,
+      .audio_min_samples = { 1896, 1742, 1264 }, /* for 48, 44.1 and 32Khz */
+      .audio_samples_dist = { 1920, 1920, 1920, 1920, 1920 },
+      .audio_shuffle = dv_audio_shuffle625,
+    }
 };
 
+/* minimum number of bytes to read from a DV stream in order to determine the profile */
+#define DV_PROFILE_BYTES (6*80) /* 6 DIF blocks */
+
+/* largest possible DV frame, in bytes (PAL 50Mbps) */
+#define DV_MAX_FRAME_SIZE 288000
+
 static inline const DVprofile* dv_frame_profile(uint8_t* frame)
 {
     if ((frame[3] & 0x80) == 0) {      /* DSF flag */
-        return &dv_profiles[0];
-    }
-    else if ((frame[5] & 0x07) == 0) { /* APT flag */
-        return &dv_profiles[1];
+        /* it's an NTSC format */
+        if ((frame[80*5 + 48 + 3] & 0x4)) { /* 4:2:2 sampling */
+            return &dv_profiles[3]; /* NTSC 50Mbps */
+        } else { /* 4:1:1 sampling */
+            return &dv_profiles[0]; /* NTSC 25Mbps */
+        }
+    } else {
+        /* it's a PAL format */
+        if ((frame[80*5 + 48 + 3] & 0x4)) { /* 4:2:2 sampling */
+            return &dv_profiles[4]; /* PAL 50Mbps */
+        } else if ((frame[5] & 0x07) == 0) { /* APT flag */
+            return &dv_profiles[1]; /* PAL 25Mbps 4:2:0 */
+        } else
+            return &dv_profiles[2]; /* PAL 25Mbps 4:1:1 */
     }
-    else
-        return &dv_profiles[2];
 }
 
 static inline const DVprofile* dv_codec_profile(AVCodecContext* codec)
 {
-    if (codec->width != 720) {
+    int i;
+
+    if (codec->width != 720)
         return NULL;
-    }
-    else if (codec->height == 480) {
-        return &dv_profiles[0];
-    }
-    else
-        return &dv_profiles[1];
+
+    for (i=0; i<sizeof(dv_profiles)/sizeof(DVprofile); i++)
+       if (codec->height == dv_profiles[i].height && codec->pix_fmt == dv_profiles[i].pix_fmt)
+           return &dv_profiles[i];
+
+    return NULL;
 }
diff --git a/src/libffmpeg/libavcodec/fft.c b/src/libffmpeg/libavcodec/fft.c
index 81b6843e9..1c63f6889 100644
--- a/src/libffmpeg/libavcodec/fft.c
+++ b/src/libffmpeg/libavcodec/fft.c
@@ -57,12 +57,16 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
     s->exptab1 = NULL;
 
     /* compute constant table for HAVE_SSE version */
-#if (defined(HAVE_MMX) && defined(HAVE_BUILTIN_VECTOR)) || defined(HAVE_ALTIVEC)
+#if (defined(HAVE_MMX) && (defined(HAVE_BUILTIN_VECTOR) || defined(HAVE_MM3DNOW))) || defined(HAVE_ALTIVEC)
     {
         int has_vectors = 0;
 
 #if defined(HAVE_MMX)
-        has_vectors = mm_support() & MM_SSE;
+#ifdef HAVE_MM3DNOW
+        has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2);
+#else
+        has_vectors = mm_support() & (MM_SSE | MM_SSE2);
+#endif
 #endif
 #if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
         has_vectors = mm_support() & MM_ALTIVEC;
@@ -94,8 +98,24 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
             } while (nblocks != 0);
             av_freep(&s->exptab);
 #if defined(HAVE_MMX)
-            s->fft_calc = ff_fft_calc_sse;
-#else
+#ifdef HAVE_MM3DNOW
+            if (has_vectors & MM_3DNOWEXT)
+                /* 3DNowEx for Athlon(XP) */
+                s->fft_calc = ff_fft_calc_3dn2;
+            else if (has_vectors & MM_3DNOW)
+                /* 3DNow! for K6-2/3 */
+                s->fft_calc = ff_fft_calc_3dn;
+#endif
+#ifdef HAVE_BUILTIN_VECTOR
+            if (has_vectors & MM_SSE2)
+                /* SSE for P4/K8 */
+                s->fft_calc = ff_fft_calc_sse;
+            else if ((has_vectors & MM_SSE) &&
+                     s->fft_calc == ff_fft_calc_c)
+                /* SSE for P3 */
+                s->fft_calc = ff_fft_calc_sse;
+#endif
+#else /* HAVE_MMX */
             s->fft_calc = ff_fft_calc_altivec;
 #endif
         }
diff --git a/src/libffmpeg/libavcodec/ffv1.c b/src/libffmpeg/libavcodec/ffv1.c
index 10ba21b4c..c987d84f6 100644
--- a/src/libffmpeg/libavcodec/ffv1.c
+++ b/src/libffmpeg/libavcodec/ffv1.c
@@ -550,12 +550,6 @@ static int encode_init(AVCodecContext *avctx)
     FFV1Context *s = avctx->priv_data;
     int i;
 
-    if(avctx->strict_std_compliance >FF_COMPLIANCE_EXPERIMENTAL){
-        av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodeable with future versions!!!\n"
-               "use vstrict=-2 / -strict -2 to use it anyway\n");
-        return -1;
-    }
-
     common_init(avctx);
 
     s->version=0;
@@ -694,7 +688,8 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     }
 }
 
-static void common_end(FFV1Context *s){
+static int common_end(AVCodecContext *avctx){
+    FFV1Context *s = avctx->priv_data;
     int i;
 
     for(i=0; i<s->plane_count; i++){
@@ -702,13 +697,6 @@ static void common_end(FFV1Context *s){
 
         av_freep(&p->state);
     }
-}
-
-static int encode_end(AVCodecContext *avctx)
-{
-    FFV1Context *s = avctx->priv_data;
-
-    common_end(s);
 
     return 0;
 }
@@ -959,11 +947,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
     p->pict_type= FF_I_TYPE; //FIXME I vs. P
     if(get_rac(c, &keystate)){
         p->key_frame= 1;
-        read_header(f);
+        if(read_header(f) < 0)
+            return -1;
         clear_state(f);
     }else{
         p->key_frame= 0;
     }
+    if(!f->plane[0].state && !f->plane[0].vlc_state)
+        return -1;
 
     p->reference= 0;
     if(avctx->get_buffer(avctx, p) < 0){
@@ -1021,7 +1012,7 @@ AVCodec ffv1_decoder = {
     sizeof(FFV1Context),
     decode_init,
     NULL,
-    NULL,
+    common_end,
     decode_frame,
     CODEC_CAP_DR1 /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
     NULL
@@ -1035,6 +1026,6 @@ AVCodec ffv1_encoder = {
     sizeof(FFV1Context),
     encode_init,
     encode_frame,
-    encode_end,
+    common_end,
 };
 #endif
diff --git a/src/libffmpeg/libavcodec/flac.c b/src/libffmpeg/libavcodec/flac.c
index 97ac53745..659112c77 100644
--- a/src/libffmpeg/libavcodec/flac.c
+++ b/src/libffmpeg/libavcodec/flac.c
@@ -36,6 +36,7 @@
 #include "avcodec.h"
 #include "bitstream.h"
 #include "golomb.h"
+#include "crc.h"
 
 #undef NDEBUG
 #include <assert.h>
@@ -84,99 +85,12 @@ static int blocksize_table[] = {
 256<<0, 256<<1, 256<<2, 256<<3, 256<<4, 256<<5, 256<<6, 256<<7
 };
 
-static const uint8_t table_crc8[256] = {
-    0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
-    0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d,
-    0x70, 0x77, 0x7e, 0x79, 0x6c, 0x6b, 0x62, 0x65,
-    0x48, 0x4f, 0x46, 0x41, 0x54, 0x53, 0x5a, 0x5d,
-    0xe0, 0xe7, 0xee, 0xe9, 0xfc, 0xfb, 0xf2, 0xf5,
-    0xd8, 0xdf, 0xd6, 0xd1, 0xc4, 0xc3, 0xca, 0xcd,
-    0x90, 0x97, 0x9e, 0x99, 0x8c, 0x8b, 0x82, 0x85,
-    0xa8, 0xaf, 0xa6, 0xa1, 0xb4, 0xb3, 0xba, 0xbd,
-    0xc7, 0xc0, 0xc9, 0xce, 0xdb, 0xdc, 0xd5, 0xd2,
-    0xff, 0xf8, 0xf1, 0xf6, 0xe3, 0xe4, 0xed, 0xea,
-    0xb7, 0xb0, 0xb9, 0xbe, 0xab, 0xac, 0xa5, 0xa2,
-    0x8f, 0x88, 0x81, 0x86, 0x93, 0x94, 0x9d, 0x9a,
-    0x27, 0x20, 0x29, 0x2e, 0x3b, 0x3c, 0x35, 0x32,
-    0x1f, 0x18, 0x11, 0x16, 0x03, 0x04, 0x0d, 0x0a,
-    0x57, 0x50, 0x59, 0x5e, 0x4b, 0x4c, 0x45, 0x42,
-    0x6f, 0x68, 0x61, 0x66, 0x73, 0x74, 0x7d, 0x7a,
-    0x89, 0x8e, 0x87, 0x80, 0x95, 0x92, 0x9b, 0x9c,
-    0xb1, 0xb6, 0xbf, 0xb8, 0xad, 0xaa, 0xa3, 0xa4,
-    0xf9, 0xfe, 0xf7, 0xf0, 0xe5, 0xe2, 0xeb, 0xec,
-    0xc1, 0xc6, 0xcf, 0xc8, 0xdd, 0xda, 0xd3, 0xd4,
-    0x69, 0x6e, 0x67, 0x60, 0x75, 0x72, 0x7b, 0x7c,
-    0x51, 0x56, 0x5f, 0x58, 0x4d, 0x4a, 0x43, 0x44,
-    0x19, 0x1e, 0x17, 0x10, 0x05, 0x02, 0x0b, 0x0c,
-    0x21, 0x26, 0x2f, 0x28, 0x3d, 0x3a, 0x33, 0x34,
-    0x4e, 0x49, 0x40, 0x47, 0x52, 0x55, 0x5c, 0x5b,
-    0x76, 0x71, 0x78, 0x7f, 0x6a, 0x6d, 0x64, 0x63,
-    0x3e, 0x39, 0x30, 0x37, 0x22, 0x25, 0x2c, 0x2b,
-    0x06, 0x01, 0x08, 0x0f, 0x1a, 0x1d, 0x14, 0x13,
-    0xae, 0xa9, 0xa0, 0xa7, 0xb2, 0xb5, 0xbc, 0xbb,
-    0x96, 0x91, 0x98, 0x9f, 0x8a, 0x8d, 0x84, 0x83,
-    0xde, 0xd9, 0xd0, 0xd7, 0xc2, 0xc5, 0xcc, 0xcb,
-    0xe6, 0xe1, 0xe8, 0xef, 0xfa, 0xfd, 0xf4, 0xf3
-};
-
-static int64_t get_utf8(GetBitContext *gb)
-{
-    uint64_t val;
-    int ones=0, bytes;
-
-    while(get_bits1(gb))
-        ones++;
-
-    if     (ones==0) bytes=0;
-    else if(ones==1) return -1;
-    else             bytes= ones - 1;
-
-    val= get_bits(gb, 7-ones);
-    while(bytes--){
-        const int tmp = get_bits(gb, 8);
-
-        if((tmp>>6) != 2)
-            return -1;
-        val<<=6;
-        val|= tmp&0x3F;
-    }
+static int64_t get_utf8(GetBitContext *gb){
+    int64_t val;
+    GET_UTF8(val, get_bits(gb, 8), return -1;)
     return val;
 }
 
-#if 0
-static int skip_utf8(GetBitContext *gb)
-{
-    int ones=0, bytes;
-
-    while(get_bits1(gb))
-        ones++;
-
-    if     (ones==0) bytes=0;
-    else if(ones==1) return -1;
-    else             bytes= ones - 1;
-
-    skip_bits(gb, 7-ones);
-    while(bytes--){
-        const int tmp = get_bits(gb, 8);
-
-        if((tmp>>6) != 2)
-            return -1;
-    }
-    return 0;
-}
-#endif
-
-static int get_crc8(const uint8_t *buf, int count){
-    int crc=0;
-    int i;
-
-    for(i=0; i<count; i++){
-        crc = table_crc8[crc ^ buf[i]];
-    }
-
-    return crc;
-}
-
 static void metadata_streaminfo(FLACContext *s);
 static void dump_headers(FLACContext *s);
 
@@ -341,7 +255,7 @@ static int decode_subframe_fixed(FLACContext *s, int channel, int pred_order)
 
 static int decode_subframe_lpc(FLACContext *s, int channel, int pred_order)
 {
-    int sum, i, j;
+    int i, j;
     int coeff_prec, qlevel;
     int coeffs[pred_order];
 
@@ -379,12 +293,24 @@ static int decode_subframe_lpc(FLACContext *s, int channel, int pred_order)
     if (decode_residuals(s, channel, pred_order) < 0)
         return -1;
 
-    for (i = pred_order; i < s->blocksize; i++)
-    {
-        sum = 0;
-        for (j = 0; j < pred_order; j++)
-            sum += coeffs[j] * s->decoded[channel][i-j-1];
-        s->decoded[channel][i] += sum >> qlevel;
+    if (s->bps > 16) {
+        int64_t sum;
+        for (i = pred_order; i < s->blocksize; i++)
+        {
+            sum = 0;
+            for (j = 0; j < pred_order; j++)
+                sum += (int64_t)coeffs[j] * s->decoded[channel][i-j-1];
+            s->decoded[channel][i] += sum >> qlevel;
+        }
+    } else {
+        int sum;
+        for (i = pred_order; i < s->blocksize; i++)
+        {
+            sum = 0;
+            for (j = 0; j < pred_order; j++)
+                sum += coeffs[j] * s->decoded[channel][i-j-1];
+            s->decoded[channel][i] += sum >> qlevel;
+        }
     }
 
     return 0;
@@ -554,7 +480,7 @@ static int decode_frame(FLACContext *s)
     }
 
     skip_bits(&s->gb, 8);
-    crc8= get_crc8(s->gb.buffer, get_bits_count(&s->gb)/8);
+    crc8= av_crc(av_crc07, 0, s->gb.buffer, get_bits_count(&s->gb)/8);
     if(crc8){
         av_log(s->avctx, AV_LOG_ERROR, "header crc mismatch crc=%2X\n", crc8);
         return -1;
@@ -583,6 +509,17 @@ static int decode_frame(FLACContext *s)
     return 0;
 }
 
+static inline int16_t shift_to_16_bits(int32_t data, int bps)
+{
+    if (bps == 24) {
+        return (data >> 8);
+    } else if (bps == 20) {
+        return (data >> 4);
+    } else {
+        return data;
+    }
+}
+
 static int flac_decode_frame(AVCodecContext *avctx,
                             void *data, int *data_size,
                             uint8_t *buf, int buf_size)
@@ -719,53 +656,32 @@ static int flac_decode_frame(AVCodecContext *avctx,
     }
     }
 #else
+#define DECORRELATE(left, right)\
+            assert(s->channels == 2);\
+            for (i = 0; i < s->blocksize; i++)\
+            {\
+                int a= s->decoded[0][i];\
+                int b= s->decoded[1][i];\
+                *(samples++) = (left ) >> (16 - s->bps);\
+                *(samples++) = (right) >> (16 - s->bps);\
+            }\
+            break;
+
     switch(s->decorrelation)
     {
         case INDEPENDENT:
             for (j = 0; j < s->blocksize; j++)
             {
                 for (i = 0; i < s->channels; i++)
-                    *(samples++) = s->decoded[i][j];
+                    *(samples++) = shift_to_16_bits(s->decoded[i][j], s->bps);
             }
             break;
         case LEFT_SIDE:
-            assert(s->channels == 2);
-            for (i = 0; i < s->blocksize; i++)
-            {
-                *(samples++) = s->decoded[0][i];
-                *(samples++) = s->decoded[0][i] - s->decoded[1][i];
-            }
-            break;
+            DECORRELATE(a,a-b)
         case RIGHT_SIDE:
-            assert(s->channels == 2);
-            for (i = 0; i < s->blocksize; i++)
-            {
-                *(samples++) = s->decoded[0][i] + s->decoded[1][i];
-                *(samples++) = s->decoded[1][i];
-            }
-            break;
+            DECORRELATE(a+b,b)
         case MID_SIDE:
-            assert(s->channels == 2);
-            for (i = 0; i < s->blocksize; i++)
-            {
-                int mid, side;
-                mid = s->decoded[0][i];
-                side = s->decoded[1][i];
-
-#if 1 //needs to be checked but IMHO it should be binary identical
-                mid -= side>>1;
-                *(samples++) = mid + side;
-                *(samples++) = mid;
-#else
-
-                mid <<= 1;
-                if (side & 1)
-                    mid++;
-                *(samples++) = (mid + side) >> 1;
-                *(samples++) = (mid - side) >> 1;
-#endif
-            }
-            break;
+            DECORRELATE( (a-=b>>1) + b, a)
     }
 #endif
 
diff --git a/src/libffmpeg/libavcodec/golomb.h b/src/libffmpeg/libavcodec/golomb.h
index ef74f15c6..a8221ec29 100644
--- a/src/libffmpeg/libavcodec/golomb.h
+++ b/src/libffmpeg/libavcodec/golomb.h
@@ -435,6 +435,10 @@ static inline void set_ur_golomb_jpegls(PutBitContext *pb, int i, int k, int lim
 
     e= (i>>k) + 1;
     if(e<limit){
+        while(e > 31) {
+            put_bits(pb, 31, 0);
+            e -= 31;
+        }
         put_bits(pb, e, 1);
         if(k)
             put_bits(pb, k, i&((1<<k)-1));
diff --git a/src/libffmpeg/libavcodec/h261.c b/src/libffmpeg/libavcodec/h261.c
index c6218c8b9..e56978e61 100644
--- a/src/libffmpeg/libavcodec/h261.c
+++ b/src/libffmpeg/libavcodec/h261.c
@@ -264,6 +264,7 @@ void ff_h261_encode_mb(MpegEncContext * s,
     h->previous_mba = h->current_mba;
 
     if(HAS_CBP(h->mtype)){
+        assert(cbp>0);
         put_bits(&s->pb,h261_cbp_tab[cbp-1][1],h261_cbp_tab[cbp-1][0]);
     }
     for(i=0; i<6; i++) {
@@ -846,6 +847,7 @@ static int h261_decode_gob(H261Context *h){
     return -1;
 }
 
+#ifdef CONFIG_H261_PARSER
 static int h261_find_frame_end(ParseContext *pc, AVCodecContext* avctx, const uint8_t *buf, int buf_size){
     int vop_found, i, j;
     uint32_t state;
@@ -899,6 +901,7 @@ static int h261_parse(AVCodecParserContext *s,
     *poutbuf_size = buf_size;
     return next;
 }
+#endif
 
 /**
  * returns the number of bytes consumed for building the current frame
@@ -921,8 +924,8 @@ static int h261_decode_frame(AVCodecContext *avctx,
     AVFrame *pict = data;
 
 #ifdef DEBUG
-    printf("*****frame %d size=%d\n", avctx->frame_number, buf_size);
-    printf("bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]);
+    av_log(avctx, AV_LOG_DEBUG, "*****frame %d size=%d\n", avctx->frame_number, buf_size);
+    av_log(avctx, AV_LOG_DEBUG, "bytes=%x %x %x %x\n", buf[0], buf[1], buf[2], buf[3]);
 #endif
     s->flags= avctx->flags;
     s->flags2= avctx->flags2;
@@ -1038,6 +1041,7 @@ AVCodec h261_decoder = {
     CODEC_CAP_DR1,
 };
 
+#ifdef CONFIG_H261_PARSER
 AVCodecParser h261_parser = {
     { CODEC_ID_H261 },
     sizeof(ParseContext),
@@ -1045,3 +1049,4 @@ AVCodecParser h261_parser = {
     h261_parse,
     ff_parse_close,
 };
+#endif
diff --git a/src/libffmpeg/libavcodec/h263.c b/src/libffmpeg/libavcodec/h263.c
index f7369c18d..f88114f70 100644
--- a/src/libffmpeg/libavcodec/h263.c
+++ b/src/libffmpeg/libavcodec/h263.c
@@ -552,6 +552,49 @@ void ff_clean_mpeg4_qscales(MpegEncContext *s){
 }
 
 #endif //CONFIG_ENCODERS
+
+#define tab_size ((signed)(sizeof(s->direct_scale_mv[0])/sizeof(int16_t)))
+#define tab_bias (tab_size/2)
+
+static void ff_mpeg4_init_direct_mv(MpegEncContext *s){
+    int i;
+    for(i=0; i<tab_size; i++){
+        s->direct_scale_mv[0][i] = (i-tab_bias)*s->pb_time/s->pp_time;
+        s->direct_scale_mv[1][i] = (i-tab_bias)*(s->pb_time-s->pp_time)/s->pp_time;
+    }
+}
+
+static inline void ff_mpeg4_set_one_direct_mv(MpegEncContext *s, int mx, int my, int i){
+    int xy= s->block_index[i];
+    uint16_t time_pp= s->pp_time;
+    uint16_t time_pb= s->pb_time;
+    int p_mx, p_my;
+
+    p_mx= s->next_picture.motion_val[0][xy][0];
+    if((unsigned)(p_mx + tab_bias) < tab_size){
+        s->mv[0][i][0] = s->direct_scale_mv[0][p_mx + tab_bias] + mx;
+        s->mv[1][i][0] = mx ? s->mv[0][i][0] - p_mx
+                            : s->direct_scale_mv[1][p_mx + tab_bias];
+    }else{
+        s->mv[0][i][0] = p_mx*time_pb/time_pp + mx;
+        s->mv[1][i][0] = mx ? s->mv[0][i][0] - p_mx
+                            : p_mx*(time_pb - time_pp)/time_pp;
+    }
+    p_my= s->next_picture.motion_val[0][xy][1];
+    if((unsigned)(p_my + tab_bias) < tab_size){
+        s->mv[0][i][1] = s->direct_scale_mv[0][p_my + tab_bias] + my;
+        s->mv[1][i][1] = my ? s->mv[0][i][1] - p_my
+                            : s->direct_scale_mv[1][p_my + tab_bias];
+    }else{
+        s->mv[0][i][1] = p_my*time_pb/time_pp + my;
+        s->mv[1][i][1] = my ? s->mv[0][i][1] - p_my
+                            : p_my*(time_pb - time_pp)/time_pp;
+    }
+}
+
+#undef tab_size
+#undef tab_bias
+
 /**
  *
  * @return the mb_type
@@ -559,29 +602,25 @@ void ff_clean_mpeg4_qscales(MpegEncContext *s){
 int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){
     const int mb_index= s->mb_x + s->mb_y*s->mb_stride;
     const int colocated_mb_type= s->next_picture.mb_type[mb_index];
-    int xy= s->block_index[0];
     uint16_t time_pp= s->pp_time;
     uint16_t time_pb= s->pb_time;
     int i;
 
     //FIXME avoid divides
+    // try special case with shifts for 1 and 3 B-frames?
 
     if(IS_8X8(colocated_mb_type)){
         s->mv_type = MV_TYPE_8X8;
         for(i=0; i<4; i++){
-            xy= s->block_index[i];
-            s->mv[0][i][0] = s->next_picture.motion_val[0][xy][0]*time_pb/time_pp + mx;
-            s->mv[0][i][1] = s->next_picture.motion_val[0][xy][1]*time_pb/time_pp + my;
-            s->mv[1][i][0] = mx ? s->mv[0][i][0] - s->next_picture.motion_val[0][xy][0]
-                                : s->next_picture.motion_val[0][xy][0]*(time_pb - time_pp)/time_pp;
-            s->mv[1][i][1] = my ? s->mv[0][i][1] - s->next_picture.motion_val[0][xy][1]
-                                : s->next_picture.motion_val[0][xy][1]*(time_pb - time_pp)/time_pp;
+            ff_mpeg4_set_one_direct_mv(s, mx, my, i);
         }
         return MB_TYPE_DIRECT2 | MB_TYPE_8x8 | MB_TYPE_L0L1;
     } else if(IS_INTERLACED(colocated_mb_type)){
         s->mv_type = MV_TYPE_FIELD;
         for(i=0; i<2; i++){
             int field_select= s->next_picture.ref_index[0][s->block_index[2*i]];
+            s->field_select[0][i]= field_select;
+            s->field_select[1][i]= i;
             if(s->top_field_first){
                 time_pp= s->pp_field_time - field_select + i;
                 time_pb= s->pb_field_time - field_select + i;
@@ -598,12 +637,11 @@ int ff_mpeg4_set_direct_mv(MpegEncContext *s, int mx, int my){
         }
         return MB_TYPE_DIRECT2 | MB_TYPE_16x8 | MB_TYPE_L0L1 | MB_TYPE_INTERLACED;
     }else{
-        s->mv[0][0][0] = s->mv[0][1][0] = s->mv[0][2][0] = s->mv[0][3][0] = s->next_picture.motion_val[0][xy][0]*time_pb/time_pp + mx;
-        s->mv[0][0][1] = s->mv[0][1][1] = s->mv[0][2][1] = s->mv[0][3][1] = s->next_picture.motion_val[0][xy][1]*time_pb/time_pp + my;
-        s->mv[1][0][0] = s->mv[1][1][0] = s->mv[1][2][0] = s->mv[1][3][0] = mx ? s->mv[0][0][0] - s->next_picture.motion_val[0][xy][0]
-                            : s->next_picture.motion_val[0][xy][0]*(time_pb - time_pp)/time_pp;
-        s->mv[1][0][1] = s->mv[1][1][1] = s->mv[1][2][1] = s->mv[1][3][1] = my ? s->mv[0][0][1] - s->next_picture.motion_val[0][xy][1]
-                            : s->next_picture.motion_val[0][xy][1]*(time_pb - time_pp)/time_pp;
+        ff_mpeg4_set_one_direct_mv(s, mx, my, 0);
+        s->mv[0][1][0] = s->mv[0][2][0] = s->mv[0][3][0] = s->mv[0][0][0];
+        s->mv[0][1][1] = s->mv[0][2][1] = s->mv[0][3][1] = s->mv[0][0][1];
+        s->mv[1][1][0] = s->mv[1][2][0] = s->mv[1][3][0] = s->mv[1][0][0];
+        s->mv[1][1][1] = s->mv[1][2][1] = s->mv[1][3][1] = s->mv[1][0][1];
         if((s->avctx->workaround_bugs & FF_BUG_DIRECT_BLOCKSIZE) || !s->quarter_sample)
             s->mv_type= MV_TYPE_16X16;
         else
@@ -1480,7 +1518,7 @@ void ff_h263_loop_filter(MpegEncContext * s){
 static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr)
 {
     int x, y, wrap, a, c, pred_dc, scale;
-    int16_t *dc_val, *ac_val;
+    int16_t *dc_val;
 
     /* find prediction */
     if (n < 4) {
@@ -1488,14 +1526,12 @@ static int h263_pred_dc(MpegEncContext * s, int n, uint16_t **dc_val_ptr)
         y = 2 * s->mb_y + ((n & 2) >> 1);
         wrap = s->b8_stride;
         dc_val = s->dc_val[0];
-        ac_val = s->ac_val[0][0];
         scale = s->y_dc_scale;
     } else {
         x = s->mb_x;
         y = s->mb_y;
         wrap = s->mb_stride;
         dc_val = s->dc_val[n - 4 + 1];
-        ac_val = s->ac_val[n - 4 + 1][0];
         scale = s->c_dc_scale;
     }
     /* B C
@@ -2219,6 +2255,7 @@ void ff_set_mpeg4_time(MpegEncContext * s, int picture_number){
     if(s->pict_type==B_TYPE){
         s->pb_time= s->pp_time - (s->last_non_b_time - s->time);
         assert(s->pb_time > 0 && s->pb_time < s->pp_time);
+        ff_mpeg4_init_direct_mv(s);
     }else{
         s->last_time_base= s->time_base;
         s->time_base= time_div;
@@ -3711,6 +3748,8 @@ static int mpeg4_decode_partitioned_mb(MpegEncContext *s, DCTELEM block[6][64])
     mb_type= s->current_picture.mb_type[xy];
     cbp = s->cbp_table[xy];
 
+    s->use_intra_dc_vlc= s->qscale < s->intra_dc_threshold;
+
     if(s->current_picture.qscale_table[xy] != s->qscale){
         ff_set_qscale(s, s->current_picture.qscale_table[xy] );
     }
@@ -4447,6 +4486,9 @@ intra:
             return -1;
         }
         cbp = (cbpc & 3) | (cbpy << 2);
+
+        s->use_intra_dc_vlc= s->qscale < s->intra_dc_threshold;
+
         if (dquant) {
             ff_set_qscale(s, s->qscale + quant_tab[get_bits(&s->gb, 2)]);
         }
@@ -4474,6 +4516,12 @@ end:
 
         /* per-MB end of slice check */
     if(s->codec_id==CODEC_ID_MPEG4){
+#if 0 //http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_14496-4_2004_Conformance_Testing/video_conformance/version_1/simple/ERROR.ZIP/mit025.m4v needs this but its unclear if the mpeg4 standard allows this at all (MN)
+        if(s->pict_type != B_TYPE){
+            while(show_bits(&s->gb, 9 + (s->pict_type == P_TYPE)) == 1)
+                skip_bits(&s->gb, 9 + (s->pict_type == P_TYPE));
+        }
+#endif
         if(mpeg4_is_resync(s)){
             const int delta= s->mb_x + 1 == s->mb_width ? 2 : 1;
             if(s->pict_type==B_TYPE && s->next_picture.mbskip_table[xy + delta])
@@ -4742,7 +4790,7 @@ static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
     //Note intra & rvlc should be optimized away if this is inlined
 
     if(intra) {
-      if(s->qscale < s->intra_dc_threshold){
+      if(s->use_intra_dc_vlc){
         /* DC coef */
         if(s->partitioned_frame){
             level = s->dc_val[0][ s->block_index[n] ];
@@ -4758,6 +4806,7 @@ static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         i = 0;
       }else{
             i = -1;
+            ff_mpeg4_pred_dc(s, n, 0, &dc_pred_dir, 0);
       }
         if (!coded)
             goto not_coded;
@@ -4965,10 +5014,10 @@ static inline int mpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
   }
  not_coded:
     if (intra) {
-        if(s->qscale >= s->intra_dc_threshold){
+        if(!s->use_intra_dc_vlc){
             block[0] = ff_mpeg4_pred_dc(s, n, block[0], &dc_pred_dir, 0);
 
-            if(i == -1) i=0;
+            i -= i>>31; //if(i == -1) i=0;
         }
 
         mpeg4_pred_ac(s, block, n, dc_pred_dir);
@@ -5575,6 +5624,7 @@ static int decode_vol_header(MpegEncContext *s, GetBitContext *gb){
 
         s->progressive_sequence=
         s->progressive_frame= get_bits1(gb)^1;
+        s->interlaced_dct=0;
         if(!get_bits1(gb) && (s->avctx->debug & FF_DEBUG_PICT_INFO))
             av_log(s->avctx, AV_LOG_INFO, "MPEG4 OBMC not supported (very likely buggy encoder)\n");   /* OBMC Disable */
         if (vo_ver_id == 1) {
@@ -5849,6 +5899,7 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
 //            printf("messed up order, maybe after seeking? skipping current b frame\n");
             return FRAME_SKIPPED;
         }
+        ff_mpeg4_init_direct_mv(s);
 
         if(s->t_frame==0) s->t_frame= s->pb_time;
         if(s->t_frame==0) s->t_frame=1; // 1/0 protection
diff --git a/src/libffmpeg/libavcodec/h263data.h b/src/libffmpeg/libavcodec/h263data.h
index 2968531a5..01bcaedb4 100644
--- a/src/libffmpeg/libavcodec/h263data.h
+++ b/src/libffmpeg/libavcodec/h263data.h
@@ -274,8 +274,8 @@ const uint16_t ff_mba_max[6]={
      47,  98, 395,1583,6335,9215
 };
 
-const uint8_t ff_mba_length[6]={
-      6,   7,   9,  11,  13,  14
+const uint8_t ff_mba_length[7]={
+      6,   7,   9,  11,  13,  14,  14
 };
 
 const uint8_t ff_h263_loop_filter_strength[32]={
diff --git a/src/libffmpeg/libavcodec/h263dec.c b/src/libffmpeg/libavcodec/h263dec.c
index 87c9e4991..73b050325 100644
--- a/src/libffmpeg/libavcodec/h263dec.c
+++ b/src/libffmpeg/libavcodec/h263dec.c
@@ -85,6 +85,7 @@ int ff_h263_decode_init(AVCodecContext *avctx)
         s->h263_pred = 1;
         s->msmpeg4_version=5;
         break;
+    case CODEC_ID_VC1:
     case CODEC_ID_WMV3:
         s->h263_msmpeg4 = 1;
         s->h263_pred = 1;
@@ -390,6 +391,7 @@ static int h263_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_siz
     return END_NOT_FOUND;
 }
 
+#ifdef CONFIG_H263_PARSER
 static int h263_parse(AVCodecParserContext *s,
                            AVCodecContext *avctx,
                            uint8_t **poutbuf, int *poutbuf_size,
@@ -410,6 +412,7 @@ static int h263_parse(AVCodecParserContext *s,
     *poutbuf_size = buf_size;
     return next;
 }
+#endif
 
 int ff_h263_decode_frame(AVCodecContext *avctx,
                              void *data, int *data_size,
@@ -514,7 +517,8 @@ retry:
 
     if(s->xvid_build==0 && s->divx_version==0 && s->lavc_build==0){
         if(s->avctx->stream_codec_tag == ff_get_fourcc("XVID") ||
-           s->avctx->codec_tag == ff_get_fourcc("XVID") || s->avctx->codec_tag == ff_get_fourcc("XVIX"))
+           s->avctx->codec_tag == ff_get_fourcc("XVID") || s->avctx->codec_tag == ff_get_fourcc("XVIX") ||
+           s->avctx->codec_tag == ff_get_fourcc("RMP4"))
             s->xvid_build= -1;
 #if 0
         if(s->avctx->codec_tag == ff_get_fourcc("DIVX") && s->vo_type==0 && s->vol_control_parameters==1
@@ -693,7 +697,7 @@ retry:
         return -1;
 
 #ifdef DEBUG
-    printf("qscale=%d\n", s->qscale);
+    av_log(avctx, AV_LOG_DEBUG, "qscale=%d\n", s->qscale);
 #endif
 
     ff_er_frame_start(s);
@@ -765,24 +769,23 @@ retry:
 
 assert(s->current_picture.pict_type == s->current_picture_ptr->pict_type);
 assert(s->current_picture.pict_type == s->pict_type);
-    if(s->pict_type==B_TYPE || s->low_delay){
-        *pict= *(AVFrame*)&s->current_picture;
+    if (s->pict_type == B_TYPE || s->low_delay) {
+        *pict= *(AVFrame*)s->current_picture_ptr;
+    } else if (s->last_picture_ptr != NULL) {
+        *pict= *(AVFrame*)s->last_picture_ptr;
+    }
+
+    if(s->last_picture_ptr || s->low_delay){
+        *data_size = sizeof(AVFrame);
         ff_print_debug_info(s, pict);
-    } else {
-        *pict= *(AVFrame*)&s->last_picture;
-        if(pict)
-            ff_print_debug_info(s, pict);
     }
 
     /* Return the Picture timestamp as the frame number */
     /* we substract 1 because it is added on utils.c    */
     avctx->frame_number = s->picture_number - 1;
 
-    /* don't output the last pic after seeking */
-    if(s->last_picture_ptr || s->low_delay)
-        *data_size = sizeof(AVFrame);
 #ifdef PRINT_FRAME_TIME
-printf("%Ld\n", rdtsc()-time);
+av_log(avctx, AV_LOG_DEBUG, "%Ld\n", rdtsc()-time);
 #endif
 
     return get_consumed_bytes(s, buf_size);
@@ -886,6 +889,7 @@ AVCodec flv_decoder = {
     CODEC_CAP_DRAW_HORIZ_BAND | CODEC_CAP_DR1
 };
 
+#ifdef CONFIG_H263_PARSER
 AVCodecParser h263_parser = {
     { CODEC_ID_H263 },
     sizeof(ParseContext),
@@ -893,3 +897,4 @@ AVCodecParser h263_parser = {
     h263_parse,
     ff_parse_close,
 };
+#endif
diff --git a/src/libffmpeg/libavcodec/h264.c b/src/libffmpeg/libavcodec/h264.c
index e80a3992c..1a7fb76b4 100644
--- a/src/libffmpeg/libavcodec/h264.c
+++ b/src/libffmpeg/libavcodec/h264.c
@@ -33,7 +33,7 @@
 
 #include "cabac.h"
 
-#undef NDEBUG
+//#undef NDEBUG
 #include <assert.h>
 
 #define interlaced_dct interlaced_dct_is_a_bad_name
@@ -54,6 +54,22 @@
 
 #define MAX_MMCO_COUNT 66
 
+/* Compiling in interlaced support reduces the speed
+ * of progressive decoding by about 2%. */
+#define ALLOW_INTERLACE
+
+#ifdef ALLOW_INTERLACE
+#define MB_MBAFF h->mb_mbaff
+#define MB_FIELD h->mb_field_decoding_flag
+#define FRAME_MBAFF h->mb_aff_frame
+#else
+#define MB_MBAFF 0
+#define MB_FIELD 0
+#define FRAME_MBAFF 0
+#undef  IS_INTERLACED
+#define IS_INTERLACED(mb_type) 0
+#endif
+
 /**
  * Sequence parameter set
  */
@@ -173,7 +189,8 @@ typedef struct H264Context{
 
     int chroma_qp; //QPc
 
-    int prev_mb_skipped; //FIXME remove (IMHO not used)
+    int prev_mb_skipped;
+    int next_mb_skipped;
 
     //prediction stuff
     int chroma_pred_mode;
@@ -231,6 +248,12 @@ typedef struct H264Context{
     int b_stride; //FIXME use s->b4_stride
     int b8_stride;
 
+    int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
+    int mb_uvlinesize;
+
+    int emu_edge_width;
+    int emu_edge_height;
+
     int halfpel_flag;
     int thirdpel_flag;
 
@@ -254,13 +277,14 @@ typedef struct H264Context{
 
     int slice_num;
     uint8_t *slice_table_base;
-    uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
+    uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
     int slice_type;
     int slice_type_fixed;
 
     //interlacing specific flags
     int mb_aff_frame;
     int mb_field_decoding_flag;
+    int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
 
     int sub_mb_type[4];
 
@@ -291,11 +315,11 @@ typedef struct H264Context{
     int use_weight_chroma;
     int luma_log2_weight_denom;
     int chroma_log2_weight_denom;
-    int luma_weight[2][16];
-    int luma_offset[2][16];
-    int chroma_weight[2][16][2];
-    int chroma_offset[2][16][2];
-    int implicit_weight[16][16];
+    int luma_weight[2][48];
+    int luma_offset[2][48];
+    int chroma_weight[2][48][2];
+    int chroma_offset[2][48][2];
+    int implicit_weight[48][48];
 
     //deblock
     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
@@ -306,17 +330,18 @@ typedef struct H264Context{
 
     int direct_spatial_mv_pred;
     int dist_scale_factor[16];
+    int dist_scale_factor_field[32];
     int map_col_to_list0[2][16];
+    int map_col_to_list0_field[2][32];
 
     /**
      * num_ref_idx_l0/1_active_minus1 + 1
      */
-    int ref_count[2];// FIXME split for AFF
+    int ref_count[2];            ///< counts frames or fields, depending on current mb mode
     Picture *short_ref[32];
     Picture *long_ref[32];
     Picture default_ref_list[2][32];
-    Picture ref_list[2][32]; //FIXME size?
-    Picture field_ref_list[2][32]; //FIXME size?
+    Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
     Picture *delayed_pic[16]; //FIXME size?
     Picture *delayed_output_pic;
 
@@ -357,9 +382,17 @@ typedef struct H264Context{
     uint8_t     direct_cache[5*8];
 
     uint8_t zigzag_scan[16];
+    uint8_t zigzag_scan8x8[64];
+    uint8_t zigzag_scan8x8_cavlc[64];
     uint8_t field_scan[16];
+    uint8_t field_scan8x8[64];
+    uint8_t field_scan8x8_cavlc[64];
     const uint8_t *zigzag_scan_q0;
+    const uint8_t *zigzag_scan8x8_q0;
+    const uint8_t *zigzag_scan8x8_cavlc_q0;
     const uint8_t *field_scan_q0;
+    const uint8_t *field_scan8x8_q0;
+    const uint8_t *field_scan8x8_cavlc_q0;
 
     int x264_build;
 }H264Context;
@@ -394,60 +427,83 @@ static always_inline uint32_t pack16to32(int a, int b){
 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
     uint8_t *p= (uint8_t*)vp;
     assert(size==1 || size==4);
+    assert(w<=4);
 
     w      *= size;
     stride *= size;
 
     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
     assert((stride&(w-1))==0);
-//FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it
-    if(w==2 && h==2){
-        *(uint16_t*)(p + 0)=
-        *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
-    }else if(w==2 && h==4){
-        *(uint16_t*)(p + 0*stride)=
-        *(uint16_t*)(p + 1*stride)=
+    if(w==2){
+        const uint16_t v= size==4 ? val : val*0x0101;
+        *(uint16_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint16_t*)(p + 1*stride)= v;
+        if(h==2) return;
         *(uint16_t*)(p + 2*stride)=
-        *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
-    }else if(w==4 && h==1){
-        *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
-    }else if(w==4 && h==2){
-        *(uint32_t*)(p + 0*stride)=
-        *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
-    }else if(w==4 && h==4){
-        *(uint32_t*)(p + 0*stride)=
-        *(uint32_t*)(p + 1*stride)=
+        *(uint16_t*)(p + 3*stride)= v;
+    }else if(w==4){
+        const uint32_t v= size==4 ? val : val*0x01010101;
+        *(uint32_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint32_t*)(p + 1*stride)= v;
+        if(h==2) return;
         *(uint32_t*)(p + 2*stride)=
-        *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
-    }else if(w==8 && h==1){
-        *(uint32_t*)(p + 0)=
-        *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
-    }else if(w==8 && h==2){
-        *(uint32_t*)(p + 0 + 0*stride)=
-        *(uint32_t*)(p + 4 + 0*stride)=
-        *(uint32_t*)(p + 0 + 1*stride)=
-        *(uint32_t*)(p + 4 + 1*stride)=  size==4 ? val : val*0x01010101;
-    }else if(w==8 && h==4){
-        *(uint64_t*)(p + 0*stride)=
-        *(uint64_t*)(p + 1*stride)=
+        *(uint32_t*)(p + 3*stride)= v;
+    }else if(w==8){
+    //gcc can't optimize 64bit math on x86_32
+#if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
+        const uint64_t v= val*0x0100000001ULL;
+        *(uint64_t*)(p + 0*stride)= v;
+        if(h==1) return;
+        *(uint64_t*)(p + 1*stride)= v;
+        if(h==2) return;
         *(uint64_t*)(p + 2*stride)=
-        *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
-    }else if(w==16 && h==2){
-        *(uint64_t*)(p + 0+0*stride)=
-        *(uint64_t*)(p + 8+0*stride)=
-        *(uint64_t*)(p + 0+1*stride)=
-        *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
-    }else if(w==16 && h==4){
+        *(uint64_t*)(p + 3*stride)= v;
+    }else if(w==16){
+        const uint64_t v= val*0x0100000001ULL;
         *(uint64_t*)(p + 0+0*stride)=
         *(uint64_t*)(p + 8+0*stride)=
         *(uint64_t*)(p + 0+1*stride)=
-        *(uint64_t*)(p + 8+1*stride)=
+        *(uint64_t*)(p + 8+1*stride)= v;
+        if(h==2) return;
         *(uint64_t*)(p + 0+2*stride)=
         *(uint64_t*)(p + 8+2*stride)=
         *(uint64_t*)(p + 0+3*stride)=
-        *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
+        *(uint64_t*)(p + 8+3*stride)= v;
+#else
+        *(uint32_t*)(p + 0+0*stride)=
+        *(uint32_t*)(p + 4+0*stride)= val;
+        if(h==1) return;
+        *(uint32_t*)(p + 0+1*stride)=
+        *(uint32_t*)(p + 4+1*stride)= val;
+        if(h==2) return;
+        *(uint32_t*)(p + 0+2*stride)=
+        *(uint32_t*)(p + 4+2*stride)=
+        *(uint32_t*)(p + 0+3*stride)=
+        *(uint32_t*)(p + 4+3*stride)= val;
+    }else if(w==16){
+        *(uint32_t*)(p + 0+0*stride)=
+        *(uint32_t*)(p + 4+0*stride)=
+        *(uint32_t*)(p + 8+0*stride)=
+        *(uint32_t*)(p +12+0*stride)=
+        *(uint32_t*)(p + 0+1*stride)=
+        *(uint32_t*)(p + 4+1*stride)=
+        *(uint32_t*)(p + 8+1*stride)=
+        *(uint32_t*)(p +12+1*stride)= val;
+        if(h==2) return;
+        *(uint32_t*)(p + 0+2*stride)=
+        *(uint32_t*)(p + 4+2*stride)=
+        *(uint32_t*)(p + 8+2*stride)=
+        *(uint32_t*)(p +12+2*stride)=
+        *(uint32_t*)(p + 0+3*stride)=
+        *(uint32_t*)(p + 4+3*stride)=
+        *(uint32_t*)(p + 8+3*stride)=
+        *(uint32_t*)(p +12+3*stride)= val;
+#endif
     }else
         assert(0);
+    assert(h==4);
 }
 
 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
@@ -458,10 +514,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     int left_block[8];
     int i;
 
-    //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
-    // the actual condition is whether we're on the edge of a slice,
-    // and even then the intra and nnz parts are unnecessary.
-    if(for_deblock && h->slice_num == 1)
+    //FIXME deblocking could skip the intra and nnz parts.
+    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
         return;
 
     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
@@ -478,7 +532,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     left_block[5]= 10;
     left_block[6]= 8;
     left_block[7]= 11;
-    if(h->mb_aff_frame){
+    if(FRAME_MBAFF){
         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
         const int top_pair_xy      = pair_xy     - s->mb_stride;
         const int topleft_pair_xy  = top_pair_xy - 1;
@@ -548,11 +602,39 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     h->left_mb_xy[0] = left_xy[0];
     h->left_mb_xy[1] = left_xy[1];
     if(for_deblock){
-        topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
+        topleft_type = 0;
+        topright_type = 0;
         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
-        topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
+
+        if(FRAME_MBAFF && !IS_INTRA(mb_type)){
+            int list;
+            int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
+            for(i=0; i<16; i++)
+                h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
+            for(list=0; list<1+(h->slice_type==B_TYPE); list++){
+                if(USES_LIST(mb_type,list)){
+                    uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
+                    uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
+                    uint8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+                    for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
+                        dst[0] = src[0];
+                        dst[1] = src[1];
+                        dst[2] = src[2];
+                        dst[3] = src[3];
+                    }
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
+                    ref += h->b8_stride;
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
+                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
+                }else{
+                    fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
+                    fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
+                }
+            }
+        }
     }else{
         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
@@ -697,7 +779,6 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     }
 
 #if 1
-    //FIXME direct mb can skip much of this
     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
         int list;
         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
@@ -711,7 +792,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             }
             h->mv_cache_clean[list]= 0;
 
-            if(IS_INTER(top_type)){
+            if(USES_LIST(top_type, list)){
                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
@@ -731,13 +812,13 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             }
 
             //FIXME unify cleanup or sth
-            if(IS_INTER(left_type[0])){
+            if(USES_LIST(left_type[0], list)){
                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
-                h->ref_cache[list][scan8[0] - 1 + 0*8]=
-                h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
+                h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
+                h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
             }else{
                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
@@ -745,13 +826,13 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
-            if(IS_INTER(left_type[1])){
+            if(USES_LIST(left_type[1], list)){
                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
-                h->ref_cache[list][scan8[0] - 1 + 2*8]=
-                h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
+                h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
+                h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
             }else{
                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
@@ -760,10 +841,10 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 assert((!left_type[0]) == (!left_type[1]));
             }
 
-            if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
+            if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
                 continue;
 
-            if(IS_INTER(topleft_type)){
+            if(USES_LIST(topleft_type, list)){
                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
@@ -773,7 +854,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
-            if(IS_INTER(topright_type)){
+            if(USES_LIST(topright_type, list)){
                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
@@ -783,6 +864,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
             }
 
+            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
+                continue;
 
             h->ref_cache[list][scan8[5 ]+1] =
             h->ref_cache[list][scan8[7 ]+1] =
@@ -797,14 +880,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 
             if( h->pps.cabac ) {
                 /* XXX beurk, Load mvd */
-                if(IS_INTER(topleft_type)){
-                    const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
-                }else{
-                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
-                }
-
-                if(IS_INTER(top_type)){
+                if(USES_LIST(top_type, list)){
                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
@@ -816,7 +892,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                 }
-                if(IS_INTER(left_type[0])){
+                if(USES_LIST(left_type[0], list)){
                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
@@ -824,7 +900,7 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                 }
-                if(IS_INTER(left_type[1])){
+                if(USES_LIST(left_type[1], list)){
                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
@@ -851,18 +927,52 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
                     }
 
-                    //FIXME interlacing
-                    if(IS_DIRECT(left_type[0])){
-                        h->direct_cache[scan8[0] - 1 + 0*8]=
+                    if(IS_DIRECT(left_type[0]))
+                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
+                    else if(IS_8X8(left_type[0]))
+                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
+                    else
+                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;
+
+                    if(IS_DIRECT(left_type[1]))
                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
-                    }else if(IS_8X8(left_type[0])){
-                        int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
-                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
-                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
-                    }else{
-                        h->direct_cache[scan8[0] - 1 + 0*8]=
+                    else if(IS_8X8(left_type[1]))
+                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
+                    else
                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
+                }
+            }
+
+            if(FRAME_MBAFF){
+#define MAP_MVS\
+                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
+                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
+                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
+                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
+                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
+                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
+                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
+                if(MB_FIELD){
+#define MAP_F2F(idx, mb_type)\
+                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
+                        h->ref_cache[list][idx] <<= 1;\
+                        h->mv_cache[list][idx][1] /= 2;\
+                        h->mvd_cache[list][idx][1] /= 2;\
+                    }
+                    MAP_MVS
+#undef MAP_F2F
+                }else{
+#define MAP_F2F(idx, mb_type)\
+                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
+                        h->ref_cache[list][idx] >>= 1;\
+                        h->mv_cache[list][idx][1] <<= 1;\
+                        h->mvd_cache[list][idx][1] <<= 1;\
                     }
+                    MAP_MVS
+#undef MAP_F2F
                 }
             }
         }
@@ -987,6 +1097,14 @@ static inline void write_back_non_zero_count(H264Context *h){
     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
+
+    if(FRAME_MBAFF){
+        // store all luma nnzs, for deblocking
+        int v = 0, i;
+        for(i=0; i<16; i++)
+            v += (!!h->non_zero_count_cache[scan8[i]]) << i;
+        *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
+    }
 }
 
 /**
@@ -1009,6 +1127,49 @@ static inline int pred_non_zero_count(H264Context *h, int n){
 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 
+    /* there is no consistent mapping of mvs to neighboring locations that will
+     * make mbaff happy, so we can't move all this logic to fill_caches */
+    if(FRAME_MBAFF){
+        MpegEncContext *s = &h->s;
+        const int *mb_types = s->current_picture_ptr->mb_type;
+        const int16_t *mv;
+        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
+        *C = h->mv_cache[list][scan8[0]-2];
+
+        if(!MB_FIELD
+           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
+            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
+            if(IS_INTERLACED(mb_types[topright_xy])){
+#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
+                const int x4 = X4, y4 = Y4;\
+                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
+                if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
+                    return LIST_NOT_USED;\
+                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
+                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
+                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
+                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
+
+                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
+            }
+        }
+        if(topright_ref == PART_NOT_AVAILABLE
+           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
+           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
+            if(!MB_FIELD
+               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
+                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
+            }
+            if(MB_FIELD
+               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
+               && i >= scan8[0]+8){
+                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
+                SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
+            }
+        }
+#undef SET_DIAG_MV
+    }
+
     if(topright_ref != PART_NOT_AVAILABLE){
         *C= h->mv_cache[list][ i - 8 + part_width ];
         return topright_ref;
@@ -1182,6 +1343,12 @@ static inline void direct_dist_scale_factor(H264Context * const h){
             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
         }
     }
+    if(FRAME_MBAFF){
+        for(i=0; i<h->ref_count[0]; i++){
+            h->dist_scale_factor_field[2*i] =
+            h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
+        }
+    }
 }
 static inline void direct_ref_list_init(H264Context * const h){
     MpegEncContext * const s = &h->s;
@@ -1202,7 +1369,7 @@ static inline void direct_ref_list_init(H264Context * const h){
     for(list=0; list<2; list++){
         for(i=0; i<ref1->ref_count[list]; i++){
             const int poc = ref1->ref_poc[list][i];
-            h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE;
+            h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
             for(j=0; j<h->ref_count[list]; j++)
                 if(h->ref_list[list][j].poc == poc){
                     h->map_col_to_list0[list][i] = j;
@@ -1210,6 +1377,15 @@ static inline void direct_ref_list_init(H264Context * const h){
                 }
         }
     }
+    if(FRAME_MBAFF){
+        for(list=0; list<2; list++){
+            for(i=0; i<ref1->ref_count[list]; i++){
+                j = h->map_col_to_list0[list][i];
+                h->map_col_to_list0_field[list][2*i] = 2*j;
+                h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
+            }
+        }
+    }
 }
 
 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
@@ -1226,12 +1402,13 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
     int sub_mb_type;
     int i8, i4;
 
+#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
         /* FIXME save sub mb types from previous frames (or derive from MVs)
          * so we know exactly what block size to use */
         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
-    }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
+    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
     }else{
@@ -1240,6 +1417,8 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
     }
     if(!is_b8x8)
         *mb_type |= MB_TYPE_DIRECT2;
+    if(MB_FIELD)
+        *mb_type |= MB_TYPE_INTERLACED;
 
     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 
@@ -1248,6 +1427,8 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
         int mv[2][2];
         int list;
 
+        /* FIXME interlacing + spatial direct uses wrong colocated block positions */
+
         /* ref = min(neighbors) */
         for(list=0; list<2; list++){
             int refa = h->ref_cache[list][scan8[0] - 1];
@@ -1345,6 +1526,107 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
             }
         }
     }else{ /* direct temporal mv pred */
+        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
+        const int *dist_scale_factor = h->dist_scale_factor;
+
+        if(FRAME_MBAFF){
+            if(IS_INTERLACED(*mb_type)){
+                map_col_to_list0[0] = h->map_col_to_list0_field[0];
+                map_col_to_list0[1] = h->map_col_to_list0_field[1];
+                dist_scale_factor = h->dist_scale_factor_field;
+            }
+            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
+                /* FIXME assumes direct_8x8_inference == 1 */
+                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
+                int mb_types_col[2];
+                int y_shift;
+
+                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
+                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
+                         | (*mb_type & MB_TYPE_INTERLACED);
+                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
+
+                if(IS_INTERLACED(*mb_type)){
+                    /* frame to field scaling */
+                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
+                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
+                    if(s->mb_y&1){
+                        l1ref0 -= 2*h->b8_stride;
+                        l1ref1 -= 2*h->b8_stride;
+                        l1mv0 -= 4*h->b_stride;
+                        l1mv1 -= 4*h->b_stride;
+                    }
+                    y_shift = 0;
+
+                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
+                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
+                       && !is_b8x8)
+                        *mb_type |= MB_TYPE_16x8;
+                    else
+                        *mb_type |= MB_TYPE_8x8;
+                }else{
+                    /* field to frame scaling */
+                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
+                     * but in MBAFF, top and bottom POC are equal */
+                    int dy = (s->mb_y&1) ? 1 : 2;
+                    mb_types_col[0] =
+                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
+                    l1ref0 += dy*h->b8_stride;
+                    l1ref1 += dy*h->b8_stride;
+                    l1mv0 += 2*dy*h->b_stride;
+                    l1mv1 += 2*dy*h->b_stride;
+                    y_shift = 2;
+
+                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
+                       && !is_b8x8)
+                        *mb_type |= MB_TYPE_16x16;
+                    else
+                        *mb_type |= MB_TYPE_8x8;
+                }
+
+                for(i8=0; i8<4; i8++){
+                    const int x8 = i8&1;
+                    const int y8 = i8>>1;
+                    int ref0, scale;
+                    const int16_t (*l1mv)[2]= l1mv0;
+
+                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
+                        continue;
+                    h->sub_mb_type[i8] = sub_mb_type;
+
+                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
+                    if(IS_INTRA(mb_types_col[y8])){
+                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
+                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
+                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
+                        continue;
+                    }
+
+                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
+                    if(ref0 >= 0)
+                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
+                    else{
+                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
+                        l1mv= l1mv1;
+                    }
+                    scale = dist_scale_factor[ref0];
+                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
+
+                    {
+                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
+                        int my_col = (mv_col[1]<<y_shift)/2;
+                        int mx = (scale * mv_col[0] + 128) >> 8;
+                        int my = (scale * my_col + 128) >> 8;
+                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
+                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
+                    }
+                }
+                return;
+            }
+        }
+
+        /* one-to-one mv scaling */
+
         if(IS_16X16(*mb_type)){
             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
             if(IS_INTRA(mb_type_col)){
@@ -1352,13 +1634,13 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
             }else{
-                const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
-                                                : h->map_col_to_list0[1][l1ref1[0]];
-                const int dist_scale_factor = h->dist_scale_factor[ref0];
+                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
+                                                : map_col_to_list0[1][l1ref1[0]];
+                const int scale = dist_scale_factor[ref0];
                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                 int mv_l0[2];
-                mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
-                mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
+                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
+                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
@@ -1367,15 +1649,15 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
             for(i8=0; i8<4; i8++){
                 const int x8 = i8&1;
                 const int y8 = i8>>1;
-                int ref0, dist_scale_factor;
+                int ref0, scale;
                 const int16_t (*l1mv)[2]= l1mv0;
 
                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                     continue;
                 h->sub_mb_type[i8] = sub_mb_type;
+                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                 if(IS_INTRA(mb_type_col)){
                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
-                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                     continue;
@@ -1383,27 +1665,26 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 
                 ref0 = l1ref0[x8 + y8*h->b8_stride];
                 if(ref0 >= 0)
-                    ref0 = h->map_col_to_list0[0][ref0];
+                    ref0 = map_col_to_list0[0][ref0];
                 else{
-                    ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
+                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                     l1mv= l1mv1;
                 }
-                dist_scale_factor = h->dist_scale_factor[ref0];
+                scale = dist_scale_factor[ref0];
 
                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
-                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                 if(IS_SUB_8X8(sub_mb_type)){
                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
-                    int mx = (dist_scale_factor * mv_col[0] + 128) >> 8;
-                    int my = (dist_scale_factor * mv_col[1] + 128) >> 8;
+                    int mx = (scale * mv_col[0] + 128) >> 8;
+                    int my = (scale * mv_col[1] + 128) >> 8;
                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                 }else
                 for(i4=0; i4<4; i4++){
                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
-                    mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
-                    mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
+                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
+                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                 }
@@ -1418,28 +1699,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){
     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
     int list;
 
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
+
     for(list=0; list<2; list++){
         int y;
-        if(!USES_LIST(mb_type, list)){
-            if(1){ //FIXME skip or never read if mb_type doesn't use it
-                for(y=0; y<4; y++){
-                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
-                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
-                }
-                if( h->pps.cabac ) {
-                    /* FIXME needed ? */
-                    for(y=0; y<4; y++){
-                        *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
-                        *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
-                    }
-                }
-                for(y=0; y<2; y++){
-                    s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
-                    s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
-                }
-            }
+        if(!USES_LIST(mb_type, list))
             continue;
-        }
 
         for(y=0; y<4; y++){
             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
@@ -1451,17 +1717,22 @@ static inline void write_back_motion(H264Context *h, int mb_type){
                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
             }
         }
-        for(y=0; y<2; y++){
-            s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
-            s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
+
+        {
+            uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
+            ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
+            ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
+            ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
+            ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
         }
     }
 
     if(h->slice_type == B_TYPE && h->pps.cabac){
         if(IS_8X8(mb_type)){
-            h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
-            h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
-            h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
+            uint8_t *direct_table = &h->direct_table[b8_xy];
+            direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
+            direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
+            direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
         }
     }
 }
@@ -2594,20 +2865,20 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
     MpegEncContext * const s = &h->s;
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
-    const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
+    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
     const int luma_xy= (mx&3) + ((my&3)<<2);
-    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
-    uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
-    uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
-    int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
-    int extra_height= extra_width;
+    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
+    uint8_t * src_cb, * src_cr;
+    int extra_width= h->emu_edge_width;
+    int extra_height= h->emu_edge_height;
     int emu=0;
     const int full_mx= mx>>2;
     const int full_my= my>>2;
     const int pic_width  = 16*s->mb_width;
-    const int pic_height = 16*s->mb_height;
+    const int pic_height = 16*s->mb_height >> MB_MBAFF;
 
-    assert(pic->data[0]);
+    if(!pic->data[0])
+        return;
 
     if(mx&7) extra_width -= 3;
     if(my&7) extra_height -= 3;
@@ -2616,29 +2887,37 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
        || full_my < 0-extra_height
        || full_mx + 16/*FIXME*/ > pic_width + extra_width
        || full_my + 16/*FIXME*/ > pic_height + extra_height){
-        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
-            src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
+        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
         emu=1;
     }
 
-    qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
+    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
     if(!square){
-        qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
+        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
     }
 
     if(s->flags&CODEC_FLAG_GRAY) return;
 
+    if(MB_MBAFF){
+        // chroma offset when predicting from a field of opposite parity
+        my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
+        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
+    }
+    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
+    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
+
     if(emu){
-        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
             src_cb= s->edge_emu_buffer;
     }
-    chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
 
     if(emu){
-        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
             src_cr= s->edge_emu_buffer;
     }
-    chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
 }
 
 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
@@ -2651,11 +2930,11 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
     qpel_mc_func *qpix_op=  qpix_put;
     h264_chroma_mc_func chroma_op= chroma_put;
 
-    dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
-    dest_cb +=   x_offset +   y_offset*s->uvlinesize;
-    dest_cr +=   x_offset +   y_offset*s->uvlinesize;
+    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
+    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
+    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
     x_offset += 8*s->mb_x;
-    y_offset += 8*s->mb_y;
+    y_offset += 8*(s->mb_y >> MB_MBAFF);
 
     if(list0){
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
@@ -2684,18 +2963,18 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                            int list0, int list1){
     MpegEncContext * const s = &h->s;
 
-    dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
-    dest_cb +=   x_offset +   y_offset*s->uvlinesize;
-    dest_cr +=   x_offset +   y_offset*s->uvlinesize;
+    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
+    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
+    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
     x_offset += 8*s->mb_x;
-    y_offset += 8*s->mb_y;
+    y_offset += 8*(s->mb_y >> MB_MBAFF);
 
     if(list0 && list1){
         /* don't optimize for luma-only case, since B-frames usually
          * use implicit weights => chroma too. */
         uint8_t *tmp_cb = s->obmc_scratchpad;
-        uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize;
-        uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize;
+        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
+        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
         int refn0 = h->ref_cache[0][ scan8[n] ];
         int refn1 = h->ref_cache[1][ scan8[n] ];
 
@@ -2709,17 +2988,17 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1];
             int weight1 = 64 - weight0;
-            luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0);
+            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
         }else{
-            luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
+            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
-            chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
-            chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
         }
@@ -2731,12 +3010,12 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put, chroma_put);
 
-        luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
+        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
         if(h->use_weight_chroma){
-            chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
-            chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
         }
     }
@@ -2760,6 +3039,22 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
 }
 
+static inline void prefetch_motion(H264Context *h, int list){
+    /* fetch pixels for estimated mv 4 macroblocks ahead
+     * optimized for 64byte cache lines */
+    MpegEncContext * const s = &h->s;
+    const int refn = h->ref_cache[list][scan8[0]];
+    if(refn >= 0){
+        const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
+        const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
+        uint8_t **src= h->ref_list[list][refn].data;
+        int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
+        s->dsp.prefetch(src[0]+off, s->linesize, 4);
+        off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+    }
+}
+
 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
@@ -2770,6 +3065,8 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
 
     assert(IS_INTER(mb_type));
 
+    prefetch_motion(h, 0);
+
     if(IS_16X16(mb_type)){
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
@@ -2785,11 +3082,11 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
                 &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
     }else if(IS_8X16(mb_type)){
-        mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
-        mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
+        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                 &weight_op[2], &weight_avg[2],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
@@ -2819,11 +3116,11 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
                     &weight_op[4], &weight_avg[4],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
             }else if(IS_SUB_4X8(sub_mb_type)){
-                mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
-                mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     &weight_op[5], &weight_avg[5],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
@@ -2841,6 +3138,8 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
             }
         }
     }
+
+    prefetch_motion(h, 1);
 }
 
 static void decode_init_vlc(H264Context *h){
@@ -2952,6 +3251,7 @@ static void free_tables(H264Context *h){
 
 static void init_dequant8_coeff_table(H264Context *h){
     int i,q,x;
+    const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
     h->dequant8_coeff[0] = h->dequant8_buffer[0];
     h->dequant8_coeff[1] = h->dequant8_buffer[1];
 
@@ -2965,8 +3265,9 @@ static void init_dequant8_coeff_table(H264Context *h){
             int shift = div6[q];
             int idx = rem6[q];
             for(x=0; x<64; x++)
-                h->dequant8_coeff[i][q][x] = ((uint32_t)dequant8_coeff_init[idx][
-                    dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * h->pps.scaling_matrix8[i][x]) << shift;
+                h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
+                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
+                    h->pps.scaling_matrix8[i][x]) << shift;
         }
     }
 }
@@ -3025,7 +3326,7 @@ static int alloc_tables(H264Context *h){
     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
 
     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
-    CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
+    CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
@@ -3037,8 +3338,8 @@ static int alloc_tables(H264Context *h){
         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
     }
 
-    memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
-    h->slice_table= h->slice_table_base + s->mb_stride + 1;
+    memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
+    h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
 
     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
@@ -3135,7 +3436,11 @@ static int frame_start(H264Context *h){
     /* can't be in alloc_tables because linesize isn't known there.
      * FIXME: redo bipred weight to not require extra buffer? */
     if(!s->obmc_scratchpad)
-        s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
+        s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
+
+    /* some macroblocks will be accessed before they're available */
+    if(FRAME_MBAFF)
+        memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
 
 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
     return 0;
@@ -3258,7 +3563,7 @@ static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src
     int temp8, i;
     uint64_t temp64;
     int deblock_left = (s->mb_x > 0);
-    int deblock_top  = (s->mb_y > 0);
+    int deblock_top  = (s->mb_y > 1);
 
     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
 
@@ -3283,6 +3588,10 @@ b= t;
         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
+        if(s->mb_x+1 < s->mb_width){
+            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
+            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
+        }
     }
 
     if(!(s->flags&CODEC_FLAG_GRAY)){
@@ -3314,6 +3623,7 @@ static void hl_decode_mb(H264Context *h){
     const unsigned int bottom = mb_y & 1;
     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
 
     if(!s->decode)
         return;
@@ -3322,24 +3632,58 @@ static void hl_decode_mb(H264Context *h){
     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
 
-    if (h->mb_field_decoding_flag) {
-        linesize = s->linesize * 2;
-        uvlinesize = s->uvlinesize * 2;
+    if (MB_FIELD) {
+        linesize   = h->mb_linesize   = s->linesize * 2;
+        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
         block_offset = &h->block_offset[24];
         if(mb_y&1){ //FIXME move out of this func?
             dest_y -= s->linesize*15;
             dest_cb-= s->uvlinesize*7;
             dest_cr-= s->uvlinesize*7;
         }
+        if(FRAME_MBAFF) {
+            int list;
+            for(list=0; list<2; list++){
+                if(!USES_LIST(mb_type, list))
+                    continue;
+                if(IS_16X16(mb_type)){
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
+                    fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
+                }else{
+                    for(i=0; i<16; i+=4){
+                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
+                        int ref = h->ref_cache[list][scan8[i]];
+                        if(ref >= 0)
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
+                    }
+                }
+            }
+        }
     } else {
-        linesize = s->linesize;
-        uvlinesize = s->uvlinesize;
+        linesize   = h->mb_linesize   = s->linesize;
+        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
 //        dct_offset = s->linesize * 16;
     }
 
-    idct_add = transform_bypass
-             ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4
-             : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add;
+    if(transform_bypass){
+        idct_dc_add =
+        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
+    }else if(IS_8x8DCT(mb_type)){
+        idct_dc_add = s->dsp.h264_idct8_dc_add;
+        idct_add = s->dsp.h264_idct8_add;
+    }else{
+        idct_dc_add = s->dsp.h264_idct_dc_add;
+        idct_add = s->dsp.h264_idct_add;
+    }
+
+    if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
+       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
+        int mbt_y = mb_y&~1;
+        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
+        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
+        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
+        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
+    }
 
     if (IS_INTRA_PCM(mb_type)) {
         unsigned int x, y;
@@ -3369,14 +3713,8 @@ static void hl_decode_mb(H264Context *h){
         }
     } else {
         if(IS_INTRA(mb_type)){
-            if(h->deblocking_filter) {
-                if (h->mb_aff_frame) {
-                    if (!bottom)
-                        xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
-                } else {
-                    xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
-                }
-            }
+            if(h->deblocking_filter && !FRAME_MBAFF)
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
 
             if(!(s->flags&CODEC_FLAG_GRAY)){
                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
@@ -3389,17 +3727,22 @@ static void hl_decode_mb(H264Context *h){
                         for(i=0; i<16; i+=4){
                             uint8_t * const ptr= dest_y + block_offset[i];
                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
-                            if(h->non_zero_count_cache[ scan8[i] ])
-                                idct_add(ptr, h->mb + i*16, linesize);
+                            if(nnz){
+                                if(nnz == 1 && h->mb[i*16])
+                                    idct_dc_add(ptr, h->mb + i*16, linesize);
+                                else
+                                    idct_add(ptr, h->mb + i*16, linesize);
+                            }
                         }
                     }else
                     for(i=0; i<16; i++){
                         uint8_t * const ptr= dest_y + block_offset[i];
                         uint8_t *topright;
                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
-                        int tr;
+                        int nnz, tr;
 
                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
@@ -3413,10 +3756,14 @@ static void hl_decode_mb(H264Context *h){
                             topright= NULL;
 
                         h->pred4x4[ dir ](ptr, topright, linesize);
-                        if(h->non_zero_count_cache[ scan8[i] ]){
-                            if(s->codec_id == CODEC_ID_H264)
-                                idct_add(ptr, h->mb + i*16, linesize);
-                            else
+                        nnz = h->non_zero_count_cache[ scan8[i] ];
+                        if(nnz){
+                            if(s->codec_id == CODEC_ID_H264){
+                                if(nnz == 1 && h->mb[i*16])
+                                    idct_dc_add(ptr, h->mb + i*16, linesize);
+                                else
+                                    idct_add(ptr, h->mb + i*16, linesize);
+                            }else
                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                         }
                     }
@@ -3429,20 +3776,8 @@ static void hl_decode_mb(H264Context *h){
                 }else
                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
             }
-            if(h->deblocking_filter) {
-                if (h->mb_aff_frame) {
-                    if (bottom) {
-                        uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
-                        uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
-                        uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
-                        s->mb_y--;
-                        xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
-                        s->mb_y++;
-                    }
-                } else {
-                    xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
-                }
-            }
+            if(h->deblocking_filter && !FRAME_MBAFF)
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
         }else if(s->codec_id == CODEC_ID_H264){
             hl_motion(h, dest_y, dest_cb, dest_cr,
                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
@@ -3453,11 +3788,23 @@ static void hl_decode_mb(H264Context *h){
 
         if(!IS_INTRA4x4(mb_type)){
             if(s->codec_id == CODEC_ID_H264){
-                const int di = IS_8x8DCT(mb_type) ? 4 : 1;
-                for(i=0; i<16; i+=di){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
-                        uint8_t * const ptr= dest_y + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, linesize);
+                if(IS_INTRA16x16(mb_type)){
+                    for(i=0; i<16; i++){
+                        if(h->non_zero_count_cache[ scan8[i] ])
+                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                        else if(h->mb[i*16])
+                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                    }
+                }else{
+                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+                    for(i=0; i<16; i+=di){
+                        int nnz = h->non_zero_count_cache[ scan8[i] ];
+                        if(nnz){
+                            if(nnz==1 && h->mb[i*16])
+                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                            else
+                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+                        }
                     }
                 }
             }else{
@@ -3471,34 +3818,26 @@ static void hl_decode_mb(H264Context *h){
         }
 
         if(!(s->flags&CODEC_FLAG_GRAY)){
-            idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add;
-            if(!transform_bypass){
+            uint8_t *dest[2] = {dest_cb, dest_cr};
+            if(transform_bypass){
+                idct_add = idct_dc_add = s->dsp.add_pixels4;
+            }else{
+                idct_add = s->dsp.h264_idct_add;
+                idct_dc_add = s->dsp.h264_idct_dc_add;
                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
             }
             if(s->codec_id == CODEC_ID_H264){
-                for(i=16; i<16+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cb + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, uvlinesize);
-                    }
-                }
-                for(i=20; i<20+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cr + block_offset[i];
-                        idct_add(ptr, h->mb + i*16, uvlinesize);
-                    }
+                for(i=16; i<16+8; i++){
+                    if(h->non_zero_count_cache[ scan8[i] ])
+                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+                    else if(h->mb[i*16])
+                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                 }
             }else{
-                for(i=16; i<16+4; i++){
+                for(i=16; i<16+8; i++){
                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cb + block_offset[i];
-                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
-                    }
-                }
-                for(i=20; i<20+4; i++){
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
-                        uint8_t * const ptr= dest_cr + block_offset[i];
+                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                     }
                 }
@@ -3506,36 +3845,36 @@ static void hl_decode_mb(H264Context *h){
         }
     }
     if(h->deblocking_filter) {
-        if (h->mb_aff_frame) {
+        if (FRAME_MBAFF) {
+            //FIXME try deblocking one mb at a time?
+            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
             const int mb_y = s->mb_y - 1;
             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
             const int mb_xy= mb_x + mb_y*s->mb_stride;
             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
-            uint8_t tmp = s->current_picture.data[1][384];
             if (!bottom) return;
             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
 
+            if(IS_INTRA(mb_type_top | mb_type_bottom))
+                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
+
             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
-            // TODO deblock a pair
+            // deblock a pair
             // top
             s->mb_y--;
             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
+            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
-            if (tmp != s->current_picture.data[1][384]) {
-                tprintf("modified pixel 8,1 (1)\n");
-            }
             // bottom
             s->mb_y++;
             tprintf("call mbaff filter_mb\n");
             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
+            h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
-            if (tmp != s->current_picture.data[1][384]) {
-                tprintf("modified pixel 8,1 (2)\n");
-            }
         } else {
             tprintf("call filter_mb\n");
             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
@@ -3759,6 +4098,35 @@ static int decode_ref_pic_list_reordering(H264Context *h){
     return 0;
 }
 
+static void fill_mbaff_ref_list(H264Context *h){
+    int list, i, j;
+    for(list=0; list<2; list++){
+        for(i=0; i<h->ref_count[list]; i++){
+            Picture *frame = &h->ref_list[list][i];
+            Picture *field = &h->ref_list[list][16+2*i];
+            field[0] = *frame;
+            for(j=0; j<3; j++)
+                field[0].linesize[j] <<= 1;
+            field[1] = field[0];
+            for(j=0; j<3; j++)
+                field[1].data[j] += frame->linesize[j];
+
+            h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
+            h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
+            for(j=0; j<2; j++){
+                h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
+                h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
+            }
+        }
+    }
+    for(j=0; j<h->ref_count[1]; j++){
+        for(i=0; i<h->ref_count[0]; i++)
+            h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
+        memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
+        memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
+    }
+}
+
 static int pred_weight_table(H264Context *h){
     MpegEncContext * const s = &h->s;
     int list, i;
@@ -3828,7 +4196,6 @@ static void implicit_weight_table(H264Context *h){
     h->luma_log2_weight_denom= 5;
     h->chroma_log2_weight_denom= 5;
 
-    /* FIXME: MBAFF */
     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
         int poc0 = h->ref_list[0][ref0].poc;
         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
@@ -3887,8 +4254,13 @@ static void idr(H264Context *h){
 static void flush_dpb(AVCodecContext *avctx){
     H264Context *h= avctx->priv_data;
     int i;
-    for(i=0; i<16; i++)
+    for(i=0; i<16; i++) {
+        if(h->delayed_pic[i])
+            h->delayed_pic[i]->reference= 0;
         h->delayed_pic[i]= NULL;
+    }
+    if(h->delayed_output_pic)
+        h->delayed_output_pic->reference= 0;
     h->delayed_output_pic= NULL;
     idr(h);
     if(h->s.current_picture_ptr)
@@ -4263,8 +4635,8 @@ static int decode_slice_header(H264Context *h){
     s->mb_width= h->sps.mb_width;
     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
 
-    h->b_stride=  s->mb_width*4 + 1;
-    h->b8_stride= s->mb_width*2 + 1;
+    h->b_stride=  s->mb_width*4;
+    h->b8_stride= s->mb_width*2;
 
     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
     if(h->sps.frame_mbs_only_flag)
@@ -4290,14 +4662,39 @@ static int decode_slice_header(H264Context *h){
 #define T(x) (x>>2) | ((x<<2) & 0xF)
                 h->zigzag_scan[i] = T(zigzag_scan[i]);
                 h-> field_scan[i] = T( field_scan[i]);
+#undef T
+            }
+        }
+        if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
+            memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
+            memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
+            memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
+            memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
+        }else{
+            int i;
+            for(i=0; i<64; i++){
+#define T(x) (x>>3) | ((x&7)<<3)
+                h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
+                h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
+                h->field_scan8x8[i]        = T(field_scan8x8[i]);
+                h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
+#undef T
             }
         }
         if(h->sps.transform_bypass){ //FIXME same ugly
-            h->zigzag_scan_q0 = zigzag_scan;
-            h->field_scan_q0 = field_scan;
+            h->zigzag_scan_q0          = zigzag_scan;
+            h->zigzag_scan8x8_q0       = zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
+            h->field_scan_q0           = field_scan;
+            h->field_scan8x8_q0        = field_scan8x8;
+            h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
         }else{
-            h->zigzag_scan_q0 = h->zigzag_scan;
-            h->field_scan_q0 = h->field_scan;
+            h->zigzag_scan_q0          = h->zigzag_scan;
+            h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
+            h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
+            h->field_scan_q0           = h->field_scan;
+            h->field_scan8x8_q0        = h->field_scan8x8;
+            h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
         }
 
         alloc_tables(h);
@@ -4325,21 +4722,22 @@ static int decode_slice_header(H264Context *h){
     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
 
+    h->mb_mbaff = 0;
     h->mb_aff_frame = 0;
     if(h->sps.frame_mbs_only_flag){
         s->picture_structure= PICT_FRAME;
     }else{
         if(get_bits1(&s->gb)) { //field_pic_flag
             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
+            av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
         } else {
             s->picture_structure= PICT_FRAME;
-            first_mb_in_slice <<= h->sps.mb_aff;
             h->mb_aff_frame = h->sps.mb_aff;
         }
     }
 
     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
-    s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
+    s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
     if(s->mb_y >= s->mb_height){
         return -1;
     }
@@ -4384,6 +4782,8 @@ static int decode_slice_header(H264Context *h){
     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
         if(h->slice_type == B_TYPE){
             h->direct_spatial_mv_pred= get_bits1(&s->gb);
+            if(h->sps.mb_aff && h->direct_spatial_mv_pred)
+                av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
         }
         num_ref_idx_active_override_flag= get_bits1(&s->gb);
 
@@ -4417,6 +4817,9 @@ static int decode_slice_header(H264Context *h){
     if(s->current_picture.reference)
         decode_ref_pic_marking(h);
 
+    if(FRAME_MBAFF)
+        fill_mbaff_ref_list(h);
+
     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
         h->cabac_init_idc = get_ue_golomb(&s->gb);
 
@@ -4461,6 +4864,9 @@ static int decode_slice_header(H264Context *h){
 
     h->slice_num++;
 
+    h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
+    h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
+
     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
                h->slice_num,
@@ -4662,6 +5068,17 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
     return 0;
 }
 
+static void predict_field_decoding_flag(H264Context *h){
+    MpegEncContext * const s = &h->s;
+    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
+    int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
+                ? s->current_picture.mb_type[mb_xy-1]
+                : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
+                ? s->current_picture.mb_type[mb_xy-s->mb_stride]
+                : 0;
+    h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
+}
+
 /**
  * decodes a P_SKIP or B_SKIP macroblock
  */
@@ -4673,10 +5090,7 @@ static void decode_mb_skip(H264Context *h){
     memset(h->non_zero_count[mb_xy], 0, 16);
     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
 
-    if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
-        h->mb_field_decoding_flag= get_bits1(&s->gb);
-    }
-    if(h->mb_field_decoding_flag)
+    if(MB_FIELD)
         mb_type|= MB_TYPE_INTERLACED;
 
     if( h->slice_type == B_TYPE )
@@ -4731,13 +5145,19 @@ static int decode_mb_cavlc(H264Context *h){
             s->mb_skip_run= get_ue_golomb(&s->gb);
 
         if (s->mb_skip_run--) {
+            if(FRAME_MBAFF && (s->mb_y&1) == 0){
+                if(s->mb_skip_run==0)
+                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
+                else
+                    predict_field_decoding_flag(h);
+            }
             decode_mb_skip(h);
             return 0;
         }
     }
-    if(h->mb_aff_frame){
-        if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
-            h->mb_field_decoding_flag = get_bits1(&s->gb);
+    if(FRAME_MBAFF){
+        if( (s->mb_y&1) == 0 )
+            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
     }else
         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
 
@@ -4773,7 +5193,7 @@ decode_intra_mb:
         mb_type= i_mb_type_info[mb_type].type;
     }
 
-    if(h->mb_field_decoding_flag)
+    if(MB_FIELD)
         mb_type |= MB_TYPE_INTERLACED;
 
     h->slice_table[ mb_xy ]= h->slice_num;
@@ -4817,6 +5237,11 @@ decode_intra_mb:
         return 0;
     }
 
+    if(MB_MBAFF){
+        h->ref_count[0] <<= 1;
+        h->ref_count[1] <<= 1;
+    }
+
     fill_caches(h, mb_type, 0);
 
     //mb_pred
@@ -4832,18 +5257,11 @@ decode_intra_mb:
 
 //                fill_intra4x4_pred_table(h);
                 for(i=0; i<16; i+=di){
-                    const int mode_coded= !get_bits1(&s->gb);
-                    const int predicted_mode=  pred_intra_mode(h, i);
-                    int mode;
+                    int mode= pred_intra_mode(h, i);
 
-                    if(mode_coded){
+                    if(!get_bits1(&s->gb)){
                         const int rem_mode= get_bits(&s->gb, 3);
-                        if(rem_mode<predicted_mode)
-                            mode= rem_mode;
-                        else
-                            mode= rem_mode + 1;
-                    }else{
-                        mode= predicted_mode;
+                        mode = rem_mode + (rem_mode >= mode);
                     }
 
                     if(di==4)
@@ -4901,9 +5319,6 @@ decode_intra_mb:
         for(list=0; list<2; list++){
             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
             if(ref_count == 0) continue;
-            if (h->mb_aff_frame && h->mb_field_decoding_flag) {
-                ref_count <<= 1;
-            }
             for(i=0; i<4; i++){
                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
@@ -5074,14 +5489,16 @@ decode_intra_mb:
         int i8x8, i4x4, chroma_idx;
         int chroma_qp, dquant;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
 
 //        fill_non_zero_count_cache(h);
 
         if(IS_INTERLACED(mb_type)){
+            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
             scan= s->qscale ? h->field_scan : h->field_scan_q0;
             dc_scan= luma_dc_field_scan;
         }else{
+            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
             dc_scan= luma_dc_zigzag_scan;
         }
@@ -5126,12 +5543,12 @@ decode_intra_mb:
                         DCTELEM *buf = &h->mb[64*i8x8];
                         uint8_t *nnz;
                         for(i4x4=0; i4x4<4; i4x4++){
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
+                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                 return -1;
                         }
                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
-                        nnz[0] |= nnz[1] | nnz[8] | nnz[9];
+                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                     }else{
                         for(i4x4=0; i4x4<4; i4x4++){
                             const int index= i4x4 + 4*i8x8;
@@ -5178,6 +5595,11 @@ decode_intra_mb:
     s->current_picture.qscale_table[mb_xy]= s->qscale;
     write_back_non_zero_count(h);
 
+    if(MB_MBAFF){
+        h->ref_count[0] >>= 1;
+        h->ref_count[1] >>= 1;
+    }
+
     return 0;
 }
 
@@ -5225,19 +5647,11 @@ static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_sl
         return 25;  /* PCM */
 
     mb_type = 1; /* I16x16 */
-    if( get_cabac( &h->cabac, &state[1] ) )
-        mb_type += 12;  /* cbp_luma != 0 */
-
-    if( get_cabac( &h->cabac, &state[2] ) ) {
-        if( get_cabac( &h->cabac, &state[2+intra_slice] ) )
-            mb_type += 4 * 2;   /* cbp_chroma == 2 */
-        else
-            mb_type += 4 * 1;   /* cbp_chroma == 1 */
-    }
-    if( get_cabac( &h->cabac, &state[3+intra_slice] ) )
-        mb_type += 2;
-    if( get_cabac( &h->cabac, &state[3+2*intra_slice] ) )
-        mb_type += 1;
+    mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
+    if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
+        mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
+    mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
+    mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
     return mb_type;
 }
 
@@ -5250,15 +5664,11 @@ static int decode_cabac_mb_type( H264Context *h ) {
         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
             /* P-type */
             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
-                if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
-                    return 0; /* P_L0_D16x16; */
-                else
-                    return 3; /* P_8x8; */
+                /* P_L0_D16x16, P_8x8 */
+                return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
             } else {
-                if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
-                    return 2; /* P_L0_D8x16; */
-                else
-                    return 1; /* P_L0_D16x8; */
+                /* P_L0_D8x16, P_L0_D16x8 */
+                return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
             }
         } else {
             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
@@ -5269,11 +5679,9 @@ static int decode_cabac_mb_type( H264Context *h ) {
         int ctx = 0;
         int bits;
 
-        if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )
-                      && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
+        if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
             ctx++;
-        if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] )
-                      && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
+        if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
             ctx++;
 
         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
@@ -5304,22 +5712,40 @@ static int decode_cabac_mb_type( H264Context *h ) {
     }
 }
 
-static int decode_cabac_mb_skip( H264Context *h) {
+static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
     MpegEncContext * const s = &h->s;
-    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
-    const int mba_xy = mb_xy - 1;
-    const int mbb_xy = mb_xy - s->mb_stride;
+    int mba_xy, mbb_xy;
     int ctx = 0;
 
+    if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
+        int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
+        mba_xy = mb_xy - 1;
+        if( (mb_y&1)
+            && h->slice_table[mba_xy] == h->slice_num
+            && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
+            mba_xy += s->mb_stride;
+        if( MB_FIELD ){
+            mbb_xy = mb_xy - s->mb_stride;
+            if( !(mb_y&1)
+                && h->slice_table[mbb_xy] == h->slice_num
+                && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
+                mbb_xy -= s->mb_stride;
+        }else
+            mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
+    }else{
+        int mb_xy = mb_x + mb_y*s->mb_stride;
+        mba_xy = mb_xy - 1;
+        mbb_xy = mb_xy - s->mb_stride;
+    }
+
     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
         ctx++;
     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
         ctx++;
 
-    if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
-        return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
-    else /* B-frame */
-        return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
+    if( h->slice_type == B_TYPE )
+        ctx += 13;
+    return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
 }
 
 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
@@ -5376,14 +5802,17 @@ static const uint8_t block_idx_xy[4][4] = {
 };
 
 static int decode_cabac_mb_cbp_luma( H264Context *h) {
-    MpegEncContext * const s = &h->s;
-
     int cbp = 0;
+    int cbp_b = -1;
     int i8x8;
 
+    if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
+        cbp_b = h->top_cbp;
+        tprintf("cbp_b = top_cbp = %x\n", cbp_b);
+    }
+
     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
         int cbp_a = -1;
-        int cbp_b = -1;
         int x, y;
         int ctx = 0;
 
@@ -5392,17 +5821,13 @@ static int decode_cabac_mb_cbp_luma( H264Context *h) {
 
         if( x > 0 )
             cbp_a = cbp;
-        else if( s->mb_x > 0 && (h->slice_table[h->left_mb_xy[0]] == h->slice_num)) {
+        else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
             cbp_a = h->left_cbp;
             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
         }
 
         if( y > 0 )
             cbp_b = cbp;
-        else if( s->mb_y > 0 && (h->slice_table[h->top_mb_xy] == h->slice_num)) {
-            cbp_b = h->top_cbp;
-            tprintf("cbp_b = top_cbp = %x\n", cbp_b);
-        }
 
         /* No need to test for skip as we put 0 for skip block */
         /* No need to test for IPCM as we put 1 for IPCM block */
@@ -5453,7 +5878,7 @@ static int decode_cabac_mb_dqp( H264Context *h) {
     else
         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
 
-    if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
+    if( h->last_qscale_diff != 0 )
         ctx++;
 
     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
@@ -5462,7 +5887,7 @@ static int decode_cabac_mb_dqp( H264Context *h) {
         else
             ctx = 3;
         val++;
-        if(val > 52) //prevent infinite loop
+        if(val > 102) //prevent infinite loop
             return INT_MIN;
     }
 
@@ -5598,16 +6023,26 @@ static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
 
 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
-    static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
-    static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
-    static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
-    static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
-    static const int coeff_abs_level_m1_offset[6] = { 227+0, 227+10, 227+20, 227+30, 227+39, 426 };
-    static const int significant_coeff_flag_offset_8x8[63] = {
-        0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
+    static const int significant_coeff_flag_offset[2][6] = {
+      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
+      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
+    };
+    static const int last_coeff_flag_offset[2][6] = {
+      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
+      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
+    };
+    static const int coeff_abs_level_m1_offset[6] = {
+        227+0, 227+10, 227+20, 227+30, 227+39, 426
+    };
+    static const int significant_coeff_flag_offset_8x8[2][63] = {
+      { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
-       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
+       12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
+      { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
+        6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
+        9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
+        9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
     };
     static const int last_coeff_flag_offset_8x8[63] = {
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -5649,11 +6084,9 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
     }
 
     significant_coeff_ctx_base = h->cabac_state
-        + significant_coeff_flag_offset[cat]
-        + significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
+        + significant_coeff_flag_offset[MB_FIELD][cat];
     last_coeff_ctx_base = h->cabac_state
-        + last_significant_coeff_flag_offset[cat]
-        + last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
+        + last_coeff_flag_offset[MB_FIELD][cat];
     abs_level_m1_ctx_base = h->cabac_state
         + coeff_abs_level_m1_offset[cat];
 
@@ -5670,8 +6103,8 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
                 } \
             } \
         }
-        DECODE_SIGNIFICANCE( 63, significant_coeff_flag_offset_8x8[last],
-                                 last_coeff_flag_offset_8x8[last] );
+        const int *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
+        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
     } else {
         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
     }
@@ -5690,7 +6123,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
     else {
         assert( cat == 5 );
-        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1);
+        fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
     }
 
     for( i = coeff_count - 1; i >= 0; i-- ) {
@@ -5747,12 +6180,12 @@ static void inline compute_mb_neighbors(H264Context *h)
     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
     h->top_mb_xy     = mb_xy - s->mb_stride;
     h->left_mb_xy[0] = mb_xy - 1;
-    if(h->mb_aff_frame){
+    if(FRAME_MBAFF){
         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
         const int top_pair_xy      = pair_xy     - s->mb_stride;
         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
-        const int curr_mb_frame_flag = !h->mb_field_decoding_flag;
+        const int curr_mb_frame_flag = !MB_FIELD;
         const int bottom = (s->mb_y & 1);
         if (bottom
                 ? !curr_mb_frame_flag // bottom macroblock
@@ -5781,8 +6214,25 @@ static int decode_mb_cabac(H264Context *h) {
 
     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
+        int skip;
+        /* a skipped mb needs the aff flag from the following mb */
+        if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
+            predict_field_decoding_flag(h);
+        if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
+            skip = h->next_mb_skipped;
+        else
+            skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
         /* read skip flags */
-        if( decode_cabac_mb_skip( h ) ) {
+        if( skip ) {
+            if( FRAME_MBAFF && (s->mb_y&1)==0 ){
+                s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
+                h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
+                if(h->next_mb_skipped)
+                    predict_field_decoding_flag(h);
+                else
+                    h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
+            }
+
             decode_mb_skip(h);
 
             h->cbp_table[mb_xy] = 0;
@@ -5793,8 +6243,9 @@ static int decode_mb_cabac(H264Context *h) {
 
         }
     }
-    if(h->mb_aff_frame){
-        if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
+    if(FRAME_MBAFF){
+        if( (s->mb_y&1) == 0 )
+            h->mb_mbaff =
             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
     }else
         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
@@ -5831,7 +6282,7 @@ decode_intra_mb:
         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
         mb_type= i_mb_type_info[mb_type].type;
     }
-    if(h->mb_field_decoding_flag)
+    if(MB_FIELD)
         mb_type |= MB_TYPE_INTERLACED;
 
     h->slice_table[ mb_xy ]= h->slice_num;
@@ -5883,6 +6334,11 @@ decode_intra_mb:
         return 0;
     }
 
+    if(MB_MBAFF){
+        h->ref_count[0] <<= 1;
+        h->ref_count[1] <<= 1;
+    }
+
     fill_caches(h, mb_type, 0);
 
     if( IS_INTRA( mb_type ) ) {
@@ -5923,8 +6379,8 @@ decode_intra_mb:
                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
             }
-            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
-               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
+            if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
+                          h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
                 pred_direct_motion(h, &mb_type);
                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
                     for( i = 0; i < 4; i++ )
@@ -6133,13 +6589,15 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
 
     if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *dc_scan;
+        const uint8_t *scan, *scan8x8, *dc_scan;
         int dqp;
 
         if(IS_INTERLACED(mb_type)){
+            scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
             scan= s->qscale ? h->field_scan : h->field_scan_q0;
             dc_scan= luma_dc_field_scan;
         }else{
+            scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
             dc_scan= luma_dc_zigzag_scan;
         }
@@ -6176,7 +6634,7 @@ decode_intra_mb:
                 if( cbp & (1<<i8x8) ) {
                     if( IS_8x8DCT(mb_type) ) {
                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
-                                zigzag_scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
+                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
                             return -1;
                     } else
                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
@@ -6221,11 +6679,17 @@ decode_intra_mb:
         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
+        h->last_qscale_diff = 0;
     }
 
     s->current_picture.qscale_table[mb_xy]= s->qscale;
     write_back_non_zero_count(h);
 
+    if(MB_MBAFF){
+        h->ref_count[0] >>= 1;
+        h->ref_count[1] >>= 1;
+    }
+
     return 0;
 }
 
@@ -6316,7 +6780,7 @@ static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int
 
         int qp_index;
         int bS_index = (i >> 1);
-        if (h->mb_field_decoding_flag) {
+        if (!MB_FIELD) {
             bS_index &= ~1;
             bS_index |= (i & 1);
         }
@@ -6325,15 +6789,13 @@ static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int
             continue;
         }
 
-        qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
+        qp_index = MB_FIELD ? (i >> 3) : (i & 1);
         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
         alpha = alpha_table[index_a];
         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
 
-
         if( bS[bS_index] < 4 ) {
             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
-            /* 4px edge length */
             const int p0 = pix[-1];
             const int p1 = pix[-2];
             const int p2 = pix[-3];
@@ -6362,7 +6824,6 @@ static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int
                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
             }
         }else{
-            /* 4px edge length */
             const int p0 = pix[-1];
             const int p1 = pix[-2];
             const int p2 = pix[-3];
@@ -6408,7 +6869,7 @@ static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int
         }
     }
 }
-static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
+static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
     int i;
     for( i = 0; i < 8; i++, pix += stride) {
         int index_a;
@@ -6422,13 +6883,13 @@ static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, in
             continue;
         }
 
-        qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
+        qp_index = MB_FIELD ? (i >> 2) : (i & 1);
         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
         alpha = alpha_table[index_a];
         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
+
         if( bS[bS_index] < 4 ) {
             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
-            /* 2px edge length (because we use same bS than the one for luma) */
             const int p0 = pix[-1];
             const int p1 = pix[-2];
             const int q0 = pix[0];
@@ -6540,85 +7001,91 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
     MpegEncContext * const s = &h->s;
     const int mb_xy= mb_x + mb_y*s->mb_stride;
+    const int mb_type = s->current_picture.mb_type[mb_xy];
+    const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
     int first_vertical_edge_done = 0;
     int dir;
     /* FIXME: A given frame may occupy more than one position in
      * the reference list. So ref2frm should be populated with
      * frame numbers, not indices. */
-    static const int ref2frm[18] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+    static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
+                                    16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+
+    //for sufficiently low qp, filtering wouldn't do anything
+    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
+    if(!FRAME_MBAFF){
+        int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
+        int qp = s->current_picture.qscale_table[mb_xy];
+        if(qp <= qp_thresh
+           && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
+           && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
+            return;
+        }
+    }
 
-    if (h->mb_aff_frame
+    if (FRAME_MBAFF
             // left mb is in picture
             && h->slice_table[mb_xy-1] != 255
             // and current and left pair do not have the same interlaced type
-            && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
+            && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
             // and left mb is in the same slice if deblocking_filter == 2
             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
         /* First vertical edge is different in MBAFF frames
          * There are 8 different bS to compute and 2 different Qp
          */
+        const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
+        const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
         int bS[8];
         int qp[2];
         int chroma_qp[2];
-
+        int mb_qp, mbn0_qp, mbn1_qp;
         int i;
         first_vertical_edge_done = 1;
-        for( i = 0; i < 8; i++ ) {
-            int y = i>>1;
-            int b_idx= 8 + 4 + 8*y;
-            int bn_idx= b_idx - 1;
-
-            int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
-
-            if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
-                IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
-                bS[i] = 4;
-            } else if( h->non_zero_count_cache[b_idx] != 0 ||
-                /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
-                h->non_zero_count_cache[bn_idx] != 0 ) {
-                bS[i] = 2;
-            } else {
-                int l;
-                bS[i] = 0;
-                for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
-                    if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
-                        ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
-                        ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
-                        bS[i] = 1;
-                        break;
-                    }
-                }
+
+        if( IS_INTRA(mb_type) )
+            bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
+        else {
+            for( i = 0; i < 8; i++ ) {
+                int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
+
+                if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
+                    bS[i] = 4;
+                else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
+                         /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
+                         h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
+                    bS[i] = 2;
+                else
+                    bS[i] = 1;
             }
         }
-        if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
-            // Do not use s->qscale as luma quantizer because it has not the same
-            // value in IPCM macroblocks.
-            qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
-            chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
-                             get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
-            qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
-            chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
-                             get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
 
-            /* Filter edge */
-            tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
-            { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
-            filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
-            filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
-            filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
-        }
+        mb_qp = s->current_picture.qscale_table[mb_xy];
+        mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
+        mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
+        qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
+        chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
+                         get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
+        qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
+        chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
+                         get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
+
+        /* Filter edge */
+        tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
+        { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
+        filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
+        filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
+        filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
     }
     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
     for( dir = 0; dir < 2; dir++ )
     {
         int edge;
         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
-        const int mb_type = s->current_picture.mb_type[mb_xy];
         const int mbm_type = s->current_picture.mb_type[mbm_xy];
         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
 
-        const int edges = ((mb_type & mbm_type) & (MB_TYPE_16x16|MB_TYPE_SKIP))
-                                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
+        const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
+                                  == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
         // how often to recheck mv-based bS when iterating between edges
         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
@@ -6633,78 +7100,68 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
             start = 1;
 
-        /* Calculate bS */
-        for( edge = start; edge < edges; edge++ ) {
-            /* mbn_xy: neighbor macroblock */
-            const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
-            const int mbn_type = s->current_picture.mb_type[mbn_xy];
+        if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
+            && !IS_INTERLACED(mb_type)
+            && IS_INTERLACED(mbm_type)
+            ) {
+            // This is a special case in the norm where the filtering must
+            // be done twice (one each of the field) even if we are in a
+            // frame macroblock.
+            //
+            static const int nnz_idx[4] = {4,5,6,3};
+            unsigned int tmp_linesize   = 2 *   linesize;
+            unsigned int tmp_uvlinesize = 2 * uvlinesize;
+            int mbn_xy = mb_xy - 2 * s->mb_stride;
+            int qp, chroma_qp;
+            int i, j;
             int bS[4];
-            int qp;
-
-            if( (edge&1) && IS_8x8DCT(mb_type) )
-                continue;
 
-            if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
-                && !IS_INTERLACED(mb_type)
-                && IS_INTERLACED(mbn_type)
-                ) {
-                // This is a special case in the norm where the filtering must
-                // be done twice (one each of the field) even if we are in a
-                // frame macroblock.
-                //
-                unsigned int tmp_linesize   = 2 *   linesize;
-                unsigned int tmp_uvlinesize = 2 * uvlinesize;
-                int mbn_xy = mb_xy - 2 * s->mb_stride;
-                int qp, chroma_qp;
-
-                // first filtering
+            for(j=0; j<2; j++, mbn_xy += s->mb_stride){
                 if( IS_INTRA(mb_type) ||
                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
                 } else {
-                    // TODO
-                    av_log(h->s.avctx, AV_LOG_ERROR, "both non intra (TODO)\n");
+                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
+                    for( i = 0; i < 4; i++ ) {
+                        if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
+                            mbn_nnz[nnz_idx[i]] != 0 )
+                            bS[i] = 2;
+                        else
+                            bS[i] = 1;
+                    }
                 }
-                /* Filter edge */
                 // Do not use s->qscale as luma quantizer because it has not the same
                 // value in IPCM macroblocks.
                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
-                filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
+                filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
                 chroma_qp = ( h->chroma_qp +
                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
-                filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
+                filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+                filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+            }
 
-                // second filtering
-                mbn_xy += s->mb_stride;
-                if( IS_INTRA(mb_type) ||
-                    IS_INTRA(mbn_type) ) {
-                    bS[0] = bS[1] = bS[2] = bS[3] = 3;
-                } else {
-                    // TODO
-                    av_log(h->s.avctx, AV_LOG_ERROR, "both non intra (TODO)\n");
-                }
-                /* Filter edge */
-                // Do not use s->qscale as luma quantizer because it has not the same
-                // value in IPCM macroblocks.
-                qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
-                tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
-                { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
-                filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
-                chroma_qp = ( h->chroma_qp +
-                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
-                filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+            start = 1;
+        }
+
+        /* Calculate bS */
+        for( edge = start; edge < edges; edge++ ) {
+            /* mbn_xy: neighbor macroblock */
+            const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
+            const int mbn_type = s->current_picture.mb_type[mbn_xy];
+            int bS[4];
+            int qp;
+
+            if( (edge&1) && IS_8x8DCT(mb_type) )
                 continue;
-            }
+
             if( IS_INTRA(mb_type) ||
                 IS_INTRA(mbn_type) ) {
                 int value;
                 if (edge == 0) {
                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
-                        || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
+                        || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
                     ) {
                         value = 4;
                     } else {
@@ -6722,6 +7179,10 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
                     mv_done = 1;
                 }
+                else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
+                    bS[0] = bS[1] = bS[2] = bS[3] = 1;
+                    mv_done = 1;
+                }
                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
                     int bn_idx= b_idx - (dir ? 8:1);
@@ -6729,7 +7190,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
                              ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
-                             ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4;
+                             ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
                     }
                     bS[0] = bS[1] = bS[2] = bS[3] = v;
                     mv_done = 1;
@@ -6753,7 +7214,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
-                                ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
+                                ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
                                 bS[i] = 1;
                                 break;
                             }
@@ -6830,8 +7291,7 @@ static int decode_slice(H264Context *h){
 
             if(ret>=0) hl_decode_mb(h);
 
-            /* XXX: useless as decode_mb_cabac it doesn't support that ... */
-            if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
+            if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
                 s->mb_y++;
 
                 if(ret>=0) ret = decode_mb_cabac(h);
@@ -6842,7 +7302,7 @@ static int decode_slice(H264Context *h){
             eos = get_cabac_terminate( &h->cabac );
 
             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
-                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+                av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
                 return -1;
             }
@@ -6851,7 +7311,7 @@ static int decode_slice(H264Context *h){
                 s->mb_x = 0;
                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
                 ++s->mb_y;
-                if(h->mb_aff_frame) {
+                if(FRAME_MBAFF) {
                     ++s->mb_y;
                 }
             }
@@ -6869,7 +7329,7 @@ static int decode_slice(H264Context *h){
 
             if(ret>=0) hl_decode_mb(h);
 
-            if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
+            if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
                 s->mb_y++;
                 ret = decode_mb_cavlc(h);
 
@@ -6888,7 +7348,7 @@ static int decode_slice(H264Context *h){
                 s->mb_x=0;
                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
                 ++s->mb_y;
-                if(h->mb_aff_frame) {
+                if(FRAME_MBAFF) {
                     ++s->mb_y;
                 }
                 if(s->mb_y >= s->mb_height){
@@ -6929,7 +7389,7 @@ static int decode_slice(H264Context *h){
             hl_decode_mb(h);
 
             if(ret<0){
-                fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+                av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
 
                 return -1;
@@ -7013,7 +7473,7 @@ static int decode_sei(H264Context *h){
 
         switch(type){
         case 5:
-            if(decode_unregistered_user_data(h, size) < 0);
+            if(decode_unregistered_user_data(h, size) < 0)
                 return -1;
             break;
         default:
@@ -7056,7 +7516,7 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps){
         if( aspect_ratio_idc == EXTENDED_SAR ) {
             sps->sar.num= get_bits(&s->gb, 16);
             sps->sar.den= get_bits(&s->gb, 16);
-        }else if(aspect_ratio_idc < 16){
+        }else if(aspect_ratio_idc < 14){
             sps->sar=  pixel_aspect[aspect_ratio_idc];
         }else{
             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
@@ -7232,6 +7692,13 @@ static inline int decode_seq_parameter_set(H264Context *h){
 
     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
 
+#ifndef ALLOW_INTERLACE
+    if(sps->mb_aff)
+        av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it compilation time\n");
+#endif
+    if(!sps->direct_8x8_inference_flag && sps->mb_aff)
+        av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
+
     sps->crop= get_bits1(&s->gb);
     if(sps->crop){
         sps->crop_left  = get_ue_golomb(&s->gb);
@@ -7330,6 +7797,8 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
     pps->constrained_intra_pred= get_bits1(&s->gb);
     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
 
+    pps->transform_8x8_mode= 0;
+    h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
 
@@ -7402,6 +7871,7 @@ static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
     return END_NOT_FOUND;
 }
 
+#ifdef CONFIG_H264_PARSER
 static int h264_parse(AVCodecParserContext *s,
                       AVCodecContext *avctx,
                       uint8_t **poutbuf, int *poutbuf_size,
@@ -7447,7 +7917,7 @@ static int h264_split(AVCodecContext *avctx,
     }
     return 0;
 }
-
+#endif /* CONFIG_H264_PARSER */
 
 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
     MpegEncContext * const s = &h->s;
@@ -7473,6 +7943,15 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
         nalsize = 0;
         for(i = 0; i < h->nal_length_size; i++)
             nalsize = (nalsize << 8) | buf[buf_index++];
+        if(nalsize <= 1){
+            if(nalsize == 1){
+                buf_index++;
+                continue;
+            }else{
+                av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
+                break;
+            }
+        }
       } else {
         // start code prefix search
         for(; buf_index + 3 < buf_size; buf_index++){
@@ -7487,7 +7966,8 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
       }
 
         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
-        if(ptr[dst_length - 1] == 0) dst_length--;
+        while(ptr[dst_length - 1] == 0 && dst_length > 1)
+            dst_length--;
         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
 
         if(s->avctx->debug&FF_DEBUG_STARTCODE){
@@ -7516,6 +7996,7 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
                 break;
             }
+            s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
             if(h->redundant_pic_count==0 && s->hurry_up < 5
                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
@@ -7585,7 +8066,6 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
 
     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
     s->current_picture_ptr->pict_type= s->pict_type;
-    s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
 
     h->prev_frame_num_offset= h->frame_num_offset;
     h->prev_frame_num= h->frame_num;
@@ -7711,12 +8191,7 @@ static int decode_frame(AVCodecContext *avctx,
         /* Sort B-frames into display order */
         Picture *cur = s->current_picture_ptr;
         Picture *prev = h->delayed_output_pic;
-        int out_idx = 0;
-        int pics = 0;
-        int out_of_order;
-        int cross_idr = 0;
-        int dropped_frame = 0;
-        int i;
+        int i, pics, cross_idr, out_of_order, out_idx;
 
         if(h->sps.bitstream_restriction_flag
            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
@@ -7724,16 +8199,19 @@ static int decode_frame(AVCodecContext *avctx,
             s->low_delay = 0;
         }
 
+        pics = 0;
         while(h->delayed_pic[pics]) pics++;
         h->delayed_pic[pics++] = cur;
         if(cur->reference == 0)
             cur->reference = 1;
 
+        cross_idr = 0;
         for(i=0; h->delayed_pic[i]; i++)
             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
                 cross_idr = 1;
 
         out = h->delayed_pic[0];
+        out_idx = 0;
         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
             if(h->delayed_pic[i]->poc < out->poc){
                 out = h->delayed_pic[i];
@@ -7741,7 +8219,9 @@ static int decode_frame(AVCodecContext *avctx,
             }
 
         out_of_order = !cross_idr && prev && out->poc < prev->poc;
-        if(prev && pics <= s->avctx->has_b_frames)
+        if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
+            { }
+        else if(prev && pics <= s->avctx->has_b_frames)
             out = prev;
         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
            || (s->low_delay &&
@@ -7756,12 +8236,11 @@ static int decode_frame(AVCodecContext *avctx,
             out = prev;
 
         if(out_of_order || pics > s->avctx->has_b_frames){
-            dropped_frame = (out != h->delayed_pic[out_idx]);
             for(i=out_idx; h->delayed_pic[i]; i++)
                 h->delayed_pic[i] = h->delayed_pic[i+1];
         }
 
-        if(prev == out && !dropped_frame)
+        if(prev == out)
             *data_size = 0;
         else
             *data_size = sizeof(AVFrame);
@@ -8009,6 +8488,7 @@ AVCodec h264_decoder = {
     .flush= flush_dpb,
 };
 
+#ifdef CONFIG_H264_PARSER
 AVCodecParser h264_parser = {
     { CODEC_ID_H264 },
     sizeof(H264Context),
@@ -8017,5 +8497,6 @@ AVCodecParser h264_parser = {
     ff_parse_close,
     h264_split,
 };
+#endif
 
 #include "svq3.c"
diff --git a/src/libffmpeg/libavcodec/h264data.h b/src/libffmpeg/libavcodec/h264data.h
index 3132102df..1dd9dafe5 100644
--- a/src/libffmpeg/libavcodec/h264data.h
+++ b/src/libffmpeg/libavcodec/h264data.h
@@ -345,6 +345,44 @@ static const uint8_t zigzag_scan8x8_cavlc[64]={
  5+5*8, 6+5*8, 6+6*8, 7+7*8,
 };
 
+static const uint8_t field_scan8x8[64]={
+ 0+0*8, 0+1*8, 0+2*8, 1+0*8,
+ 1+1*8, 0+3*8, 0+4*8, 1+2*8,
+ 2+0*8, 1+3*8, 0+5*8, 0+6*8,
+ 0+7*8, 1+4*8, 2+1*8, 3+0*8,
+ 2+2*8, 1+5*8, 1+6*8, 1+7*8,
+ 2+3*8, 3+1*8, 4+0*8, 3+2*8,
+ 2+4*8, 2+5*8, 2+6*8, 2+7*8,
+ 3+3*8, 4+1*8, 5+0*8, 4+2*8,
+ 3+4*8, 3+5*8, 3+6*8, 3+7*8,
+ 4+3*8, 5+1*8, 6+0*8, 5+2*8,
+ 4+4*8, 4+5*8, 4+6*8, 4+7*8,
+ 5+3*8, 6+1*8, 6+2*8, 5+4*8,
+ 5+5*8, 5+6*8, 5+7*8, 6+3*8,
+ 7+0*8, 7+1*8, 6+4*8, 6+5*8,
+ 6+6*8, 6+7*8, 7+2*8, 7+3*8,
+ 7+4*8, 7+5*8, 7+6*8, 7+7*8,
+};
+
+static const uint8_t field_scan8x8_cavlc[64]={
+ 0+0*8, 1+1*8, 2+0*8, 0+7*8,
+ 2+2*8, 2+3*8, 2+4*8, 3+3*8,
+ 3+4*8, 4+3*8, 4+4*8, 5+3*8,
+ 5+5*8, 7+0*8, 6+6*8, 7+4*8,
+ 0+1*8, 0+3*8, 1+3*8, 1+4*8,
+ 1+5*8, 3+1*8, 2+5*8, 4+1*8,
+ 3+5*8, 5+1*8, 4+5*8, 6+1*8,
+ 5+6*8, 7+1*8, 6+7*8, 7+5*8,
+ 0+2*8, 0+4*8, 0+5*8, 2+1*8,
+ 1+6*8, 4+0*8, 2+6*8, 5+0*8,
+ 3+6*8, 6+0*8, 4+6*8, 6+2*8,
+ 5+7*8, 6+4*8, 7+2*8, 7+6*8,
+ 1+0*8, 1+2*8, 0+6*8, 3+0*8,
+ 1+7*8, 3+2*8, 2+7*8, 4+2*8,
+ 3+7*8, 5+2*8, 4+7*8, 5+4*8,
+ 6+3*8, 6+5*8, 7+3*8, 7+7*8,
+};
+
 #define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16bit
 #define MB_TYPE_8x8DCT     0x01000000
 #define IS_REF0(a)       ((a)&MB_TYPE_REF0)
diff --git a/src/libffmpeg/libavcodec/h264idct.c b/src/libffmpeg/libavcodec/h264idct.c
index a4ddf1d51..3e44385d5 100755
--- a/src/libffmpeg/libavcodec/h264idct.c
+++ b/src/libffmpeg/libavcodec/h264idct.c
@@ -139,3 +139,28 @@ void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
         dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
     }
 }
+
+// assumes all AC coefs are 0
+void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+    int i, j;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    int dc = (block[0] + 32) >> 6;
+    for( j = 0; j < 4; j++ )
+    {
+        for( i = 0; i < 4; i++ )
+            dst[i] = cm[ dst[i] + dc ];
+        dst += stride;
+    }
+}
+
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+    int i, j;
+    uint8_t *cm = cropTbl + MAX_NEG_CROP;
+    int dc = (block[0] + 32) >> 6;
+    for( j = 0; j < 8; j++ )
+    {
+        for( i = 0; i < 8; i++ )
+            dst[i] = cm[ dst[i] + dc ];
+        dst += stride;
+    }
+}
diff --git a/src/libffmpeg/libavcodec/huffyuv.c b/src/libffmpeg/libavcodec/huffyuv.c
index dc9e123ff..d65943fcc 100644
--- a/src/libffmpeg/libavcodec/huffyuv.c
+++ b/src/libffmpeg/libavcodec/huffyuv.c
@@ -343,7 +343,7 @@ static int read_old_huffman_tables(HYuvContext *s){
 
     return 0;
 #else
-    fprintf(stderr, "v1 huffyuv is not supported \n");
+    av_log(s->avctx, AV_LOG_DEBUG, "v1 huffyuv is not supported \n");
     return -1;
 #endif
 }
@@ -541,9 +541,6 @@ static int encode_init(AVCodecContext *avctx)
         }
         if(s->interlaced != ( s->height > 288 ))
             av_log(avctx, AV_LOG_INFO, "using huffyuv 2.2.0 or newer interlacing flag\n");
-    }else if(avctx->strict_std_compliance>FF_COMPLIANCE_EXPERIMENTAL){
-        av_log(avctx, AV_LOG_ERROR, "This codec is under development; files encoded with it may not be decodable with future versions!!! Set vstrict=-2 / -strict -2 to use it anyway.\n");
-        return -1;
     }
 
     ((uint8_t*)avctx->extradata)[0]= s->predictor;
@@ -808,6 +805,9 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
             return -1;
     }
 
+    if((unsigned)(buf_size-table_size) >= INT_MAX/8)
+        return -1;
+
     init_get_bits(&s->gb, s->bitstream_buffer+table_size, (buf_size-table_size)*8);
 
     fake_ystride= s->interlaced ? p->linesize[0]*2  : p->linesize[0];
@@ -1012,7 +1012,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8
     *picture= *p;
     *data_size = sizeof(AVFrame);
 
-    return (get_bits_count(&s->gb)+31)/32*4;
+    return (get_bits_count(&s->gb)+31)/32*4 + table_size;
 }
 
 static int common_end(HYuvContext *s){
diff --git a/src/libffmpeg/libavcodec/i386/cputest.c b/src/libffmpeg/libavcodec/i386/cputest.c
index 64656c65a..a66bdbe98 100644
--- a/src/libffmpeg/libavcodec/i386/cputest.c
+++ b/src/libffmpeg/libavcodec/i386/cputest.c
@@ -64,6 +64,8 @@ int mm_support(void)
             rval |= MM_MMXEXT | MM_SSE;
         if (std_caps & (1<<26))
             rval |= MM_SSE2;
+        if (ecx & 1)
+            rval |= MM_SSE3;
     }
 
     cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
@@ -76,38 +78,10 @@ int mm_support(void)
             rval |= MM_3DNOWEXT;
         if (ext_caps & (1<<23))
             rval |= MM_MMX;
-    }
-
-    cpuid(0, eax, ebx, ecx, edx);
-    if (       ebx == 0x68747541 &&
-               edx == 0x69746e65 &&
-               ecx == 0x444d4163) {
-        /* AMD */
-        if(ext_caps & (1<<22))
-            rval |= MM_MMXEXT;
-    } else if (ebx == 0x746e6543 &&
-               edx == 0x48727561 &&
-               ecx == 0x736c7561) {  /*  "CentaurHauls" */
-        /* VIA C3 */
-        if(ext_caps & (1<<24))
-          rval |= MM_MMXEXT;
-    } else if (ebx == 0x69727943 &&
-               edx == 0x736e4978 &&
-               ecx == 0x64616574) {
-        /* Cyrix Section */
-        /* See if extended CPUID level 80000001 is supported */
-        /* The value of CPUID/80000001 for the 6x86MX is undefined
-           according to the Cyrix CPU Detection Guide (Preliminary
-           Rev. 1.01 table 1), so we'll check the value of eax for
-           CPUID/0 to see if standard CPUID level 2 is supported.
-           According to the table, the only CPU which supports level
-           2 is also the only one which supports extended CPUID levels.
-        */
-        if (eax < 2)
-            return rval;
-        if (ext_caps & (1<<24))
+        if (ext_caps & (1<<22))
             rval |= MM_MMXEXT;
     }
+
 #if 0
     av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s\n",
         (rval&MM_MMX) ? "MMX ":"",
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c
index d52938ccf..b49c880a7 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c
@@ -37,112 +37,56 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
 
     assert(x<8 && y<8 && x>=0 && y>=0);
 
-    if(y==0)
+    if(y==0 || x==0)
     {
-        /* horizontal filter only */
-        asm volatile("movd %0, %%mm5\n\t"
-                     "punpcklwd %%mm5, %%mm5\n\t"
-                     "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
-                     "movq %1, %%mm4\n\t"
-                     "pxor %%mm7, %%mm7\n\t"
-                     "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
-                     : : "rm" (x), "m" (ff_pw_8));
+        /* 1 dimensional filter only */
+        const int dxy = x ? 1 : stride;
+
+        asm volatile(
+            "movd %0, %%mm5\n\t"
+            "movq %1, %%mm4\n\t"
+            "punpcklwd %%mm5, %%mm5\n\t"
+            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
+            "movq %%mm4, %%mm6\n\t"
+            "pxor %%mm7, %%mm7\n\t"
+            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
+            "psrlw $1, %%mm6\n\t"        /* mm6 = 4 */
+            :: "rm"(x+y), "m"(ff_pw_8));
 
         for(i=0; i<h; i++) {
             asm volatile(
                 /* mm0 = src[0..7], mm1 = src[1..8] */
                 "movq %0, %%mm0\n\t"
-                "movq %1, %%mm1\n\t"
-                : : "m" (src[0]), "m" (src[1]));
+                "movq %1, %%mm2\n\t"
+                :: "m"(src[0]), "m"(src[dxy]));
 
             asm volatile(
-                /* [mm2,mm3] = A * src[0..7] */
-                "movq %%mm0, %%mm2\n\t"
-                "punpcklbw %%mm7, %%mm2\n\t"
-                "pmullw %%mm4, %%mm2\n\t"
-                "movq %%mm0, %%mm3\n\t"
-                "punpckhbw %%mm7, %%mm3\n\t"
-                "pmullw %%mm4, %%mm3\n\t"
-
-                /* [mm2,mm3] += B * src[1..8] */
-                "movq %%mm1, %%mm0\n\t"
+                /* [mm0,mm1] = A * src[0..7] */
+                /* [mm2,mm3] = B * src[1..8] */
+                "movq %%mm0, %%mm1\n\t"
+                "movq %%mm2, %%mm3\n\t"
                 "punpcklbw %%mm7, %%mm0\n\t"
-                "pmullw %%mm5, %%mm0\n\t"
                 "punpckhbw %%mm7, %%mm1\n\t"
-                "pmullw %%mm5, %%mm1\n\t"
-                "paddw %%mm0, %%mm2\n\t"
-                "paddw %%mm1, %%mm3\n\t"
-
-                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
-                "paddw %1, %%mm2\n\t"
-                "paddw %1, %%mm3\n\t"
-                "psrlw $3, %%mm2\n\t"
-                "psrlw $3, %%mm3\n\t"
-                "packuswb %%mm3, %%mm2\n\t"
-                H264_CHROMA_OP(%0, %%mm2)
-                "movq %%mm2, %0\n\t"
-                : "=m" (dst[0]) : "m" (ff_pw_4));
-
-            src += stride;
-            dst += stride;
-        }
-        return;
-    }
-
-    if(x==0)
-    {
-        /* vertical filter only */
-        asm volatile("movd %0, %%mm6\n\t"
-                     "punpcklwd %%mm6, %%mm6\n\t"
-                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
-                     "movq %1, %%mm4\n\t"
-                     "pxor %%mm7, %%mm7\n\t"
-                     "psubw %%mm6, %%mm4\n\t"     /* mm4 = A = 8-y */
-                     : : "rm" (y), "m" (ff_pw_8));
-
-        asm volatile(
-            /* mm0 = src[0..7] */
-            "movq %0, %%mm0\n\t"
-            : : "m" (src[0]));
-
-        for(i=0; i<h; i++) {
-            asm volatile(
-                /* [mm2,mm3] = A * src[0..7] */
-                "movq %mm0, %mm2\n\t"
-                "punpcklbw %mm7, %mm2\n\t"
-                "pmullw %mm4, %mm2\n\t"
-                "movq %mm0, %mm3\n\t"
-                "punpckhbw %mm7, %mm3\n\t"
-                "pmullw %mm4, %mm3\n\t");
+                "punpcklbw %%mm7, %%mm2\n\t"
+                "punpckhbw %%mm7, %%mm3\n\t"
+                "pmullw %%mm4, %%mm0\n\t"
+                "pmullw %%mm4, %%mm1\n\t"
+                "pmullw %%mm5, %%mm2\n\t"
+                "pmullw %%mm5, %%mm3\n\t"
+
+                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+                "paddw %%mm6, %%mm0\n\t"
+                "paddw %%mm6, %%mm1\n\t"
+                "paddw %%mm2, %%mm0\n\t"
+                "paddw %%mm3, %%mm1\n\t"
+                "psrlw $3, %%mm0\n\t"
+                "psrlw $3, %%mm1\n\t"
+                "packuswb %%mm1, %%mm0\n\t"
+                H264_CHROMA_OP(%0, %%mm0)
+                "movq %%mm0, %0\n\t"
+                : "=m" (dst[0]));
 
             src += stride;
-            asm volatile(
-                /* mm0 = src[0..7] */
-                "movq %0, %%mm0\n\t"
-                : : "m" (src[0]));
-
-            asm volatile(
-                /* [mm2,mm3] += C * src[0..7] */
-                "movq %mm0, %mm1\n\t"
-                "punpcklbw %mm7, %mm1\n\t"
-                "pmullw %mm6, %mm1\n\t"
-                "paddw %mm1, %mm2\n\t"
-                "movq %mm0, %mm5\n\t"
-                "punpckhbw %mm7, %mm5\n\t"
-                "pmullw %mm6, %mm5\n\t"
-                "paddw %mm5, %mm3\n\t");
-
-            asm volatile(
-                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
-                "paddw %1, %%mm2\n\t"
-                "paddw %1, %%mm3\n\t"
-                "psrlw $3, %%mm2\n\t"
-                "psrlw $3, %%mm3\n\t"
-                "packuswb %%mm3, %%mm2\n\t"
-                H264_CHROMA_OP(%0, %%mm2)
-                "movq %%mm2, %0\n\t"
-                : "=m" (dst[0]) : "m" (ff_pw_4));
-
             dst += stride;
         }
         return;
@@ -177,57 +121,53 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
         : : "m" (src[0]), "m" (src[1]));
 
     for(i=0; i<h; i++) {
+        src += stride;
+
         asm volatile(
-            /* [mm2,mm3] = A * src[0..7] */
+            /* mm2 = A * src[0..3] + B * src[1..4] */
+            /* mm3 = A * src[4..7] + B * src[5..8] */
             "movq %%mm0, %%mm2\n\t"
+            "movq %%mm1, %%mm3\n\t"
+            "punpckhbw %%mm7, %%mm0\n\t"
+            "punpcklbw %%mm7, %%mm1\n\t"
             "punpcklbw %%mm7, %%mm2\n\t"
-            "pmullw %0, %%mm2\n\t"
-            "movq %%mm0, %%mm3\n\t"
             "punpckhbw %%mm7, %%mm3\n\t"
-            "pmullw %0, %%mm3\n\t"
+            "pmullw %0, %%mm0\n\t"
+            "pmullw %0, %%mm2\n\t"
+            "pmullw %%mm5, %%mm1\n\t"
+            "pmullw %%mm5, %%mm3\n\t"
+            "paddw %%mm1, %%mm2\n\t"
+            "paddw %%mm0, %%mm3\n\t"
+            : : "m" (AA));
 
-            /* [mm2,mm3] += B * src[1..8] */
-            "movq %%mm1, %%mm0\n\t"
+        asm volatile(
+            /* [mm2,mm3] += C * src[0..7] */
+            "movq %0, %%mm0\n\t"
+            "movq %%mm0, %%mm1\n\t"
             "punpcklbw %%mm7, %%mm0\n\t"
-            "pmullw %%mm5, %%mm0\n\t"
             "punpckhbw %%mm7, %%mm1\n\t"
-            "pmullw %%mm5, %%mm1\n\t"
+            "pmullw %%mm6, %%mm0\n\t"
+            "pmullw %%mm6, %%mm1\n\t"
             "paddw %%mm0, %%mm2\n\t"
             "paddw %%mm1, %%mm3\n\t"
-            : : "m" (AA));
+            : : "m" (src[0]));
 
-        src += stride;
         asm volatile(
-            /* mm0 = src[0..7], mm1 = src[1..8] */
-            "movq %0, %%mm0\n\t"
+            /* [mm2,mm3] += D * src[1..8] */
             "movq %1, %%mm1\n\t"
-            : : "m" (src[0]), "m" (src[1]));
-
-        asm volatile(
-            /* [mm2,mm3] += C *  src[0..7] */
-            "movq %mm0, %mm4\n\t"
-            "punpcklbw %mm7, %mm4\n\t"
-            "pmullw %mm6, %mm4\n\t"
-            "paddw %mm4, %mm2\n\t"
-            "movq %mm0, %mm4\n\t"
-            "punpckhbw %mm7, %mm4\n\t"
-            "pmullw %mm6, %mm4\n\t"
-            "paddw %mm4, %mm3\n\t");
-
-        asm volatile(
-            /* [mm2,mm3] += D *  src[1..8] */
-            "movq %%mm1, %%mm4\n\t"
-            "punpcklbw %%mm7, %%mm4\n\t"
-            "pmullw %0, %%mm4\n\t"
-            "paddw %%mm4, %%mm2\n\t"
+            "movq %%mm1, %%mm0\n\t"
             "movq %%mm1, %%mm4\n\t"
+            "punpcklbw %%mm7, %%mm0\n\t"
             "punpckhbw %%mm7, %%mm4\n\t"
-            "pmullw %0, %%mm4\n\t"
+            "pmullw %2, %%mm0\n\t"
+            "pmullw %2, %%mm4\n\t"
+            "paddw %%mm0, %%mm2\n\t"
             "paddw %%mm4, %%mm3\n\t"
-            : : "m" (DD));
+            "movq %0, %%mm0\n\t"
+            : : "m" (src[0]), "m" (src[1]), "m" (DD));
 
         asm volatile(
-            /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
+            /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
             "paddw %1, %%mm2\n\t"
             "paddw %1, %%mm3\n\t"
             "psrlw $6, %%mm2\n\t"
@@ -240,7 +180,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
     }
 }
 
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
     DECLARE_ALIGNED_8(uint64_t, AA);
     DECLARE_ALIGNED_8(uint64_t, DD);
@@ -319,3 +259,66 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
         dst += stride;
     }
 }
+
+#ifdef H264_CHROMA_MC2_TMPL
+static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    int CD=((1<<16)-1)*x*y + 8*y;
+    int AB=((8<<16)-8)*x + 64 - CD;
+    int i;
+
+    asm volatile(
+        /* mm5 = {A,B,A,B} */
+        /* mm6 = {C,D,C,D} */
+        "movd %0, %%mm5\n\t"
+        "movd %1, %%mm6\n\t"
+        "punpckldq %%mm5, %%mm5\n\t"
+        "punpckldq %%mm6, %%mm6\n\t"
+        "pxor %%mm7, %%mm7\n\t"
+        :: "r"(AB), "r"(CD));
+
+    asm volatile(
+        /* mm0 = src[0,1,1,2] */
+        "movd %0, %%mm0\n\t"
+        "punpcklbw %%mm7, %%mm0\n\t"
+        "pshufw $0x94, %%mm0, %%mm0\n\t"
+        :: "m"(src[0]));
+
+    for(i=0; i<h; i++) {
+        asm volatile(
+            /* mm1 = A * src[0,1] + B * src[1,2] */
+            "movq    %%mm0, %%mm1\n\t"
+            "pmaddwd %%mm5, %%mm1\n\t"
+            ::);
+
+        src += stride;
+        asm volatile(
+            /* mm0 = src[0,1,1,2] */
+            "movd %0, %%mm0\n\t"
+            "punpcklbw %%mm7, %%mm0\n\t"
+            "pshufw $0x94, %%mm0, %%mm0\n\t"
+            :: "m"(src[0]));
+
+        asm volatile(
+            /* mm1 += C * src[0,1] + D * src[1,2] */
+            "movq    %%mm0, %%mm2\n\t"
+            "pmaddwd %%mm6, %%mm2\n\t"
+            "paddw   %%mm2, %%mm1\n\t"
+            ::);
+
+        asm volatile(
+            /* dst[0,1] = pack((mm1 + 32) >> 6) */
+            "paddw %1, %%mm1\n\t"
+            "psrlw $6, %%mm1\n\t"
+            "packssdw %%mm7, %%mm1\n\t"
+            "packuswb %%mm7, %%mm1\n\t"
+            /* writes garbage to the right of dst.
+             * ok because partitions are processed from left to right. */
+            H264_CHROMA_OP4(%0, %%mm1, %%mm3)
+            "movd %%mm1, %0\n\t"
+            : "=m" (dst[0]) : "m" (ff_pw_32));
+        dst += stride;
+    }
+}
+#endif
+
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 7d69859a6..ec6b2ad1a 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -23,6 +23,7 @@
 #include "../dsputil.h"
 #include "../simple_idct.h"
 #include "../mpegvideo.h"
+#include "x86_cpu.h"
 #include "mmx.h"
 
 //#undef NDEBUG
@@ -186,6 +187,11 @@ static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xF
 #undef DEF
 #undef PAVGB
 
+#define SBUTTERFLY(a,b,t,n)\
+    "movq " #a ", " #t "              \n\t" /* abcd */\
+    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
+    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
+
 /***********************************/
 /* standard MMX */
 
@@ -1522,11 +1528,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
     "pmaxsw " #z ", " #a "            \n\t"\
     "paddusw " #a ", " #sum "         \n\t"
 
-#define SBUTTERFLY(a,b,t,n)\
-    "movq " #a ", " #t "              \n\t" /* abcd */\
-    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
-    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
-
 #define TRANSPOSE4(a,b,c,d,t)\
     SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
     SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
@@ -2403,6 +2404,124 @@ static void just_return() { return; }
     c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
     c->avg_ ## postfix1 = avg_ ## postfix2;
 
+static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
+    const int w = 8;
+    const int ix = ox>>(16+shift);
+    const int iy = oy>>(16+shift);
+    const int oxs = ox>>4;
+    const int oys = oy>>4;
+    const int dxxs = dxx>>4;
+    const int dxys = dxy>>4;
+    const int dyxs = dyx>>4;
+    const int dyys = dyy>>4;
+    const uint16_t r4[4] = {r,r,r,r};
+    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
+    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
+    const uint64_t shift2 = 2*shift;
+    uint8_t edge_buf[(h+1)*stride];
+    int x, y;
+
+    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
+    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
+    const int dxh = dxy*(h-1);
+    const int dyw = dyx*(w-1);
+    if( // non-constant fullpel offset (3% of blocks)
+        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
+         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
+        // uses more than 16 bits of subpel mv (only at huge resolution)
+        || (dxx|dxy|dyx|dyy)&15 )
+    {
+        //FIXME could still use mmx for some of the rows
+        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
+        return;
+    }
+
+    src += ix + iy*stride;
+    if( (unsigned)ix >= width-w ||
+        (unsigned)iy >= height-h )
+    {
+        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
+        src = edge_buf;
+    }
+
+    asm volatile(
+        "movd         %0, %%mm6 \n\t"
+        "pxor      %%mm7, %%mm7 \n\t"
+        "punpcklwd %%mm6, %%mm6 \n\t"
+        "punpcklwd %%mm6, %%mm6 \n\t"
+        :: "r"(1<<shift)
+    );
+
+    for(x=0; x<w; x+=4){
+        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
+                            oxs - dxys + dxxs*(x+1),
+                            oxs - dxys + dxxs*(x+2),
+                            oxs - dxys + dxxs*(x+3) };
+        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
+                            oys - dyys + dyxs*(x+1),
+                            oys - dyys + dyxs*(x+2),
+                            oys - dyys + dyxs*(x+3) };
+
+        for(y=0; y<h; y++){
+            asm volatile(
+                "movq   %0,  %%mm4 \n\t"
+                "movq   %1,  %%mm5 \n\t"
+                "paddw  %2,  %%mm4 \n\t"
+                "paddw  %3,  %%mm5 \n\t"
+                "movq   %%mm4, %0  \n\t"
+                "movq   %%mm5, %1  \n\t"
+                "psrlw  $12, %%mm4 \n\t"
+                "psrlw  $12, %%mm5 \n\t"
+                : "+m"(*dx4), "+m"(*dy4)
+                : "m"(*dxy4), "m"(*dyy4)
+            );
+
+            asm volatile(
+                "movq   %%mm6, %%mm2 \n\t"
+                "movq   %%mm6, %%mm1 \n\t"
+                "psubw  %%mm4, %%mm2 \n\t"
+                "psubw  %%mm5, %%mm1 \n\t"
+                "movq   %%mm2, %%mm0 \n\t"
+                "movq   %%mm4, %%mm3 \n\t"
+                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
+                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
+                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
+                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
+
+                "movd   %4,    %%mm5 \n\t"
+                "movd   %3,    %%mm4 \n\t"
+                "punpcklbw %%mm7, %%mm5 \n\t"
+                "punpcklbw %%mm7, %%mm4 \n\t"
+                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
+                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
+
+                "movd   %2,    %%mm5 \n\t"
+                "movd   %1,    %%mm4 \n\t"
+                "punpcklbw %%mm7, %%mm5 \n\t"
+                "punpcklbw %%mm7, %%mm4 \n\t"
+                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
+                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
+                "paddw  %5,    %%mm1 \n\t"
+                "paddw  %%mm3, %%mm2 \n\t"
+                "paddw  %%mm1, %%mm0 \n\t"
+                "paddw  %%mm2, %%mm0 \n\t"
+
+                "psrlw    %6,    %%mm0 \n\t"
+                "packuswb %%mm0, %%mm0 \n\t"
+                "movd     %%mm0, %0    \n\t"
+
+                : "=m"(dst[x+y*stride])
+                : "m"(src[0]), "m"(src[1]),
+                  "m"(src[stride]), "m"(src[stride+1]),
+                  "m"(*r4), "m"(shift2)
+            );
+            src += stride;
+        }
+        src += 4-h*stride;
+    }
+}
+
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     long i=0;
 
@@ -2489,8 +2608,36 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
     }
 }
 
+#define PREFETCH(name, op) \
+void name(void *mem, int stride, int h){\
+    const uint8_t *p= mem;\
+    do{\
+        asm volatile(#op" %0" :: "m"(*p));\
+        p+= stride;\
+    }while(--h);\
+}
+PREFETCH(prefetch_mmx2,  prefetcht0)
+PREFETCH(prefetch_3dnow, prefetch)
+#undef PREFETCH
+
 #include "h264dsp_mmx.c"
 
+/* AVS specific */
+void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    put_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    avg_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    put_pixels16_mmx(dst, src, stride, 16);
+}
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+    avg_pixels16_mmx(dst, src, stride, 16);
+}
+
 /* external functions, from idct_mmx.c */
 void ff_mmx_idct(DCTELEM *block);
 void ff_mmxext_idct(DCTELEM *block);
@@ -2564,6 +2711,17 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
 }
 #endif
 
+#ifdef CONFIG_SNOW_ENCODER
+extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
+extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
+extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
+extern void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+#endif
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -2622,6 +2780,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_mmx_idct;
                 }
                 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+#if 0
             }else if(idct_algo==FF_IDCT_VP3){
                 if(mm_flags & MM_SSE2){
                     c->idct_put= ff_vp3_idct_put_sse2;
@@ -2635,6 +2794,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_vp3_idct_mmx;
                     c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                 }
+#endif
+            }else if(idct_algo==FF_IDCT_CAVS){
+                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
 #ifdef CONFIG_GPL
             }else if(idct_algo==FF_IDCT_XVIDMMX){
                 if(mm_flags & MM_MMXEXT){
@@ -2702,6 +2864,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
         c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
 
+        c->gmc= gmc_mmx;
+
         c->add_bytes= add_bytes_mmx;
 #ifdef CONFIG_ENCODERS
         c->diff_bytes= diff_bytes_mmx;
@@ -2732,7 +2896,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
         c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
 
+        c->h264_idct_dc_add=
+        c->h264_idct_add= ff_h264_idct_add_mmx;
+        c->h264_idct8_dc_add=
+        c->h264_idct8_add= ff_h264_idct8_add_mmx;
+
         if (mm_flags & MM_MMXEXT) {
+            c->prefetch = prefetch_mmx2;
+
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
             c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
 
@@ -2753,7 +2924,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->vsad[4]= vsad_intra16_mmx2;
 #endif //CONFIG_ENCODERS
 
-            c->h264_idct_add= ff_h264_idct_add_mmx2;
+            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
+            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
 
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
@@ -2831,6 +3003,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
+            c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
+            c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
             c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
             c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
             c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
@@ -2856,10 +3030,16 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
             c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
 
+#ifdef CONFIG_CAVS_DECODER
+            ff_cavsdsp_init_mmx2(c, avctx);
+#endif
+
 #ifdef CONFIG_ENCODERS
             c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
 #endif //CONFIG_ENCODERS
         } else if (mm_flags & MM_3DNOW) {
+            c->prefetch = prefetch_3dnow;
+
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
 
@@ -2944,6 +3124,19 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
         }
+
+#ifdef CONFIG_SNOW_ENCODER
+        if(mm_flags & MM_SSE2){
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+        }
+        else{
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+        }
+#endif
     }
 
 #ifdef CONFIG_ENCODERS
diff --git a/src/libffmpeg/libavcodec/i386/fft_sse.c b/src/libffmpeg/libavcodec/i386/fft_sse.c
index 54851fb94..631848265 100644
--- a/src/libffmpeg/libavcodec/i386/fft_sse.c
+++ b/src/libffmpeg/libavcodec/i386/fft_sse.c
@@ -23,14 +23,14 @@
 
 #include <xmmintrin.h>
 
-static const float p1p1p1m1[4] __attribute__((aligned(16))) =
-    { 1.0, 1.0, 1.0, -1.0 };
+static const int p1p1p1m1[4] __attribute__((aligned(16))) =
+    { 0, 0, 0, 1 << 31 };
 
-static const float p1p1m1p1[4] __attribute__((aligned(16))) =
-    { 1.0, 1.0, -1.0, 1.0 };
+static const int p1p1m1p1[4] __attribute__((aligned(16))) =
+    { 0, 0, 1 << 31, 0 };
 
-static const float p1p1m1m1[4] __attribute__((aligned(16))) =
-    { 1.0, 1.0, -1.0, -1.0 };
+static const int p1p1m1m1[4] __attribute__((aligned(16))) =
+    { 0, 0, 1 << 31, 1 << 31 };
 
 #if 0
 static void print_v4sf(const char *str, __m128 a)
@@ -58,7 +58,6 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
 
         r = (__m128 *)&z[0];
         c1 = *(__m128 *)p1p1m1m1;
-        c2 = *(__m128 *)p1p1p1m1;
         if (s->inverse)
             c2 = *(__m128 *)p1p1m1p1;
         else
@@ -68,19 +67,20 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
         do {
             a = r[0];
             b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
-            a = _mm_mul_ps(a, c1);
+            a = _mm_xor_ps(a, c1);
             /* do the pass 0 butterfly */
             a = _mm_add_ps(a, b);
 
             a1 = r[1];
             b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
-            a1 = _mm_mul_ps(a1, c1);
+            a1 = _mm_xor_ps(a1, c1);
             /* do the pass 0 butterfly */
             b = _mm_add_ps(a1, b);
 
             /* multiply third by -i */
+            /* by toggling the sign bit */
             b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
-            b = _mm_mul_ps(b, c2);
+            b = _mm_xor_ps(b, c2);
 
             /* do the pass 1 butterfly */
             r[0] = _mm_add_ps(a, b);
diff --git a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c
index 8ab58f389..ac4ad6401 100644
--- a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c
@@ -46,11 +46,6 @@
     SUMSUBD2_AB( s13, d13, t )\
     SUMSUB_BADC( d13, s02, s13, d02 )
 
-#define SBUTTERFLY(a,b,t,n)\
-    "movq " #a ", " #t "                \n\t" /* abcd */\
-    "punpckl" #n " " #b ", " #a "       \n\t" /* aebf */\
-    "punpckh" #n " " #b ", " #t "       \n\t" /* cgdh */\
-
 #define TRANSPOSE4(a,b,c,d,t)\
     SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
     SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
@@ -65,7 +60,7 @@
     "packuswb  "#z",    "#p" \n\t"\
     "movd      "#p",    (%0) \n\t"
 
-void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     /* Load dct coeffs */
     asm volatile(
@@ -104,6 +99,208 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
     );
 }
 
+static inline void h264_idct8_1d(int16_t *block)
+{
+    asm volatile(
+        "movq 112(%0), %%mm7  \n\t"
+        "movq  80(%0), %%mm5  \n\t"
+        "movq  48(%0), %%mm3  \n\t"
+        "movq  16(%0), %%mm1  \n\t"
+
+        "movq   %%mm7, %%mm4  \n\t"
+        "movq   %%mm3, %%mm6  \n\t"
+        "movq   %%mm5, %%mm0  \n\t"
+        "movq   %%mm7, %%mm2  \n\t"
+        "psraw  $1,    %%mm4  \n\t"
+        "psraw  $1,    %%mm6  \n\t"
+        "psubw  %%mm7, %%mm0  \n\t"
+        "psubw  %%mm6, %%mm2  \n\t"
+        "psubw  %%mm4, %%mm0  \n\t"
+        "psubw  %%mm3, %%mm2  \n\t"
+        "psubw  %%mm3, %%mm0  \n\t"
+        "paddw  %%mm1, %%mm2  \n\t"
+
+        "movq   %%mm5, %%mm4  \n\t"
+        "movq   %%mm1, %%mm6  \n\t"
+        "psraw  $1,    %%mm4  \n\t"
+        "psraw  $1,    %%mm6  \n\t"
+        "paddw  %%mm5, %%mm4  \n\t"
+        "paddw  %%mm1, %%mm6  \n\t"
+        "paddw  %%mm7, %%mm4  \n\t"
+        "paddw  %%mm5, %%mm6  \n\t"
+        "psubw  %%mm1, %%mm4  \n\t"
+        "paddw  %%mm3, %%mm6  \n\t"
+
+        "movq   %%mm0, %%mm1  \n\t"
+        "movq   %%mm4, %%mm3  \n\t"
+        "movq   %%mm2, %%mm5  \n\t"
+        "movq   %%mm6, %%mm7  \n\t"
+        "psraw  $2,    %%mm6  \n\t"
+        "psraw  $2,    %%mm3  \n\t"
+        "psraw  $2,    %%mm5  \n\t"
+        "psraw  $2,    %%mm0  \n\t"
+        "paddw  %%mm6, %%mm1  \n\t"
+        "paddw  %%mm2, %%mm3  \n\t"
+        "psubw  %%mm4, %%mm5  \n\t"
+        "psubw  %%mm0, %%mm7  \n\t"
+
+        "movq  32(%0), %%mm2  \n\t"
+        "movq  96(%0), %%mm6  \n\t"
+        "movq   %%mm2, %%mm4  \n\t"
+        "movq   %%mm6, %%mm0  \n\t"
+        "psraw  $1,    %%mm4  \n\t"
+        "psraw  $1,    %%mm6  \n\t"
+        "psubw  %%mm0, %%mm4  \n\t"
+        "paddw  %%mm2, %%mm6  \n\t"
+
+        "movq    (%0), %%mm2  \n\t"
+        "movq  64(%0), %%mm0  \n\t"
+        SUMSUB_BA( %%mm0, %%mm2 )
+        SUMSUB_BA( %%mm6, %%mm0 )
+        SUMSUB_BA( %%mm4, %%mm2 )
+        SUMSUB_BA( %%mm7, %%mm6 )
+        SUMSUB_BA( %%mm5, %%mm4 )
+        SUMSUB_BA( %%mm3, %%mm2 )
+        SUMSUB_BA( %%mm1, %%mm0 )
+        :: "r"(block)
+    );
+}
+
+static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+    int i;
+    int16_t __attribute__ ((aligned(8))) b2[64];
+
+    block[0] += 32;
+
+    for(i=0; i<2; i++){
+        uint64_t tmp;
+
+        h264_idct8_1d(block+4*i);
+
+        asm volatile(
+            "movq   %%mm7,    %0   \n\t"
+            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+            "movq   %%mm0,  8(%1)  \n\t"
+            "movq   %%mm6, 24(%1)  \n\t"
+            "movq   %%mm7, 40(%1)  \n\t"
+            "movq   %%mm4, 56(%1)  \n\t"
+            "movq    %0,    %%mm7  \n\t"
+            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+            "movq   %%mm7,   (%1)  \n\t"
+            "movq   %%mm1, 16(%1)  \n\t"
+            "movq   %%mm0, 32(%1)  \n\t"
+            "movq   %%mm3, 48(%1)  \n\t"
+            : "=m"(tmp)
+            : "r"(b2+32*i)
+            : "memory"
+        );
+    }
+
+    for(i=0; i<2; i++){
+        h264_idct8_1d(b2+4*i);
+
+        asm volatile(
+            "psraw     $6, %%mm7  \n\t"
+            "psraw     $6, %%mm6  \n\t"
+            "psraw     $6, %%mm5  \n\t"
+            "psraw     $6, %%mm4  \n\t"
+            "psraw     $6, %%mm3  \n\t"
+            "psraw     $6, %%mm2  \n\t"
+            "psraw     $6, %%mm1  \n\t"
+            "psraw     $6, %%mm0  \n\t"
+
+            "movq   %%mm7,    (%0)  \n\t"
+            "movq   %%mm5,  16(%0)  \n\t"
+            "movq   %%mm3,  32(%0)  \n\t"
+            "movq   %%mm1,  48(%0)  \n\t"
+            "movq   %%mm0,  64(%0)  \n\t"
+            "movq   %%mm2,  80(%0)  \n\t"
+            "movq   %%mm4,  96(%0)  \n\t"
+            "movq   %%mm6, 112(%0)  \n\t"
+            :: "r"(b2+4*i)
+            : "memory"
+        );
+    }
+
+    add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+    int dc = (block[0] + 32) >> 6;
+    asm volatile(
+        "movd          %0, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t"
+        "packuswb   %%mm0, %%mm0 \n\t"
+        "packuswb   %%mm1, %%mm1 \n\t"
+        ::"r"(dc)
+    );
+    asm volatile(
+        "movd          %0, %%mm2 \n\t"
+        "movd          %1, %%mm3 \n\t"
+        "movd          %2, %%mm4 \n\t"
+        "movd          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t"
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t"
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movd       %%mm2, %0    \n\t"
+        "movd       %%mm3, %1    \n\t"
+        "movd       %%mm4, %2    \n\t"
+        "movd       %%mm5, %3    \n\t"
+        :"+m"(*(uint32_t*)(dst+0*stride)),
+         "+m"(*(uint32_t*)(dst+1*stride)),
+         "+m"(*(uint32_t*)(dst+2*stride)),
+         "+m"(*(uint32_t*)(dst+3*stride))
+    );
+}
+
+static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+    int dc = (block[0] + 32) >> 6;
+    int y;
+    asm volatile(
+        "movd          %0, %%mm0 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t"
+        "packuswb   %%mm0, %%mm0 \n\t"
+        "packuswb   %%mm1, %%mm1 \n\t"
+        ::"r"(dc)
+    );
+    for(y=2; y--; dst += 4*stride){
+    asm volatile(
+        "movq          %0, %%mm2 \n\t"
+        "movq          %1, %%mm3 \n\t"
+        "movq          %2, %%mm4 \n\t"
+        "movq          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t"
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t"
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movq       %%mm2, %0    \n\t"
+        "movq       %%mm3, %1    \n\t"
+        "movq       %%mm4, %2    \n\t"
+        "movq       %%mm5, %3    \n\t"
+        :"+m"(*(uint64_t*)(dst+0*stride)),
+         "+m"(*(uint64_t*)(dst+1*stride)),
+         "+m"(*(uint64_t*)(dst+2*stride)),
+         "+m"(*(uint64_t*)(dst+3*stride))
+    );
+    }
+}
+
 
 /***********************************/
 /* deblocking */
@@ -441,6 +638,50 @@ static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
         : "memory"\
     );\
 }\
+static void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=4;\
+    asm volatile(\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "movq %0, %%mm4             \n\t"\
+        "movq %1, %%mm5             \n\t"\
+        :: "m"(ff_pw_5), "m"(ff_pw_16)\
+    );\
+    do{\
+    asm volatile(\
+        "movd  -1(%0), %%mm1        \n\t"\
+        "movd    (%0), %%mm2        \n\t"\
+        "movd   1(%0), %%mm3        \n\t"\
+        "movd   2(%0), %%mm0        \n\t"\
+        "punpcklbw %%mm7, %%mm1     \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpcklbw %%mm7, %%mm3     \n\t"\
+        "punpcklbw %%mm7, %%mm0     \n\t"\
+        "paddw %%mm0, %%mm1         \n\t"\
+        "paddw %%mm3, %%mm2         \n\t"\
+        "movd  -2(%0), %%mm0        \n\t"\
+        "movd   3(%0), %%mm3        \n\t"\
+        "punpcklbw %%mm7, %%mm0     \n\t"\
+        "punpcklbw %%mm7, %%mm3     \n\t"\
+        "paddw %%mm3, %%mm0         \n\t"\
+        "psllw $2, %%mm2            \n\t"\
+        "psubw %%mm1, %%mm2         \n\t"\
+        "pmullw %%mm4, %%mm2        \n\t"\
+        "paddw %%mm5, %%mm0         \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "movd   (%2), %%mm3         \n\t"\
+        "psraw $5, %%mm0            \n\t"\
+        "packuswb %%mm0, %%mm0      \n\t"\
+        PAVGB" %%mm3, %%mm0         \n\t"\
+        OP(%%mm0, (%1),%%mm6, d)\
+        "add %4, %0                 \n\t"\
+        "add %4, %1                 \n\t"\
+        "add %3, %2                 \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2)\
+        : "D"((long)src2Stride), "S"((long)dstStride)\
+        : "memory"\
+    );\
+    }while(--h);\
+}\
 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     src -= 2*srcStride;\
     asm volatile(\
@@ -591,11 +832,74 @@ static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
     );\
 }\
 \
-static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    int h= 2;\
+static void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=8;\
+    asm volatile(\
+        "pxor %%mm7, %%mm7          \n\t"\
+        "movq %0, %%mm6             \n\t"\
+        :: "m"(ff_pw_5)\
+    );\
+    do{\
+    asm volatile(\
+        "movq    (%0), %%mm0        \n\t"\
+        "movq   1(%0), %%mm2        \n\t"\
+        "movq %%mm0, %%mm1          \n\t"\
+        "movq %%mm2, %%mm3          \n\t"\
+        "punpcklbw %%mm7, %%mm0     \n\t"\
+        "punpckhbw %%mm7, %%mm1     \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpckhbw %%mm7, %%mm3     \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "paddw %%mm3, %%mm1         \n\t"\
+        "psllw $2, %%mm0            \n\t"\
+        "psllw $2, %%mm1            \n\t"\
+        "movq   -1(%0), %%mm2       \n\t"\
+        "movq    2(%0), %%mm4       \n\t"\
+        "movq %%mm2, %%mm3          \n\t"\
+        "movq %%mm4, %%mm5          \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpckhbw %%mm7, %%mm3     \n\t"\
+        "punpcklbw %%mm7, %%mm4     \n\t"\
+        "punpckhbw %%mm7, %%mm5     \n\t"\
+        "paddw %%mm4, %%mm2         \n\t"\
+        "paddw %%mm3, %%mm5         \n\t"\
+        "psubw %%mm2, %%mm0         \n\t"\
+        "psubw %%mm5, %%mm1         \n\t"\
+        "pmullw %%mm6, %%mm0        \n\t"\
+        "pmullw %%mm6, %%mm1        \n\t"\
+        "movd   -2(%0), %%mm2       \n\t"\
+        "movd    7(%0), %%mm5       \n\t"\
+        "punpcklbw %%mm7, %%mm2     \n\t"\
+        "punpcklbw %%mm7, %%mm5     \n\t"\
+        "paddw %%mm3, %%mm2         \n\t"\
+        "paddw %%mm5, %%mm4         \n\t"\
+        "movq %5, %%mm5             \n\t"\
+        "paddw %%mm5, %%mm2         \n\t"\
+        "paddw %%mm5, %%mm4         \n\t"\
+        "paddw %%mm2, %%mm0         \n\t"\
+        "paddw %%mm4, %%mm1         \n\t"\
+        "psraw $5, %%mm0            \n\t"\
+        "psraw $5, %%mm1            \n\t"\
+        "movq (%2), %%mm4           \n\t"\
+        "packuswb %%mm1, %%mm0      \n\t"\
+        PAVGB" %%mm4, %%mm0         \n\t"\
+        OP(%%mm0, (%1),%%mm5, q)\
+        "add %4, %0                 \n\t"\
+        "add %4, %1                 \n\t"\
+        "add %3, %2                 \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2)\
+        : "D"((long)src2Stride), "S"((long)dstStride),\
+          "m"(ff_pw_16)\
+        : "memory"\
+    );\
+    }while(--h);\
+}\
+\
+static inline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+    int w= 2;\
     src -= 2*srcStride;\
     \
-    while(h--){\
+    while(w--){\
       asm volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
         "movd (%0), %%mm0           \n\t"\
@@ -626,13 +930,29 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
         : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
         : "memory"\
      );\
-     src += 4-13*srcStride;\
-     dst +=  4-8*dstStride;\
+     if(h==16){\
+        asm volatile(\
+            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+            \
+           : "+a"(src), "+c"(dst)\
+           : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+           : "memory"\
+        );\
+     }\
+     src += 4-(h+5)*srcStride;\
+     dst += 4-h*dstStride;\
    }\
 }\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    int h=8;\
-    int w=4;\
+static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+    int h = size;\
+    int w = (size+8)>>2;\
     src -= 2*srcStride+2;\
     while(w--){\
         asm volatile(\
@@ -652,23 +972,40 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
             "punpcklbw %%mm7, %%mm2 \n\t"\
             "punpcklbw %%mm7, %%mm3 \n\t"\
             "punpcklbw %%mm7, %%mm4 \n\t"\
-            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
-            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
-            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
-            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
-            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
-            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
-            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
-            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
-             \
+            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
+            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
+            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
+            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
+            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
+            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
+            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
+            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
             : "+a"(src)\
             : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
             : "memory"\
         );\
+        if(size==16){\
+            asm volatile(\
+                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
+                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
+                QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
+                QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
+                QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
+                QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
+                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
+                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
+                : "+a"(src)\
+                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+                : "memory"\
+            );\
+        }\
         tmp += 4;\
-        src += 4 - 13*srcStride;\
+        src += 4 - (size+5)*srcStride;\
     }\
-    tmp -= 4*4;\
+    tmp -= size+8;\
+    w = size>>4;\
+    do{\
+    h = size;\
     asm volatile(\
         "movq %4, %%mm6             \n\t"\
         "1:                         \n\t"\
@@ -702,7 +1039,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
         "psraw $6, %%mm3            \n\t"\
         "packuswb %%mm3, %%mm0      \n\t"\
         OP(%%mm0, (%1),%%mm7, q)\
-        "add $32, %0                \n\t"\
+        "add $48, %0                \n\t"\
         "add %3, %1                 \n\t"\
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
@@ -710,14 +1047,17 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
         : "S"((long)dstStride), "m"(ff_pw_32)\
         : "memory"\
     );\
+    tmp += 8 - size*24;\
+    dst += 8 - size*dstStride;\
+    }while(w--);\
+}\
+\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
 }\
 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
-    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
-    OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
 }\
 \
 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -729,14 +1069,88 @@ static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
 }\
 \
-static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
-    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
-    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
-    src += 8*srcStride;\
+static void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+    src += 8*dstStride;\
     dst += 8*dstStride;\
-    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
-    OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp  , src+8, dstStride, tmpStride, srcStride);\
+    src2 += 8*src2Stride;\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
+}\
+\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+    asm volatile(\
+        "movq       %5,  %%mm6          \n\t"\
+        "movq      (%1), %%mm0          \n\t"\
+        "movq    24(%1), %%mm1          \n\t"\
+        "paddw    %%mm6, %%mm0          \n\t"\
+        "paddw    %%mm6, %%mm1          \n\t"\
+        "psraw      $5,  %%mm0          \n\t"\
+        "psraw      $5,  %%mm1          \n\t"\
+        "packuswb %%mm0, %%mm0          \n\t"\
+        "packuswb %%mm1, %%mm1          \n\t"\
+        PAVGB"     (%0), %%mm0          \n\t"\
+        PAVGB"  (%0,%3), %%mm1          \n\t"\
+        OP(%%mm0, (%2),    %%mm4, d)\
+        OP(%%mm1, (%2,%4), %%mm5, d)\
+        "lea  (%0,%3,2), %0             \n\t"\
+        "lea  (%2,%4,2), %2             \n\t"\
+        "movq    48(%1), %%mm0          \n\t"\
+        "movq    72(%1), %%mm1          \n\t"\
+        "paddw    %%mm6, %%mm0          \n\t"\
+        "paddw    %%mm6, %%mm1          \n\t"\
+        "psraw      $5,  %%mm0          \n\t"\
+        "psraw      $5,  %%mm1          \n\t"\
+        "packuswb %%mm0, %%mm0          \n\t"\
+        "packuswb %%mm1, %%mm1          \n\t"\
+        PAVGB"     (%0), %%mm0          \n\t"\
+        PAVGB"  (%0,%3), %%mm1          \n\t"\
+        OP(%%mm0, (%2),    %%mm4, d)\
+        OP(%%mm1, (%2,%4), %%mm5, d)\
+        :"+a"(src8), "+c"(src16), "+d"(dst)\
+        :"S"((long)src8Stride), "D"((long)dstStride), "m"(ff_pw_16)\
+        :"memory");\
+}\
+static void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+    asm volatile(\
+        "movq       %0,  %%mm6          \n\t"\
+        ::"m"(ff_pw_16)\
+        );\
+    while(h--){\
+    asm volatile(\
+        "movq      (%1), %%mm0          \n\t"\
+        "movq     8(%1), %%mm1          \n\t"\
+        "paddw    %%mm6, %%mm0          \n\t"\
+        "paddw    %%mm6, %%mm1          \n\t"\
+        "psraw      $5,  %%mm0          \n\t"\
+        "psraw      $5,  %%mm1          \n\t"\
+        "packuswb %%mm1, %%mm0          \n\t"\
+        PAVGB"     (%0), %%mm0          \n\t"\
+        OP(%%mm0, (%2), %%mm5, q)\
+        ::"a"(src8), "c"(src16), "d"(dst)\
+        :"memory");\
+        src8 += src8Stride;\
+        src16 += 24;\
+        dst += dstStride;\
+    }\
 }\
+static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
+    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
 
 #define H264_MC(OPNAME, SIZE, MMX) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
@@ -744,10 +1158,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -755,10 +1166,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -780,89 +1188,72 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/4];\
-    uint8_t * const halfH= (uint8_t*)temp;\
-    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const halfV= (uint8_t*)temp;\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/4];\
-    uint8_t * const halfH= (uint8_t*)temp;\
-    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const halfV= (uint8_t*)temp;\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/4];\
-    uint8_t * const halfH= (uint8_t*)temp;\
-    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const halfV= (uint8_t*)temp;\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/4];\
-    uint8_t * const halfH= (uint8_t*)temp;\
-    uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
+    uint64_t temp[SIZE*SIZE/8];\
+    uint8_t * const halfV= (uint8_t*)temp;\
     put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE+8)/4];\
+    uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
     int16_t * const tmp= (int16_t*)temp;\
     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
-    uint8_t * const halfH= (uint8_t*)temp;\
-    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
-    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
+    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
+    uint8_t * const halfHV= (uint8_t*)temp;\
+    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
-    uint8_t * const halfH= (uint8_t*)temp;\
-    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
-    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
+    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
+    uint8_t * const halfHV= (uint8_t*)temp;\
+    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
-    uint8_t * const halfV= (uint8_t*)temp;\
-    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
-    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
+    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
+    int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
+    uint8_t * const halfHV= ((uint8_t*)temp);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
-    uint8_t * const halfV= (uint8_t*)temp;\
-    uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
-    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
+    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
+    int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
+    uint8_t * const halfHV= ((uint8_t*)temp);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
 
 
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
 #define AVG_3DNOW_OP(a,b,temp, size) \
 "mov" #size " " #b ", " #temp "   \n\t"\
 "pavgusb " #temp ", " #a "        \n\t"\
@@ -872,10 +1263,14 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *
 "pavgb " #temp ", " #a "          \n\t"\
 "mov" #size " " #a ", " #b "      \n\t"
 
+#define PAVGB "pavgusb"
 QPEL_H264(put_,       PUT_OP, 3dnow)
 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
+#undef PAVGB
+#define PAVGB "pavgb"
 QPEL_H264(put_,       PUT_OP, mmx2)
 QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
+#undef PAVGB
 
 H264_MC(put_, 4, 3dnow)
 H264_MC(put_, 8, 3dnow)
@@ -895,12 +1290,14 @@ H264_MC(avg_, 16,mmx2)
 #define H264_CHROMA_OP4(S,D,T)
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
+#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC2_TMPL
 #undef H264_CHROMA_MC8_MV0
 
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
@@ -908,12 +1305,14 @@ H264_MC(avg_, 16,mmx2)
                                "pavgb " #T ", " #D " \n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
+#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC2_TMPL
 #undef H264_CHROMA_MC8_MV0
 
 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c b/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c
index 7bc6f5f78..a55d4ea07 100644
--- a/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c
+++ b/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c
@@ -20,7 +20,7 @@
 // *  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 //
 // *
-// * $Id: idct_mmx_xvid.c,v 1.2 2006/02/05 14:11:36 miguelfreitas Exp $
+// * $Id: idct_mmx_xvid.c,v 1.3 2006/08/02 07:02:41 tmmm Exp $
 // *
 // ***************************************************************************/
 
diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h
index df4620e0a..eab051341 100644
--- a/src/libffmpeg/libavcodec/i386/mmx.h
+++ b/src/libffmpeg/libavcodec/i386/mmx.h
@@ -5,22 +5,6 @@
 #ifndef AVCODEC_I386MMX_H
 #define AVCODEC_I386MMX_H
 
-#ifdef ARCH_X86_64
-#  define REG_a "rax"
-#  define REG_b "rbx"
-#  define REG_c "rcx"
-#  define REG_d "rdx"
-#  define REG_D "rdi"
-#  define REG_S "rsi"
-#else
-#  define REG_a "eax"
-#  define REG_b "ebx"
-#  define REG_c "ecx"
-#  define REG_d "edx"
-#  define REG_D "edi"
-#  define REG_S "esi"
-#endif
-
 /*
  * The type of an value that fits in an MMX register (note that long
  * long constant values MUST be suffixed by LL and unsigned long long
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index c14b79384..edcabcf38 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -20,7 +20,7 @@
  * mostly by Michael Niedermayer <michaelni@gmx.at>
  */
 #include "../dsputil.h"
-#include "mmx.h"
+#include "x86_cpu.h"
 
 static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
 0x0000000000000000ULL,
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index f83df3a19..c00a602bd 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -23,7 +23,7 @@
 #include "../dsputil.h"
 #include "../mpegvideo.h"
 #include "../avcodec.h"
-#include "mmx.h"
+#include "x86_cpu.h"
 
 extern uint8_t zigzag_direct_noperm[64];
 extern uint16_t inv_zigzag_direct16[64];
@@ -699,7 +699,8 @@ void MPV_common_init_mmx(MpegEncContext *s)
         s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
-        s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+        if(!(s->flags & CODEC_FLAG_BITEXACT))
+            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
 
         draw_edges = draw_edges_mmx;
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index 2c50df232..de2ef08e5 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -19,6 +19,7 @@
  */
 #undef SPREADW
 #undef PMAXW
+#undef PMAX
 #ifdef HAVE_MMX2
 #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
 #define PMAXW(a,b) "pmaxsw " #a ", " #b "     \n\t"
diff --git a/src/libffmpeg/libavcodec/imgconvert.c b/src/libffmpeg/libavcodec/imgconvert.c
index 850f9b04f..cc1a825fc 100644
--- a/src/libffmpeg/libavcodec/imgconvert.c
+++ b/src/libffmpeg/libavcodec/imgconvert.c
@@ -63,7 +63,7 @@ typedef struct PixFmtInfo {
 } PixFmtInfo;
 
 /* this table gives more information about formats */
-static PixFmtInfo pix_fmt_info[PIX_FMT_NB] = {
+static const PixFmtInfo pix_fmt_info[PIX_FMT_NB] = {
     /* YUV formats */
     [PIX_FMT_YUV420P] = {
         .name = "yuv420p",
@@ -266,7 +266,7 @@ int avpicture_fill(AVPicture *picture, uint8_t *ptr,
                    int pix_fmt, int width, int height)
 {
     int size, w2, h2, size2;
-    PixFmtInfo *pinfo;
+    const PixFmtInfo *pinfo;
 
     if(avcodec_check_dimensions(NULL, width, height))
         goto fail;
@@ -359,7 +359,7 @@ fail:
 int avpicture_layout(const AVPicture* src, int pix_fmt, int width, int height,
                      unsigned char *dest, int dest_size)
 {
-    PixFmtInfo* pf = &pix_fmt_info[pix_fmt];
+    const PixFmtInfo* pf = &pix_fmt_info[pix_fmt];
     int i, j, w, h, data_planes;
     const unsigned char* s;
     int size = avpicture_get_size(pix_fmt, width, height);
@@ -572,7 +572,7 @@ int avcodec_find_best_pix_fmt(int pix_fmt_mask, int src_pix_fmt,
     return dst_pix_fmt;
 }
 
-static void img_copy_plane(uint8_t *dst, int dst_wrap,
+void ff_img_copy_plane(uint8_t *dst, int dst_wrap,
                            const uint8_t *src, int src_wrap,
                            int width, int height)
 {
@@ -592,7 +592,7 @@ void img_copy(AVPicture *dst, const AVPicture *src,
               int pix_fmt, int width, int height)
 {
     int bwidth, bits, i;
-    PixFmtInfo *pf = &pix_fmt_info[pix_fmt];
+    const PixFmtInfo *pf = &pix_fmt_info[pix_fmt];
 
     pf = &pix_fmt_info[pix_fmt];
     switch(pf->pixel_type) {
@@ -612,7 +612,7 @@ void img_copy(AVPicture *dst, const AVPicture *src,
             break;
         }
         bwidth = (width * bits + 7) >> 3;
-        img_copy_plane(dst->data[0], dst->linesize[0],
+        ff_img_copy_plane(dst->data[0], dst->linesize[0],
                        src->data[0], src->linesize[0],
                        bwidth, height);
         break;
@@ -626,17 +626,17 @@ void img_copy(AVPicture *dst, const AVPicture *src,
                 h >>= pf->y_chroma_shift;
             }
             bwidth = (w * pf->depth + 7) >> 3;
-            img_copy_plane(dst->data[i], dst->linesize[i],
+            ff_img_copy_plane(dst->data[i], dst->linesize[i],
                            src->data[i], src->linesize[i],
                            bwidth, h);
         }
         break;
     case FF_PIXEL_PALETTE:
-        img_copy_plane(dst->data[0], dst->linesize[0],
+        ff_img_copy_plane(dst->data[0], dst->linesize[0],
                        src->data[0], src->linesize[0],
                        width, height);
         /* copy the palette */
-        img_copy_plane(dst->data[1], dst->linesize[1],
+        ff_img_copy_plane(dst->data[1], dst->linesize[1],
                        src->data[1], src->linesize[1],
                        4, 256);
         break;
@@ -1210,7 +1210,7 @@ static void shrink12(uint8_t *dst, int dst_wrap,
 }
 
 /* 2x2 -> 1x1 */
-static void shrink22(uint8_t *dst, int dst_wrap,
+void ff_shrink22(uint8_t *dst, int dst_wrap,
                      const uint8_t *src, int src_wrap,
                      int width, int height)
 {
@@ -1243,7 +1243,7 @@ static void shrink22(uint8_t *dst, int dst_wrap,
 }
 
 /* 4x4 -> 1x1 */
-static void shrink44(uint8_t *dst, int dst_wrap,
+void ff_shrink44(uint8_t *dst, int dst_wrap,
                      const uint8_t *src, int src_wrap,
                      int width, int height)
 {
@@ -1273,6 +1273,28 @@ static void shrink44(uint8_t *dst, int dst_wrap,
     }
 }
 
+/* 8x8 -> 1x1 */
+void ff_shrink88(uint8_t *dst, int dst_wrap,
+                     const uint8_t *src, int src_wrap,
+                     int width, int height)
+{
+    int w, i;
+
+    for(;height > 0; height--) {
+        for(w = width;w > 0; w--) {
+            int tmp=0;
+            for(i=0; i<8; i++){
+                tmp += src[0] + src[1] + src[2] + src[3] + src[4] + src[5] + src[6] + src[7];
+                src += src_wrap;
+            }
+            *(dst++) = (tmp + 32)>>6;
+            src += 8 - 8*src_wrap;
+        }
+        src += 8*src_wrap - 8*width;
+        dst += dst_wrap - width;
+    }
+}
+
 static void grow21_line(uint8_t *dst, const uint8_t *src,
                         int width)
 {
@@ -1701,7 +1723,7 @@ typedef struct ConvertEntry {
 
    The other conversion functions are just optimisations for common cases.
 */
-static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
+static const ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
     [PIX_FMT_YUV420P] = {
         [PIX_FMT_YUV422] = {
             .convert = yuv420p_to_yuv422,
@@ -1922,7 +1944,7 @@ static ConvertEntry convert_table[PIX_FMT_NB][PIX_FMT_NB] = {
 int avpicture_alloc(AVPicture *picture,
                            int pix_fmt, int width, int height)
 {
-    unsigned int size;
+    int size;
     void *ptr;
 
     size = avpicture_get_size(pix_fmt, width, height);
@@ -1944,13 +1966,88 @@ void avpicture_free(AVPicture *picture)
 }
 
 /* return true if yuv planar */
-static inline int is_yuv_planar(PixFmtInfo *ps)
+static inline int is_yuv_planar(const PixFmtInfo *ps)
 {
     return (ps->color_type == FF_COLOR_YUV ||
             ps->color_type == FF_COLOR_YUV_JPEG) &&
         ps->pixel_type == FF_PIXEL_PLANAR;
 }
 
+/**
+ * Crop image top and left side
+ */
+int img_crop(AVPicture *dst, const AVPicture *src,
+              int pix_fmt, int top_band, int left_band)
+{
+    int y_shift;
+    int x_shift;
+
+    if (pix_fmt < 0 || pix_fmt >= PIX_FMT_NB || !is_yuv_planar(&pix_fmt_info[pix_fmt]))
+        return -1;
+
+    y_shift = pix_fmt_info[pix_fmt].y_chroma_shift;
+    x_shift = pix_fmt_info[pix_fmt].x_chroma_shift;
+
+    dst->data[0] = src->data[0] + (top_band * src->linesize[0]) + left_band;
+    dst->data[1] = src->data[1] + ((top_band >> y_shift) * src->linesize[1]) + (left_band >> x_shift);
+    dst->data[2] = src->data[2] + ((top_band >> y_shift) * src->linesize[2]) + (left_band >> x_shift);
+
+    dst->linesize[0] = src->linesize[0];
+    dst->linesize[1] = src->linesize[1];
+    dst->linesize[2] = src->linesize[2];
+    return 0;
+}
+
+/**
+ * Pad image
+ */
+int img_pad(AVPicture *dst, const AVPicture *src, int height, int width, int pix_fmt,
+            int padtop, int padbottom, int padleft, int padright, int *color)
+{
+    uint8_t *optr, *iptr;
+    int y_shift;
+    int x_shift;
+    int yheight;
+    int i, y;
+
+    if (pix_fmt < 0 || pix_fmt >= PIX_FMT_NB || !is_yuv_planar(&pix_fmt_info[pix_fmt]))
+        return -1;
+
+    for (i = 0; i < 3; i++) {
+        x_shift = i ? pix_fmt_info[pix_fmt].x_chroma_shift : 0;
+        y_shift = i ? pix_fmt_info[pix_fmt].y_chroma_shift : 0;
+
+        if (padtop || padleft) {
+            memset(dst->data[i], color[i], dst->linesize[i] * (padtop >> y_shift) + (padleft >> x_shift));
+        }
+
+        if (padleft || padright || src) {
+            if (src) { /* first line */
+                iptr = src->data[i];
+                optr = dst->data[i] + dst->linesize[i] * (padtop >> y_shift) + (padleft >> x_shift);
+                memcpy(optr, iptr, src->linesize[i]);
+                iptr += src->linesize[i];
+            }
+            optr = dst->data[i] + dst->linesize[i] * (padtop >> y_shift) + (dst->linesize[i] - (padright >> x_shift));
+            yheight = (height - 1 - (padtop + padbottom)) >> y_shift;
+            for (y = 0; y < yheight; y++) {
+                memset(optr, color[i], (padleft + padright) >> x_shift);
+                if (src) {
+                    memcpy(optr + ((padleft + padright) >> x_shift), iptr, src->linesize[i]);
+                    iptr += src->linesize[i];
+                }
+                optr += dst->linesize[i];
+            }
+        }
+
+        if (padbottom || padright) {
+            optr = dst->data[i] + dst->linesize[i] * ((height - padbottom) >> y_shift) - (padright >> x_shift);
+            memset(optr, color[i], dst->linesize[i] * (padbottom >> y_shift) + (padright >> x_shift));
+        }
+    }
+    return 0;
+}
+
 /* XXX: always use linesize. Return -1 if not supported */
 int img_convert(AVPicture *dst, int dst_pix_fmt,
                 const AVPicture *src, int src_pix_fmt,
@@ -1958,8 +2055,8 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
 {
     static int inited;
     int i, ret, dst_width, dst_height, int_pix_fmt;
-    PixFmtInfo *src_pix, *dst_pix;
-    ConvertEntry *ce;
+    const PixFmtInfo *src_pix, *dst_pix;
+    const ConvertEntry *ce;
     AVPicture tmp1, *tmp = &tmp1;
 
     if (src_pix_fmt < 0 || src_pix_fmt >= PIX_FMT_NB ||
@@ -1998,7 +2095,7 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
         uint8_t *d;
 
         if (dst_pix->color_type == FF_COLOR_YUV_JPEG) {
-            img_copy_plane(dst->data[0], dst->linesize[0],
+            ff_img_copy_plane(dst->data[0], dst->linesize[0],
                      src->data[0], src->linesize[0],
                      dst_width, dst_height);
         } else {
@@ -2026,7 +2123,7 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
     if (is_yuv_planar(src_pix) &&
         dst_pix_fmt == PIX_FMT_GRAY8) {
         if (src_pix->color_type == FF_COLOR_YUV_JPEG) {
-            img_copy_plane(dst->data[0], dst->linesize[0],
+            ff_img_copy_plane(dst->data[0], dst->linesize[0],
                      src->data[0], src->linesize[0],
                      dst_width, dst_height);
         } else {
@@ -2064,7 +2161,7 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
            YUV444 format */
         switch(xy_shift) {
         case 0x00:
-            resize_func = img_copy_plane;
+            resize_func = ff_img_copy_plane;
             break;
         case 0x10:
             resize_func = shrink21;
@@ -2076,10 +2173,10 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
             resize_func = shrink12;
             break;
         case 0x11:
-            resize_func = shrink22;
+            resize_func = ff_shrink22;
             break;
         case 0x22:
-            resize_func = shrink44;
+            resize_func = ff_shrink44;
             break;
         case 0xf0:
             resize_func = grow21;
@@ -2101,7 +2198,7 @@ int img_convert(AVPicture *dst, int dst_pix_fmt,
             goto no_chroma_filter;
         }
 
-        img_copy_plane(dst->data[0], dst->linesize[0],
+        ff_img_copy_plane(dst->data[0], dst->linesize[0],
                        src->data[0], src->linesize[0],
                        dst_width, dst_height);
 
@@ -2226,7 +2323,7 @@ static int get_alpha_info_pal8(const AVPicture *src, int width, int height)
 int img_get_alpha_info(const AVPicture *src,
                        int pix_fmt, int width, int height)
 {
-    PixFmtInfo *pf = &pix_fmt_info[pix_fmt];
+    const PixFmtInfo *pf = &pix_fmt_info[pix_fmt];
     int ret;
 
     pf = &pix_fmt_info[pix_fmt];
diff --git a/src/libffmpeg/libavcodec/imgresample.c b/src/libffmpeg/libavcodec/imgresample.c
index 906fde3f2..8ffcd7960 100644
--- a/src/libffmpeg/libavcodec/imgresample.c
+++ b/src/libffmpeg/libavcodec/imgresample.c
@@ -23,6 +23,7 @@
  */
 
 #include "avcodec.h"
+#include "swscale.h"
 #include "dsputil.h"
 
 #ifdef USE_FASTMEMCPY
@@ -630,6 +631,143 @@ void img_resample_close(ImgReSampleContext *s)
     av_free(s);
 }
 
+struct SwsContext *sws_getContext(int srcW, int srcH, int srcFormat,
+                                  int dstW, int dstH, int dstFormat,
+                                  int flags, SwsFilter *srcFilter,
+                                  SwsFilter *dstFilter, double *param)
+{
+    struct SwsContext *ctx;
+
+    ctx = av_malloc(sizeof(struct SwsContext));
+    if (ctx == NULL) {
+        av_log(NULL, AV_LOG_ERROR, "Cannot allocate a resampling context!\n");
+
+        return NULL;
+    }
+
+    if ((srcH != dstH) || (srcW != dstW)) {
+        if ((srcFormat != PIX_FMT_YUV420P) || (dstFormat != PIX_FMT_YUV420P)) {
+            av_log(NULL, AV_LOG_INFO, "PIX_FMT_YUV420P will be used as an intermediate format for rescaling\n");
+        }
+        ctx->resampling_ctx = img_resample_init(dstW, dstH, srcW, srcH);
+    } else {
+        ctx->resampling_ctx = av_malloc(sizeof(ImgReSampleContext));
+        ctx->resampling_ctx->iheight = srcH;
+        ctx->resampling_ctx->iwidth = srcW;
+        ctx->resampling_ctx->oheight = dstH;
+        ctx->resampling_ctx->owidth = dstW;
+    }
+    ctx->src_pix_fmt = srcFormat;
+    ctx->dst_pix_fmt = dstFormat;
+
+    return ctx;
+}
+
+void sws_freeContext(struct SwsContext *ctx)
+{
+    if ((ctx->resampling_ctx->iwidth != ctx->resampling_ctx->owidth) ||
+        (ctx->resampling_ctx->iheight != ctx->resampling_ctx->oheight)) {
+        img_resample_close(ctx->resampling_ctx);
+    } else {
+        av_free(ctx->resampling_ctx);
+    }
+    av_free(ctx);
+}
+
+int sws_scale(struct SwsContext *ctx, uint8_t* src[], int srcStride[],
+              int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[])
+{
+    AVPicture src_pict, dst_pict;
+    int i, res = 0;
+    AVPicture picture_format_temp;
+    AVPicture picture_resample_temp, *formatted_picture, *resampled_picture;
+    uint8_t *buf1 = NULL, *buf2 = NULL;
+    enum PixelFormat current_pix_fmt;
+
+    for (i = 0; i < 3; i++) {
+        src_pict.data[i] = src[i];
+        src_pict.linesize[i] = srcStride[i];
+        dst_pict.data[i] = dst[i];
+        dst_pict.linesize[i] = dstStride[i];
+    }
+    if ((ctx->resampling_ctx->iwidth != ctx->resampling_ctx->owidth) ||
+        (ctx->resampling_ctx->iheight != ctx->resampling_ctx->oheight)) {
+        /* We have to rescale the picture, but only YUV420P rescaling is supported... */
+
+        if (ctx->src_pix_fmt != PIX_FMT_YUV420P) {
+            int size;
+
+            /* create temporary picture for rescaling input*/
+            size = avpicture_get_size(PIX_FMT_YUV420P, ctx->resampling_ctx->iwidth, ctx->resampling_ctx->iheight);
+            buf1 = av_malloc(size);
+            if (!buf1) {
+                res = -1;
+                goto the_end;
+            }
+            formatted_picture = &picture_format_temp;
+            avpicture_fill((AVPicture*)formatted_picture, buf1,
+                           PIX_FMT_YUV420P, ctx->resampling_ctx->iwidth, ctx->resampling_ctx->iheight);
+
+            if (img_convert((AVPicture*)formatted_picture, PIX_FMT_YUV420P,
+                            &src_pict, ctx->src_pix_fmt,
+                            ctx->resampling_ctx->iwidth, ctx->resampling_ctx->iheight) < 0) {
+
+                av_log(NULL, AV_LOG_ERROR, "pixel format conversion not handled\n");
+                res = -1;
+                goto the_end;
+            }
+        } else {
+            formatted_picture = &src_pict;
+        }
+
+        if (ctx->dst_pix_fmt != PIX_FMT_YUV420P) {
+            int size;
+
+            /* create temporary picture for rescaling output*/
+            size = avpicture_get_size(PIX_FMT_YUV420P, ctx->resampling_ctx->owidth, ctx->resampling_ctx->oheight);
+            buf2 = av_malloc(size);
+            if (!buf2) {
+                res = -1;
+                goto the_end;
+            }
+            resampled_picture = &picture_resample_temp;
+            avpicture_fill((AVPicture*)resampled_picture, buf2,
+                           PIX_FMT_YUV420P, ctx->resampling_ctx->owidth, ctx->resampling_ctx->oheight);
+
+        } else {
+            resampled_picture = &dst_pict;
+        }
+
+        /* ...and finally rescale!!! */
+        img_resample(ctx->resampling_ctx, resampled_picture, formatted_picture);
+        current_pix_fmt = PIX_FMT_YUV420P;
+    } else {
+        resampled_picture = &src_pict;
+        current_pix_fmt = ctx->src_pix_fmt;
+    }
+
+    if (current_pix_fmt != ctx->dst_pix_fmt) {
+        if (img_convert(&dst_pict, ctx->dst_pix_fmt,
+                        resampled_picture, current_pix_fmt,
+                        ctx->resampling_ctx->owidth, ctx->resampling_ctx->oheight) < 0) {
+
+            av_log(NULL, AV_LOG_ERROR, "pixel format conversion not handled\n");
+
+            res = -1;
+            goto the_end;
+        }
+    } else if (resampled_picture != &dst_pict) {
+        img_copy(&dst_pict, resampled_picture, current_pix_fmt,
+                        ctx->resampling_ctx->owidth, ctx->resampling_ctx->oheight);
+    }
+
+the_end:
+    av_free(buf1);
+    av_free(buf2);
+    return res;
+}
+
+
 #ifdef TEST
 #include <stdio.h>
 
diff --git a/src/libffmpeg/libavcodec/interplayvideo.c b/src/libffmpeg/libavcodec/interplayvideo.c
index 588485a0a..73165e795 100644
--- a/src/libffmpeg/libavcodec/interplayvideo.c
+++ b/src/libffmpeg/libavcodec/interplayvideo.c
@@ -49,11 +49,7 @@
 #if DEBUG_INTERPLAY
 #define debug_interplay(x,...) av_log(NULL, AV_LOG_DEBUG, x, __VA_ARGS__)
 #else
-static inline void
-#ifdef __GNUC__
-__attribute__ ((__format__ (__printf__, 1, 2)))
-#endif
-debug_interplay(const char *format, ...) { }
+static inline void debug_interplay(const char *format, ...) { }
 #endif
 
 typedef struct IpvideoContext {
diff --git a/src/libffmpeg/libavcodec/mjpeg.c b/src/libffmpeg/libavcodec/mjpeg.c
index 951a622ee..dffd98946 100644
--- a/src/libffmpeg/libavcodec/mjpeg.c
+++ b/src/libffmpeg/libavcodec/mjpeg.c
@@ -894,14 +894,24 @@ typedef struct MJpegDecodeContext {
 static int mjpeg_decode_dht(MJpegDecodeContext *s);
 
 static int build_vlc(VLC *vlc, const uint8_t *bits_table, const uint8_t *val_table,
-                      int nb_codes, int use_static)
+                      int nb_codes, int use_static, int is_ac)
 {
-    uint8_t huff_size[256];
-    uint16_t huff_code[256];
+    uint8_t huff_size[256+16];
+    uint16_t huff_code[256+16];
+
+    assert(nb_codes <= 256);
 
     memset(huff_size, 0, sizeof(huff_size));
     build_huffman_codes(huff_size, huff_code, bits_table, val_table);
 
+    if(is_ac){
+        memmove(huff_size+16, huff_size, sizeof(uint8_t)*nb_codes);
+        memmove(huff_code+16, huff_code, sizeof(uint16_t)*nb_codes);
+        memset(huff_size, 0, sizeof(uint8_t)*16);
+        memset(huff_code, 0, sizeof(uint16_t)*16);
+        nb_codes += 16;
+    }
+
     return init_vlc(vlc, 9, nb_codes, huff_size, 1, 1, huff_code, 2, 2, use_static);
 }
 
@@ -930,10 +940,10 @@ static int mjpeg_decode_init(AVCodecContext *avctx)
     s->first_picture = 1;
     s->org_height = avctx->coded_height;
 
-    build_vlc(&s->vlcs[0][0], bits_dc_luminance, val_dc_luminance, 12, 0);
-    build_vlc(&s->vlcs[0][1], bits_dc_chrominance, val_dc_chrominance, 12, 0);
-    build_vlc(&s->vlcs[1][0], bits_ac_luminance, val_ac_luminance, 251, 0);
-    build_vlc(&s->vlcs[1][1], bits_ac_chrominance, val_ac_chrominance, 251, 0);
+    build_vlc(&s->vlcs[0][0], bits_dc_luminance, val_dc_luminance, 12, 0, 0);
+    build_vlc(&s->vlcs[0][1], bits_dc_chrominance, val_dc_chrominance, 12, 0, 0);
+    build_vlc(&s->vlcs[1][0], bits_ac_luminance, val_ac_luminance, 251, 0, 1);
+    build_vlc(&s->vlcs[1][1], bits_ac_chrominance, val_ac_chrominance, 251, 0, 1);
 
     if (avctx->flags & CODEC_FLAG_EXTERN_HUFF)
     {
@@ -1084,7 +1094,7 @@ static int mjpeg_decode_dht(MJpegDecodeContext *s)
         free_vlc(&s->vlcs[class][index]);
         dprintf("class=%d index=%d nb_codes=%d\n",
                class, index, code_max + 1);
-        if(build_vlc(&s->vlcs[class][index], bits_table, val_table, code_max + 1, 0) < 0){
+        if(build_vlc(&s->vlcs[class][index], bits_table, val_table, code_max + 1, 0, class > 0) < 0){
             return -1;
         }
     }
@@ -1246,11 +1256,9 @@ static inline int mjpeg_decode_dc(MJpegDecodeContext *s, int dc_index)
 
 /* decode block and dequantize */
 static int decode_block(MJpegDecodeContext *s, DCTELEM *block,
-                        int component, int dc_index, int ac_index, int quant_index)
+                        int component, int dc_index, int ac_index, int16_t *quant_matrix)
 {
     int code, i, j, level, val;
-    VLC *ac_vlc;
-    int16_t *quant_matrix;
 
     /* DC coef */
     val = mjpeg_decode_dc(s, dc_index);
@@ -1258,39 +1266,48 @@ static int decode_block(MJpegDecodeContext *s, DCTELEM *block,
         dprintf("error dc\n");
         return -1;
     }
-    quant_matrix = s->quant_matrixes[quant_index];
     val = val * quant_matrix[0] + s->last_dc[component];
     s->last_dc[component] = val;
     block[0] = val;
     /* AC coefs */
-    ac_vlc = &s->vlcs[1][ac_index];
-    i = 1;
+    i = 0;
+    {OPEN_READER(re, &s->gb)
     for(;;) {
-        code = get_vlc2(&s->gb, s->vlcs[1][ac_index].table, 9, 2);
+        UPDATE_CACHE(re, &s->gb);
+        GET_VLC(code, re, &s->gb, s->vlcs[1][ac_index].table, 9, 2)
 
-        if (code < 0) {
-            dprintf("error ac\n");
-            return -1;
-        }
         /* EOB */
-        if (code == 0)
+        if (code == 0x10)
             break;
-        if (code == 0xf0) {
-            i += 16;
-        } else {
-            level = get_xbits(&s->gb, code & 0xf);
-            i += code >> 4;
-            if (i >= 64) {
+        i += ((unsigned)code) >> 4;
+        if(code != 0x100){
+            code &= 0xf;
+            if(code > MIN_CACHE_BITS - 16){
+                UPDATE_CACHE(re, &s->gb)
+            }
+            {
+                int cache=GET_CACHE(re,&s->gb);
+                int sign=(~cache)>>31;
+                level = (NEG_USR32(sign ^ cache,code) ^ sign) - sign;
+            }
+
+            LAST_SKIP_BITS(re, &s->gb, code)
+
+            if (i >= 63) {
+                if(i == 63){
+                    j = s->scantable.permutated[63];
+                    block[j] = level * quant_matrix[j];
+                    break;
+                }
                 dprintf("error count: %d\n", i);
                 return -1;
             }
             j = s->scantable.permutated[i];
             block[j] = level * quant_matrix[j];
-            i++;
-            if (i >= 64)
-                break;
         }
     }
+    CLOSE_READER(re, &s->gb)}
+
     return 0;
 }
 
@@ -1467,7 +1484,7 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s){
                     memset(s->block, 0, sizeof(s->block));
                     if (decode_block(s, s->block, i,
                                      s->dc_index[i], s->ac_index[i],
-                                     s->quant_index[c]) < 0) {
+                                     s->quant_matrixes[ s->quant_index[c] ]) < 0) {
                         dprintf("error y=%d x=%d\n", mb_y, mb_x);
                         return -1;
                     }
diff --git a/src/libffmpeg/libavcodec/motion_est.c b/src/libffmpeg/libavcodec/motion_est.c
index 991be55d0..c587369f3 100644
--- a/src/libffmpeg/libavcodec/motion_est.c
+++ b/src/libffmpeg/libavcodec/motion_est.c
@@ -368,7 +368,7 @@ static int full_motion_search(MpegEncContext * s,
 #if 0
     if (*mx_ptr < -(2 * range) || *mx_ptr >= (2 * range) ||
         *my_ptr < -(2 * range) || *my_ptr >= (2 * range)) {
-        fprintf(stderr, "error %d %d\n", *mx_ptr, *my_ptr);
+        av_log(NULL, AV_LOG_ERROR, "error %d %d\n", *mx_ptr, *my_ptr);
     }
 #endif
     return dmin;
@@ -904,6 +904,8 @@ static int interlaced_search(MpegEncContext *s, int ref_index,
             int16_t (*mv_table)[2]= mv_tables[block][field_select];
 
             if(user_field_select){
+                assert(field_select==0 || field_select==1);
+                assert(field_select_tables[block][xy]==0 || field_select_tables[block][xy]==1);
                 if(field_select_tables[block][xy] != field_select)
                     continue;
             }
diff --git a/src/libffmpeg/libavcodec/motion_est_template.c b/src/libffmpeg/libavcodec/motion_est_template.c
index 23ead283c..16d34bb88 100644
--- a/src/libffmpeg/libavcodec/motion_est_template.c
+++ b/src/libffmpeg/libavcodec/motion_est_template.c
@@ -896,7 +896,8 @@ static always_inline int epzs_motion_search_internal(MpegEncContext * s, int *mx
         CHECK_CLIPED_MV((last_mv[ref_mv_xy][0]*ref_mv_scale + (1<<15))>>16,
                         (last_mv[ref_mv_xy][1]*ref_mv_scale + (1<<15))>>16)
     }else{
-        if(dmin<h*h && ( P_LEFT[0]    |P_LEFT[1]
+        if(dmin<((h*h*s->avctx->mv0_threshold)>>8)
+                    && ( P_LEFT[0]    |P_LEFT[1]
                         |P_TOP[0]     |P_TOP[1]
                         |P_TOPRIGHT[0]|P_TOPRIGHT[1])==0){
             *mx_ptr= 0;
diff --git a/src/libffmpeg/libavcodec/mpeg12.c b/src/libffmpeg/libavcodec/mpeg12.c
index ddecae85d..c268cf707 100644
--- a/src/libffmpeg/libavcodec/mpeg12.c
+++ b/src/libffmpeg/libavcodec/mpeg12.c
@@ -101,8 +101,8 @@ const enum PixelFormat pixfmt_xvmc_mpg2_420[] = {
 static uint8_t (*mv_penalty)[MAX_MV*2+1]= NULL;
 static uint8_t fcode_tab[MAX_MV*2+1];
 
-static uint32_t uni_mpeg1_ac_vlc_bits[64*64*2];
 static uint8_t  uni_mpeg1_ac_vlc_len [64*64*2];
+static uint8_t  uni_mpeg2_ac_vlc_len [64*64*2];
 
 /* simple include everything table for dc, first byte is bits number next 3 are code*/
 static uint32_t mpeg1_lum_dc_uni[512];
@@ -155,7 +155,7 @@ static void init_2d_vlc_rl(RLTable *rl, int use_static)
 }
 
 #ifdef CONFIG_ENCODERS
-static void init_uni_ac_vlc(RLTable *rl, uint32_t *uni_ac_vlc_bits, uint8_t *uni_ac_vlc_len){
+static void init_uni_ac_vlc(RLTable *rl, uint8_t *uni_ac_vlc_len){
     int i;
 
     for(i=0; i<128; i++){
@@ -174,11 +174,11 @@ static void init_uni_ac_vlc(RLTable *rl, uint32_t *uni_ac_vlc_bits, uint8_t *uni
 
             if (code < 111 /* rl->n */) {
                 /* store the vlc & sign at once */
-                len=   mpeg1_vlc[code][1]+1;
-                bits= (mpeg1_vlc[code][0]<<1) + sign;
+                len=   rl->table_vlc[code][1]+1;
+                bits= (rl->table_vlc[code][0]<<1) + sign;
             } else {
-                len=  mpeg1_vlc[111/*rl->n*/][1]+6;
-                bits= mpeg1_vlc[111/*rl->n*/][0]<<6;
+                len=  rl->table_vlc[111/*rl->n*/][1]+6;
+                bits= rl->table_vlc[111/*rl->n*/][0]<<6;
 
                 bits|= run;
                 if (alevel < 128) {
@@ -195,7 +195,6 @@ static void init_uni_ac_vlc(RLTable *rl, uint32_t *uni_ac_vlc_bits, uint8_t *uni
                 }
             }
 
-            uni_ac_vlc_bits[UNI_AC_ENC_INDEX(run, i)]= bits;
             uni_ac_vlc_len [UNI_AC_ENC_INDEX(run, i)]= len;
         }
     }
@@ -208,7 +207,7 @@ static int find_frame_rate_index(MpegEncContext *s){
     int64_t d;
 
     for(i=1;i<14;i++) {
-        int64_t n0= 1001LL/frame_rate_tab[i].den*frame_rate_tab[i].num*s->avctx->time_base.num;
+        int64_t n0= 1001LL/ff_frame_rate_tab[i].den*ff_frame_rate_tab[i].num*s->avctx->time_base.num;
         int64_t n1= 1001LL*s->avctx->time_base.den;
         if(s->avctx->strict_std_compliance > FF_COMPLIANCE_INOFFICIAL && i>=9) break;
 
@@ -240,6 +239,12 @@ static int encode_init(AVCodecContext *avctx)
         }
     }
 
+    if(avctx->profile == FF_PROFILE_UNKNOWN)
+        avctx->profile = s->chroma_format == CHROMA_420 ? 4 : 0;
+
+    if(avctx->level == FF_LEVEL_UNKNOWN)
+        avctx->level = s->chroma_format == CHROMA_420 ? 8 : 5;
+
     return 0;
 }
 
@@ -264,7 +269,7 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
         if(aspect_ratio==0.0) aspect_ratio= 1.0; //pixel aspect 1:1 (VGA)
 
         if (s->current_picture.key_frame) {
-            AVRational framerate= frame_rate_tab[s->frame_rate_index];
+            AVRational framerate= ff_frame_rate_tab[s->frame_rate_index];
 
             /* mpeg1 header repeated every gop */
             put_header(s, SEQ_START_CODE);
@@ -327,22 +332,14 @@ static void mpeg1_encode_sequence_header(MpegEncContext *s)
             if(s->codec_id == CODEC_ID_MPEG2VIDEO){
                 put_header(s, EXT_START_CODE);
                 put_bits(&s->pb, 4, 1); //seq ext
-                put_bits(&s->pb, 1, 0); //esc
 
-                if(s->avctx->profile == FF_PROFILE_UNKNOWN){
-                    put_bits(&s->pb, 3, 4); //profile
-                }else{
-                    put_bits(&s->pb, 3, s->avctx->profile); //profile
-                }
+                put_bits(&s->pb, 1, s->chroma_format == CHROMA_422); //escx
 
-                if(s->avctx->level == FF_LEVEL_UNKNOWN){
-                    put_bits(&s->pb, 4, 8); //level
-                }else{
-                    put_bits(&s->pb, 4, s->avctx->level); //level
-                }
+                put_bits(&s->pb, 3, s->avctx->profile); //profile
+                put_bits(&s->pb, 4, s->avctx->level); //level
 
                 put_bits(&s->pb, 1, s->progressive_sequence);
-                put_bits(&s->pb, 2, 1); //chroma format 4:2:0
+                put_bits(&s->pb, 2, s->chroma_format);
                 put_bits(&s->pb, 2, 0); //horizontal size ext
                 put_bits(&s->pb, 2, 0); //vertical size ext
                 put_bits(&s->pb, 12, v>>18); //bitrate ext
@@ -476,7 +473,7 @@ void mpeg1_encode_picture_header(MpegEncContext *s, int picture_number)
         put_bits(&s->pb, 1, s->alternate_scan);
         put_bits(&s->pb, 1, s->repeat_first_field);
         s->progressive_frame = s->progressive_sequence;
-        put_bits(&s->pb, 1, s->chroma_420_type=s->progressive_frame);
+        put_bits(&s->pb, 1, s->chroma_format == CHROMA_420 ? s->progressive_frame : 0); /* chroma_420_type */
         put_bits(&s->pb, 1, s->progressive_frame);
         put_bits(&s->pb, 1, 0); //composite_display_flag
     }
@@ -504,9 +501,10 @@ static inline void put_mb_modes(MpegEncContext *s, int n, int bits,
     }
 }
 
-void mpeg1_encode_mb(MpegEncContext *s,
-                     DCTELEM block[6][64],
-                     int motion_x, int motion_y)
+static always_inline void mpeg1_encode_mb_internal(MpegEncContext *s,
+                                                   DCTELEM block[6][64],
+                                                   int motion_x, int motion_y,
+                                                   int mb_block_count)
 {
     int i, cbp;
     const int mb_x = s->mb_x;
@@ -515,9 +513,9 @@ void mpeg1_encode_mb(MpegEncContext *s,
 
     /* compute cbp */
     cbp = 0;
-    for(i=0;i<6;i++) {
+    for(i=0;i<mb_block_count;i++) {
         if (s->block_last_index[i] >= 0)
-            cbp |= 1 << (5 - i);
+            cbp |= 1 << (mb_block_count - 1 - i);
     }
 
     if (cbp == 0 && !first_mb && s->mv_type == MV_TYPE_16X16 &&
@@ -623,8 +621,14 @@ void mpeg1_encode_mb(MpegEncContext *s,
                 }
                 s->mv_bits+= get_bits_diff(s);
             }
-            if(cbp)
-                put_bits(&s->pb, mbPatTable[cbp][1], mbPatTable[cbp][0]);
+            if(cbp) {
+                if (s->chroma_y_shift) {
+                    put_bits(&s->pb, mbPatTable[cbp][1], mbPatTable[cbp][0]);
+                } else {
+                    put_bits(&s->pb, mbPatTable[cbp>>2][1], mbPatTable[cbp>>2][0]);
+                    put_bits(&s->pb, 2, cbp & 3);
+                }
+            }
             s->f_count++;
         } else{
             static const int mb_type_len[4]={0,3,4,2}; //bak,for,bi
@@ -702,11 +706,17 @@ void mpeg1_encode_mb(MpegEncContext *s,
                 }
             }
             s->mv_bits += get_bits_diff(s);
-            if(cbp)
-                put_bits(&s->pb, mbPatTable[cbp][1], mbPatTable[cbp][0]);
+            if(cbp) {
+                if (s->chroma_y_shift) {
+                    put_bits(&s->pb, mbPatTable[cbp][1], mbPatTable[cbp][0]);
+                } else {
+                    put_bits(&s->pb, mbPatTable[cbp>>2][1], mbPatTable[cbp>>2][0]);
+                    put_bits(&s->pb, 2, cbp & 3);
+                }
+            }
         }
-        for(i=0;i<6;i++) {
-            if (cbp & (1 << (5 - i))) {
+        for(i=0;i<mb_block_count;i++) {
+            if (cbp & (1 << (mb_block_count - 1 - i))) {
                 mpeg1_encode_block(s, block[i], i);
             }
         }
@@ -718,6 +728,12 @@ void mpeg1_encode_mb(MpegEncContext *s,
     }
 }
 
+void mpeg1_encode_mb(MpegEncContext *s, DCTELEM block[6][64], int motion_x, int motion_y)
+{
+    if (s->chroma_format == CHROMA_420) mpeg1_encode_mb_internal(s, block, motion_x, motion_y, 6);
+    else                                mpeg1_encode_mb_internal(s, block, motion_x, motion_y, 8);
+}
+
 // RAL: Parameter added: f_or_b_code
 static void mpeg1_encode_motion(MpegEncContext *s, int val, int f_or_b_code)
 {
@@ -775,6 +791,8 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
 
         done=1;
         init_rl(&rl_mpeg1, 1);
+        if(s->intra_vlc_format)
+            init_rl(&rl_mpeg2, 1);
 
         for(i=0; i<64; i++)
         {
@@ -782,7 +800,9 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
                 mpeg1_index_run[0][i]= rl_mpeg1.index_run[0][i];
         }
 
-        init_uni_ac_vlc(&rl_mpeg1, uni_mpeg1_ac_vlc_bits, uni_mpeg1_ac_vlc_len);
+        init_uni_ac_vlc(&rl_mpeg1, uni_mpeg1_ac_vlc_len);
+        if(s->intra_vlc_format)
+            init_uni_ac_vlc(&rl_mpeg2, uni_mpeg2_ac_vlc_len);
 
         /* build unified dc encoding tables */
         for(i=-255; i<256; i++)
@@ -849,9 +869,14 @@ void ff_mpeg1_encode_init(MpegEncContext *s)
         s->min_qcoeff=-2047;
         s->max_qcoeff= 2047;
     }
-    s->intra_ac_vlc_length=
+    if (s->intra_vlc_format) {
+        s->intra_ac_vlc_length=
+        s->intra_ac_vlc_last_length= uni_mpeg2_ac_vlc_len;
+    } else {
+        s->intra_ac_vlc_length=
+        s->intra_ac_vlc_last_length= uni_mpeg1_ac_vlc_len;
+    }
     s->inter_ac_vlc_length=
-    s->intra_ac_vlc_last_length=
     s->inter_ac_vlc_last_length= uni_mpeg1_ac_vlc_len;
 }
 
@@ -898,24 +923,20 @@ static void mpeg1_encode_block(MpegEncContext *s,
 {
     int alevel, level, last_non_zero, dc, diff, i, j, run, last_index, sign;
     int code, component;
-//    RLTable *rl = &rl_mpeg1;
+    const uint16_t (*table_vlc)[2] = rl_mpeg1.table_vlc;
 
     last_index = s->block_last_index[n];
 
     /* DC coef */
     if (s->mb_intra) {
-        component = (n <= 3 ? 0 : n - 4 + 1);
+        component = (n <= 3 ? 0 : (n&1) + 1);
         dc = block[0]; /* overflow is impossible */
         diff = dc - s->last_dc[component];
         encode_dc(s, diff, component);
         s->last_dc[component] = dc;
         i = 1;
-/*
         if (s->intra_vlc_format)
-            rl = &rl_mpeg2;
-        else
-            rl = &rl_mpeg1;
-*/
+            table_vlc = rl_mpeg2.table_vlc;
     } else {
         /* encode the first coefficient : needs to be done here because
            it is handled slightly differently */
@@ -950,14 +971,13 @@ static void mpeg1_encode_block(MpegEncContext *s,
             MASK_ABS(sign, alevel)
             sign&=1;
 
-//            code = get_rl_index(rl, 0, run, alevel);
             if (alevel <= mpeg1_max_level[0][run]){
                 code= mpeg1_index_run[0][run] + alevel - 1;
                 /* store the vlc & sign at once */
-                put_bits(&s->pb, mpeg1_vlc[code][1]+1, (mpeg1_vlc[code][0]<<1) + sign);
+                put_bits(&s->pb, table_vlc[code][1]+1, (table_vlc[code][0]<<1) + sign);
             } else {
                 /* escape seems to be pretty rare <5% so i dont optimize it */
-                put_bits(&s->pb, mpeg1_vlc[111/*rl->n*/][1], mpeg1_vlc[111/*rl->n*/][0]);
+                put_bits(&s->pb, table_vlc[111][1], table_vlc[111][0]);
                 /* escape: only clip in this case */
                 put_bits(&s->pb, 6, run);
                 if(s->codec_id == CODEC_ID_MPEG1VIDEO){
@@ -978,7 +998,7 @@ static void mpeg1_encode_block(MpegEncContext *s,
         }
     }
     /* end of block */
-    put_bits(&s->pb, 2, 0x2);
+    put_bits(&s->pb, table_vlc[112][1], table_vlc[112][0]);
 }
 #endif //CONFIG_ENCODERS
 
@@ -2108,8 +2128,8 @@ static int mpeg_decode_postinit(AVCodecContext *avctx){
 
         if(avctx->sub_id==1){//s->codec_id==avctx->codec_id==CODEC_ID
             //mpeg1 fps
-            avctx->time_base.den     = frame_rate_tab[s->frame_rate_index].num;
-            avctx->time_base.num= frame_rate_tab[s->frame_rate_index].den;
+            avctx->time_base.den= ff_frame_rate_tab[s->frame_rate_index].num;
+            avctx->time_base.num= ff_frame_rate_tab[s->frame_rate_index].den;
             //mpeg1 aspect
             avctx->sample_aspect_ratio= av_d2q(
                     1.0/mpeg1_aspect[s->aspect_ratio_info], 255);
@@ -2119,8 +2139,8 @@ static int mpeg_decode_postinit(AVCodecContext *avctx){
             av_reduce(
                 &s->avctx->time_base.den,
                 &s->avctx->time_base.num,
-                frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num,
-                frame_rate_tab[s->frame_rate_index].den * s1->frame_rate_ext.den,
+                ff_frame_rate_tab[s->frame_rate_index].num * s1->frame_rate_ext.num,
+                ff_frame_rate_tab[s->frame_rate_index].den * s1->frame_rate_ext.den,
                 1<<30);
         //mpeg2 aspect
             if(s->aspect_ratio_info > 1){
@@ -2625,10 +2645,13 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
 
             if(s->mb_y<<field_pic >= s->mb_height){
                 int left= s->gb.size_in_bits - get_bits_count(&s->gb);
+                int is_d10= s->chroma_format==2 && s->pict_type==I_TYPE && avctx->profile==0 && avctx->level==5
+                            && s->intra_dc_precision == 2 && s->q_scale_type == 1 && s->alternate_scan == 0
+                            && s->progressive_frame == 0 /* vbv_delay == 0xBBB || 0xE10*/;
 
-                if(left < 0 || (left && show_bits(&s->gb, FFMIN(left, 23)))
+                if(left < 0 || (left && show_bits(&s->gb, FFMIN(left, 23)) && !is_d10)
                    || (avctx->error_resilience >= FF_ER_AGGRESSIVE && left>8)){
-                    av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d\n", left);
+                    av_log(avctx, AV_LOG_ERROR, "end mismatch left=%d %0X\n", left, show_bits(&s->gb, FFMIN(left, 23)));
                     return -1;
                 }else
                     goto eos;
@@ -2793,12 +2816,10 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
             s->chroma_intra_matrix[j] = v;
         }
 #ifdef DEBUG
-/*
         dprintf("intra matrix present\n");
         for(i=0;i<64;i++)
             dprintf(" %d", s->intra_matrix[s->dsp.idct_permutation[i]]);
-        printf("\n");
-*/
+        dprintf("\n");
 #endif
     } else {
         for(i=0;i<64;i++) {
@@ -2820,12 +2841,10 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
             s->chroma_inter_matrix[j] = v;
         }
 #ifdef DEBUG
-/*
         dprintf("non intra matrix present\n");
         for(i=0;i<64;i++)
             dprintf(" %d", s->inter_matrix[s->dsp.idct_permutation[i]]);
-        printf("\n");
-*/
+        dprintf("\n");
 #endif
     } else {
         for(i=0;i<64;i++) {
@@ -3125,7 +3144,7 @@ static int mpeg_decode_frame(AVCodecContext *avctx,
                         /* skip b frames if we dont have reference frames */
                             if(s2->pict_type==B_TYPE) break;
                         /* skip P frames if we dont have reference frame no valid header */
-                            if(s2->pict_type==P_TYPE && !s2->first_slice) break;
+//                            if(s2->pict_type==P_TYPE && s2->first_field && !s2->first_slice) break;
                         }
                         /* skip b frames if we are in a hurry */
                         if(avctx->hurry_up && s2->pict_type==B_TYPE) break;
@@ -3240,7 +3259,7 @@ AVCodec mpeg1video_encoder = {
     encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .supported_framerates= frame_rate_tab+1,
+    .supported_framerates= ff_frame_rate_tab+1,
     .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
     .capabilities= CODEC_CAP_DELAY,
 };
@@ -3253,8 +3272,8 @@ AVCodec mpeg2video_encoder = {
     encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .supported_framerates= frame_rate_tab+1,
-    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, -1},
+    .supported_framerates= ff_frame_rate_tab+1,
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_YUV422P, -1},
     .capabilities= CODEC_CAP_DELAY,
 };
 #endif
diff --git a/src/libffmpeg/libavcodec/mpeg12data.h b/src/libffmpeg/libavcodec/mpeg12data.h
index a6b49aa78..e9a10ff3a 100644
--- a/src/libffmpeg/libavcodec/mpeg12data.h
+++ b/src/libffmpeg/libavcodec/mpeg12data.h
@@ -332,7 +332,7 @@ static const uint8_t mbMotionVectorTable[17][2] = {
 { 0xc, 10 },
 };
 
-static const AVRational frame_rate_tab[] = {
+const AVRational ff_frame_rate_tab[] = {
     {    0,    0},
     {24000, 1001},
     {   24,    1},
diff --git a/src/libffmpeg/libavcodec/mpegaudiodec.c b/src/libffmpeg/libavcodec/mpegaudiodec.c
index ff1f1113e..0d82e3e98 100644
--- a/src/libffmpeg/libavcodec/mpegaudiodec.c
+++ b/src/libffmpeg/libavcodec/mpegaudiodec.c
@@ -489,10 +489,10 @@ static int decode_init(AVCodecContext * avctx)
 
 #if defined(DEBUG)
         for(j=0;j<8;j++) {
-            printf("win%d=\n", j);
+            av_log(avctx, AV_LOG_DEBUG, "win%d=\n", j);
             for(i=0;i<36;i++)
-                printf("%f, ", (double)mdct_win[j][i] / FRAC_ONE);
-            printf("\n");
+                av_log(avctx, AV_LOG_DEBUG, "%f, ", (double)mdct_win[j][i] / FRAC_ONE);
+            av_log(avctx, AV_LOG_DEBUG, "\n");
         }
 #endif
         init = 1;
@@ -1179,20 +1179,20 @@ static int decode_header(MPADecodeContext *s, uint32_t header)
     }
 
 #if defined(DEBUG)
-    printf("layer%d, %d Hz, %d kbits/s, ",
+    dprintf("layer%d, %d Hz, %d kbits/s, ",
            s->layer, s->sample_rate, s->bit_rate);
     if (s->nb_channels == 2) {
         if (s->layer == 3) {
             if (s->mode_ext & MODE_EXT_MS_STEREO)
-                printf("ms-");
+                dprintf("ms-");
             if (s->mode_ext & MODE_EXT_I_STEREO)
-                printf("i-");
+                dprintf("i-");
         }
-        printf("stereo");
+        dprintf("stereo");
     } else {
-        printf("mono");
+        dprintf("mono");
     }
-    printf("\n");
+    dprintf("\n");
 #endif
     return 0;
 }
@@ -1370,8 +1370,8 @@ static int mp_decode_layer2(MPADecodeContext *s)
     {
         for(ch=0;ch<s->nb_channels;ch++) {
             for(i=0;i<sblimit;i++)
-                printf(" %d", bit_alloc[ch][i]);
-            printf("\n");
+                dprintf(" %d", bit_alloc[ch][i]);
+            dprintf("\n");
         }
     }
 #endif
@@ -1421,12 +1421,12 @@ static int mp_decode_layer2(MPADecodeContext *s)
         for(i=0;i<sblimit;i++) {
             if (bit_alloc[ch][i]) {
                 sf = scale_factors[ch][i];
-                printf(" %d %d %d", sf[0], sf[1], sf[2]);
+                dprintf(" %d %d %d", sf[0], sf[1], sf[2]);
             } else {
-                printf(" -");
+                dprintf(" -");
             }
         }
-        printf("\n");
+        dprintf("\n");
     }
 #endif
 
@@ -1650,7 +1650,7 @@ static int huffman_decode(MPADecodeContext *s, GranuleDef *g,
             if (get_bits_count(&s->gb) >= end_pos)
                 break;
             if (code_table) {
-                code = get_vlc2(&s->gb, vlc->table, 8, 2);
+                code = get_vlc2(&s->gb, vlc->table, 8, 3);
                 if (code < 0)
                     return -1;
                 y = code_table[code];
@@ -2285,11 +2285,11 @@ static int mp_decode_layer3(MPADecodeContext *s)
                 }
 #if defined(DEBUG)
                 {
-                    printf("scfsi=%x gr=%d ch=%d scale_factors:\n",
+                    dprintf("scfsi=%x gr=%d ch=%d scale_factors:\n",
                            g->scfsi, gr, ch);
                     for(i=0;i<j;i++)
-                        printf(" %d", g->scale_factors[i]);
-                    printf("\n");
+                        dprintf(" %d", g->scale_factors[i]);
+                    dprintf("\n");
                 }
 #endif
             } else {
@@ -2342,11 +2342,11 @@ static int mp_decode_layer3(MPADecodeContext *s)
                     g->scale_factors[j] = 0;
 #if defined(DEBUG)
                 {
-                    printf("gr=%d ch=%d scale_factors:\n",
+                    dprintf("gr=%d ch=%d scale_factors:\n",
                            gr, ch);
                     for(i=0;i<40;i++)
-                        printf(" %d", g->scale_factors[i]);
-                    printf("\n");
+                        dprintf(" %d", g->scale_factors[i]);
+                    dprintf("\n");
                 }
 #endif
             }
@@ -2428,10 +2428,10 @@ static int mp_decode_frame(MPADecodeContext *s,
     for(i=0;i<nb_frames;i++) {
         for(ch=0;ch<s->nb_channels;ch++) {
             int j;
-            printf("%d-%d:", i, ch);
+            dprintf("%d-%d:", i, ch);
             for(j=0;j<SBLIMIT;j++)
-                printf(" %0.6f", (double)s->sb_samples[ch][i][j] / FRAC_ONE);
-            printf("\n");
+                dprintf(" %0.6f", (double)s->sb_samples[ch][i][j] / FRAC_ONE);
+            dprintf("\n");
         }
     }
 #endif
diff --git a/src/libffmpeg/libavcodec/mpegvideo.c b/src/libffmpeg/libavcodec/mpegvideo.c
index 77d6691af..f1c1b34bb 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.c
+++ b/src/libffmpeg/libavcodec/mpegvideo.c
@@ -55,6 +55,8 @@ static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
 static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
+static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
+                                   DCTELEM *block, int n, int qscale);
 static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
 static void dct_unquantize_h263_intra_c(MpegEncContext *s,
@@ -239,6 +241,7 @@ void ff_write_quant_matrix(PutBitContext *pb, int16_t *matrix){
 const uint8_t *ff_find_start_code(const uint8_t * restrict p, const uint8_t *end, uint32_t * restrict state){
     int i;
 
+    assert(p<=end);
     if(p>=end)
         return end;
 
@@ -273,6 +276,8 @@ int DCT_common_init(MpegEncContext *s)
     s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_c;
     s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_c;
     s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_c;
+    if(s->flags & CODEC_FLAG_BITEXACT)
+        s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_bitexact;
     s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_c;
 
 #ifdef CONFIG_ENCODERS
@@ -490,8 +495,8 @@ static int init_duplicate_context(MpegEncContext *s, MpegEncContext *base){
     int i;
 
     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
-    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*17*2); //(width + edge + align)*interlaced*MBsize*tolerance
-    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*17;
+    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
+    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
 
      //FIXME should be linesize instead of s->width*2 but that isnt known before get_buffer()
     CHECKED_ALLOCZ(s->me.scratchpad,  (s->width+64)*4*16*2*sizeof(uint8_t))
@@ -930,27 +935,43 @@ int MPV_encode_init(AVCodecContext *avctx)
 
     MPV_encode_defaults(s);
 
-    if(avctx->pix_fmt != PIX_FMT_YUVJ420P && avctx->pix_fmt != PIX_FMT_YUV420P){
-        av_log(avctx, AV_LOG_ERROR, "only YUV420 is supported\n");
-        return -1;
-    }
-
-    if(avctx->codec_id == CODEC_ID_MJPEG || avctx->codec_id == CODEC_ID_LJPEG){
-        if(avctx->strict_std_compliance>FF_COMPLIANCE_INOFFICIAL && avctx->pix_fmt != PIX_FMT_YUVJ420P){
+    switch (avctx->codec_id) {
+    case CODEC_ID_MPEG2VIDEO:
+        if(avctx->pix_fmt != PIX_FMT_YUV420P && avctx->pix_fmt != PIX_FMT_YUV422P){
+            av_log(avctx, AV_LOG_ERROR, "only YUV420 and YUV422 are supported\n");
+            return -1;
+        }
+        break;
+    case CODEC_ID_LJPEG:
+    case CODEC_ID_MJPEG:
+        if(avctx->pix_fmt != PIX_FMT_YUVJ420P && (avctx->pix_fmt != PIX_FMT_YUV420P || avctx->strict_std_compliance>FF_COMPLIANCE_INOFFICIAL)){
             av_log(avctx, AV_LOG_ERROR, "colorspace not supported in jpeg\n");
             return -1;
         }
-    }else{
-        if(avctx->strict_std_compliance>FF_COMPLIANCE_INOFFICIAL && avctx->pix_fmt != PIX_FMT_YUV420P){
-            av_log(avctx, AV_LOG_ERROR, "colorspace not supported\n");
+        break;
+    default:
+        if(avctx->pix_fmt != PIX_FMT_YUV420P){
+            av_log(avctx, AV_LOG_ERROR, "only YUV420 is supported\n");
             return -1;
         }
     }
 
+    switch (avctx->pix_fmt) {
+    case PIX_FMT_YUVJ422P:
+    case PIX_FMT_YUV422P:
+        s->chroma_format = CHROMA_422;
+        break;
+    case PIX_FMT_YUVJ420P:
+    case PIX_FMT_YUV420P:
+    default:
+        s->chroma_format = CHROMA_420;
+        break;
+    }
+
     s->bit_rate = avctx->bit_rate;
     s->width = avctx->width;
     s->height = avctx->height;
-    if(avctx->gop_size > 600){
+    if(avctx->gop_size > 600 && avctx->strict_std_compliance>FF_COMPLIANCE_EXPERIMENTAL){
         av_log(avctx, AV_LOG_ERROR, "Warning keyframe interval too large! reducing it ...\n");
         avctx->gop_size=600;
     }
@@ -994,6 +1015,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->obmc= !!(s->flags & CODEC_FLAG_OBMC);
     s->loop_filter= !!(s->flags & CODEC_FLAG_LOOP_FILTER);
     s->alternate_scan= !!(s->flags & CODEC_FLAG_ALT_SCAN);
+    s->intra_vlc_format= !!(s->flags2 & CODEC_FLAG2_INTRA_VLC);
 
     if(avctx->rc_max_rate && !avctx->rc_buffer_size){
         av_log(avctx, AV_LOG_ERROR, "a vbv buffer size is needed, for encoding with a maximum bitrate\n");
@@ -1078,6 +1100,11 @@ int MPV_encode_init(AVCodecContext *avctx)
         return -1;
     }
 
+    if((s->flags2 & CODEC_FLAG2_INTRA_VLC) && s->codec_id != CODEC_ID_MPEG2VIDEO){
+        av_log(avctx, AV_LOG_ERROR, "intra vlc table not supported by codec\n");
+        return -1;
+    }
+
     if(s->avctx->thread_count > 1 && s->codec_id != CODEC_ID_MPEG4
        && s->codec_id != CODEC_ID_MPEG1VIDEO && s->codec_id != CODEC_ID_MPEG2VIDEO
        && (s->codec_id != CODEC_ID_H263P || !(s->flags & CODEC_FLAG_H263P_SLICE_STRUCT))){
@@ -1104,8 +1131,8 @@ int MPV_encode_init(AVCodecContext *avctx)
     }
 
     if(avctx->b_frame_strategy && (avctx->flags&CODEC_FLAG_PASS2)){
-        av_log(avctx, AV_LOG_ERROR, "b_frame_strategy must be 0 on the second pass\n");
-        return -1;
+        av_log(avctx, AV_LOG_INFO, "notice: b_frame_strategy only affects the first pass\n");
+        avctx->b_frame_strategy = 0;
     }
 
     i= ff_gcd(avctx->time_base.den, avctx->time_base.num);
@@ -1359,10 +1386,6 @@ int MPV_encode_end(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
 
-#ifdef STATS
-    print_stats();
-#endif
-
     ff_rate_control_uninit(s);
 
 /* xine: do not need this for decode or MPEG-1 encoding modes */
@@ -1848,7 +1871,7 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
         const int width = s->avctx->width;
         const int height= s->avctx->height;
         const int mv_sample_log2= 4 - pict->motion_subsample_log2;
-        const int mv_stride= (s->mb_width << mv_sample_log2) + 1;
+        const int mv_stride= (s->mb_width << mv_sample_log2) + (s->codec_id == CODEC_ID_H264 ? 0 : 1);
         s->low_delay=0; //needed to see the vectors without trashing the buffers
 
         avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
@@ -2197,10 +2220,11 @@ static int estimate_best_b_count(MpegEncContext *s){
     int i, j, out_size, p_lambda, b_lambda, lambda2;
     int outbuf_size= s->width * s->height; //FIXME
     uint8_t *outbuf= av_malloc(outbuf_size);
-    ImgReSampleContext *resample;
     int64_t best_rd= INT64_MAX;
     int best_b_count= -1;
 
+    assert(scale>=0 && scale <=3);
+
 //    emms_c();
     p_lambda= s->last_lambda_for[P_TYPE]; //s->next_picture_ptr->quality;
     b_lambda= s->last_lambda_for[B_TYPE]; //p_lambda *ABS(s->avctx->b_quant_factor) + s->avctx->b_quant_offset;
@@ -2222,8 +2246,6 @@ static int estimate_best_b_count(MpegEncContext *s){
     if (avcodec_open(c, codec) < 0)
         return -1;
 
-    resample= img_resample_init(c->width, c->height, s->width, s->height); //FIXME use sws
-
     for(i=0; i<s->max_b_frames+2; i++){
         int ysize= c->width*c->height;
         int csize= (c->width/2)*(c->height/2);
@@ -2246,8 +2268,11 @@ static int estimate_best_b_count(MpegEncContext *s){
         input[i].linesize[1]=
         input[i].linesize[2]= c->width/2;
 
-        if(!i || s->input_picture[i-1])
-            img_resample(resample, &input[i], &pre_input);
+        if(!i || s->input_picture[i-1]){
+            s->dsp.shrink[scale](input[i].data[0], input[i].linesize[0], pre_input.data[0], pre_input.linesize[0], c->width, c->height);
+            s->dsp.shrink[scale](input[i].data[1], input[i].linesize[1], pre_input.data[1], pre_input.linesize[1], c->width>>1, c->height>>1);
+            s->dsp.shrink[scale](input[i].data[2], input[i].linesize[2], pre_input.data[2], pre_input.linesize[2], c->width>>1, c->height>>1);
+        }
     }
 
     for(j=0; j<s->max_b_frames+1; j++){
@@ -2289,7 +2314,6 @@ static int estimate_best_b_count(MpegEncContext *s){
     av_freep(&outbuf);
     avcodec_close(c);
     av_freep(&c);
-    img_resample_close(resample);
 
     for(i=0; i<s->max_b_frames+2; i++){
         av_freep(&input[i].data[0]);
@@ -2365,7 +2389,7 @@ static void select_input_picture(MpegEncContext *s){
                     }
                 }
                 for(i=0; i<s->max_b_frames+1; i++){
-                    if(s->input_picture[i]==NULL || s->input_picture[i]->b_frame_score - 1 > s->mb_num/40) break;
+                    if(s->input_picture[i]==NULL || s->input_picture[i]->b_frame_score - 1 > s->mb_num/s->avctx->b_sensitivity) break;
                 }
 
                 b_frames= FFMAX(0, i-1);
@@ -2472,11 +2496,6 @@ int MPV_encode_picture(AVCodecContext *avctx,
     AVFrame *pic_arg = data;
     int i, stuffing_count;
 
-    if(avctx->pix_fmt != PIX_FMT_YUV420P && avctx->pix_fmt != PIX_FMT_YUVJ420P){
-        av_log(avctx, AV_LOG_ERROR, "this codec supports only YUV420P\n");
-        return -1;
-    }
-
     for(i=0; i<avctx->thread_count; i++){
         int start_y= s->thread_context[i]->start_mb_y;
         int   end_y= s->thread_context[i]->  end_mb_y;
@@ -3378,6 +3397,18 @@ static inline void chroma_4mv_motion_lowres(MpegEncContext *s,
     pix_op[lowres](dest_cr, ptr, s->uvlinesize, block_s, sx, sy);
 }
 
+static inline void prefetch_motion(MpegEncContext *s, uint8_t **pix, int dir){
+    /* fetch pixels for estimated mv 4 macroblocks ahead
+     * optimized for 64byte cache lines */
+    const int shift = s->quarter_sample ? 2 : 1;
+    const int mx= (s->mv[dir][0][0]>>shift) + 16*s->mb_x + 8;
+    const int my= (s->mv[dir][0][1]>>shift) + 16*s->mb_y;
+    int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
+    s->dsp.prefetch(pix[0]+off, s->linesize, 4);
+    off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+    s->dsp.prefetch(pix[1]+off, pix[2]-pix[1], 2);
+}
+
 /**
  * motion compensation of a single macroblock
  * @param s context
@@ -3402,6 +3433,8 @@ static inline void MPV_motion(MpegEncContext *s,
     mb_x = s->mb_x;
     mb_y = s->mb_y;
 
+    prefetch_motion(s, ref_picture, dir);
+
     if(s->obmc && s->pict_type != B_TYPE){
         int16_t mv_cache[4][4][2];
         const int xy= s->mb_x + s->mb_y*s->mb_stride;
@@ -3963,8 +3996,17 @@ static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM bloc
                 add_dequant_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize, s->qscale);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
-                    add_dequant_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
-                    add_dequant_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    if (s->chroma_y_shift){
+                        add_dequant_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    }else{
+                        dct_linesize >>= 1;
+                        dct_offset >>=1;
+                        add_dequant_dct(s, block[4], 4, dest_cb,              dct_linesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[5], 5, dest_cr,              dct_linesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[6], 6, dest_cb + dct_offset, dct_linesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[7], 7, dest_cr + dct_offset, dct_linesize, s->chroma_qscale);
+                    }
                 }
             } else if(s->codec_id != CODEC_ID_WMV2){
                 add_dct(s, block[0], 0, dest_y                          , dct_linesize);
@@ -4006,8 +4048,17 @@ static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM bloc
                 put_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize, s->qscale);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
-                    put_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
-                    put_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    if(s->chroma_y_shift){
+                        put_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
+                        put_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    }else{
+                        dct_offset >>=1;
+                        dct_linesize >>=1;
+                        put_dct(s, block[4], 4, dest_cb,              dct_linesize, s->chroma_qscale);
+                        put_dct(s, block[5], 5, dest_cr,              dct_linesize, s->chroma_qscale);
+                        put_dct(s, block[6], 6, dest_cb + dct_offset, dct_linesize, s->chroma_qscale);
+                        put_dct(s, block[7], 7, dest_cr + dct_offset, dct_linesize, s->chroma_qscale);
+                    }
                 }
             }else{
                 s->dsp.idct_put(dest_y                          , dct_linesize, block[0]);
@@ -4229,19 +4280,19 @@ static void get_vissual_weight(int16_t *weight, uint8_t *ptr, int stride){
     }
 }
 
-static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
+static always_inline void encode_mb_internal(MpegEncContext *s, int motion_x, int motion_y, int mb_block_height, int mb_block_count)
 {
-    int16_t weight[6][64];
-    DCTELEM orig[6][64];
+    int16_t weight[8][64];
+    DCTELEM orig[8][64];
     const int mb_x= s->mb_x;
     const int mb_y= s->mb_y;
     int i;
-    int skip_dct[6];
+    int skip_dct[8];
     int dct_offset   = s->linesize*8; //default for progressive frames
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     int wrap_y, wrap_c;
 
-    for(i=0; i<6; i++) skip_dct[i]=0;
+    for(i=0; i<mb_block_count; i++) skip_dct[i]=0;
 
     if(s->adaptive_quant){
         const int last_qp= s->qscale;
@@ -4277,16 +4328,16 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     wrap_y = s->linesize;
     wrap_c = s->uvlinesize;
     ptr_y = s->new_picture.data[0] + (mb_y * 16 * wrap_y) + mb_x * 16;
-    ptr_cb = s->new_picture.data[1] + (mb_y * 8 * wrap_c) + mb_x * 8;
-    ptr_cr = s->new_picture.data[2] + (mb_y * 8 * wrap_c) + mb_x * 8;
+    ptr_cb = s->new_picture.data[1] + (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+    ptr_cr = s->new_picture.data[2] + (mb_y * mb_block_height * wrap_c) + mb_x * 8;
 
     if(mb_x*16+16 > s->width || mb_y*16+16 > s->height){
         uint8_t *ebuf= s->edge_emu_buffer + 32;
         ff_emulated_edge_mc(ebuf            , ptr_y , wrap_y,16,16,mb_x*16,mb_y*16, s->width   , s->height);
         ptr_y= ebuf;
-        ff_emulated_edge_mc(ebuf+18*wrap_y  , ptr_cb, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+        ff_emulated_edge_mc(ebuf+18*wrap_y  , ptr_cb, wrap_c, 8, mb_block_height, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
         ptr_cb= ebuf+18*wrap_y;
-        ff_emulated_edge_mc(ebuf+18*wrap_y+8, ptr_cr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+        ff_emulated_edge_mc(ebuf+18*wrap_y+8, ptr_cr, wrap_c, 8, mb_block_height, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
         ptr_cr= ebuf+18*wrap_y+8;
     }
 
@@ -4306,6 +4357,8 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
                     dct_offset= wrap_y;
                     wrap_y<<=1;
+                    if (s->chroma_format == CHROMA_422)
+                        wrap_c<<=1;
                 }
             }
         }
@@ -4321,6 +4374,10 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         }else{
             s->dsp.get_pixels(s->block[4], ptr_cb, wrap_c);
             s->dsp.get_pixels(s->block[5], ptr_cr, wrap_c);
+            if(!s->chroma_y_shift){ /* 422 */
+                s->dsp.get_pixels(s->block[6], ptr_cb + (dct_offset>>1), wrap_c);
+                s->dsp.get_pixels(s->block[7], ptr_cr + (dct_offset>>1), wrap_c);
+            }
         }
     }else{
         op_pixels_func (*op_pix)[4];
@@ -4366,6 +4423,8 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
                     dct_offset= wrap_y;
                     wrap_y<<=1;
+                    if (s->chroma_format == CHROMA_422)
+                        wrap_c<<=1;
                 }
             }
         }
@@ -4381,6 +4440,10 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         }else{
             s->dsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
             s->dsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
+            if(!s->chroma_y_shift){ /* 422 */
+                s->dsp.diff_pixels(s->block[6], ptr_cb + (dct_offset>>1), dest_cb + (dct_offset>>1), wrap_c);
+                s->dsp.diff_pixels(s->block[7], ptr_cr + (dct_offset>>1), dest_cr + (dct_offset>>1), wrap_c);
+            }
         }
         /* pre quantization */
         if(s->current_picture.mc_mb_var[s->mb_stride*mb_y+ mb_x]<2*s->qscale*s->qscale){
@@ -4391,6 +4454,10 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             if(s->dsp.sad[1](NULL, ptr_y +dct_offset+ 8, dest_y +dct_offset+ 8, wrap_y, 8) < 20*s->qscale) skip_dct[3]= 1;
             if(s->dsp.sad[1](NULL, ptr_cb              , dest_cb              , wrap_c, 8) < 20*s->qscale) skip_dct[4]= 1;
             if(s->dsp.sad[1](NULL, ptr_cr              , dest_cr              , wrap_c, 8) < 20*s->qscale) skip_dct[5]= 1;
+            if(!s->chroma_y_shift){ /* 422 */
+                if(s->dsp.sad[1](NULL, ptr_cb +(dct_offset>>1), dest_cb +(dct_offset>>1), wrap_c, 8) < 20*s->qscale) skip_dct[6]= 1;
+                if(s->dsp.sad[1](NULL, ptr_cr +(dct_offset>>1), dest_cr +(dct_offset>>1), wrap_c, 8) < 20*s->qscale) skip_dct[7]= 1;
+            }
         }
     }
 
@@ -4401,13 +4468,17 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         if(!skip_dct[3]) get_vissual_weight(weight[3], ptr_y + dct_offset + 8, wrap_y);
         if(!skip_dct[4]) get_vissual_weight(weight[4], ptr_cb                , wrap_c);
         if(!skip_dct[5]) get_vissual_weight(weight[5], ptr_cr                , wrap_c);
-        memcpy(orig[0], s->block[0], sizeof(DCTELEM)*64*6);
+        if(!s->chroma_y_shift){ /* 422 */
+            if(!skip_dct[6]) get_vissual_weight(weight[6], ptr_cb + (dct_offset>>1), wrap_c);
+            if(!skip_dct[7]) get_vissual_weight(weight[7], ptr_cr + (dct_offset>>1), wrap_c);
+        }
+        memcpy(orig[0], s->block[0], sizeof(DCTELEM)*64*mb_block_count);
     }
 
     /* DCT & quantize */
     assert(s->out_format!=FMT_MJPEG || s->qscale==8);
     {
-        for(i=0;i<6;i++) {
+        for(i=0;i<mb_block_count;i++) {
             if(!skip_dct[i]){
                 int overflow;
                 s->block_last_index[i] = s->dct_quantize(s, s->block[i], i, s->qscale, &overflow);
@@ -4419,7 +4490,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
                 s->block_last_index[i]= -1;
         }
         if(s->avctx->quantizer_noise_shaping){
-            for(i=0;i<6;i++) {
+            for(i=0;i<mb_block_count;i++) {
                 if(!skip_dct[i]){
                     s->block_last_index[i] = dct_quantize_refine(s, s->block[i], weight[i], orig[i], i, s->qscale);
                 }
@@ -4430,11 +4501,11 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             for(i=0; i<4; i++)
                 dct_single_coeff_elimination(s, i, s->luma_elim_threshold);
         if(s->chroma_elim_threshold && !s->mb_intra)
-            for(i=4; i<6; i++)
+            for(i=4; i<mb_block_count; i++)
                 dct_single_coeff_elimination(s, i, s->chroma_elim_threshold);
 
         if(s->flags & CODEC_FLAG_CBP_RD){
-            for(i=0;i<6;i++) {
+            for(i=0;i<mb_block_count;i++) {
                 if(s->block_last_index[i] == -1)
                     s->coded_score[i]= INT_MAX/256;
             }
@@ -4450,7 +4521,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
     //non c quantize code returns incorrect block_last_index FIXME
     if(s->alternate_scan && s->dct_quantize != dct_quantize_c){
-        for(i=0; i<6; i++){
+        for(i=0; i<mb_block_count; i++){
             int j;
             if(s->block_last_index[i]>0){
                 for(j=63; j>0; j--){
@@ -4494,6 +4565,12 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     }
 }
 
+static always_inline void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
+{
+    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 6);
+    else                                encode_mb_internal(s, motion_x, motion_y, 16, 8);
+}
+
 #endif //CONFIG_ENCODERS
 
 void ff_mpeg_flush(AVCodecContext *avctx){
@@ -4603,7 +4680,7 @@ static inline void copy_context_after_encode(MpegEncContext *d, MpegEncContext *
         d->tex_pb= s->tex_pb;
     }
     d->block= s->block;
-    for(i=0; i<6; i++)
+    for(i=0; i<8; i++)
         d->block_last_index[i]= s->block_last_index[i];
     d->interlaced_dct= s->interlaced_dct;
     d->qscale= s->qscale;
@@ -5621,7 +5698,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         for(i=1;i<64;i++){
             int j= s->dsp.idct_permutation[i];
 
-            s->intra_matrix[j] = clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3) & 0xFF;
+            s->intra_matrix[j] = clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
         }
         convert_matrix(&s->dsp, s->q_intra_matrix, s->q_intra_matrix16,
                        s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
@@ -6581,6 +6658,39 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     }
 }
 
+static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
+                                   DCTELEM *block, int n, int qscale)
+{
+    int i, level, nCoeffs;
+    const uint16_t *quant_matrix;
+    int sum=-1;
+
+    if(s->alternate_scan) nCoeffs= 63;
+    else nCoeffs= s->block_last_index[n];
+
+    if (n < 4)
+        block[0] = block[0] * s->y_dc_scale;
+    else
+        block[0] = block[0] * s->c_dc_scale;
+    quant_matrix = s->intra_matrix;
+    for(i=1;i<=nCoeffs;i++) {
+        int j= s->intra_scantable.permutated[i];
+        level = block[j];
+        if (level) {
+            if (level < 0) {
+                level = -level;
+                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = -level;
+            } else {
+                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+            }
+            block[j] = level;
+            sum+=level;
+        }
+    }
+    block[63]^=sum&1;
+}
+
 static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale)
 {
diff --git a/src/libffmpeg/libavcodec/mpegvideo.h b/src/libffmpeg/libavcodec/mpegvideo.h
index 888b0b608..023e65700 100644
--- a/src/libffmpeg/libavcodec/mpegvideo.h
+++ b/src/libffmpeg/libavcodec/mpegvideo.h
@@ -123,6 +123,7 @@ typedef struct RateControlContext{
 
     void *non_lavc_opaque;        ///< context for non lavc rc code (for example xvid)
     float dry_run_qscale;         ///< for xvid rc
+    int last_picture_number;      ///< for xvid rc
 }RateControlContext;
 
 /**
@@ -429,6 +430,7 @@ typedef struct MpegEncContext {
     int field_select[2][2];
     int last_mv[2][2][2];             ///< last MV, used for MV prediction in MPEG1 & B-frame MPEG4
     uint8_t *fcode_tab;               ///< smallest fcode needed for each MV
+    int16_t direct_scale_mv[2][64];   ///< precomputed to avoid divisions in ff_mpeg4_set_direct_mv
 
     MotionEstContext me;
 
@@ -484,7 +486,7 @@ typedef struct MpegEncContext {
     uint8_t *chroma_dc_vlc_length;
 #define UNI_AC_ENC_INDEX(run,level) ((run)*128 + (level))
 
-    int coded_score[6];
+    int coded_score[8];
 
     /** precomputed matrix (combine qscale and DCT renorm) */
     int (*q_intra_matrix)[64];
@@ -600,6 +602,7 @@ typedef struct MpegEncContext {
     int vo_type;
     int vol_control_parameters;      ///< does the stream contain the low_delay flag, used to workaround buggy encoders
     int intra_dc_threshold;          ///< QP above whch the ac VLC should be used for intra dc
+    int use_intra_dc_vlc;
     PutBitContext tex_pb;            ///< used for data partitioned VOPs
     PutBitContext pb2;               ///< used for data partitioned VOPs
     int mpeg_quant;
@@ -696,7 +699,7 @@ typedef struct MpegEncContext {
     short * pblocks[12];
 
     DCTELEM (*block)[64]; ///< points to one of the following blocks
-    DCTELEM (*blocks)[6][64]; // for HQ mode we need to keep the best block
+    DCTELEM (*blocks)[8][64]; // for HQ mode we need to keep the best block
     int (*decode_mb)(struct MpegEncContext *s, DCTELEM block[6][64]); // used by some codecs to avoid a switch()
 #define SLICE_OK         0
 #define SLICE_ERROR     -1
diff --git a/src/libffmpeg/libavcodec/msmpeg4.c b/src/libffmpeg/libavcodec/msmpeg4.c
index 5bb7158e6..6d83f5c6c 100644
--- a/src/libffmpeg/libavcodec/msmpeg4.c
+++ b/src/libffmpeg/libavcodec/msmpeg4.c
@@ -75,7 +75,7 @@ static int msmpeg4v12_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 static int wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
 
-/* vc9 externs */
+/* vc1 externs */
 extern uint8_t wmv3_dc_scale_table[32];
 
 #ifdef DEBUG
@@ -89,70 +89,6 @@ int frame_count = 0;
 static uint8_t rl_length[NB_RL_TABLES][MAX_LEVEL+1][MAX_RUN+1][2];
 #endif //CONFIG_ENCODERS
 
-#ifdef STATS
-
-const char *st_names[ST_NB] = {
-    "unknown",
-    "dc",
-    "intra_ac",
-    "inter_ac",
-    "intra_mb",
-    "inter_mb",
-    "mv",
-};
-
-int st_current_index = 0;
-unsigned int st_bit_counts[ST_NB];
-unsigned int st_out_bit_counts[ST_NB];
-
-#define set_stat(var) st_current_index = var;
-
-void print_stats(void)
-{
-    unsigned int total;
-    int i;
-
-    printf("Input:\n");
-    total = 0;
-    for(i=0;i<ST_NB;i++)
-        total += st_bit_counts[i];
-    if (total == 0)
-        total = 1;
-    for(i=0;i<ST_NB;i++) {
-        printf("%-10s : %10.1f %5.1f%%\n",
-               st_names[i],
-               (double)st_bit_counts[i] / 8.0,
-               (double)st_bit_counts[i] * 100.0 / total);
-    }
-    printf("%-10s : %10.1f %5.1f%%\n",
-           "total",
-           (double)total / 8.0,
-           100.0);
-
-    printf("Output:\n");
-    total = 0;
-    for(i=0;i<ST_NB;i++)
-        total += st_out_bit_counts[i];
-    if (total == 0)
-        total = 1;
-    for(i=0;i<ST_NB;i++) {
-        printf("%-10s : %10.1f %5.1f%%\n",
-               st_names[i],
-               (double)st_out_bit_counts[i] / 8.0,
-               (double)st_out_bit_counts[i] * 100.0 / total);
-    }
-    printf("%-10s : %10.1f %5.1f%%\n",
-           "total",
-           (double)total / 8.0,
-           100.0);
-}
-
-#else
-
-#define set_stat(var)
-
-#endif
-
 static void common_init(MpegEncContext * s)
 {
     static int inited=0;
@@ -177,7 +113,7 @@ static void common_init(MpegEncContext * s)
         s->y_dc_scale_table= wmv1_y_dc_scale_table;
         s->c_dc_scale_table= wmv1_c_dc_scale_table;
         break;
-#if defined(CONFIG_WMV3_DECODER)||defined(CONFIG_VC9_DECODER)
+#if defined(CONFIG_WMV3_DECODER)||defined(CONFIG_VC1_DECODER)
     case 6:
         s->y_dc_scale_table= wmv3_dc_scale_table;
         s->c_dc_scale_table= wmv3_dc_scale_table;
@@ -434,7 +370,7 @@ void msmpeg4_encode_picture_header(MpegEncContext * s, int picture_number)
 
 #ifdef DEBUG
     intra_count = 0;
-    printf("*****frame %d:\n", frame_count++);
+    av_log(s->avctx, AV_LOG_DEBUG, "*****frame %d:\n", frame_count++);
 #endif
 }
 
@@ -504,12 +440,11 @@ static void msmpeg4_encode_motion(MpegEncContext * s,
 #if 0
     if ((unsigned)mx >= 64 ||
         (unsigned)my >= 64)
-        fprintf(stderr, "error mx=%d my=%d\n", mx, my);
+        av_log(s->avctx, AV_LOG_ERROR, "error mx=%d my=%d\n", mx, my);
 #endif
     mv = &mv_tables[s->mv_table_index];
 
     code = mv->table_mv_index[(mx << 6) | my];
-    set_stat(ST_MV);
     put_bits(&s->pb,
              mv->table_mv_bits[code],
              mv->table_mv_code[code]);
@@ -545,7 +480,6 @@ void msmpeg4_encode_mb(MpegEncContext * s,
 
     if (!s->mb_intra) {
         /* compute cbp */
-        set_stat(ST_INTER_MB);
         cbp = 0;
         for (i = 0; i < 6; i++) {
             if (s->block_last_index[i] >= 0)
@@ -636,7 +570,6 @@ void msmpeg4_encode_mb(MpegEncContext * s,
                      cbpy_tab[cbp>>2][0]);
         }else{
             if (s->pict_type == I_TYPE) {
-                set_stat(ST_INTRA_MB);
                 put_bits(&s->pb,
                          ff_msmp4_mb_i_table[coded_cbp][1], ff_msmp4_mb_i_table[coded_cbp][0]);
             } else {
@@ -646,7 +579,6 @@ void msmpeg4_encode_mb(MpegEncContext * s,
                          table_mb_non_intra[cbp][1],
                          table_mb_non_intra[cbp][0]);
             }
-            set_stat(ST_INTRA_MB);
             put_bits(&s->pb, 1, 0);             /* no AC prediction yet */
             if(s->inter_intra_pred){
                 s->h263_aic_dir=0;
@@ -924,7 +856,6 @@ static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int
     const uint8_t *scantable;
 
     if (s->mb_intra) {
-        set_stat(ST_DC);
         msmpeg4_encode_dc(s, block[0], n, &dc_pred_dir);
         i = 1;
         if (n < 4) {
@@ -934,7 +865,6 @@ static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int
         }
         run_diff = 0;
         scantable= s->intra_scantable.permutated;
-        set_stat(ST_INTRA_AC);
     } else {
         i = 0;
         rl = &rl_table[3 + s->rl_table_index];
@@ -943,7 +873,6 @@ static inline void msmpeg4_encode_block(MpegEncContext * s, DCTELEM * block, int
         else
             run_diff = 1;
         scantable= s->inter_scantable.permutated;
-        set_stat(ST_INTER_AC);
     }
 
     /* recalculate block_last_index for M$ wmv1 */
@@ -1197,7 +1126,7 @@ int ff_msmpeg4_decode_init(MpegEncContext *s)
     case 5:
         s->decode_mb= wmv2_decode_mb;
     case 6:
-        //FIXME + TODO VC9 decode mb
+        //FIXME + TODO VC1 decode mb
         break;
     }
 
@@ -1214,9 +1143,9 @@ int msmpeg4_decode_picture_header(MpegEncContext * s)
 {
 int i;
 for(i=0; i<s->gb.size_in_bits; i++)
-    printf("%d", get_bits1(&s->gb));
+    av_log(s->avctx, AV_LOG_DEBUG, "%d", get_bits1(&s->gb));
 //    get_bits1(&s->gb);
-printf("END\n");
+av_log(s->avctx, AV_LOG_DEBUG, "END\n");
 return -1;
 }
 #endif
@@ -1370,7 +1299,7 @@ return -1;
     s->esc3_run_length= 0;
 
 #ifdef DEBUG
-    printf("*****frame %d:\n", frame_count++);
+    av_log(s->avctx, AV_LOG_DEBUG, "*****frame %d:\n", frame_count++);
 #endif
     return 0;
 }
@@ -1572,7 +1501,6 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
     uint32_t * const mb_type_ptr= &s->current_picture.mb_type[ s->mb_x + s->mb_y*s->mb_stride ];
 
     if (s->pict_type == P_TYPE) {
-        set_stat(ST_INTER_MB);
         if (s->use_skip_mb_code) {
             if (get_bits1(&s->gb)) {
                 /* skip mb */
@@ -1598,7 +1526,6 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
 
         cbp = code & 0x3f;
     } else {
-        set_stat(ST_INTRA_MB);
         s->mb_intra = 1;
         code = get_vlc2(&s->gb, ff_msmp4_mb_i_vlc.table, MB_INTRA_VLC_BITS, 2);
         if (code < 0)
@@ -1623,7 +1550,6 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
             s->rl_table_index = decode012(&s->gb);
             s->rl_chroma_table_index = s->rl_table_index;
         }
-        set_stat(ST_MV);
         h263_pred_motion(s, 0, 0, &mx, &my);
         if (msmpeg4_decode_motion(s, &mx, &my) < 0)
             return -1;
@@ -1634,7 +1560,6 @@ static int msmpeg4v34_decode_mb(MpegEncContext *s, DCTELEM block[6][64])
         *mb_type_ptr = MB_TYPE_L0 | MB_TYPE_16x16;
     } else {
 //printf("I at %d %d %d %06X\n", s->mb_x, s->mb_y, ((cbp&3)? 1 : 0) +((cbp&0x3C)? 2 : 0), show_bits(&s->gb, 24));
-        set_stat(ST_INTRA_MB);
         s->ac_pred = get_bits1(&s->gb);
         *mb_type_ptr = MB_TYPE_INTRA;
         if(s->inter_intra_pred){
@@ -1673,7 +1598,6 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         qadd=0;
 
         /* DC coef */
-        set_stat(ST_DC);
         level = msmpeg4_decode_dc(s, n, &dc_pred_dir);
 
         if (level < 0){
@@ -1709,7 +1633,6 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         } else {
             scan_table = s->intra_scantable.permutated;
         }
-        set_stat(ST_INTRA_AC);
         rl_vlc= rl->rl_vlc[0];
     } else {
         qmul = s->qscale << 1;
@@ -1728,7 +1651,6 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
         }
         if(!scan_table)
             scan_table = s->inter_scantable.permutated;
-        set_stat(ST_INTER_AC);
         rl_vlc= rl->rl_vlc[s->qscale];
     }
   {
@@ -1794,15 +1716,15 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                         const int run1= run - rl->max_run[last][abs_level] - run_diff;
                         if(abs_level<=MAX_LEVEL && run<=MAX_RUN){
                             if(abs_level <= rl->max_level[last][run]){
-                                fprintf(stderr, "illegal 3. esc, vlc encoding possible\n");
+                                av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, vlc encoding possible\n");
                                 return DECODING_AC_LOST;
                             }
                             if(abs_level <= rl->max_level[last][run]*2){
-                                fprintf(stderr, "illegal 3. esc, esc 1 encoding possible\n");
+                                av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 1 encoding possible\n");
                                 return DECODING_AC_LOST;
                             }
                             if(run1>=0 && abs_level <= rl->max_level[last][run1]){
-                                fprintf(stderr, "illegal 3. esc, esc 2 encoding possible\n");
+                                av_log(s->avctx, AV_LOG_ERROR, "illegal 3. esc, esc 2 encoding possible\n");
                                 return DECODING_AC_LOST;
                             }
                         }
@@ -1813,7 +1735,7 @@ static inline int msmpeg4_decode_block(MpegEncContext * s, DCTELEM * block,
                     else         level= level * qmul - qadd;
 #if 0 // waste of time too :(
                     if(level>2048 || level<-2048){
-                        fprintf(stderr, "|level| overflow in 3. esc\n");
+                        av_log(s->avctx, AV_LOG_ERROR, "|level| overflow in 3. esc\n");
                         return DECODING_AC_LOST;
                     }
 #endif
diff --git a/src/libffmpeg/libavcodec/parser.c b/src/libffmpeg/libavcodec/parser.c
index 412cd8359..59087cdb8 100644
--- a/src/libffmpeg/libavcodec/parser.c
+++ b/src/libffmpeg/libavcodec/parser.c
@@ -145,6 +145,7 @@ int av_parser_parse(AVCodecParserContext *s,
 /**
  *
  * @return 0 if the output buffer is a subset of the input, 1 if it is allocated and must be freed
+ * @deprecated use AVBitstreamFilter
  */
 int av_parser_change(AVCodecParserContext *s,
                      AVCodecContext *avctx,
@@ -297,6 +298,7 @@ static const int frame_rate_tab[16] = {
     25025,
 };
 
+#ifdef CONFIG_MPEGVIDEO_PARSER
 //FIXME move into mpeg12.c
 static void mpegvideo_extract_headers(AVCodecParserContext *s,
                                       AVCodecContext *avctx,
@@ -449,6 +451,7 @@ static int mpegvideo_split(AVCodecContext *avctx,
     }
     return 0;
 }
+#endif /* CONFIG_MPEGVIDEO_PARSER */
 
 void ff_parse_close(AVCodecParserContext *s)
 {
@@ -467,6 +470,7 @@ static void parse1_close(AVCodecParserContext *s)
 
 /*************************/
 
+#ifdef CONFIG_MPEG4VIDEO_PARSER
 /* used by parser */
 /* XXX: make it use less memory */
 static int av_mpeg4_decode_header(AVCodecParserContext *s1,
@@ -532,6 +536,33 @@ static int mpeg4video_parse(AVCodecParserContext *s,
     *poutbuf_size = buf_size;
     return next;
 }
+#endif
+
+#ifdef CONFIG_CAVSVIDEO_PARSER
+static int cavsvideo_parse(AVCodecParserContext *s,
+                           AVCodecContext *avctx,
+                           uint8_t **poutbuf, int *poutbuf_size,
+                           const uint8_t *buf, int buf_size)
+{
+    ParseContext *pc = s->priv_data;
+    int next;
+
+    if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
+        next= buf_size;
+    }else{
+        next= ff_cavs_find_frame_end(pc, buf, buf_size);
+
+        if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
+            *poutbuf = NULL;
+            *poutbuf_size = 0;
+            return buf_size;
+        }
+    }
+    *poutbuf = (uint8_t *)buf;
+    *poutbuf_size = buf_size;
+    return next;
+}
+#endif /* CONFIG_CAVSVIDEO_PARSER */
 
 static int mpeg4video_split(AVCodecContext *avctx,
                            const uint8_t *buf, int buf_size)
@@ -549,6 +580,7 @@ static int mpeg4video_split(AVCodecContext *avctx,
 
 /*************************/
 
+#ifdef CONFIG_MPEGAUDIO_PARSER
 typedef struct MpegAudioParseContext {
     uint8_t inbuf[MPA_MAX_CODED_FRAME_SIZE];    /* input buffer */
     uint8_t *inbuf_ptr;
@@ -726,15 +758,23 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
     }
     return buf_ptr - buf;
 }
+#endif /* CONFIG_MPEGAUDIO_PARSER */
 
+#if defined(CONFIG_AC3_PARSER) || defined(CONFIG_AAC_PARSER)
+/* also used for ADTS AAC */
 typedef struct AC3ParseContext {
-    uint8_t inbuf[4096]; /* input buffer */
     uint8_t *inbuf_ptr;
     int frame_size;
+    int header_size;
+    int (*sync)(const uint8_t *buf, int *channels, int *sample_rate,
+                int *bit_rate, int *samples);
+    uint8_t inbuf[8192]; /* input buffer */
 } AC3ParseContext;
 
 #define AC3_HEADER_SIZE 7
+#define AAC_HEADER_SIZE 7
 
+#ifdef CONFIG_AC3_PARSER
 static const int ac3_sample_rates[4] = {
     48000, 44100, 32000, 0
 };
@@ -789,9 +829,22 @@ static const int ac3_bitrates[64] = {
 static const int ac3_channels[8] = {
     2, 1, 2, 3, 3, 4, 4, 5
 };
+#endif /* CONFIG_AC3_PARSER */
+
+#ifdef CONFIG_AAC_PARSER
+static const int aac_sample_rates[16] = {
+    96000, 88200, 64000, 48000, 44100, 32000,
+    24000, 22050, 16000, 12000, 11025, 8000, 7350
+};
 
+static const int aac_channels[8] = {
+    0, 1, 2, 3, 4, 5, 6, 8
+};
+#endif
+
+#ifdef CONFIG_AC3_PARSER
 static int ac3_sync(const uint8_t *buf, int *channels, int *sample_rate,
-                    int *bit_rate)
+                    int *bit_rate, int *samples)
 {
     unsigned int fscod, frmsizecod, acmod, bsid, lfeon;
     GetBitContext bits;
@@ -801,7 +854,7 @@ static int ac3_sync(const uint8_t *buf, int *channels, int *sample_rate,
     if(get_bits(&bits, 16) != 0x0b77)
         return 0;
 
-    get_bits(&bits, 16);    /* crc */
+    skip_bits(&bits, 16);       /* crc */
     fscod = get_bits(&bits, 2);
     frmsizecod = get_bits(&bits, 6);
 
@@ -811,30 +864,90 @@ static int ac3_sync(const uint8_t *buf, int *channels, int *sample_rate,
     bsid = get_bits(&bits, 5);
     if(bsid > 8)
         return 0;
-    get_bits(&bits, 3);     /* bsmod */
+    skip_bits(&bits, 3);        /* bsmod */
     acmod = get_bits(&bits, 3);
     if(acmod & 1 && acmod != 1)
-        get_bits(&bits, 2); /* cmixlev */
+        skip_bits(&bits, 2);    /* cmixlev */
     if(acmod & 4)
-        get_bits(&bits, 2); /* surmixlev */
+        skip_bits(&bits, 2);    /* surmixlev */
     if(acmod & 2)
-        get_bits(&bits, 2); /* dsurmod */
-    lfeon = get_bits(&bits, 1);
+        skip_bits(&bits, 2);    /* dsurmod */
+    lfeon = get_bits1(&bits);
 
     *sample_rate = ac3_sample_rates[fscod];
     *bit_rate = ac3_bitrates[frmsizecod] * 1000;
     *channels = ac3_channels[acmod] + lfeon;
+    *samples = 6 * 256;
 
     return ac3_frame_sizes[frmsizecod][fscod] * 2;
 }
+#endif /* CONFIG_AC3_PARSER */
+
+#ifdef CONFIG_AAC_PARSER
+static int aac_sync(const uint8_t *buf, int *channels, int *sample_rate,
+                    int *bit_rate, int *samples)
+{
+    GetBitContext bits;
+    int size, rdb, ch, sr;
+
+    init_get_bits(&bits, buf, AAC_HEADER_SIZE * 8);
+
+    if(get_bits(&bits, 12) != 0xfff)
+        return 0;
+
+    skip_bits1(&bits);          /* id */
+    skip_bits(&bits, 2);        /* layer */
+    skip_bits1(&bits);          /* protection_absent */
+    skip_bits(&bits, 2);        /* profile_objecttype */
+    sr = get_bits(&bits, 4);    /* sample_frequency_index */
+    if(!aac_sample_rates[sr])
+        return 0;
+    skip_bits1(&bits);          /* private_bit */
+    ch = get_bits(&bits, 3);    /* channel_configuration */
+    if(!aac_channels[ch])
+        return 0;
+    skip_bits1(&bits);          /* original/copy */
+    skip_bits1(&bits);          /* home */
+
+    /* adts_variable_header */
+    skip_bits1(&bits);          /* copyright_identification_bit */
+    skip_bits1(&bits);          /* copyright_identification_start */
+    size = get_bits(&bits, 13); /* aac_frame_length */
+    skip_bits(&bits, 11);       /* adts_buffer_fullness */
+    rdb = get_bits(&bits, 2);   /* number_of_raw_data_blocks_in_frame */
+
+    *channels = aac_channels[ch];
+    *sample_rate = aac_sample_rates[sr];
+    *samples = (rdb + 1) * 1024;
+    *bit_rate = size * 8 * *sample_rate / *samples;
+
+    return size;
+}
+#endif /* CONFIG_AAC_PARSER */
 
+#ifdef CONFIG_AC3_PARSER
 static int ac3_parse_init(AVCodecParserContext *s1)
 {
     AC3ParseContext *s = s1->priv_data;
     s->inbuf_ptr = s->inbuf;
+    s->header_size = AC3_HEADER_SIZE;
+    s->sync = ac3_sync;
     return 0;
 }
+#endif
 
+#ifdef CONFIG_AAC_PARSER
+static int aac_parse_init(AVCodecParserContext *s1)
+{
+    AC3ParseContext *s = s1->priv_data;
+    s->inbuf_ptr = s->inbuf;
+    s->header_size = AAC_HEADER_SIZE;
+    s->sync = aac_sync;
+    return 0;
+}
+#endif
+
+/* also used for ADTS AAC */
 static int ac3_parse(AVCodecParserContext *s1,
                      AVCodecContext *avctx,
                      uint8_t **poutbuf, int *poutbuf_size,
@@ -842,7 +955,7 @@ static int ac3_parse(AVCodecParserContext *s1,
 {
     AC3ParseContext *s = s1->priv_data;
     const uint8_t *buf_ptr;
-    int len, sample_rate, bit_rate, channels;
+    int len, sample_rate, bit_rate, channels, samples;
 
     *poutbuf = NULL;
     *poutbuf_size = 0;
@@ -851,29 +964,35 @@ static int ac3_parse(AVCodecParserContext *s1,
     while (buf_size > 0) {
         len = s->inbuf_ptr - s->inbuf;
         if (s->frame_size == 0) {
-            /* no header seen : find one. We need at least 7 bytes to parse it */
-            len = FFMIN(AC3_HEADER_SIZE - len, buf_size);
+            /* no header seen : find one. We need at least s->header_size
+               bytes to parse it */
+            len = FFMIN(s->header_size - len, buf_size);
 
             memcpy(s->inbuf_ptr, buf_ptr, len);
             buf_ptr += len;
             s->inbuf_ptr += len;
             buf_size -= len;
-            if ((s->inbuf_ptr - s->inbuf) == AC3_HEADER_SIZE) {
-                len = ac3_sync(s->inbuf, &channels, &sample_rate, &bit_rate);
+            if ((s->inbuf_ptr - s->inbuf) == s->header_size) {
+                len = s->sync(s->inbuf, &channels, &sample_rate, &bit_rate,
+                              &samples);
                 if (len == 0) {
                     /* no sync found : move by one byte (inefficient, but simple!) */
-                    memmove(s->inbuf, s->inbuf + 1, AC3_HEADER_SIZE - 1);
+                    memmove(s->inbuf, s->inbuf + 1, s->header_size - 1);
                     s->inbuf_ptr--;
                 } else {
                     s->frame_size = len;
                     /* update codec info */
                     avctx->sample_rate = sample_rate;
                     /* set channels,except if the user explicitly requests 1 or 2 channels, XXX/FIXME this is a bit ugly */
-                    if(avctx->channels!=1 && avctx->channels!=2){
+                    if(avctx->codec_id == CODEC_ID_AC3){
+                        if(avctx->channels!=1 && avctx->channels!=2){
+                            avctx->channels = channels;
+                        }
+                    } else {
                         avctx->channels = channels;
                     }
                     avctx->bit_rate = bit_rate;
-                    avctx->frame_size = 6 * 256;
+                    avctx->frame_size = samples;
                 }
             }
         } else {
@@ -895,7 +1014,9 @@ static int ac3_parse(AVCodecParserContext *s1,
     }
     return buf_ptr - buf;
 }
+#endif /* CONFIG_AC3_PARSER || CONFIG_AAC_PARSER */
 
+#ifdef CONFIG_MPEGVIDEO_PARSER
 AVCodecParser mpegvideo_parser = {
     { CODEC_ID_MPEG1VIDEO, CODEC_ID_MPEG2VIDEO },
     sizeof(ParseContext1),
@@ -904,7 +1025,8 @@ AVCodecParser mpegvideo_parser = {
     parse1_close,
     mpegvideo_split,
 };
-
+#endif
+#ifdef CONFIG_MPEG4VIDEO_PARSER
 AVCodecParser mpeg4video_parser = {
     { CODEC_ID_MPEG4 },
     sizeof(ParseContext1),
@@ -913,7 +1035,18 @@ AVCodecParser mpeg4video_parser = {
     parse1_close,
     mpeg4video_split,
 };
-
+#endif
+#ifdef CONFIG_CAVSVIDEO_PARSER
+AVCodecParser cavsvideo_parser = {
+    { CODEC_ID_CAVS },
+    sizeof(ParseContext1),
+    NULL,
+    cavsvideo_parse,
+    parse1_close,
+    mpeg4video_split,
+};
+#endif
+#ifdef CONFIG_MPEGAUDIO_PARSER
 AVCodecParser mpegaudio_parser = {
     { CODEC_ID_MP2, CODEC_ID_MP3 },
     sizeof(MpegAudioParseContext),
@@ -921,7 +1054,8 @@ AVCodecParser mpegaudio_parser = {
     mpegaudio_parse,
     NULL,
 };
-
+#endif
+#ifdef CONFIG_AC3_PARSER
 AVCodecParser ac3_parser = {
     { CODEC_ID_AC3 },
     sizeof(AC3ParseContext),
@@ -929,3 +1063,13 @@ AVCodecParser ac3_parser = {
     ac3_parse,
     NULL,
 };
+#endif
+#ifdef CONFIG_AAC_PARSER
+AVCodecParser aac_parser = {
+    { CODEC_ID_AAC },
+    sizeof(AC3ParseContext),
+    aac_parse_init,
+    ac3_parse,
+    NULL,
+};
+#endif
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
index 31464fb7a..81a32c9e3 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
@@ -1308,13 +1308,12 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 }
 
-#ifdef CONFIG_DARWIN
 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
   int sum;
-POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
   register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
   register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
   {
     register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
     register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
@@ -1339,6 +1338,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
     {                                                                   \
       register vector unsigned char src1, src2, srcO;                   \
       register vector unsigned char dst1, dst2, dstO;                   \
+      register vector signed short srcV, dstV;                          \
+      register vector signed short but0, but1, but2, op1, op2, op3;     \
       src1 = vec_ld(stride * i, src);                                   \
       if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8)       \
         src2 = vec_ld((stride * i) + 16, src);                          \
@@ -1349,17 +1350,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
       dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
       /* promote the unsigned chars to signed shorts */                 \
       /* we're in the 8x8 function, we only care for the first 8 */     \
-      register vector signed short srcV =                               \
-        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
-      register vector signed short dstV =                               \
-        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+      srcV =                                                            \
+        (vector signed short)vec_mergeh((vector signed char)vzero,      \
+        (vector signed char)srcO);                                      \
+      dstV =                                                            \
+        (vector signed short)vec_mergeh((vector signed char)vzero,      \
+        (vector signed char)dstO);                                      \
       /* substractions inside the first butterfly */                    \
-      register vector signed short but0 = vec_sub(srcV, dstV);          \
-      register vector signed short op1 = vec_perm(but0, but0, perm1);   \
-      register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
-      register vector signed short op2 = vec_perm(but1, but1, perm2);   \
-      register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
-      register vector signed short op3 = vec_perm(but2, but2, perm3);   \
+      but0 = vec_sub(srcV, dstV);                                       \
+      op1 = vec_perm(but0, but0, perm1);                                \
+      but1 = vec_mladd(but0, vprod1, op1);                              \
+      op2 = vec_perm(but1, but1, perm2);                                \
+      but2 = vec_mladd(but1, vprod2, op2);                              \
+      op3 = vec_perm(but2, but2, perm3);                                \
       res = vec_mladd(but2, vprod3, op3);                               \
     }
     ONEITERBUTTERFLY(0, temp0);
@@ -1442,39 +1445,39 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
   int sum;
   register vector signed short
-    temp0 asm ("v0"),
-    temp1 asm ("v1"),
-    temp2 asm ("v2"),
-    temp3 asm ("v3"),
-    temp4 asm ("v4"),
-    temp5 asm ("v5"),
-    temp6 asm ("v6"),
-    temp7 asm ("v7");
+    temp0 REG_v(v0),
+    temp1 REG_v(v1),
+    temp2 REG_v(v2),
+    temp3 REG_v(v3),
+    temp4 REG_v(v4),
+    temp5 REG_v(v5),
+    temp6 REG_v(v6),
+    temp7 REG_v(v7);
   register vector signed short
-    temp0S asm ("v8"),
-    temp1S asm ("v9"),
-    temp2S asm ("v10"),
-    temp3S asm ("v11"),
-    temp4S asm ("v12"),
-    temp5S asm ("v13"),
-    temp6S asm ("v14"),
-    temp7S asm ("v15");
-  register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
+    temp0S REG_v(v8),
+    temp1S REG_v(v9),
+    temp2S REG_v(v10),
+    temp3S REG_v(v11),
+    temp4S REG_v(v12),
+    temp5S REG_v(v13),
+    temp6S REG_v(v14),
+    temp7S REG_v(v15);
+  register const_vector unsigned char vzero REG_v(v31)= (const_vector unsigned char)vec_splat_u8(0);
   {
-    register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
-    register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
-    register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
-    register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
+    register const_vector signed short vprod1 REG_v(v16)= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
+    register const_vector signed short vprod2 REG_v(v17)= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
+    register const_vector signed short vprod3 REG_v(v18)= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
+    register const_vector unsigned char perm1 REG_v(v19)= (const_vector unsigned char)
       AVV(0x02, 0x03, 0x00, 0x01,
        0x06, 0x07, 0x04, 0x05,
        0x0A, 0x0B, 0x08, 0x09,
        0x0E, 0x0F, 0x0C, 0x0D);
-    register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
+    register const_vector unsigned char perm2 REG_v(v20)= (const_vector unsigned char)
       AVV(0x04, 0x05, 0x06, 0x07,
        0x00, 0x01, 0x02, 0x03,
        0x0C, 0x0D, 0x0E, 0x0F,
        0x08, 0x09, 0x0A, 0x0B);
-    register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
+    register const_vector unsigned char perm3 REG_v(v21)= (const_vector unsigned char)
       AVV(0x08, 0x09, 0x0A, 0x0B,
        0x0C, 0x0D, 0x0E, 0x0F,
        0x00, 0x01, 0x02, 0x03,
@@ -1482,37 +1485,63 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
 
 #define ONEITERBUTTERFLY(i, res1, res2)                                 \
     {                                                                   \
-      register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
-      register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
+      register vector unsigned char src1 REG_v(v22),                    \
+                                    src2 REG_v(v23),                    \
+                                    dst1 REG_v(v24),                    \
+                                    dst2 REG_v(v25),                    \
+                                    srcO REG_v(v22),                    \
+                                    dstO REG_v(v23);                    \
+                                                                        \
+      register vector signed short  srcV REG_v(v24),                    \
+                                    dstV REG_v(v25),                    \
+                                    srcW REG_v(v26),                    \
+                                    dstW REG_v(v27),                    \
+                                    but0 REG_v(v28),                    \
+                                    but0S REG_v(v29),                   \
+                                    op1 REG_v(v30),                     \
+                                    but1 REG_v(v22),                    \
+                                    op1S REG_v(v23),                    \
+                                    but1S REG_v(v24),                   \
+                                    op2 REG_v(v25),                     \
+                                    but2 REG_v(v26),                    \
+                                    op2S REG_v(v27),                    \
+                                    but2S REG_v(v28),                   \
+                                    op3 REG_v(v29),                     \
+                                    op3S REG_v(v30);                    \
+                                                                        \
       src1 = vec_ld(stride * i, src);                                   \
       src2 = vec_ld((stride * i) + 16, src);                            \
-      register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+      srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
       dst1 = vec_ld(stride * i, dst);                                   \
       dst2 = vec_ld((stride * i) + 16, dst);                            \
-      register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+      dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
       /* promote the unsigned chars to signed shorts */                 \
-      register vector signed short srcV asm ("v24") =                   \
-        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
-      register vector signed short dstV asm ("v25") =                   \
-        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
-      register vector signed short srcW asm ("v26") =                   \
-        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
-      register vector signed short dstW asm ("v27") =                   \
-        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
+      srcV =                                                            \
+        (vector signed short)vec_mergeh((vector signed char)vzero,      \
+        (vector signed char)srcO);                                      \
+      dstV =                                                            \
+        (vector signed short)vec_mergeh((vector signed char)vzero,      \
+        (vector signed char)dstO);                                      \
+      srcW =                                                            \
+        (vector signed short)vec_mergel((vector signed char)vzero,      \
+        (vector signed char)srcO);                                      \
+      dstW =                                                            \
+        (vector signed short)vec_mergel((vector signed char)vzero,      \
+        (vector signed char)dstO);                                      \
       /* substractions inside the first butterfly */                    \
-      register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
-      register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
-      register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
-      register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
-      register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
-      register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
-      register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
-      register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
-      register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
-      register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
-      register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
+      but0 = vec_sub(srcV, dstV);                                       \
+      but0S = vec_sub(srcW, dstW);                                      \
+      op1 = vec_perm(but0, but0, perm1);                                \
+      but1 = vec_mladd(but0, vprod1, op1);                              \
+      op1S = vec_perm(but0S, but0S, perm1);                             \
+      but1S = vec_mladd(but0S, vprod1, op1S);                           \
+      op2 = vec_perm(but1, but1, perm2);                                \
+      but2 = vec_mladd(but1, vprod2, op2);                              \
+      op2S = vec_perm(but1S, but1S, perm2);                             \
+      but2S = vec_mladd(but1S, vprod2, op2S);                           \
+      op3 = vec_perm(but2, but2, perm3);                                \
       res1 = vec_mladd(but2, vprod3, op3);                              \
-      register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
+      op3S = vec_perm(but2S, but2S, perm3);                             \
       res2 = vec_mladd(but2S, vprod3, op3S);                            \
     }
     ONEITERBUTTERFLY(0, temp0, temp0S);
@@ -1527,6 +1556,12 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
 #undef ONEITERBUTTERFLY
   {
     register vector signed int vsum;
+    register vector signed short line0S, line1S, line2S, line3S, line4S,
+                                 line5S, line6S, line7S, line0BS,line2BS,
+                                 line1BS,line3BS,line4BS,line6BS,line5BS,
+                                 line7BS,line0CS,line4CS,line1CS,line5CS,
+                                 line2CS,line6CS,line3CS,line7CS;
+
     register vector signed short line0 = vec_add(temp0, temp1);
     register vector signed short line1 = vec_sub(temp0, temp1);
     register vector signed short line2 = vec_add(temp2, temp3);
@@ -1563,32 +1598,32 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
     vsum = vec_sum4s(vec_abs(line6C), vsum);
     vsum = vec_sum4s(vec_abs(line7C), vsum);
 
-    register vector signed short line0S = vec_add(temp0S, temp1S);
-    register vector signed short line1S = vec_sub(temp0S, temp1S);
-    register vector signed short line2S = vec_add(temp2S, temp3S);
-    register vector signed short line3S = vec_sub(temp2S, temp3S);
-    register vector signed short line4S = vec_add(temp4S, temp5S);
-    register vector signed short line5S = vec_sub(temp4S, temp5S);
-    register vector signed short line6S = vec_add(temp6S, temp7S);
-    register vector signed short line7S = vec_sub(temp6S, temp7S);
-
-    register vector signed short line0BS = vec_add(line0S, line2S);
-    register vector signed short line2BS = vec_sub(line0S, line2S);
-    register vector signed short line1BS = vec_add(line1S, line3S);
-    register vector signed short line3BS = vec_sub(line1S, line3S);
-    register vector signed short line4BS = vec_add(line4S, line6S);
-    register vector signed short line6BS = vec_sub(line4S, line6S);
-    register vector signed short line5BS = vec_add(line5S, line7S);
-    register vector signed short line7BS = vec_sub(line5S, line7S);
-
-    register vector signed short line0CS = vec_add(line0BS, line4BS);
-    register vector signed short line4CS = vec_sub(line0BS, line4BS);
-    register vector signed short line1CS = vec_add(line1BS, line5BS);
-    register vector signed short line5CS = vec_sub(line1BS, line5BS);
-    register vector signed short line2CS = vec_add(line2BS, line6BS);
-    register vector signed short line6CS = vec_sub(line2BS, line6BS);
-    register vector signed short line3CS = vec_add(line3BS, line7BS);
-    register vector signed short line7CS = vec_sub(line3BS, line7BS);
+    line0S = vec_add(temp0S, temp1S);
+    line1S = vec_sub(temp0S, temp1S);
+    line2S = vec_add(temp2S, temp3S);
+    line3S = vec_sub(temp2S, temp3S);
+    line4S = vec_add(temp4S, temp5S);
+    line5S = vec_sub(temp4S, temp5S);
+    line6S = vec_add(temp6S, temp7S);
+    line7S = vec_sub(temp6S, temp7S);
+
+    line0BS = vec_add(line0S, line2S);
+    line2BS = vec_sub(line0S, line2S);
+    line1BS = vec_add(line1S, line3S);
+    line3BS = vec_sub(line1S, line3S);
+    line4BS = vec_add(line4S, line6S);
+    line6BS = vec_sub(line4S, line6S);
+    line5BS = vec_add(line5S, line7S);
+    line7BS = vec_sub(line5S, line7S);
+
+    line0CS = vec_add(line0BS, line4BS);
+    line4CS = vec_sub(line0BS, line4BS);
+    line1CS = vec_add(line1BS, line5BS);
+    line5CS = vec_sub(line1BS, line5BS);
+    line2CS = vec_add(line2BS, line6BS);
+    line6CS = vec_sub(line2BS, line6BS);
+    line3CS = vec_add(line3BS, line7BS);
+    line7CS = vec_sub(line3BS, line7BS);
 
     vsum = vec_sum4s(vec_abs(line0CS), vsum);
     vsum = vec_sum4s(vec_abs(line1CS), vsum);
@@ -1618,7 +1653,6 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
   return score;
 }
-#endif //CONFIG_DARWIN
 
 int has_altivec(void)
 {
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_h264_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_h264_altivec.c
index b9fef005e..14391e60c 100755
--- a/src/libffmpeg/libavcodec/ppc/dsputil_h264_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_h264_altivec.c
@@ -188,44 +188,97 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint
     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 }\
 
+static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+    int i;
+    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 
-/* from dsputil.c */
-static inline void put_pixels8_l2(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, int dst_stride, int src_stride1, int src_stride2, int h) {
-        int             i;
-        for (i = 0; i < h; i++) {
-                uint32_t        a, b;
-                a = (((const struct unaligned_32 *) (&src1[i * src_stride1]))->l);
-                b = (((const struct unaligned_32 *) (&src2[i * src_stride2]))->l);
-                *((uint32_t *) & dst[i * dst_stride]) = rnd_avg32(a, b);
-                a = (((const struct unaligned_32 *) (&src1[i * src_stride1 + 4]))->l);
-                b = (((const struct unaligned_32 *) (&src2[i * src_stride2 + 4]))->l);
-                *((uint32_t *) & dst[i * dst_stride + 4]) = rnd_avg32(a, b);
-        }
-} static inline void avg_pixels8_l2(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, int dst_stride, int src_stride1, int src_stride2, int h) {
-        int             i;
-        for (i = 0; i < h; i++) {
-                uint32_t        a, b;
-                a = (((const struct unaligned_32 *) (&src1[i * src_stride1]))->l);
-                b = (((const struct unaligned_32 *) (&src2[i * src_stride2]))->l);
-                *((uint32_t *) & dst[i * dst_stride]) = rnd_avg32(*((uint32_t *) & dst[i * dst_stride]), rnd_avg32(a, b));
-                a = (((const struct unaligned_32 *) (&src1[i * src_stride1 + 4]))->l);
-                b = (((const struct unaligned_32 *) (&src2[i * src_stride2 + 4]))->l);
-                *((uint32_t *) & dst[i * dst_stride + 4]) = rnd_avg32(*((uint32_t *) & dst[i * dst_stride + 4]), rnd_avg32(a, b));
-        }
-} static inline void put_pixels16_l2(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, int dst_stride, int src_stride1, int src_stride2, int h) {
-        put_pixels8_l2(dst, src1, src2, dst_stride, src_stride1, src_stride2, h);
-        put_pixels8_l2(dst + 8, src1 + 8, src2 + 8, dst_stride, src_stride1, src_stride2, h);
-} static inline void avg_pixels16_l2(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, int dst_stride, int src_stride1, int src_stride2, int h) {
-        avg_pixels8_l2(dst, src1, src2, dst_stride, src_stride1, src_stride2, h);
-        avg_pixels8_l2(dst + 8, src1 + 8, src2 + 8, dst_stride, src_stride1, src_stride2, h);
+    mask_ = vec_lvsl(0, src2);
+
+    for (i = 0; i < h; i++) {
+
+        tmp1 = vec_ld(i * src_stride1, src1);
+        mask = vec_lvsl(i * src_stride1, src1);
+        tmp2 = vec_ld(i * src_stride1 + 15, src1);
+
+        a = vec_perm(tmp1, tmp2, mask);
+
+        tmp1 = vec_ld(i * 16, src2);
+        tmp2 = vec_ld(i * 16 + 15, src2);
+
+        b = vec_perm(tmp1, tmp2, mask_);
+
+        tmp1 = vec_ld(0, dst);
+        mask = vec_lvsl(0, dst);
+        tmp2 = vec_ld(15, dst);
+
+        d = vec_avg(a, b);
+
+        edges = vec_perm(tmp2, tmp1, mask);
+
+        align = vec_lvsr(0, dst);
+
+        tmp1 = vec_perm(edges, d, align);
+        tmp2 = vec_perm(d, edges, align);
+
+        vec_st(tmp2, 15, dst);
+        vec_st(tmp1, 0 , dst);
+
+        dst += dst_stride;
+    }
+}
+
+static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+    int i;
+    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+
+    mask_ = vec_lvsl(0, src2);
+
+    for (i = 0; i < h; i++) {
+
+        tmp1 = vec_ld(i * src_stride1, src1);
+        mask = vec_lvsl(i * src_stride1, src1);
+        tmp2 = vec_ld(i * src_stride1 + 15, src1);
+
+        a = vec_perm(tmp1, tmp2, mask);
+
+        tmp1 = vec_ld(i * 16, src2);
+        tmp2 = vec_ld(i * 16 + 15, src2);
+
+        b = vec_perm(tmp1, tmp2, mask_);
+
+        tmp1 = vec_ld(0, dst);
+        mask = vec_lvsl(0, dst);
+        tmp2 = vec_ld(15, dst);
+
+        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
+
+        edges = vec_perm(tmp2, tmp1, mask);
+
+        align = vec_lvsr(0, dst);
+
+        tmp1 = vec_perm(edges, d, align);
+        tmp2 = vec_perm(d, edges, align);
+
+        vec_st(tmp2, 15, dst);
+        vec_st(tmp1, 0 , dst);
+
+        dst += dst_stride;
+    }
 }
 
-/* UNIMPLEMENTED YET !! */
+/* Implemented but could be faster
 #define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
 #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
+ */
 
-H264_MC(put_, 16, altivec)
-     H264_MC(avg_, 16, altivec)
+  H264_MC(put_, 16, altivec)
+  H264_MC(avg_, 16, altivec)
 
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
 
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_h264_template_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_h264_template_altivec.c
index 7f46ccf14..37f4de58f 100755
--- a/src/libffmpeg/libavcodec/ppc/dsputil_h264_template_altivec.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_h264_template_altivec.c
@@ -19,70 +19,76 @@
 /* this code assume that stride % 16 == 0 */
 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
   POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
-  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
-    signed int ABCD[4] __attribute__((aligned(16)));
+    signed int ABCD[4] __attribute__((aligned(16))) =
+                        {((8 - x) * (8 - y)),
+                          ((x) * (8 - y)),
+                          ((8 - x) * (y)),
+                          ((x) * (y))};
     register int i;
-    ABCD[0] = ((8 - x) * (8 - y));
-    ABCD[1] = ((x) * (8 - y));
-    ABCD[2] = ((8 - x) * (y));
-    ABCD[3] = ((x) * (y));
+    vector unsigned char fperm;
     const vector signed int vABCD = vec_ld(0, ABCD);
     const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
     const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
     const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
     const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
     const vector signed int vzero = vec_splat_s32(0);
-    const vector signed short v32ss = (const vector signed short)AVV(32);
+    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
     const vector unsigned short v6us = vec_splat_u16(6);
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vector unsigned char fperm;
+    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+    vector unsigned char vsrc0uc, vsrc1uc;
+    vector signed short vsrc0ssH, vsrc1ssH;
+    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
+    vector signed short vsrc2ssH, vsrc3ssH, psum;
+    vector unsigned char vdst, ppsum, vfdst, fsum;
+
+  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
 
     if (((unsigned long)dst) % 16 == 0) {
-      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-                                        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
+      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
+                                        0x14, 0x15, 0x16, 0x17,
+                                        0x08, 0x09, 0x0A, 0x0B,
+                                        0x0C, 0x0D, 0x0E, 0x0F);
     } else {
-      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
+                                        0x04, 0x05, 0x06, 0x07,
+                                        0x18, 0x19, 0x1A, 0x1B,
+                                        0x1C, 0x1D, 0x1E, 0x1F);
     }
 
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
-
-    vector unsigned char vsrcAuc;
-    vector unsigned char vsrcBuc;
-    vector unsigned char vsrcperm0;
-    vector unsigned char vsrcperm1;
     vsrcAuc = vec_ld(0, src);
+
     if (loadSecond)
       vsrcBuc = vec_ld(16, src);
     vsrcperm0 = vec_lvsl(0, src);
     vsrcperm1 = vec_lvsl(1, src);
 
-    vector unsigned char vsrc0uc;
-    vector unsigned char vsrc1uc;
     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
     if (reallyBadAlign)
       vsrc1uc = vsrcBuc;
     else
       vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
 
-    vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc);
-    vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc);
+    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
+                                               (vector unsigned char)vsrc0uc);
+    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
+                                               (vector unsigned char)vsrc1uc);
 
     if (!loadSecond) {// -> !reallyBadAlign
       for (i = 0 ; i < h ; i++) {
-        vector unsigned char vsrcCuc;
+
+
         vsrcCuc = vec_ld(stride + 0, src);
 
-        vector unsigned char vsrc2uc;
-        vector unsigned char vsrc3uc;
         vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
         vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 
-        vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
-        vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
-
-        vector signed short psum;
+        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
+                                                (vector unsigned char)vsrc2uc);
+        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
+                                                (vector unsigned char)vsrc3uc);
 
         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
         psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -91,11 +97,9 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
         psum = vec_add(v32ss, psum);
         psum = vec_sra(psum, v6us);
 
-        vector unsigned char vdst = vec_ld(0, dst);
-        vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum);
-
-        vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
-        vector unsigned char fsum;
+        vdst = vec_ld(0, dst);
+        ppsum = (vector unsigned char)vec_packsu(psum, psum);
+        vfdst = vec_perm(vdst, ppsum, fperm);
 
         OP_U8_ALTIVEC(fsum, vfdst, vdst);
 
@@ -108,24 +112,21 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
         src += stride;
       }
     } else {
-      for (i = 0 ; i < h ; i++) {
-        vector unsigned char vsrcCuc;
         vector unsigned char vsrcDuc;
+      for (i = 0 ; i < h ; i++) {
         vsrcCuc = vec_ld(stride + 0, src);
         vsrcDuc = vec_ld(stride + 16, src);
 
-        vector unsigned char vsrc2uc;
-        vector unsigned char vsrc3uc;
         vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
         if (reallyBadAlign)
           vsrc3uc = vsrcDuc;
         else
           vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 
-        vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
-        vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
-
-        vector signed short psum;
+        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
+                                                (vector unsigned char)vsrc2uc);
+        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
+                                                (vector unsigned char)vsrc3uc);
 
         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
         psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -134,11 +135,9 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
         psum = vec_add(v32ss, psum);
         psum = vec_sr(psum, v6us);
 
-        vector unsigned char vdst = vec_ld(0, dst);
-        vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum);
-
-        vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
-        vector unsigned char fsum;
+        vdst = vec_ld(0, dst);
+        ppsum = (vector unsigned char)vec_pack(psum, psum);
+        vfdst = vec_perm(vdst, ppsum, fperm);
 
         OP_U8_ALTIVEC(fsum, vfdst, vdst);
 
@@ -157,7 +156,6 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
 /* this code assume stride % 16 == 0 */
 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
   POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
-  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
   register int i;
 
   const vector signed int vzero = vec_splat_s32(0);
@@ -167,18 +165,35 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
   const vector unsigned char permP1 = vec_lvsl(+1, src);
   const vector unsigned char permP2 = vec_lvsl(+2, src);
   const vector unsigned char permP3 = vec_lvsl(+3, src);
-  const vector signed short v20ss = (const vector signed short)AVV(20);
-  const vector unsigned short v5us = vec_splat_u16(5);
   const vector signed short v5ss = vec_splat_s16(5);
-  const vector signed short v16ss = (const vector signed short)AVV(16);
+  const vector unsigned short v5us = vec_splat_u16(5);
+  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
   const vector unsigned char dstperm = vec_lvsr(0, dst);
-  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
-  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
+  const vector unsigned char neg1 =
+                                (const vector unsigned char) vec_splat_s8(-1);
+
+  const vector unsigned char dstmask =
+                                vec_perm((const vector unsigned char)vzero,
+                                                               neg1, dstperm);
+
+  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
 
   register int align = ((((unsigned long)src) - 2) % 16);
 
+  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
+                      srcP2A, srcP2B, srcP3A, srcP3B,
+                      srcM1A, srcM1B, srcM2A, srcM2B,
+                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
+                      pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+                      psumA, psumB, sumA, sumB;
+
+  vector unsigned char sum, dst1, dst2, vdst, fsum,
+                       rsum, fdst1, fdst2;
+
+  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
+
   for (i = 0 ; i < 16 ; i ++) {
-    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
     vector unsigned char srcR1 = vec_ld(-2, src);
     vector unsigned char srcR2 = vec_ld(14, src);
 
@@ -237,55 +252,66 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
     } break;
     }
 
-    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
-    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
-    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
-    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
-
-    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
-    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
-    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
-    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
-
-    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
-    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
-    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
-    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
-
-    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
-    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
-    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
-    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
-    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
-    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
-
-    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
-    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
-
-    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
-    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
-
-    const vector signed short pp3A = vec_add(sum3A, pp1A);
-    const vector signed short pp3B = vec_add(sum3B, pp1B);
-
-    const vector signed short psumA = vec_sub(pp3A, pp2A);
-    const vector signed short psumB = vec_sub(pp3B, pp2B);
+    srcP0A = (vector signed short)
+                vec_mergeh((vector unsigned char)vzero, srcP0);
+    srcP0B = (vector signed short)
+                vec_mergel((vector unsigned char)vzero, srcP0);
+    srcP1A = (vector signed short)
+                vec_mergeh((vector unsigned char)vzero, srcP1);
+    srcP1B = (vector signed short)
+                vec_mergel((vector unsigned char)vzero, srcP1);
+
+    srcP2A = (vector signed short)
+                vec_mergeh((vector unsigned char)vzero, srcP2);
+    srcP2B = (vector signed short)
+                vec_mergel((vector unsigned char)vzero, srcP2);
+    srcP3A = (vector signed short)
+                vec_mergeh((vector unsigned char)vzero, srcP3);
+    srcP3B = (vector signed short)
+                vec_mergel((vector unsigned char)vzero, srcP3);
+
+    srcM1A = (vector signed short)
+                vec_mergeh((vector unsigned char)vzero, srcM1);
+    srcM1B = (vector signed short)
+                vec_mergel((vector unsigned char)vzero, srcM1);
+    srcM2A = (vector signed short)
+                vec_mergeh((vector unsigned char)vzero, srcM2);
+    srcM2B = (vector signed short)
+                vec_mergel((vector unsigned char)vzero, srcM2);
+
+    sum1A = vec_adds(srcP0A, srcP1A);
+    sum1B = vec_adds(srcP0B, srcP1B);
+    sum2A = vec_adds(srcM1A, srcP2A);
+    sum2B = vec_adds(srcM1B, srcP2B);
+    sum3A = vec_adds(srcM2A, srcP3A);
+    sum3B = vec_adds(srcM2B, srcP3B);
+
+    pp1A = vec_mladd(sum1A, v20ss, v16ss);
+    pp1B = vec_mladd(sum1B, v20ss, v16ss);
+
+    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
+    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+
+    pp3A = vec_add(sum3A, pp1A);
+    pp3B = vec_add(sum3B, pp1B);
+
+    psumA = vec_sub(pp3A, pp2A);
+    psumB = vec_sub(pp3B, pp2B);
+
+    sumA = vec_sra(psumA, v5us);
+    sumB = vec_sra(psumB, v5us);
+
+    sum = vec_packsu(sumA, sumB);
+
+    dst1 = vec_ld(0, dst);
+    dst2 = vec_ld(16, dst);
+    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
 
-    const vector signed short sumA = vec_sra(psumA, v5us);
-    const vector signed short sumB = vec_sra(psumB, v5us);
-
-    const vector unsigned char sum = vec_packsu(sumA, sumB);
-
-    const vector unsigned char dst1 = vec_ld(0, dst);
-    const vector unsigned char dst2 = vec_ld(16, dst);
-    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
-
-    vector unsigned char fsum;
     OP_U8_ALTIVEC(fsum, sum, vdst);
 
-    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
-    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
-    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
+    rsum = vec_perm(fsum, fsum, dstperm);
+    fdst1 = vec_sel(dst1, rsum, dstmask);
+    fdst2 = vec_sel(rsum, dst2, dstmask);
 
     vec_st(fdst1, 0, dst);
     vec_st(fdst2, 16, dst);
@@ -299,16 +325,15 @@ POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
 /* this code assume stride % 16 == 0 */
 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
   POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
-  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
 
   register int i;
 
   const vector signed int vzero = vec_splat_s32(0);
   const vector unsigned char perm = vec_lvsl(0, src);
-  const vector signed short v20ss = (const vector signed short)AVV(20);
+  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
   const vector unsigned short v5us = vec_splat_u16(5);
   const vector signed short v5ss = vec_splat_s16(5);
-  const vector signed short v16ss = (const vector signed short)AVV(16);
+  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
   const vector unsigned char dstperm = vec_lvsr(0, dst);
   const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
   const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
@@ -318,49 +343,71 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
   const vector unsigned char srcM2a = vec_ld(0, srcbis);
   const vector unsigned char srcM2b = vec_ld(16, srcbis);
   const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
-  srcbis += srcStride;
-  const vector unsigned char srcM1a = vec_ld(0, srcbis);
+//  srcbis += srcStride;
+  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
   const vector unsigned char srcM1b = vec_ld(16, srcbis);
   const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
-  srcbis += srcStride;
-  const vector unsigned char srcP0a = vec_ld(0, srcbis);
+//  srcbis += srcStride;
+  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
   const vector unsigned char srcP0b = vec_ld(16, srcbis);
   const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
-  srcbis += srcStride;
-  const vector unsigned char srcP1a = vec_ld(0, srcbis);
+//  srcbis += srcStride;
+  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
   const vector unsigned char srcP1b = vec_ld(16, srcbis);
   const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
-  srcbis += srcStride;
-  const vector unsigned char srcP2a = vec_ld(0, srcbis);
+//  srcbis += srcStride;
+  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
   const vector unsigned char srcP2b = vec_ld(16, srcbis);
   const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
-  srcbis += srcStride;
-
-  vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
-  vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
-  vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
-  vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
-  vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
-  vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
-  vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
-  vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
-  vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
-  vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
+//  srcbis += srcStride;
+
+  vector signed short srcM2ssA = (vector signed short)
+                                vec_mergeh((vector unsigned char)vzero, srcM2);
+  vector signed short srcM2ssB = (vector signed short)
+                                vec_mergel((vector unsigned char)vzero, srcM2);
+  vector signed short srcM1ssA = (vector signed short)
+                                vec_mergeh((vector unsigned char)vzero, srcM1);
+  vector signed short srcM1ssB = (vector signed short)
+                                vec_mergel((vector unsigned char)vzero, srcM1);
+  vector signed short srcP0ssA = (vector signed short)
+                                vec_mergeh((vector unsigned char)vzero, srcP0);
+  vector signed short srcP0ssB = (vector signed short)
+                                vec_mergel((vector unsigned char)vzero, srcP0);
+  vector signed short srcP1ssA = (vector signed short)
+                                vec_mergeh((vector unsigned char)vzero, srcP1);
+  vector signed short srcP1ssB = (vector signed short)
+                                vec_mergel((vector unsigned char)vzero, srcP1);
+  vector signed short srcP2ssA = (vector signed short)
+                                vec_mergeh((vector unsigned char)vzero, srcP2);
+  vector signed short srcP2ssB = (vector signed short)
+                                vec_mergel((vector unsigned char)vzero, srcP2);
+
+  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+                      psumA, psumB, sumA, sumB,
+                      srcP3ssA, srcP3ssB,
+                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
+
+  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
+                       srcP3a, srcP3b, srcP3;
+
+  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
 
   for (i = 0 ; i < 16 ; i++) {
-    const vector unsigned char srcP3a = vec_ld(0, srcbis);
-    const vector unsigned char srcP3b = vec_ld(16, srcbis);
-    const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
-    const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
-    const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
-    srcbis += srcStride;
-
-    const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
-    const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
-    const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
-    const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
-    const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
-    const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);
+    srcP3a = vec_ld(0, srcbis += srcStride);
+    srcP3b = vec_ld(16, srcbis);
+    srcP3 = vec_perm(srcP3a, srcP3b, perm);
+    srcP3ssA = (vector signed short)
+                                vec_mergeh((vector unsigned char)vzero, srcP3);
+    srcP3ssB = (vector signed short)
+                                vec_mergel((vector unsigned char)vzero, srcP3);
+//    srcbis += srcStride;
+
+    sum1A = vec_adds(srcP0ssA, srcP1ssA);
+    sum1B = vec_adds(srcP0ssB, srcP1ssB);
+    sum2A = vec_adds(srcM1ssA, srcP2ssA);
+    sum2B = vec_adds(srcM1ssB, srcP2ssB);
+    sum3A = vec_adds(srcM2ssA, srcP3ssA);
+    sum3B = vec_adds(srcM2ssB, srcP3ssB);
 
     srcM2ssA = srcM1ssA;
     srcM2ssB = srcM1ssB;
@@ -373,33 +420,32 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
     srcP2ssA = srcP3ssA;
     srcP2ssB = srcP3ssB;
 
-    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
-    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
+    pp1A = vec_mladd(sum1A, v20ss, v16ss);
+    pp1B = vec_mladd(sum1B, v20ss, v16ss);
 
-    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
-    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
+    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
 
-    const vector signed short pp3A = vec_add(sum3A, pp1A);
-    const vector signed short pp3B = vec_add(sum3B, pp1B);
+    pp3A = vec_add(sum3A, pp1A);
+    pp3B = vec_add(sum3B, pp1B);
 
-    const vector signed short psumA = vec_sub(pp3A, pp2A);
-    const vector signed short psumB = vec_sub(pp3B, pp2B);
+    psumA = vec_sub(pp3A, pp2A);
+    psumB = vec_sub(pp3B, pp2B);
 
-    const vector signed short sumA = vec_sra(psumA, v5us);
-    const vector signed short sumB = vec_sra(psumB, v5us);
+    sumA = vec_sra(psumA, v5us);
+    sumB = vec_sra(psumB, v5us);
 
-    const vector unsigned char sum = vec_packsu(sumA, sumB);
+    sum = vec_packsu(sumA, sumB);
 
-    const vector unsigned char dst1 = vec_ld(0, dst);
-    const vector unsigned char dst2 = vec_ld(16, dst);
-    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
+    dst1 = vec_ld(0, dst);
+    dst2 = vec_ld(16, dst);
+    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
 
-    vector unsigned char fsum;
     OP_U8_ALTIVEC(fsum, sum, vdst);
 
-    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
-    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
-    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
+    rsum = vec_perm(fsum, fsum, dstperm);
+    fdst1 = vec_sel(dst1, rsum, dstmask);
+    fdst2 = vec_sel(rsum, dst2, dstmask);
 
     vec_st(fdst1, 0, dst);
     vec_st(fdst2, 16, dst);
@@ -412,7 +458,6 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
   POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
-  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
   register int i;
   const vector signed int vzero = vec_splat_s32(0);
   const vector unsigned char permM2 = vec_lvsl(-2, src);
@@ -421,17 +466,47 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
   const vector unsigned char permP1 = vec_lvsl(+1, src);
   const vector unsigned char permP2 = vec_lvsl(+2, src);
   const vector unsigned char permP3 = vec_lvsl(+3, src);
-  const vector signed short v20ss = (const vector signed short)AVV(20);
+  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
   const vector unsigned int v10ui = vec_splat_u32(10);
   const vector signed short v5ss = vec_splat_s16(5);
   const vector signed short v1ss = vec_splat_s16(1);
-  const vector signed int v512si = (const vector signed int)AVV(512);
-  const vector unsigned int v16ui = (const vector unsigned int)AVV(16);
+  const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
+  const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
 
   register int align = ((((unsigned long)src) - 2) % 16);
 
-  src -= (2 * srcStride);
+  const vector unsigned char neg1 = (const vector unsigned char)
+                                                        vec_splat_s8(-1);
+
+  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
+                      srcP2A, srcP2B, srcP3A, srcP3B,
+                      srcM1A, srcM1B, srcM2A, srcM2B,
+                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
+                      pp1A, pp1B, pp2A, pp2B, psumA, psumB;
+
+  const vector unsigned char dstperm = vec_lvsr(0, dst);
+
+  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
+
+  const vector unsigned char mperm = (const vector unsigned char)
+    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
+        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
+  int16_t *tmpbis = tmp;
+
+  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
+                      tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
+                      tmpP2ssA, tmpP2ssB;
+
+  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
+                    pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
+                    pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
+                    ssumAe, ssumAo, ssumBe, ssumBo;
+  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
+                       rsum, fdst1, fdst2;
+  vector signed short ssume, ssumo;
 
+  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
+  src -= (2 * srcStride);
   for (i = 0 ; i < 21 ; i ++) {
     vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
     vector unsigned char srcR1 = vec_ld(-2, src);
@@ -492,36 +567,48 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
     } break;
     }
 
-    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
-    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
-    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
-    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
-
-    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
-    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
-    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
-    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
-
-    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
-    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
-    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
-    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
-
-    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
-    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
-    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
-    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
-    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
-    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
-
-    const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
-    const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);
-
-    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
-    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
-
-    const vector signed short psumA = vec_sub(pp1A, pp2A);
-    const vector signed short psumB = vec_sub(pp1B, pp2B);
+    srcP0A = (vector signed short)
+                            vec_mergeh((vector unsigned char)vzero, srcP0);
+    srcP0B = (vector signed short)
+                            vec_mergel((vector unsigned char)vzero, srcP0);
+    srcP1A = (vector signed short)
+                            vec_mergeh((vector unsigned char)vzero, srcP1);
+    srcP1B = (vector signed short)
+                            vec_mergel((vector unsigned char)vzero, srcP1);
+
+    srcP2A = (vector signed short)
+                            vec_mergeh((vector unsigned char)vzero, srcP2);
+    srcP2B = (vector signed short)
+                            vec_mergel((vector unsigned char)vzero, srcP2);
+    srcP3A = (vector signed short)
+                            vec_mergeh((vector unsigned char)vzero, srcP3);
+    srcP3B = (vector signed short)
+                            vec_mergel((vector unsigned char)vzero, srcP3);
+
+    srcM1A = (vector signed short)
+                            vec_mergeh((vector unsigned char)vzero, srcM1);
+    srcM1B = (vector signed short)
+                            vec_mergel((vector unsigned char)vzero, srcM1);
+    srcM2A = (vector signed short)
+                            vec_mergeh((vector unsigned char)vzero, srcM2);
+    srcM2B = (vector signed short)
+                            vec_mergel((vector unsigned char)vzero, srcM2);
+
+    sum1A = vec_adds(srcP0A, srcP1A);
+    sum1B = vec_adds(srcP0B, srcP1B);
+    sum2A = vec_adds(srcM1A, srcP2A);
+    sum2B = vec_adds(srcM1B, srcP2B);
+    sum3A = vec_adds(srcM2A, srcP3A);
+    sum3B = vec_adds(srcM2B, srcP3B);
+
+    pp1A = vec_mladd(sum1A, v20ss, sum3A);
+    pp1B = vec_mladd(sum1B, v20ss, sum3B);
+
+    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
+    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+
+    psumA = vec_sub(pp1A, pp2A);
+    psumB = vec_sub(pp1B, pp2B);
 
     vec_st(psumA, 0, tmp);
     vec_st(psumB, 16, tmp);
@@ -530,35 +617,25 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
     tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
   }
 
-  const vector unsigned char dstperm = vec_lvsr(0, dst);
-  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
-  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
-  const vector unsigned char mperm = (const vector unsigned char)
-    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
-        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
-
-  int16_t *tmpbis = tmp - (tmpStride * 21);
-
-  vector signed short tmpM2ssA = vec_ld(0, tmpbis);
-  vector signed short tmpM2ssB = vec_ld(16, tmpbis);
+  tmpM2ssA = vec_ld(0, tmpbis);
+  tmpM2ssB = vec_ld(16, tmpbis);
   tmpbis += tmpStride;
-  vector signed short tmpM1ssA = vec_ld(0, tmpbis);
-  vector signed short tmpM1ssB = vec_ld(16, tmpbis);
+  tmpM1ssA = vec_ld(0, tmpbis);
+  tmpM1ssB = vec_ld(16, tmpbis);
   tmpbis += tmpStride;
-  vector signed short tmpP0ssA = vec_ld(0, tmpbis);
-  vector signed short tmpP0ssB = vec_ld(16, tmpbis);
+  tmpP0ssA = vec_ld(0, tmpbis);
+  tmpP0ssB = vec_ld(16, tmpbis);
   tmpbis += tmpStride;
-  vector signed short tmpP1ssA = vec_ld(0, tmpbis);
-  vector signed short tmpP1ssB = vec_ld(16, tmpbis);
+  tmpP1ssA = vec_ld(0, tmpbis);
+  tmpP1ssB = vec_ld(16, tmpbis);
   tmpbis += tmpStride;
-  vector signed short tmpP2ssA = vec_ld(0, tmpbis);
-  vector signed short tmpP2ssB = vec_ld(16, tmpbis);
+  tmpP2ssA = vec_ld(0, tmpbis);
+  tmpP2ssB = vec_ld(16, tmpbis);
   tmpbis += tmpStride;
 
   for (i = 0 ; i < 16 ; i++) {
     const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
     const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
-    tmpbis += tmpStride;
 
     const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
     const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
@@ -567,6 +644,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
     const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
     const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
 
+    tmpbis += tmpStride;
+
     tmpM2ssA = tmpM1ssA;
     tmpM2ssB = tmpM1ssB;
     tmpM1ssA = tmpP0ssA;
@@ -578,57 +657,56 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
     tmpP2ssA = tmpP3ssA;
     tmpP2ssB = tmpP3ssB;
 
-    const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
-    const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
-    const vector signed int pp1Be = vec_mule(sum1B, v20ss);
-    const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);
+    pp1Ae = vec_mule(sum1A, v20ss);
+    pp1Ao = vec_mulo(sum1A, v20ss);
+    pp1Be = vec_mule(sum1B, v20ss);
+    pp1Bo = vec_mulo(sum1B, v20ss);
 
-    const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
-    const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
-    const vector signed int pp2Be = vec_mule(sum2B, v5ss);
-    const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);
+    pp2Ae = vec_mule(sum2A, v5ss);
+    pp2Ao = vec_mulo(sum2A, v5ss);
+    pp2Be = vec_mule(sum2B, v5ss);
+    pp2Bo = vec_mulo(sum2B, v5ss);
 
-    const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
-    const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
-    const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
-    const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);
+    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
+    pp3Ao = vec_mulo(sum3A, v1ss);
+    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
+    pp3Bo = vec_mulo(sum3B, v1ss);
 
-    const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
-    const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
-    const vector signed int pp1cBe = vec_add(pp1Be, v512si);
-    const vector signed int pp1cBo = vec_add(pp1Bo, v512si);
+    pp1cAe = vec_add(pp1Ae, v512si);
+    pp1cAo = vec_add(pp1Ao, v512si);
+    pp1cBe = vec_add(pp1Be, v512si);
+    pp1cBo = vec_add(pp1Bo, v512si);
 
-    const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
-    const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
-    const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
-    const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);
+    pp32Ae = vec_sub(pp3Ae, pp2Ae);
+    pp32Ao = vec_sub(pp3Ao, pp2Ao);
+    pp32Be = vec_sub(pp3Be, pp2Be);
+    pp32Bo = vec_sub(pp3Bo, pp2Bo);
 
-    const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
-    const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
-    const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
-    const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
+    sumAe = vec_add(pp1cAe, pp32Ae);
+    sumAo = vec_add(pp1cAo, pp32Ao);
+    sumBe = vec_add(pp1cBe, pp32Be);
+    sumBo = vec_add(pp1cBo, pp32Bo);
 
-    const vector signed int ssumAe = vec_sra(sumAe, v10ui);
-    const vector signed int ssumAo = vec_sra(sumAo, v10ui);
-    const vector signed int ssumBe = vec_sra(sumBe, v10ui);
-    const vector signed int ssumBo = vec_sra(sumBo, v10ui);
+    ssumAe = vec_sra(sumAe, v10ui);
+    ssumAo = vec_sra(sumAo, v10ui);
+    ssumBe = vec_sra(sumBe, v10ui);
+    ssumBo = vec_sra(sumBo, v10ui);
 
-    const vector signed short ssume = vec_packs(ssumAe, ssumBe);
-    const vector signed short ssumo = vec_packs(ssumAo, ssumBo);
+    ssume = vec_packs(ssumAe, ssumBe);
+    ssumo = vec_packs(ssumAo, ssumBo);
 
-    const vector unsigned char sumv = vec_packsu(ssume, ssumo);
-    const vector unsigned char sum = vec_perm(sumv, sumv, mperm);
+    sumv = vec_packsu(ssume, ssumo);
+    sum = vec_perm(sumv, sumv, mperm);
 
-    const vector unsigned char dst1 = vec_ld(0, dst);
-    const vector unsigned char dst2 = vec_ld(16, dst);
-    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
+    dst1 = vec_ld(0, dst);
+    dst2 = vec_ld(16, dst);
+    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
 
-    vector unsigned char fsum;
     OP_U8_ALTIVEC(fsum, sum, vdst);
 
-    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
-    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
-    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
+    rsum = vec_perm(fsum, fsum, dstperm);
+    fdst1 = vec_sel(dst1, rsum, dstmask);
+    fdst2 = vec_sel(rsum, dst2, dstmask);
 
     vec_st(fdst1, 0, dst);
     vec_st(fdst2, 16, dst);
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
index d5f55b80f..b63c8dd84 100644
--- a/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
+++ b/src/libffmpeg/libavcodec/ppc/dsputil_ppc.c
@@ -30,6 +30,17 @@ extern void fdct_altivec(int16_t *block);
 extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
 extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
 
+extern void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width);
+extern void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1,
+                                                DWTELEM *b2, DWTELEM *b3,
+                                                DWTELEM *b4, DWTELEM *b5,
+                                                int width);
+extern void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
+                                          uint8_t * * block, int b_w, int b_h,
+                                          int src_x, int src_y, int src_stride,
+                                          slice_buffer * sb, int add,
+                                          uint8_t * dst8);
+
 int mm_flags = 0;
 
 int mm_support(void)
@@ -292,10 +303,13 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
 
         c->gmc1 = gmc1_altivec;
 
-#ifdef CONFIG_DARWIN // ATM gcc-3.3 and gcc-3.4 fail to compile these in linux...
         c->hadamard8_diff[0] = hadamard8_diff16_altivec;
         c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
-#endif
+
+
+        c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
+        c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
+        c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
 
 #ifdef CONFIG_ENCODERS
         if (avctx->dct_algo == FF_DCT_AUTO ||
diff --git a/src/libffmpeg/libavcodec/ppc/gcc_fixes.h b/src/libffmpeg/libavcodec/ppc/gcc_fixes.h
index 288fdf834..943905bc5 100644
--- a/src/libffmpeg/libavcodec/ppc/gcc_fixes.h
+++ b/src/libffmpeg/libavcodec/ppc/gcc_fixes.h
@@ -17,8 +17,17 @@
 # else
 #  define AVV
 # endif
+#define REG_v(a) asm ( #a )
 #else
+
 #define AVV(x...) {x}
+
+#if (__GNUC__ < 4)
+# define REG_v(a)
+#else
+# define REG_v(a) asm ( #a )
+#endif
+
 #if (__GNUC__ * 100 + __GNUC_MINOR__ < 303)
 
 /* This code was provided to me by Bartosch Pixa
diff --git a/src/libffmpeg/libavcodec/qdm2.c b/src/libffmpeg/libavcodec/qdm2.c
index 98bec5cca..81d548386 100644
--- a/src/libffmpeg/libavcodec/qdm2.c
+++ b/src/libffmpeg/libavcodec/qdm2.c
@@ -538,7 +538,7 @@ static void fix_coding_method_array (int sb, int channels, sb_int8_array coding_
                 run = 1;
                 case_val = 8;
             } else {
-                switch (switchtable[coding_method[ch][sb][j]]) {
+                switch (switchtable[coding_method[ch][sb][j]-8]) {
                     case 0: run = 10; case_val = 10; break;
                     case 1: run = 1; case_val = 16; break;
                     case 2: run = 5; case_val = 24; break;
@@ -1470,13 +1470,13 @@ static void qdm2_decode_fft_packets (QDM2Context *q)
             if (duration >= 0 && duration < 4)
                 qdm2_fft_decode_tones(q, duration, &gb, unknown_flag);
         } else if (type == 31) {
-            for (i=0; i < 4; i++)
-                qdm2_fft_decode_tones(q, i, &gb, unknown_flag);
+            for (j=0; j < 4; j++)
+                qdm2_fft_decode_tones(q, j, &gb, unknown_flag);
         } else if (type == 46) {
-            for (i=0; i < 6; i++)
-                q->fft_level_exp[i] = get_bits(&gb, 6);
-            for (i=0; i < 4; i++)
-            qdm2_fft_decode_tones(q, i, &gb, unknown_flag);
+            for (j=0; j < 6; j++)
+                q->fft_level_exp[j] = get_bits(&gb, 6);
+            for (j=0; j < 4; j++)
+            qdm2_fft_decode_tones(q, j, &gb, unknown_flag);
         }
     } // Loop on B packets
 
@@ -2008,8 +2008,10 @@ static int qdm2_decode_frame(AVCodecContext *avctx,
 {
     QDM2Context *s = avctx->priv_data;
 
-    if((buf == NULL) || (buf_size < s->checksum_size))
+    if(!buf)
         return 0;
+    if(buf_size < s->checksum_size)
+        return -1;
 
     *data_size = s->channels * s->frame_size * sizeof(int16_t);
 
diff --git a/src/libffmpeg/libavcodec/qdm2data.h b/src/libffmpeg/libavcodec/qdm2data.h
index f41a2078b..dafd4f490 100644
--- a/src/libffmpeg/libavcodec/qdm2data.h
+++ b/src/libffmpeg/libavcodec/qdm2data.h
@@ -32,18 +32,18 @@
 /** VLC TABLES **/
 
 /* values in this table range from -1..23; adjust retrieved value by -1 */
-static uint16_t vlc_tab_level_huffcodes[24] = {
+static const uint16_t vlc_tab_level_huffcodes[24] = {
     0x037c, 0x0004, 0x003c, 0x004c, 0x003a, 0x002c, 0x001c, 0x001a,
     0x0024, 0x0014, 0x0001, 0x0002, 0x0000, 0x0003, 0x0007, 0x0005,
     0x0006, 0x0008, 0x0009, 0x000a, 0x000c, 0x00fc, 0x007c, 0x017c
 };
 
-static uint8_t vlc_tab_level_huffbits[24] = {
+static const uint8_t vlc_tab_level_huffbits[24] = {
     10, 6, 7, 7, 6, 6, 6, 6, 6, 5, 4, 4, 4, 3, 3, 3, 3, 4, 4, 5, 7, 8, 9, 10
 };
 
 /* values in this table range from -1..36; adjust retrieved value by -1 */
-static uint16_t vlc_tab_diff_huffcodes[37] = {
+static const uint16_t vlc_tab_diff_huffcodes[37] = {
     0x1c57, 0x0004, 0x0000, 0x0001, 0x0003, 0x0002, 0x000f, 0x000e,
     0x0007, 0x0016, 0x0037, 0x0027, 0x0026, 0x0066, 0x0006, 0x0097,
     0x0046, 0x01c6, 0x0017, 0x0786, 0x0086, 0x0257, 0x00d7, 0x0357,
@@ -51,111 +51,111 @@ static uint16_t vlc_tab_diff_huffcodes[37] = {
     0x0b86, 0x0000, 0x1457, 0x0000, 0x0457
 };
 
-static uint8_t vlc_tab_diff_huffbits[37] = {
+static const uint8_t vlc_tab_diff_huffbits[37] = {
     13, 3, 3, 2, 3, 3, 4, 4, 6, 5, 6, 6, 7, 7, 8, 8,
     8, 9, 8, 11, 9, 10, 8, 10, 9, 12, 10, 0, 10, 13, 11, 0,
     12, 0, 13, 0, 13
 };
 
 /* values in this table range from -1..5; adjust retrieved value by -1 */
-static uint8_t vlc_tab_run_huffcodes[6] = {
+static const uint8_t vlc_tab_run_huffcodes[6] = {
     0x1f, 0x00, 0x01, 0x03, 0x07, 0x0f
 };
 
-static uint8_t vlc_tab_run_huffbits[6] = {
+static const uint8_t vlc_tab_run_huffbits[6] = {
     5, 1, 2, 3, 4, 5
 };
 
 /* values in this table range from -1..19; adjust retrieved value by -1 */
-static uint16_t vlc_tab_tone_level_idx_hi1_huffcodes[20] = {
+static const uint16_t vlc_tab_tone_level_idx_hi1_huffcodes[20] = {
     0x5714, 0x000c, 0x0002, 0x0001, 0x0000, 0x0004, 0x0034, 0x0054,
     0x0094, 0x0014, 0x0114, 0x0214, 0x0314, 0x0614, 0x0e14, 0x0f14,
     0x2714, 0x0714, 0x1714, 0x3714
 };
 
-static uint8_t vlc_tab_tone_level_idx_hi1_huffbits[20] = {
+static const uint8_t vlc_tab_tone_level_idx_hi1_huffbits[20] = {
     15, 4, 2, 1, 3, 5, 6, 7, 8, 10, 10, 11, 11, 12, 12, 12, 14, 14, 15, 14
 };
 
 /* values in this table range from -1..23; adjust retrieved value by -1 */
-static uint16_t vlc_tab_tone_level_idx_mid_huffcodes[24] = {
+static const uint16_t vlc_tab_tone_level_idx_mid_huffcodes[24] = {
     0x0fea, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
     0x0000, 0x0000, 0x0000, 0x0000, 0x03ea, 0x00ea, 0x002a, 0x001a,
     0x0006, 0x0001, 0x0000, 0x0002, 0x000a, 0x006a, 0x01ea, 0x07ea
 };
 
-static uint8_t vlc_tab_tone_level_idx_mid_huffbits[24] = {
+static const uint8_t vlc_tab_tone_level_idx_mid_huffbits[24] = {
     12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 9, 7, 5, 3, 1, 2, 4, 6, 8, 10, 12
 };
 
 /* values in this table range from -1..23; adjust retrieved value by -1 */
-static uint16_t vlc_tab_tone_level_idx_hi2_huffcodes[24] = {
+static const uint16_t vlc_tab_tone_level_idx_hi2_huffcodes[24] = {
     0x0664, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0064, 0x00e4,
     0x00a4, 0x0068, 0x0004, 0x0008, 0x0014, 0x0018, 0x0000, 0x0001,
     0x0002, 0x0003, 0x000c, 0x0028, 0x0024, 0x0164, 0x0000, 0x0264
 };
 
-static uint8_t vlc_tab_tone_level_idx_hi2_huffbits[24] = {
+static const uint8_t vlc_tab_tone_level_idx_hi2_huffbits[24] = {
     11, 0, 0, 0, 0, 0, 10, 8, 8, 7, 6, 6, 5, 5, 4, 2, 2, 2, 4, 7, 8, 9, 0, 11
 };
 
 /* values in this table range from -1..8; adjust retrieved value by -1 */
-static uint8_t vlc_tab_type30_huffcodes[9] = {
+static const uint8_t vlc_tab_type30_huffcodes[9] = {
     0x3c, 0x06, 0x00, 0x01, 0x03, 0x02, 0x04, 0x0c, 0x1c
 };
 
-static uint8_t vlc_tab_type30_huffbits[9] = {
+static const uint8_t vlc_tab_type30_huffbits[9] = {
     6, 3, 3, 2, 2, 3, 4, 5, 6
 };
 
 /* values in this table range from -1..9; adjust retrieved value by -1 */
-static uint8_t vlc_tab_type34_huffcodes[10] = {
+static const uint8_t vlc_tab_type34_huffcodes[10] = {
     0x18, 0x00, 0x01, 0x04, 0x05, 0x07, 0x03, 0x02, 0x06, 0x08
 };
 
-static uint8_t vlc_tab_type34_huffbits[10] = {
+static const uint8_t vlc_tab_type34_huffbits[10] = {
     5, 4, 3, 3, 3, 3, 3, 3, 3, 5
 };
 
 /* values in this table range from -1..22; adjust retrieved value by -1 */
-static uint16_t vlc_tab_fft_tone_offset_0_huffcodes[23] = {
+static const uint16_t vlc_tab_fft_tone_offset_0_huffcodes[23] = {
     0x038e, 0x0001, 0x0000, 0x0022, 0x000a, 0x0006, 0x0012, 0x0002,
     0x001e, 0x003e, 0x0056, 0x0016, 0x000e, 0x0032, 0x0072, 0x0042,
     0x008e, 0x004e, 0x00f2, 0x002e, 0x0036, 0x00c2, 0x018e
 };
 
-static uint8_t vlc_tab_fft_tone_offset_0_huffbits[23] = {
+static const uint8_t vlc_tab_fft_tone_offset_0_huffbits[23] = {
     10, 1, 2, 6, 4, 5, 6, 7, 6, 6, 7, 7, 8, 7, 8, 8, 9, 7, 8, 6, 6, 8, 10
 };
 
 /* values in this table range from -1..27; adjust retrieved value by -1 */
-static uint16_t vlc_tab_fft_tone_offset_1_huffcodes[28] = {
+static const uint16_t vlc_tab_fft_tone_offset_1_huffcodes[28] = {
     0x07a4, 0x0001, 0x0020, 0x0012, 0x001c, 0x0008, 0x0006, 0x0010,
     0x0000, 0x0014, 0x0004, 0x0032, 0x0070, 0x000c, 0x0002, 0x003a,
     0x001a, 0x002c, 0x002a, 0x0022, 0x0024, 0x000a, 0x0064, 0x0030,
     0x0062, 0x00a4, 0x01a4, 0x03a4
 };
 
-static uint8_t vlc_tab_fft_tone_offset_1_huffbits[28] = {
+static const uint8_t vlc_tab_fft_tone_offset_1_huffbits[28] = {
     11, 1, 6, 6, 5, 4, 3, 6, 6, 5, 6, 6, 7, 6, 6, 6,
     6, 6, 6, 7, 8, 6, 7, 7, 7, 9, 10, 11
 };
 
 /* values in this table range from -1..31; adjust retrieved value by -1 */
-static uint16_t vlc_tab_fft_tone_offset_2_huffcodes[32] = {
+static const uint16_t vlc_tab_fft_tone_offset_2_huffcodes[32] = {
     0x1760, 0x0001, 0x0000, 0x0082, 0x000c, 0x0006, 0x0003, 0x0007,
     0x0008, 0x0004, 0x0010, 0x0012, 0x0022, 0x001a, 0x0000, 0x0020,
     0x000a, 0x0040, 0x004a, 0x006a, 0x002a, 0x0042, 0x0002, 0x0060,
     0x00aa, 0x00e0, 0x00c2, 0x01c2, 0x0160, 0x0360, 0x0760, 0x0f60
 };
 
-static uint8_t vlc_tab_fft_tone_offset_2_huffbits[32] = {
+static const uint8_t vlc_tab_fft_tone_offset_2_huffbits[32] = {
     13, 2, 0, 8, 4, 3, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,
     7, 7, 7, 7, 8, 8, 8, 9, 8, 8, 9, 9, 10, 11, 13, 12
 };
 
 /* values in this table range from -1..34; adjust retrieved value by -1 */
-static uint16_t vlc_tab_fft_tone_offset_3_huffcodes[35] = {
+static const uint16_t vlc_tab_fft_tone_offset_3_huffcodes[35] = {
     0x33ea, 0x0005, 0x0000, 0x000c, 0x0000, 0x0006, 0x0003, 0x0008,
     0x0002, 0x0001, 0x0004, 0x0007, 0x001a, 0x000f, 0x001c, 0x002c,
     0x000a, 0x001d, 0x002d, 0x002a, 0x000d, 0x004c, 0x008c, 0x006a,
@@ -163,14 +163,14 @@ static uint16_t vlc_tab_fft_tone_offset_3_huffcodes[35] = {
     0x0bea, 0x03ea, 0x13ea
 };
 
-static uint8_t vlc_tab_fft_tone_offset_3_huffbits[35] = {
+static const uint8_t vlc_tab_fft_tone_offset_3_huffbits[35] = {
     14, 4, 0, 10, 4, 3, 3, 4, 4, 3, 4, 4, 5, 4, 5, 6,
     6, 5, 6, 7, 7, 7, 8, 8, 8, 8, 9, 10, 10, 10, 10, 11,
     12, 13, 14
 };
 
 /* values in this table range from -1..37; adjust retrieved value by -1 */
-static uint16_t vlc_tab_fft_tone_offset_4_huffcodes[38] = {
+static const uint16_t vlc_tab_fft_tone_offset_4_huffcodes[38] = {
     0x5282, 0x0016, 0x0000, 0x0136, 0x0004, 0x0000, 0x0007, 0x000a,
     0x000e, 0x0003, 0x0001, 0x000d, 0x0006, 0x0009, 0x0012, 0x0005,
     0x0025, 0x0022, 0x0015, 0x0002, 0x0076, 0x0035, 0x0042, 0x00c2,
@@ -178,7 +178,7 @@ static uint16_t vlc_tab_fft_tone_offset_4_huffcodes[38] = {
     0x0a82, 0x0082, 0x0282, 0x1282, 0x3282, 0x2282
 };
 
-static uint8_t vlc_tab_fft_tone_offset_4_huffbits[38] = {
+static const uint8_t vlc_tab_fft_tone_offset_4_huffbits[38] = {
     15, 6, 0, 9, 3, 3, 3, 4, 4, 3, 4, 4, 5, 4, 5, 6,
     6, 6, 6, 8, 7, 6, 8, 9, 9, 8, 9, 10, 11, 10, 11, 12,
     12, 12, 14, 15, 14, 14
@@ -187,44 +187,44 @@ static uint8_t vlc_tab_fft_tone_offset_4_huffbits[38] = {
 /** FFT TABLES **/
 
 /* values in this table range from -1..27; adjust retrieved value by -1 */
-static uint16_t fft_level_exp_alt_huffcodes[28] = {
+static const uint16_t fft_level_exp_alt_huffcodes[28] = {
     0x1ec6, 0x0006, 0x00c2, 0x0142, 0x0242, 0x0246, 0x00c6, 0x0046,
     0x0042, 0x0146, 0x00a2, 0x0062, 0x0026, 0x0016, 0x000e, 0x0005,
     0x0004, 0x0003, 0x0000, 0x0001, 0x000a, 0x0012, 0x0002, 0x0022,
     0x01c6, 0x02c6, 0x06c6, 0x0ec6
 };
 
-static uint8_t fft_level_exp_alt_huffbits[28] = {
+static const uint8_t fft_level_exp_alt_huffbits[28] = {
     13, 7, 8, 9, 10, 10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3,
     3, 2, 3, 3, 4, 5, 7, 8, 9, 11, 12, 13
 };
 
 /* values in this table range from -1..19; adjust retrieved value by -1 */
-static uint16_t fft_level_exp_huffcodes[20] = {
+static const uint16_t fft_level_exp_huffcodes[20] = {
     0x0f24, 0x0001, 0x0002, 0x0000, 0x0006, 0x0005, 0x0007, 0x000c,
     0x000b, 0x0014, 0x0013, 0x0004, 0x0003, 0x0023, 0x0064, 0x00a4,
     0x0024, 0x0124, 0x0324, 0x0724
 };
 
-static uint8_t fft_level_exp_huffbits[20] = {
+static const uint8_t fft_level_exp_huffbits[20] = {
     12, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 6, 7, 8, 9, 10, 11, 12
 };
 
 /* values in this table range from -1..6; adjust retrieved value by -1 */
-static uint8_t fft_stereo_exp_huffcodes[7] = {
+static const uint8_t fft_stereo_exp_huffcodes[7] = {
     0x3e, 0x01, 0x00, 0x02, 0x06, 0x0e, 0x1e
 };
 
-static uint8_t fft_stereo_exp_huffbits[7] = {
+static const uint8_t fft_stereo_exp_huffbits[7] = {
     6, 1, 2, 3, 4, 5, 6
 };
 
 /* values in this table range from -1..8; adjust retrieved value by -1 */
-static uint8_t fft_stereo_phase_huffcodes[9] = {
+static const uint8_t fft_stereo_phase_huffcodes[9] = {
     0x35, 0x02, 0x00, 0x01, 0x0d, 0x15, 0x05, 0x09, 0x03
 };
 
-static uint8_t fft_stereo_phase_huffbits[9] = {
+static const uint8_t fft_stereo_phase_huffbits[9] = {
     6, 2, 2, 4, 4, 6, 5, 4, 2
 };
 
diff --git a/src/libffmpeg/libavcodec/rangecoder.c b/src/libffmpeg/libavcodec/rangecoder.c
index 8607b8f6d..4266cf1b3 100644
--- a/src/libffmpeg/libavcodec/rangecoder.c
+++ b/src/libffmpeg/libavcodec/rangecoder.c
@@ -110,7 +110,7 @@ void ff_build_rac_states(RangeCoder *c, int factor, int max_p){
         c->one_state[    i]=     p8;
     }
 
-    for(i=0; i<256; i++)
+    for(i=1; i<255; i++)
         c->zero_state[i]= 256-c->one_state[256-i];
 #if 0
     for(i=0; i<256; i++)
diff --git a/src/libffmpeg/libavcodec/ratecontrol.c b/src/libffmpeg/libavcodec/ratecontrol.c
index 29dc1f495..f4f433add 100644
--- a/src/libffmpeg/libavcodec/ratecontrol.c
+++ b/src/libffmpeg/libavcodec/ratecontrol.c
@@ -117,13 +117,18 @@ int ff_rate_control_init(MpegEncContext *s)
 
             p= next;
         }
-#ifdef CONFIG_XVID
+
+        if(init_pass2(s) < 0) return -1;
+
         //FIXME maybe move to end
-        if((s->flags&CODEC_FLAG_PASS2) && s->avctx->rc_strategy == FF_RC_STRATEGY_XVID)
+        if((s->flags&CODEC_FLAG_PASS2) && s->avctx->rc_strategy == FF_RC_STRATEGY_XVID) {
+#ifdef CONFIG_XVID
             return ff_xvid_rate_control_init(s);
+#else
+            av_log(s->avctx, AV_LOG_ERROR, "XviD ratecontrol requires libavcodec compiled with XviD support\n");
+            return -1;
 #endif
-
-        if(init_pass2(s) < 0) return -1;
+        }
     }
 
     if(!(s->flags&CODEC_FLAG_PASS2)){
@@ -906,7 +911,7 @@ static int init_pass2(MpegEncContext *s)
     av_free(qscale);
     av_free(blured_qscale);
 
-    if(abs(expected_bits/all_available_bits - 1.0) > 0.01 ){
+    if(fabs(expected_bits/all_available_bits - 1.0) > 0.01 ){
         av_log(s->avctx, AV_LOG_ERROR, "Error: 2pass curve failed to converge\n");
         return -1;
     }
diff --git a/src/libffmpeg/libavcodec/raw.c b/src/libffmpeg/libavcodec/raw.c
index 28c3cad54..e777397fe 100644
--- a/src/libffmpeg/libavcodec/raw.c
+++ b/src/libffmpeg/libavcodec/raw.c
@@ -26,7 +26,6 @@
 
 typedef struct RawVideoContext {
     unsigned char * buffer;  /* block of memory for holding one frame */
-    unsigned char * p;       /* current position in buffer */
     int             length;  /* number of bytes in buffer */
     AVFrame pic;             ///< AVCodecContext.coded_frame
 } RawVideoContext;
@@ -51,6 +50,9 @@ const PixelFormatTag pixelFormatTags[] = {
     { PIX_FMT_UYVY422, MKTAG('U', 'Y', 'V', 'Y') },
     { PIX_FMT_GRAY8,   MKTAG('G', 'R', 'E', 'Y') },
 
+    /* quicktime */
+    { PIX_FMT_UYVY422, MKTAG('2', 'v', 'u', 'y') },
+
     { -1, 0 },
 };
 
@@ -86,6 +88,7 @@ static int raw_init_decoder(AVCodecContext *avctx)
         avctx->pix_fmt = findPixelFormat(avctx->codec_tag);
     else if (avctx->bits_per_sample){
         switch(avctx->bits_per_sample){
+        case  8: avctx->pix_fmt= PIX_FMT_PAL8  ; break;
         case 15: avctx->pix_fmt= PIX_FMT_RGB555; break;
         case 16: avctx->pix_fmt= PIX_FMT_RGB565; break;
         case 24: avctx->pix_fmt= PIX_FMT_BGR24 ; break;
@@ -95,7 +98,6 @@ static int raw_init_decoder(AVCodecContext *avctx)
 
     context->length = avpicture_get_size(avctx->pix_fmt, avctx->width, avctx->height);
     context->buffer = av_malloc(context->length);
-    context->p      = context->buffer;
     context->pic.pict_type = FF_I_TYPE;
     context->pic.key_frame = 1;
 
@@ -108,7 +110,7 @@ static int raw_init_decoder(AVCodecContext *avctx)
 }
 
 static void flip(AVCodecContext *avctx, AVPicture * picture){
-    if(!avctx->codec_tag && avctx->bits_per_sample && picture->linesize[1]==0){
+    if(!avctx->codec_tag && avctx->bits_per_sample && picture->linesize[2]==0){
         picture->data[0] += picture->linesize[0] * (avctx->height-1);
         picture->linesize[0] *= -1;
     }
@@ -119,7 +121,6 @@ static int raw_decode(AVCodecContext *avctx,
                             uint8_t *buf, int buf_size)
 {
     RawVideoContext *context = avctx->priv_data;
-    int bytesNeeded;
 
     AVFrame * frame = (AVFrame *) data;
     AVPicture * picture = (AVPicture *) data;
@@ -127,27 +128,21 @@ static int raw_decode(AVCodecContext *avctx,
     frame->interlaced_frame = avctx->coded_frame->interlaced_frame;
     frame->top_field_first = avctx->coded_frame->top_field_first;
 
-    /* Early out without copy if packet size == frame size */
-    if (buf_size == context->length  &&  context->p == context->buffer) {
-        avpicture_fill(picture, buf, avctx->pix_fmt, avctx->width, avctx->height);
-        flip(avctx, picture);
-        *data_size = sizeof(AVPicture);
-        return buf_size;
-    }
+    if(buf_size < context->length - (avctx->pix_fmt==PIX_FMT_PAL8 ? 256*4 : 0))
+        return -1;
 
-    bytesNeeded = context->length - (context->p - context->buffer);
-    if (buf_size < bytesNeeded) {
-        memcpy(context->p, buf, buf_size);
-        context->p += buf_size;
-        return buf_size;
+    avpicture_fill(picture, buf, avctx->pix_fmt, avctx->width, avctx->height);
+    if(avctx->pix_fmt==PIX_FMT_PAL8 && buf_size < context->length){
+        frame->data[1]= context->buffer;
+    }
+    if (avctx->palctrl && avctx->palctrl->palette_changed) {
+        memcpy(frame->data[1], avctx->palctrl->palette, AVPALETTE_SIZE);
+        avctx->palctrl->palette_changed = 0;
     }
 
-    memcpy(context->p, buf, bytesNeeded);
-    context->p = context->buffer;
-    avpicture_fill(picture, context->buffer, avctx->pix_fmt, avctx->width, avctx->height);
     flip(avctx, picture);
     *data_size = sizeof(AVPicture);
-    return bytesNeeded;
+    return buf_size;
 }
 
 static int raw_close_decoder(AVCodecContext *avctx)
diff --git a/src/libffmpeg/libavcodec/resample2.c b/src/libffmpeg/libavcodec/resample2.c
index 735f612d1..11da57651 100644
--- a/src/libffmpeg/libavcodec/resample2.c
+++ b/src/libffmpeg/libavcodec/resample2.c
@@ -62,7 +62,7 @@ typedef struct AVResampleContext{
 /**
  * 0th order modified bessel function of the first kind.
  */
-double bessel(double x){
+static double bessel(double x){
     double v=1;
     double t=1;
     int i;
diff --git a/src/libffmpeg/libavcodec/rv10.c b/src/libffmpeg/libavcodec/rv10.c
index 5dd942dc5..daec2b85b 100644
--- a/src/libffmpeg/libavcodec/rv10.c
+++ b/src/libffmpeg/libavcodec/rv10.c
@@ -321,7 +321,7 @@ static int rv10_decode_picture_header(MpegEncContext *s)
     pb_frame = get_bits(&s->gb, 1);
 
 #ifdef DEBUG
-    printf("pict_type=%d pb_frame=%d\n", s->pict_type, pb_frame);
+    av_log(s->avctx, AV_LOG_DEBUG, "pict_type=%d pb_frame=%d\n", s->pict_type, pb_frame);
 #endif
 
     if (pb_frame){
@@ -342,7 +342,7 @@ static int rv10_decode_picture_header(MpegEncContext *s)
             s->last_dc[1] = get_bits(&s->gb, 8);
             s->last_dc[2] = get_bits(&s->gb, 8);
 #ifdef DEBUG
-            printf("DC:%d %d %d\n",
+            av_log(s->avctx, AV_LOG_DEBUG, "DC:%d %d %d\n",
                    s->last_dc[0],
                    s->last_dc[1],
                    s->last_dc[2]);
@@ -631,7 +631,7 @@ static int rv10_decode_packet(AVCodecContext *avctx,
     }
 
 #ifdef DEBUG
-    printf("qscale=%d\n", s->qscale);
+    av_log(avctx, AV_LOG_DEBUG, "qscale=%d\n", s->qscale);
 #endif
 
     /* default quantization values */
@@ -639,9 +639,9 @@ static int rv10_decode_packet(AVCodecContext *avctx,
         if(s->mb_y==0) s->first_slice_line=1;
     }else{
         s->first_slice_line=1;
-        s->resync_mb_x= s->mb_x;
-        s->resync_mb_y= s->mb_y;
     }
+    s->resync_mb_x= s->mb_x;
+    s->resync_mb_y= s->mb_y;
     if(s->h263_aic){
         s->y_dc_scale_table=
         s->c_dc_scale_table= ff_aic_dc_scale_table;
@@ -672,7 +672,7 @@ static int rv10_decode_packet(AVCodecContext *avctx,
         int ret;
         ff_update_block_index(s);
 #ifdef DEBUG
-        printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
+        av_log(avctx, AV_LOG_DEBUG, "**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 #endif
 
         s->mv_dir = MV_DIR_FORWARD;
@@ -713,7 +713,7 @@ static int rv10_decode_frame(AVCodecContext *avctx,
     AVFrame *pict = data;
 
 #ifdef DEBUG
-    printf("*****frame %d size=%d\n", avctx->frame_number, buf_size);
+    av_log(avctx, AV_LOG_DEBUG, "*****frame %d size=%d\n", avctx->frame_number, buf_size);
 #endif
 
     /* no supplementary picture */
@@ -737,19 +737,20 @@ static int rv10_decode_frame(AVCodecContext *avctx,
         rv10_decode_packet(avctx, buf, buf_size);
     }
 
-    if(s->mb_y>=s->mb_height){
+    if(s->current_picture_ptr != NULL && s->mb_y>=s->mb_height){
         ff_er_frame_end(s);
         MPV_frame_end(s);
 
-        if(s->pict_type==B_TYPE || s->low_delay){
-            *pict= *(AVFrame*)&s->current_picture;
-            ff_print_debug_info(s, pict);
-        } else {
-            *pict= *(AVFrame*)&s->last_picture;
-            ff_print_debug_info(s, pict);
+        if (s->pict_type == B_TYPE || s->low_delay) {
+            *pict= *(AVFrame*)s->current_picture_ptr;
+        } else if (s->last_picture_ptr != NULL) {
+            *pict= *(AVFrame*)s->last_picture_ptr;
         }
-        if(s->last_picture_ptr || s->low_delay)
+
+        if(s->last_picture_ptr || s->low_delay){
             *data_size = sizeof(AVFrame);
+            ff_print_debug_info(s, pict);
+        }
         s->current_picture_ptr= NULL; //so we can detect if frame_end wasnt called (find some nicer solution...)
     }
 
diff --git a/src/libffmpeg/libavcodec/shorten.c b/src/libffmpeg/libavcodec/shorten.c
index 4d80d40a5..af1c3fe6e 100644
--- a/src/libffmpeg/libavcodec/shorten.c
+++ b/src/libffmpeg/libavcodec/shorten.c
@@ -106,18 +106,27 @@ static int shorten_decode_init(AVCodecContext * avctx)
     return 0;
 }
 
-static void allocate_buffers(ShortenContext *s)
+static int allocate_buffers(ShortenContext *s)
 {
     int i, chan;
     for (chan=0; chan<s->channels; chan++) {
+        if(FFMAX(1, s->nmean) >= UINT_MAX/sizeof(int32_t)){
+            av_log(s->avctx, AV_LOG_ERROR, "nmean too large\n");
+            return -1;
+        }
+        if(s->blocksize + s->nwrap >= UINT_MAX/sizeof(int32_t) || s->blocksize + s->nwrap <= (unsigned)s->nwrap){
+            av_log(s->avctx, AV_LOG_ERROR, "s->blocksize + s->nwrap too large\n");
+            return -1;
+        }
+
         s->offset[chan] = av_realloc(s->offset[chan], sizeof(int32_t)*FFMAX(1, s->nmean));
 
         s->decoded[chan] = av_realloc(s->decoded[chan], sizeof(int32_t)*(s->blocksize + s->nwrap));
         for (i=0; i<s->nwrap; i++)
             s->decoded[chan][i] = 0;
         s->decoded[chan] += s->nwrap;
-
     }
+    return 0;
 }
 
 
diff --git a/src/libffmpeg/libavcodec/snow.c b/src/libffmpeg/libavcodec/snow.c
index ad69c3241..05ad44726 100644
--- a/src/libffmpeg/libavcodec/snow.c
+++ b/src/libffmpeg/libavcodec/snow.c
@@ -19,23 +19,15 @@
 #include "avcodec.h"
 #include "common.h"
 #include "dsputil.h"
+#include "snow.h"
 
 #include "rangecoder.h"
-#define MID_STATE 128
 
 #include "mpegvideo.h"
 
 #undef NDEBUG
 #include <assert.h>
 
-#define MAX_DECOMPOSITIONS 8
-#define MAX_PLANES 4
-#define DWTELEM int
-#define QSHIFT 5
-#define QROOT (1<<QSHIFT)
-#define LOSSLESS_QLOG -128
-#define FRAC_BITS 8
-
 static const int8_t quant3[256]={
  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -181,8 +173,6 @@ static const int8_t quant13[256]={
 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 };
 
-#define LOG2_OBMC_MAX 6
-#define OBMC_MAX (1<<(LOG2_OBMC_MAX))
 #if 0 //64*cubic
 static const uint8_t obmc32[1024]={
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -240,57 +230,57 @@ static const uint8_t obmc16[256]={
 };
 #elif 1 // 64*linear
 static const uint8_t obmc32[1024]={
- 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
- 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
- 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
- 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,13,12,11,10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0,
- 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15,16,17,17,16,15,14,13,12,11,10, 8, 7, 6, 5, 4, 3, 2, 1,
- 1, 2, 3, 5, 6, 8, 9,10,12,13,14,16,17,19,20,21,21,20,19,17,16,14,13,12,10, 9, 8, 6, 5, 3, 2, 1,
- 1, 2, 4, 6, 7, 9,11,12,14,15,17,19,20,22,24,25,25,24,22,20,19,17,15,14,12,11, 9, 7, 6, 4, 2, 1,
- 1, 3, 5, 7, 8,10,12,14,16,18,20,22,23,25,27,29,29,27,25,23,22,20,18,16,14,12,10, 8, 7, 5, 3, 1,
- 1, 3, 5, 7,10,12,14,16,18,20,22,24,27,29,31,33,33,31,29,27,24,22,20,18,16,14,12,10, 7, 5, 3, 1,
- 1, 4, 6, 8,11,13,15,18,20,23,25,27,30,32,34,37,37,34,32,30,27,25,23,20,18,15,13,11, 8, 6, 4, 1,
- 1, 4, 7, 9,12,14,17,20,22,25,28,30,33,35,38,41,41,38,35,33,30,28,25,22,20,17,14,12, 9, 7, 4, 1,
- 1, 4, 7,10,13,16,19,22,24,27,30,33,36,39,42,45,45,42,39,36,33,30,27,24,22,19,16,13,10, 7, 4, 1,
- 2, 5, 8,11,14,17,20,23,27,30,33,36,39,42,45,48,48,45,42,39,36,33,30,27,23,20,17,14,11, 8, 5, 2,
- 2, 5, 8,12,15,19,22,25,29,32,35,39,42,46,49,52,52,49,46,42,39,35,32,29,25,22,19,15,12, 8, 5, 2,
- 2, 5, 9,13,16,20,24,27,31,34,38,42,45,49,53,56,56,53,49,45,42,38,34,31,27,24,20,16,13, 9, 5, 2,
- 2, 6,10,14,17,21,25,29,33,37,41,45,48,52,56,60,60,56,52,48,45,41,37,33,29,25,21,17,14,10, 6, 2,
- 2, 6,10,14,17,21,25,29,33,37,41,45,48,52,56,60,60,56,52,48,45,41,37,33,29,25,21,17,14,10, 6, 2,
- 2, 5, 9,13,16,20,24,27,31,34,38,42,45,49,53,56,56,53,49,45,42,38,34,31,27,24,20,16,13, 9, 5, 2,
- 2, 5, 8,12,15,19,22,25,29,32,35,39,42,46,49,52,52,49,46,42,39,35,32,29,25,22,19,15,12, 8, 5, 2,
- 2, 5, 8,11,14,17,20,23,27,30,33,36,39,42,45,48,48,45,42,39,36,33,30,27,23,20,17,14,11, 8, 5, 2,
- 1, 4, 7,10,13,16,19,22,24,27,30,33,36,39,42,45,45,42,39,36,33,30,27,24,22,19,16,13,10, 7, 4, 1,
- 1, 4, 7, 9,12,14,17,20,22,25,28,30,33,35,38,41,41,38,35,33,30,28,25,22,20,17,14,12, 9, 7, 4, 1,
- 1, 4, 6, 8,11,13,15,18,20,23,25,27,30,32,34,37,37,34,32,30,27,25,23,20,18,15,13,11, 8, 6, 4, 1,
- 1, 3, 5, 7,10,12,14,16,18,20,22,24,27,29,31,33,33,31,29,27,24,22,20,18,16,14,12,10, 7, 5, 3, 1,
- 1, 3, 5, 7, 8,10,12,14,16,18,20,22,23,25,27,29,29,27,25,23,22,20,18,16,14,12,10, 8, 7, 5, 3, 1,
- 1, 2, 4, 6, 7, 9,11,12,14,15,17,19,20,22,24,25,25,24,22,20,19,17,15,14,12,11, 9, 7, 6, 4, 2, 1,
- 1, 2, 3, 5, 6, 8, 9,10,12,13,14,16,17,19,20,21,21,20,19,17,16,14,13,12,10, 9, 8, 6, 5, 3, 2, 1,
- 1, 2, 3, 4, 5, 6, 7, 8,10,11,12,13,14,15,16,17,17,16,15,14,13,12,11,10, 8, 7, 6, 5, 4, 3, 2, 1,
- 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,13,12,11,10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0,
- 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9,10,10, 9, 8, 8, 7, 7, 6, 5, 5, 4, 3, 3, 2, 2, 1, 0,
- 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 0,
- 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
+  8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
+  8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
+  8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
+  4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
+  4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
+  4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
+  4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
+  4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
+  4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
+  4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
+  4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
+  0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
+  0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
+  0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
+  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
  //error:0.000020
 };
 static const uint8_t obmc16[256]={
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
- 1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
- 1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
- 2, 5, 9,12,16,19,23,26,26,23,19,16,12, 9, 5, 2,
- 2, 7,11,16,20,25,29,34,34,29,25,20,16,11, 7, 2,
- 3, 8,14,19,25,30,36,41,41,36,30,25,19,14, 8, 3,
- 3,10,16,23,29,36,42,49,49,42,36,29,23,16,10, 3,
- 4,11,19,26,34,41,49,56,56,49,41,34,26,19,11, 4,
- 4,11,19,26,34,41,49,56,56,49,41,34,26,19,11, 4,
- 3,10,16,23,29,36,42,49,49,42,36,29,23,16,10, 3,
- 3, 8,14,19,25,30,36,41,41,36,30,25,19,14, 8, 3,
- 2, 7,11,16,20,25,29,34,34,29,25,20,16,11, 7, 2,
- 2, 5, 9,12,16,19,23,26,26,23,19,16,12, 9, 5, 2,
- 1, 4, 6, 9,11,14,16,19,19,16,14,11, 9, 6, 4, 1,
- 1, 2, 4, 5, 7, 8,10,11,11,10, 8, 7, 5, 4, 2, 1,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 2, 2, 1, 1, 0,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
+ 12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
+ 12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
+  8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
+  8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
+  4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
+  4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
+  0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 //error:0.000015
 };
 #else //64*cos
@@ -352,23 +342,23 @@ static const uint8_t obmc16[256]={
 
 //linear *64
 static const uint8_t obmc8[64]={
- 1, 3, 5, 7, 7, 5, 3, 1,
- 3, 9,15,21,21,15, 9, 3,
- 5,15,25,35,35,25,15, 5,
- 7,21,35,49,49,35,21, 7,
- 7,21,35,49,49,35,21, 7,
- 5,15,25,35,35,25,15, 5,
- 3, 9,15,21,21,15, 9, 3,
- 1, 3, 5, 7, 7, 5, 3, 1,
+  4, 12, 20, 28, 28, 20, 12,  4,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+ 20, 60,100,140,140,100, 60, 20,
+ 28, 84,140,196,196,140, 84, 28,
+ 28, 84,140,196,196,140, 84, 28,
+ 20, 60,100,140,140,100, 60, 20,
+ 12, 36, 60, 84, 84, 60, 36, 12,
+  4, 12, 20, 28, 28, 20, 12,  4,
 //error:0.000000
 };
 
 //linear *64
 static const uint8_t obmc4[16]={
- 4,12,12, 4,
-12,36,36,12,
-12,36,36,12,
- 4,12,12, 4,
+ 16, 48, 48, 16,
+ 48,144,144, 48,
+ 48,144,144, 48,
+ 16, 48, 48, 16,
 //error:0.000000
 };
 
@@ -376,9 +366,12 @@ static const uint8_t *obmc_tab[4]={
     obmc32, obmc16, obmc8, obmc4
 };
 
+static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
+
 typedef struct BlockNode{
     int16_t mx;
     int16_t my;
+    uint8_t ref;
     uint8_t color[3];
     uint8_t type;
 //#define TYPE_SPLIT    1
@@ -392,6 +385,7 @@ static const BlockNode null_block= { //FIXME add border maybe
     .color= {128,128,128},
     .mx= 0,
     .my= 0,
+    .ref= 0,
     .type= 0,
     .level= 0,
 };
@@ -425,17 +419,6 @@ typedef struct Plane{
     SubBand band[MAX_DECOMPOSITIONS][4];
 }Plane;
 
-/** Used to minimize the amount of memory used in order to optimize cache performance. **/
-typedef struct {
-    DWTELEM * * line; ///< For use by idwt and predict_slices.
-    DWTELEM * * data_stack; ///< Used for internal purposes.
-    int data_stack_top;
-    int line_count;
-    int line_width;
-    int data_count;
-    DWTELEM * base_buffer; ///< Buffer that this structure is caching.
-} slice_buffer;
-
 typedef struct SnowContext{
 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
 
@@ -445,7 +428,7 @@ typedef struct SnowContext{
     AVFrame new_picture;
     AVFrame input_picture;              ///< new_picture with the internal linesizes
     AVFrame current_picture;
-    AVFrame last_picture;
+    AVFrame last_picture[MAX_REF_FRAMES];
     AVFrame mconly_picture;
 //     uint8_t q_context[16];
     uint8_t header_state[32];
@@ -457,6 +440,10 @@ typedef struct SnowContext{
     int temporal_decomposition_type;
     int spatial_decomposition_count;
     int temporal_decomposition_count;
+    int max_ref_frames;
+    int ref_frames;
+    int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
+    uint32_t *ref_scores[MAX_REF_FRAMES];
     DWTELEM *spatial_dwt_buffer;
     int colorspace_type;
     int chroma_h_shift;
@@ -465,6 +452,7 @@ typedef struct SnowContext{
     int qlog;
     int lambda;
     int lambda2;
+    int pass1_rc;
     int mv_scale;
     int qbias;
 #define QBIAS_SHIFT 3
@@ -573,12 +561,12 @@ static void slice_buffer_destroy(slice_buffer * buf)
     for (i = buf->data_count - 1; i >= 0; i--)
     {
         assert(buf->data_stack[i]);
-        av_free(buf->data_stack[i]);
+        av_freep(&buf->data_stack[i]);
     }
     assert(buf->data_stack);
-    av_free(buf->data_stack);
+    av_freep(&buf->data_stack);
     assert(buf->line);
-    av_free(buf->line);
+    av_freep(&buf->line);
 }
 
 #ifdef __sgi
@@ -741,6 +729,7 @@ static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst
     }
 }
 
+#ifndef lift5
 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
     const int mirror_left= !highpass;
     const int mirror_right= (width&1) ^ highpass;
@@ -770,7 +759,9 @@ static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int ds
         dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
     }
 }
+#endif
 
+#ifndef liftS
 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
     const int mirror_left= !highpass;
     const int mirror_right= (width&1) ^ highpass;
@@ -793,6 +784,7 @@ static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int ds
         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
     }
 }
+#endif
 
 
 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
@@ -1111,76 +1103,6 @@ STOP_TIMER("vertical_decompose53i*")}
     }
 }
 
-#define liftS lift
-#define lift5 lift
-#if 1
-#define W_AM 3
-#define W_AO 0
-#define W_AS 1
-
-#undef liftS
-#define W_BM 1
-#define W_BO 8
-#define W_BS 4
-
-#define W_CM 1
-#define W_CO 0
-#define W_CS 0
-
-#define W_DM 3
-#define W_DO 4
-#define W_DS 3
-#elif 0
-#define W_AM 55
-#define W_AO 16
-#define W_AS 5
-
-#define W_BM 3
-#define W_BO 32
-#define W_BS 6
-
-#define W_CM 127
-#define W_CO 64
-#define W_CS 7
-
-#define W_DM 7
-#define W_DO 8
-#define W_DS 4
-#elif 0
-#define W_AM 97
-#define W_AO 32
-#define W_AS 6
-
-#define W_BM 63
-#define W_BO 512
-#define W_BS 10
-
-#define W_CM 13
-#define W_CO 8
-#define W_CS 4
-
-#define W_DM 15
-#define W_DO 16
-#define W_DS 5
-
-#else
-
-#define W_AM 203
-#define W_AO 64
-#define W_AS 7
-
-#define W_BM 217
-#define W_BO 2048
-#define W_BS 12
-
-#define W_CM 113
-#define W_CO 64
-#define W_CS 7
-
-#define W_DM 227
-#define W_DO 128
-#define W_DS 9
-#endif
 static void horizontal_decompose97i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int w2= (width+1)>>1;
@@ -1275,9 +1197,9 @@ void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type
 
     for(level=0; level<decomposition_count; level++){
         switch(type){
-        case 0: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
-        case 1: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
-        case 2: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
+        case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
+        case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
+        case DWT_X: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
         }
     }
 }
@@ -1410,7 +1332,7 @@ static void spatial_compose53i(DWTELEM *buffer, int width, int height, int strid
 }
 
 
-static void horizontal_compose97i(DWTELEM *b, int width){
+void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int w2= (width+1)>>1;
 
@@ -1463,7 +1385,7 @@ static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int wid
     }
 }
 
-static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
+void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
     int i;
 
     for(i=0; i<width; i++){
@@ -1504,7 +1426,7 @@ static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int heig
     cs->y = -3;
 }
 
-static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
     int y = cs->y;
 
     DWTELEM *b0= cs->b0;
@@ -1516,7 +1438,7 @@ static void spatial_compose97i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb,
 
 {START_TIMER
     if(y>0 && y+4<height){
-        vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
+        dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
     }else{
         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
@@ -1527,8 +1449,8 @@ if(width>400){
 STOP_TIMER("vertical_compose97i")}}
 
 {START_TIMER
-        if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
-        if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
+        if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
+        if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
 if(width>400 && y+0<(unsigned)height){
 STOP_TIMER("horizontal_compose97i")}}
 
@@ -1557,8 +1479,8 @@ if(width>400){
 STOP_TIMER("vertical_compose97i")}}
 
 {START_TIMER
-        if(y-1<(unsigned)height) horizontal_compose97i(b0, width);
-        if(y+0<(unsigned)height) horizontal_compose97i(b1, width);
+        if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
+        if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
 if(width>400 && b0 <= b2){
 STOP_TIMER("horizontal_compose97i")}}
 
@@ -1580,10 +1502,10 @@ static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb,
     int level;
     for(level=decomposition_count-1; level>=0; level--){
         switch(type){
-        case 0: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
-        case 1: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
+        case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
+        case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
         /* not slicified yet */
-        case 2: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
+        case DWT_X: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
           av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
         }
     }
@@ -1593,10 +1515,10 @@ static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width,
     int level;
     for(level=decomposition_count-1; level>=0; level--){
         switch(type){
-        case 0: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
-        case 1: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
+        case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
+        case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
         /* not slicified yet */
-        case 2: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
+        case DWT_X: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
         }
     }
 }
@@ -1609,17 +1531,17 @@ static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width,
     for(level=decomposition_count-1; level>=0; level--){
         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
             switch(type){
-            case 0: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
+            case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
                     break;
-            case 1: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
+            case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
                     break;
-            case 2: break;
+            case DWT_X: break;
             }
         }
     }
 }
 
-static void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
+static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
     const int support = type==1 ? 3 : 5;
     int level;
     if(type==2) return;
@@ -1627,11 +1549,11 @@ static void ff_spatial_idwt_buffered_slice(dwt_compose_t *cs, slice_buffer * sli
     for(level=decomposition_count-1; level>=0; level--){
         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
             switch(type){
-            case 0: spatial_compose97i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
+            case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
                     break;
-            case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
+            case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
                     break;
-            case 2: break;
+            case DWT_X: break;
             }
         }
     }
@@ -1991,7 +1913,7 @@ static int pix_norm1(uint8_t * pix, int line_size, int w)
     return s;
 }
 
-static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int type){
+static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
     const int w= s->b_width << s->block_max_depth;
     const int rem_depth= s->block_max_depth - level;
     const int index= (x + y*w) << rem_depth;
@@ -2004,6 +1926,7 @@ static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, in
     block.color[2]= cr;
     block.mx= mx;
     block.my= my;
+    block.ref= ref;
     block.type= type;
     block.level= level;
 
@@ -2028,6 +1951,22 @@ static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3
     assert(!ref_index);
 }
 
+static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
+                           BlockNode *left, BlockNode *top, BlockNode *tr){
+    if(s->ref_frames == 1){
+        *mx = mid_pred(left->mx, top->mx, tr->mx);
+        *my = mid_pred(left->my, top->my, tr->my);
+    }else{
+        const int *scale = scale_mv_ref[ref];
+        *mx = mid_pred(left->mx * scale[left->ref] + 128 >>8,
+                       top ->mx * scale[top ->ref] + 128 >>8,
+                       tr  ->mx * scale[tr  ->ref] + 128 >>8);
+        *my = mid_pred(left->my * scale[left->ref] + 128 >>8,
+                       top ->my * scale[top ->ref] + 128 >>8,
+                       tr  ->my * scale[tr  ->ref] + 128 >>8);
+    }
+}
+
 //FIXME copy&paste
 #define P_LEFT P[1]
 #define P_TOP P[2]
@@ -2062,8 +2001,7 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
     int pl = left->color[0];
     int pcb= left->color[1];
     int pcr= left->color[2];
-    int pmx= mid_pred(left->mx, top->mx, tr->mx);
-    int pmy= mid_pred(left->my, top->my, tr->my);
+    int pmx, pmy;
     int mx=0, my=0;
     int l,cr,cb;
     const int stride= s->current_picture.linesize[0];
@@ -2076,13 +2014,15 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
     const int shift= 1+qpel;
     MotionEstContext *c= &s->m.me;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
     int mx_context= av_log2(2*ABS(left->mx - top->mx));
     int my_context= av_log2(2*ABS(left->my - top->my));
     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
+    int ref, best_ref, ref_score, ref_mx, ref_my;
 
     assert(sizeof(s->block_state) >= 256);
     if(s->keyframe){
-        set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
         return 0;
     }
 
@@ -2107,8 +2047,6 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
     s->m.mb_y= 0;
     s->m.me.skip= 0;
 
-    init_ref(c, current_data, s->last_picture.data, NULL, block_w*x, block_w*y, 0);
-
     assert(s->m.me.  stride ==   stride);
     assert(s->m.me.uvstride == uvstride);
 
@@ -2141,16 +2079,34 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
         c->pred_y = P_MEDIAN[1];
     }
 
-    score= ff_epzs_motion_search(&s->m, &mx, &my, P, 0, /*ref_index*/ 0, last_mv,
-                             (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
+    score= INT_MAX;
+    best_ref= 0;
+    for(ref=0; ref<s->ref_frames; ref++){
+        init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
+
+        ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
+                                         (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
 
-    assert(mx >= c->xmin);
-    assert(mx <= c->xmax);
-    assert(my >= c->ymin);
-    assert(my <= c->ymax);
+        assert(ref_mx >= c->xmin);
+        assert(ref_mx <= c->xmax);
+        assert(ref_my >= c->ymin);
+        assert(ref_my <= c->ymax);
 
-    score= s->m.me.sub_motion_search(&s->m, &mx, &my, score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
-    score= ff_get_mb_score(&s->m, mx, my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
+        ref_score= s->m.me.sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
+        ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
+        ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
+        if(s->ref_mvs[ref]){
+            s->ref_mvs[ref][index][0]= ref_mx;
+            s->ref_mvs[ref][index][1]= ref_my;
+            s->ref_scores[ref][index]= ref_score;
+        }
+        if(score > ref_score){
+            score= ref_score;
+            best_ref= ref;
+            mx= ref_mx;
+            my= ref_my;
+        }
+    }
     //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
 
   //  subpel search
@@ -2162,8 +2118,11 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
     if(level!=s->block_max_depth)
         put_rac(&pc, &p_state[4 + s_context], 1);
     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
-    put_symbol(&pc, &p_state[128 + 32*mx_context], mx - pmx, 1);
-    put_symbol(&pc, &p_state[128 + 32*my_context], my - pmy, 1);
+    if(s->ref_frames > 1)
+        put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
+    pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
+    put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
+    put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
     p_len= pc.bytestream - pc.bytestream_start;
     score += (s->lambda2*(p_len*8
               + (pc.outstanding_count - s->c.outstanding_count)*8
@@ -2227,11 +2186,12 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
     }
 
     if(iscore < score){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
         memcpy(pbbak, i_buffer, i_len);
         s->c= ic;
         s->c.bytestream_start= pbbak_start;
         s->c.bytestream= pbbak + i_len;
-        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, BLOCK_INTRA);
+        set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
         memcpy(s->block_state, i_state, sizeof(s->block_state));
         return iscore;
     }else{
@@ -2239,7 +2199,7 @@ static int encode_q_branch(SnowContext *s, int level, int x, int y){
         s->c= pc;
         s->c.bytestream_start= pbbak_start;
         s->c.bytestream= pbbak + p_len;
-        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, 0);
+        set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
         memcpy(s->block_state, p_state, sizeof(s->block_state));
         return score;
     }
@@ -2250,7 +2210,7 @@ static always_inline int same_block(BlockNode *a, BlockNode *b){
     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
     }else{
-        return !((a->mx - b->mx) | (a->my - b->my) | ((a->type ^ b->type)&BLOCK_INTRA));
+        return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
     }
 }
 
@@ -2267,14 +2227,14 @@ static void encode_q_branch2(SnowContext *s, int level, int x, int y){
     int pl = left->color[0];
     int pcb= left->color[1];
     int pcr= left->color[2];
-    int pmx= mid_pred(left->mx, top->mx, tr->mx);
-    int pmy= mid_pred(left->my, top->my, tr->my);
-    int mx_context= av_log2(2*ABS(left->mx - top->mx));
-    int my_context= av_log2(2*ABS(left->my - top->my));
+    int pmx, pmy;
+    int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
+    int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 16*!!b->ref;
+    int my_context= av_log2(2*ABS(left->my - top->my)) + 16*!!b->ref;
     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
 
     if(s->keyframe){
-        set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
+        set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
         return;
     }
 
@@ -2291,16 +2251,20 @@ static void encode_q_branch2(SnowContext *s, int level, int x, int y){
         }
     }
     if(b->type & BLOCK_INTRA){
+        pred_mv(s, &pmx, &pmy, 0, left, top, tr);
         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
-        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, BLOCK_INTRA);
+        set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
     }else{
+        pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
+        if(s->ref_frames > 1)
+            put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
-        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, 0);
+        set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
     }
 }
 
@@ -2316,7 +2280,7 @@ static void decode_q_branch(SnowContext *s, int level, int x, int y){
     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
 
     if(s->keyframe){
-        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, BLOCK_INTRA);
+        set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
         return;
     }
 
@@ -2327,20 +2291,26 @@ static void decode_q_branch(SnowContext *s, int level, int x, int y){
         int cr= left->color[2];
         int mx= mid_pred(left->mx, top->mx, tr->mx);
         int my= mid_pred(left->my, top->my, tr->my);
+        int ref = 0;
+        int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
         int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 0*av_log2(2*ABS(tr->mx - top->mx));
         int my_context= av_log2(2*ABS(left->my - top->my)) + 0*av_log2(2*ABS(tr->my - top->my));
 
         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
 
         if(type){
+            pred_mv(s, &mx, &my, 0, left, top, tr);
             l += get_symbol(&s->c, &s->block_state[32], 1);
             cb+= get_symbol(&s->c, &s->block_state[64], 1);
             cr+= get_symbol(&s->c, &s->block_state[96], 1);
         }else{
-            mx+= get_symbol(&s->c, &s->block_state[128 + 32*mx_context], 1);
-            my+= get_symbol(&s->c, &s->block_state[128 + 32*my_context], 1);
+            if(s->ref_frames > 1)
+                ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
+            pred_mv(s, &mx, &my, ref, left, top, tr);
+            mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
+            my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
         }
-        set_blocks(s, level, x, y, l, cb, cr, mx, my, type);
+        set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
     }else{
         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
@@ -2470,7 +2440,7 @@ mca( 8, 0,8)
 mca( 0, 8,8)
 mca( 8, 8,8)
 
-static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
+static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
     if(block->type & BLOCK_INTRA){
         int x, y;
         const int color = block->color[plane_index];
@@ -2510,6 +2480,7 @@ static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp,
             }
         }
     }else{
+        uint8_t *src= s->last_picture[block->ref].data[plane_index];
         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
         int mx= block->mx*scale;
         int my= block->my*scale;
@@ -2524,11 +2495,11 @@ static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp,
             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
             src= tmp + MB_SIZE;
         }
-        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
-        assert(!(b_w&(b_w-1)));
+//        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
+//        assert(!(b_w&(b_w-1)));
         assert(b_w>1 && b_h>1);
         assert(tab_index>=0 && tab_index<4 || b_w==32);
-        if((dx&3) || (dy&3))
+        if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
         else if(b_w==32){
             int y;
@@ -2549,8 +2520,42 @@ static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp,
     }
 }
 
+void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                              int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    int y, x;
+    DWTELEM * dst;
+    for(y=0; y<b_h; y++){
+        //FIXME ugly missue of obmc_stride
+        uint8_t *obmc1= obmc + y*obmc_stride;
+        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
+        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
+        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
+        dst = slice_buffer_get_line(sb, src_y + y);
+        for(x=0; x<b_w; x++){
+            int v=   obmc1[x] * block[3][x + y*src_stride]
+                    +obmc2[x] * block[2][x + y*src_stride]
+                    +obmc3[x] * block[1][x + y*src_stride]
+                    +obmc4[x] * block[0][x + y*src_stride];
+
+            v <<= 8 - LOG2_OBMC_MAX;
+            if(FRAC_BITS != 8){
+                v += 1<<(7 - FRAC_BITS);
+                v >>= 8 - FRAC_BITS;
+            }
+            if(add){
+                v += dst[x + src_x];
+                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
+                if(v&(~255)) v= ~(v>>31);
+                dst8[x + y*src_stride] = v;
+            }else{
+                dst[x + src_x] -= v;
+            }
+        }
+    }
+}
+
 //FIXME name clenup (b_w, block_w, b_width stuff)
-static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
+static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
     DWTELEM * dst = NULL;
     const int b_width = s->b_width  << s->block_max_depth;
     const int b_height= s->b_height << s->block_max_depth;
@@ -2605,14 +2610,14 @@ assert(src_stride > 2*MB_SIZE + 5);
     ptmp= tmp + 3*tmp_step;
     block[0]= ptmp;
     ptmp+=tmp_step;
-    pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
+    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
 
     if(same_block(lt, rt)){
         block[1]= block[0];
     }else{
         block[1]= ptmp;
         ptmp+=tmp_step;
-        pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
     }
 
     if(same_block(lt, lb)){
@@ -2622,7 +2627,7 @@ assert(src_stride > 2*MB_SIZE + 5);
     }else{
         block[2]= ptmp;
         ptmp+=tmp_step;
-        pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
     }
 
     if(same_block(lt, rb) ){
@@ -2633,7 +2638,7 @@ assert(src_stride > 2*MB_SIZE + 5);
         block[3]= block[2];
     }else{
         block[3]= ptmp;
-        pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
     }
 #if 0
     for(y=0; y<b_h; y++){
@@ -2673,43 +2678,14 @@ assert(src_stride > 2*MB_SIZE + 5);
 
     START_TIMER
 
-    for(y=0; y<b_h; y++){
-        //FIXME ugly missue of obmc_stride
-        uint8_t *obmc1= obmc + y*obmc_stride;
-        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
-        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
-        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
-        dst = slice_buffer_get_line(sb, src_y + y);
-        for(x=0; x<b_w; x++){
-            int v=   obmc1[x] * block[3][x + y*src_stride]
-                    +obmc2[x] * block[2][x + y*src_stride]
-                    +obmc3[x] * block[1][x + y*src_stride]
-                    +obmc4[x] * block[0][x + y*src_stride];
-
-            v <<= 8 - LOG2_OBMC_MAX;
-            if(FRAC_BITS != 8){
-                v += 1<<(7 - FRAC_BITS);
-                v >>= 8 - FRAC_BITS;
-            }
-            if(add){
-//                v += old_dst[x + y*dst_stride];
-                v += dst[x + src_x];
-                v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
-                if(v&(~255)) v= ~(v>>31);
-                dst8[x + y*src_stride] = v;
-            }else{
-//                old_dst[x + y*dst_stride] -= v;
-                dst[x + src_x] -= v;
-            }
-        }
-    }
+    s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
         STOP_TIMER("Inner add y block")
 }
 #endif
 }
 
 //FIXME name clenup (b_w, block_w, b_width stuff)
-static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
+static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
     const int b_width = s->b_width  << s->block_max_depth;
     const int b_height= s->b_height << s->block_max_depth;
     const int b_stride= b_width;
@@ -2768,14 +2744,14 @@ assert(src_stride > 2*MB_SIZE + 5);
     ptmp= tmp + 3*tmp_step;
     block[0]= ptmp;
     ptmp+=tmp_step;
-    pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
+    pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
 
     if(same_block(lt, rt)){
         block[1]= block[0];
     }else{
         block[1]= ptmp;
         ptmp+=tmp_step;
-        pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
+        pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
     }
 
     if(same_block(lt, lb)){
@@ -2785,7 +2761,7 @@ assert(src_stride > 2*MB_SIZE + 5);
     }else{
         block[2]= ptmp;
         ptmp+=tmp_step;
-        pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
+        pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
     }
 
     if(same_block(lt, rb) ){
@@ -2796,7 +2772,7 @@ assert(src_stride > 2*MB_SIZE + 5);
         block[3]= block[2];
     }else{
         block[3]= ptmp;
-        pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
+        pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
     }
 #if 0
     for(y=0; y<b_h; y++){
@@ -2872,7 +2848,6 @@ static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer *
     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
     int obmc_stride= plane_index ? block_size : 2*block_size;
     int ref_stride= s->current_picture.linesize[plane_index];
-    uint8_t *ref  = s->last_picture.data[plane_index];
     uint8_t *dst8= s->current_picture.data[plane_index];
     int w= p->width;
     int h= p->height;
@@ -2915,7 +2890,7 @@ static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer *
         for(mb_x=0; mb_x<=mb_w; mb_x++){
             START_TIMER
 
-            add_yblock_buffered(s, sb, old_buffer, dst8, ref, obmc,
+            add_yblock_buffered(s, sb, old_buffer, dst8, obmc,
                        block_w*mb_x - block_w/2,
                        block_w*mb_y - block_w/2,
                        block_w, block_w,
@@ -2940,7 +2915,6 @@ static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_
     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
     const int obmc_stride= plane_index ? block_size : 2*block_size;
     int ref_stride= s->current_picture.linesize[plane_index];
-    uint8_t *ref  = s->last_picture.data[plane_index];
     uint8_t *dst8= s->current_picture.data[plane_index];
     int w= p->width;
     int h= p->height;
@@ -2973,7 +2947,7 @@ static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_
         for(mb_x=0; mb_x<=mb_w; mb_x++){
             START_TIMER
 
-            add_yblock(s, buf, dst8, ref, obmc,
+            add_yblock(s, buf, dst8, obmc,
                        block_w*mb_x - block_w/2,
                        block_w*mb_y - block_w/2,
                        block_w, block_w,
@@ -3003,7 +2977,6 @@ static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
     const int obmc_stride= plane_index ? block_size : 2*block_size;
     const int ref_stride= s->current_picture.linesize[plane_index];
-    uint8_t *ref= s->   last_picture.data[plane_index];
     uint8_t *src= s-> input_picture.data[plane_index];
     DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
     const int b_stride = s->b_width << s->block_max_depth;
@@ -3025,7 +2998,7 @@ static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
         int x= block_w*mb_x2 + block_w/2;
         int y= block_w*mb_y2 + block_w/2;
 
-        add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, ref, obmc,
+        add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
 
         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
@@ -3048,7 +3021,7 @@ static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
     }
     *b= backup;
 
-    return clip(((ab<<6) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
+    return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
 }
 
 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
@@ -3066,8 +3039,6 @@ static inline int get_block_bits(SnowContext *s, int x, int y, int w){
 
     if(x<0 || x>=b_stride || y>=b_height)
         return 0;
-    dmx= b->mx - mid_pred(left->mx, top->mx, tr->mx);
-    dmy= b->my - mid_pred(left->my, top->my, tr->my);
 /*
 1            0      0
 01X          1-2    1
@@ -3081,9 +3052,14 @@ static inline int get_block_bits(SnowContext *s, int x, int y, int w){
         return 3+2*( av_log2(2*ABS(left->color[0] - b->color[0]))
                    + av_log2(2*ABS(left->color[1] - b->color[1]))
                    + av_log2(2*ABS(left->color[2] - b->color[2])));
-    }else
-        return 2*(1 + av_log2(2*ABS(dmx))
-                    + av_log2(2*ABS(dmy))); //FIXME kill the 2* can be merged in lambda
+    }else{
+        pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
+        dmx-= b->mx;
+        dmy-= b->my;
+        return 2*(1 + av_log2(2*ABS(dmx)) //FIXME kill the 2* can be merged in lambda
+                    + av_log2(2*ABS(dmy))
+                    + av_log2(2*b->ref));
+    }
 }
 
 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
@@ -3093,7 +3069,6 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, con
     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
     const int obmc_stride= plane_index ? block_size : 2*block_size;
     const int ref_stride= s->current_picture.linesize[plane_index];
-    uint8_t *ref= s->   last_picture.data[plane_index];
     uint8_t *dst= s->current_picture.data[plane_index];
     uint8_t *src= s->  input_picture.data[plane_index];
     DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
@@ -3108,13 +3083,13 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, con
     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
     int sx= block_w*mb_x - block_w/2;
     int sy= block_w*mb_y - block_w/2;
-    const int x0= FFMAX(0,-sx);
-    const int y0= FFMAX(0,-sy);
-    const int x1= FFMIN(block_w*2, w-sx);
-    const int y1= FFMIN(block_w*2, h-sy);
+    int x0= FFMAX(0,-sx);
+    int y0= FFMAX(0,-sy);
+    int x1= FFMIN(block_w*2, w-sx);
+    int y1= FFMIN(block_w*2, h-sy);
     int i,x,y;
 
-    pred_block(s, cur, ref, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
+    pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
 
     for(y=y0; y<y1; y++){
         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
@@ -3129,12 +3104,39 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, con
         }
     }
 
-    //FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
+    /* copy the regions where obmc[] = (uint8_t)256 */
+    if(LOG2_OBMC_MAX == 8
+        && (mb_x == 0 || mb_x == b_stride-1)
+        && (mb_y == 0 || mb_y == b_height-1)){
+        if(mb_x == 0)
+            x1 = block_w;
+        else
+            x0 = block_w;
+        if(mb_y == 0)
+            y1 = block_w;
+        else
+            y0 = block_w;
+        for(y=y0; y<y1; y++)
+            memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
+    }
+
     if(block_w==16){
-        distortion = 0;
-        for(i=0; i<4; i++){
-            int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
-            distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
+        /* FIXME rearrange dsputil to fit 32x32 cmp functions */
+        /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
+        /* FIXME cmps overlap but don't cover the wavelet's whole support,
+         * so improving the score of one block is not strictly guaranteed to
+         * improve the score of the whole frame, so iterative motion est
+         * doesn't always converge. */
+        if(s->avctx->me_cmp == FF_CMP_W97)
+            distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else if(s->avctx->me_cmp == FF_CMP_W53)
+            distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
+        else{
+            distortion = 0;
+            for(i=0; i<4; i++){
+                int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
+                distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
+            }
         }
     }else{
         assert(block_w==8);
@@ -3163,7 +3165,6 @@ static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
     const int obmc_stride= plane_index ? block_size : 2*block_size;
     const int ref_stride= s->current_picture.linesize[plane_index];
-    uint8_t *ref= s->   last_picture.data[plane_index];
     uint8_t *dst= s->current_picture.data[plane_index];
     uint8_t *src= s-> input_picture.data[plane_index];
     const static DWTELEM zero_dst[4096]; //FIXME
@@ -3181,7 +3182,7 @@ static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
         int x= block_w*mb_x2 + block_w/2;
         int y= block_w*mb_y2 + block_w/2;
 
-        add_yblock(s, zero_dst, dst, ref, obmc,
+        add_yblock(s, zero_dst, dst, obmc,
                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
 
         //FIXME find a cleaner/simpler way to skip the outside stuff
@@ -3237,7 +3238,7 @@ static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3
         block->type |= BLOCK_INTRA;
     }else{
         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
-        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6);
+        value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
         if(s->me_cache[index] == value)
             return 0;
         s->me_cache[index]= value;
@@ -3260,12 +3261,12 @@ static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3
 }
 
 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
-static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int intra, const uint8_t *obmc_edged, int *best_rd){
+static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
     int p[2] = {p0, p1};
-    return check_block(s, mb_x, mb_y, p, intra, obmc_edged, best_rd);
+    return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
 }
 
-static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int *best_rd){
+static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
     const int b_stride= s->b_width << s->block_max_depth;
     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
@@ -3276,13 +3277,14 @@ static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y,
     assert(((mb_x|mb_y)&1) == 0);
 
     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
-    value= s->me_cache_generation + (p0>>10) + (p1<<6);
+    value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
     if(s->me_cache[index] == value)
         return 0;
     s->me_cache[index]= value;
 
     block->mx= p0;
     block->my= p1;
+    block->ref= ref;
     block->type &= ~BLOCK_INTRA;
     block[1]= block[b_stride]= block[b_stride+1]= *block;
 
@@ -3308,24 +3310,35 @@ static void iterative_me(SnowContext *s){
     const int b_stride= b_width;
     int color[3];
 
-    for(pass=0; pass<50; pass++){
+    {
+        RangeCoder r = s->c;
+        uint8_t state[sizeof(s->block_state)];
+        memcpy(state, s->block_state, sizeof(s->block_state));
+        for(mb_y= 0; mb_y<s->b_height; mb_y++)
+            for(mb_x= 0; mb_x<s->b_width; mb_x++)
+                encode_q_branch(s, 0, mb_x, mb_y);
+        s->c = r;
+        memcpy(s->block_state, state, sizeof(s->block_state));
+    }
+
+    for(pass=0; pass<25; pass++){
         int change= 0;
 
         for(mb_y= 0; mb_y<b_height; mb_y++){
             for(mb_x= 0; mb_x<b_width; mb_x++){
-                int dia_change, i, j;
-                int best_rd= INT_MAX;
-                BlockNode backup;
+                int dia_change, i, j, ref;
+                int best_rd= INT_MAX, ref_rd;
+                BlockNode backup, ref_b;
                 const int index= mb_x + mb_y * b_stride;
                 BlockNode *block= &s->block[index];
-                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : &null_block;
-                BlockNode *lb = mb_x                              ? &s->block[index         -1] : &null_block;
-                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : &null_block;
-                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : &null_block;
-                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : &null_block;
-                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : &null_block;
-                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : &null_block;
-                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : &null_block;
+                BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
+                BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
+                BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
+                BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
+                BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
+                BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
+                BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
+                BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
                 const int b_w= (MB_SIZE >> s->block_max_depth);
                 uint8_t obmc_edged[b_w*2][b_w*2];
 
@@ -3399,48 +3412,72 @@ static void iterative_me(SnowContext *s){
                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
                 }else
-                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, 0, *obmc_edged, &best_rd);
-
-                check_block_inter(s, mb_x, mb_y, 0, 0, 0, *obmc_edged, &best_rd);
-                check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, 0, *obmc_edged, &best_rd);
-                check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, 0, *obmc_edged, &best_rd);
-                check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, 0, *obmc_edged, &best_rd);
-                check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, 0, *obmc_edged, &best_rd);
-
-                /* fullpel ME */
-                //FIXME avoid subpel interpol / round to nearest integer
-                do{
-                    dia_change=0;
-                    for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
-                        for(j=0; j<i; j++){
-                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd);
-                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd);
-                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), 0, *obmc_edged, &best_rd);
-                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), 0, *obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
+
+                ref_b= *block;
+                ref_rd= best_rd;
+                for(ref=0; ref < s->ref_frames; ref++){
+                    int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
+                    if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
+                        continue;
+                    block->ref= ref;
+                    best_rd= INT_MAX;
+
+                    check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
+                    check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
+                    if(tb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
+                    if(lb)
+                        check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
+                    if(rb)
+                        check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
+                    if(bb)
+                        check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
+
+                    /* fullpel ME */
+                    //FIXME avoid subpel interpol / round to nearest integer
+                    do{
+                        dia_change=0;
+                        for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
+                            for(j=0; j<i; j++){
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
+                                dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
+                            }
                         }
+                    }while(dia_change);
+                    /* subpel ME */
+                    do{
+                        static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
+                        dia_change=0;
+                        for(i=0; i<8; i++)
+                            dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
+                    }while(dia_change);
+                    //FIXME or try the standard 2 pass qpel or similar
+
+                    mvr[0][0]= block->mx;
+                    mvr[0][1]= block->my;
+                    if(ref_rd > best_rd){
+                        ref_rd= best_rd;
+                        ref_b= *block;
                     }
-                }while(dia_change);
-                /* subpel ME */
-                do{
-                    static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
-                    dia_change=0;
-                    for(i=0; i<8; i++)
-                        dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], 0, *obmc_edged, &best_rd);
-                }while(dia_change);
-                //FIXME or try the standard 2 pass qpel or similar
+                }
+                best_rd= ref_rd;
+                *block= ref_b;
 #if 1
                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
                 //FIXME RD style color selection
 #endif
                 if(!same_block(block, &backup)){
-                    if(tb != &null_block) tb ->type &= ~BLOCK_OPT;
-                    if(lb != &null_block) lb ->type &= ~BLOCK_OPT;
-                    if(rb != &null_block) rb ->type &= ~BLOCK_OPT;
-                    if(bb != &null_block) bb ->type &= ~BLOCK_OPT;
-                    if(tlb!= &null_block) tlb->type &= ~BLOCK_OPT;
-                    if(trb!= &null_block) trb->type &= ~BLOCK_OPT;
-                    if(blb!= &null_block) blb->type &= ~BLOCK_OPT;
-                    if(brb!= &null_block) brb->type &= ~BLOCK_OPT;
+                    if(tb ) tb ->type &= ~BLOCK_OPT;
+                    if(lb ) lb ->type &= ~BLOCK_OPT;
+                    if(rb ) rb ->type &= ~BLOCK_OPT;
+                    if(bb ) bb ->type &= ~BLOCK_OPT;
+                    if(tlb) tlb->type &= ~BLOCK_OPT;
+                    if(trb) trb->type &= ~BLOCK_OPT;
+                    if(blb) blb->type &= ~BLOCK_OPT;
+                    if(brb) brb->type &= ~BLOCK_OPT;
                     change ++;
                 }
             }
@@ -3454,7 +3491,7 @@ static void iterative_me(SnowContext *s){
         int change= 0;
         for(mb_y= 0; mb_y<b_height; mb_y+=2){
             for(mb_x= 0; mb_x<b_width; mb_x+=2){
-                int dia_change, i, j;
+                int i;
                 int best_rd, init_rd;
                 const int index= mb_x + mb_y * b_stride;
                 BlockNode *b[4];
@@ -3474,13 +3511,14 @@ static void iterative_me(SnowContext *s){
 
                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
 
+                //FIXME more multiref search?
                 check_4block_inter(s, mb_x, mb_y,
                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
-                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, &best_rd);
+                                   (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
 
                 for(i=0; i<4; i++)
                     if(!(b[i]->type&BLOCK_INTRA))
-                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, &best_rd);
+                        check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
 
                 if(init_rd != best_rd)
                     change++;
@@ -3706,6 +3744,7 @@ static void encode_header(SnowContext *s){
         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
         put_rac(&s->c, s->header_state, s->spatial_scalability);
 //        put_rac(&s->c, s->header_state, s->rate_scalability);
+        put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
 
         for(plane_index=0; plane_index<2; plane_index++){
             for(level=0; level<s->spatial_decomposition_count; level++){
@@ -3747,6 +3786,7 @@ static int decode_header(SnowContext *s){
         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
         s->spatial_scalability= get_rac(&s->c, s->header_state);
 //        s->rate_scalability= get_rac(&s->c, s->header_state);
+        s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
 
         for(plane_index=0; plane_index<3; plane_index++){
             for(level=0; level<s->spatial_decomposition_count; level++){
@@ -3771,7 +3811,7 @@ static int decode_header(SnowContext *s){
     s->mv_scale= get_symbol(&s->c, s->header_state, 0);
     s->qbias= get_symbol(&s->c, s->header_state, 1);
     s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
-    if(s->block_max_depth > 1){
+    if(s->block_max_depth > 1 || s->block_max_depth < 0){
         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
         s->block_max_depth= 0;
         return -1;
@@ -3794,6 +3834,7 @@ static int common_init(AVCodecContext *avctx){
     SnowContext *s = avctx->priv_data;
     int width, height;
     int level, orientation, plane_index, dec;
+    int i, j;
 
     s->avctx= avctx;
 
@@ -3899,6 +3940,10 @@ static int common_init(AVCodecContext *avctx){
         }
     }
 
+    for(i=0; i<MAX_REF_FRAMES; i++)
+        for(j=0; j<MAX_REF_FRAMES; j++)
+            scale_mv_ref[i][j] = 256*(i+1)/(j+1);
+
     reset_contexts(s);
 /*
     width= s->width= avctx->width;
@@ -3911,6 +3956,56 @@ static int common_init(AVCodecContext *avctx){
     return 0;
 }
 
+static int qscale2qlog(int qscale){
+    return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
+           + 61*QROOT/8; //<64 >60
+}
+
+static void ratecontrol_1pass(SnowContext *s, AVFrame *pict)
+{
+    /* estimate the frame's complexity as a sum of weighted dwt coefs.
+     * FIXME we know exact mv bits at this point,
+     * but ratecontrol isn't set up to include them. */
+    uint32_t coef_sum= 0;
+    int level, orientation;
+
+    for(level=0; level<s->spatial_decomposition_count; level++){
+        for(orientation=level ? 1 : 0; orientation<4; orientation++){
+            SubBand *b= &s->plane[0].band[level][orientation];
+            DWTELEM *buf= b->buf;
+            const int w= b->width;
+            const int h= b->height;
+            const int stride= b->stride;
+            const int qlog= clip(2*QROOT + b->qlog, 0, QROOT*16);
+            const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
+            const int qdiv= (1<<16)/qmul;
+            int x, y;
+            if(orientation==0)
+                decorrelate(s, b, buf, stride, 1, 0);
+            for(y=0; y<h; y++)
+                for(x=0; x<w; x++)
+                    coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
+            if(orientation==0)
+                correlate(s, b, buf, stride, 1, 0);
+        }
+    }
+
+    /* ugly, ratecontrol just takes a sqrt again */
+    coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
+    assert(coef_sum < INT_MAX);
+
+    if(pict->pict_type == I_TYPE){
+        s->m.current_picture.mb_var_sum= coef_sum;
+        s->m.current_picture.mc_mb_var_sum= 0;
+    }else{
+        s->m.current_picture.mc_mb_var_sum= coef_sum;
+        s->m.current_picture.mb_var_sum= 0;
+    }
+
+    pict->quality= ff_rate_estimate_qscale(&s->m, 1);
+    s->lambda= pict->quality * 3/2;
+    s->qlog= qscale2qlog(pict->quality);
+}
 
 static void calculate_vissual_weight(SnowContext *s, Plane *p){
     int width = p->width;
@@ -3951,6 +4046,13 @@ static int encode_init(AVCodecContext *avctx)
         return -1;
     }
 
+    if(avctx->prediction_method == DWT_97
+       && (avctx->flags & CODEC_FLAG_QSCALE)
+       && avctx->global_quality == 0){
+        av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
+        return -1;
+    }
+
     common_init(avctx);
     alloc_blocks(s);
 
@@ -3966,14 +4068,17 @@ static int encode_init(AVCodecContext *avctx)
     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
     h263_encode_init(&s->m); //mv_penalty
 
+    s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
+
     if(avctx->flags&CODEC_FLAG_PASS1){
         if(!avctx->stats_out)
             avctx->stats_out = av_mallocz(256);
     }
-    if(avctx->flags&CODEC_FLAG_PASS2){
+    if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
         if(ff_rate_control_init(&s->m) < 0)
             return -1;
     }
+    s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
 
     for(plane_index=0; plane_index<3; plane_index++){
         calculate_vissual_weight(s, &s->plane[plane_index]);
@@ -4006,6 +4111,15 @@ static int encode_init(AVCodecContext *avctx)
 
     s->avctx->get_buffer(s->avctx, &s->input_picture);
 
+    if(s->avctx->me_method == ME_ITER){
+        int i;
+        int size= s->b_width * s->b_height << 2*s->block_max_depth;
+        for(i=0; i<s->max_ref_frames; i++){
+            s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
+            s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
+        }
+    }
+
     return 0;
 }
 #endif
@@ -4021,16 +4135,29 @@ static int frame_start(SnowContext *s){
         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
     }
 
-    tmp= s->last_picture;
-    s->last_picture= s->current_picture;
+    tmp= s->last_picture[s->max_ref_frames-1];
+    memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
+    s->last_picture[0]= s->current_picture;
     s->current_picture= tmp;
 
+    if(s->keyframe){
+        s->ref_frames= 0;
+    }else{
+        int i;
+        for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
+            if(i && s->last_picture[i-1].key_frame)
+                break;
+        s->ref_frames= i;
+    }
+
     s->current_picture.reference= 1;
     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         return -1;
     }
 
+    s->current_picture.key_frame= s->keyframe;
+
     return 0;
 }
 
@@ -4055,27 +4182,31 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     }
     s->new_picture = *pict;
 
+    s->m.picture_number= avctx->frame_number;
     if(avctx->flags&CODEC_FLAG_PASS2){
         s->m.pict_type =
         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
         s->keyframe= pict->pict_type==FF_I_TYPE;
-        s->m.picture_number= avctx->frame_number;
-        pict->quality= ff_rate_estimate_qscale(&s->m, 0);
+        if(!(avctx->flags&CODEC_FLAG_QSCALE))
+            pict->quality= ff_rate_estimate_qscale(&s->m, 0);
     }else{
         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
+        s->m.pict_type=
         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
     }
 
+    if(s->pass1_rc && avctx->frame_number == 0)
+        pict->quality= 2*FF_QP2LAMBDA;
     if(pict->quality){
-        s->qlog= rint(QROOT*log(pict->quality / (float)FF_QP2LAMBDA)/log(2));
-        //<64 >60
-        s->qlog += 61*QROOT/8;
-    }else{
-        s->qlog= LOSSLESS_QLOG;
+        s->qlog= qscale2qlog(pict->quality);
+        s->lambda = pict->quality * 3/2;
     }
+    if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
+        s->qlog= LOSSLESS_QLOG;
+        s->lambda = 0;
+    }//else keep previous frame's qlog until after motion est
 
     frame_start(s);
-    s->current_picture.key_frame= s->keyframe;
 
     s->m.current_picture_ptr= &s->m.current_picture;
     if(pict->pict_type == P_TYPE){
@@ -4084,11 +4215,11 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         int stride= s->current_picture.linesize[0];
 
         assert(s->current_picture.data[0]);
-        assert(s->last_picture.data[0]);
+        assert(s->last_picture[0].data[0]);
 
         s->m.avctx= s->avctx;
         s->m.current_picture.data[0]= s->current_picture.data[0];
-        s->m.   last_picture.data[0]= s->   last_picture.data[0];
+        s->m.   last_picture.data[0]= s->last_picture[0].data[0];
         s->m.    new_picture.data[0]= s->  input_picture.data[0];
         s->m.   last_picture_ptr= &s->m.   last_picture;
         s->m.linesize=
@@ -4111,7 +4242,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         s->m.out_format= FMT_H263;
         s->m.unrestricted_mv= 1;
 
-        s->lambda = s->m.lambda= pict->quality * 3/2; //FIXME bug somewhere else
+        s->m.lambda = s->lambda;
         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
 
@@ -4122,6 +4253,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 
 redo_frame:
 
+    s->m.pict_type = pict->pict_type;
     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
 
     encode_header(s);
@@ -4136,6 +4268,7 @@ redo_frame:
         int x, y;
 //        int bits= put_bits_count(&s->c.pb);
 
+    if(!(avctx->flags2 & CODEC_FLAG2_MEMC_ONLY)){
         //FIXME optimize
      if(pict->data[plane_index]) //FIXME gray hack
         for(y=0; y<h; y++){
@@ -4147,11 +4280,13 @@ redo_frame:
 
         if(   plane_index==0
            && pict->pict_type == P_TYPE
+           && !(avctx->flags&CODEC_FLAG_PASS2)
            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
             ff_init_range_encoder(c, buf, buf_size);
             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
             pict->pict_type= FF_I_TYPE;
             s->keyframe=1;
+            s->current_picture.key_frame=1;
             reset_contexts(s);
             goto redo_frame;
         }
@@ -4166,6 +4301,9 @@ redo_frame:
 
         ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
 
+        if(s->pass1_rc && plane_index==0)
+            ratecontrol_1pass(s, pict);
+
         for(level=0; level<s->spatial_decomposition_count; level++){
             for(orientation=level ? 1 : 0; orientation<4; orientation++){
                 SubBand *b= &p->band[level][orientation];
@@ -4200,6 +4338,20 @@ redo_frame:
 {START_TIMER
         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
 STOP_TIMER("pred-conv")}
+      }else{
+            //ME/MC only
+            if(pict->pict_type == I_TYPE){
+                for(y=0; y<h; y++){
+                    for(x=0; x<w; x++){
+                        s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
+                            pict->data[plane_index][y*pict->linesize[plane_index] + x];
+                    }
+                }
+            }else{
+                memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
+                predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
+            }
+      }
         if(s->avctx->flags&CODEC_FLAG_PSNR){
             int64_t error= 0;
 
@@ -4215,23 +4367,23 @@ STOP_TIMER("pred-conv")}
         }
     }
 
-    if(s->last_picture.data[0])
-        avctx->release_buffer(avctx, &s->last_picture);
+    if(s->last_picture[s->max_ref_frames-1].data[0])
+        avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
 
     s->current_picture.coded_picture_number = avctx->frame_number;
     s->current_picture.pict_type = pict->pict_type;
     s->current_picture.quality = pict->quality;
-    if(avctx->flags&CODEC_FLAG_PASS1){
-        s->m.p_tex_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits - s->m.mv_bits;
-        s->m.current_picture.display_picture_number =
-        s->m.current_picture.coded_picture_number = avctx->frame_number;
-        s->m.pict_type = pict->pict_type;
-        s->m.current_picture.quality = pict->quality;
+    s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
+    s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
+    s->m.current_picture.display_picture_number =
+    s->m.current_picture.coded_picture_number = avctx->frame_number;
+    s->m.current_picture.quality = pict->quality;
+    s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
+    if(s->pass1_rc)
+        ff_rate_estimate_qscale(&s->m, 0);
+    if(avctx->flags&CODEC_FLAG_PASS1)
         ff_write_pass1_stats(&s->m);
-    }
-    if(avctx->flags&CODEC_FLAG_PASS2){
-        s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
-    }
+    s->m.last_pict_type = s->m.pict_type;
 
     emms_c();
 
@@ -4240,7 +4392,7 @@ STOP_TIMER("pred-conv")}
 #endif
 
 static void common_end(SnowContext *s){
-    int plane_index, level, orientation;
+    int plane_index, level, orientation, i;
 
     av_freep(&s->spatial_dwt_buffer);
 
@@ -4251,6 +4403,13 @@ static void common_end(SnowContext *s){
 
     av_freep(&s->block);
 
+    for(i=0; i<MAX_REF_FRAMES; i++){
+        av_freep(&s->ref_mvs[i]);
+        av_freep(&s->ref_scores[i]);
+        if(s->last_picture[i].data[0])
+            s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
+    }
+
     for(plane_index=0; plane_index<3; plane_index++){
         for(level=s->spatial_decomposition_count-1; level>=0; level--){
             for(orientation=level ? 1 : 0; orientation<4; orientation++){
@@ -4397,7 +4556,7 @@ if(s->avctx->debug&2048){
 
 {   START_TIMER
         for(; yd<slice_h; yd+=4){
-            ff_spatial_idwt_buffered_slice(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
+            ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
         }
     STOP_TIMER("idwt slice");}
 
@@ -4426,8 +4585,8 @@ STOP_TIMER("idwt + predict_slices")}
 
     emms_c();
 
-    if(s->last_picture.data[0])
-        avctx->release_buffer(avctx, &s->last_picture);
+    if(s->last_picture[s->max_ref_frames-1].data[0])
+        avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
 
 if(!(s->avctx->debug&2048))
     *picture= s->current_picture;
diff --git a/src/libffmpeg/libavcodec/svq1_cb.h b/src/libffmpeg/libavcodec/svq1_cb.h
index 5c98c8047..ef097457e 100644
--- a/src/libffmpeg/libavcodec/svq1_cb.h
+++ b/src/libffmpeg/libavcodec/svq1_cb.h
@@ -764,9 +764,10 @@ static const int8_t svq1_inter_codebook_8x8[6144] = {
 };
 
 /* list of codebooks for inter-coded vectors */
-static const int8_t* const svq1_inter_codebooks[4] = {
+static const int8_t* const svq1_inter_codebooks[6] = {
     svq1_inter_codebook_4x2, svq1_inter_codebook_4x4,
-    svq1_inter_codebook_8x4, svq1_inter_codebook_8x8
+    svq1_inter_codebook_8x4, svq1_inter_codebook_8x8,
+    NULL, NULL,
 };
 
 static const int8_t svq1_inter_codebook_sum[4][16*6] = {
@@ -1538,9 +1539,10 @@ static const int8_t svq1_intra_codebook_8x8[6144] = {
 };
 
 /* list of codebooks for intra-coded vectors */
-static const int8_t* const svq1_intra_codebooks[4] = {
+static const int8_t* const svq1_intra_codebooks[6] = {
     svq1_intra_codebook_4x2, svq1_intra_codebook_4x4,
-    svq1_intra_codebook_8x4, svq1_intra_codebook_8x8
+    svq1_intra_codebook_8x4, svq1_intra_codebook_8x8,
+    NULL, NULL,
 };
 
 static const int8_t svq1_intra_codebook_sum[4][16*6] = {
diff --git a/src/libffmpeg/libavcodec/truemotion1.c b/src/libffmpeg/libavcodec/truemotion1.c
index 728dbcdb7..d2c9efbf8 100644
--- a/src/libffmpeg/libavcodec/truemotion1.c
+++ b/src/libffmpeg/libavcodec/truemotion1.c
@@ -322,7 +322,7 @@ static void gen_vector_table24(TrueMotion1Context *s, const uint8_t *sel_vector_
 static int truemotion1_decode_header(TrueMotion1Context *s)
 {
     int i;
-    struct frame_header header = {0};
+    struct frame_header header;
     uint8_t header_buffer[128];  /* logical maximum size of the header */
     const uint8_t *sel_vector_table;
 
diff --git a/src/libffmpeg/libavcodec/tscc.c b/src/libffmpeg/libavcodec/tscc.c
index 8bc53bf89..19edf3b2e 100644
--- a/src/libffmpeg/libavcodec/tscc.c
+++ b/src/libffmpeg/libavcodec/tscc.c
@@ -77,6 +77,8 @@ static int decode_rle(CamtasiaContext *c, unsigned int srcsize)
     unsigned char *src = c->decomp_buf;
     unsigned char *output, *output_end;
     int p1, p2, line=c->height, pos=0, i;
+    uint16_t pix16;
+    uint32_t pix32;
 
     output = c->pic.data[0] + (c->height - 1) * c->pic.linesize[0];
     output_end = c->pic.data[0] + (c->height) * c->pic.linesize[0];
@@ -107,12 +109,28 @@ static int decode_rle(CamtasiaContext *c, unsigned int srcsize)
                 src += p2 * (c->bpp / 8);
                 continue;
             }
-            for(i = 0; i < p2 * (c->bpp / 8); i++) {
-                *output++ = *src++;
-            }
-            // RLE8 copy is actually padded - and runs are not!
-            if(c->bpp == 8 && (p2 & 1)) {
-                src++;
+            if ((c->bpp == 8) || (c->bpp == 24)) {
+                for(i = 0; i < p2 * (c->bpp / 8); i++) {
+                    *output++ = *src++;
+                }
+                // RLE8 copy is actually padded - and runs are not!
+                if(c->bpp == 8 && (p2 & 1)) {
+                    src++;
+                }
+            } else if (c->bpp == 16) {
+                for(i = 0; i < p2; i++) {
+                    pix16 = LE_16(src);
+                    src += 2;
+                    *(uint16_t*)output = pix16;
+                    output += 2;
+                }
+            } else if (c->bpp == 32) {
+                for(i = 0; i < p2; i++) {
+                    pix32 = LE_32(src);
+                    src += 4;
+                    *(uint32_t*)output = pix32;
+                    output += 4;
+                }
             }
             pos += p2;
         } else { //Run of pixels
@@ -120,17 +138,17 @@ static int decode_rle(CamtasiaContext *c, unsigned int srcsize)
             switch(c->bpp){
             case  8: pix[0] = *src++;
                      break;
-            case 16: pix[0] = *src++;
-                     pix[1] = *src++;
+            case 16: pix16 = LE_16(src);
+                     src += 2;
+                     *(uint16_t*)pix = pix16;
                      break;
             case 24: pix[0] = *src++;
                      pix[1] = *src++;
                      pix[2] = *src++;
                      break;
-            case 32: pix[0] = *src++;
-                     pix[1] = *src++;
-                     pix[2] = *src++;
-                     pix[3] = *src++;
+            case 32: pix32 = LE_32(src);
+                     src += 4;
+                     *(uint32_t*)pix = pix32;
                      break;
             }
             if (output + p1 * (c->bpp / 8) > output_end)
@@ -139,17 +157,15 @@ static int decode_rle(CamtasiaContext *c, unsigned int srcsize)
                 switch(c->bpp){
                 case  8: *output++ = pix[0];
                          break;
-                case 16: *output++ = pix[0];
-                         *output++ = pix[1];
+                case 16: *(uint16_t*)output = pix16;
+                         output += 2;
                          break;
                 case 24: *output++ = pix[0];
                          *output++ = pix[1];
                          *output++ = pix[2];
                          break;
-                case 32: *output++ = pix[0];
-                         *output++ = pix[1];
-                         *output++ = pix[2];
-                         *output++ = pix[3];
+                case 32: *(uint32_t*)output = pix32;
+                         output += 4;
                          break;
                 }
             }
diff --git a/src/libffmpeg/libavcodec/utils.c b/src/libffmpeg/libavcodec/utils.c
index 525fc9a98..0f8a4f412 100644
--- a/src/libffmpeg/libavcodec/utils.c
+++ b/src/libffmpeg/libavcodec/utils.c
@@ -1,7 +1,6 @@
 /*
  * utils for libavcodec
  * Copyright (c) 2001 Fabrice Bellard.
- * Copyright (c) 2003 Michel Bardiaux for the av_log API
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
  * This library is free software; you can redistribute it and/or
@@ -29,9 +28,13 @@
 #include "mpegvideo.h"
 #include "integer.h"
 #include "opt.h"
+#include "crc.h"
 #include <stdarg.h>
 #include <limits.h>
 #include <float.h>
+#ifdef __MINGW32__
+#include <fcntl.h>
+#endif
 
 const uint8_t ff_reverse[256]={
 0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
@@ -446,7 +449,7 @@ static const char* context_to_name(void* ptr) {
 #define E AV_OPT_FLAG_ENCODING_PARAM
 #define D AV_OPT_FLAG_DECODING_PARAM
 
-static AVOption options[]={
+static const AVOption options[]={
 {"bit_rate", NULL, OFFSET(bit_rate), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|A|E},
 {"bit_rate_tolerance", NULL, OFFSET(bit_rate_tolerance), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"flags", NULL, OFFSET(flags), FF_OPT_TYPE_FLAGS, DEFAULT, INT_MIN, INT_MAX, V|A|E|D, "flags"},
@@ -495,7 +498,7 @@ static AVOption options[]={
 {"sample_rate", NULL, OFFSET(sample_rate), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"channels", NULL, OFFSET(channels), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"cutoff", "set cutoff bandwidth", OFFSET(cutoff), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, A|E},
-{"frame_size", NULL, OFFSET(frame_size), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
+{"frame_size", NULL, OFFSET(frame_size), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, A|E},
 {"frame_number", NULL, OFFSET(frame_number), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"real_pict_num", NULL, OFFSET(real_pict_num), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"delay", NULL, OFFSET(delay), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
@@ -650,8 +653,10 @@ static AVOption options[]={
 {"vsad", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_VSAD, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"vsse", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_VSSE, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"nsse", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_NSSE, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#ifdef CONFIG_SNOW_ENCODER
 {"w53", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_W53, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"w97", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_W97, INT_MIN, INT_MAX, V|E, "cmp_func"},
+#endif
 {"dctmax", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_DCTMAX, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"chroma", NULL, 0, FF_OPT_TYPE_CONST, FF_CMP_CHROMA, INT_MIN, INT_MAX, V|E, "cmp_func"},
 {"pre_dia_size", NULL, OFFSET(pre_dia_size), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
@@ -716,7 +721,7 @@ static AVOption options[]={
 {"refs", NULL, OFFSET(refs), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"chromaoffset", NULL, OFFSET(chromaoffset), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"bframebias", NULL, OFFSET(bframebias), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
-{"trellis", NULL, OFFSET(trellis), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
+{"trellis", NULL, OFFSET(trellis), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|A|E},
 {"directpred", NULL, OFFSET(directpred), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"bpyramid", NULL, 0, FF_OPT_TYPE_CONST, CODEC_FLAG2_BPYRAMID, INT_MIN, INT_MAX, V|E, "flags2"},
 {"wpred", NULL, 0, FF_OPT_TYPE_CONST, CODEC_FLAG2_WPRED, INT_MIN, INT_MAX, V|E, "flags2"},
@@ -735,6 +740,17 @@ static AVOption options[]={
 {"partp8x8", NULL, 0, FF_OPT_TYPE_CONST, X264_PART_P8X8, INT_MIN, INT_MAX, V|E, "partitions"},
 {"partb8x8", NULL, 0, FF_OPT_TYPE_CONST, X264_PART_B8X8, INT_MIN, INT_MAX, V|E, "partitions"},
 {"sc_factor", NULL, OFFSET(scenechange_factor), FF_OPT_TYPE_INT, 6, 0, INT_MAX, V|E},
+{"mv0_threshold", NULL, OFFSET(mv0_threshold), FF_OPT_TYPE_INT, 256, 0, INT_MAX, V|E},
+{"ivlc", "intra vlc table", 0, FF_OPT_TYPE_CONST, CODEC_FLAG2_INTRA_VLC, INT_MIN, INT_MAX, V|E, "flags2"},
+{"b_sensitivity", NULL, OFFSET(b_sensitivity), FF_OPT_TYPE_INT, 40, 1, INT_MAX, V|E},
+{"compression_level", NULL, OFFSET(compression_level), FF_OPT_TYPE_INT, FF_COMPRESSION_DEFAULT, INT_MIN, INT_MAX, V|A|E},
+{"use_lpc", NULL, OFFSET(use_lpc), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, A|E},
+{"lpc_coeff_precision", NULL, OFFSET(lpc_coeff_precision), FF_OPT_TYPE_INT, DEFAULT, 0, INT_MAX, A|E},
+{"min_prediction_order", NULL, OFFSET(min_prediction_order), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, A|E},
+{"max_prediction_order", NULL, OFFSET(max_prediction_order), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, A|E},
+{"prediction_order_method", NULL, OFFSET(prediction_order_method), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, A|E},
+{"min_partition_order", NULL, OFFSET(min_partition_order), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, A|E},
+{"max_partition_order", NULL, OFFSET(max_partition_order), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, A|E},
 {NULL},
 };
 
@@ -788,6 +804,16 @@ void avcodec_get_context_defaults(AVCodecContext *s){
     s->pix_fmt= PIX_FMT_NONE;
     s->frame_skip_cmp= FF_CMP_DCTMAX;
     s->nsse_weight= 8;
+    s->sample_fmt= SAMPLE_FMT_S16; // FIXME: set to NONE
+    s->mv0_threshold= 256;
+    s->b_sensitivity= 40;
+    s->compression_level = FF_COMPRESSION_DEFAULT;
+    s->use_lpc = -1;
+    s->min_prediction_order = -1;
+    s->max_prediction_order = -1;
+    s->prediction_order_method = -1;
+    s->min_partition_order = -1;
+    s->max_partition_order = -1;
 
     s->intra_quant_bias= FF_DEFAULT_QUANT_BIAS;
     s->inter_quant_bias= FF_DEFAULT_QUANT_BIAS;
@@ -843,9 +869,6 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec)
     if(avctx->codec)
         goto end;
 
-    avctx->codec = codec;
-    avctx->codec_id = codec->id;
-    avctx->frame_number = 0;
     if (codec->priv_data_size > 0) {
         avctx->priv_data = av_mallocz(codec->priv_data_size);
         if (!avctx->priv_data)
@@ -864,9 +887,13 @@ int avcodec_open(AVCodecContext *avctx, AVCodec *codec)
         goto end;
     }
 
+    avctx->codec = codec;
+    avctx->codec_id = codec->id;
+    avctx->frame_number = 0;
     ret = avctx->codec->init(avctx);
     if (ret < 0) {
         av_freep(&avctx->priv_data);
+        avctx->codec= NULL;
         goto end;
     }
     ret=0;
@@ -1216,6 +1243,15 @@ unsigned avcodec_build( void )
   return LIBAVCODEC_BUILD;
 }
 
+static void init_crcs(void){
+    av_crc04C11DB7= av_mallocz_static(sizeof(AVCRC) * 257);
+    av_crc8005    = av_mallocz_static(sizeof(AVCRC) * 257);
+    av_crc07      = av_mallocz_static(sizeof(AVCRC) * 257);
+    av_crc_init(av_crc04C11DB7, 0, 32, 0x04c11db7, sizeof(AVCRC)*257);
+    av_crc_init(av_crc8005    , 0, 16, 0x8005    , sizeof(AVCRC)*257);
+    av_crc_init(av_crc07      , 0,  8, 0x07      , sizeof(AVCRC)*257);
+}
+
 /* must be called before any other functions */
 void avcodec_init(void)
 {
@@ -1226,6 +1262,7 @@ void avcodec_init(void)
     inited = 1;
 
     dsputil_static_init();
+    init_crcs();
 }
 
 /**
@@ -1266,55 +1303,39 @@ char av_get_pict_type_char(int pict_type){
     }
 }
 
-/* av_log API */
-
-static int av_log_level = AV_LOG_INFO;
-
-static void av_log_default_callback(void* ptr, int level, const char* fmt, va_list vl)
-{
-    static int print_prefix=1;
-    AVClass* avc= ptr ? *(AVClass**)ptr : NULL;
-    if(level>av_log_level)
-        return;
-/* #undef fprintf */
-    if(print_prefix && avc) {
-            fprintf(stderr, "[%s @ %p]", avc->item_name(ptr), avc);
+int av_get_bits_per_sample(enum CodecID codec_id){
+    switch(codec_id){
+    case CODEC_ID_ADPCM_SBPRO_2:
+        return 2;
+    case CODEC_ID_ADPCM_SBPRO_3:
+        return 3;
+    case CODEC_ID_ADPCM_SBPRO_4:
+    case CODEC_ID_ADPCM_CT:
+        return 4;
+    case CODEC_ID_PCM_ALAW:
+    case CODEC_ID_PCM_MULAW:
+    case CODEC_ID_PCM_S8:
+    case CODEC_ID_PCM_U8:
+        return 8;
+    case CODEC_ID_PCM_S16BE:
+    case CODEC_ID_PCM_S16LE:
+    case CODEC_ID_PCM_U16BE:
+    case CODEC_ID_PCM_U16LE:
+        return 16;
+    case CODEC_ID_PCM_S24DAUD:
+    case CODEC_ID_PCM_S24BE:
+    case CODEC_ID_PCM_S24LE:
+    case CODEC_ID_PCM_U24BE:
+    case CODEC_ID_PCM_U24LE:
+        return 24;
+    case CODEC_ID_PCM_S32BE:
+    case CODEC_ID_PCM_S32LE:
+    case CODEC_ID_PCM_U32BE:
+    case CODEC_ID_PCM_U32LE:
+        return 32;
+    default:
+        return 0;
     }
-/* #define fprintf please_use_av_log */
-
-    print_prefix= strstr(fmt, "\n") != NULL;
-
-    vfprintf(stderr, fmt, vl);
-}
-
-static void (*av_log_callback)(void*, int, const char*, va_list) = av_log_default_callback;
-
-void av_log(void* avcl, int level, const char *fmt, ...)
-{
-    va_list vl;
-    va_start(vl, fmt);
-    av_vlog(avcl, level, fmt, vl);
-    va_end(vl);
-}
-
-void av_vlog(void* avcl, int level, const char *fmt, va_list vl)
-{
-    av_log_callback(avcl, level, fmt, vl);
-}
-
-int av_log_get_level(void)
-{
-    return av_log_level;
-}
-
-void av_log_set_level(int level)
-{
-    av_log_level = level;
-}
-
-void av_log_set_callback(void (*callback)(void*, int, const char*, va_list))
-{
-    av_log_callback = callback;
 }
 
 #if !defined(HAVE_THREADS)
@@ -1336,3 +1357,39 @@ unsigned int av_xiphlacing(unsigned char *s, unsigned int v)
     n++;
     return n;
 }
+
+/* Wrapper to work around the lack of mkstemp() on mingw/cygin.
+ * Also, tries to create file in /tmp first, if possible.
+ * *prefix can be a character constant; *filename will be allocated internally.
+ * Returns file descriptor of opened file (or -1 on error)
+ * and opened file name in **filename. */
+int av_tempfile(char *prefix, char **filename) {
+    int fd=-1;
+#ifdef __MINGW32__
+    *filename = tempnam(".", prefix);
+#else
+    size_t len = strlen(prefix) + 12; /* room for "/tmp/" and "XXXXXX\0" */
+    *filename = av_malloc(len);
+#endif
+    /* -----common section-----*/
+    if (*filename == NULL) {
+        av_log(NULL, AV_LOG_ERROR, "ff_tempfile: Cannot allocate file name\n");
+        return -1;
+    }
+#ifdef __MINGW32__
+    fd = open(*filename, _O_RDWR | _O_BINARY | _O_CREAT, 0444);
+#else
+    snprintf(*filename, len, "/tmp/%sXXXXXX", prefix);
+    fd = mkstemp(*filename);
+    if (fd < 0) {
+        snprintf(*filename, len, "./%sXXXXXX", prefix);
+        fd = mkstemp(*filename);
+    }
+#endif
+    /* -----common section-----*/
+    if (fd < 0) {
+        av_log(NULL, AV_LOG_ERROR, "ff_tempfile: Cannot open temporary file %s\n", *filename);
+        return -1;
+    }
+    return fd; /* success */
+}
diff --git a/src/libffmpeg/libavcodec/vmdav.c b/src/libffmpeg/libavcodec/vmdav.c
index 34685b676..b850a09f9 100644
--- a/src/libffmpeg/libavcodec/vmdav.c
+++ b/src/libffmpeg/libavcodec/vmdav.c
@@ -414,16 +414,28 @@ typedef struct VmdAudioContext {
     int channels;
     int bits;
     int block_align;
-    unsigned char steps8[16];
-    unsigned short steps16[16];
-    unsigned short steps128[256];
-    short predictors[2];
+    int predictors[2];
 } VmdAudioContext;
 
+static uint16_t vmdaudio_table[128] = {
+    0x000, 0x008, 0x010, 0x020, 0x030, 0x040, 0x050, 0x060, 0x070, 0x080,
+    0x090, 0x0A0, 0x0B0, 0x0C0, 0x0D0, 0x0E0, 0x0F0, 0x100, 0x110, 0x120,
+    0x130, 0x140, 0x150, 0x160, 0x170, 0x180, 0x190, 0x1A0, 0x1B0, 0x1C0,
+    0x1D0, 0x1E0, 0x1F0, 0x200, 0x208, 0x210, 0x218, 0x220, 0x228, 0x230,
+    0x238, 0x240, 0x248, 0x250, 0x258, 0x260, 0x268, 0x270, 0x278, 0x280,
+    0x288, 0x290, 0x298, 0x2A0, 0x2A8, 0x2B0, 0x2B8, 0x2C0, 0x2C8, 0x2D0,
+    0x2D8, 0x2E0, 0x2E8, 0x2F0, 0x2F8, 0x300, 0x308, 0x310, 0x318, 0x320,
+    0x328, 0x330, 0x338, 0x340, 0x348, 0x350, 0x358, 0x360, 0x368, 0x370,
+    0x378, 0x380, 0x388, 0x390, 0x398, 0x3A0, 0x3A8, 0x3B0, 0x3B8, 0x3C0,
+    0x3C8, 0x3D0, 0x3D8, 0x3E0, 0x3E8, 0x3F0, 0x3F8, 0x400, 0x440, 0x480,
+    0x4C0, 0x500, 0x540, 0x580, 0x5C0, 0x600, 0x640, 0x680, 0x6C0, 0x700,
+    0x740, 0x780, 0x7C0, 0x800, 0x900, 0xA00, 0xB00, 0xC00, 0xD00, 0xE00,
+    0xF00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x3000, 0x4000
+};
+
 static int vmdaudio_decode_init(AVCodecContext *avctx)
 {
     VmdAudioContext *s = (VmdAudioContext *)avctx->priv_data;
-    int i;
 
     s->avctx = avctx;
     s->channels = avctx->channels;
@@ -433,53 +445,25 @@ static int vmdaudio_decode_init(AVCodecContext *avctx)
     av_log(s->avctx, AV_LOG_DEBUG, "%d channels, %d bits/sample, block align = %d, sample rate = %d\n",
             s->channels, s->bits, s->block_align, avctx->sample_rate);
 
-    /* set up the steps8 and steps16 tables */
-    for (i = 0; i < 8; i++) {
-        if (i < 4)
-            s->steps8[i] = i;
-        else
-            s->steps8[i] = s->steps8[i - 1] + i - 1;
-
-        if (i == 0)
-            s->steps16[i] = 0;
-        else if (i == 1)
-            s->steps16[i] = 4;
-        else if (i == 2)
-            s->steps16[i] = 16;
-        else
-            s->steps16[i] = 1 << (i + 4);
-    }
-
-    /* set up the step128 table */
-    s->steps128[0] = 0;
-    s->steps128[1] = 8;
-    for (i = 0x02; i <= 0x20; i++)
-        s->steps128[i] = (i - 1) << 4;
-    for (i = 0x21; i <= 0x60; i++)
-        s->steps128[i] = (i + 0x1F) << 3;
-    for (i = 0x61; i <= 0x70; i++)
-        s->steps128[i] = (i - 0x51) << 6;
-    for (i = 0x71; i <= 0x78; i++)
-        s->steps128[i] = (i - 0x69) << 8;
-    for (i = 0x79; i <= 0x7D; i++)
-        s->steps128[i] = (i - 0x75) << 10;
-    s->steps128[0x7E] = 0x3000;
-    s->steps128[0x7F] = 0x4000;
-
-    /* set up the negative half of each table */
-    for (i = 0; i < 8; i++) {
-        s->steps8[i + 8] = -s->steps8[i];
-        s->steps16[i + 8] = -s->steps16[i];
-    }
-    for (i = 0; i < 128; i++)
-      s->steps128[i + 128] = -s->steps128[i];
-
     return 0;
 }
 
 static void vmdaudio_decode_audio(VmdAudioContext *s, unsigned char *data,
-    uint8_t *buf, int ratio) {
+    uint8_t *buf, int stereo)
+{
+    int i;
+    int chan = 0;
+    int16_t *out = (int16_t*)data;
 
+    for(i = 0; i < s->block_align; i++) {
+        if(buf[i] & 0x80)
+            s->predictors[chan] -= vmdaudio_table[buf[i] & 0x7F];
+        else
+            s->predictors[chan] += vmdaudio_table[buf[i]];
+        s->predictors[chan] = clip(s->predictors[chan], -32768, 32767);
+        out[i] = s->predictors[chan];
+        chan ^= stereo;
+    }
 }
 
 static int vmdaudio_loadsound(VmdAudioContext *s, unsigned char *data,
@@ -488,44 +472,39 @@ static int vmdaudio_loadsound(VmdAudioContext *s, unsigned char *data,
     int bytes_decoded = 0;
     int i;
 
-    if (silence)
-        av_log(s->avctx, AV_LOG_INFO, "silent block!\n");
+//    if (silence)
+//        av_log(s->avctx, AV_LOG_INFO, "silent block!\n");
     if (s->channels == 2) {
 
         /* stereo handling */
-        if ((s->block_align & 0x01) == 0) {
-            if (silence)
-                memset(data, 0, s->block_align * 2);
-            else
-                vmdaudio_decode_audio(s, data, buf, 1);
+        if (silence) {
+            memset(data, 0, s->block_align * 2);
         } else {
-            if (silence)
-                memset(data, 0, s->block_align * 2);
-            else
+            if (s->bits == 16)
                 vmdaudio_decode_audio(s, data, buf, 1);
+            else
+                /* copy the data but convert it to signed */
+                for (i = 0; i < s->block_align; i++)
+                    data[i * 2 + 1] = buf[i] + 0x80;
         }
     } else {
+        bytes_decoded = s->block_align * 2;
 
         /* mono handling */
         if (silence) {
+            memset(data, 0, s->block_align * 2);
+        } else {
             if (s->bits == 16) {
-                memset(data, 0, s->block_align * 2);
-                bytes_decoded = s->block_align * 2;
+                vmdaudio_decode_audio(s, data, buf, 0);
             } else {
-//                memset(data, 0x00, s->block_align);
-//                bytes_decoded = s->block_align;
-memset(data, 0x00, s->block_align * 2);
-bytes_decoded = s->block_align * 2;
+                /* copy the data but convert it to signed */
+                for (i = 0; i < s->block_align; i++)
+                    data[i * 2 + 1] = buf[i] + 0x80;
             }
-        } else {
-            /* copy the data but convert it to signed */
-            for (i = 0; i < s->block_align; i++)
-                data[i * 2 + 1] = buf[i] + 0x80;
-            bytes_decoded = s->block_align * 2;
         }
     }
 
-    return bytes_decoded;
+    return s->block_align * 2;
 }
 
 static int vmdaudio_decode_frame(AVCodecContext *avctx,
diff --git a/src/libffmpeg/libavcodec/vorbis.c b/src/libffmpeg/libavcodec/vorbis.c
index 9cc09bed1..de3688c91 100644
--- a/src/libffmpeg/libavcodec/vorbis.c
+++ b/src/libffmpeg/libavcodec/vorbis.c
@@ -20,6 +20,8 @@
  */
 
 #undef V_DEBUG
+//#define V_DEBUG
+//#define AV_DEBUG(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
 
 #include <math.h>
 
@@ -473,7 +475,7 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc) {
                 }
 
                 for(k=0;k<(1<<floor_setup->data.t1.class_subclasses[j]);++k) {
-                    floor_setup->data.t1.subclass_books[j][k]=get_bits(gb, 8)-1;
+                    floor_setup->data.t1.subclass_books[j][k]=(int16_t)get_bits(gb, 8)-1;
 
                     AV_DEBUG("    book %d. : %d \n", k, floor_setup->data.t1.subclass_books[j][k]);
                 }
@@ -872,10 +874,17 @@ static int vorbis_parse_id_hdr(vorbis_context *vc){
     bl1=get_bits(gb, 4);
     vc->blocksize_0=(1<<bl0);
     vc->blocksize_1=(1<<bl1);
-    if (bl0>13 || bl0<6 || bl1>13 || bl1<6) {
+    if (bl0>13 || bl0<6 || bl1>13 || bl1<6 || bl1<bl0) {
         av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (illegal blocksize). \n");
         return 3;
     }
+    // output format int16
+    if (vc->blocksize_1/2 * vc->audio_channels * 2 >
+                                             AVCODEC_MAX_AUDIO_FRAME_SIZE) {
+        av_log(vc->avccontext, AV_LOG_ERROR, "Vorbis channel count makes "
+               "output packets too large.\n");
+        return 4;
+    }
     vc->swin=vwin[bl0-6];
     vc->lwin=vwin[bl1-6];
 
@@ -1345,12 +1354,14 @@ static int vorbis_residue_decode(vorbis_context *vc, vorbis_residue *vr, uint_fa
 
                         AV_DEBUG("Classword: %d \n", temp);
 
-                        assert(vr->classifications > 1 && vr->classifications<256 && temp<=65536); //needed for inverse[]
+                        assert(vr->classifications > 1 && temp<=65536); //needed for inverse[]
                         for(i=0;i<c_p_c;++i) {
                             uint_fast32_t temp2;
 
                             temp2=(((uint_fast64_t)temp) * inverse[vr->classifications])>>32;
-                            classifs[j_times_ptns_to_read+partition_count+c_p_c-1-i]=temp-temp2*vr->classifications;
+                            if (partition_count+c_p_c-1-i < ptns_to_read) {
+                                classifs[j_times_ptns_to_read+partition_count+c_p_c-1-i]=temp-temp2*vr->classifications;
+                            }
                             temp=temp2;
                         }
                     }
diff --git a/src/libffmpeg/libavcodec/vp3.c b/src/libffmpeg/libavcodec/vp3.c
index a7a9e8bac..b5cfbb02c 100644
--- a/src/libffmpeg/libavcodec/vp3.c
+++ b/src/libffmpeg/libavcodec/vp3.c
@@ -229,6 +229,8 @@ typedef struct Vp3DecodeContext {
     DSPContext dsp;
     int flipped_image;
 
+    int qis[3];
+    int nqis;
     int quality_index;
     int last_quality_index;
 
@@ -254,17 +256,17 @@ typedef struct Vp3DecodeContext {
     Vp3Fragment *all_fragments;
     Coeff *coeffs;
     Coeff *next_coeff;
-    int u_fragment_start;
-    int v_fragment_start;
+    int fragment_start[3];
 
     ScanTable scantable;
 
     /* tables */
     uint16_t coded_dc_scale_factor[64];
     uint32_t coded_ac_scale_factor[64];
-    uint16_t coded_intra_y_dequant[64];
-    uint16_t coded_intra_c_dequant[64];
-    uint16_t coded_inter_dequant[64];
+    uint8_t base_matrix[384][64];
+    uint8_t qr_count[2][3];
+    uint8_t qr_size [2][3][64];
+    uint16_t qr_base[2][3][64];
 
     /* this is a list of indices into the all_fragments array indicating
      * which of the fragments are coded */
@@ -285,9 +287,7 @@ typedef struct Vp3DecodeContext {
 
     /* these arrays need to be on 16-byte boundaries since SSE2 operations
      * index into them */
-    DECLARE_ALIGNED_16(int16_t, intra_y_dequant[64]);
-    DECLARE_ALIGNED_16(int16_t, intra_c_dequant[64]);
-    DECLARE_ALIGNED_16(int16_t, inter_dequant[64]);
+    DECLARE_ALIGNED_16(int16_t, qmat[2][4][64]);        //<qmat[is_inter][plane]
 
     /* This table contains superblock_count * 16 entries. Each set of 16
      * numbers corresponds to the fragment indices 0..15 of the superblock.
@@ -328,8 +328,7 @@ typedef struct Vp3DecodeContext {
     int bounding_values_array[256];
 } Vp3DecodeContext;
 
-static int theora_decode_comments(AVCodecContext *avctx, GetBitContext gb);
-static int theora_decode_tables(AVCodecContext *avctx, GetBitContext gb);
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb);
 
 /************************************************************************
  * VP3 specific functions
@@ -345,8 +344,6 @@ static int theora_decode_tables(AVCodecContext *avctx, GetBitContext gb);
 static int init_block_mapping(Vp3DecodeContext *s)
 {
     int i, j;
-    signed int hilbert_walk_y[16];
-    signed int hilbert_walk_c[16];
     signed int hilbert_walk_mb[4];
 
     int current_fragment = 0;
@@ -385,41 +382,6 @@ static int init_block_mapping(Vp3DecodeContext *s)
 
     debug_vp3("  vp3: initialize block mapping tables\n");
 
-    /* figure out hilbert pattern per these frame dimensions */
-    hilbert_walk_y[0]  = 1;
-    hilbert_walk_y[1]  = 1;
-    hilbert_walk_y[2]  = s->fragment_width;
-    hilbert_walk_y[3]  = -1;
-    hilbert_walk_y[4]  = s->fragment_width;
-    hilbert_walk_y[5]  = s->fragment_width;
-    hilbert_walk_y[6]  = 1;
-    hilbert_walk_y[7]  = -s->fragment_width;
-    hilbert_walk_y[8]  = 1;
-    hilbert_walk_y[9]  = s->fragment_width;
-    hilbert_walk_y[10]  = 1;
-    hilbert_walk_y[11] = -s->fragment_width;
-    hilbert_walk_y[12] = -s->fragment_width;
-    hilbert_walk_y[13] = -1;
-    hilbert_walk_y[14] = -s->fragment_width;
-    hilbert_walk_y[15] = 1;
-
-    hilbert_walk_c[0]  = 1;
-    hilbert_walk_c[1]  = 1;
-    hilbert_walk_c[2]  = s->fragment_width / 2;
-    hilbert_walk_c[3]  = -1;
-    hilbert_walk_c[4]  = s->fragment_width / 2;
-    hilbert_walk_c[5]  = s->fragment_width / 2;
-    hilbert_walk_c[6]  = 1;
-    hilbert_walk_c[7]  = -s->fragment_width / 2;
-    hilbert_walk_c[8]  = 1;
-    hilbert_walk_c[9]  = s->fragment_width / 2;
-    hilbert_walk_c[10]  = 1;
-    hilbert_walk_c[11] = -s->fragment_width / 2;
-    hilbert_walk_c[12] = -s->fragment_width / 2;
-    hilbert_walk_c[13] = -1;
-    hilbert_walk_c[14] = -s->fragment_width / 2;
-    hilbert_walk_c[15] = 1;
-
     hilbert_walk_mb[0] = 1;
     hilbert_walk_mb[1] = s->macroblock_width;
     hilbert_walk_mb[2] = 1;
@@ -440,7 +402,6 @@ static int init_block_mapping(Vp3DecodeContext *s)
             current_height = 0;
             superblock_row_inc = 3 * s->fragment_width -
                 (s->y_superblock_width * 4 - s->fragment_width);
-            hilbert = hilbert_walk_y;
 
             /* the first operation for this variable is to advance by 1 */
             current_fragment = -1;
@@ -454,10 +415,9 @@ static int init_block_mapping(Vp3DecodeContext *s)
             current_height = 0;
             superblock_row_inc = 3 * (s->fragment_width / 2) -
                 (s->c_superblock_width * 4 - s->fragment_width / 2);
-            hilbert = hilbert_walk_c;
 
             /* the first operation for this variable is to advance by 1 */
-            current_fragment = s->u_fragment_start - 1;
+            current_fragment = s->fragment_start[1] - 1;
 
         } else if (i == s->v_superblock_start) {
 
@@ -468,10 +428,9 @@ static int init_block_mapping(Vp3DecodeContext *s)
             current_height = 0;
             superblock_row_inc = 3 * (s->fragment_width / 2) -
                 (s->c_superblock_width * 4 - s->fragment_width / 2);
-            hilbert = hilbert_walk_c;
 
             /* the first operation for this variable is to advance by 1 */
-            current_fragment = s->v_fragment_start - 1;
+            current_fragment = s->fragment_start[2] - 1;
 
         }
 
@@ -486,7 +445,7 @@ static int init_block_mapping(Vp3DecodeContext *s)
 
         /* iterate through all 16 fragments in a superblock */
         for (j = 0; j < 16; j++) {
-            current_fragment += hilbert[j];
+            current_fragment += travel_width[j] + right_edge * travel_height[j];
             current_width += travel_width[j];
             current_height += travel_height[j];
 
@@ -593,13 +552,13 @@ static int init_block_mapping(Vp3DecodeContext *s)
                 s->macroblock_fragments[mapping_index++] = -1;
 
             /* C planes */
-            c_fragment = s->u_fragment_start +
+            c_fragment = s->fragment_start[1] +
                 (i * s->fragment_width / 4) + (j / 2);
             s->all_fragments[c_fragment].macroblock = s->macroblock_count;
             s->macroblock_fragments[mapping_index++] = c_fragment;
             debug_init("%d ", c_fragment);
 
-            c_fragment = s->v_fragment_start +
+            c_fragment = s->fragment_start[2] +
                 (i * s->fragment_width / 4) + (j / 2);
             s->all_fragments[c_fragment].macroblock = s->macroblock_count;
             s->macroblock_fragments[mapping_index++] = c_fragment;
@@ -646,94 +605,38 @@ static void init_frame(Vp3DecodeContext *s, GetBitContext *gb)
  */
 static void init_dequantizer(Vp3DecodeContext *s)
 {
-
     int ac_scale_factor = s->coded_ac_scale_factor[s->quality_index];
     int dc_scale_factor = s->coded_dc_scale_factor[s->quality_index];
-    int i, j;
+    int i, j, plane, inter, qri, bmi, bmj, qistart;
 
     debug_vp3("  vp3: initializing dequantization tables\n");
 
-    /*
-     * Scale dequantizers:
-     *
-     *   quantizer * sf
-     *   --------------
-     *        100
-     *
-     * where sf = dc_scale_factor for DC quantizer
-     *         or ac_scale_factor for AC quantizer
-     *
-     * Then, saturate the result to a lower limit of MIN_DEQUANT_VAL.
-     */
-#define SCALER 4
-
-    /* scale DC quantizers */
-    s->intra_y_dequant[0] = s->coded_intra_y_dequant[0] * dc_scale_factor / 100;
-    if (s->intra_y_dequant[0] < MIN_DEQUANT_VAL * 2)
-        s->intra_y_dequant[0] = MIN_DEQUANT_VAL * 2;
-    s->intra_y_dequant[0] *= SCALER;
-
-    s->intra_c_dequant[0] = s->coded_intra_c_dequant[0] * dc_scale_factor / 100;
-    if (s->intra_c_dequant[0] < MIN_DEQUANT_VAL * 2)
-        s->intra_c_dequant[0] = MIN_DEQUANT_VAL * 2;
-    s->intra_c_dequant[0] *= SCALER;
-
-    s->inter_dequant[0] = s->coded_inter_dequant[0] * dc_scale_factor / 100;
-    if (s->inter_dequant[0] < MIN_DEQUANT_VAL * 4)
-        s->inter_dequant[0] = MIN_DEQUANT_VAL * 4;
-    s->inter_dequant[0] *= SCALER;
-
-    /* scale AC quantizers, zigzag at the same time in preparation for
-     * the dequantization phase */
-    for (i = 1; i < 64; i++) {
-        int k= s->scantable.scantable[i];
-        j = s->scantable.permutated[i];
-
-        s->intra_y_dequant[j] = s->coded_intra_y_dequant[k] * ac_scale_factor / 100;
-        if (s->intra_y_dequant[j] < MIN_DEQUANT_VAL)
-            s->intra_y_dequant[j] = MIN_DEQUANT_VAL;
-        s->intra_y_dequant[j] *= SCALER;
-
-        s->intra_c_dequant[j] = s->coded_intra_c_dequant[k] * ac_scale_factor / 100;
-        if (s->intra_c_dequant[j] < MIN_DEQUANT_VAL)
-            s->intra_c_dequant[j] = MIN_DEQUANT_VAL;
-        s->intra_c_dequant[j] *= SCALER;
-
-        s->inter_dequant[j] = s->coded_inter_dequant[k] * ac_scale_factor / 100;
-        if (s->inter_dequant[j] < MIN_DEQUANT_VAL * 2)
-            s->inter_dequant[j] = MIN_DEQUANT_VAL * 2;
-        s->inter_dequant[j] *= SCALER;
+    for(inter=0; inter<2; inter++){
+        for(plane=0; plane<3; plane++){
+            int sum=0;
+            for(qri=0; qri<s->qr_count[inter][plane]; qri++){
+                sum+= s->qr_size[inter][plane][qri];
+                if(s->quality_index <= sum)
+                    break;
+            }
+            qistart= sum - s->qr_size[inter][plane][qri];
+            bmi= s->qr_base[inter][plane][qri  ];
+            bmj= s->qr_base[inter][plane][qri+1];
+            for(i=0; i<64; i++){
+                int coeff= (  2*(sum    -s->quality_index)*s->base_matrix[bmi][i]
+                            - 2*(qistart-s->quality_index)*s->base_matrix[bmj][i]
+                            + s->qr_size[inter][plane][qri])
+                           / (2*s->qr_size[inter][plane][qri]);
+
+                int qmin= 8<<(inter + !i);
+                int qscale= i ? ac_scale_factor : dc_scale_factor;
+
+                s->qmat[inter][plane][i]= clip((qscale * coeff)/100 * 4, qmin, 4096);
+            }
+        }
     }
 
-    memset(s->qscale_table, (FFMAX(s->intra_y_dequant[1], s->intra_c_dequant[1])+8)/16, 512); //FIXME finetune
-
-    /* print debug information as requested */
-    debug_dequantizers("intra Y dequantizers:\n");
-    for (i = 0; i < 8; i++) {
-      for (j = i * 8; j < i * 8 + 8; j++) {
-        debug_dequantizers(" %4d,", s->intra_y_dequant[j]);
-      }
-      debug_dequantizers("\n");
-    }
-    debug_dequantizers("\n");
-
-    debug_dequantizers("intra C dequantizers:\n");
-    for (i = 0; i < 8; i++) {
-      for (j = i * 8; j < i * 8 + 8; j++) {
-        debug_dequantizers(" %4d,", s->intra_c_dequant[j]);
-      }
-      debug_dequantizers("\n");
-    }
-    debug_dequantizers("\n");
-
-    debug_dequantizers("interframe dequantizers:\n");
-    for (i = 0; i < 8; i++) {
-      for (j = i * 8; j < i * 8 + 8; j++) {
-        debug_dequantizers(" %4d,", s->inter_dequant[j]);
-      }
-      debug_dequantizers("\n");
-    }
-    debug_dequantizers("\n");
+    memset(s->qscale_table, (FFMAX(s->qmat[0][0][1], s->qmat[0][1][1])+8)/16, 512); //FIXME finetune
 }
 
 /*
@@ -903,7 +806,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                         s->all_fragments[current_fragment].next_coeff= s->coeffs + current_fragment;
                         s->coded_fragment_list[s->coded_fragment_list_index] =
                             current_fragment;
-                        if ((current_fragment >= s->u_fragment_start) &&
+                        if ((current_fragment >= s->fragment_start[1]) &&
                             (s->last_coded_y_fragment == -1) &&
                             (!first_c_fragment_seen)) {
                             s->first_coded_c_fragment = s->coded_fragment_list_index;
@@ -931,7 +834,7 @@ static int unpack_superblocks(Vp3DecodeContext *s, GetBitContext *gb)
                     s->all_fragments[current_fragment].next_coeff= s->coeffs + current_fragment;
                     s->coded_fragment_list[s->coded_fragment_list_index] =
                         current_fragment;
-                    if ((current_fragment >= s->u_fragment_start) &&
+                    if ((current_fragment >= s->fragment_start[1]) &&
                         (s->last_coded_y_fragment == -1) &&
                         (!first_c_fragment_seen)) {
                         s->first_coded_c_fragment = s->coded_fragment_list_index;
@@ -1146,17 +1049,10 @@ static int unpack_vectors(Vp3DecodeContext *s, GetBitContext *gb)
                         motion_y[4] += motion_y[k];
                     }
 
-                    if (motion_x[4] >= 0)
-                        motion_x[4] = (motion_x[4] + 2) / 4;
-                    else
-                        motion_x[4] = (motion_x[4] - 2) / 4;
-                    motion_x[5] = motion_x[4];
-
-                    if (motion_y[4] >= 0)
-                        motion_y[4] = (motion_y[4] + 2) / 4;
-                    else
-                        motion_y[4] = (motion_y[4] - 2) / 4;
-                    motion_y[5] = motion_y[4];
+                    motion_x[5]=
+                    motion_x[4]= RSHIFT(motion_x[4], 2);
+                    motion_y[5]=
+                    motion_y[4]= RSHIFT(motion_y[4], 2);
 
                     /* vector maintenance; vector[3] is treated as the
                      * last vector in this case */
@@ -1416,7 +1312,6 @@ static int unpack_dct_coeffs(Vp3DecodeContext *s, GetBitContext *gb)
   (compatible_frame[s->all_fragments[x].coding_method] == current_frame_type)
 #define FRAME_CODED(x) (s->all_fragments[x].coding_method != MODE_COPY)
 #define DC_COEFF(u) (s->coeffs[u].index ? 0 : s->coeffs[u].coeff) //FIXME do somethin to simplify this
-static inline int iabs (int x) { return ((x < 0) ? -x : x); }
 
 static void reverse_dc_prediction(Vp3DecodeContext *s,
                                   int first_fragment,
@@ -1432,24 +1327,8 @@ static void reverse_dc_prediction(Vp3DecodeContext *s,
     int x, y;
     int i = first_fragment;
 
-    /*
-     * Fragment prediction groups:
-     *
-     * 32222222226
-     * 10000000004
-     * 10000000004
-     * 10000000004
-     * 10000000004
-     *
-     * Note: Groups 5 and 7 do not exist as it would mean that the
-     * fragment's x coordinate is both 0 and (width - 1) at the same time.
-     */
-    int predictor_group;
     short predicted_dc;
 
-    /* validity flags for the left, up-left, up, and up-right fragments */
-    int fl, ful, fu, fur;
-
     /* DC values for the left, up-left, up, and up-right fragments */
     int vl, vul, vu, vur;
 
@@ -1462,26 +1341,24 @@ static void reverse_dc_prediction(Vp3DecodeContext *s,
      *   1: up multiplier
      *   2: up-right multiplier
      *   3: left multiplier
-     *   4: mask
-     *   5: right bit shift divisor (e.g., 7 means >>=7, a.k.a. div by 128)
      */
-    int predictor_transform[16][6] = {
-        {  0,  0,  0,  0,   0,  0 },
-        {  0,  0,  0,  1,   0,  0 },        // PL
-        {  0,  0,  1,  0,   0,  0 },        // PUR
-        {  0,  0, 53, 75, 127,  7 },        // PUR|PL
-        {  0,  1,  0,  0,   0,  0 },        // PU
-        {  0,  1,  0,  1,   1,  1 },        // PU|PL
-        {  0,  1,  0,  0,   0,  0 },        // PU|PUR
-        {  0,  0, 53, 75, 127,  7 },        // PU|PUR|PL
-        {  1,  0,  0,  0,   0,  0 },        // PUL
-        {  0,  0,  0,  1,   0,  0 },        // PUL|PL
-        {  1,  0,  1,  0,   1,  1 },        // PUL|PUR
-        {  0,  0, 53, 75, 127,  7 },        // PUL|PUR|PL
-        {  0,  1,  0,  0,   0,  0 },        // PUL|PU
-        {-26, 29,  0, 29,  31,  5 },        // PUL|PU|PL
-        {  3, 10,  3,  0,  15,  4 },        // PUL|PU|PUR
-        {-26, 29,  0, 29,  31,  5 }         // PUL|PU|PUR|PL
+    int predictor_transform[16][4] = {
+        {  0,  0,  0,  0},
+        {  0,  0,  0,128},        // PL
+        {  0,  0,128,  0},        // PUR
+        {  0,  0, 53, 75},        // PUR|PL
+        {  0,128,  0,  0},        // PU
+        {  0, 64,  0, 64},        // PU|PL
+        {  0,128,  0,  0},        // PU|PUR
+        {  0,  0, 53, 75},        // PU|PUR|PL
+        {128,  0,  0,  0},        // PUL
+        {  0,  0,  0,128},        // PUL|PL
+        { 64,  0, 64,  0},        // PUL|PUR
+        {  0,  0, 53, 75},        // PUL|PUR|PL
+        {  0,128,  0,  0},        // PUL|PU
+       {-104,116,  0,116},        // PUL|PU|PL
+        { 24, 80, 24,  0},        // PUL|PU|PUR
+       {-104,116,  0,116}         // PUL|PU|PUR|PL
     };
 
     /* This table shows which types of blocks can use other blocks for
@@ -1523,113 +1400,33 @@ static void reverse_dc_prediction(Vp3DecodeContext *s,
 
                 current_frame_type =
                     compatible_frame[s->all_fragments[i].coding_method];
-                predictor_group = (x == 0) + ((y == 0) << 1) +
-                    ((x + 1 == fragment_width) << 2);
-                debug_dc_pred(" frag %d: group %d, orig DC = %d, ",
-                    i, predictor_group, DC_COEFF(i));
-
-                switch (predictor_group) {
-
-                case 0:
-                    /* main body of fragments; consider all 4 possible
-                     * fragments for prediction */
-
-                    /* calculate the indices of the predicting fragments */
-                    ul = i - fragment_width - 1;
-                    u = i - fragment_width;
-                    ur = i - fragment_width + 1;
-                    l = i - 1;
-
-                    /* fetch the DC values for the predicting fragments */
-                    vul = DC_COEFF(ul);
-                    vu = DC_COEFF(u);
-                    vur = DC_COEFF(ur);
-                    vl = DC_COEFF(l);
-
-                    /* figure out which fragments are valid */
-                    ful = FRAME_CODED(ul) && COMPATIBLE_FRAME(ul);
-                    fu = FRAME_CODED(u) && COMPATIBLE_FRAME(u);
-                    fur = FRAME_CODED(ur) && COMPATIBLE_FRAME(ur);
-                    fl = FRAME_CODED(l) && COMPATIBLE_FRAME(l);
-
-                    /* decide which predictor transform to use */
-                    transform = (fl*PL) | (fu*PU) | (ful*PUL) | (fur*PUR);
-
-                    break;
-
-                case 1:
-                    /* left column of fragments, not including top corner;
-                     * only consider up and up-right fragments */
+                debug_dc_pred(" frag %d: orig DC = %d, ",
+                    i, DC_COEFF(i));
 
-                    /* calculate the indices of the predicting fragments */
-                    u = i - fragment_width;
-                    ur = i - fragment_width + 1;
-
-                    /* fetch the DC values for the predicting fragments */
-                    vu = DC_COEFF(u);
-                    vur = DC_COEFF(ur);
-
-                    /* figure out which fragments are valid */
-                    fur = FRAME_CODED(ur) && COMPATIBLE_FRAME(ur);
-                    fu = FRAME_CODED(u) && COMPATIBLE_FRAME(u);
-
-                    /* decide which predictor transform to use */
-                    transform = (fu*PU) | (fur*PUR);
-
-                    break;
-
-                case 2:
-                case 6:
-                    /* top row of fragments, not including top-left frag;
-                     * only consider the left fragment for prediction */
-
-                    /* calculate the indices of the predicting fragments */
-                    l = i - 1;
-
-                    /* fetch the DC values for the predicting fragments */
+                transform= 0;
+                if(x){
+                    l= i-1;
                     vl = DC_COEFF(l);
-
-                    /* figure out which fragments are valid */
-                    fl = FRAME_CODED(l) && COMPATIBLE_FRAME(l);
-
-                    /* decide which predictor transform to use */
-                    transform = (fl*PL);
-
-                    break;
-
-                case 3:
-                    /* top-left fragment */
-
-                    /* nothing to predict from in this case */
-                    transform = 0;
-
-                    break;
-
-                case 4:
-                    /* right column of fragments, not including top corner;
-                     * consider up-left, up, and left fragments for
-                     * prediction */
-
-                    /* calculate the indices of the predicting fragments */
-                    ul = i - fragment_width - 1;
-                    u = i - fragment_width;
-                    l = i - 1;
-
-                    /* fetch the DC values for the predicting fragments */
-                    vul = DC_COEFF(ul);
+                    if(FRAME_CODED(l) && COMPATIBLE_FRAME(l))
+                        transform |= PL;
+                }
+                if(y){
+                    u= i-fragment_width;
                     vu = DC_COEFF(u);
-                    vl = DC_COEFF(l);
-
-                    /* figure out which fragments are valid */
-                    ful = FRAME_CODED(ul) && COMPATIBLE_FRAME(ul);
-                    fu = FRAME_CODED(u) && COMPATIBLE_FRAME(u);
-                    fl = FRAME_CODED(l) && COMPATIBLE_FRAME(l);
-
-                    /* decide which predictor transform to use */
-                    transform = (fl*PL) | (fu*PU) | (ful*PUL);
-
-                    break;
-
+                    if(FRAME_CODED(u) && COMPATIBLE_FRAME(u))
+                        transform |= PU;
+                    if(x){
+                        ul= i-fragment_width-1;
+                        vul = DC_COEFF(ul);
+                        if(FRAME_CODED(ul) && COMPATIBLE_FRAME(ul))
+                            transform |= PUL;
+                    }
+                    if(x + 1 < fragment_width){
+                        ur= i-fragment_width+1;
+                        vur = DC_COEFF(ur);
+                        if(FRAME_CODED(ur) && COMPATIBLE_FRAME(ur))
+                            transform |= PUR;
+                    }
                 }
 
                 debug_dc_pred("transform = %d, ", transform);
@@ -1651,22 +1448,16 @@ static void reverse_dc_prediction(Vp3DecodeContext *s,
                         (predictor_transform[transform][2] * vur) +
                         (predictor_transform[transform][3] * vl);
 
-                    /* if there is a shift value in the transform, add
-                     * the sign bit before the shift */
-                    if (predictor_transform[transform][5] != 0) {
-                        predicted_dc += ((predicted_dc >> 15) &
-                            predictor_transform[transform][4]);
-                        predicted_dc >>= predictor_transform[transform][5];
-                    }
+                    predicted_dc /= 128;
 
                     /* check for outranging on the [ul u l] and
                      * [ul u ur l] predictors */
                     if ((transform == 13) || (transform == 15)) {
-                        if (iabs(predicted_dc - vu) > 128)
+                        if (ABS(predicted_dc - vu) > 128)
                             predicted_dc = vu;
-                        else if (iabs(predicted_dc - vl) > 128)
+                        else if (ABS(predicted_dc - vl) > 128)
                             predicted_dc = vl;
-                        else if (iabs(predicted_dc - vul) > 128)
+                        else if (ABS(predicted_dc - vul) > 128)
                             predicted_dc = vul;
                     }
 
@@ -1707,73 +1498,32 @@ static void vertical_filter(unsigned char *first_pixel, int stride,
  */
 static void render_slice(Vp3DecodeContext *s, int slice)
 {
-    int x, y;
+    int x;
     int m, n;
-    int i;  /* indicates current fragment */
     int16_t *dequantizer;
     DECLARE_ALIGNED_16(DCTELEM, block[64]);
-    unsigned char *output_plane;
-    unsigned char *last_plane;
-    unsigned char *golden_plane;
-    int stride;
     int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef;
-    int upper_motion_limit, lower_motion_limit;
     int motion_halfpel_index;
     uint8_t *motion_source;
     int plane;
-    int plane_width;
-    int plane_height;
-    int slice_height;
     int current_macroblock_entry = slice * s->macroblock_width * 6;
-    int fragment_width;
 
     if (slice >= s->macroblock_height)
         return;
 
     for (plane = 0; plane < 3; plane++) {
+        uint8_t *output_plane = s->current_frame.data    [plane];
+        uint8_t *  last_plane = s->   last_frame.data    [plane];
+        uint8_t *golden_plane = s-> golden_frame.data    [plane];
+        int stride            = s->current_frame.linesize[plane];
+        int plane_width       = s->width  >> !!plane;
+        int plane_height      = s->height >> !!plane;
+        int y =        slice *  FRAGMENT_PIXELS << !plane ;
+        int slice_height = y + (FRAGMENT_PIXELS << !plane);
+        int i = s->macroblock_fragments[current_macroblock_entry + plane + 3*!!plane];
+
+        if (!s->flipped_image) stride = -stride;
 
-        /* set up plane-specific parameters */
-        if (plane == 0) {
-            output_plane = s->current_frame.data[0];
-            last_plane = s->last_frame.data[0];
-            golden_plane = s->golden_frame.data[0];
-            stride = s->current_frame.linesize[0];
-            if (!s->flipped_image) stride = -stride;
-            upper_motion_limit = 7 * s->current_frame.linesize[0];
-            lower_motion_limit = s->height * s->current_frame.linesize[0] + s->width - 8;
-            y = slice * FRAGMENT_PIXELS * 2;
-            plane_width = s->width;
-            plane_height = s->height;
-            slice_height = y + FRAGMENT_PIXELS * 2;
-            i = s->macroblock_fragments[current_macroblock_entry + 0];
-        } else if (plane == 1) {
-            output_plane = s->current_frame.data[1];
-            last_plane = s->last_frame.data[1];
-            golden_plane = s->golden_frame.data[1];
-            stride = s->current_frame.linesize[1];
-            if (!s->flipped_image) stride = -stride;
-            upper_motion_limit = 7 * s->current_frame.linesize[1];
-            lower_motion_limit = (s->height / 2) * s->current_frame.linesize[1] + (s->width / 2) - 8;
-            y = slice * FRAGMENT_PIXELS;
-            plane_width = s->width / 2;
-            plane_height = s->height / 2;
-            slice_height = y + FRAGMENT_PIXELS;
-            i = s->macroblock_fragments[current_macroblock_entry + 4];
-        } else {
-            output_plane = s->current_frame.data[2];
-            last_plane = s->last_frame.data[2];
-            golden_plane = s->golden_frame.data[2];
-            stride = s->current_frame.linesize[2];
-            if (!s->flipped_image) stride = -stride;
-            upper_motion_limit = 7 * s->current_frame.linesize[2];
-            lower_motion_limit = (s->height / 2) * s->current_frame.linesize[2] + (s->width / 2) - 8;
-            y = slice * FRAGMENT_PIXELS;
-            plane_width = s->width / 2;
-            plane_height = s->height / 2;
-            slice_height = y + FRAGMENT_PIXELS;
-            i = s->macroblock_fragments[current_macroblock_entry + 5];
-        }
-        fragment_width = plane_width / FRAGMENT_PIXELS;
 
         if(ABS(stride) > 2048)
             return; //various tables are fixed size
@@ -1855,12 +1605,9 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                                 motion_source + stride + 1 + d,
                                 stride, 8);
                         }
-                        dequantizer = s->inter_dequant;
+                        dequantizer = s->qmat[1][plane];
                     }else{
-                        if (plane == 0)
-                            dequantizer = s->intra_y_dequant;
-                        else
-                            dequantizer = s->intra_c_dequant;
+                        dequantizer = s->qmat[0][plane];
                     }
 
                     /* dequantize the DCT coefficients */
@@ -1935,7 +1682,7 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                       (s->all_fragments[i - 1].coding_method != MODE_COPY)) )) {
                     horizontal_filter(
                         output_plane + s->all_fragments[i].first_pixel + 7*stride,
-                        -stride, bounding_values);
+                        -stride, s->bounding_values_array + 127);
                 }
 
                 /* perform the top edge filter if:
@@ -1951,7 +1698,7 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                       (s->all_fragments[i - fragment_width].coding_method != MODE_COPY)) )) {
                     vertical_filter(
                         output_plane + s->all_fragments[i].first_pixel - stride,
-                        -stride, bounding_values);
+                        -stride, s->bounding_values_array + 127);
                 }
 #endif
             }
@@ -1975,7 +1722,7 @@ static void horizontal_filter(unsigned char *first_pixel, int stride,
     unsigned char *end;
     int filter_value;
 
-    for (end= first_pixel + 8*stride; first_pixel < end; first_pixel += stride) {
+    for (end= first_pixel + 8*stride; first_pixel != end; first_pixel += stride) {
         filter_value =
             (first_pixel[-2] - first_pixel[ 1])
          +3*(first_pixel[ 0] - first_pixel[-1]);
@@ -2004,11 +1751,8 @@ static void vertical_filter(unsigned char *first_pixel, int stride,
 
 static void apply_loop_filter(Vp3DecodeContext *s)
 {
-    int x, y, plane;
-    int width, height;
-    int fragment;
-    int stride;
-    unsigned char *plane_data;
+    int plane;
+    int x, y;
     int *bounding_values= s->bounding_values_array+127;
 
 #if 0
@@ -2033,29 +1777,12 @@ static void apply_loop_filter(Vp3DecodeContext *s)
 #endif
 
     for (plane = 0; plane < 3; plane++) {
-
-        if (plane == 0) {
-            /* Y plane parameters */
-            fragment = 0;
-            width = s->fragment_width;
-            height = s->fragment_height;
-            stride = s->current_frame.linesize[0];
-            plane_data = s->current_frame.data[0];
-        } else if (plane == 1) {
-            /* U plane parameters */
-            fragment = s->u_fragment_start;
-            width = s->fragment_width / 2;
-            height = s->fragment_height / 2;
-            stride = s->current_frame.linesize[1];
-            plane_data = s->current_frame.data[1];
-        } else {
-            /* V plane parameters */
-            fragment = s->v_fragment_start;
-            width = s->fragment_width / 2;
-            height = s->fragment_height / 2;
-            stride = s->current_frame.linesize[2];
-            plane_data = s->current_frame.data[2];
-        }
+        int width           = s->fragment_width  >> !!plane;
+        int height          = s->fragment_height >> !!plane;
+        int fragment        = s->fragment_start        [plane];
+        int stride          = s->current_frame.linesize[plane];
+        uint8_t *plane_data = s->current_frame.data    [plane];
+        if (!s->flipped_image) stride = -stride;
 
         for (y = 0; y < height; y++) {
 
@@ -2065,7 +1792,7 @@ START_TIMER
                 if ((x > 0) &&
                     (s->all_fragments[fragment].coding_method != MODE_COPY)) {
                     horizontal_filter(
-                        plane_data + s->all_fragments[fragment].first_pixel - 7*stride,
+                        plane_data + s->all_fragments[fragment].first_pixel,
                         stride, bounding_values);
                 }
 
@@ -2073,7 +1800,7 @@ START_TIMER
                 if ((y > 0) &&
                     (s->all_fragments[fragment].coding_method != MODE_COPY)) {
                     vertical_filter(
-                        plane_data + s->all_fragments[fragment].first_pixel + stride,
+                        plane_data + s->all_fragments[fragment].first_pixel,
                         stride, bounding_values);
                 }
 
@@ -2084,7 +1811,7 @@ START_TIMER
                     (s->all_fragments[fragment].coding_method != MODE_COPY) &&
                     (s->all_fragments[fragment + 1].coding_method == MODE_COPY)) {
                     horizontal_filter(
-                        plane_data + s->all_fragments[fragment + 1].first_pixel - 7*stride,
+                        plane_data + s->all_fragments[fragment + 1].first_pixel,
                         stride, bounding_values);
                 }
 
@@ -2095,7 +1822,7 @@ START_TIMER
                     (s->all_fragments[fragment].coding_method != MODE_COPY) &&
                     (s->all_fragments[fragment + width].coding_method == MODE_COPY)) {
                     vertical_filter(
-                        plane_data + s->all_fragments[fragment + width].first_pixel + stride,
+                        plane_data + s->all_fragments[fragment + width].first_pixel,
                         stride, bounding_values);
                 }
 
@@ -2131,7 +1858,7 @@ static void vp3_calculate_pixel_addresses(Vp3DecodeContext *s)
     }
 
     /* U plane */
-    i = s->u_fragment_start;
+    i = s->fragment_start[1];
     for (y = s->fragment_height / 2; y > 0; y--) {
         for (x = 0; x < s->fragment_width / 2; x++) {
             s->all_fragments[i++].first_pixel =
@@ -2144,7 +1871,7 @@ static void vp3_calculate_pixel_addresses(Vp3DecodeContext *s)
     }
 
     /* V plane */
-    i = s->v_fragment_start;
+    i = s->fragment_start[2];
     for (y = s->fragment_height / 2; y > 0; y--) {
         for (x = 0; x < s->fragment_width / 2; x++) {
             s->all_fragments[i++].first_pixel =
@@ -2178,7 +1905,7 @@ static void theora_calculate_pixel_addresses(Vp3DecodeContext *s)
     }
 
     /* U plane */
-    i = s->u_fragment_start;
+    i = s->fragment_start[1];
     for (y = 1; y <= s->fragment_height / 2; y++) {
         for (x = 0; x < s->fragment_width / 2; x++) {
             s->all_fragments[i++].first_pixel =
@@ -2191,7 +1918,7 @@ static void theora_calculate_pixel_addresses(Vp3DecodeContext *s)
     }
 
     /* V plane */
-    i = s->v_fragment_start;
+    i = s->fragment_start[2];
     for (y = 1; y <= s->fragment_height / 2; y++) {
         for (x = 0; x < s->fragment_width / 2; x++) {
             s->all_fragments[i++].first_pixel =
@@ -2210,7 +1937,7 @@ static void theora_calculate_pixel_addresses(Vp3DecodeContext *s)
 static int vp3_decode_init(AVCodecContext *avctx)
 {
     Vp3DecodeContext *s = avctx->priv_data;
-    int i;
+    int i, inter, plane;
     int c_width;
     int c_height;
     int y_superblock_count;
@@ -2261,8 +1988,8 @@ static int vp3_decode_init(AVCodecContext *avctx)
 
     /* fragment count covers all 8x8 blocks for all 3 planes */
     s->fragment_count = s->fragment_width * s->fragment_height * 3 / 2;
-    s->u_fragment_start = s->fragment_width * s->fragment_height;
-    s->v_fragment_start = s->fragment_width * s->fragment_height * 5 / 4;
+    s->fragment_start[1] = s->fragment_width * s->fragment_height;
+    s->fragment_start[2] = s->fragment_width * s->fragment_height * 5 / 4;
 
     debug_init("  Y plane: %d x %d\n", s->width, s->height);
     debug_init("  C plane: %d x %d\n", c_width, c_height);
@@ -2278,8 +2005,8 @@ static int vp3_decode_init(AVCodecContext *avctx)
         s->fragment_count,
         s->fragment_width,
         s->fragment_height,
-        s->u_fragment_start,
-        s->v_fragment_start);
+        s->fragment_start[1],
+        s->fragment_start[2]);
 
     s->all_fragments = av_malloc(s->fragment_count * sizeof(Vp3Fragment));
     s->coeffs = av_malloc(s->fragment_count * sizeof(Coeff) * 65);
@@ -2293,14 +2020,23 @@ static int vp3_decode_init(AVCodecContext *avctx)
         for (i = 0; i < 64; i++)
             s->coded_ac_scale_factor[i] = vp31_ac_scale_factor[i];
         for (i = 0; i < 64; i++)
-            s->coded_intra_y_dequant[i] = vp31_intra_y_dequant[i];
+            s->base_matrix[0][i] = vp31_intra_y_dequant[i];
         for (i = 0; i < 64; i++)
-            s->coded_intra_c_dequant[i] = vp31_intra_c_dequant[i];
+            s->base_matrix[1][i] = vp31_intra_c_dequant[i];
         for (i = 0; i < 64; i++)
-            s->coded_inter_dequant[i] = vp31_inter_dequant[i];
+            s->base_matrix[2][i] = vp31_inter_dequant[i];
         for (i = 0; i < 64; i++)
             s->filter_limit_values[i] = vp31_filter_limit_values[i];
 
+        for(inter=0; inter<2; inter++){
+            for(plane=0; plane<3; plane++){
+                s->qr_count[inter][plane]= 1;
+                s->qr_size [inter][plane][0]= 63;
+                s->qr_base [inter][plane][0]=
+                s->qr_base [inter][plane][1]= 2*inter + (!!plane)*!inter;
+            }
+        }
+
         /* init VLC tables */
         for (i = 0; i < 16; i++) {
 
@@ -2418,10 +2154,10 @@ static int vp3_decode_frame(AVCodecContext *avctx,
         switch(ptype)
         {
             case 1:
-                theora_decode_comments(avctx, gb);
+                theora_decode_comments(avctx, &gb);
                 break;
             case 2:
-                theora_decode_tables(avctx, gb);
+                theora_decode_tables(avctx, &gb);
                     init_dequantizer(s);
                 break;
             default:
@@ -2435,9 +2171,13 @@ static int vp3_decode_frame(AVCodecContext *avctx,
     if (!s->theora)
         skip_bits(&gb, 1);
     s->last_quality_index = s->quality_index;
-    s->quality_index = get_bits(&gb, 6);
-    if (s->theora >= 0x030200)
-        skip_bits1(&gb);
+
+    s->nqis=0;
+    do{
+        s->qis[s->nqis++]= get_bits(&gb, 6);
+    } while(s->theora >= 0x030200 && s->nqis<3 && get_bits1(&gb));
+
+    s->quality_index= s->qis[0];
 
     if (s->avctx->debug & FF_DEBUG_PICT_INFO)
         av_log(s->avctx, AV_LOG_INFO, " VP3 %sframe #%d: Q index = %d\n",
@@ -2486,7 +2226,7 @@ static int vp3_decode_frame(AVCodecContext *avctx,
         }
 
         /* golden frame is also the current frame */
-        memcpy(&s->current_frame, &s->golden_frame, sizeof(AVFrame));
+        s->current_frame= s->golden_frame;
 
         /* time to figure out pixel addresses? */
         if (!s->pixel_addresses_inited)
@@ -2495,10 +2235,15 @@ static int vp3_decode_frame(AVCodecContext *avctx,
                 vp3_calculate_pixel_addresses(s);
             else
                 theora_calculate_pixel_addresses(s);
+            s->pixel_addresses_inited = 1;
         }
     } else {
         /* allocate a new current frame */
         s->current_frame.reference = 3;
+        if (!s->pixel_addresses_inited) {
+            av_log(s->avctx, AV_LOG_ERROR, "vp3: first frame not a keyframe\n");
+            return -1;
+        }
         if(avctx->get_buffer(avctx, &s->current_frame) < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "vp3: get_buffer() failed\n");
             return -1;
@@ -2553,9 +2298,9 @@ if (!s->keyframe) {
 
     reverse_dc_prediction(s, 0, s->fragment_width, s->fragment_height);
     if ((avctx->flags & CODEC_FLAG_GRAY) == 0) {
-        reverse_dc_prediction(s, s->u_fragment_start,
+        reverse_dc_prediction(s, s->fragment_start[1],
             s->fragment_width / 2, s->fragment_height / 2);
-        reverse_dc_prediction(s, s->v_fragment_start,
+        reverse_dc_prediction(s, s->fragment_start[2],
             s->fragment_width / 2, s->fragment_height / 2);
     }
     STOP_TIMER("reverse_dc_prediction")}
@@ -2582,7 +2327,7 @@ if (!s->keyframe) {
         avctx->release_buffer(avctx, &s->last_frame);
 
     /* shuffle frames (last = current) */
-    memcpy(&s->last_frame, &s->current_frame, sizeof(AVFrame));
+    s->last_frame= s->current_frame;
     s->current_frame.data[0]= NULL; /* ensure that we catch any access to this released frame */
 
     return buf_size;
@@ -2646,19 +2391,12 @@ static int read_huffman_tree(AVCodecContext *avctx, GetBitContext *gb)
     return 0;
 }
 
-static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
+static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb)
 {
     Vp3DecodeContext *s = avctx->priv_data;
-    int major, minor, micro;
 
-    major = get_bits(&gb, 8); /* version major */
-    minor = get_bits(&gb, 8); /* version minor */
-    micro = get_bits(&gb, 8); /* version micro */
-    av_log(avctx, AV_LOG_INFO, "Theora bitstream version %d.%d.%d\n",
-        major, minor, micro);
-
-    /* FIXME: endianess? */
-    s->theora = (major << 16) | (minor << 8) | micro;
+    s->theora = get_bits_long(gb, 24);
+    av_log(avctx, AV_LOG_INFO, "Theora bitstream version %X\n", s->theora);
 
     /* 3.2.0 aka alpha3 has the same frame orientation as original vp3 */
     /* but previous versions have the image flipped relative to vp3 */
@@ -2668,8 +2406,8 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
         av_log(avctx, AV_LOG_DEBUG, "Old (<alpha3) Theora bitstream, flipped image\n");
     }
 
-    s->width = get_bits(&gb, 16) << 4;
-    s->height = get_bits(&gb, 16) << 4;
+    s->width = get_bits(gb, 16) << 4;
+    s->height = get_bits(gb, 16) << 4;
 
     if(avcodec_check_dimensions(avctx, s->width, s->height)){
         av_log(avctx, AV_LOG_ERROR, "Invalid dimensions (%dx%d)\n", s->width, s->height);
@@ -2679,47 +2417,49 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
 
     if (s->theora >= 0x030400)
     {
-        skip_bits(&gb, 32); /* total number of superblocks in a frame */
+        skip_bits(gb, 32); /* total number of superblocks in a frame */
         // fixme, the next field is 36bits long
-        skip_bits(&gb, 32); /* total number of blocks in a frame */
-        skip_bits(&gb, 4); /* total number of blocks in a frame */
-        skip_bits(&gb, 32); /* total number of macroblocks in a frame */
+        skip_bits(gb, 32); /* total number of blocks in a frame */
+        skip_bits(gb, 4); /* total number of blocks in a frame */
+        skip_bits(gb, 32); /* total number of macroblocks in a frame */
 
-        skip_bits(&gb, 24); /* frame width */
-        skip_bits(&gb, 24); /* frame height */
+        skip_bits(gb, 24); /* frame width */
+        skip_bits(gb, 24); /* frame height */
     }
     else
     {
-        skip_bits(&gb, 24); /* frame width */
-        skip_bits(&gb, 24); /* frame height */
+        skip_bits(gb, 24); /* frame width */
+        skip_bits(gb, 24); /* frame height */
     }
 
-    skip_bits(&gb, 8); /* offset x */
-    skip_bits(&gb, 8); /* offset y */
+  if (s->theora >= 0x030200) {
+    skip_bits(gb, 8); /* offset x */
+    skip_bits(gb, 8); /* offset y */
+  }
 
-    skip_bits(&gb, 32); /* fps numerator */
-    skip_bits(&gb, 32); /* fps denumerator */
-    skip_bits(&gb, 24); /* aspect numerator */
-    skip_bits(&gb, 24); /* aspect denumerator */
+    skip_bits(gb, 32); /* fps numerator */
+    skip_bits(gb, 32); /* fps denumerator */
+    skip_bits(gb, 24); /* aspect numerator */
+    skip_bits(gb, 24); /* aspect denumerator */
 
     if (s->theora < 0x030200)
-        skip_bits(&gb, 5); /* keyframe frequency force */
-    skip_bits(&gb, 8); /* colorspace */
+        skip_bits(gb, 5); /* keyframe frequency force */
+    skip_bits(gb, 8); /* colorspace */
     if (s->theora >= 0x030400)
-        skip_bits(&gb, 2); /* pixel format: 420,res,422,444 */
-    skip_bits(&gb, 24); /* bitrate */
+        skip_bits(gb, 2); /* pixel format: 420,res,422,444 */
+    skip_bits(gb, 24); /* bitrate */
 
-    skip_bits(&gb, 6); /* quality hint */
+    skip_bits(gb, 6); /* quality hint */
 
     if (s->theora >= 0x030200)
     {
-        skip_bits(&gb, 5); /* keyframe frequency force */
+        skip_bits(gb, 5); /* keyframe frequency force */
 
         if (s->theora < 0x030400)
-            skip_bits(&gb, 5); /* spare bits */
+            skip_bits(gb, 5); /* spare bits */
     }
 
-//    align_get_bits(&gb);
+//    align_get_bits(gb);
 
     avctx->width = s->width;
     avctx->height = s->height;
@@ -2727,132 +2467,89 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext gb)
     return 0;
 }
 
-static inline int theora_get_32bit(GetBitContext gb)
-{
-    int ret = get_bits(&gb, 8);
-    ret += get_bits(&gb, 8) << 8;
-    ret += get_bits(&gb, 8) << 16;
-    ret += get_bits(&gb, 8) << 24;
-
-    return ret;
-}
-
-static int theora_decode_comments(AVCodecContext *avctx, GetBitContext gb)
-{
-    Vp3DecodeContext *s = avctx->priv_data;
-    int len;
-
-    if (s->theora <= 0x030200)
-    {
-        int i, comments;
-
-        // vendor string
-        len = get_bits_long(&gb, 32);
-        len = le2me_32(len);
-        while(len--)
-            skip_bits(&gb, 8);
-
-        // user comments
-        comments = get_bits_long(&gb, 32);
-        comments = le2me_32(comments);
-        for (i = 0; i < comments; i++)
-        {
-            len = get_bits_long(&gb, 32);
-            len = be2me_32(len);
-            while(len--)
-                skip_bits(&gb, 8);
-        }
-    }
-    else
-    {
-        do {
-            len = get_bits_long(&gb, 32);
-            len = le2me_32(len);
-            if (len <= 0)
-                break;
-            while (len--)
-                skip_bits(&gb, 8);
-        } while (1);
-    }
-    return 0;
-}
-
-static int theora_decode_tables(AVCodecContext *avctx, GetBitContext gb)
+static int theora_decode_tables(AVCodecContext *avctx, GetBitContext *gb)
 {
     Vp3DecodeContext *s = avctx->priv_data;
-    int i, n, matrices;
+    int i, n, matrices, inter, plane;
 
     if (s->theora >= 0x030200) {
-        n = get_bits(&gb, 3);
+        n = get_bits(gb, 3);
         /* loop filter limit values table */
         for (i = 0; i < 64; i++)
-            s->filter_limit_values[i] = get_bits(&gb, n);
+            s->filter_limit_values[i] = get_bits(gb, n);
     }
 
     if (s->theora >= 0x030200)
-        n = get_bits(&gb, 4) + 1;
+        n = get_bits(gb, 4) + 1;
     else
         n = 16;
     /* quality threshold table */
     for (i = 0; i < 64; i++)
-        s->coded_ac_scale_factor[i] = get_bits(&gb, n);
+        s->coded_ac_scale_factor[i] = get_bits(gb, n);
 
     if (s->theora >= 0x030200)
-        n = get_bits(&gb, 4) + 1;
+        n = get_bits(gb, 4) + 1;
     else
         n = 16;
     /* dc scale factor table */
     for (i = 0; i < 64; i++)
-        s->coded_dc_scale_factor[i] = get_bits(&gb, n);
+        s->coded_dc_scale_factor[i] = get_bits(gb, n);
 
     if (s->theora >= 0x030200)
-        matrices = get_bits(&gb, 9) + 1;
+        matrices = get_bits(gb, 9) + 1;
     else
         matrices = 3;
-    if (matrices != 3) {
-        av_log(avctx,AV_LOG_ERROR, "unsupported matrices: %d\n", matrices);
-//        return -1;
-    }
-    /* y coeffs */
-    for (i = 0; i < 64; i++)
-        s->coded_intra_y_dequant[i] = get_bits(&gb, 8);
-
-    /* uv coeffs */
-    for (i = 0; i < 64; i++)
-        s->coded_intra_c_dequant[i] = get_bits(&gb, 8);
 
-    /* inter coeffs */
-    for (i = 0; i < 64; i++)
-        s->coded_inter_dequant[i] = get_bits(&gb, 8);
+    if(matrices > 384){
+        av_log(avctx, AV_LOG_ERROR, "invalid number of base matrixes\n");
+        return -1;
+    }
 
-    /* skip unknown matrices */
-    n = matrices - 3;
-    while(n--)
+    for(n=0; n<matrices; n++){
         for (i = 0; i < 64; i++)
-            skip_bits(&gb, 8);
+            s->base_matrix[n][i]= get_bits(gb, 8);
+    }
 
-    for (i = 0; i <= 1; i++) {
-        for (n = 0; n <= 2; n++) {
-            int newqr;
-            if (i > 0 || n > 0)
-                newqr = get_bits(&gb, 1);
-            else
-                newqr = 1;
+    for (inter = 0; inter <= 1; inter++) {
+        for (plane = 0; plane <= 2; plane++) {
+            int newqr= 1;
+            if (inter || plane > 0)
+                newqr = get_bits(gb, 1);
             if (!newqr) {
-                if (i > 0)
-                    get_bits(&gb, 1);
-            }
-            else {
+                int qtj, plj;
+                if(inter && get_bits(gb, 1)){
+                    qtj = 0;
+                    plj = plane;
+                }else{
+                    qtj= (3*inter + plane - 1) / 3;
+                    plj= (plane + 2) % 3;
+                }
+                s->qr_count[inter][plane]= s->qr_count[qtj][plj];
+                memcpy(s->qr_size[inter][plane], s->qr_size[qtj][plj], sizeof(s->qr_size[0][0]));
+                memcpy(s->qr_base[inter][plane], s->qr_base[qtj][plj], sizeof(s->qr_base[0][0]));
+            } else {
+                int qri= 0;
                 int qi = 0;
-                skip_bits(&gb, av_log2(matrices-1)+1);
-                while (qi < 63) {
-                    qi += get_bits(&gb, av_log2(63-qi)+1) + 1;
-                    skip_bits(&gb, av_log2(matrices-1)+1);
+
+                for(;;){
+                    i= get_bits(gb, av_log2(matrices-1)+1);
+                    if(i>= matrices){
+                        av_log(avctx, AV_LOG_ERROR, "invalid base matrix index\n");
+                        return -1;
+                    }
+                    s->qr_base[inter][plane][qri]= i;
+                    if(qi >= 63)
+                        break;
+                    i = get_bits(gb, av_log2(63-qi)+1) + 1;
+                    s->qr_size[inter][plane][qri++]= i;
+                    qi += i;
                 }
+
                 if (qi > 63) {
                     av_log(avctx, AV_LOG_ERROR, "invalid qi %d > 63\n", qi);
                     return -1;
                 }
+                s->qr_count[inter][plane]= qri;
             }
         }
     }
@@ -2861,11 +2558,11 @@ static int theora_decode_tables(AVCodecContext *avctx, GetBitContext gb)
     for (s->hti = 0; s->hti < 80; s->hti++) {
         s->entries = 0;
         s->huff_code_size = 1;
-        if (!get_bits(&gb, 1)) {
+        if (!get_bits(gb, 1)) {
             s->hbits = 0;
-            read_huffman_tree(avctx, &gb);
+            read_huffman_tree(avctx, gb);
             s->hbits = 1;
-            read_huffman_tree(avctx, &gb);
+            read_huffman_tree(avctx, gb);
         }
     }
 
@@ -2903,7 +2600,7 @@ static int theora_decode_init(AVCodecContext *avctx)
      if (!(ptype & 0x80))
      {
         av_log(avctx, AV_LOG_ERROR, "Invalid extradata!\n");
-        return -1;
+//        return -1;
      }
 
     // FIXME: check for this aswell
@@ -2912,19 +2609,23 @@ static int theora_decode_init(AVCodecContext *avctx)
     switch(ptype)
     {
         case 0x80:
-            theora_decode_header(avctx, gb);
+            theora_decode_header(avctx, &gb);
                 break;
         case 0x81:
 // FIXME: is this needed? it breaks sometimes
 //            theora_decode_comments(avctx, gb);
             break;
         case 0x82:
-            theora_decode_tables(avctx, gb);
+            theora_decode_tables(avctx, &gb);
             break;
         default:
             av_log(avctx, AV_LOG_ERROR, "Unknown Theora config packet: %d\n", ptype&~0x80);
             break;
     }
+    if(8*op_bytes != get_bits_count(&gb))
+        av_log(avctx, AV_LOG_ERROR, "%d bits left in packet %X\n", 8*op_bytes - get_bits_count(&gb), ptype);
+    if (s->theora < 0x030200)
+        break;
   }
 
     vp3_decode_init(avctx);
diff --git a/src/libffmpeg/libavcodec/vp3dsp.c b/src/libffmpeg/libavcodec/vp3dsp.c
index 0cbe8d551..f5a1fb6ff 100644
--- a/src/libffmpeg/libavcodec/vp3dsp.c
+++ b/src/libffmpeg/libavcodec/vp3dsp.c
@@ -35,14 +35,15 @@
 #define xC6S2 25080
 #define xC7S1 12785
 
+#define M(a,b) (((a) * (b))>>16)
+
 static always_inline void idct(uint8_t *dst, int stride, int16_t *input, int type)
 {
     int16_t *ip = input;
     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 
-    int A_, B_, C_, D_, _Ad, _Bd, _Cd, _Dd, E_, F_, G_, H_;
-    int _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
-    int t1, t2;
+    int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
+    int Ed, Gd, Add, Bdd, Fd, Hd;
 
     int i;
 
@@ -50,86 +51,44 @@ static always_inline void idct(uint8_t *dst, int stride, int16_t *input, int typ
     for (i = 0; i < 8; i++) {
         /* Check for non-zero values */
         if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
-            t1 = (int32_t)(xC1S7 * ip[1]);
-            t2 = (int32_t)(xC7S1 * ip[7]);
-            t1 >>= 16;
-            t2 >>= 16;
-            A_ = t1 + t2;
-
-            t1 = (int32_t)(xC7S1 * ip[1]);
-            t2 = (int32_t)(xC1S7 * ip[7]);
-            t1 >>= 16;
-            t2 >>= 16;
-            B_ = t1 - t2;
-
-            t1 = (int32_t)(xC3S5 * ip[3]);
-            t2 = (int32_t)(xC5S3 * ip[5]);
-            t1 >>= 16;
-            t2 >>= 16;
-            C_ = t1 + t2;
-
-            t1 = (int32_t)(xC3S5 * ip[5]);
-            t2 = (int32_t)(xC5S3 * ip[3]);
-            t1 >>= 16;
-            t2 >>= 16;
-            D_ = t1 - t2;
-
-
-            t1 = (int32_t)(xC4S4 * (A_ - C_));
-            t1 >>= 16;
-            _Ad = t1;
+            A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]);
+            B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]);
+            C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]);
+            D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]);
 
-            t1 = (int32_t)(xC4S4 * (B_ - D_));
-            t1 >>= 16;
-            _Bd = t1;
+            Ad = M(xC4S4, (A - C));
+            Bd = M(xC4S4, (B - D));
 
+            Cd = A + C;
+            Dd = B + D;
 
-            _Cd = A_ + C_;
-            _Dd = B_ + D_;
+            E = M(xC4S4, (ip[0] + ip[4]));
+            F = M(xC4S4, (ip[0] - ip[4]));
 
-            t1 = (int32_t)(xC4S4 * (ip[0] + ip[4]));
-            t1 >>= 16;
-            E_ = t1;
+            G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]);
+            H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]);
 
-            t1 = (int32_t)(xC4S4 * (ip[0] - ip[4]));
-            t1 >>= 16;
-            F_ = t1;
+            Ed = E - G;
+            Gd = E + G;
 
-            t1 = (int32_t)(xC2S6 * ip[2]);
-            t2 = (int32_t)(xC6S2 * ip[6]);
-            t1 >>= 16;
-            t2 >>= 16;
-            G_ = t1 + t2;
+            Add = F + Ad;
+            Bdd = Bd - H;
 
-            t1 = (int32_t)(xC6S2 * ip[2]);
-            t2 = (int32_t)(xC2S6 * ip[6]);
-            t1 >>= 16;
-            t2 >>= 16;
-            H_ = t1 - t2;
-
-
-            _Ed = E_ - G_;
-            _Gd = E_ + G_;
-
-            _Add = F_ + _Ad;
-            _Bdd = _Bd - H_;
-
-            _Fd = F_ - _Ad;
-            _Hd = _Bd + H_;
+            Fd = F - Ad;
+            Hd = Bd + H;
 
             /*  Final sequence of operations over-write original inputs. */
-            ip[0] = _Gd + _Cd ;
-            ip[7] = _Gd - _Cd ;
-
-            ip[1] = _Add + _Hd;
-            ip[2] = _Add - _Hd;
+            ip[0] = Gd + Cd ;
+            ip[7] = Gd - Cd ;
 
-            ip[3] = _Ed + _Dd ;
-            ip[4] = _Ed - _Dd ;
+            ip[1] = Add + Hd;
+            ip[2] = Add - Hd;
 
-            ip[5] = _Fd + _Bdd;
-            ip[6] = _Fd - _Bdd;
+            ip[3] = Ed + Dd ;
+            ip[4] = Ed - Dd ;
 
+            ip[5] = Fd + Bdd;
+            ip[6] = Fd - Bdd;
         }
 
         ip += 8;            /* next row */
@@ -142,121 +101,74 @@ static always_inline void idct(uint8_t *dst, int stride, int16_t *input, int typ
         if ( ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
              ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
 
-            t1 = (int32_t)(xC1S7 * ip[1*8]);
-            t2 = (int32_t)(xC7S1 * ip[7*8]);
-            t1 >>= 16;
-            t2 >>= 16;
-            A_ = t1 + t2;
-
-            t1 = (int32_t)(xC7S1 * ip[1*8]);
-            t2 = (int32_t)(xC1S7 * ip[7*8]);
-            t1 >>= 16;
-            t2 >>= 16;
-            B_ = t1 - t2;
+            A = M(xC1S7, ip[1*8]) + M(xC7S1, ip[7*8]);
+            B = M(xC7S1, ip[1*8]) - M(xC1S7, ip[7*8]);
+            C = M(xC3S5, ip[3*8]) + M(xC5S3, ip[5*8]);
+            D = M(xC3S5, ip[5*8]) - M(xC5S3, ip[3*8]);
 
-            t1 = (int32_t)(xC3S5 * ip[3*8]);
-            t2 = (int32_t)(xC5S3 * ip[5*8]);
-            t1 >>= 16;
-            t2 >>= 16;
-            C_ = t1 + t2;
+            Ad = M(xC4S4, (A - C));
+            Bd = M(xC4S4, (B - D));
 
-            t1 = (int32_t)(xC3S5 * ip[5*8]);
-            t2 = (int32_t)(xC5S3 * ip[3*8]);
-            t1 >>= 16;
-            t2 >>= 16;
-            D_ = t1 - t2;
+            Cd = A + C;
+            Dd = B + D;
 
+            E = M(xC4S4, (ip[0*8] + ip[4*8])) + 8;
+            F = M(xC4S4, (ip[0*8] - ip[4*8])) + 8;
 
-            t1 = (int32_t)(xC4S4 * (A_ - C_));
-            t1 >>= 16;
-            _Ad = t1;
-
-            t1 = (int32_t)(xC4S4 * (B_ - D_));
-            t1 >>= 16;
-            _Bd = t1;
-
-
-            _Cd = A_ + C_;
-            _Dd = B_ + D_;
-
-            t1 = (int32_t)(xC4S4 * (ip[0*8] + ip[4*8]));
-            t1 >>= 16;
-            E_ = t1;
-
-            t1 = (int32_t)(xC4S4 * (ip[0*8] - ip[4*8]));
-            t1 >>= 16;
-            F_ = t1;
-
-            t1 = (int32_t)(xC2S6 * ip[2*8]);
-            t2 = (int32_t)(xC6S2 * ip[6*8]);
-            t1 >>= 16;
-            t2 >>= 16;
-            G_ = t1 + t2;
-
-            t1 = (int32_t)(xC6S2 * ip[2*8]);
-            t2 = (int32_t)(xC2S6 * ip[6*8]);
-            t1 >>= 16;
-            t2 >>= 16;
-            H_ = t1 - t2;
-
+            if(type==1){  //HACK
+                E += 16*128;
+                F += 16*128;
+            }
 
-            _Ed = E_ - G_;
-            _Gd = E_ + G_;
+            G = M(xC2S6, ip[2*8]) + M(xC6S2, ip[6*8]);
+            H = M(xC6S2, ip[2*8]) - M(xC2S6, ip[6*8]);
 
-            _Add = F_ + _Ad;
-            _Bdd = _Bd - H_;
+            Ed = E - G;
+            Gd = E + G;
 
-            _Fd = F_ - _Ad;
-            _Hd = _Bd + H_;
+            Add = F + Ad;
+            Bdd = Bd - H;
 
-            if(type==1){  //HACK
-                _Gd += 16*128;
-                _Add+= 16*128;
-                _Ed += 16*128;
-                _Fd += 16*128;
-            }
-            _Gd += IdctAdjustBeforeShift;
-            _Add += IdctAdjustBeforeShift;
-            _Ed += IdctAdjustBeforeShift;
-            _Fd += IdctAdjustBeforeShift;
+            Fd = F - Ad;
+            Hd = Bd + H;
 
             /* Final sequence of operations over-write original inputs. */
             if(type==0){
-                ip[0*8] = (_Gd + _Cd )  >> 4;
-                ip[7*8] = (_Gd - _Cd )  >> 4;
+                ip[0*8] = (Gd + Cd )  >> 4;
+                ip[7*8] = (Gd - Cd )  >> 4;
 
-                ip[1*8] = (_Add + _Hd ) >> 4;
-                ip[2*8] = (_Add - _Hd ) >> 4;
+                ip[1*8] = (Add + Hd ) >> 4;
+                ip[2*8] = (Add - Hd ) >> 4;
 
-                ip[3*8] = (_Ed + _Dd )  >> 4;
-                ip[4*8] = (_Ed - _Dd )  >> 4;
+                ip[3*8] = (Ed + Dd )  >> 4;
+                ip[4*8] = (Ed - Dd )  >> 4;
 
-                ip[5*8] = (_Fd + _Bdd ) >> 4;
-                ip[6*8] = (_Fd - _Bdd ) >> 4;
+                ip[5*8] = (Fd + Bdd ) >> 4;
+                ip[6*8] = (Fd - Bdd ) >> 4;
             }else if(type==1){
-                dst[0*stride] = cm[(_Gd + _Cd )  >> 4];
-                dst[7*stride] = cm[(_Gd - _Cd )  >> 4];
+                dst[0*stride] = cm[(Gd + Cd )  >> 4];
+                dst[7*stride] = cm[(Gd - Cd )  >> 4];
 
-                dst[1*stride] = cm[(_Add + _Hd ) >> 4];
-                dst[2*stride] = cm[(_Add - _Hd ) >> 4];
+                dst[1*stride] = cm[(Add + Hd ) >> 4];
+                dst[2*stride] = cm[(Add - Hd ) >> 4];
 
-                dst[3*stride] = cm[(_Ed + _Dd )  >> 4];
-                dst[4*stride] = cm[(_Ed - _Dd )  >> 4];
+                dst[3*stride] = cm[(Ed + Dd )  >> 4];
+                dst[4*stride] = cm[(Ed - Dd )  >> 4];
 
-                dst[5*stride] = cm[(_Fd + _Bdd ) >> 4];
-                dst[6*stride] = cm[(_Fd - _Bdd ) >> 4];
+                dst[5*stride] = cm[(Fd + Bdd ) >> 4];
+                dst[6*stride] = cm[(Fd - Bdd ) >> 4];
             }else{
-                dst[0*stride] = cm[dst[0*stride] + ((_Gd + _Cd )  >> 4)];
-                dst[7*stride] = cm[dst[7*stride] + ((_Gd - _Cd )  >> 4)];
+                dst[0*stride] = cm[dst[0*stride] + ((Gd + Cd )  >> 4)];
+                dst[7*stride] = cm[dst[7*stride] + ((Gd - Cd )  >> 4)];
 
-                dst[1*stride] = cm[dst[1*stride] + ((_Add + _Hd ) >> 4)];
-                dst[2*stride] = cm[dst[2*stride] + ((_Add - _Hd ) >> 4)];
+                dst[1*stride] = cm[dst[1*stride] + ((Add + Hd ) >> 4)];
+                dst[2*stride] = cm[dst[2*stride] + ((Add - Hd ) >> 4)];
 
-                dst[3*stride] = cm[dst[3*stride] + ((_Ed + _Dd )  >> 4)];
-                dst[4*stride] = cm[dst[4*stride] + ((_Ed - _Dd )  >> 4)];
+                dst[3*stride] = cm[dst[3*stride] + ((Ed + Dd )  >> 4)];
+                dst[4*stride] = cm[dst[4*stride] + ((Ed - Dd )  >> 4)];
 
-                dst[5*stride] = cm[dst[5*stride] + ((_Fd + _Bdd ) >> 4)];
-                dst[6*stride] = cm[dst[6*stride] + ((_Fd - _Bdd ) >> 4)];
+                dst[5*stride] = cm[dst[5*stride] + ((Fd + Bdd ) >> 4)];
+                dst[6*stride] = cm[dst[6*stride] + ((Fd - Bdd ) >> 4)];
             }
 
         } else {
diff --git a/src/libffmpeg/libavcodec/wmadec.c b/src/libffmpeg/libavcodec/wmadec.c
index c557a2a7a..227c9695b 100644
--- a/src/libffmpeg/libavcodec/wmadec.c
+++ b/src/libffmpeg/libavcodec/wmadec.c
@@ -57,6 +57,13 @@
 #define LSP_POW_BITS 7
 
 #define VLCBITS 9
+#define VLCMAX ((22+VLCBITS-1)/VLCBITS)
+
+#define EXPVLCBITS 8
+#define EXPMAX ((19+EXPVLCBITS-1)/EXPVLCBITS)
+
+#define HGAINVLCBITS 9
+#define HGAINMAX ((13+HGAINVLCBITS-1)/HGAINVLCBITS)
 
 typedef struct WMADecodeContext {
     GetBitContext gb;
@@ -185,7 +192,7 @@ static void init_coef_vlc(VLC *vlc,
     const uint16_t *p;
     int i, l, j, level;
 
-    init_vlc(vlc, 9, n, table_bits, 1, 1, table_codes, 4, 4, 0);
+    init_vlc(vlc, VLCBITS, n, table_bits, 1, 1, table_codes, 4, 4, 0);
 
     run_table = av_malloc(n * sizeof(uint16_t));
     level_table = av_malloc(n * sizeof(uint16_t));
@@ -494,13 +501,13 @@ static int wma_decode_init(AVCodecContext * avctx)
             }
         }
 #endif
-        init_vlc(&s->hgain_vlc, 9, sizeof(hgain_huffbits),
+        init_vlc(&s->hgain_vlc, HGAINVLCBITS, sizeof(hgain_huffbits),
                  hgain_huffbits, 1, 1,
                  hgain_huffcodes, 2, 2, 0);
     }
 
     if (s->use_exp_vlc) {
-        init_vlc(&s->exp_vlc, 9, sizeof(scale_huffbits),
+        init_vlc(&s->exp_vlc, EXPVLCBITS, sizeof(scale_huffbits),
                  scale_huffbits, 1, 1,
                  scale_huffcodes, 4, 4, 0);
     } else {
@@ -681,7 +688,7 @@ static int decode_exp_vlc(WMADecodeContext *s, int ch)
     }
     last_exp = 36;
     while (q < q_end) {
-        code = get_vlc2(&s->gb, s->exp_vlc.table, VLCBITS, 2);
+        code = get_vlc2(&s->gb, s->exp_vlc.table, EXPVLCBITS, EXPMAX);
         if (code < 0)
             return -1;
         /* NOTE: this offset is the same as MPEG4 AAC ! */
@@ -822,7 +829,7 @@ static int wma_decode_block(WMADecodeContext *s)
                         if (val == (int)0x80000000) {
                             val = get_bits(&s->gb, 7) - 19;
                         } else {
-                            code = get_vlc2(&s->gb, s->hgain_vlc.table, VLCBITS, 2);
+                            code = get_vlc2(&s->gb, s->hgain_vlc.table, HGAINVLCBITS, HGAINMAX);
                             if (code < 0)
                                 return -1;
                             val += code - 18;
@@ -879,7 +886,7 @@ static int wma_decode_block(WMADecodeContext *s)
             eptr = ptr + nb_coefs[ch];
             memset(ptr, 0, s->block_len * sizeof(int16_t));
             for(;;) {
-                code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, 3);
+                code = get_vlc2(&s->gb, coef_vlc->table, VLCBITS, VLCMAX);
                 if (code < 0)
                     return -1;
                 if (code == 1) {
@@ -901,7 +908,10 @@ static int wma_decode_block(WMADecodeContext *s)
                     level = -level;
                 ptr += run;
                 if (ptr >= eptr)
-                    return -1;
+                {
+                    av_log(NULL, AV_LOG_ERROR, "overflow in spectral RLE, ignoring\n");
+                    break;
+                }
                 *ptr++ = level;
                 /* NOTE: EOB can be omitted */
                 if (ptr >= eptr)
@@ -1229,7 +1239,7 @@ static int wma_decode_superframe(AVCodecContext *avctx,
                 goto fail;
             q = s->last_superframe + s->last_superframe_len;
             len = bit_offset;
-            while (len > 0) {
+            while (len > 7) {
                 *q++ = (get_bits)(&s->gb, 8);
                 len -= 8;
             }
diff --git a/src/libffmpeg/libavcodec/wmv2.c b/src/libffmpeg/libavcodec/wmv2.c
index dd88b7d28..3f405af4f 100644
--- a/src/libffmpeg/libavcodec/wmv2.c
+++ b/src/libffmpeg/libavcodec/wmv2.c
@@ -207,7 +207,6 @@ void ff_wmv2_encode_mb(MpegEncContext * s,
 
     if (!s->mb_intra) {
         /* compute cbp */
-        set_stat(ST_INTER_MB);
         cbp = 0;
         for (i = 0; i < 6; i++) {
             if (s->block_last_index[i] >= 0)
@@ -244,7 +243,6 @@ void ff_wmv2_encode_mb(MpegEncContext * s,
 #endif
 
         if (s->pict_type == I_TYPE) {
-            set_stat(ST_INTRA_MB);
             put_bits(&s->pb,
                      ff_msmp4_mb_i_table[coded_cbp][1], ff_msmp4_mb_i_table[coded_cbp][0]);
         } else {
@@ -252,7 +250,6 @@ void ff_wmv2_encode_mb(MpegEncContext * s,
                      wmv2_inter_table[w->cbp_table_index][cbp][1],
                      wmv2_inter_table[w->cbp_table_index][cbp][0]);
         }
-        set_stat(ST_INTRA_MB);
         put_bits(&s->pb, 1, 0);         /* no AC prediction yet */
         if(s->inter_intra_pred){
             s->h263_aic_dir=0;