Diffstat (limited to 'contrib/ffmpeg/libavcodec/h264.c')
-rw-r--r-- | contrib/ffmpeg/libavcodec/h264.c | 3174
1 file changed, 1275 insertions, 1899 deletions
diff --git a/contrib/ffmpeg/libavcodec/h264.c b/contrib/ffmpeg/libavcodec/h264.c index 4d72dc2ff..cd6facb9b 100644 --- a/contrib/ffmpeg/libavcodec/h264.c +++ b/contrib/ffmpeg/libavcodec/h264.c @@ -17,7 +17,6 @@ * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * */ /** @@ -26,367 +25,25 @@ * @author Michael Niedermayer <michaelni@gmx.at> */ -#include "common.h" #include "dsputil.h" #include "avcodec.h" #include "mpegvideo.h" +#include "h264.h" #include "h264data.h" +#include "h264_parser.h" #include "golomb.h" +#include "rectangle.h" #include "cabac.h" //#undef NDEBUG #include <assert.h> -#define interlaced_dct interlaced_dct_is_a_bad_name -#define mb_intra mb_intra_isnt_initalized_see_mb_type - -#define LUMA_DC_BLOCK_INDEX 25 -#define CHROMA_DC_BLOCK_INDEX 26 - -#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8 -#define COEFF_TOKEN_VLC_BITS 8 -#define TOTAL_ZEROS_VLC_BITS 9 -#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3 -#define RUN_VLC_BITS 3 -#define RUN7_VLC_BITS 6 - -#define MAX_SPS_COUNT 32 -#define MAX_PPS_COUNT 256 - -#define MAX_MMCO_COUNT 66 - -/* Compiling in interlaced support reduces the speed - * of progressive decoding by about 2%. */ -#define ALLOW_INTERLACE - -#ifdef ALLOW_INTERLACE -#define MB_MBAFF h->mb_mbaff -#define MB_FIELD h->mb_field_decoding_flag -#define FRAME_MBAFF h->mb_aff_frame -#else -#define MB_MBAFF 0 -#define MB_FIELD 0 -#define FRAME_MBAFF 0 -#undef IS_INTERLACED -#define IS_INTERLACED(mb_type) 0 -#endif - -/** - * Sequence parameter set - */ -typedef struct SPS{ - - int profile_idc; - int level_idc; - int transform_bypass; ///< qpprime_y_zero_transform_bypass_flag - int log2_max_frame_num; ///< log2_max_frame_num_minus4 + 4 - int poc_type; ///< pic_order_cnt_type - int log2_max_poc_lsb; ///< log2_max_pic_order_cnt_lsb_minus4 - int delta_pic_order_always_zero_flag; - int offset_for_non_ref_pic; - int offset_for_top_to_bottom_field; - int poc_cycle_length; ///< num_ref_frames_in_pic_order_cnt_cycle - int ref_frame_count; ///< num_ref_frames - int gaps_in_frame_num_allowed_flag; - int mb_width; ///< frame_width_in_mbs_minus1 + 1 - int mb_height; ///< frame_height_in_mbs_minus1 + 1 - int frame_mbs_only_flag; - int mb_aff; ///<mb_adaptive_frame_field_flag - int direct_8x8_inference_flag; - int crop; ///< frame_cropping_flag - int crop_left; ///< frame_cropping_rect_left_offset - int crop_right; ///< frame_cropping_rect_right_offset - int crop_top; ///< frame_cropping_rect_top_offset - int crop_bottom; ///< frame_cropping_rect_bottom_offset - int vui_parameters_present_flag; - AVRational sar; - int timing_info_present_flag; - uint32_t num_units_in_tick; - uint32_t time_scale; - int fixed_frame_rate_flag; - short offset_for_ref_frame[256]; //FIXME dyn aloc? 
- int bitstream_restriction_flag; - int num_reorder_frames; - int scaling_matrix_present; - uint8_t scaling_matrix4[6][16]; - uint8_t scaling_matrix8[2][64]; -}SPS; - -/** - * Picture parameter set - */ -typedef struct PPS{ - unsigned int sps_id; - int cabac; ///< entropy_coding_mode_flag - int pic_order_present; ///< pic_order_present_flag - int slice_group_count; ///< num_slice_groups_minus1 + 1 - int mb_slice_group_map_type; - unsigned int ref_count[2]; ///< num_ref_idx_l0/1_active_minus1 + 1 - int weighted_pred; ///< weighted_pred_flag - int weighted_bipred_idc; - int init_qp; ///< pic_init_qp_minus26 + 26 - int init_qs; ///< pic_init_qs_minus26 + 26 - int chroma_qp_index_offset; - int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag - int constrained_intra_pred; ///< constrained_intra_pred_flag - int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag - int transform_8x8_mode; ///< transform_8x8_mode_flag - uint8_t scaling_matrix4[6][16]; - uint8_t scaling_matrix8[2][64]; -}PPS; - -/** - * Memory management control operation opcode. - */ -typedef enum MMCOOpcode{ - MMCO_END=0, - MMCO_SHORT2UNUSED, - MMCO_LONG2UNUSED, - MMCO_SHORT2LONG, - MMCO_SET_MAX_LONG, - MMCO_RESET, - MMCO_LONG, -} MMCOOpcode; - -/** - * Memory management control operation. - */ -typedef struct MMCO{ - MMCOOpcode opcode; - int short_frame_num; - int long_index; -} MMCO; - /** - * H264Context + * Value of Picture.reference when Picture is not a reference picture, but + * is held for delayed output. */ -typedef struct H264Context{ - MpegEncContext s; - int nal_ref_idc; - int nal_unit_type; - uint8_t *rbsp_buffer; - unsigned int rbsp_buffer_size; - - /** - * Used to parse AVC variant of h264 - */ - int is_avc; ///< this flag is != 0 if codec is avc1 - int got_avcC; ///< flag used to parse avcC data only once - int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) - - int chroma_qp; //QPc - - int prev_mb_skipped; - int next_mb_skipped; - - //prediction stuff - int chroma_pred_mode; - int intra16x16_pred_mode; - - int top_mb_xy; - int left_mb_xy[2]; - - int8_t intra4x4_pred_mode_cache[5*8]; - int8_t (*intra4x4_pred_mode)[8]; - void (*pred4x4 [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp? - void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); - void (*pred8x8 [4+3])(uint8_t *src, int stride); - void (*pred16x16[4+3])(uint8_t *src, int stride); - unsigned int topleft_samples_available; - unsigned int top_samples_available; - unsigned int topright_samples_available; - unsigned int left_samples_available; - uint8_t (*top_borders[2])[16+2*8]; - uint8_t left_border[2*(17+2*9)]; - - /** - * non zero coeff count cache. - * is 64 if not available. - */ - DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); - uint8_t (*non_zero_count)[16]; - - /** - * Motion vector cache. - */ - DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]); - DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]); -#define LIST_NOT_USED -1 //FIXME rename? -#define PART_NOT_AVAILABLE -2 - - /** - * is 1 if the specific list MV&references are set to 0,0,-2. - */ - int mv_cache_clean[2]; - - /** - * number of neighbors (top and/or left) that used 8x8 dct - */ - int neighbor_transform_size; - - /** - * block_offset[ 0..23] for frame macroblocks - * block_offset[24..47] for field macroblocks - */ - int block_offset[2*(16+8)]; - - uint32_t *mb2b_xy; //FIXME are these 4 a good idea? 
- uint32_t *mb2b8_xy; - int b_stride; //FIXME use s->b4_stride - int b8_stride; - - int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff - int mb_uvlinesize; - - int emu_edge_width; - int emu_edge_height; - - int halfpel_flag; - int thirdpel_flag; - - int unknown_svq3_flag; - int next_slice_index; - - SPS sps_buffer[MAX_SPS_COUNT]; - SPS sps; ///< current sps - - PPS pps_buffer[MAX_PPS_COUNT]; - /** - * current pps - */ - PPS pps; //FIXME move to Picture perhaps? (->no) do we need that? - - uint32_t dequant4_buffer[6][52][16]; - uint32_t dequant8_buffer[2][52][64]; - uint32_t (*dequant4_coeff[6])[16]; - uint32_t (*dequant8_coeff[2])[64]; - int dequant_coeff_pps; ///< reinit tables when pps changes - - int slice_num; - uint8_t *slice_table_base; - uint8_t *slice_table; ///< slice_table_base + 2*mb_stride + 1 - int slice_type; - int slice_type_fixed; - - //interlacing specific flags - int mb_aff_frame; - int mb_field_decoding_flag; - int mb_mbaff; ///< mb_aff_frame && mb_field_decoding_flag - - unsigned int sub_mb_type[4]; - - //POC stuff - int poc_lsb; - int poc_msb; - int delta_poc_bottom; - int delta_poc[2]; - int frame_num; - int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0 - int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0 - int frame_num_offset; ///< for POC type 2 - int prev_frame_num_offset; ///< for POC type 2 - int prev_frame_num; ///< frame_num of the last pic for POC type 1/2 - - /** - * frame_num for frames or 2*frame_num for field pics. - */ - int curr_pic_num; - - /** - * max_frame_num or 2*max_frame_num for field pics. - */ - int max_pic_num; - - //Weighted pred stuff - int use_weight; - int use_weight_chroma; - int luma_log2_weight_denom; - int chroma_log2_weight_denom; - int luma_weight[2][48]; - int luma_offset[2][48]; - int chroma_weight[2][48][2]; - int chroma_offset[2][48][2]; - int implicit_weight[48][48]; - - //deblock - int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0 - int slice_alpha_c0_offset; - int slice_beta_offset; - - int redundant_pic_count; - - int direct_spatial_mv_pred; - int dist_scale_factor[16]; - int dist_scale_factor_field[32]; - int map_col_to_list0[2][16]; - int map_col_to_list0_field[2][32]; - - /** - * num_ref_idx_l0/1_active_minus1 + 1 - */ - unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode - unsigned int list_count; - Picture *short_ref[32]; - Picture *long_ref[32]; - Picture default_ref_list[2][32]; - Picture ref_list[2][48]; ///< 0..15: frame refs, 16..47: mbaff field refs - Picture *delayed_pic[18]; //FIXME size? - Picture *delayed_output_pic; - - /** - * memory management control operations buffer. - */ - MMCO mmco[MAX_MMCO_COUNT]; - int mmco_index; - - int long_ref_count; ///< number of actual long term references - int short_ref_count; ///< number of actual short term references - - //data partitioning - GetBitContext intra_gb; - GetBitContext inter_gb; - GetBitContext *intra_gb_ptr; - GetBitContext *inter_gb_ptr; - - DECLARE_ALIGNED_8(DCTELEM, mb[16*24]); - DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not to large or ensure that there is some unused stuff after mb - - /** - * Cabac - */ - CABACContext cabac; - uint8_t cabac_state[460]; - int cabac_init_idc; - - /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? 
luma_cbp */ - uint16_t *cbp_table; - int cbp; - int top_cbp; - int left_cbp; - /* chroma_pred_mode for i4x4 or i16x16, else 0 */ - uint8_t *chroma_pred_mode_table; - int last_qscale_diff; - int16_t (*mvd_table[2])[2]; - DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]); - uint8_t *direct_table; - uint8_t direct_cache[5*8]; - - uint8_t zigzag_scan[16]; - uint8_t zigzag_scan8x8[64]; - uint8_t zigzag_scan8x8_cavlc[64]; - uint8_t field_scan[16]; - uint8_t field_scan8x8[64]; - uint8_t field_scan8x8_cavlc[64]; - const uint8_t *zigzag_scan_q0; - const uint8_t *zigzag_scan8x8_q0; - const uint8_t *zigzag_scan8x8_cavlc_q0; - const uint8_t *field_scan_q0; - const uint8_t *field_scan8x8_q0; - const uint8_t *field_scan8x8_cavlc_q0; - - int x264_build; -}H264Context; +#define DELAYED_PIC_REF 4 static VLC coeff_token_vlc[4]; static VLC chroma_dc_coeff_token_vlc; @@ -419,109 +76,23 @@ const uint8_t ff_div6[52]={ }; -/** - * fill a rectangle. - * @param h height of the rectangle, should be a constant - * @param w width of the rectangle, should be a constant - * @param size the size of val (1 or 4), should be a constant - */ -static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ - uint8_t *p= (uint8_t*)vp; - assert(size==1 || size==4); - assert(w<=4); - - w *= size; - stride *= size; - - assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); - assert((stride&(w-1))==0); - if(w==2){ - const uint16_t v= size==4 ? val : val*0x0101; - *(uint16_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint16_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint16_t*)(p + 2*stride)= - *(uint16_t*)(p + 3*stride)= v; - }else if(w==4){ - const uint32_t v= size==4 ? val : val*0x01010101; - *(uint32_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint32_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint32_t*)(p + 2*stride)= - *(uint32_t*)(p + 3*stride)= v; - }else if(w==8){ - //gcc can't optimize 64bit math on x86_32 -#if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64) - const uint64_t v= val*0x0100000001ULL; - *(uint64_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint64_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint64_t*)(p + 2*stride)= - *(uint64_t*)(p + 3*stride)= v; - }else if(w==16){ - const uint64_t v= val*0x0100000001ULL; - *(uint64_t*)(p + 0+0*stride)= - *(uint64_t*)(p + 8+0*stride)= - *(uint64_t*)(p + 0+1*stride)= - *(uint64_t*)(p + 8+1*stride)= v; - if(h==2) return; - *(uint64_t*)(p + 0+2*stride)= - *(uint64_t*)(p + 8+2*stride)= - *(uint64_t*)(p + 0+3*stride)= - *(uint64_t*)(p + 8+3*stride)= v; -#else - *(uint32_t*)(p + 0+0*stride)= - *(uint32_t*)(p + 4+0*stride)= val; - if(h==1) return; - *(uint32_t*)(p + 0+1*stride)= - *(uint32_t*)(p + 4+1*stride)= val; - if(h==2) return; - *(uint32_t*)(p + 0+2*stride)= - *(uint32_t*)(p + 4+2*stride)= - *(uint32_t*)(p + 0+3*stride)= - *(uint32_t*)(p + 4+3*stride)= val; - }else if(w==16){ - *(uint32_t*)(p + 0+0*stride)= - *(uint32_t*)(p + 4+0*stride)= - *(uint32_t*)(p + 8+0*stride)= - *(uint32_t*)(p +12+0*stride)= - *(uint32_t*)(p + 0+1*stride)= - *(uint32_t*)(p + 4+1*stride)= - *(uint32_t*)(p + 8+1*stride)= - *(uint32_t*)(p +12+1*stride)= val; - if(h==2) return; - *(uint32_t*)(p + 0+2*stride)= - *(uint32_t*)(p + 4+2*stride)= - *(uint32_t*)(p + 8+2*stride)= - *(uint32_t*)(p +12+2*stride)= - *(uint32_t*)(p + 0+3*stride)= - *(uint32_t*)(p + 4+3*stride)= - *(uint32_t*)(p + 8+3*stride)= - *(uint32_t*)(p +12+3*stride)= val; -#endif - }else - assert(0); - assert(h==4); -} - static void fill_caches(H264Context *h, int mb_type, int 
for_deblock){ MpegEncContext * const s = &h->s; const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; int topleft_xy, top_xy, topright_xy, left_xy[2]; int topleft_type, top_type, topright_type, left_type[2]; int left_block[8]; + int topleft_partition= -1; int i; + top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); + //FIXME deblocking could skip the intra and nnz parts. - if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF) + if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) return; //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it - top_xy = mb_xy - s->mb_stride; topleft_xy = top_xy - 1; topright_xy= top_xy + 1; left_xy[1] = left_xy[0] = mb_xy-1; @@ -556,6 +127,10 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){ : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock ) { topleft_xy -= s->mb_stride; + } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) { + topleft_xy += s->mb_stride; + // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition + topleft_partition = 0; } if (bottom ? !curr_mb_frame_flag // bottom macroblock @@ -833,8 +408,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){ continue; if(USES_LIST(topleft_type, list)){ - const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride; - const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride; + const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride); + const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride); *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy]; h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy]; }else{ @@ -1131,7 +706,7 @@ static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, in #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\ const int x4 = X4, y4 = Y4;\ const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\ - if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\ + if(!USES_LIST(mb_type,list))\ return LIST_NOT_USED;\ mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\ h->mv_cache[list][scan8[0]-2][0] = mv[0];\ @@ -1152,7 +727,7 @@ static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, in && !IS_INTERLACED(mb_types[h->left_mb_xy[0]]) && i >= scan8[0]+8){ // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok. 
- SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2); + SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2); } } #undef SET_DIAG_MV @@ -1447,14 +1022,76 @@ static inline void pred_direct_motion(H264Context * const h, int *mb_type){ } if(ref[1] < 0){ - *mb_type &= ~MB_TYPE_P0L1; - sub_mb_type &= ~MB_TYPE_P0L1; + if(!is_b8x8) + *mb_type &= ~MB_TYPE_L1; + sub_mb_type &= ~MB_TYPE_L1; }else if(ref[0] < 0){ - *mb_type &= ~MB_TYPE_P0L0; - sub_mb_type &= ~MB_TYPE_P0L0; + if(!is_b8x8) + *mb_type &= ~MB_TYPE_L0; + sub_mb_type &= ~MB_TYPE_L0; } - if(IS_16X16(*mb_type)){ + if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){ + int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride; + int mb_types_col[2]; + int b8_stride = h->b8_stride; + int b4_stride = h->b_stride; + + *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8; + + if(IS_INTERLACED(*mb_type)){ + mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy]; + mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride]; + if(s->mb_y&1){ + l1ref0 -= 2*b8_stride; + l1ref1 -= 2*b8_stride; + l1mv0 -= 4*b4_stride; + l1mv1 -= 4*b4_stride; + } + b8_stride *= 3; + b4_stride *= 6; + }else{ + int cur_poc = s->current_picture_ptr->poc; + int *col_poc = h->ref_list[1]->field_poc; + int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc); + int dy = 2*col_parity - (s->mb_y&1); + mb_types_col[0] = + mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride]; + l1ref0 += dy*b8_stride; + l1ref1 += dy*b8_stride; + l1mv0 += 2*dy*b4_stride; + l1mv1 += 2*dy*b4_stride; + b8_stride = 0; + } + + for(i8=0; i8<4; i8++){ + int x8 = i8&1; + int y8 = i8>>1; + int xy8 = x8+y8*b8_stride; + int xy4 = 3*x8+y8*b4_stride; + int a=0, b=0; + + if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8])) + continue; + h->sub_mb_type[i8] = sub_mb_type; + + fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); + fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); + if(!IS_INTRA(mb_types_col[y8]) + && ( (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1) + || (l1ref0[xy8] < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){ + if(ref[0] > 0) + a= pack16to32(mv[0][0],mv[0][1]); + if(ref[1] > 0) + b= pack16to32(mv[1][0],mv[1][1]); + }else{ + a= pack16to32(mv[0][0],mv[0][1]); + b= pack16to32(mv[1][0],mv[1][1]); + } + fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4); + fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4); + } + }else if(IS_16X16(*mb_type)){ int a=0, b=0; fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); @@ -1738,9 +1375,10 @@ static inline void write_back_motion(H264Context *h, int mb_type){ * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing? * @returns decoded bytes, might be src+1 if no escapes */ -static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){ +static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){ int i, si, di; uint8_t *dst; + int bufidx; // src[0]&0x80; //forbidden bit h->nal_ref_idc= src[0]>>5; @@ -1769,8 +1407,9 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c return src; } - h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length); - dst= h->rbsp_buffer; + bufidx = h->nal_unit_type == NAL_DPC ? 
1 : 0; // use second escape buffer for inter data + h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length); + dst= h->rbsp_buffer[bufidx]; if (dst == NULL){ return NULL; @@ -1795,7 +1434,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c *dst_length= di; *consumed= si + 1;//+1 for the header -//FIXME store exact number of bits in the getbitcontext (its needed for decoding) +//FIXME store exact number of bits in the getbitcontext (it is needed for decoding) return dst; } @@ -1803,7 +1442,7 @@ static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *c * identifies the exact end of the bitstream * @return the length of the trailing, or 0 if damaged */ -static int decode_rbsp_trailing(H264Context *h, uint8_t *src){ +static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){ int v= *src; int r; @@ -1946,12 +1585,11 @@ static void chroma_dc_dct_c(DCTELEM *block){ /** * gets the chroma qp. */ -static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){ - - return chroma_qp[av_clip(qscale + chroma_qp_index_offset, 0, 51)]; +static inline int get_chroma_qp(H264Context *h, int t, int qscale){ + return h->pps.chroma_qp_table[t][qscale & 0xff]; } -//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close +//FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away) static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){ int i; @@ -2030,722 +1668,6 @@ static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int return last_non_zero; } -static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ - const uint32_t a= ((uint32_t*)(src-stride))[0]; - ((uint32_t*)(src+0*stride))[0]= a; - ((uint32_t*)(src+1*stride))[0]= a; - ((uint32_t*)(src+2*stride))[0]= a; - ((uint32_t*)(src+3*stride))[0]= a; -} - -static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ - ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; - ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; - ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; - ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; -} - -static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ - const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] - + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ - const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ - const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -static void pred4x4_128_dc_c(uint8_t *src, uint8_t 
*topright, int stride){ - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; -} - - -#define LOAD_TOP_RIGHT_EDGE\ - const int t4= topright[0];\ - const int t5= topright[1];\ - const int t6= topright[2];\ - const int t7= topright[3];\ - -#define LOAD_LEFT_EDGE\ - const int l0= src[-1+0*stride];\ - const int l1= src[-1+1*stride];\ - const int l2= src[-1+2*stride];\ - const int l3= src[-1+3*stride];\ - -#define LOAD_TOP_EDGE\ - const int t0= src[ 0-1*stride];\ - const int t1= src[ 1-1*stride];\ - const int t2= src[ 2-1*stride];\ - const int t3= src[ 3-1*stride];\ - -static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - - src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; - src[0+2*stride]= - src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; - src[0+1*stride]= - src[1+2*stride]= - src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; - src[0+0*stride]= - src[1+1*stride]= - src[2+2*stride]= - src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[1+0*stride]= - src[2+1*stride]= - src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[2+0*stride]= - src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; -} - -static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_TOP_EDGE - LOAD_TOP_RIGHT_EDGE -// LOAD_LEFT_EDGE - - src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; - src[1+0*stride]= - src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; - src[2+0*stride]= - src[1+1*stride]= - src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; - src[3+0*stride]= - src[2+1*stride]= - src[1+2*stride]= - src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; - src[3+1*stride]= - src[2+2*stride]= - src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; - src[3+2*stride]= - src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; - src[3+3*stride]=(t6 + 3*t7 + 2)>>2; -} - -static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - const __attribute__((unused)) int unu= l3; - - src[0+0*stride]= - src[1+2*stride]=(lt + t0 + 1)>>1; - src[1+0*stride]= - src[2+2*stride]=(t0 + t1 + 1)>>1; - src[2+0*stride]= - src[3+2*stride]=(t1 + t2 + 1)>>1; - src[3+0*stride]=(t2 + t3 + 1)>>1; - src[0+1*stride]= - src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[1+1*stride]= - src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[2+1*stride]= - src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; - src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; - src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; -} - -static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_TOP_EDGE - LOAD_TOP_RIGHT_EDGE - const __attribute__((unused)) int unu= t7; - - src[0+0*stride]=(t0 + t1 + 1)>>1; - src[1+0*stride]= - src[0+2*stride]=(t1 + t2 + 1)>>1; - src[2+0*stride]= - src[1+2*stride]=(t2 + t3 + 1)>>1; - src[3+0*stride]= - src[2+2*stride]=(t3 + t4+ 1)>>1; - src[3+2*stride]=(t4 + t5+ 1)>>1; - src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[1+1*stride]= - src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; - src[2+1*stride]= - src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; - src[3+1*stride]= - src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; - src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; -} - -static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_LEFT_EDGE - - src[0+0*stride]=(l0 + l1 + 1)>>1; - src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; - src[2+0*stride]= - src[0+1*stride]=(l1 + l2 + 1)>>1; - 
src[3+0*stride]= - src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; - src[2+1*stride]= - src[0+2*stride]=(l2 + l3 + 1)>>1; - src[3+1*stride]= - src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; - src[3+2*stride]= - src[1+3*stride]= - src[0+3*stride]= - src[2+2*stride]= - src[2+3*stride]= - src[3+3*stride]=l3; -} - -static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - const __attribute__((unused)) int unu= t3; - - src[0+0*stride]= - src[2+1*stride]=(lt + l0 + 1)>>1; - src[1+0*stride]= - src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[0+1*stride]= - src[2+2*stride]=(l0 + l1 + 1)>>1; - src[1+1*stride]= - src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; - src[0+2*stride]= - src[2+3*stride]=(l1 + l2+ 1)>>1; - src[1+2*stride]= - src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; - src[0+3*stride]=(l2 + l3 + 1)>>1; - src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; -} - -void ff_pred16x16_vertical_c(uint8_t *src, int stride){ - int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - const uint32_t c= ((uint32_t*)(src-stride))[2]; - const uint32_t d= ((uint32_t*)(src-stride))[3]; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - ((uint32_t*)(src+i*stride))[2]= c; - ((uint32_t*)(src+i*stride))[3]= d; - } -} - -void ff_pred16x16_horizontal_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; - } -} - -void ff_pred16x16_dc_c(uint8_t *src, int stride){ - int i, dc=0; - - for(i=0;i<16; i++){ - dc+= src[-1+i*stride]; - } - - for(i=0;i<16; i++){ - dc+= src[i-stride]; - } - - dc= 0x01010101*((dc + 16)>>5); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -static void pred16x16_left_dc_c(uint8_t *src, int stride){ - int i, dc=0; - - for(i=0;i<16; i++){ - dc+= src[-1+i*stride]; - } - - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -static void pred16x16_top_dc_c(uint8_t *src, int stride){ - int i, dc=0; - - for(i=0;i<16; i++){ - dc+= src[i-stride]; - } - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -void ff_pred16x16_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; - } -} - -static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){ - int i, j, k; - int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+7-stride; - const uint8_t *src1 = src+8*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; - int H = src0[1] - src0[-1]; - int V = src1[0] - src2[ 0]; - for(k=2; k<=8; ++k) { - src1 += stride; src2 -= stride; - H += k*(src0[k] - src0[-k]); - V += k*(src1[0] - src2[ 0]); - } - if(svq3){ - H = ( 5*(H/4) ) / 16; - V 
= ( 5*(V/4) ) / 16; - - /* required for 100% accuracy */ - i = H; H = V; V = i; - }else{ - H = ( 5*H+32 ) >> 6; - V = ( 5*V+32 ) >> 6; - } - - a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); - for(j=16; j>0; --j) { - int b = a; - a += V; - for(i=-16; i<0; i+=4) { - src[16+i] = cm[ (b ) >> 5 ]; - src[17+i] = cm[ (b+ H) >> 5 ]; - src[18+i] = cm[ (b+2*H) >> 5 ]; - src[19+i] = cm[ (b+3*H) >> 5 ]; - b += 4*H; - } - src += stride; - } -} - -void ff_pred16x16_plane_c(uint8_t *src, int stride){ - pred16x16_plane_compat_c(src, stride, 0); -} - -void ff_pred8x8_vertical_c(uint8_t *src, int stride){ - int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - } -} - -void ff_pred8x8_horizontal_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; - } -} - -void ff_pred8x8_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; - } -} - -static void pred8x8_left_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc2; - - dc0=dc2=0; - for(i=0;i<4; i++){ - dc0+= src[-1+i*stride]; - dc2+= src[-1+(i+4)*stride]; - } - dc0= 0x01010101*((dc0 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc0; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc2; - } -} - -static void pred8x8_top_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc1; - - dc0=dc1=0; - for(i=0;i<4; i++){ - dc0+= src[i-stride]; - dc1+= src[4+i-stride]; - } - dc0= 0x01010101*((dc0 + 2)>>2); - dc1= 0x01010101*((dc1 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } -} - - -void ff_pred8x8_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc1, dc2, dc3; - - dc0=dc1=dc2=0; - for(i=0;i<4; i++){ - dc0+= src[-1+i*stride] + src[i-stride]; - dc1+= src[4+i-stride]; - dc2+= src[-1+(i+4)*stride]; - } - dc3= 0x01010101*((dc1 + dc2 + 4)>>3); - dc0= 0x01010101*((dc0 + 4)>>3); - dc1= 0x01010101*((dc1 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc2; - ((uint32_t*)(src+i*stride))[1]= dc3; - } -} - -void ff_pred8x8_plane_c(uint8_t *src, int stride){ - int j, k; - int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+3-stride; - const uint8_t *src1 = src+4*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; - int H = src0[1] - src0[-1]; - int V = src1[0] - src2[ 0]; - for(k=2; k<=4; ++k) { - src1 += stride; src2 -= stride; - H += k*(src0[k] - src0[-k]); - V += k*(src1[0] - src2[ 0]); - } - H = ( 17*H+16 ) >> 5; - V = ( 17*V+16 ) >> 5; - - a = 16*(src1[0] + src2[8]+1) - 3*(V+H); - for(j=8; j>0; --j) { - int b = a; - a += V; - src[0] = cm[ (b ) >> 5 ]; - src[1] = cm[ (b+ H) >> 5 ]; - src[2] = cm[ (b+2*H) >> 5 ]; - src[3] = cm[ (b+3*H) >> 5 ]; - src[4] = cm[ (b+4*H) >> 5 ]; - src[5] = cm[ (b+5*H) >> 5 ]; - src[6] = cm[ (b+6*H) >> 5 ]; - src[7] = cm[ (b+7*H) >> 5 ]; - src += stride; - } -} - -#define SRC(x,y) 
src[(x)+(y)*stride] -#define PL(y) \ - const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; -#define PREDICT_8x8_LOAD_LEFT \ - const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ - + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ - PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ - const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 - -#define PT(x) \ - const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; -#define PREDICT_8x8_LOAD_TOP \ - const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ - + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ - PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ - const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ - + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 - -#define PTR(x) \ - t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; -#define PREDICT_8x8_LOAD_TOPRIGHT \ - int t8, t9, t10, t11, t12, t13, t14, t15; \ - if(has_topright) { \ - PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ - t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ - } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); - -#define PREDICT_8x8_LOAD_TOPLEFT \ - const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 - -#define PREDICT_8x8_DC(v) \ - int y; \ - for( y = 0; y < 8; y++ ) { \ - ((uint32_t*)src)[0] = \ - ((uint32_t*)src)[1] = v; \ - src += stride; \ - } - -static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_DC(0x80808080); -} -static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_LEFT; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; - PREDICT_8x8_DC(dc); -} -static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; - PREDICT_8x8_DC(dc); -} -static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 - +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; - PREDICT_8x8_DC(dc); -} -static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_LEFT; -#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ - ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y - ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); -#undef ROW -} -static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - int y; - PREDICT_8x8_LOAD_TOP; - src[0] = t0; - src[1] = t1; - src[2] = t2; - src[3] = t3; - src[4] = t4; - src[5] = t5; - src[6] = t6; - src[7] = t7; - for( y = 1; y < 8; y++ ) - *(uint64_t*)(src+y*stride) = *(uint64_t*)src; -} -static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_TOPRIGHT; - SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; - SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; - 
SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; - SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; - SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; - SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; - SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; - SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; - SRC(7,7)= (t14 + 3*t15 + 2) >> 2; -} -static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; - SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; - SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; - SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; - SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; - SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; - SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; - SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; - SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; - -} -static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; - SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; - SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; - SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; - SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; - SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; - SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; - SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; - SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; - SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; - SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; - SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; - SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; - SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; - SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(7,0)= (t6 + t7 + 1) >> 1; -} -static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,7)= (l6 + l7 + 1) >> 1; - SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; - SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; - SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; - SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; - SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; - SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; - 
SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; - SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; - SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; - SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; - SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; - SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; - SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; - SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; - SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; - SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; - SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; - SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; - SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; - SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; -} -static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_TOPRIGHT; - SRC(0,0)= (t0 + t1 + 1) >> 1; - SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; - SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; - SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; - SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; - SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; - SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; - SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; - SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; - SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; - SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; - SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; - SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; - SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; - SRC(7,6)= (t10 + t11 + 1) >> 1; - SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; -} -static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_LEFT; - SRC(0,0)= (l0 + l1 + 1) >> 1; - SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; - SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; - SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; - SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; - SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; - SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; - SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; - SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; - SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; - SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; - SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; - SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; - SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; - SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= - SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= - SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= - SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; -} -#undef PREDICT_8x8_LOAD_LEFT -#undef PREDICT_8x8_LOAD_TOP -#undef PREDICT_8x8_LOAD_TOPLEFT -#undef PREDICT_8x8_LOAD_TOPRIGHT -#undef PREDICT_8x8_DC -#undef PTR -#undef PT -#undef PL -#undef SRC - static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int 
chroma_height, int delta, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset, @@ -2762,7 +1684,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, const int full_mx= mx>>2; const int full_my= my>>2; const int pic_width = 16*s->mb_width; - const int pic_height = 16*s->mb_height >> MB_MBAFF; + const int pic_height = 16*s->mb_height >> MB_FIELD; if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames return; @@ -2784,11 +1706,11 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize); } - if(s->flags&CODEC_FLAG_GRAY) return; + if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return; - if(MB_MBAFF){ + if(MB_FIELD){ // chroma offset when predicting from a field of opposite parity - my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1)); + my += 2 * ((s->mb_y & 1) - (pic->reference - 1)); emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1); } src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize; @@ -2821,7 +1743,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei dest_cb += x_offset + y_offset*h->mb_uvlinesize; dest_cr += x_offset + y_offset*h->mb_uvlinesize; x_offset += 8*s->mb_x; - y_offset += 8*(s->mb_y >> MB_MBAFF); + y_offset += 8*(s->mb_y >> MB_FIELD); if(list0){ Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; @@ -2854,7 +1776,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom dest_cb += x_offset + y_offset*h->mb_uvlinesize; dest_cr += x_offset + y_offset*h->mb_uvlinesize; x_offset += 8*s->mb_x; - y_offset += 8*(s->mb_y >> MB_MBAFF); + y_offset += 8*(s->mb_y >> MB_FIELD); if(list0 && list1){ /* don't optimize for luma-only case, since B-frames usually @@ -3029,7 +1951,7 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t prefetch_motion(h, 1); } -static void decode_init_vlc(){ +static void decode_init_vlc(void){ static int done = 0; if (!done) { @@ -3068,56 +1990,9 @@ static void decode_init_vlc(){ } } -/** - * Sets the intra prediction function pointers. 
- */ -static void init_pred_ptrs(H264Context *h){ -// MpegEncContext * const s = &h->s; - - h->pred4x4[VERT_PRED ]= pred4x4_vertical_c; - h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c; - h->pred4x4[DC_PRED ]= pred4x4_dc_c; - h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c; - h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c; - h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c; - h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c; - h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c; - h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c; - h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c; - h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c; - h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c; - - h->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c; - h->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c; - h->pred8x8l[DC_PRED ]= pred8x8l_dc_c; - h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c; - h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c; - h->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c; - h->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c; - h->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c; - h->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c; - h->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c; - h->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c; - h->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c; - - h->pred8x8[DC_PRED8x8 ]= ff_pred8x8_dc_c; - h->pred8x8[VERT_PRED8x8 ]= ff_pred8x8_vertical_c; - h->pred8x8[HOR_PRED8x8 ]= ff_pred8x8_horizontal_c; - h->pred8x8[PLANE_PRED8x8 ]= ff_pred8x8_plane_c; - h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c; - h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c; - h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c; - - h->pred16x16[DC_PRED8x8 ]= ff_pred16x16_dc_c; - h->pred16x16[VERT_PRED8x8 ]= ff_pred16x16_vertical_c; - h->pred16x16[HOR_PRED8x8 ]= ff_pred16x16_horizontal_c; - h->pred16x16[PLANE_PRED8x8 ]= ff_pred16x16_plane_c; - h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c; - h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c; - h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c; -} - static void free_tables(H264Context *h){ + int i; + H264Context *hx; av_freep(&h->intra4x4_pred_mode); av_freep(&h->chroma_pred_mode_table); av_freep(&h->cbp_table); @@ -3126,14 +2001,24 @@ static void free_tables(H264Context *h){ av_freep(&h->direct_table); av_freep(&h->non_zero_count); av_freep(&h->slice_table_base); - av_freep(&h->top_borders[1]); - av_freep(&h->top_borders[0]); h->slice_table= NULL; av_freep(&h->mb2b_xy); av_freep(&h->mb2b8_xy); - av_freep(&h->s.obmc_scratchpad); + for(i = 0; i < MAX_SPS_COUNT; i++) + av_freep(h->sps_buffers + i); + + for(i = 0; i < MAX_PPS_COUNT; i++) + av_freep(h->pps_buffers + i); + + for(i = 0; i < h->s.avctx->thread_count; i++) { + hx = h->thread_context[i]; + if(!hx) continue; + av_freep(&hx->top_borders[1]); + av_freep(&hx->top_borders[0]); + av_freep(&hx->s.obmc_scratchpad); + } } static void init_dequant8_coeff_table(H264Context *h){ @@ -3214,16 +2099,12 @@ static int alloc_tables(H264Context *h){ CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t)) CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t)) - CHECKED_ALLOCZ(h->top_borders[0] , s->mb_width * (16+8+8) * sizeof(uint8_t)) - CHECKED_ALLOCZ(h->top_borders[1] , s->mb_width * (16+8+8) * sizeof(uint8_t)) CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t)) - if( h->pps.cabac ) { - CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t)) - CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * 
sizeof(uint16_t)); - CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t)); - CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t)); - } + CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t)) + CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t)); + CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t)); + CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t)); memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(uint8_t)); h->slice_table= h->slice_table_base + s->mb_stride*2 + 1; @@ -3252,6 +2133,38 @@ fail: return -1; } +/** + * Mimic alloc_tables(), but for every context thread. + */ +static void clone_tables(H264Context *dst, H264Context *src){ + dst->intra4x4_pred_mode = src->intra4x4_pred_mode; + dst->non_zero_count = src->non_zero_count; + dst->slice_table = src->slice_table; + dst->cbp_table = src->cbp_table; + dst->mb2b_xy = src->mb2b_xy; + dst->mb2b8_xy = src->mb2b8_xy; + dst->chroma_pred_mode_table = src->chroma_pred_mode_table; + dst->mvd_table[0] = src->mvd_table[0]; + dst->mvd_table[1] = src->mvd_table[1]; + dst->direct_table = src->direct_table; + + dst->s.obmc_scratchpad = NULL; + ff_h264_pred_init(&dst->hpc, src->s.codec_id); +} + +/** + * Init context + * Allocate buffers which are not shared amongst multiple threads. + */ +static int context_init(H264Context *h){ + CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t)) + CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t)) + + return 0; +fail: + return -1; // free_tables will clean up for us +} + static void common_init(H264Context *h){ MpegEncContext * const s = &h->s; @@ -3259,7 +2172,7 @@ static void common_init(H264Context *h){ s->height = s->avctx->height; s->codec_id= s->avctx->codec->id; - init_pred_ptrs(h); + ff_h264_pred_init(&h->hpc, s->codec_id); h->dequant_coeff_pps= -1; s->unrestricted_mv=1; @@ -3283,6 +2196,7 @@ static int decode_init(AVCodecContext *avctx){ // set defaults // s->decode_mb= ff_h263_decode_mb; + s->quarter_sample = 1; s->low_delay= 1; avctx->pix_fmt= PIX_FMT_YUV420P; @@ -3296,6 +2210,7 @@ static int decode_init(AVCodecContext *avctx){ h->is_avc = 0; } + h->thread_context[0] = h; return 0; } @@ -3306,6 +2221,13 @@ static int frame_start(H264Context *h){ if(MPV_frame_start(s, s->avctx) < 0) return -1; ff_er_frame_start(s); + /* + * MPV_frame_start uses pict_type to derive key_frame. + * This is incorrect for H.264; IDR markings must be used. + * Zero here; IDR markings per slice in frame or fields are OR'd in later. + * See decode_nal_units(). + */ + s->current_picture_ptr->key_frame= 0; assert(s->linesize && s->uvlinesize); @@ -3322,18 +2244,19 @@ static int frame_start(H264Context *h){ /* can't be in alloc_tables because linesize isn't known there. * FIXME: redo bipred weight to not require extra buffer? 
*/ - if(!s->obmc_scratchpad) - s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize); + for(i = 0; i < s->avctx->thread_count; i++) + if(!h->thread_context[i]->s.obmc_scratchpad) + h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize); /* some macroblocks will be accessed before they're available */ - if(FRAME_MBAFF) + if(FRAME_MBAFF || s->avctx->thread_count > 1) memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t)); // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1; return 0; } -static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ +static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ MpegEncContext * const s = &h->s; int i; @@ -3351,7 +2274,7 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize); *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize); - if(!(s->flags&CODEC_FLAG_GRAY)){ + if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ h->left_border[17 ]= h->top_borders[0][s->mb_x][16+7]; h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7]; for(i=1; i<9; i++){ @@ -3363,12 +2286,22 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src } } -static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ +static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){ MpegEncContext * const s = &h->s; int temp8, i; uint64_t temp64; - int deblock_left = (s->mb_x > 0); - int deblock_top = (s->mb_y > 0); + int deblock_left; + int deblock_top; + int mb_xy; + + if(h->deblocking_filter == 2) { + mb_xy = s->mb_x + s->mb_y*s->mb_stride; + deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1]; + deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy]; + } else { + deblock_left = (s->mb_x > 0); + deblock_top = (s->mb_y > 0); + } src_y -= linesize + 1; src_cb -= uvlinesize + 1; @@ -3394,7 +2327,7 @@ b= t; } } - if(!(s->flags&CODEC_FLAG_GRAY)){ + if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(deblock_left){ for(i = !deblock_top; i<9; i++){ XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg); @@ -3429,7 +2362,7 @@ static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *s *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y + 33*linesize); *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize); - if(!(s->flags&CODEC_FLAG_GRAY)){ + if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7]; h->left_border[34+ 1]= h->top_borders[1][s->mb_x][16+7]; h->left_border[34+18 ]= h->top_borders[0][s->mb_x][24+7]; @@ -3481,7 +2414,7 @@ b= t; } } - if(!(s->flags&CODEC_FLAG_GRAY)){ + if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(deblock_left){ for(i = (!deblock_top) << 1; i<18; i++){ XCHG(h->left_border[i+34 ], src_cb[i*uvlinesize], temp8, xchg); @@ -3497,7 +2430,7 @@ b= t; } } -static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ +static av_always_inline void hl_decode_mb_internal(H264Context 
*h, int simple){ MpegEncContext * const s = &h->s; const int mb_x= s->mb_x; const int mb_y= s->mb_y; @@ -3535,13 +2468,13 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ continue; if(IS_16X16(mb_type)){ int8_t *ref = &h->ref_cache[list][scan8[0]]; - fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1); + fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1); }else{ for(i=0; i<16; i+=4){ //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ? int ref = h->ref_cache[list][scan8[i]]; if(ref >= 0) - fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1); + fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1); } } } @@ -3601,11 +2534,11 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ } else { if(IS_INTRA(mb_type)){ if(h->deblocking_filter && (simple || !FRAME_MBAFF)) - xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); + xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple); - if(simple || !(s->flags&CODEC_FLAG_GRAY)){ - h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize); - h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize); + if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ + h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize); + h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize); } if(IS_INTRA4x4(mb_type)){ @@ -3615,7 +2548,7 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ uint8_t * const ptr= dest_y + block_offset[i]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; const int nnz = h->non_zero_count_cache[ scan8[i] ]; - h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, + h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, (h->topright_samples_available<<i)&0x4000, linesize); if(nnz){ if(nnz == 1 && h->mb[i*16]) @@ -3642,7 +2575,7 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ }else topright= NULL; - h->pred4x4[ dir ](ptr, topright, linesize); + h->hpc.pred4x4[ dir ](ptr, topright, linesize); nnz = h->non_zero_count_cache[ scan8[i] ]; if(nnz){ if(is_h264){ @@ -3656,15 +2589,15 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ } } }else{ - h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); + h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); if(is_h264){ if(!transform_bypass) - h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]); + h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]); }else svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); } if(h->deblocking_filter && (simple || !FRAME_MBAFF)) - xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); + xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); }else if(is_h264){ hl_motion(h, dest_y, dest_cb, dest_cr, s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab, @@ -3704,15 +2637,15 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ } } - if(simple || !(s->flags&CODEC_FLAG_GRAY)){ + if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ uint8_t *dest[2] = {dest_cb, dest_cr}; if(transform_bypass){ idct_add = idct_dc_add = s->dsp.add_pixels4; }else{ idct_add = s->dsp.h264_idct_add; idct_dc_add = s->dsp.h264_idct_dc_add; - chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 
1:4][h->chroma_qp][0]); - chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); + chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); + chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); } if(is_h264){ for(i=16; i<16+8; i++){ @@ -3754,17 +2687,19 @@ static void av_always_inline hl_decode_mb_internal(H264Context *h, int simple){ s->mb_y--; tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y); fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb - h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]); + h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]); + h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]); filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize); // bottom s->mb_y++; tprintf(h->s.avctx, "call mbaff filter_mb\n"); fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb - h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]); + h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]); + h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]); filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize); } else { tprintf(h->s.avctx, "call filter_mb\n"); - backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize); + backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple); fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); } @@ -3791,7 +2726,7 @@ static void hl_decode_mb(H264Context *h){ const int mb_y= s->mb_y; const int mb_xy= mb_x + mb_y*s->mb_stride; const int mb_type= s->current_picture.mb_type[mb_xy]; - int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding; + int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding; if(!s->decode) return; @@ -3801,6 +2736,105 @@ static void hl_decode_mb(H264Context *h){ else hl_decode_mb_simple(h); } +static void pic_as_field(Picture *pic, const int parity){ + int i; + for (i = 0; i < 4; ++i) { + if (parity == PICT_BOTTOM_FIELD) + pic->data[i] += pic->linesize[i]; + pic->reference = parity; + pic->linesize[i] *= 2; + } +} + +static int split_field_copy(Picture *dest, Picture *src, + int parity, int id_add){ + int match = !!(src->reference & parity); + + if (match) { + *dest = *src; + pic_as_field(dest, parity); + dest->pic_id *= 2; + dest->pic_id += id_add; + } + + return match; +} + +/** + * Split one reference list into field parts, interleaving by parity + * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers + * set to look at the actual start of data for that field. 
+ * + * @param dest output list + * @param dest_len maximum number of fields to put in dest + * @param src the source reference list containing fields and/or field pairs + * (aka short_ref/long_ref, or + * refFrameListXShortTerm/refFrameListLongTerm in spec-speak) + * @param src_len number of Picture's in source (pairs and unmatched fields) + * @param parity the parity of the picture being decoded/needing + * these ref pics (PICT_{TOP,BOTTOM}_FIELD) + * @return number of fields placed in dest + */ +static int split_field_half_ref_list(Picture *dest, int dest_len, + Picture *src, int src_len, int parity){ + int same_parity = 1; + int same_i = 0; + int opp_i = 0; + int out_i; + int field_output; + + for (out_i = 0; out_i < dest_len; out_i += field_output) { + if (same_parity && same_i < src_len) { + field_output = split_field_copy(dest + out_i, src + same_i, + parity, 1); + same_parity = !field_output; + same_i++; + + } else if (opp_i < src_len) { + field_output = split_field_copy(dest + out_i, src + opp_i, + PICT_FRAME - parity, 0); + same_parity = field_output; + opp_i++; + + } else { + break; + } + } + + return out_i; +} + +/** + * Split the reference frame list into a reference field list. + * This implements H.264 spec 8.2.4.2.5 for a combined input list. + * The input list contains both reference field pairs and + * unmatched reference fields; it is ordered as spec describes + * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that + * unmatched field pairs are also present. Conceptually this is equivalent + * to concatenation of refFrameListXShortTerm with refFrameListLongTerm. + * + * @param dest output reference list where ordered fields are to be placed + * @param dest_len max number of fields to place at dest + * @param src source reference list, as described above + * @param src_len number of pictures (pairs and unmatched fields) in src + * @param parity parity of field being currently decoded + * (one of PICT_{TOP,BOTTOM}_FIELD) + * @param long_i index into src array that holds first long reference picture, + * or src_len if no long refs present. + */ +static int split_field_ref_list(Picture *dest, int dest_len, + Picture *src, int src_len, + int parity, int long_i){ + + int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity); + dest += i; + dest_len -= i; + + i += split_field_half_ref_list(dest, dest_len, src + long_i, + src_len - long_i, parity); + return i; +} + /** * fills the default_ref_list. 
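For reference, a minimal standalone sketch of the interleaving rule that split_field_half_ref_list() above implements (H.264 8.2.4.2.5): fields with the same parity as the current field are taken first, and the scan flips to the opposite parity after every hit and back again, with the two source cursors advancing independently. The Entry type and PARITY_* flags below are simplified stand-ins, not the decoder's Picture/PICT_* types.

#include <stdio.h>

#define PARITY_TOP    1
#define PARITY_BOTTOM 2
#define PARITY_FRAME  (PARITY_TOP | PARITY_BOTTOM)

typedef struct { int id; int reference; } Entry;  /* stand-in for Picture */

/* Emit fields of src into dst, preferring the requested parity and falling
 * back to the opposite parity whenever no same-parity field is available. */
static int interleave_fields(Entry *dst, int dst_len,
                             const Entry *src, int src_len, int parity)
{
    int same = 0, opp = 0, out = 0, want_same = 1;

    while (out < dst_len) {
        if (want_same && same < src_len) {
            if (src[same].reference & parity) {
                dst[out++] = src[same];
                want_same = 0;
            }
            same++;
        } else if (opp < src_len) {
            if (src[opp].reference & (PARITY_FRAME ^ parity)) {
                dst[out++] = src[opp];
                want_same = 1;
            }
            opp++;
        } else {
            break;
        }
    }
    return out;
}

int main(void)
{
    /* frame 1 is missing its bottom field */
    Entry src[] = { {0, PARITY_FRAME}, {1, PARITY_TOP}, {2, PARITY_FRAME} };
    Entry dst[6];
    int i, n = interleave_fields(dst, 6, src, 3, PARITY_TOP);

    for (i = 0; i < n; i++)
        printf("ref %d: frame %d\n", i, dst[i].id);  /* frames 0 0 1 2 2 */
    return 0;
}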
*/ @@ -3808,9 +2842,25 @@ static int fill_default_ref_list(H264Context *h){ MpegEncContext * const s = &h->s; int i; int smallest_poc_greater_than_current = -1; + int structure_sel; Picture sorted_short_ref[32]; + Picture field_entry_list[2][32]; + Picture *frame_list[2]; + + if (FIELD_PICTURE) { + structure_sel = PICT_FRAME; + frame_list[0] = field_entry_list[0]; + frame_list[1] = field_entry_list[1]; + } else { + structure_sel = 0; + frame_list[0] = h->default_ref_list[0]; + frame_list[1] = h->default_ref_list[1]; + } if(h->slice_type==B_TYPE){ + int list; + int len[2]; + int short_len[2]; int out_i; int limit= INT_MIN; @@ -3838,71 +2888,92 @@ static int fill_default_ref_list(H264Context *h){ } } } - } - if(s->picture_structure == PICT_FRAME){ - if(h->slice_type==B_TYPE){ - int list; - tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current); + tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current); - // find the largest poc - for(list=0; list<2; list++){ - int index = 0; - int j= -99; - int step= list ? -1 : 1; - - for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) { - while(j<0 || j>= h->short_ref_count){ - if(j != -99 && step == (list ? -1 : 1)) - return -1; - step = -step; - j= smallest_poc_greater_than_current + (step>>1); - } - if(sorted_short_ref[j].reference != 3) continue; - h->default_ref_list[list][index ]= sorted_short_ref[j]; - h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num; + // find the largest poc + for(list=0; list<2; list++){ + int index = 0; + int j= -99; + int step= list ? -1 : 1; + + for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) { + int sel; + while(j<0 || j>= h->short_ref_count){ + if(j != -99 && step == (list ? 
-1 : 1)) + return -1; + step = -step; + j= smallest_poc_greater_than_current + (step>>1); } + sel = sorted_short_ref[j].reference | structure_sel; + if(sel != PICT_FRAME) continue; + frame_list[list][index ]= sorted_short_ref[j]; + frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num; + } + short_len[list] = index; - for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){ - if(h->long_ref[i] == NULL) continue; - if(h->long_ref[i]->reference != 3) continue; + for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){ + int sel; + if(h->long_ref[i] == NULL) continue; + sel = h->long_ref[i]->reference | structure_sel; + if(sel != PICT_FRAME) continue; - h->default_ref_list[ list ][index ]= *h->long_ref[i]; - h->default_ref_list[ list ][index++].pic_id= i;; - } + frame_list[ list ][index ]= *h->long_ref[i]; + frame_list[ list ][index++].pic_id= i; + } + len[list] = index; + } - if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){ - // swap the two first elements of L1 when - // L0 and L1 are identical - Picture temp= h->default_ref_list[1][0]; - h->default_ref_list[1][0] = h->default_ref_list[1][1]; - h->default_ref_list[1][1] = temp; - } + for(list=0; list<2; list++){ + if (FIELD_PICTURE) + len[list] = split_field_ref_list(h->default_ref_list[list], + h->ref_count[list], + frame_list[list], + len[list], + s->picture_structure, + short_len[list]); + + // swap the two first elements of L1 when L0 and L1 are identical + if(list && len[0] > 1 && len[0] == len[1]) + for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++) + if(i == len[0]){ + FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]); + break; + } - if(index < h->ref_count[ list ]) - memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index)); - } - }else{ - int index=0; - for(i=0; i<h->short_ref_count; i++){ - if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit - h->default_ref_list[0][index ]= *h->short_ref[i]; - h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num; - } - for(i = 0; i < 16; i++){ - if(h->long_ref[i] == NULL) continue; - if(h->long_ref[i]->reference != 3) continue; - h->default_ref_list[0][index ]= *h->long_ref[i]; - h->default_ref_list[0][index++].pic_id= i;; - } - if(index < h->ref_count[0]) - memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index)); + if(len[list] < h->ref_count[ list ]) + memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list])); } - }else{ //FIELD - if(h->slice_type==B_TYPE){ - }else{ - //FIXME second field balh + + + }else{ + int index=0; + int short_len; + for(i=0; i<h->short_ref_count; i++){ + int sel; + sel = h->short_ref[i]->reference | structure_sel; + if(sel != PICT_FRAME) continue; + frame_list[0][index ]= *h->short_ref[i]; + frame_list[0][index++].pic_id= h->short_ref[i]->frame_num; + } + short_len = index; + for(i = 0; i < 16; i++){ + int sel; + if(h->long_ref[i] == NULL) continue; + sel = h->long_ref[i]->reference | structure_sel; + if(sel != PICT_FRAME) continue; + frame_list[0][index ]= *h->long_ref[i]; + frame_list[0][index++].pic_id= i; } + + if (FIELD_PICTURE) + index = split_field_ref_list(h->default_ref_list[0], + h->ref_count[0], frame_list[0], + index, s->picture_structure, + short_len); + + if(index < h->ref_count[0]) + memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index)); } #ifdef 
TRACE for (i=0; i<h->ref_count[0]; i++) { @@ -3910,7 +2981,7 @@ static int fill_default_ref_list(H264Context *h){ } if(h->slice_type==B_TYPE){ for (i=0; i<h->ref_count[1]; i++) { - tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]); + tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]); } } #endif @@ -3920,9 +2991,33 @@ static int fill_default_ref_list(H264Context *h){ static void print_short_term(H264Context *h); static void print_long_term(H264Context *h); +/** + * Extract structure information about the picture described by pic_num in + * the current decoding context (frame or field). Note that pic_num is + * picture number without wrapping (so, 0<=pic_num<max_pic_num). + * @param pic_num picture number for which to extract structure information + * @param structure one of PICT_XXX describing structure of picture + * with pic_num + * @return frame number (short term) or long term index of picture + * described by pic_num + */ +static int pic_num_extract(H264Context *h, int pic_num, int *structure){ + MpegEncContext * const s = &h->s; + + *structure = s->picture_structure; + if(FIELD_PICTURE){ + if (!(pic_num & 1)) + /* opposite field */ + *structure ^= PICT_FRAME; + pic_num >>= 1; + } + + return pic_num; +} + static int decode_ref_pic_list_reordering(H264Context *h){ MpegEncContext * const s = &h->s; - int list, index; + int list, index, pic_structure; print_short_term(h); print_long_term(h); @@ -3951,8 +3046,9 @@ static int decode_ref_pic_list_reordering(H264Context *h){ if(reordering_of_pic_nums_idc<3){ if(reordering_of_pic_nums_idc<2){ const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1; + int frame_num; - if(abs_diff_pic_num >= h->max_pic_num){ + if(abs_diff_pic_num > h->max_pic_num){ av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n"); return -1; } @@ -3961,25 +3057,34 @@ static int decode_ref_pic_list_reordering(H264Context *h){ else pred+= abs_diff_pic_num; pred &= h->max_pic_num - 1; + frame_num = pic_num_extract(h, pred, &pic_structure); + for(i= h->short_ref_count-1; i>=0; i--){ ref = h->short_ref[i]; - assert(ref->reference == 3); + assert(ref->reference); assert(!ref->long_ref); - if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer + if(ref->data[0] != NULL && + ref->frame_num == frame_num && + (ref->reference & pic_structure) && + ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer break; } if(i>=0) - ref->pic_id= ref->frame_num; + ref->pic_id= pred; }else{ + int long_idx; pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx - if(pic_id>31){ + + long_idx= pic_num_extract(h, pic_id, &pic_structure); + + if(long_idx>31){ av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n"); return -1; } - ref = h->long_ref[pic_id]; - if(ref){ + ref = h->long_ref[long_idx]; + assert(!(ref && !ref->reference)); + if(ref && (ref->reference & pic_structure)){ ref->pic_id= pic_id; - assert(ref->reference == 3); assert(ref->long_ref); i=0; }else{ @@ -3999,6 +3104,9 @@ static int decode_ref_pic_list_reordering(H264Context *h){ h->ref_list[list][i]= h->ref_list[list][i-1]; } h->ref_list[list][index]= *ref; + if (FIELD_PICTURE){ + pic_as_field(&h->ref_list[list][index], pic_structure); + } } }else{ av_log(h->s.avctx, AV_LOG_ERROR, 
"illegal reordering_of_pic_nums_idc\n"); @@ -4029,9 +3137,11 @@ static void fill_mbaff_ref_list(H264Context *h){ field[0] = *frame; for(j=0; j<3; j++) field[0].linesize[j] <<= 1; + field[0].reference = PICT_TOP_FIELD; field[1] = field[0]; for(j=0; j<3; j++) field[1].data[j] += frame->linesize[j]; + field[1].reference = PICT_BOTTOM_FIELD; h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i]; h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i]; @@ -4137,17 +3247,32 @@ static void implicit_weight_table(H264Context *h){ } } -static inline void unreference_pic(H264Context *h, Picture *pic){ +/** + * Mark a picture as no longer needed for reference. The refmask + * argument allows unreferencing of individual fields or the whole frame. + * If the picture becomes entirely unreferenced, but is being held for + * display purposes, it is marked as such. + * @param refmask mask of fields to unreference; the mask is bitwise + * anded with the reference marking of pic + * @return non-zero if pic becomes entirely unreferenced (except possibly + * for display purposes) zero if one of the fields remains in + * reference + */ +static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){ int i; - pic->reference=0; - if(pic == h->delayed_output_pic) - pic->reference=1; - else{ - for(i = 0; h->delayed_pic[i]; i++) - if(pic == h->delayed_pic[i]){ - pic->reference=1; - break; - } + if (pic->reference &= refmask) { + return 0; + } else { + if(pic == h->delayed_output_pic) + pic->reference=DELAYED_PIC_REF; + else{ + for(i = 0; h->delayed_pic[i]; i++) + if(pic == h->delayed_pic[i]){ + pic->reference=DELAYED_PIC_REF; + break; + } + } + return 1; } } @@ -4159,14 +3284,14 @@ static void idr(H264Context *h){ for(i=0; i<16; i++){ if (h->long_ref[i] != NULL) { - unreference_pic(h, h->long_ref[i]); + unreference_pic(h, h->long_ref[i], 0); h->long_ref[i]= NULL; } } h->long_ref_count=0; for(i=0; i<h->short_ref_count; i++){ - unreference_pic(h, h->short_ref[i]); + unreference_pic(h, h->short_ref[i], 0); h->short_ref[i]= NULL; } h->short_ref_count=0; @@ -4187,27 +3312,28 @@ static void flush_dpb(AVCodecContext *avctx){ idr(h); if(h->s.current_picture_ptr) h->s.current_picture_ptr->reference= 0; + h->s.first_field= 0; + ff_mpeg_flush(avctx); } /** - * - * @return the removed picture or NULL if an error occurs + * Find a Picture in the short term reference list by frame number. + * @param frame_num frame number to search for + * @param idx the index into h->short_ref where returned picture is found + * undefined if no picture found. 
+ * @return pointer to the found picture, or NULL if no pic with the provided + * frame number is found */ -static Picture * remove_short(H264Context *h, int frame_num){ +static Picture * find_short(H264Context *h, int frame_num, int *idx){ MpegEncContext * const s = &h->s; int i; - if(s->avctx->debug&FF_DEBUG_MMCO) - av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count); - for(i=0; i<h->short_ref_count; i++){ Picture *pic= h->short_ref[i]; if(s->avctx->debug&FF_DEBUG_MMCO) av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic); - if(pic->frame_num == frame_num){ - h->short_ref[i]= NULL; - memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*)); - h->short_ref_count--; + if(pic->frame_num == frame_num) { + *idx = i; return pic; } } @@ -4215,6 +3341,49 @@ static Picture * remove_short(H264Context *h, int frame_num){ } /** + * Remove a picture from the short term reference list by its index in + * that list. This does no checking on the provided index; it is assumed + * to be valid. Other list entries are shifted down. + * @param i index into h->short_ref of picture to remove. + */ +static void remove_short_at_index(H264Context *h, int i){ + assert(i > 0 && i < h->short_ref_count); + h->short_ref[i]= NULL; + if (--h->short_ref_count) + memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*)); +} + +/** + * + * @return the removed picture or NULL if an error occurs + */ +static Picture * remove_short(H264Context *h, int frame_num){ + MpegEncContext * const s = &h->s; + Picture *pic; + int i; + + if(s->avctx->debug&FF_DEBUG_MMCO) + av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count); + + pic = find_short(h, frame_num, &i); + if (pic) + remove_short_at_index(h, i); + + return pic; +} + +/** + * Remove a picture from the long term reference list by its index in + * that list. This does no checking on the provided index; it is assumed + * to be valid. The removed entry is set to NULL. Other entries are unaffected. + * @param i index into h->long_ref of picture to remove. 
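The old remove_short() is now split into find_short() plus remove_short_at_index(), so MMCO handling can look a picture up first and only shift the list once it actually becomes unreferenced. A self-contained sketch of that find/remove-at-index pattern, using plain frame numbers in place of Picture pointers:

#include <assert.h>
#include <stdio.h>
#include <string.h>

static int short_ref[16] = { 7, 6, 5, 4 };  /* hypothetical short-term list */
static int short_ref_count = 4;

static int find_short(int frame_num, int *idx)
{
    int i;
    for (i = 0; i < short_ref_count; i++)
        if (short_ref[i] == frame_num) {
            *idx = i;
            return 1;
        }
    return 0;
}

/* Remove entry i and close the gap, like remove_short_at_index() does. */
static void remove_short_at_index(int i)
{
    assert(i >= 0 && i < short_ref_count);
    if (--short_ref_count > i)
        memmove(&short_ref[i], &short_ref[i + 1],
                (short_ref_count - i) * sizeof(short_ref[0]));
}

int main(void)
{
    int i;
    if (find_short(5, &i))          /* inspect first ...                       */
        remove_short_at_index(i);   /* ... remove only if really unreferenced  */
    for (i = 0; i < short_ref_count; i++)
        printf("%d ", short_ref[i]);  /* 7 6 4 */
    printf("\n");
    return 0;
}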
+ */ +static void remove_long_at_index(H264Context *h, int i){ + h->long_ref[i]= NULL; + h->long_ref_count--; +} + +/** * * @return the removed picture or NULL if an error occurs */ @@ -4222,8 +3391,8 @@ static Picture * remove_long(H264Context *h, int i){ Picture *pic; pic= h->long_ref[i]; - h->long_ref[i]= NULL; - if(pic) h->long_ref_count--; + if (pic) + remove_long_at_index(h, i); return pic; } @@ -4264,77 +3433,143 @@ static void print_long_term(H264Context *h) { static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){ MpegEncContext * const s = &h->s; int i, j; - int current_is_long=0; + int current_ref_assigned=0; Picture *pic; if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0) av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n"); for(i=0; i<mmco_count; i++){ + int structure, frame_num, unref_pic; if(s->avctx->debug&FF_DEBUG_MMCO) - av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index); + av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg); switch(mmco[i].opcode){ case MMCO_SHORT2UNUSED: - pic= remove_short(h, mmco[i].short_frame_num); - if(pic) - unreference_pic(h, pic); - else if(s->avctx->debug&FF_DEBUG_MMCO) - av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n"); + if(s->avctx->debug&FF_DEBUG_MMCO) + av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count); + frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure); + pic = find_short(h, frame_num, &j); + if (pic) { + if (unreference_pic(h, pic, structure ^ PICT_FRAME)) + remove_short_at_index(h, j); + } else if(s->avctx->debug&FF_DEBUG_MMCO) + av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n"); break; case MMCO_SHORT2LONG: - pic= remove_long(h, mmco[i].long_index); - if(pic) unreference_pic(h, pic); + if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count && + h->long_ref[mmco[i].long_arg]->frame_num == + mmco[i].short_pic_num / 2) { + /* do nothing, we've already moved this field pair. 
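The MMCO cases above resolve picture numbers through pic_num_extract(): in field decoding an odd pic_num addresses the field with the same parity as the current field, an even one the opposite field, and the remaining bits carry the frame number (or long-term index). A minimal sketch of that mapping, assuming the usual PICT_* bit values:

#include <stdio.h>

#define PICT_TOP_FIELD    1
#define PICT_BOTTOM_FIELD 2
#define PICT_FRAME        (PICT_TOP_FIELD | PICT_BOTTOM_FIELD)

/* current_structure is the parity of the field being decoded. */
static int extract_pic_num(int pic_num, int current_structure, int *structure)
{
    *structure = current_structure;
    if (!(pic_num & 1))
        *structure ^= PICT_FRAME;  /* even pic_num: opposite field */
    return pic_num >> 1;           /* frame number / long-term index */
}

int main(void)
{
    int structure;
    int fn = extract_pic_num(9, PICT_TOP_FIELD, &structure);
    printf("frame %d structure %d\n", fn, structure);  /* frame 4, top    */
    fn = extract_pic_num(8, PICT_TOP_FIELD, &structure);
    printf("frame %d structure %d\n", fn, structure);  /* frame 4, bottom */
    return 0;
}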
*/ + } else { + int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE; - h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num); - if (h->long_ref[ mmco[i].long_index ]){ - h->long_ref[ mmco[i].long_index ]->long_ref=1; - h->long_ref_count++; + pic= remove_long(h, mmco[i].long_arg); + if(pic) unreference_pic(h, pic, 0); + + h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num); + if (h->long_ref[ mmco[i].long_arg ]){ + h->long_ref[ mmco[i].long_arg ]->long_ref=1; + h->long_ref_count++; + } } break; case MMCO_LONG2UNUSED: - pic= remove_long(h, mmco[i].long_index); - if(pic) - unreference_pic(h, pic); - else if(s->avctx->debug&FF_DEBUG_MMCO) - av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n"); + j = pic_num_extract(h, mmco[i].long_arg, &structure); + pic = h->long_ref[j]; + if (pic) { + if (unreference_pic(h, pic, structure ^ PICT_FRAME)) + remove_long_at_index(h, j); + } else if(s->avctx->debug&FF_DEBUG_MMCO) + av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n"); break; case MMCO_LONG: - pic= remove_long(h, mmco[i].long_index); - if(pic) unreference_pic(h, pic); + unref_pic = 1; + if (FIELD_PICTURE && !s->first_field) { + if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) { + /* Just mark second field as referenced */ + unref_pic = 0; + } else if (s->current_picture_ptr->reference) { + /* First field in pair is in short term list or + * at a different long term index. + * This is not allowed; see 7.4.3, notes 2 and 3. + * Report the problem and keep the pair where it is, + * and mark this field valid. + */ + av_log(h->s.avctx, AV_LOG_ERROR, + "illegal long term reference assignment for second " + "field in complementary field pair (first field is " + "short term or has non-matching long index)\n"); + unref_pic = 0; + } + } - h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr; - h->long_ref[ mmco[i].long_index ]->long_ref=1; - h->long_ref_count++; + if (unref_pic) { + pic= remove_long(h, mmco[i].long_arg); + if(pic) unreference_pic(h, pic, 0); - current_is_long=1; + h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr; + h->long_ref[ mmco[i].long_arg ]->long_ref=1; + h->long_ref_count++; + } + + s->current_picture_ptr->reference |= s->picture_structure; + current_ref_assigned=1; break; case MMCO_SET_MAX_LONG: - assert(mmco[i].long_index <= 16); + assert(mmco[i].long_arg <= 16); // just remove the long term which index is greater than new max - for(j = mmco[i].long_index; j<16; j++){ + for(j = mmco[i].long_arg; j<16; j++){ pic = remove_long(h, j); - if (pic) unreference_pic(h, pic); + if (pic) unreference_pic(h, pic, 0); } break; case MMCO_RESET: while(h->short_ref_count){ pic= remove_short(h, h->short_ref[0]->frame_num); - if(pic) unreference_pic(h, pic); + if(pic) unreference_pic(h, pic, 0); } for(j = 0; j < 16; j++) { pic= remove_long(h, j); - if(pic) unreference_pic(h, pic); + if(pic) unreference_pic(h, pic, 0); } break; default: assert(0); } } - if(!current_is_long){ + if (!current_ref_assigned && FIELD_PICTURE && + !s->first_field && s->current_picture_ptr->reference) { + + /* Second field of complementary field pair; the first field of + * which is already referenced. If short referenced, it + * should be first entry in short_ref. If not, it must exist + * in long_ref; trying to put it on the short list here is an + * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3). 
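Also worth noting in the MMCO_LONG case and the second-field handling above: the current picture's reference marking is accumulated one field at a time with reference |= picture_structure, so it only equals PICT_FRAME once both fields of a complementary pair have been marked. A toy illustration:

#include <stdio.h>

#define PICT_TOP_FIELD    1
#define PICT_BOTTOM_FIELD 2
#define PICT_FRAME        (PICT_TOP_FIELD | PICT_BOTTOM_FIELD)

int main(void)
{
    int reference = 0;

    reference |= PICT_TOP_FIELD;     /* first field marked as reference */
    printf("%s\n", reference == PICT_FRAME ? "frame" : "single field");

    reference |= PICT_BOTTOM_FIELD;  /* second field of the pair */
    printf("%s\n", reference == PICT_FRAME ? "frame" : "single field");
    return 0;
}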
+ */ + if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) { + /* Just mark the second field valid */ + s->current_picture_ptr->reference = PICT_FRAME; + } else if (s->current_picture_ptr->long_ref) { + av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference " + "assignment for second field " + "in complementary field pair " + "(first field is long term)\n"); + } else { + /* + * First field in reference, but not in any sensible place on our + * reference lists. This shouldn't happen unless reference + * handling somewhere else is wrong. + */ + assert(0); + } + current_ref_assigned = 1; + } + + if(!current_ref_assigned){ pic= remove_short(h, s->current_picture_ptr->frame_num); if(pic){ - unreference_pic(h, pic); + unreference_pic(h, pic, 0); av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n"); } @@ -4344,6 +3579,32 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){ h->short_ref[0]= s->current_picture_ptr; h->short_ref[0]->long_ref=0; h->short_ref_count++; + s->current_picture_ptr->reference |= s->picture_structure; + } + + if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){ + + /* We have too many reference frames, probably due to corrupted + * stream. Need to discard one frame. Prevents overrun of the + * short_ref and long_ref buffers. + */ + av_log(h->s.avctx, AV_LOG_ERROR, + "number of reference frames exceeds max (probably " + "corrupt input), discarding one\n"); + + if (h->long_ref_count) { + for (i = 0; i < 16; ++i) + if (h->long_ref[i]) + break; + + assert(i < 16); + pic = h->long_ref[i]; + remove_long_at_index(h, i); + } else { + pic = h->short_ref[h->short_ref_count - 1]; + remove_short_at_index(h, h->short_ref_count - 1); + } + unreference_pic(h, pic, 0); } print_short_term(h); @@ -4351,39 +3612,39 @@ static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){ return 0; } -static int decode_ref_pic_marking(H264Context *h){ +static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){ MpegEncContext * const s = &h->s; int i; if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields - s->broken_link= get_bits1(&s->gb) -1; - h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx - if(h->mmco[0].long_index == -1) + s->broken_link= get_bits1(gb) -1; + h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx + if(h->mmco[0].long_arg == -1) h->mmco_index= 0; else{ h->mmco[0].opcode= MMCO_LONG; h->mmco_index= 1; } }else{ - if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag + if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag for(i= 0; i<MAX_MMCO_COUNT; i++) { - MMCOOpcode opcode= get_ue_golomb(&s->gb);; + MMCOOpcode opcode= get_ue_golomb(gb); h->mmco[i].opcode= opcode; if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){ - h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields -/* if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){ + h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1); +/* if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){ av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco); return -1; }*/ } if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){ - unsigned int long_index= 
get_ue_golomb(&s->gb); - if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){ + unsigned int long_arg= get_ue_golomb(gb); + if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){ av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode); return -1; } - h->mmco[i].long_index= long_index; + h->mmco[i].long_arg= long_arg; } if(opcode > (unsigned)MMCO_LONG){ @@ -4397,10 +3658,17 @@ static int decode_ref_pic_marking(H264Context *h){ }else{ assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count); - if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields + if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count && + !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) { h->mmco[0].opcode= MMCO_SHORT2UNUSED; - h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num; + h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num; h->mmco_index= 1; + if (FIELD_PICTURE) { + h->mmco[0].short_pic_num *= 2; + h->mmco[1].opcode= MMCO_SHORT2UNUSED; + h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1; + h->mmco_index= 2; + } }else h->mmco_index= 0; } @@ -4488,37 +3756,135 @@ static int init_poc(H264Context *h){ field_poc[1]= poc; } - if(s->picture_structure != PICT_BOTTOM_FIELD) + if(s->picture_structure != PICT_BOTTOM_FIELD) { s->current_picture_ptr->field_poc[0]= field_poc[0]; - if(s->picture_structure != PICT_TOP_FIELD) + s->current_picture_ptr->poc = field_poc[0]; + } + if(s->picture_structure != PICT_TOP_FIELD) { s->current_picture_ptr->field_poc[1]= field_poc[1]; - if(s->picture_structure == PICT_FRAME) // FIXME field pix? 
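The implicit sliding window above now works in picture numbers rather than frame numbers: in field coding curr_pic_num and max_pic_num are doubled, and dropping the oldest frame takes two MMCO_SHORT2UNUSED commands (pic nums 2*frame_num and 2*frame_num + 1). A sketch of the wrap-around arithmetic, with hypothetical example values:

#include <stdio.h>

/* difference_of_pic_nums_minus1 counts backwards from the current picture,
 * modulo max_pic_num (1<<log2_max_frame_num for frames, doubled for fields). */
static int short_pic_num(int curr_pic_num, int max_pic_num,
                         int difference_of_pic_nums_minus1)
{
    return (curr_pic_num - difference_of_pic_nums_minus1 - 1)
           & (max_pic_num - 1);
}

int main(void)
{
    /* frame coding: log2_max_frame_num = 4, current frame_num = 2 */
    printf("%d\n", short_pic_num(2, 16, 2));          /* wraps around to 15 */
    /* field coding at the same position: numbers are doubled and the
     * current field occupies the odd "same parity" slot */
    printf("%d\n", short_pic_num(2 * 2 + 1, 32, 2));  /* 2: a single field  */
    return 0;
}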
- s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]); + s->current_picture_ptr->poc = field_poc[1]; + } + if(!FIELD_PICTURE || !s->first_field) { + Picture *cur = s->current_picture_ptr; + cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]); + } return 0; } + +/** + * initialize scan tables + */ +static void init_scan_tables(H264Context *h){ + MpegEncContext * const s = &h->s; + int i; + if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly + memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); + memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t)); + }else{ + for(i=0; i<16; i++){ +#define T(x) (x>>2) | ((x<<2) & 0xF) + h->zigzag_scan[i] = T(zigzag_scan[i]); + h-> field_scan[i] = T( field_scan[i]); +#undef T + } + } + if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){ + memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t)); + memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t)); + memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t)); + memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t)); + }else{ + for(i=0; i<64; i++){ +#define T(x) (x>>3) | ((x&7)<<3) + h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]); + h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]); + h->field_scan8x8[i] = T(field_scan8x8[i]); + h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]); +#undef T + } + } + if(h->sps.transform_bypass){ //FIXME same ugly + h->zigzag_scan_q0 = zigzag_scan; + h->zigzag_scan8x8_q0 = zigzag_scan8x8; + h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc; + h->field_scan_q0 = field_scan; + h->field_scan8x8_q0 = field_scan8x8; + h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc; + }else{ + h->zigzag_scan_q0 = h->zigzag_scan; + h->zigzag_scan8x8_q0 = h->zigzag_scan8x8; + h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc; + h->field_scan_q0 = h->field_scan; + h->field_scan8x8_q0 = h->field_scan8x8; + h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc; + } +} + +/** + * Replicates H264 "master" context to thread contexts. + */ +static void clone_slice(H264Context *dst, H264Context *src) +{ + memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset)); + dst->s.current_picture_ptr = src->s.current_picture_ptr; + dst->s.current_picture = src->s.current_picture; + dst->s.linesize = src->s.linesize; + dst->s.uvlinesize = src->s.uvlinesize; + dst->s.first_field = src->s.first_field; + + dst->prev_poc_msb = src->prev_poc_msb; + dst->prev_poc_lsb = src->prev_poc_lsb; + dst->prev_frame_num_offset = src->prev_frame_num_offset; + dst->prev_frame_num = src->prev_frame_num; + dst->short_ref_count = src->short_ref_count; + + memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref)); + memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref)); + memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list)); + memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list)); + + memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff)); + memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff)); +} + /** * decodes a slice header. 
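init_scan_tables() above permutes the scan tables once, instead of permuting every block, when the platform IDCT stores coefficients transposed; for 4x4 scans T(x) swaps the two 2-bit halves of the offset, i.e. exchanges row and column (the 8x8 variant swaps 3-bit halves the same way). A standalone sketch using the standard 4x4 zigzag order:

#include <stdio.h>

/* offset = row*4 + col; swapping the 2-bit halves transposes the block */
#define T4(x) (((x) >> 2) | (((x) << 2) & 0xF))

static const unsigned char zigzag_scan4x4[16] = {
     0,  1,  4,  8,
     5,  2,  3,  6,
     9, 12, 13, 10,
     7, 11, 14, 15,
};

int main(void)
{
    int i;
    for (i = 0; i < 16; i++)   /* print the transposed scan table */
        printf("%2d%c", T4(zigzag_scan4x4[i]), i % 4 == 3 ? '\n' : ' ');
    return 0;
}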
* this will allso call MPV_common_init() and frame_start() as needed + * + * @param h h264context + * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding) + * + * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded */ -static int decode_slice_header(H264Context *h){ +static int decode_slice_header(H264Context *h, H264Context *h0){ MpegEncContext * const s = &h->s; + MpegEncContext * const s0 = &h0->s; unsigned int first_mb_in_slice; unsigned int pps_id; int num_ref_idx_active_override_flag; static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE}; - unsigned int slice_type, tmp; + unsigned int slice_type, tmp, i; int default_ref_list_done = 0; + int last_pic_structure; - s->current_picture.reference= h->nal_ref_idc != 0; s->dropable= h->nal_ref_idc == 0; + if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){ + s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab; + s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab; + }else{ + s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab; + s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab; + } + first_mb_in_slice= get_ue_golomb(&s->gb); if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){ - h->slice_num = 0; - s->current_picture_ptr= NULL; + h0->current_slice = 0; + if (!s0->first_field) + s->current_picture_ptr= NULL; } slice_type= get_ue_golomb(&s->gb); @@ -4534,31 +3900,36 @@ static int decode_slice_header(H264Context *h){ slice_type= slice_type_map[ slice_type ]; if (slice_type == I_TYPE - || (h->slice_num != 0 && slice_type == h->slice_type) ) { + || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) { default_ref_list_done = 1; } h->slice_type= slice_type; s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though + if (s->pict_type == B_TYPE && s0->last_picture_ptr == NULL) { + av_log(h->s.avctx, AV_LOG_ERROR, + "B picture before any references, skipping\n"); + return -1; + } pps_id= get_ue_golomb(&s->gb); if(pps_id>=MAX_PPS_COUNT){ av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n"); return -1; } - h->pps= h->pps_buffer[pps_id]; - if(h->pps.slice_group_count == 0){ + if(!h0->pps_buffers[pps_id]) { av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n"); return -1; } + h->pps= *h0->pps_buffers[pps_id]; - h->sps= h->sps_buffer[ h->pps.sps_id ]; - if(h->sps.log2_max_frame_num == 0){ + if(!h0->sps_buffers[h->pps.sps_id]) { av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n"); return -1; } + h->sps = *h0->sps_buffers[h->pps.sps_id]; - if(h->dequant_coeff_pps != pps_id){ + if(h == h0 && h->dequant_coeff_pps != pps_id){ h->dequant_coeff_pps = pps_id; init_dequant_tables(h); } @@ -4577,58 +3948,35 @@ static int decode_slice_header(H264Context *h){ if (s->context_initialized && ( s->width != s->avctx->width || s->height != s->avctx->height)) { + if(h != h0) + return -1; // width / height changed during parallelized decoding free_tables(h); MPV_common_end(s); } if (!s->context_initialized) { + if(h != h0) + return -1; // we cant (re-)initialize context during parallel decoding if (MPV_common_init(s) < 0) return -1; + s->first_field = 0; - if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly - memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); - memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t)); - }else{ - int i; - for(i=0; i<16; i++){ -#define T(x) (x>>2) | ((x<<2) & 0xF) - h->zigzag_scan[i] = T(zigzag_scan[i]); - h-> field_scan[i] = T( field_scan[i]); -#undef T - 
} - } - if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){ - memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t)); - memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t)); - memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t)); - memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t)); - }else{ - int i; - for(i=0; i<64; i++){ -#define T(x) (x>>3) | ((x&7)<<3) - h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]); - h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]); - h->field_scan8x8[i] = T(field_scan8x8[i]); - h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]); -#undef T - } - } - if(h->sps.transform_bypass){ //FIXME same ugly - h->zigzag_scan_q0 = zigzag_scan; - h->zigzag_scan8x8_q0 = zigzag_scan8x8; - h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc; - h->field_scan_q0 = field_scan; - h->field_scan8x8_q0 = field_scan8x8; - h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc; - }else{ - h->zigzag_scan_q0 = h->zigzag_scan; - h->zigzag_scan8x8_q0 = h->zigzag_scan8x8; - h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc; - h->field_scan_q0 = h->field_scan; - h->field_scan8x8_q0 = h->field_scan8x8; - h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc; + init_scan_tables(h); + alloc_tables(h); + + for(i = 1; i < s->avctx->thread_count; i++) { + H264Context *c; + c = h->thread_context[i] = av_malloc(sizeof(H264Context)); + memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext)); + memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext)); + c->sps = h->sps; + c->pps = h->pps; + init_scan_tables(c); + clone_tables(c, h); } - alloc_tables(h); + for(i = 0; i < s->avctx->thread_count; i++) + if(context_init(h->thread_context[i]) < 0) + return -1; s->avctx->width = s->width; s->avctx->height = s->height; @@ -4645,42 +3993,90 @@ static int decode_slice_header(H264Context *h){ } } - if(h->slice_num == 0){ - if(frame_start(h) < 0) - return -1; - } - - s->current_picture_ptr->frame_num= //FIXME frame_num cleanup h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num); h->mb_mbaff = 0; h->mb_aff_frame = 0; + last_pic_structure = s0->picture_structure; if(h->sps.frame_mbs_only_flag){ s->picture_structure= PICT_FRAME; }else{ if(get_bits1(&s->gb)) { //field_pic_flag s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag - av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n"); } else { s->picture_structure= PICT_FRAME; h->mb_aff_frame = h->sps.mb_aff; } } + + if(h0->current_slice == 0){ + /* See if we have a decoded first field looking for a pair... */ + if (s0->first_field) { + assert(s0->current_picture_ptr); + assert(s0->current_picture_ptr->data[0]); + assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF); + + /* figure out if we have a complementary field pair */ + if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) { + /* + * Previous field is unmatched. Don't display it, but let it + * remain for reference if marked as such. + */ + s0->current_picture_ptr = NULL; + s0->first_field = FIELD_PICTURE; + + } else { + if (h->nal_ref_idc && + s0->current_picture_ptr->reference && + s0->current_picture_ptr->frame_num != h->frame_num) { + /* + * This and previous field were reference, but had + * different frame_nums. Consider this field first in + * pair. Throw away previous field except for reference + * purposes. 
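The first_field bookkeeping introduced above decides whether an incoming field completes a complementary pair or has to start a new one. A condensed model of that decision (flag names and return values are illustrative, not the decoder's):

#include <stdio.h>

enum pair_action { START_NEW, SECOND_FIELD, DISCARD_PREV_START_NEW };

static enum pair_action pair_decision(int have_first_field,
                                      int field_coding,
                                      int same_structure_as_prev,
                                      int both_are_reference,
                                      int same_frame_num)
{
    if (!have_first_field)
        return START_NEW;                /* a frame, or the first field      */
    if (!field_coding || same_structure_as_prev)
        return DISCARD_PREV_START_NEW;   /* previous field stays unmatched   */
    if (both_are_reference && !same_frame_num)
        return DISCARD_PREV_START_NEW;   /* ref fields must share frame_num  */
    return SECOND_FIELD;                 /* complementary field pair         */
}

int main(void)
{
    printf("%d\n", pair_decision(1, 1, 0, 1, 1));  /* 1: completes the pair */
    printf("%d\n", pair_decision(1, 1, 1, 1, 1));  /* 2: same parity twice  */
    return 0;
}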
+ */ + s0->first_field = 1; + s0->current_picture_ptr = NULL; + + } else { + /* Second field in complementary pair */ + s0->first_field = 0; + } + } + + } else { + /* Frame or first field in a potentially complementary pair */ + assert(!s0->current_picture_ptr); + s0->first_field = FIELD_PICTURE; + } + + if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) { + s0->first_field = 0; + return -1; + } + } + if(h != h0) + clone_slice(h, h0); + + s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup + assert(s->mb_num == s->mb_width * s->mb_height); - if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num || + if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num || first_mb_in_slice >= s->mb_num){ av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n"); return -1; } s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width; - s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame; + s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE; + if (s->picture_structure == PICT_BOTTOM_FIELD) + s->resync_mb_y = s->mb_y = s->mb_y + 1; assert(s->mb_y < s->mb_height); if(s->picture_structure==PICT_FRAME){ h->curr_pic_num= h->frame_num; h->max_pic_num= 1<< h->sps.log2_max_frame_num; }else{ - h->curr_pic_num= 2*h->frame_num; + h->curr_pic_num= 2*h->frame_num + 1; h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1); } @@ -4716,8 +4112,6 @@ static int decode_slice_header(H264Context *h){ if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){ if(h->slice_type == B_TYPE){ h->direct_spatial_mv_pred= get_bits1(&s->gb); - if(h->sps.mb_aff && h->direct_spatial_mv_pred) - av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n"); } num_ref_idx_active_override_flag= get_bits1(&s->gb); @@ -4754,8 +4148,8 @@ static int decode_slice_header(H264Context *h){ else h->use_weight = 0; - if(s->current_picture.reference) - decode_ref_pic_marking(h); + if(h->nal_ref_idc) + decode_ref_pic_marking(h0, &s->gb); if(FRAME_MBAFF) fill_mbaff_ref_list(h); @@ -4776,7 +4170,8 @@ static int decode_slice_header(H264Context *h){ return -1; } s->qscale= tmp; - h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale); + h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale); + h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale); //FIXME qscale / qp ... stuff if(h->slice_type == SP_TYPE){ get_bits1(&s->gb); /* sp_for_switch_flag */ @@ -4803,21 +4198,39 @@ static int decode_slice_header(H264Context *h){ h->slice_beta_offset = get_se_golomb(&s->gb) << 1; } } + if( s->avctx->skip_loop_filter >= AVDISCARD_ALL ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE) ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type == B_TYPE) ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0)) h->deblocking_filter= 0; + if(h->deblocking_filter == 1 && h0->max_contexts > 1) { + if(s->avctx->flags2 & CODEC_FLAG2_FAST) { + /* Cheat slightly for speed: + Do not bother to deblock across slices. 
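deblocking_filter == 2, introduced here, means "filter, but never across a slice boundary", which is what allows slices to be decoded in parallel without serializing on the loop filter; the boundary test is simply a slice_table comparison against the left and top neighbours (as in the xchg_mb_border() and filter_mb_fast() hunks of this patch). A toy illustration with a hypothetical 4x4-macroblock slice map:

#include <stdio.h>

#define MB_STRIDE 4

/* two slices: rows 0-1 belong to slice 0, rows 2-3 to slice 1 */
static const int slice_table[16] = {
    0, 0, 0, 0,
    0, 0, 0, 0,
    1, 1, 1, 1,
    1, 1, 1, 1,
};

/* The top edge of a macroblock is filtered only if the macroblock above
 * belongs to the same slice. */
static int deblock_top_edge(int mb_x, int mb_y)
{
    int mb_xy = mb_x + mb_y * MB_STRIDE;
    return mb_y > 0 && slice_table[mb_xy] == slice_table[mb_xy - MB_STRIDE];
}

int main(void)
{
    printf("top edge of (1,1): %d\n", deblock_top_edge(1, 1));  /* 1: filtered       */
    printf("top edge of (1,2): %d\n", deblock_top_edge(1, 2));  /* 0: slice boundary */
    return 0;
}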
*/ + h->deblocking_filter = 2; + } else { + h0->max_contexts = 1; + if(!h0->single_decode_warning) { + av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n"); + h0->single_decode_warning = 1; + } + if(h != h0) + return 1; // deblocking switched inside frame + } + } + #if 0 //FMO if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5) slice_group_change_cycle= get_bits(&s->gb, ?); #endif - h->slice_num++; + h0->last_slice_type = slice_type; + h->slice_num = ++h0->current_slice; h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; - h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width; + h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width; if(s->avctx->debug&FF_DEBUG_PICT_INFO){ av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n", @@ -4835,14 +4248,6 @@ static int decode_slice_header(H264Context *h){ ); } - if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){ - s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab; - s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab; - }else{ - s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab; - s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab; - } - return 0; } @@ -5161,7 +4566,7 @@ decode_intra_mb: if(IS_INTRA_PCM(mb_type)){ unsigned int x, y; - // we assume these blocks are very rare so we dont optimize it + // We assume these blocks are very rare so we do not optimize it. align_get_bits(&s->gb); // The pixels are stored in the same order as levels in h->mb array. @@ -5189,7 +4594,8 @@ decode_intra_mb: // In deblocking, the quantizer is 0 s->current_picture.qscale_table[mb_xy]= 0; - h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0); + h->chroma_qp[0] = get_chroma_qp(h, 0, 0); + h->chroma_qp[1] = get_chroma_qp(h, 1, 0); // All coeffs are present memset(h->non_zero_count[mb_xy], 16, 16); @@ -5299,8 +4705,6 @@ decode_intra_mb: dct8x8_allowed = get_dct8x8_allowed(h); for(list=0; list<h->list_count; list++){ - const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list]; - for(i=0; i<4; i++){ if(IS_DIRECT(h->sub_mb_type[i])) { h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ]; @@ -5465,7 +4869,7 @@ decode_intra_mb: if(cbp || IS_INTRA16x16(mb_type)){ int i8x8, i4x4, chroma_idx; - int chroma_qp, dquant; + int dquant; GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr; const uint8_t *scan, *scan8x8, *dc_scan; @@ -5494,7 +4898,8 @@ decode_intra_mb: else s->qscale-= 52; } - h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale); + h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale); + h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale); if(IS_INTRA16x16(mb_type)){ if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ return -1; //FIXME continue if partitioned and other return -1 too @@ -5552,9 +4957,10 @@ decode_intra_mb: if(cbp&0x20){ for(chroma_idx=0; chroma_idx<2; chroma_idx++){ + const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]]; for(i4x4=0; i4x4<4; i4x4++){ const int index= 16 + 4*chroma_idx + i4x4; - if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 
0:3)][chroma_qp], 15) < 0){ + if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){ return -1; } } @@ -5713,7 +5119,7 @@ static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) { }else{ int mb_xy = mb_x + mb_y*s->mb_stride; mba_xy = mb_xy - 1; - mbb_xy = mb_xy - s->mb_stride; + mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); } if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )) @@ -5766,65 +5172,20 @@ static int decode_cabac_mb_chroma_pre_mode( H264Context *h) { return 3; } -static const uint8_t block_idx_x[16] = { - 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3 -}; -static const uint8_t block_idx_y[16] = { - 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 -}; -static const uint8_t block_idx_xy[4][4] = { - { 0, 2, 8, 10}, - { 1, 3, 9, 11}, - { 4, 6, 12, 14}, - { 5, 7, 13, 15} -}; - static int decode_cabac_mb_cbp_luma( H264Context *h) { - int cbp = 0; - int cbp_b = -1; - int i8x8; - - if( h->slice_table[h->top_mb_xy] == h->slice_num ) { - cbp_b = h->top_cbp; - tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b); - } - - for( i8x8 = 0; i8x8 < 4; i8x8++ ) { - int cbp_a = -1; - int x, y; - int ctx = 0; - - x = block_idx_x[4*i8x8]; - y = block_idx_y[4*i8x8]; - - if( x > 0 ) - cbp_a = cbp; - else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) { - cbp_a = h->left_cbp; - tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a); - } - - if( y > 0 ) - cbp_b = cbp; - - /* No need to test for skip as we put 0 for skip block */ - /* No need to test for IPCM as we put 1 for IPCM block */ - if( cbp_a >= 0 ) { - int i8x8a = block_idx_xy[(x-1)&0x03][y]/4; - if( ((cbp_a >> i8x8a)&0x01) == 0 ) - ctx++; - } - - if( cbp_b >= 0 ) { - int i8x8b = block_idx_xy[x][(y-1)&0x03]/4; - if( ((cbp_b >> i8x8b)&0x01) == 0 ) - ctx += 2; - } - - if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) { - cbp |= 1 << i8x8; - } - } + int cbp_b, cbp_a, ctx, cbp = 0; + + cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1; + cbp_b = h->slice_table[h->top_mb_xy] == h->slice_num ? 
h->top_cbp : -1; + + ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); + cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]); + ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); + cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1; + ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); + cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2; + ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); + cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3; return cbp; } static int decode_cabac_mb_cbp_chroma( H264Context *h) { @@ -5846,16 +5207,9 @@ static int decode_cabac_mb_cbp_chroma( H264Context *h) { return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ); } static int decode_cabac_mb_dqp( H264Context *h) { - MpegEncContext * const s = &h->s; - int mbn_xy; int ctx = 0; int val = 0; - if( s->mb_x > 0 ) - mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1; - else - mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride; - if( h->last_qscale_diff != 0 ) ctx++; @@ -5978,7 +5332,7 @@ static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) { return get_cabac_bypass_sign( &h->cabac, -mvd ); } -static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) { +static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) { int nza, nzb; int ctx = 0; @@ -6006,14 +5360,14 @@ static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) { return ctx + 4 * cat; } -static const __attribute((used)) uint8_t last_coeff_flag_offset_8x8[63] = { +DECLARE_ASM_CONST(1, const uint8_t, last_coeff_flag_offset_8x8[63]) = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; -static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) { +static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) { const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride; static const int significant_coeff_flag_offset[2][6] = { { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, @@ -6039,7 +5393,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n int index[64]; - int last; + int av_unused last; int coeff_count = 0; int abslevel1 = 1; @@ -6083,7 +5437,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n h->cabac.low = cc.low ; h->cabac.bytestream= cc.bytestream; #endif - return 0; + return; } } @@ -6111,7 +5465,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n index[coeff_count++] = last;\ } const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD]; -#if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(CONFIG_EBX_AVAILABLE) && !( defined(ARCH_X86_64) && defined(PIC) ) +#if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off); } else { coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index); @@ -6144,7 +5498,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n if( !qmul ) { block[j] = get_cabac_bypass_sign( CC, -1); }else{ - block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;; + block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 
6; } abslevel1++; @@ -6184,10 +5538,10 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n h->cabac.low = cc.low ; h->cabac.bytestream= cc.bytestream; #endif - return 0; + } -static void inline compute_mb_neighbors(H264Context *h) +static inline void compute_mb_neighbors(H264Context *h) { MpegEncContext * const s = &h->s; const int mb_xy = s->mb_x + s->mb_y*s->mb_stride; @@ -6209,6 +5563,8 @@ static void inline compute_mb_neighbors(H264Context *h) if (left_mb_frame_flag != curr_mb_frame_flag) { h->left_mb_xy[0] = pair_xy - 1; } + } else if (FIELD_PICTURE) { + h->top_mb_xy -= s->mb_stride; } return; } @@ -6304,7 +5660,7 @@ decode_intra_mb: const uint8_t *ptr; unsigned int x, y; - // We assume these blocks are very rare so we dont optimize it. + // We assume these blocks are very rare so we do not optimize it. // FIXME The two following lines get the bitstream position in the cabac // decode, I think it should be done by a function in cabac.h (or cabac.c). ptr= h->cabac.bytestream; @@ -6343,7 +5699,8 @@ decode_intra_mb: h->chroma_pred_mode_table[mb_xy] = 0; // In deblocking, the quantizer is 0 s->current_picture.qscale_table[mb_xy]= 0; - h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0); + h->chroma_qp[0] = get_chroma_qp(h, 0, 0); + h->chroma_qp[1] = get_chroma_qp(h, 1, 0); // All coeffs are present memset(h->non_zero_count[mb_xy], 16, 16); s->current_picture.mb_type[mb_xy]= mb_type; @@ -6399,6 +5756,10 @@ decode_intra_mb: if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] | h->sub_mb_type[2] | h->sub_mb_type[3]) ) { pred_direct_motion(h, &mb_type); + h->ref_cache[0][scan8[4]] = + h->ref_cache[1][scan8[4]] = + h->ref_cache[0][scan8[12]] = + h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) { for( i = 0; i < 4; i++ ) if( IS_DIRECT(h->sub_mb_type[i]) ) @@ -6434,11 +5795,11 @@ decode_intra_mb: for(list=0; list<h->list_count; list++){ for(i=0; i<4; i++){ + h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]; if(IS_DIRECT(h->sub_mb_type[i])){ fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4); continue; } - h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]; if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){ const int sub_mb_type= h->sub_mb_type[i]; @@ -6597,6 +5958,7 @@ decode_intra_mb: if( cbp || IS_INTRA16x16( mb_type ) ) { const uint8_t *scan, *scan8x8, *dc_scan; + const uint32_t *qmul; int dqp; if(IS_INTERLACED(mb_type)){ @@ -6619,18 +5981,19 @@ decode_intra_mb: if(s->qscale<0) s->qscale+= 52; else s->qscale-= 52; } - h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale); + h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale); + h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale); if( IS_INTRA16x16( mb_type ) ) { int i; //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); - if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0) - return -1; + decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16); + if( cbp&15 ) { + qmul = h->dequant4_coeff[0][s->qscale]; for( i = 0; i < 16; i++ ) { //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i ); - if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ) - return -1; + decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15); } } else { fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); @@ -6640,17 +6003,17 @@ decode_intra_mb: for( i8x8 = 0; i8x8 < 4; i8x8++ ) { if( cbp & (1<<i8x8) ) { if( 
IS_8x8DCT(mb_type) ) { - if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8, - scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 ) - return -1; - } else - for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - const int index = 4*i8x8 + i4x4; - //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); + decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8, + scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64); + } else { + qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale]; + for( i4x4 = 0; i4x4 < 4; i4x4++ ) { + const int index = 4*i8x8 + i4x4; + //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); //START_TIMER - if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 ) - return -1; + decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16); //STOP_TIMER("decode_residual") + } } } else { uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; @@ -6663,19 +6026,18 @@ decode_intra_mb: int c; for( c = 0; c < 2; c++ ) { //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); - if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0) - return -1; + decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4); } } if( cbp&0x20 ) { int c, i; for( c = 0; c < 2; c++ ) { + qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]]; for( i = 0; i < 4; i++ ) { const int index = 16 + 4 * c + i; //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); - if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0) - return -1; + decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15); } } } else { @@ -7009,23 +6371,27 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { MpegEncContext * const s = &h->s; + int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD; int mb_xy, mb_type; int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh; - if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) { + mb_xy = mb_x + mb_y*s->mb_stride; + + if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff || + (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] || + h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) { filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize); return; } assert(!FRAME_MBAFF); - mb_xy = mb_x + mb_y*s->mb_stride; mb_type = s->current_picture.mb_type[mb_xy]; qp = s->current_picture.qscale_table[mb_xy]; qp0 = s->current_picture.qscale_table[mb_xy-1]; qp1 = s->current_picture.qscale_table[h->top_mb_xy]; - qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp ); - qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 ); - qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 ); + qpc = get_chroma_qp( h, 0, qp ); + qpc0 = get_chroma_qp( h, 0, qp0 ); + qpc1 = get_chroma_qp( h, 0, qp1 ); qp0 = (qp + qp0 + 1) >> 1; qp1 = (qp + qp1 + 1) >> 1; qpc0 = (qpc + qpc0 + 1) >> 1; @@ -7038,17 +6404,18 @@ static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, if( IS_INTRA(mb_type) ) { int16_t bS4[4] = {4,4,4,4}; int16_t bS3[4] = {3,3,3,3}; + int16_t *bSH = FIELD_PICTURE ? 
        if( IS_8x8DCT(mb_type) ) {
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
-           filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
+           filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
        } else {
            filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
            filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
            filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
-           filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
+           filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
            filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
            filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
@@ -7057,9 +6424,9 @@ static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
        filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
        filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
        filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
-       filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
+       filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
-       filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
+       filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
        filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
        return;
    } else {
@@ -7083,7 +6450,7 @@ static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
        if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
            bSv[0][0] = 0x0004000400040004ULL;
        if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
-           bSv[1][0] = 0x0004000400040004ULL;
+           bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;

 #define FILTER(hv,dir,edge)\
        if(bSv[dir][edge]) {\
@@ -7131,7 +6498,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
    //for sufficiently low qp, filtering wouldn't do anything
    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
    if(!FRAME_MBAFF){
-       int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
+       int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
        int qp = s->current_picture.qscale_table[mb_xy];
        if(qp <= qp_thresh
           && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
@@ -7154,7 +6521,8 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
        const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
        int16_t bS[8];
        int qp[2];
-       int chroma_qp[2];
+       int bqp[2];
+       int rqp[2];
        int mb_qp, mbn0_qp, mbn1_qp;
        int i;
        first_vertical_edge_done = 1;
@@ -7180,18 +6548,22 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
        mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
        mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
        qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
-       chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
-                        get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
+       bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
+                  get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
+       rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
+                  get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
        qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
-       chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
-                        get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
+       bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
+                  get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
+       rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
+                  get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;

        /* Filter edge */
-       tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
+       tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
        { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
        filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
-       filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
-       filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
+       filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
+       filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
    }
    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
    for( dir = 0; dir < 2; dir++ )
@@ -7229,7 +6601,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
            unsigned int tmp_linesize   = 2 *   linesize;
            unsigned int tmp_uvlinesize = 2 * uvlinesize;
            int mbn_xy = mb_xy - 2 * s->mb_stride;
-           int qp, chroma_qp;
+           int qp;
            int i, j;
            int16_t bS[4];
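In the deblocking hunks above, the chroma QP applied to an edge becomes the rounded average of the two neighbouring macroblocks' per-plane chroma QPs (computed separately for Cb and Cr). A tiny standalone sketch of that rounding, with hypothetical inputs:

#include <stdio.h>

/* QP used on a shared edge: the average of the two macroblocks' (chroma) QPs,
 * with +1 so the average rounds to nearest instead of truncating. */
static int edge_qp(int qp_a, int qp_b)
{
    return (qp_a + qp_b + 1) >> 1;
}

int main(void)
{
    printf("%d\n", edge_qp(30, 33));   /* hypothetical neighbours -> 32 */
    return 0;
}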
@@ -7253,10 +6625,10 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
            tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
            { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
            filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
-           chroma_qp = ( h->chroma_qp +
-                         get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-           filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
-           filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
+           filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
+                             ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+           filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
+                             ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
        }

        start = 1;
@@ -7353,25 +6725,25 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
            if( dir == 0 ) {
                filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
                if( (edge&1) == 0 ) {
-                   int chroma_qp = ( h->chroma_qp +
-                                     get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                   filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
-                   filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
+                   filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
+                                     ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+                   filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
+                                     ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                }
            } else {
                filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
                if( (edge&1) == 0 ) {
-                   int chroma_qp = ( h->chroma_qp +
-                                     get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
-                   filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
-                   filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
+                   filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
+                                     ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+                   filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
+                                     ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
                }
            }
        }
    }
 }

-static int decode_slice(H264Context *h){
+static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
    MpegEncContext * const s = &h->s;
    const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
@@ -7421,7 +6793,7 @@ static int decode_slice(H264Context *h){
            eos = get_cabac_terminate( &h->cabac );

            if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
-               av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
+               av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
                ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
                return -1;
            }
@@ -7430,7 +6802,7 @@
                s->mb_x = 0;
                ff_draw_horiz_band(s, 16*s->mb_y, 16);
                ++s->mb_y;
-               if(FRAME_MBAFF) {
+               if(FIELD_OR_MBAFF_PICTURE) {
                    ++s->mb_y;
                }
            }
@@ -7467,7 +6839,7 @@
                s->mb_x=0;
                ff_draw_horiz_band(s, 16*s->mb_y, 16);
                ++s->mb_y;
-               if(FRAME_MBAFF) {
+               if(FIELD_OR_MBAFF_PICTURE) {
                    ++s->mb_y;
                }
                if(s->mb_y >= s->mb_height){
@@ -7636,7 +7008,7 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps){
        if( aspect_ratio_idc == EXTENDED_SAR ) {
            sps->sar.num= get_bits(&s->gb, 16);
            sps->sar.den= get_bits(&s->gb, 16);
-       }else if(aspect_ratio_idc < 14){
+       }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
            sps->sar= pixel_aspect[aspect_ratio_idc];
        }else{
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
@@ -7753,6 +7125,26 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
    }
 }

+/**
+ * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
+ */
+static void *
+alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
+                    const size_t size, const char *name)
+{
+    if(id>=max) {
+        av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
+        return NULL;
+    }
+
+    if(!vec[id]) {
+        vec[id] = av_mallocz(size);
+        if(vec[id] == NULL)
+            av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
+    }
+    return vec[id];
+}
+
 static inline int decode_seq_parameter_set(H264Context *h){
     MpegEncContext * const s = &h->s;
     int profile_idc, level_idc;
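The alloc_parameter_set() helper introduced above validates an SPS/PPS id and lazily allocates the slot on first use. A self-contained sketch of the same pattern (bounds check, allocate on demand, reuse thereafter), using generic names and plain calloc() in place of av_mallocz():

#include <stdio.h>
#include <stdlib.h>

/* Return the entry for 'id' in 'vec', allocating it on first use.
 * Returns NULL if the id is out of range or allocation fails. */
static void *get_or_alloc(void **vec, unsigned id, unsigned max, size_t size)
{
    if (id >= max) {
        fprintf(stderr, "id %u out of range\n", id);
        return NULL;
    }
    if (!vec[id]) {
        vec[id] = calloc(1, size);          /* zero-filled, like av_mallocz() */
        if (!vec[id])
            fprintf(stderr, "cannot allocate entry %u\n", id);
    }
    return vec[id];
}

int main(void)
{
    void *table[32] = { 0 };                /* e.g. one slot per possible sps_id */
    struct { int profile_idc; } *sps = get_or_alloc(table, 3, 32, sizeof(*sps));
    if (sps)
        sps->profile_idc = 100;             /* later lookups return the same slot */
    for (unsigned i = 0; i < 32; i++)
        free(table[i]);
    return 0;
}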
@@ -7769,13 +7161,10 @@ static inline int decode_seq_parameter_set(H264Context *h){
     level_idc= get_bits(&s->gb, 8);

     sps_id= get_ue_golomb(&s->gb);
-    if (sps_id >= MAX_SPS_COUNT){
-        // ok it has gone out of hand, someone is sending us bad stuff.
-        av_log(h->s.avctx, AV_LOG_ERROR, "illegal sps_id (%d)\n", sps_id);
+    sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
+    if(sps == NULL)
         return -1;
-    }
-    sps= &h->sps_buffer[ sps_id ];

     sps->profile_idc= profile_idc;
     sps->level_idc= level_idc;
@@ -7814,8 +7203,9 @@
     }

     tmp= get_ue_golomb(&s->gb);
-    if(tmp > MAX_PICTURE_COUNT-2){
+    if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
+        return -1;
     }
     sps->ref_frame_count= tmp;
     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
@@ -7880,19 +7270,25 @@
     return 0;
 }

+static void
+build_qp_table(PPS *pps, int t, int index)
+{
+    int i;
+    for(i = 0; i < 255; i++)
+        pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
+}
+
 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
     MpegEncContext * const s = &h->s;
     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
     PPS *pps;

-    if(pps_id>=MAX_PPS_COUNT){
-        av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
+    pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
+    if(pps == NULL)
         return -1;
-    }
-    pps = &h->pps_buffer[pps_id];

     tmp= get_ue_golomb(&s->gb);
-    if(tmp>=MAX_SPS_COUNT){
+    if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
         return -1;
     }
@@ -7950,7 +7346,7 @@
     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
     pps->init_qp= get_se_golomb(&s->gb) + 26;
     pps->init_qs= get_se_golomb(&s->gb) + 26;
-    pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
+    pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
     pps->constrained_intra_pred= get_bits1(&s->gb);
     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
@@ -7962,18 +7358,27 @@
     if(get_bits_count(&s->gb) < bit_length){
         pps->transform_8x8_mode= get_bits1(&s->gb);
-        decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
-        get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
+        decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
+        pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
+    } else {
+        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
     }
+    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
+    if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
+        build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
+        h->pps.chroma_qp_diff= 1;
+    } else
+        memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
+
     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
-        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
+        av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
               pps_id, pps->sps_id,
               pps->cabac ? "CABAC" : "CAVLC",
               pps->slice_group_count,
               pps->ref_count[0], pps->ref_count[1],
               pps->weighted_pred ? "weighted" : "",
-              pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
+              pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
              pps->deblocking_filter_parameters_present ? "LPAR" : "",
              pps->constrained_intra_pred ? "CONSTR" : "",
              pps->redundant_pic_cnt_present ? "REDU" : "",
@@ -7985,119 +7390,49 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
 }

 /**
- * finds the end of the current frame in the bitstream.
- * @return the position of the first byte of the next frame, or -1
+ * Call decode_slice() for each context.
+ *
+ * @param h h264 master context
+ * @param context_count number of contexts to execute
  */
-static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
+static void execute_decode_slices(H264Context *h, int context_count){
+    MpegEncContext * const s = &h->s;
+    AVCodecContext * const avctx= s->avctx;
+    H264Context *hx;
     int i;
-    uint32_t state;
-    ParseContext *pc = &(h->s.parse_context);
-//printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
-//    mb_addr= pc->mb_addr - 1;
-    state= pc->state;
-    if(state>13)
-        state= 7;
-
-    for(i=0; i<buf_size; i++){
-        if(state==7){
-            for(; i<buf_size; i++){
-                if(!buf[i]){
-                    state=2;
-                    break;
-                }
-            }
-        }else if(state<=2){
-            if(buf[i]==1)   state^= 5; //2->7, 1->4, 0->5
-            else if(buf[i]) state = 7;
-            else            state>>=1; //2->1, 1->0, 0->0
-        }else if(state<=5){
-            int v= buf[i] & 0x1F;
-            if(v==7 || v==8 || v==9){
-                if(pc->frame_start_found){
-                    i++;
-found:
-                    pc->state=7;
-                    pc->frame_start_found= 0;
-                    return i-(state&5);
-                }
-            }else if(v==1 || v==2 || v==5){
-                if(pc->frame_start_found){
-                    state+=8;
-                    continue;
-                }else
-                    pc->frame_start_found = 1;
-            }
-            state= 7;
-        }else{
-            if(buf[i] & 0x80)
-                goto found;
-            state= 7;
-        }
-    }
-
-    pc->state= state;
-    return END_NOT_FOUND;
-}
-
-#ifdef CONFIG_H264_PARSER
-static int h264_parse(AVCodecParserContext *s,
-                      AVCodecContext *avctx,
-                      uint8_t **poutbuf, int *poutbuf_size,
-                      const uint8_t *buf, int buf_size)
-{
-    H264Context *h = s->priv_data;
-    ParseContext *pc = &h->s.parse_context;
-    int next;
-
-    if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
-        next= buf_size;
-    }else{
-        next= find_frame_end(h, buf, buf_size);
-
-        if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
-            *poutbuf = NULL;
-            *poutbuf_size = 0;
-            return buf_size;
-        }
-        if(next<0){
-            find_frame_end(h, &pc->buffer[pc->last_index + next], -next); //update state
+    if(context_count == 1) {
+        decode_slice(avctx, h);
+    } else {
+        for(i = 1; i < context_count; i++) {
+            hx = h->thread_context[i];
+            hx->s.error_resilience = avctx->error_resilience;
+            hx->s.error_count = 0;
        }
-    }
-
-    *poutbuf = (uint8_t *)buf;
-    *poutbuf_size = buf_size;
-    return next;
-}
-
-static int h264_split(AVCodecContext *avctx,
-                      const uint8_t *buf, int buf_size)
-{
-    int i;
-    uint32_t state = -1;
-    int has_sps= 0;
+        avctx->execute(avctx, (void *)decode_slice,
+                       (void **)h->thread_context, NULL, context_count);

-    for(i=0; i<=buf_size; i++){
-        if((state&0xFFFFFF1F) == 0x107)
-            has_sps=1;
-/*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
-        }*/
-        if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
-            if(has_sps){
-                while(i>4 && buf[i-5]==0) i--;
-                return i-4;
-            }
-        }
-        if (i<buf_size)
-            state= (state<<8) | buf[i];
+        /* pull back stuff from slices to master context */
+        hx = h->thread_context[context_count - 1];
+        s->mb_x = hx->s.mb_x;
+        s->mb_y = hx->s.mb_y;
+        s->dropable = hx->s.dropable;
+        s->picture_structure = hx->s.picture_structure;
+        for(i = 1; i < context_count; i++)
+            h->s.error_count += h->thread_context[i]->s.error_count;
     }
-    }
 }
-#endif /* CONFIG_H264_PARSER */
-static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
+
+static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
     MpegEncContext * const s = &h->s;
     AVCodecContext * const avctx= s->avctx;
     int buf_index=0;
+    H264Context *hx; ///< thread context
+    int context_count = 0;
+
+    h->max_contexts = avctx->thread_count;
 #if 0
     int i;
     for(i=0; i<50; i++){
@@ -8105,54 +7440,58 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
     }
 #endif
     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
-        h->slice_num = 0;
-        s->current_picture_ptr= NULL;
+        h->current_slice = 0;
+        if (!s->first_field)
+            s->current_picture_ptr= NULL;
     }

     for(;;){
        int consumed;
        int dst_length;
        int bit_length;
-        uint8_t *ptr;
+        const uint8_t *ptr;
        int i, nalsize = 0;
-
-        if(h->is_avc) {
-            if(buf_index >= buf_size) break;
-            nalsize = 0;
-            for(i = 0; i < h->nal_length_size; i++)
-                nalsize = (nalsize << 8) | buf[buf_index++];
-            if(nalsize <= 1 || nalsize > buf_size){
-                if(nalsize == 1){
-                    buf_index++;
-                    continue;
-                }else{
-                    av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
-                    break;
+        int err;
+
+        if(h->is_avc) {
+            if(buf_index >= buf_size) break;
+            nalsize = 0;
+            for(i = 0; i < h->nal_length_size; i++)
+                nalsize = (nalsize << 8) | buf[buf_index++];
+            if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
+                if(nalsize == 1){
+                    buf_index++;
+                    continue;
+                }else{
+                    av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
+                    break;
+                }
+            }
+        } else {
+            // start code prefix search
+            for(; buf_index + 3 < buf_size; buf_index++){
+                // This should always succeed in the first iteration.
+                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
+                    break;
            }
-        }
-        } else {
-            // start code prefix search
-            for(; buf_index + 3 < buf_size; buf_index++){
-                // this should allways succeed in the first iteration
-                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
-                    break;
-            }
-            if(buf_index+3 >= buf_size) break;
+            if(buf_index+3 >= buf_size) break;

-            buf_index+=3;
-        }
+            buf_index+=3;
+        }
+
+        hx = h->thread_context[context_count];

-        ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
-        if (ptr==NULL || dst_length <= 0){
+        ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
+        if (ptr==NULL || dst_length < 0){
            return -1;
        }
-        while(ptr[dst_length - 1] == 0 && dst_length > 1)
+        while(ptr[dst_length - 1] == 0 && dst_length > 0)
            dst_length--;
-        bit_length= 8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1);
+        bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));

        if(s->avctx->debug&FF_DEBUG_STARTCODE){
-            av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
+            av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
        }

        if (h->is_avc && (nalsize != consumed))
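The loop above extracts one NAL unit per iteration: for the 'avc1' layout it reads a big-endian length prefix of nal_length_size bytes, while for Annex B streams it scans for the 00 00 01 start code. A standalone sketch of both paths, with buffer handling simplified and purely illustrative names:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Length-prefixed (avc1/mp4) layout: each NAL is preceded by a
 * big-endian size field of length_size bytes (1, 2 or 4). */
static int next_nal_avc(const uint8_t *buf, size_t size, size_t *pos,
                        int length_size, size_t *nal_size)
{
    size_t n = 0;
    if (*pos + length_size > size)
        return -1;
    for (int i = 0; i < length_size; i++)
        n = (n << 8) | buf[(*pos)++];
    if (n == 0 || *pos + n > size)
        return -1;
    *nal_size = n;                  /* NAL payload starts at buf + *pos */
    return 0;
}

/* Annex B layout: NAL units are separated by a 00 00 01 start code. */
static int next_nal_annexb(const uint8_t *buf, size_t size, size_t *pos)
{
    for (size_t i = *pos; i + 3 < size; i++) {
        if (buf[i] == 0 && buf[i + 1] == 0 && buf[i + 2] == 1) {
            *pos = i + 3;           /* payload starts after the start code */
            return 0;
        }
    }
    return -1;
}

int main(void)
{
    const uint8_t annexb[] = { 0, 0, 1, 0x67, 0x42, 0, 0, 1, 0x68 };
    size_t pos = 0;
    while (next_nal_annexb(annexb, sizeof(annexb), &pos) == 0)
        printf("NAL type %d at offset %zu\n", annexb[pos] & 0x1F, pos);
    (void)next_nal_avc;             /* avc1 variant shown for comparison */
    return 0;
}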
@@ -8160,57 +7499,60 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){

        buf_index += consumed;

-        if( (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
+        if( (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
          ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
            continue;

-        switch(h->nal_unit_type){
+      again:
+        err = 0;
+        switch(hx->nal_unit_type){
        case NAL_IDR_SLICE:
+            if (h->nal_unit_type != NAL_IDR_SLICE) {
+                av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
+                return -1;
+            }
            idr(h); //FIXME ensure we don't loose some frames if there is reordering
        case NAL_SLICE:
-            init_get_bits(&s->gb, ptr, bit_length);
-            h->intra_gb_ptr=
-            h->inter_gb_ptr= &s->gb;
-            s->data_partitioning = 0;
-
-            if(decode_slice_header(h) < 0){
-                av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
-                break;
-            }
-            s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
-            if(h->redundant_pic_count==0 && s->hurry_up < 5
-               && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
-               && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
-               && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
+            init_get_bits(&hx->s.gb, ptr, bit_length);
+            hx->intra_gb_ptr=
+            hx->inter_gb_ptr= &hx->s.gb;
+            hx->s.data_partitioning = 0;
+
+            if((err = decode_slice_header(hx, h)))
+                break;
+
+            s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
+            if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
+               && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
+               && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
+               && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
               && avctx->skip_frame < AVDISCARD_ALL)
-                decode_slice(h);
+                context_count++;
            break;
        case NAL_DPA:
-            init_get_bits(&s->gb, ptr, bit_length);
-            h->intra_gb_ptr=
-            h->inter_gb_ptr= NULL;
-            s->data_partitioning = 1;
+            init_get_bits(&hx->s.gb, ptr, bit_length);
+            hx->intra_gb_ptr=
+            hx->inter_gb_ptr= NULL;
+            hx->s.data_partitioning = 1;

-            if(decode_slice_header(h) < 0){
-                av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
-            }
+            err = decode_slice_header(hx, h);
            break;
        case NAL_DPB:
-            init_get_bits(&h->intra_gb, ptr, bit_length);
-            h->intra_gb_ptr= &h->intra_gb;
+            init_get_bits(&hx->intra_gb, ptr, bit_length);
+            hx->intra_gb_ptr= &hx->intra_gb;
            break;
        case NAL_DPC:
-            init_get_bits(&h->inter_gb, ptr, bit_length);
-            h->inter_gb_ptr= &h->inter_gb;
+            init_get_bits(&hx->inter_gb, ptr, bit_length);
+            hx->inter_gb_ptr= &hx->inter_gb;

-            if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
+            if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
              && s->context_initialized
              && s->hurry_up < 5
-               && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
-               && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
-               && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
+               && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
+               && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
+               && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
              && avctx->skip_frame < AVDISCARD_ALL)
-                decode_slice(h);
+                context_count++;
            break;
        case NAL_SEI:
            init_get_bits(&s->gb, ptr, bit_length);
@@ -8240,10 +7582,29 @@ static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
        case NAL_AUXILIARY_SLICE:
            break;
        default:
-            av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
+            av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
+        }
+
+        if(context_count == h->max_contexts) {
+            execute_decode_slices(h, context_count);
+            context_count = 0;
        }
-    }
+        if (err < 0)
+            av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
+        else if(err == 1) {
+            /* Slice could not be decoded in parallel mode, copy down
+             * NAL unit stuff to context 0 and restart. Note that
+             * rbsp_buffer is not transfered, but since we no longer
+             * run in parallel mode this should not be an issue. */
+            h->nal_unit_type = hx->nal_unit_type;
+            h->nal_ref_idc   = hx->nal_ref_idc;
+            hx = h;
+            goto again;
+        }
+    }
+    if(context_count)
+        execute_decode_slices(h, context_count);
    return buf_index;
 }

@@ -8257,7 +7618,7 @@ static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
        return pos;
    }else{
-        if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
+        if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
        if(pos+10>buf_size) pos=buf_size; // oops ;)

        return pos;
@@ -8266,7 +7627,7 @@
 static int decode_frame(AVCodecContext *avctx,
                             void *data, int *data_size,
-                             uint8_t *buf, int buf_size)
+                             const uint8_t *buf, int buf_size)
 {
    H264Context *h = avctx->priv_data;
    MpegEncContext *s = &h->s;
@@ -8302,9 +7663,9 @@ static int decode_frame(AVCodecContext *avctx,
    }

    if(s->flags&CODEC_FLAG_TRUNCATED){
-        int next= find_frame_end(h, buf, buf_size);
+        int next= ff_h264_find_frame_end(h, buf, buf_size);

-        if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
+        if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
            return buf_size;
//printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
    }
@@ -8360,6 +7721,7 @@ static int decode_frame(AVCodecContext *avctx,
            return -1;

    if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
+        if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
        av_log(avctx, AV_LOG_ERROR, "no frame!\n");
        return -1;
    }
@@ -8377,87 +7739,109 @@ static int decode_frame(AVCodecContext *avctx,
        h->prev_frame_num_offset= h->frame_num_offset;
        h->prev_frame_num= h->frame_num;

-        if(s->current_picture_ptr->reference){
+        if(!s->dropable) {
            h->prev_poc_msb= h->poc_msb;
            h->prev_poc_lsb= h->poc_lsb;
-        }
-        if(s->current_picture_ptr->reference)
            execute_ref_pic_marking(h, h->mmco, h->mmco_index);
+        }

-        ff_er_frame_end(s);
+        /*
+         * FIXME: Error handling code does not seem to support interlaced
+         * when slices span multiple rows
+         * The ff_er_add_slice calls don't work right for bottom
+         * fields; they cause massive erroneous error concealing
+         * Error marking covers both fields (top and bottom).
+         * This causes a mismatched s->error_count
+         * and a bad error table. Further, the error count goes to
+         * INT_MAX when called for bottom field, because mb_y is
+         * past end by one (callers fault) and resync_mb_y != 0
+         * causes problems for the first MB line, too.
+         */
+        if (!FIELD_PICTURE)
+            ff_er_frame_end(s);

        MPV_frame_end(s);

-        //FIXME do something with unavailable reference frames
+        if (s->first_field) {
+            /* Wait for second field. */
+            *data_size = 0;

-#if 0 //decode order
-        *data_size = sizeof(AVFrame);
-#else
-        /* Sort B-frames into display order */
+        } else {
+            cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
+            /* Derive top_field_first from field pocs. */
+            cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];

-        if(h->sps.bitstream_restriction_flag
-           && s->avctx->has_b_frames < h->sps.num_reorder_frames){
-            s->avctx->has_b_frames = h->sps.num_reorder_frames;
-            s->low_delay = 0;
-        }
+            //FIXME do something with unavailable reference frames

-        pics = 0;
-        while(h->delayed_pic[pics]) pics++;
+#if 0 //decode order
+            *data_size = sizeof(AVFrame);
+#else
+            /* Sort B-frames into display order */

-        assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
+            if(h->sps.bitstream_restriction_flag
+               && s->avctx->has_b_frames < h->sps.num_reorder_frames){
+                s->avctx->has_b_frames = h->sps.num_reorder_frames;
+                s->low_delay = 0;
+            }

-        h->delayed_pic[pics++] = cur;
-        if(cur->reference == 0)
-            cur->reference = 1;
+            pics = 0;
+            while(h->delayed_pic[pics]) pics++;

-        cross_idr = 0;
-        for(i=0; h->delayed_pic[i]; i++)
-            if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
-                cross_idr = 1;
+            assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));

-        out = h->delayed_pic[0];
-        out_idx = 0;
-        for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
-            if(h->delayed_pic[i]->poc < out->poc){
-                out = h->delayed_pic[i];
-                out_idx = i;
-            }
+            h->delayed_pic[pics++] = cur;
+            if(cur->reference == 0)
+                cur->reference = DELAYED_PIC_REF;

-        out_of_order = !cross_idr && prev && out->poc < prev->poc;
-        if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
-            { }
-        else if(prev && pics <= s->avctx->has_b_frames)
-            out = prev;
-        else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
-           || (s->low_delay &&
-            ((!cross_idr && prev && out->poc > prev->poc + 2)
-             || cur->pict_type == B_TYPE)))
-        {
-            s->low_delay = 0;
-            s->avctx->has_b_frames++;
-            out = prev;
-        }
-        else if(out_of_order)
-            out = prev;
+            cross_idr = 0;
+            for(i=0; h->delayed_pic[i]; i++)
+                if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
+                    cross_idr = 1;

-        if(out_of_order || pics > s->avctx->has_b_frames){
-            for(i=out_idx; h->delayed_pic[i]; i++)
-                h->delayed_pic[i] = h->delayed_pic[i+1];
-        }
+            out = h->delayed_pic[0];
+            out_idx = 0;
+            for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
+                if(h->delayed_pic[i]->poc < out->poc){
+                    out = h->delayed_pic[i];
+                    out_idx = i;
+                }

-        if(prev == out)
-            *data_size = 0;
-        else
-            *data_size = sizeof(AVFrame);
-        if(prev && prev != out && prev->reference == 1)
-            prev->reference = 0;
-        h->delayed_output_pic = out;
+            out_of_order = !cross_idr && prev && out->poc < prev->poc;
+            if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
+            { }
+            else if(prev && pics <= s->avctx->has_b_frames)
+                out = prev;
+            else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
+               || (s->low_delay &&
+                ((!cross_idr && prev && out->poc > prev->poc + 2)
+                 || cur->pict_type == B_TYPE)))
+            {
+                s->low_delay = 0;
+                s->avctx->has_b_frames++;
+                out = prev;
+            }
+            else if(out_of_order)
+                out = prev;
+
+            if(out_of_order || pics > s->avctx->has_b_frames){
+                for(i=out_idx; h->delayed_pic[i]; i++)
+                    h->delayed_pic[i] = h->delayed_pic[i+1];
+            }
+
+            if(prev == out)
+                *data_size = 0;
+            else
+                *data_size = sizeof(AVFrame);
+            if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
+                prev->reference = 0;
+            h->delayed_output_pic = out;
 #endif
-    if(out)
-        *pict= *(AVFrame*)out;
-    else
-        av_log(avctx, AV_LOG_DEBUG, "no picture\n");
+            if(out)
+                *pict= *(AVFrame*)out;
+            else
+                av_log(avctx, AV_LOG_DEBUG, "no picture\n");
+        }
    }

    assert(pict->data[0] || !*data_size);
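The block above collects decoded pictures in h->delayed_pic[] and outputs the one with the smallest picture order count, so that B-frames come out in display order rather than decode order. A toy version of that selection (key-frame handling and the low-delay heuristics omitted); the structure and function names are illustrative:

#include <stdio.h>

typedef struct Pic { int poc; } Pic;

/* Pick the delayed picture with the smallest POC and compact the list.
 * Returns NULL when the list is empty. */
static Pic *output_next(Pic **delayed, int *count)
{
    if (*count == 0)
        return NULL;
    int best = 0;
    for (int i = 1; i < *count; i++)
        if (delayed[i]->poc < delayed[best]->poc)
            best = i;
    Pic *out = delayed[best];
    for (int i = best; i < *count - 1; i++)
        delayed[i] = delayed[i + 1];
    (*count)--;
    return out;
}

int main(void)
{
    Pic a = {4}, b = {2}, c = {6};        /* decode order != display order */
    Pic *delayed[8] = { &a, &b, &c };
    int n = 3;
    for (Pic *p; (p = output_next(delayed, &n)); )
        printf("output poc %d\n", p->poc); /* prints 2, 4, 6 */
    return 0;
}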
@@ -8466,7 +7850,7 @@ static int decode_frame(AVCodecContext *avctx,
 #if 0 //?

    /* Return the Picture timestamp as the frame number */
-    /* we substract 1 because it is added on utils.c    */
+    /* we subtract 1 because it is added on utils.c    */
    avctx->frame_number = s->picture_number - 1;
 #endif
    return get_consumed_bytes(s, buf_index, buf_size);
@@ -8491,10 +7875,12 @@ static inline void fill_mb_avail(H264Context *h){
 }
 #endif

-#if 0 //selftest
+#ifdef TEST
+#undef printf
+#undef random
 #define COUNT 8000
 #define SIZE (COUNT*40)
-int main(){
+int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
@@ -8523,7 +7909,7 @@
START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
-            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
+            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
STOP_TIMER("get_ue_golomb");
    }
@@ -8548,12 +7934,13 @@
START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
-            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
+            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
STOP_TIMER("get_se_golomb");
    }

+#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
@@ -8593,14 +7980,12 @@
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
-#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
-#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
@@ -8652,17 +8037,18 @@
        }

        if(memcmp(bitstream, out, COUNT)){
-            printf("missmatch\n");
+            printf("mismatch\n");
            return -1;
        }
    }
+#endif

    printf("Testing RBSP\n");

    return 0;
 }
-#endif
+#endif /* TEST */


 static int decode_end(AVCodecContext *avctx)
@@ -8670,7 +8056,8 @@ static int decode_end(AVCodecContext *avctx)
    H264Context *h = avctx->priv_data;
    MpegEncContext *s = &h->s;

-    av_freep(&h->rbsp_buffer);
+    av_freep(&h->rbsp_buffer[0]);
+    av_freep(&h->rbsp_buffer[1]);
    free_tables(h); //FIXME cleanup init stuff perhaps

    MPV_common_end(s);
@@ -8693,15 +8080,4 @@ AVCodec h264_decoder = {
    .flush= flush_dpb,
 };

-#ifdef CONFIG_H264_PARSER
-AVCodecParser h264_parser = {
-    { CODEC_ID_H264 },
-    sizeof(H264Context),
-    NULL,
-    h264_parse,
-    ff_parse_close,
-    h264_split,
-};
-#endif
-
 #include "svq3.c"
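The self-test kept under #ifdef TEST above round-trips values through the ue(v)/se(v) Exp-Golomb helpers. For reference, a minimal standalone model of the unsigned and signed Exp-Golomb mappings those helpers implement, independent of FFmpeg's bitstream reader; helper names here are not the library's own:

#include <stdio.h>

/* ue(v): value k is coded as floor(log2(k+1)) leading zero bits followed by
 * the binary representation of k+1; here we just compute the code length. */
static int ue_bits(unsigned k)
{
    int len = 0;
    for (unsigned v = k + 1; v > 1; v >>= 1)
        len++;
    return 2 * len + 1;
}

/* se(v) maps a signed value onto ue(v) code numbers: 0, +1, -1, +2, -2, ... */
static unsigned se_to_ue(int v)      { return v <= 0 ? -2 * v : 2 * v - 1; }
static int      ue_to_se(unsigned k) { return (k & 1) ? (int)(k + 1) / 2 : -(int)(k / 2); }

int main(void)
{
    for (int v = -5; v <= 5; v++) {
        unsigned k = se_to_ue(v);
        printf("se %+d -> ue %u (%d bits), back to %+d\n",
               v, k, ue_bits(k), ue_to_se(k));
    }
    return 0;
}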