15 files changed, 5770 insertions, 0 deletions
diff --git a/src/libmpeg2/Makefile.am b/src/libmpeg2/Makefile.am
new file mode 100644
index 000000000..a7031295c
--- /dev/null
+++ b/src/libmpeg2/Makefile.am
@@ -0,0 +1,23 @@
+CFLAGS = @BUILD_LIB_STATIC@ @LIBMPEG2_CFLAGS@ @GLOBAL_CFLAGS@
+
+EXTRA_DIST = idct_mlib.c idct_mlib.h motion_comp_mlib.c
+
+noinst_LTLIBRARIES = libmpeg2.la
+
+# The *_mlib.c files are only shipped through EXTRA_DIST; they are not listed
+# in the library sources.  idct_mmx.c opens with an ARCH_X86 guard.
+libmpeg2_la_SOURCES = slice.c header.c stats.c idct.c motion_comp.c\
+	decode.c idct_mmx.c motion_comp_mmx.c
+
+noinst_HEADERS = vlc.h mpeg2.h mpeg2_internal.h 
+
+debug:
+	$(MAKE) CFLAGS="$(DEBUG_CFLAGS) @BUILD_LIB_STATIC@"
+
+mostlyclean-generic:
+	-rm -f *~ \#* .*~ .\#*
+
+maintainer-clean-generic:
+	-@echo "This command is intended for maintainers to use;"
+	-@echo "it deletes files that may require special tools to rebuild."
+	-rm -f Makefile.in
diff --git a/src/libmpeg2/decode.c b/src/libmpeg2/decode.c
new file mode 100644
index 000000000..77e198fbf
--- /dev/null
+++ b/src/libmpeg2/decode.c
@@ -0,0 +1,323 @@
+/*
+ * decode.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>	/* memcpy/memset, try to remove */
+#include <stdlib.h>
+#include <inttypes.h>
+
+/*  Xine specific */
+#include "buffer.h"
+#include "video_decoder.h"
+/* */ 
+
+
+#include "video_out.h"
+#include "mpeg2.h"
+#include "mpeg2_internal.h"
+#include "cpu_accel.h"
+#include "attributes.h"
+
+#ifdef HAVE_MEMALIGN
+/* some systems have memalign() but no declaration for it */
+void * memalign (size_t align, size_t size);
+#else
+/* assume malloc alignment is sufficient */
+#define memalign(align,size) malloc (size)
+#endif
+
+#define BUFFER_SIZE (224 * 1024)
+
+mpeg2_config_t config;
+/* Prepare one decoder instance; prints a message and exits if its buffers cannot be allocated. */
+void mpeg2_init (mpeg2dec_t * mpeg2dec, uint32_t mm_accel,
+		 vo_instance_t * output)
+{
+    static int do_init = 1;
+
+    if (do_init) {	/* accelerated routines are chosen once, from the first caller's flags */
+	do_init = 0;
+	config.flags = mm_accel;
+	idct_init ();
+	motion_comp_init ();
+    }
+
+    mpeg2dec->chunk_buffer = memalign (16, BUFFER_SIZE + 4);
+    mpeg2dec->picture = memalign (16, sizeof (picture_t));
+    if (!mpeg2dec->chunk_buffer || !mpeg2dec->picture) {
+	fprintf (stderr, "mpeg2_init: out of memory\n");
+	exit (1);
+    }
+    memset (mpeg2dec->picture, 0, sizeof (picture_t));
+    header_state_init (mpeg2dec->picture);	/* initialize substructures */
+    mpeg2dec->shift = 0xffffff00;	/* no start code prefix seen yet */
+    mpeg2dec->is_sequence_needed = 1;
+    mpeg2dec->drop_flag = 0;
+    mpeg2dec->drop_frame = 0;
+    mpeg2dec->in_slice = 0;
+    mpeg2dec->output = output;
+    mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+    mpeg2dec->code = 0xb4;	/* sequence_error_code */
+}
+/* Parse one completed chunk tagged with start code `code`; nonzero return = it ended a run of slices. */
+static inline int parse_chunk (mpeg2dec_t * mpeg2dec, int code,
+			       uint8_t * buffer, uint32_t pts)
+{
+    picture_t * picture;
+    int is_frame_done;
+
+    /* wait for sequence_header_code */
+    if (mpeg2dec->is_sequence_needed && (code != 0xb3))
+	return 0;
+
+    stats_header (code, buffer);
+
+    picture = mpeg2dec->picture;
+    is_frame_done = mpeg2dec->in_slice && ((!code) || (code >= 0xb0));
+
+    if (is_frame_done) {
+	mpeg2dec->in_slice = 0;
+
+	if (((picture->picture_structure == FRAME_PICTURE) ||
+	     (picture->second_field)) &&
+	    (!(mpeg2dec->drop_frame))) {
+	    vo_draw ((picture->picture_coding_type == B_TYPE) ?
+		     picture->current_frame :
+		     picture->forward_reference_frame);
+#ifdef ARCH_X86
+	    if (config.flags & MM_ACCEL_X86_MMX)
+		emms ();
+#endif
+	}
+    }
+
+    switch (code) {
+    case 0x00:	/* picture_start_code */
+	if (header_process_picture_header (picture, buffer)) {
+	    fprintf (stderr, "bad picture header\n");
+	    break;	/* skip this chunk rather than exit() the host process */
+	}
+
+	if (mpeg2dec->pts) {
+	  picture->current_frame->PTS = mpeg2dec->pts;
+	  mpeg2dec->pts = 0;
+	}
+
+	mpeg2dec->drop_frame =
+	    mpeg2dec->drop_flag && (picture->picture_coding_type == B_TYPE);
+	break;
+
+    case 0xb3:	/* sequence_header_code */
+	if (header_process_sequence_header (picture, buffer)) {
+	    fprintf (stderr, "bad sequence header\n");
+	    break;	/* skip this chunk rather than exit() the host process */
+	}
+	if (mpeg2dec->is_sequence_needed) {
+	    if (vo_setup (mpeg2dec->output, picture->coded_picture_width,
+			  picture->coded_picture_height)) {
+		fprintf (stderr, "display setup failed\n");
+		break;	/* keep waiting: the next sequence header retries the setup */
+	    }
+	    mpeg2dec->is_sequence_needed = 0;
+	    picture->forward_reference_frame =
+		vo_get_frame (mpeg2dec->output,
+			      VO_PREDICTION_FLAG | VO_BOTH_FIELDS);
+	    picture->backward_reference_frame =
+		vo_get_frame (mpeg2dec->output,
+			      VO_PREDICTION_FLAG | VO_BOTH_FIELDS);
+	}
+	mpeg2dec->frame_rate_code = picture->frame_rate_code;	/* FIXME */
+	break;
+
+    case 0xb5:	/* extension_start_code */
+	if (header_process_extension (picture, buffer)) {
+	    /* report the rejected extension and carry on with the stream */
+	    fprintf (stderr, "bad extension\n");
+	}
+	break;
+
+    default:
+	if (code >= 0xb9)
+	    fprintf (stderr, "stream not demultiplexed ?\n");
+
+	if (code >= 0xb0)
+	    break;
+
+	if (!(mpeg2dec->in_slice)) {
+	    mpeg2dec->in_slice = 1;
+
+	    if (picture->second_field)
+		vo_field (picture->current_frame, picture->picture_structure);
+	    /*
+	    else {
+		if (picture->picture_coding_type == B_TYPE)
+		    picture->current_frame =
+			vo_get_frame (mpeg2dec->output,
+				      picture->picture_structure);
+		else {
+		    picture->current_frame =
+			vo_get_frame (mpeg2dec->output,
+				      (VO_PREDICTION_FLAG |
+				       picture->picture_structure));
+		    picture->forward_reference_frame =
+			picture->backward_reference_frame;
+		    picture->backward_reference_frame = picture->current_frame;
+		}
+		}*/
+	}
+
+	if (!(mpeg2dec->drop_frame)) {
+	    slice_process (picture, code, buffer);
+
+#ifdef ARCH_X86
+	    if (config.flags & MM_ACCEL_X86_MMX)
+		emms ();
+#endif
+	}
+    }
+
+    return is_frame_done;
+}
+/* Accumulate input into the chunk buffer up to the next start code; NULL means the input ran out first. */
+static inline uint8_t * copy_chunk (mpeg2dec_t * mpeg2dec,
+				    uint8_t * current, uint8_t * end)
+{
+    uint32_t shift;	/* sliding window of recent bytes, resumed across calls */
+    uint8_t * chunk_ptr;
+    uint8_t * limit;	/* input position at which the chunk buffer would be full */
+    uint8_t byte;
+
+    shift = mpeg2dec->shift;
+    chunk_ptr = mpeg2dec->chunk_ptr;
+    limit = current + (mpeg2dec->chunk_buffer + BUFFER_SIZE - chunk_ptr);
+    if (limit > end)
+	limit = end;
+
+    while (1) {
+	byte = *current++;
+	if (shift != 0x00000100) {	/* the previous three bytes were not 00 00 01 */
+	    shift = (shift | byte) << 8;
+	    *chunk_ptr++ = byte;
+	    if (current < limit)
+		continue;
+	    if (current == end) {	/* out of input: save the scan state for the next call */
+		mpeg2dec->chunk_ptr = chunk_ptr;
+		mpeg2dec->shift = shift;
+		return NULL;
+	    } else {
+		/* we filled the chunk buffer without finding a start code */
+		mpeg2dec->code = 0xb4;	/* sequence_error_code */
+		mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+		return current;
+	    }
+	}
+	mpeg2dec->code = byte;	/* the byte right after 00 00 01 is the start code value */
+	mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+	mpeg2dec->shift = 0xffffff00;
+	return current;	/* points just past the start code byte */
+    }
+}
+/* Push the bytes in [current, end) through the decoder; returns the summed parse_chunk results. */
+int mpeg2_decode_data (mpeg2dec_t * mpeg2dec, uint8_t * current, 
+		       uint8_t * end, uint32_t pts)
+{
+    int frames_done = 0;
+
+    mpeg2dec->pts = pts;
+    while (current != end) {
+	/* the start code found by the previous pass labels the chunk completed now */
+	uint8_t chunk_code = mpeg2dec->code;
+
+	current = copy_chunk (mpeg2dec, current, end);
+	if (!current)
+	    break;	/* input ran out mid-chunk; the state is kept for the next call */
+	frames_done += parse_chunk (mpeg2dec, chunk_code,
+				    mpeg2dec->chunk_buffer, pts);
+    }
+    return frames_done;
+}
+/* Parse the chunk still pending, draw the last reference frame, then free the decoder buffers. */
+void mpeg2_close (mpeg2dec_t * mpeg2dec)
+{
+    /* a trailing start code terminates the chunk being accumulated so it gets parsed */
+    static uint8_t finalizer[] = {0,0,1,0};
+    uint8_t * const finalizer_end = finalizer + sizeof (finalizer);
+
+    mpeg2_decode_data (mpeg2dec, finalizer, finalizer_end, mpeg2dec->pts);
+    if (mpeg2dec->is_sequence_needed == 0)
+	vo_draw (mpeg2dec->picture->backward_reference_frame);
+    free (mpeg2dec->picture);
+    free (mpeg2dec->chunk_buffer);
+}
+/* Set drop_flag; while it is nonzero, parse_chunk marks B-type pictures to be dropped. */
+void mpeg2_drop (mpeg2dec_t * mpeg2dec, int flag)
+{
+    mpeg2dec->drop_flag = flag;
+}
+
+/*
+ * xine specific stuff: the video_decoder_t entry points, all bound to one static decoder
+ */
+
+int mpeg2dec_get_version () {
+  return 1;	/* NOTE(review): presumably the plugin interface version - confirm */
+}
+/* Nonzero only for BUF_VIDEO_MPEG buffers. */
+int mpeg2dec_can_handle (int buf_type) {
+  return (buf_type == BUF_VIDEO_MPEG) ;
+}
+
+/* the single decoder state shared by every wrapper below */
+static mpeg2dec_t gMpeg2;
+/* Query mm_accel() for the acceleration flags and initialise gMpeg2 for `video_out`. */
+void mpeg2dec_init (vo_instance_t *video_out) {
+  uint32_t mmacc = mm_accel();
+
+  mpeg2_init (&gMpeg2, mmacc, video_out);
+}
+/* Feed one buffer element (content, size, PTS) to the decoder. */
+void mpeg2dec_decode_data (buf_element_t *buf) {
+  mpeg2_decode_data (&gMpeg2, buf->content, buf->content + buf->size,
+		     buf->PTS);
+}
+/* Currently a no-op: the release call in the body is disabled. */
+void mpeg2dec_release_img_buffers () {
+  //  decode_free_image_buffers (&gMpeg2);
+}
+/* Flush and free gMpeg2 through mpeg2_close. */
+void mpeg2dec_close () {
+  mpeg2_close (&gMpeg2);
+}
+/* positional initializer: the order must match the field order of video_decoder_t */
+static video_decoder_t vd_mpeg2dec = {
+  mpeg2dec_get_version,
+  mpeg2dec_can_handle,
+  mpeg2dec_init,
+  mpeg2dec_decode_data,
+  mpeg2dec_release_img_buffers,
+  mpeg2dec_close
+};
+/* Return the function table above. */
+video_decoder_t *init_video_decoder_mpeg2dec () {
+  return &vd_mpeg2dec;
+}
diff --git a/src/libmpeg2/header.c b/src/libmpeg2/header.c
new file mode 100644
index 000000000..e021b2f8e
--- /dev/null
+++ b/src/libmpeg2/header.c
@@ -0,0 +1,235 @@
+/*
+ * header.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include "attributes.h"
+
+/* default intra quant matrix, in zig-zag order: entry i is stored at raster position scan_norm[i] */
+static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
+    8,
+    16, 16,
+    19, 16, 19,
+    22, 22, 22, 22,
+    22, 22, 26, 24, 26,
+    27, 27, 27, 26, 26, 26,
+    26, 27, 27, 27, 29, 29, 29,
+    34, 34, 34, 29, 29, 29, 27, 27,
+    29, 29, 32, 32, 34, 34, 37,
+    38, 37, 35, 35, 34, 35,
+    38, 38, 40, 40, 40,
+    48, 48, 46, 46,
+    56, 56, 58,
+    69, 69,
+    83
+};
+
+uint8_t scan_norm[64] ATTR_ALIGN(16) =
+{
+    /* Zig-Zag scan pattern: entry i is the raster index (8*row + column) of the i-th coefficient */
+     0, 1, 8,16, 9, 2, 3,10,
+    17,24,32,25,18,11, 4, 5,
+    12,19,26,33,40,48,41,34,
+    27,20,13, 6, 7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+uint8_t scan_alt[64] ATTR_ALIGN(16) =
+{
+    /* Alternate scan pattern, same encoding; selected by the picture coding extension */
+    0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49,
+    41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43,
+    51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45,
+    53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63
+};
+/* Select the zig-zag scan (scan_norm) as the picture's initial coefficient order. */
+void header_state_init (picture_t * picture)
+{
+    picture->scan = scan_norm;
+}
+/* Parse a sequence header into `picture`; returns 1 if it is rejected, 0 otherwise. */
+int header_process_sequence_header (picture_t * picture, uint8_t * buffer)
+{
+    int packed_size, width, height;
+    int pos;
+
+    /* missing marker_bit */
+    if (!(buffer[6] & 0x20))
+	return 1;
+
+    /* 12-bit width then 12-bit height, each rounded up to a multiple of 16 */
+    packed_size = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    width = ((packed_size >> 12) + 15) & ~15;
+    height = ((packed_size & 0xfff) + 15) & ~15;
+
+    /* size restrictions for MP@ML or MPEG1 */
+    if (width > 768 || height > 576)
+	return 1;
+
+    picture->coded_picture_width = width;
+    picture->coded_picture_height = height;
+    picture->aspect_ratio_information = buffer[3] >> 4;
+    picture->frame_rate_code = buffer[3] & 15;
+    picture->bitrate = (buffer[4]<<10)|(buffer[5]<<2)|(buffer[6]>>6);
+
+    /* intra matrix: built-in default unless the load flag is set */
+    if (!(buffer[7] & 2)) {
+	for (pos = 0; pos < 64; pos++)
+	    picture->intra_quantizer_matrix[scan_norm[pos]] =
+		default_intra_quantizer_matrix [pos];
+    } else {
+	for (pos = 0; pos < 64; pos++)
+	    picture->intra_quantizer_matrix[scan_norm[pos]] =
+		(buffer[pos+7] << 7) | (buffer[pos+8] >> 1);
+	buffer += 64;	/* the non-intra flag sits behind the loaded matrix */
+    }
+
+    /* non-intra matrix: a flat 16 unless the load flag is set */
+    if (!(buffer[7] & 1)) {
+	for (pos = 0; pos < 64; pos++)
+	    picture->non_intra_quantizer_matrix[pos] = 16;
+    } else {
+	for (pos = 0; pos < 64; pos++)
+	    picture->non_intra_quantizer_matrix[scan_norm[pos]] =
+		buffer[pos+8];
+    }
+
+    /* MPEG1 - for testing only */
+    picture->mpeg1 = 1;
+    picture->intra_dc_precision = 0;
+    picture->frame_pred_frame_dct = 1;
+    picture->q_scale_type = 0;
+    picture->concealment_motion_vectors = 0;
+    picture->picture_structure = FRAME_PICTURE;
+    /* scan and second_field are not reset here */
+    return 0;
+}
+/* Validate a sequence extension and switch the picture state to MPEG-2; returns 1 if rejected. */
+static int header_process_sequence_extension (picture_t * picture,
+					      uint8_t * buffer)
+{
+    /* check chroma format, size extensions, marker bit */
+    if (((buffer[1] & 0x07) != 0x02) || (buffer[2] & 0xe0) ||
+	((buffer[3] & 0x01) != 0x01))
+	return 1;
+
+    picture->progressive_sequence = (buffer[1] >> 3) & 1;
+
+    /* ISO/IEC 13818-2 6.3.3: only interlaced sequences round the height up to
+     * a multiple of 32; progressive ones keep the multiple-of-16 rounding */
+    if (!picture->progressive_sequence)
+	picture->coded_picture_height =
+	    (picture->coded_picture_height + 31) & ~31;
+    /* MPEG1 - for testing only */
+    picture->mpeg1 = 0;
+
+    return 0;
+}
+/* Load whichever quantizer matrices a quant matrix extension carries; always returns 0. */
+static int header_process_quant_matrix_extension (picture_t * picture,
+						  uint8_t * buffer)
+{
+    int pos;
+
+    /* bit 3 set: 64 intra entries follow, each split 3 bits / 5 bits across two bytes */
+    if (buffer[0] & 8) {
+	for (pos = 0; pos != 64; ++pos)
+	    picture->intra_quantizer_matrix[scan_norm[pos]] =
+		(buffer[pos] << 5) | (buffer[pos+1] >> 3);
+	buffer += 64;
+    }
+    /* bit 2 set: 64 non-intra entries follow, each split 2 bits / 6 bits */
+    if (buffer[0] & 4)
+	for (pos = 0; pos != 64; ++pos)
+	    picture->non_intra_quantizer_matrix[scan_norm[pos]] =
+		(buffer[pos] << 6) | (buffer[pos+1] >> 2);
+
+    return 0;
+}
+/* Parse a picture coding extension into `picture`; always returns 0. */
+static int header_process_picture_coding_extension (picture_t * picture, uint8_t * buffer)
+{
+    const uint8_t flags = buffer[3];
+
+    /* pre subtract 1 for use later in compute_motion_vector */
+    picture->f_motion.f_code[0] = (buffer[0] & 15) - 1;
+    picture->f_motion.f_code[1] = (buffer[1] >> 4) - 1;
+    picture->b_motion.f_code[0] = (buffer[1] & 15) - 1;
+    picture->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
+
+    picture->intra_dc_precision = (buffer[2] >> 2) & 3;
+    picture->picture_structure = buffer[2] & 3;
+
+    /* one-bit flags packed into the fourth byte, most significant bit first */
+    picture->top_field_first = flags >> 7;
+    picture->frame_pred_frame_dct = (flags >> 6) & 1;
+    picture->concealment_motion_vectors = (flags >> 5) & 1;
+    picture->q_scale_type = (flags >> 4) & 1;
+    picture->intra_vlc_format = (flags >> 3) & 1;
+    picture->scan = (flags & 4) ? scan_alt : scan_norm;	/* alternate_scan */
+    picture->repeat_first_field = (flags >> 1) & 1;
+
+    /* first bit of the fifth byte */
+    picture->progressive_frame = buffer[4] >> 7;
+
+    return 0;
+}
+/* Dispatch an extension chunk on the 4-bit identifier in the top nibble of its first byte. */
+int header_process_extension (picture_t * picture, uint8_t * buffer)
+{
+    const int extension_id = buffer[0] >> 4;
+
+    if (extension_id == 1)	/* sequence extension */
+	return header_process_sequence_extension (picture, buffer);
+    if (extension_id == 3)	/* quant matrix extension */
+	return header_process_quant_matrix_extension (picture, buffer);
+    if (extension_id == 8)	/* picture coding extension */
+	return header_process_picture_coding_extension (picture, buffer);
+
+    /* any other extension is ignored and reported as success */
+
+    return 0;
+}
+/* Parse a picture header: coding type, MPEG-1 motion codes and the field toggle; always returns 0. */
+int header_process_picture_header (picture_t *picture, uint8_t * buffer)
+{
+    picture->picture_coding_type = (buffer [1] >> 3) & 7;
+
+    /* forward_f_code and backward_f_code - used in mpeg1 only */
+    picture->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
+    picture->f_motion.f_code[0] =
+	(((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
+    picture->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
+    picture->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
+
+    /* second_field flips on every field picture and is cleared for frame pictures */
+        picture->second_field =
+            (picture->picture_structure != FRAME_PICTURE) &&	/* NOTE(review): looks like the value set before this picture's own coding extension - confirm */
+            !(picture->second_field);
+
+    return 0;
+}
diff --git a/src/libmpeg2/idct.c b/src/libmpeg2/idct.c
new file mode 100644
index 000000000..21d33dc8c
--- /dev/null
+++ b/src/libmpeg2/idct.c
@@ -0,0 +1,290 @@
+/*
+ * idct.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * Portions of this code are from the MPEG software simulation group
+ * idct implementation. This code will be replaced with a new
+ * implementation soon.
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**********************************************************/
+/* inverse two dimensional DCT, Chen-Wang algorithm */
+/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */
+/* 32-bit integer arithmetic (8 bit coefficients) */
+/* 11 mults, 29 adds per DCT */
+/* sE, 18.8.91 */
+/**********************************************************/
+/* coefficients extended to 12 bit for IEEE1180-1990 */
+/* compliance sE, 2.1.94 */
+/**********************************************************/
+
+/* this code assumes >> to be a two's-complement arithmetic */
+/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include "xine_internal.h"
+#include "xine.h"
+#include "cpu_accel.h"
+
+#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
+#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
+#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
+#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
+#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
+#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
+
+/* idct main entry point  */
+void (*idct_block_copy) (int16_t * block, uint8_t * dest, int stride);
+void (*idct_block_add) (int16_t * block, uint8_t * dest, int stride);
+
+static void idct_block_copy_c (int16_t *block, uint8_t * dest, int stride);
+static void idct_block_add_c (int16_t *block, uint8_t * dest, int stride);
+
+static uint8_t clip_lut[1024];
+#define CLIP(i) ((clip_lut+384)[ (i)])
+/* Point idct_block_copy / idct_block_add at the implementation selected by config.flags. */
+void idct_init (void)
+{
+#ifdef ARCH_X86
+    if (config.flags & MM_ACCEL_X86_MMXEXT) {	/* MMXEXT is tested before plain MMX */
+	fprintf (stderr, "Using MMXEXT for IDCT transform\n");
+	idct_block_copy = idct_block_copy_mmxext;
+	idct_block_add = idct_block_add_mmxext;
+	idct_mmx_init ();
+    } else if (config.flags & MM_ACCEL_X86_MMX) {
+	fprintf (stderr, "Using MMX for IDCT transform\n");
+	idct_block_copy = idct_block_copy_mmx;
+	idct_block_add = idct_block_add_mmx;
+	idct_mmx_init ();
+    } else	/* chains into the mlib test or the C fallback below */
+#endif
+#ifdef LIBMPEG2_MLIB
+    if (config.flags & MM_ACCEL_MLIB) {
+	fprintf (stderr, "Using mlib for IDCT transform\n");
+	idct_block_copy = idct_block_copy_mlib;
+	idct_block_add = idct_block_add_mlib;
+    } else	/* chains into the C fallback below */
+#endif
+    {
+	int i;
+
+	fprintf (stderr, "No accelerated IDCT transform found\n");
+	idct_block_copy = idct_block_copy_c;
+	idct_block_add = idct_block_add_c;
+	for (i = -384; i < 640; i++)	/* clamp table behind CLIP(); filled only on this path */
+	    clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
+    }
+}
+
+/* row (horizontal) IDCT
+ *
+ *           7                       pi         1
+ * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
+ *          l=0                      8          2
+ *
+ * where: c[0]    = 128
+ *        c[1..7] = 128*sqrt (2)
+ */
+
+static void inline idct_row (int16_t * block)
+{
+    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+    x1 = block[4] << 11;
+    x2 = block[6];
+    x3 = block[2];
+    x4 = block[1];
+    x5 = block[7];
+    x6 = block[5];
+    x7 = block[3];
+
+    /* shortcut */
+    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
+	block[0] = block[1] = block[2] = block[3] = block[4] =
+	    block[5] = block[6] = block[7] = block[0]<<3;
+	return;
+    }
+
+    x0 = (block[0] << 11) + 128; /* for proper rounding in the fourth stage */
+
+    /* first stage */
+    x8 = W7 * (x4 + x5);
+    x4 = x8 + (W1 - W7) * x4;
+    x5 = x8 - (W1 + W7) * x5;
+    x8 = W3 * (x6 + x7);
+    x6 = x8 - (W3 - W5) * x6;
+    x7 = x8 - (W3 + W5) * x7;
+ 
+    /* second stage */
+    x8 = x0 + x1;
+    x0 -= x1;
+    x1 = W6 * (x3 + x2);
+    x2 = x1 - (W2 + W6) * x2;
+    x3 = x1 + (W2 - W6) * x3;
+    x1 = x4 + x6;
+    x4 -= x6;
+    x6 = x5 + x7;
+    x5 -= x7;
+ 
+    /* third stage */
+    x7 = x8 + x3;
+    x8 -= x3;
+    x3 = x0 + x2;
+    x0 -= x2;
+    x2 = (181 * (x4 + x5) + 128) >> 8;
+    x4 = (181 * (x4 - x5) + 128) >> 8;
+ 
+    /* fourth stage */
+    block[0] = (x7 + x1) >> 8;
+    block[1] = (x3 + x2) >> 8;
+    block[2] = (x0 + x4) >> 8;
+    block[3] = (x8 + x6) >> 8;
+    block[4] = (x8 - x6) >> 8;
+    block[5] = (x0 - x4) >> 8;
+    block[6] = (x3 - x2) >> 8;
+    block[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *             7                         pi         1
+ * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
+ *            l=0                        8          2
+ *
+ * where: c[0]    = 1/1024
+ *        c[1..7] = (1/1024)*sqrt (2)
+ */
+
+static void inline idct_col (int16_t *block)
+{
+    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+    /* shortcut */
+    x1 = block [8*4] << 8;
+    x2 = block [8*6];
+    x3 = block [8*2];
+    x4 = block [8*1];
+    x5 = block [8*7];
+    x6 = block [8*5];
+    x7 = block [8*3];
+
+#if 0
+    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
+	block[8*0] = block[8*1] = block[8*2] = block[8*3] = block[8*4] =
+	    block[8*5] = block[8*6] = block[8*7] = (block[8*0] + 32) >> 6;
+	return;
+    }
+#endif
+
+    x0 = (block[8*0] << 8) + 8192;
+
+    /* first stage */
+    x8 = W7 * (x4 + x5) + 4;
+    x4 = (x8 + (W1 - W7) * x4) >> 3;
+    x5 = (x8 - (W1 + W7) * x5) >> 3;
+    x8 = W3 * (x6 + x7) + 4;
+    x6 = (x8 - (W3 - W5) * x6) >> 3;
+    x7 = (x8 - (W3 + W5) * x7) >> 3;
+ 
+    /* second stage */
+    x8 = x0 + x1;
+    x0 -= x1;
+    x1 = W6 * (x3 + x2) + 4;
+    x2 = (x1 - (W2 + W6) * x2) >> 3;
+    x3 = (x1 + (W2 - W6) * x3) >> 3;
+    x1 = x4 + x6;
+    x4 -= x6;
+    x6 = x5 + x7;
+    x5 -= x7;
+ 
+    /* third stage */
+    x7 = x8 + x3;
+    x8 -= x3;
+    x3 = x0 + x2;
+    x0 -= x2;
+    x2 = (181 * (x4 + x5) + 128) >> 8;
+    x4 = (181 * (x4 - x5) + 128) >> 8;
+ 
+    /* fourth stage */
+    block[8*0] = (x7 + x1) >> 14;
+    block[8*1] = (x3 + x2) >> 14;
+    block[8*2] = (x0 + x4) >> 14;
+    block[8*3] = (x8 + x6) >> 14;
+    block[8*4] = (x8 - x6) >> 14;
+    block[8*5] = (x0 - x4) >> 14;
+    block[8*6] = (x3 - x2) >> 14;
+    block[8*7] = (x7 - x1) >> 14;
+}
+
+/*
+ * Inverse-transform `block` in place, then store the 8x8 result through
+ * CLIP() at `dest`, whose rows are `stride` bytes apart.
+ */
+void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride)
+{
+    int row, col;
+
+    /* horizontal pass over each of the 8 rows */
+    for (row = 0; row < 8; row++)
+	idct_row (&block[8 * row]);
+
+    /* vertical pass over each of the 8 columns */
+    for (col = 0; col < 8; col++)
+	idct_col (&block[col]);
+
+    /* CLIP() maps each sample through the lookup table filled by idct_init */
+    for (row = 0; row < 8; row++) {
+	const int16_t * src = block + 8 * row;
+	uint8_t * out = dest + row * stride;
+
+	for (col = 0; col < 8; col++)
+	    out[col] = CLIP (src[col]);
+    }
+}
+
+/*
+ * Inverse-transform `block` in place, then add it to the 8x8 pixels at
+ * `dest` (rows `stride` bytes apart), storing each sum through CLIP().
+ */
+void idct_block_add_c (int16_t * block, uint8_t * dest, int stride)
+{
+    int row, col;
+
+    /* horizontal pass over each of the 8 rows */
+    for (row = 0; row < 8; row++)
+	idct_row (&block[8 * row]);
+
+    /* vertical pass over each of the 8 columns */
+    for (col = 0; col < 8; col++)
+	idct_col (&block[col]);
+
+    /* CLIP() maps each sum through the lookup table filled by idct_init */
+    for (row = 0; row < 8; row++) {
+	const int16_t * src = block + 8 * row;
+	uint8_t * out = dest + row * stride;
+
+	for (col = 0; col < 8; col++)
+	    out[col] = CLIP (src[col] + out[col]);
+    }
+}
diff --git a/src/libmpeg2/idct_mlib.c b/src/libmpeg2/idct_mlib.c
new file mode 100644
index 000000000..876ab574a
--- /dev/null
+++ b/src/libmpeg2/idct_mlib.c
@@ -0,0 +1,47 @@
+/*
+ * idct_mlib.c
+ * Copyright (C) 1999-2001 Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef LIBMPEG2_MLIB
+
+#include <inttypes.h>
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+
+#include "mpeg2_internal.h"
+/* idct_block_copy variant: passes dest, block and stride to mlib_VideoIDCT8x8_U8_S16. */
+void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride)
+{
+    mlib_VideoIDCT8x8_U8_S16 (dest, block, stride);
+}
+/* idct_block_add variant: IDCT `block` in place, then hand it to mlib_VideoAddBlock_U8_S16 with `dest`. */
+void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride)
+{
+    /* Should we use mlib_VideoIDCT_IEEE_S16_S16 here ?? */
+    /* it's ~30% slower. */
+    mlib_VideoIDCT8x8_S16_S16 (block, block);	/* source and destination are the same block */
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+}
+
+#endif
diff --git a/src/libmpeg2/idct_mlib.h b/src/libmpeg2/idct_mlib.h
new file mode 100644
index 000000000..4a5b92919
--- /dev/null
+++ b/src/libmpeg2/idct_mlib.h
@@ -0,0 +1,25 @@
+/*
+ * idct_mlib.h
+ *
+ * Copyright (C) 1999, Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *	
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ * 
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA  02111-1307  USA
+ */
+
+void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride);
+void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride);
diff --git a/src/libmpeg2/idct_mmx.c b/src/libmpeg2/idct_mmx.c
new file mode 100644
index 000000000..927a78996
--- /dev/null
+++ b/src/libmpeg2/idct_mmx.c
@@ -0,0 +1,705 @@
+/*
+ * idct_mmx.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_X86
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include "attributes.h"
+#include "cpu_accel.h"
+
+#define ROW_SHIFT 11	/* right shift applied to the 32-bit sums of the row pass (psrad) */
+#define COL_SHIFT 6	/* right shift applied to the 16-bit results of the column pass (psraw) */
+
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))	/* (bias + 0.5) scaled by 2^ROW_SHIFT, truncated to int */
+#define rounder(bias) {round (bias), round (bias)}	/* array initializer: the same value in both 32-bit halves */
+
+
+#if 0
+/* C row IDCT - it's just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+			     int16_t * table, int32_t * rounder)
+{
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+/* mmxext_table: 32 coefficients as 8 groups of 4 words; each group is one 64-bit movq/pmaddwd operand in mmxext_row*. */
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
+						   c4,  c6,  c4,  c6,	\
+						   c1,  c3, -c1, -c5,	\
+						   c5,  c7,  c3, -c7,	\
+						   c4, -c6,  c4, -c6,	\
+						  -c4,  c2,  c4, -c2,	\
+						   c5, -c1,  c3, -c1,	\
+						   c7,  c3,  c7, -c5 }
+/* Row prologue: load the row at row+offset into mm2/mm5 (copies in mm0/mm6) and start the first products for mmxext_row. */
+static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+
+    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+}
+/* Row core: continues from the registers left by row_head/row_mid; leaves y1 y0 in mm1, y6 y7 in mm3, unshifted sums in mm0/mm4. */
+static inline void mmxext_row (int16_t * table, int32_t * rounder)
+{
+    movq_m2r (*(table+8), mm1);		// mm1 = -C5 -C1 C3 C1
+    pmaddwd_r2r (mm2, mm4);		// mm4 = C4*x0+C6*x2 C4*x4+C6*x6
+
+    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x4-C6*x6 C4*x0-C6*x2
+    pshufw_r2r (mm6, mm6, 0x4e);	// mm6 = x3 x1 x7 x5
+
+    movq_m2r (*(table+12), mm7);	// mm7 = -C7 C3 C7 C5
+    pmaddwd_r2r (mm5, mm1);		// mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
+
+    paddd_m2r (*rounder, mm3);		// mm3 += rounder
+    pmaddwd_r2r (mm6, mm7);		// mm7 = C3*x1-C7*x3 C5*x5+C7*x7
+
+    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
+    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C3*x5-C1*x7 C5*x1-C1*x3
+    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C7*x1-C5*x3 C7*x5+C3*x7
+    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+
+    paddd_m2r (*rounder, mm0);		// mm0 += rounder
+    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+
+    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
+    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+
+    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
+    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+
+    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
+    movq_r2r (mm0, mm4);		// mm4 = a3 a2 + rounder
+
+    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
+    psubd_r2r (mm5, mm4);		// mm4 = a3-b3 a2-b2 + rounder
+}
+/* Row epilogue: shift the remaining sums, pack all eight results to 16 bits and store them at row+store. */
+static inline void mmxext_row_tail (int16_t * row, int store)
+{
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+
+    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+
+    /* slot */
+
+    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+}
+/* Epilogue of the previous row (stored at row+store) interleaved with the prologue of the next row (loaded from row+offset). */
+static inline void mmxext_row_mid (int16_t * row, int store,
+				   int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+
+    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
+    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+
+    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+
+    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
+    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+}
+
+
+/* MMX row IDCT */
+/* mmx_table: 32 coefficients as 8 groups of 4 words; each group is one 64-bit movq/pmaddwd operand in mmx_row*. */
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
+					   c4,  c6, -c4, -c2,	\
+					   c1,  c3,  c3, -c7,	\
+					   c5,  c7, -c1, -c5,	\
+					   c4, -c6,  c4, -c2,	\
+					  -c4,  c2,  c4, -c6,	\
+					   c5, -c1,  c7, -c5,	\
+					   c7,  c3,  c3, -c1 }
+/* Row prologue: load the row at row+offset, duplicate its dword halves with punpck*dq and start the first products for mmx_row. */
+static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+
+    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+
+    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
+    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+}
+/* Row core: continues from the registers left by row_head/row_mid; leaves y1 y0 in mm1, y6 y7 in mm3, unshifted sums in mm0/mm7. */
+static inline void mmx_row (int16_t * table, int32_t * rounder)
+{
+    pmaddwd_r2r (mm2, mm4);		// mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
+    punpckldq_r2r (mm5, mm5);		// mm5 = x3 x1 x3 x1
+
+    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x0-C2*x2 C4*x0-C6*x2
+    punpckhdq_r2r (mm6, mm6);		// mm6 = x7 x5 x7 x5
+
+    movq_m2r (*(table+12), mm7);	// mm7 = -C5 -C1 C7 C5
+    pmaddwd_r2r (mm5, mm1);		// mm1 = C3*x1-C7*x3 C1*x1+C3*x3
+
+    paddd_m2r (*rounder, mm3);		// mm3 += rounder
+    pmaddwd_r2r (mm6, mm7);		// mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
+
+    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
+    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C7*x1-C5*x3 C5*x1-C1*x3
+    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C3*x5-C1*x7 C7*x5+C3*x7
+    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+
+    paddd_m2r (*rounder, mm0);		// mm0 += rounder
+    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+
+    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
+    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+
+    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
+    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+
+    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
+    movq_r2r (mm0, mm7);		// mm7 = a3 a2 + rounder
+
+    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
+    psubd_r2r (mm5, mm7);		// mm7 = a3-b3 a2-b2 + rounder
+}
+/* Row epilogue: shift and pack the results, swap y6/y7 and y4/y5 with shifts + por, and store all eight at row+store. */
+static inline void mmx_row_tail (int16_t * row, int store)
+{
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+
+    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    movq_r2r (mm7, mm4);		// mm4 = y6 y7 y4 y5
+
+    pslld_i2r (16, mm7);		// mm7 = y7 0 y5 0
+
+    psrld_i2r (16, mm4);		// mm4 = 0 y6 0 y4
+
+    por_r2r (mm4, mm7);			// mm7 = y7 y6 y5 y4
+
+    /* slot */
+
+    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+}
+/* Epilogue of the previous row (stored at row+store) interleaved with the prologue of the next row (loaded from row+offset). */
+static inline void mmx_row_mid (int16_t * row, int store,
+				int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    movq_r2r (mm7, mm1);		// mm1 = y6 y7 y4 y5
+
+    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+    psrld_i2r (16, mm7);		// mm7 = 0 y6 0 y4
+
+    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
+    pslld_i2r (16, mm1);		// mm1 = y7 0 y5 0
+
+    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
+    por_r2r (mm1, mm7);			// mm7 = y7 y6 y5 y4
+
+    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
+    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+
+    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+}
+
+
+#if 0
+// C column IDCT - it's just here to document the MMXEXT and MMX versions
+static inline void idct_col (int16_t * col, int offset)
+{
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+    col += offset;
+
+    x0 = col[0*8];
+    x1 = col[1*8];
+    x2 = col[2*8];
+    x3 = col[3*8];
+    x4 = col[4*8];
+    x5 = col[5*8];
+    x6 = col[6*8];
+    x7 = col[7*8];
+
+    u04 = S (x0 + x4);
+    v04 = S (x0 - x4);
+    u26 = S (F (T2, x6) + x2);
+    v26 = S (F (T2, x2) - x6);
+
+    a0 = S (u04 + u26);
+    a1 = S (v04 + v26);
+    a2 = S (v04 - v26);
+    a3 = S (u04 - u26);
+
+    u17 = S (F (T1, x7) + x1);
+    v17 = S (F (T1, x1) - x7);
+    u35 = S (F (T3, x5) + x3);
+    v35 = S (F (T3, x3) - x5);
+
+    b0 = S (u17 + u35);
+    b3 = S (v17 - v35);
+    u12 = S (u17 - u35);
+    v12 = S (v17 + v35);
+    u12 = S (2 * F (C4, u12));
+    v12 = S (2 * F (C4, v12));
+    b1 = S (u12 + v12);
+    b2 = S (u12 - v12);
+
+    y0 = S (a0 + b0) >> COL_SHIFT;
+    y1 = S (a1 + b1) >> COL_SHIFT;
+    y2 = S (a2 + b2) >> COL_SHIFT;
+    y3 = S (a3 + b3) >> COL_SHIFT;
+
+    y4 = S (a3 - b3) >> COL_SHIFT;
+    y5 = S (a2 - b2) >> COL_SHIFT;
+    y6 = S (a1 - b1) >> COL_SHIFT;
+    y7 = S (a0 - b0) >> COL_SHIFT;
+
+    col[0*8] = y0;
+    col[1*8] = y1;
+    col[2*8] = y2;
+    col[3*8] = y3;
+    col[4*8] = y4;
+    col[5*8] = y5;
+    col[6*8] = y6;
+    col[7*8] = y7;
+}
+#endif
+
+// MMX column IDCT: transforms the four 16-bit columns at col+offset .. col+offset+3 in place,
+// reading and writing all eight rows (row stride 8); the x3 and x5 slots double as scratch storage.
+static inline void idct_col (int16_t * col, int offset)
+{
+#define T1 13036	// = round (tan (1*pi/16) * 65536)
+#define T2 27146	// = round (tan (2*pi/16) * 65536)
+#define T3 43790	// = round (tan (3*pi/16) * 65536)
+#define C4 23170	// = round (cos (pi/4) * 32768)
+
+    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};	// NOTE(review): 43790 exceeds a signed 16-bit short; the "T3-1" steps below add x back with paddsw
+    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+
+    /* column code adapted from peter gubanov */
+    /* http://www.elecard.com/peter/idct.shtml */
+
+    movq_m2r (*_T1, mm0);		// mm0 = T1
+
+    movq_m2r (*(col+offset+1*8), mm1);	// mm1 = x1
+    movq_r2r (mm0, mm2);		// mm2 = T1
+
+    movq_m2r (*(col+offset+7*8), mm4);	// mm4 = x7
+    pmulhw_r2r (mm1, mm0);		// mm0 = T1*x1
+
+    movq_m2r (*_T3, mm5);		// mm5 = T3
+    pmulhw_r2r (mm4, mm2);		// mm2 = T1*x7
+
+    movq_m2r (*(col+offset+5*8), mm6);	// mm6 = x5
+    movq_r2r (mm5, mm7);		// mm7 = T3-1
+
+    movq_m2r (*(col+offset+3*8), mm3);	// mm3 = x3
+    psubsw_r2r (mm4, mm0);		// mm0 = v17
+
+    movq_m2r (*_T2, mm4);		// mm4 = T2
+    pmulhw_r2r (mm3, mm5);		// mm5 = (T3-1)*x3
+
+    paddsw_r2r (mm2, mm1);		// mm1 = u17
+    pmulhw_r2r (mm6, mm7);		// mm7 = (T3-1)*x5
+
+    /* slot */
+
+    movq_r2r (mm4, mm2);		// mm2 = T2
+    paddsw_r2r (mm3, mm5);		// mm5 = T3*x3
+
+    pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2
+    paddsw_r2r (mm6, mm7);		// mm7 = T3*x5
+
+    psubsw_r2r (mm6, mm5);		// mm5 = v35
+    paddsw_r2r (mm3, mm7);		// mm7 = u35
+
+    movq_m2r (*(col+offset+6*8), mm3);	// mm3 = x6
+    movq_r2r (mm0, mm6);		// mm6 = v17
+
+    pmulhw_r2r (mm3, mm2);		// mm2 = T2*x6
+    psubsw_r2r (mm5, mm0);		// mm0 = b3
+
+    psubsw_r2r (mm3, mm4);		// mm4 = v26
+    paddsw_r2r (mm6, mm5);		// mm5 = v12
+
+    movq_r2m (mm0, *(col+offset+3*8));	// save b3 in scratch0
+    movq_r2r (mm1, mm6);		// mm6 = u17
+
+    paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26
+    paddsw_r2r (mm7, mm6);		// mm6 = b0
+
+    psubsw_r2r (mm7, mm1);		// mm1 = u12
+    movq_r2r (mm1, mm7);		// mm7 = u12
+
+    movq_m2r (*(col+offset+0*8), mm3);	// mm3 = x0
+    paddsw_r2r (mm5, mm1);		// mm1 = u12+v12
+
+    movq_m2r (*_C4, mm0);		// mm0 = C4/2
+    psubsw_r2r (mm5, mm7);		// mm7 = u12-v12
+
+    movq_r2m (mm6, *(col+offset+5*8));	// save b0 in scratch1
+    pmulhw_r2r (mm0, mm1);		// mm1 = b1/2
+
+    movq_r2r (mm4, mm6);		// mm6 = v26
+    pmulhw_r2r (mm0, mm7);		// mm7 = b2/2
+
+    movq_m2r (*(col+offset+4*8), mm5);	// mm5 = x4
+    movq_r2r (mm3, mm0);		// mm0 = x0
+
+    psubsw_r2r (mm5, mm3);		// mm3 = v04
+    paddsw_r2r (mm5, mm0);		// mm0 = u04
+
+    paddsw_r2r (mm3, mm4);		// mm4 = a1
+    movq_r2r (mm0, mm5);		// mm5 = u04
+
+    psubsw_r2r (mm6, mm3);		// mm3 = a2
+    paddsw_r2r (mm2, mm5);		// mm5 = a0
+
+    paddsw_r2r (mm1, mm1);		// mm1 = b1
+    psubsw_r2r (mm2, mm0);		// mm0 = a3
+
+    paddsw_r2r (mm7, mm7);		// mm7 = b2
+    movq_r2r (mm3, mm2);		// mm2 = a2
+
+    movq_r2r (mm4, mm6);		// mm6 = a1
+    paddsw_r2r (mm7, mm3);		// mm3 = a2+b2
+
+    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y2
+    paddsw_r2r (mm1, mm4);		// mm4 = a1+b1
+
+    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y1
+    psubsw_r2r (mm1, mm6);		// mm6 = a1-b1
+
+    movq_m2r (*(col+offset+5*8), mm1);	// mm1 = b0
+    psubsw_r2r (mm7, mm2);		// mm2 = a2-b2
+
+    psraw_i2r (COL_SHIFT, mm6);		// mm6 = y6
+    movq_r2r (mm5, mm7);		// mm7 = a0
+
+    movq_r2m (mm4, *(col+offset+1*8));	// save y1
+    psraw_i2r (COL_SHIFT, mm2);		// mm2 = y5
+
+    movq_r2m (mm3, *(col+offset+2*8));	// save y2
+    paddsw_r2r (mm1, mm5);		// mm5 = a0+b0
+
+    movq_m2r (*(col+offset+3*8), mm4);	// mm4 = b3
+    psubsw_r2r (mm1, mm7);		// mm7 = a0-b0
+
+    psraw_i2r (COL_SHIFT, mm5);		// mm5 = y0
+    movq_r2r (mm0, mm3);		// mm3 = a3
+
+    movq_r2m (mm2, *(col+offset+5*8));	// save y5
+    psubsw_r2r (mm4, mm3);		// mm3 = a3-b3
+
+    psraw_i2r (COL_SHIFT, mm7);		// mm7 = y7
+    paddsw_r2r (mm0, mm4);		// mm4 = a3+b3
+
+    movq_r2m (mm5, *(col+offset+0*8));	// save y0
+    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y4
+
+    movq_r2m (mm6, *(col+offset+6*8));	// save y6
+    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y3
+
+    movq_r2m (mm7, *(col+offset+7*8));	// save y7
+
+    movq_r2m (mm3, *(col+offset+4*8));	// save y4
+
+    movq_r2m (mm4, *(col+offset+3*8));	// save y3
+}
+
+/* Rounding terms for the row pass: declare_idct pairs rounderN with row N; each array holds the value twice for paddd. */
+static int32_t rounder0[] ATTR_ALIGN(8) =
+    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
+static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
+static int32_t rounder1[] ATTR_ALIGN(8) =
+    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
+static int32_t rounder7[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
+static int32_t rounder2[] ATTR_ALIGN(8) =
+    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
+static int32_t rounder6[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C2 * (C6-C2)/2 */
+static int32_t rounder3[] ATTR_ALIGN(8) =
+    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
+static int32_t rounder5[] ATTR_ALIGN(8) =
+    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
+
+/* declare_idct: defines idct (block) - eight pipelined row passes (rows 0,4,1,7,2,6,3,5), then two 4-column passes, in place. */
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
+static inline void idct (int16_t * block)				\
+{									\
+    static int16_t table04[] ATTR_ALIGN(16) =				\
+	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
+    static int16_t table17[] ATTR_ALIGN(16) =				\
+	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
+    static int16_t table26[] ATTR_ALIGN(16) =				\
+	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
+    static int16_t table35[] ATTR_ALIGN(16) =				\
+	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
+									\
+    idct_row_head (block, 0*8, table04);				\
+    idct_row (table04, rounder0);					\
+    idct_row_mid (block, 0*8, 4*8, table04);				\
+    idct_row (table04, rounder4);					\
+    idct_row_mid (block, 4*8, 1*8, table17);				\
+    idct_row (table17, rounder1);					\
+    idct_row_mid (block, 1*8, 7*8, table17);				\
+    idct_row (table17, rounder7);					\
+    idct_row_mid (block, 7*8, 2*8, table26);				\
+    idct_row (table26, rounder2);					\
+    idct_row_mid (block, 2*8, 6*8, table26);				\
+    idct_row (table26, rounder6);					\
+    idct_row_mid (block, 6*8, 3*8, table35);				\
+    idct_row (table35, rounder3);					\
+    idct_row_mid (block, 3*8, 5*8, table35);				\
+    idct_row (table35, rounder5);					\
+    idct_row_tail (block, 5*8);						\
+									\
+    idct_col (block, 0);						\
+    idct_col (block, 4);						\
+}
+
+/* COPY_MMX: advance dest one line and store the packed row in r2 there, while loading and packing the row at block+offset into r0. */
+#define COPY_MMX(offset,r0,r1,r2)	\
+do {					\
+    movq_m2r (*(block+offset), r0);	\
+    dest += stride;			\
+    movq_m2r (*(block+offset+4), r1);	\
+    movq_r2m (r2, *dest);		\
+    packuswb_r2r (r1, r0);		\
+} while (0)
+/* Store the 8x8 int16 block to dest as eight 8-byte rows, `stride` bytes apart; packuswb packs each row to unsigned bytes. */
+static void block_copy (int16_t * block, uint8_t * dest, int stride)
+{
+    movq_m2r (*(block+0*8), mm0);
+    movq_m2r (*(block+0*8+4), mm1);
+    movq_m2r (*(block+1*8), mm2);
+    packuswb_r2r (mm1, mm0);
+    movq_m2r (*(block+1*8+4), mm3);
+    movq_r2m (mm0, *dest);
+    packuswb_r2r (mm3, mm2);
+    COPY_MMX (2*8, mm0, mm1, mm2);
+    COPY_MMX (3*8, mm2, mm3, mm0);
+    COPY_MMX (4*8, mm0, mm1, mm2);
+    COPY_MMX (5*8, mm2, mm3, mm0);
+    COPY_MMX (6*8, mm0, mm1, mm2);
+    COPY_MMX (7*8, mm2, mm3, mm0);
+    movq_r2m (mm2, *(dest+stride));
+}
+
+/* ADD_MMX: pack r3/r4 and store them one line down, then load the following dest line, widen it with mm0 (zero) and add block+offset. */
+#define ADD_MMX(offset,r1,r2,r3,r4)	\
+do {					\
+    movq_m2r (*(dest+2*stride), r1);	\
+    packuswb_r2r (r4, r3);		\
+    movq_r2r (r1, r2);			\
+    dest += stride;			\
+    movq_r2m (r3, *dest);		\
+    punpcklbw_r2r (mm0, r1);		\
+    paddsw_m2r (*(block+offset), r1);	\
+    punpckhbw_r2r (mm0, r2);		\
+    paddsw_m2r (*(block+offset+4), r2);	\
+} while (0)
+/* Add the 8x8 int16 block onto the eight 8-byte rows at dest (`stride` apart): widen with zero, paddsw, pack back with packuswb. */
+static void block_add (int16_t * block, uint8_t * dest, int stride)
+{
+    movq_m2r (*dest, mm1);
+    pxor_r2r (mm0, mm0);
+    movq_m2r (*(dest+stride), mm3);
+    movq_r2r (mm1, mm2);
+    punpcklbw_r2r (mm0, mm1);
+    movq_r2r (mm3, mm4);
+    paddsw_m2r (*(block+0*8), mm1);
+    punpckhbw_r2r (mm0, mm2);
+    paddsw_m2r (*(block+0*8+4), mm2);
+    punpcklbw_r2r (mm0, mm3);
+    paddsw_m2r (*(block+1*8), mm3);
+    packuswb_r2r (mm2, mm1);
+    punpckhbw_r2r (mm0, mm4);
+    movq_r2m (mm1, *dest);
+    paddsw_m2r (*(block+1*8+4), mm4);
+    ADD_MMX (2*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (3*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (4*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (5*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (6*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (7*8, mm3, mm4, mm1, mm2);
+    packuswb_r2r (mm4, mm3);
+    movq_r2m (mm3, *(dest+stride));
+}
+
+/* mmxext_idct (): the declare_idct template instantiated with the MMXEXT table layout and row helpers */
+declare_idct (mmxext_idct, mmxext_table,
+	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+/* Run mmxext_idct () on block in place, then write the result to dest with block_copy (). */
+void idct_block_copy_mmxext (int16_t * block, uint8_t * dest, int stride)
+{
+    mmxext_idct (block);
+    block_copy (block, dest, stride);
+}
+/* Run mmxext_idct () on block in place, then add the result onto dest with block_add (). */
+void idct_block_add_mmxext (int16_t * block, uint8_t * dest, int stride)
+{
+    mmxext_idct (block);
+    block_add (block, dest, stride);
+}
+
+/* mmx_idct (): the declare_idct template instantiated with the plain MMX table layout and row helpers */
+declare_idct (mmx_idct, mmx_table,
+	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+/* Run mmx_idct () on block in place, then write the result to dest with block_copy (). */
+void idct_block_copy_mmx (int16_t * block, uint8_t * dest, int stride)
+{
+    mmx_idct (block);
+    block_copy (block, dest, stride);
+}
+/* Run mmx_idct () on block in place, then add the result onto dest with block_add (). */
+void idct_block_add_mmx (int16_t * block, uint8_t * dest, int stride)
+{
+    mmx_idct (block);
+    block_add (block, dest, stride);
+}
+
+/* Rewrite scan_norm and scan_alt in place for the MMX/MMXEXT IDCT; each call permutes again, so call it once. */
+void idct_mmx_init (void)
+{
+    extern uint8_t scan_norm[64];
+    extern uint8_t scan_alt[64];
+    uint8_t * const tables[2] = {scan_norm, scan_alt};
+    int t, k, pos;
+
+    /* keep the row bits (0x38); rotate the three column bits right by one */
+
+    for (k = 0; k < 64; k++)
+	for (t = 0; t < 2; t++) {
+	    pos = tables[t][k];
+	    tables[t][k] = (pos & 0x38) | ((pos & 6) >> 1) | ((pos & 1) << 2);
+	}
+}
+
+#endif
diff --git a/src/libmpeg2/motion_comp.c b/src/libmpeg2/motion_comp.c
new file mode 100644
index 000000000..fd4055265
--- /dev/null
+++ b/src/libmpeg2/motion_comp.c
@@ -0,0 +1,125 @@
+/*
+ * motion_comp.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include "cpu_accel.h"
+
+mc_functions_t mc_functions;	/* active motion compensation function table; assigned by motion_comp_init () */
+
+void motion_comp_init (void)
+{
+    /* choose mc_functions from config.flags (MMXEXT, 3DNOW, MMX, then mlib, else plain C) and report it on stderr */
+#ifdef ARCH_X86
+    if (config.flags & MM_ACCEL_X86_MMXEXT) {
+	fprintf (stderr, "Using MMXEXT for motion compensation\n");
+	mc_functions = mc_functions_mmxext;
+    } else if (config.flags & MM_ACCEL_X86_3DNOW) {
+	fprintf (stderr, "Using 3DNOW for motion compensation\n");
+	mc_functions = mc_functions_3dnow;
+    } else if (config.flags & MM_ACCEL_X86_MMX) {
+	fprintf (stderr, "Using MMX for motion compensation\n");
+	mc_functions = mc_functions_mmx;
+    } else	/* no x86 flag set: this else binds to the statement after the #endif */
+#endif
+#ifdef LIBMPEG2_MLIB
+    if (config.flags & MM_ACCEL_MLIB) {
+	fprintf (stderr, "Using mlib for motion compensation\n");
+	mc_functions = mc_functions_mlib;
+    } else	/* mlib flag not set: this else binds to the C fallback block below */
+#endif
+    {
+	fprintf (stderr, "No accelerated motion compensation found\n");
+	mc_functions = mc_functions_c;
+    }
+}
+
+#define avg2(a,b) (((a)+(b)+1)>>1)	/* mean of two values, rounded up; arguments parenthesized so any expression is safe */
+#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)	/* mean of four values, rounded to nearest */
+
+#define predict_(i) (ref[i])	/* the reference sample itself */
+#define predict_x(i) (avg2 (ref[i], ref[(i)+1]))	/* averaged with the sample to its right */
+#define predict_y(i) (avg2 (ref[i], (ref+stride)[i]))	/* averaged with the sample one stride below */
+#define predict_xy(i) (avg4 (ref[i], ref[(i)+1], (ref+stride)[i], (ref+stride)[(i)+1]))
+
+#define put(predictor,i) dest[i] = predictor (i)	/* overwrite dest with the prediction */
+#define avg(predictor,i) dest[i] = avg2 (predictor (i), dest[i])	/* average the prediction into dest */
+
+/* mc function template: MC_FUNC (op, xy) defines MC_<op>_<xy>16_c and MC_<op>_<xy>8_c, which apply */
+/* op (predict_<xy>, i) to 16 resp. 8 samples per row for `height` rows; do/while means height must be >= 1. */
+#define MC_FUNC(op,xy)						\
+static void MC_##op##_##xy##16_c (uint8_t * dest, uint8_t * ref,\
+				 int stride, int height)	\
+{								\
+    do {							\
+	op (predict_##xy, 0);					\
+	op (predict_##xy, 1);					\
+	op (predict_##xy, 2);					\
+	op (predict_##xy, 3);					\
+	op (predict_##xy, 4);					\
+	op (predict_##xy, 5);					\
+	op (predict_##xy, 6);					\
+	op (predict_##xy, 7);					\
+	op (predict_##xy, 8);					\
+	op (predict_##xy, 9);					\
+	op (predict_##xy, 10);					\
+	op (predict_##xy, 11);					\
+	op (predict_##xy, 12);					\
+	op (predict_##xy, 13);					\
+	op (predict_##xy, 14);					\
+	op (predict_##xy, 15);					\
+	ref += stride;						\
+	dest += stride;						\
+    } while (--height);						\
+}								\
+static void MC_##op##_##xy##8_c (uint8_t * dest, uint8_t * ref,	\
+				int stride, int height)		\
+{								\
+    do {							\
+	op (predict_##xy, 0);					\
+	op (predict_##xy, 1);					\
+	op (predict_##xy, 2);					\
+	op (predict_##xy, 3);					\
+	op (predict_##xy, 4);					\
+	op (predict_##xy, 5);					\
+	op (predict_##xy, 6);					\
+	op (predict_##xy, 7);					\
+	ref += stride;						\
+	dest += stride;						\
+    } while (--height);						\
+}
+
+/* definitions of the actual mc functions: put/avg for each of the four predictors, 16 static functions in total */
+
+MC_FUNC (put,)
+MC_FUNC (avg,)
+MC_FUNC (put,x)
+MC_FUNC (avg,x)
+MC_FUNC (put,y)
+MC_FUNC (avg,y)
+MC_FUNC (put,xy)
+MC_FUNC (avg,xy)
+
+MOTION_COMP_EXTERN (c)	/* NOTE(review): macro defined outside this file; presumably builds mc_functions_c from the MC_*_c functions - confirm */
diff --git a/src/libmpeg2/motion_comp_mlib.c b/src/libmpeg2/motion_comp_mlib.c
new file mode 100644
index 000000000..91c0fb5a8
--- /dev/null
+++ b/src/libmpeg2/motion_comp_mlib.c
@@ -0,0 +1,180 @@
+/*
+ * motion_comp_mlib.c
+ * Copyright (C) 2000-2001 Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef LIBMPEG2_MLIB
+
+#include <inttypes.h>
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+
+#include "mpeg2_internal.h"
+
+static void MC_put_16_mlib (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    if (height == 16) 	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoCopyRef_U8_U8_16x16 (dest, ref, stride);
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoCopyRef_U8_U8_16x8 (dest, ref, stride);
+}
+
+static void MC_put_x16_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoInterpX_U8_U8_16x16 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoInterpX_U8_U8_16x8 (dest, ref, stride, stride);
+}
+
+static void MC_put_y16_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoInterpY_U8_U8_16x16 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoInterpY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+
+static void MC_put_xy16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoInterpXY_U8_U8_16x16 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoInterpXY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+
+static void MC_put_8_mlib (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoCopyRef_U8_U8_8x8 (dest, ref, stride);
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoCopyRef_U8_U8_8x4 (dest, ref, stride);
+}
+
+static void MC_put_x8_mlib (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoInterpX_U8_U8_8x8 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoInterpX_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+static void MC_put_y8_mlib (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoInterpY_U8_U8_8x8 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoInterpY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+static void MC_put_xy8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8) 	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoInterpXY_U8_U8_8x8 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoInterpXY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+static void MC_avg_16_mlib (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoCopyRefAve_U8_U8_16x16 (dest, ref, stride);
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoCopyRefAve_U8_U8_16x8 (dest, ref, stride);
+}
+
+static void MC_avg_x16_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoInterpAveX_U8_U8_16x16 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoInterpAveX_U8_U8_16x8 (dest, ref, stride, stride);
+}
+
+static void MC_avg_y16_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoInterpAveY_U8_U8_16x16 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoInterpAveY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+
+static void MC_avg_xy16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)	/* dispatch on height only: 16 selects the 16x16 routine */
+	mlib_VideoInterpAveXY_U8_U8_16x16 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 16x8 */
+	mlib_VideoInterpAveXY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+
+static void MC_avg_8_mlib (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoCopyRefAve_U8_U8_8x8 (dest, ref, stride);
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoCopyRefAve_U8_U8_8x4 (dest, ref, stride);
+}
+
+static void MC_avg_x8_mlib (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoInterpAveX_U8_U8_8x8 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoInterpAveX_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+static void MC_avg_y8_mlib (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoInterpAveY_U8_U8_8x8 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoInterpAveY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+static void MC_avg_xy8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)	/* dispatch on height only: 8 selects the 8x8 routine */
+	mlib_VideoInterpAveXY_U8_U8_8x8 (dest, ref, stride, stride);	/* the same stride is passed for both stride arguments */
+    else		/* every other height is handled as 8x4 */
+	mlib_VideoInterpAveXY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+MOTION_COMP_EXTERN (mlib)	/* NOTE(review): macro defined outside this file; presumably builds mc_functions_mlib from the MC_*_mlib functions - confirm */
+
+#endif
diff --git a/src/libmpeg2/motion_comp_mmx.c b/src/libmpeg2/motion_comp_mmx.c
new file mode 100644
index 000000000..049546b1f
--- /dev/null
+++ b/src/libmpeg2/motion_comp_mmx.c
@@ -0,0 +1,1017 @@
+/*
+ * motion_comp_mmx.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_X86
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include "attributes.h"
+#include "cpu_accel.h"
+
+#define CPU_MMXEXT 0	/* `cpu` value that makes pavg_* emit pavgb */
+#define CPU_3DNOW 1	/* `cpu` value that makes pavg_* emit pavgusb */
+
+
+/* MMX code - needs a rewrite */
+
+
+
+
+
+
+
+/* rounding constants, one per 16-bit lane; file-local like mask_one */
+static mmx_t round1 = {0x0001000100010001LL};	/* +1, added before >> 1 */
+static mmx_t round4 = {0x0002000200020002LL};	/* +2, added before >> 2 */
+
+/*
+ * This code should probably be compiled with loop unrolling
+ * (i.e., -funroll-loops in gcc) because some of the loops
+ * use a small static number of iterations. This was written
+ * with the assumption that the compiler knows best about when
+ * unrolling will help.
+ */
+
+static inline void mmx_zero_reg (void)	/* (void): a real prototype */
+{
+    /* load 0 into mm0; the averaging helpers unpack bytes against it */
+    pxor_r2r (mm0, mm0);
+}
+
+static inline void mmx_average_2_U8 (uint8_t * dest,
+				     uint8_t * src1, uint8_t * src2)
+{   /* handles 8 bytes; needs mm0 == 0 (mmx_zero_reg); uses mm1-mm4 */
+    /* *dest = (*src1 + *src2 + 1)/ 2; */
+
+    movq_m2r (*src1, mm1);	// load 8 src1 bytes
+    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+
+    movq_m2r (*src2, mm3);	// load 8 src2 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows to mm1
+    paddw_m2r (round1, mm1);	// + 1 per 16-bit lane (rounding)
+    psraw_i2r (1, mm1);		// /2
+
+    paddw_r2r (mm4, mm2);	// add highs to mm2
+    paddw_m2r (round1, mm2);	// + 1 per 16-bit lane (rounding)
+    psraw_i2r (1, mm2);		// /2
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1, *dest);	// store result in dest
+}
+
+static inline void mmx_interp_average_2_U8 (uint8_t * dest,
+					    uint8_t * src1, uint8_t * src2)
+{   /* handles 8 bytes; needs mm0 == 0 (mmx_zero_reg); uses mm1-mm6 */
+    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */
+
+    movq_m2r (*dest, mm1);	// load 8 dest bytes
+    movq_r2r (mm1, mm2);	// copy 8 dest bytes
+
+    movq_m2r (*src1, mm3);	// load 8 src1 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src1 bytes
+
+    movq_m2r (*src2, mm5);	// load 8 src2 bytes
+    movq_r2r (mm5, mm6);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low dest bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high dest bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src1 bytes
+
+    punpcklbw_r2r (mm0, mm5);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm6);	// unpack high src2 bytes
+
+    paddw_r2r (mm5, mm3);	// add src1 + src2 lows
+    paddw_m2r (round1, mm3);	// + 1 (rounding)
+    psraw_i2r (1, mm3);		// /2
+
+    paddw_r2r (mm6, mm4);	// add src1 + src2 highs
+    paddw_m2r (round1, mm4);	// + 1 (rounding)
+    psraw_i2r (1, mm4);		// /2
+
+    paddw_r2r (mm3, mm1);	// add dest + source-average lows
+    paddw_m2r (round1, mm1);	// + 1 (rounding)
+    psraw_i2r (1, mm1);		// /2
+
+    paddw_r2r (mm4, mm2);	// add dest + source-average highs
+    paddw_m2r (round1, mm2);	// + 1 (rounding)
+    psraw_i2r (1, mm2);		// /2
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1, *dest);	// store result in dest
+}
+
+static inline void mmx_average_4_U8 (uint8_t * dest,
+				     uint8_t * src1, uint8_t * src2,
+				     uint8_t * src3, uint8_t * src4)
+{   /* handles 8 bytes; needs mm0 == 0 (mmx_zero_reg); uses mm1-mm6 */
+    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */
+
+    movq_m2r (*src1, mm1);	// load 8 src1 bytes
+    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+
+    movq_m2r (*src2, mm3);	// load 8 src2 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    /* now have partials (src1 + src2) in mm1 and mm2 */
+
+    movq_m2r (*src3, mm3);	// load 8 src3 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src3 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src3 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src3 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    movq_m2r (*src4, mm5);	// load 8 src4 bytes
+    movq_r2r (mm5, mm6);	// copy 8 src4 bytes
+
+    punpcklbw_r2r (mm0, mm5);	// unpack low src4 bytes
+    punpckhbw_r2r (mm0, mm6);	// unpack high src4 bytes
+
+    paddw_r2r (mm5, mm1);	// add lows
+    paddw_r2r (mm6, mm2);	// add highs
+
+    /* now have the four-source subtotal in mm1 and mm2 */
+
+    paddw_m2r (round4, mm1);	// + 2 (rounding)
+    psraw_i2r (2, mm1);		// /4
+    paddw_m2r (round4, mm2);	// + 2 (rounding)
+    psraw_i2r (2, mm2);		// /4
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1, *dest);	// store result in dest
+}
+
+static inline void mmx_interp_average_4_U8 (uint8_t * dest,
+					    uint8_t * src1, uint8_t * src2,
+					    uint8_t * src3, uint8_t * src4)
+{   /* handles 8 bytes; needs mm0 == 0 (mmx_zero_reg); uses mm1-mm6 */
+    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */
+
+    movq_m2r (*src1, mm1);	// load 8 src1 bytes
+    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+
+    movq_m2r (*src2, mm3);	// load 8 src2 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    /* now have partials (src1 + src2) in mm1 and mm2 */
+
+    movq_m2r (*src3, mm3);	// load 8 src3 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src3 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src3 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src3 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    movq_m2r (*src4, mm5);	// load 8 src4 bytes
+    movq_r2r (mm5, mm6);	// copy 8 src4 bytes
+
+    punpcklbw_r2r (mm0, mm5);	// unpack low src4 bytes
+    punpckhbw_r2r (mm0, mm6);	// unpack high src4 bytes
+
+    paddw_r2r (mm5, mm1);	// add lows
+    paddw_r2r (mm6, mm2);	// add highs
+
+    paddw_m2r (round4, mm1);	// + 2 (rounding)
+    psraw_i2r (2, mm1);		// /4
+    paddw_m2r (round4, mm2);	// + 2 (rounding)
+    psraw_i2r (2, mm2);		// /4
+
+    /* now have subtotal/4 in mm1 and mm2 */
+
+    movq_m2r (*dest, mm3);	// load 8 dest bytes
+    movq_r2r (mm3, mm4);	// copy 8 dest bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low dest bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high dest bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    paddw_m2r (round1, mm1);	// + 1 (rounding)
+    psraw_i2r (1, mm1);		// /2
+    paddw_m2r (round1, mm2);	// + 1 (rounding)
+    psraw_i2r (1, mm2);		// /2
+
+    /* now have end value in mm1 and mm2 */
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1,*dest);	// store result in dest
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_mmx (int width, int height,
+			       uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = rounded average of dest and ref, 8 or 16 bytes per row */
+    mmx_zero_reg ();
+
+    for (;; dest += stride, ref += stride) {
+	mmx_average_2_U8 (dest, dest, ref);
+
+	if (width == 16)	/* right half of a 16-wide row */
+	    mmx_average_2_U8 (dest+8, dest+8, ref+8);
+
+	if (--height == 0)	/* last row done */
+	    break;
+    }
+}
+
+static void MC_avg_16_mmx (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{   /* dest = average of dest and ref over 16-byte-wide rows */
+    MC_avg_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_8_mmx (uint8_t * dest, uint8_t * ref,
+			  int stride, int height)
+{   /* dest = average of dest and ref over 8-byte-wide rows */
+    MC_avg_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_mmx (int width, int height,
+			       uint8_t * dest, uint8_t * ref, int stride)
+{   /* plain copy of `height` rows of 8 or 16 bytes from ref to dest */
+    mmx_zero_reg ();
+
+    for (;; dest += stride, ref += stride) {
+	/* left 8 bytes: ref -> mm1 -> dest */
+	movq_m2r (*ref, mm1);
+	movq_r2m (mm1, *dest);
+	if (width == 16) {
+	    /* right 8 bytes of a 16-wide row */
+	    movq_m2r (*(ref+8), mm1);
+	    movq_r2m (mm1, *(dest+8));
+	}
+
+	if (--height == 0)
+	    break;
+    }
+}
+
+static void MC_put_16_mmx (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{   /* copy 16-byte-wide rows from ref to dest */
+    MC_put_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_8_mmx (uint8_t * dest, uint8_t * ref,
+			  int stride, int height)
+{   /* copy 8-byte-wide rows from ref to dest */
+    MC_put_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+/* Half pixel interpolation in the x direction */
+static inline void MC_avg_x_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = avg (dest, avg (ref, ref+1)) for 8 or 16 bytes per row */
+    mmx_zero_reg ();
+
+    for (;; dest += stride, ref += stride) {
+	mmx_interp_average_2_U8 (dest, ref, ref+1);
+
+	if (width == 16)	/* right half of a 16-wide row */
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
+
+	if (--height == 0)	/* last row done */
+	    break;
+    }
+}
+
+static void MC_avg_x16_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{   /* 16-wide: dest averaged with the x-interpolated ref */
+    MC_avg_x_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_x8_mmx (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{   /* 8-wide: dest averaged with the x-interpolated ref */
+    MC_avg_x_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_x_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = avg (ref, ref+1) for 8 or 16 bytes per row */
+    mmx_zero_reg ();
+
+    for (;; dest += stride, ref += stride) {
+	mmx_average_2_U8 (dest, ref, ref+1);
+
+	if (width == 16)	/* right half of a 16-wide row */
+	    mmx_average_2_U8 (dest+8, ref+8, ref+9);
+
+	if (--height == 0)	/* last row done */
+	    break;
+    }
+}
+
+static void MC_put_x16_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{   /* 16-wide: dest = ref averaged with ref+1 */
+    MC_put_x_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_x8_mmx (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{   /* 8-wide: dest = ref averaged with ref+1 */
+    MC_put_x_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_xy_mmx (int width, int height,
+				  uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = avg (dest, 4-source average of ref and the row below it) */
+    mmx_zero_reg ();
+
+    /* every output row reads ref, ref+1 and the same two columns below */
+    for (;; dest += stride, ref += stride) {
+	uint8_t * below = ref + stride;
+
+	mmx_interp_average_4_U8 (dest, ref, ref+1, below, below+1);
+
+	if (width == 16)
+	    mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
+				     below+8, below+9);
+
+	if (--height == 0)
+	    break;
+    }
+}
+
+static void MC_avg_xy16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{   /* 16-wide: dest averaged with the xy-interpolated ref */
+    MC_avg_xy_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_xy8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{   /* 8-wide: dest averaged with the xy-interpolated ref */
+    MC_avg_xy_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_xy_mmx (int width, int height,
+				  uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = 4-source average of ref, ref+1 and the row below them */
+    mmx_zero_reg ();
+
+    /* every output row reads ref, ref+1 and the same two columns below */
+    for (;; dest += stride, ref += stride) {
+	uint8_t * below = ref + stride;
+
+	mmx_average_4_U8 (dest, ref, ref+1, below, below+1);
+
+	if (width == 16)
+	    mmx_average_4_U8 (dest+8, ref+8, ref+9, below+8, below+9);
+
+	if (--height == 0)
+	    break;
+    }
+}
+
+static void MC_put_xy16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{   /* 16-wide: dest = xy-interpolated ref */
+    MC_put_xy_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_xy8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{   /* 8-wide: dest = xy-interpolated ref */
+    MC_put_xy_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_y_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = avg (dest, avg (ref, ref+stride)) for 8 or 16 bytes per row */
+    mmx_zero_reg ();
+
+    /* every output row reads ref and the row directly below it */
+    for (;; dest += stride, ref += stride) {
+	uint8_t * below = ref + stride;
+
+	mmx_interp_average_2_U8 (dest, ref, below);
+
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, below+8);
+
+	if (--height == 0)
+	    break;
+    }
+}
+
+static void MC_avg_y16_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{   /* 16-wide: dest averaged with the y-interpolated ref */
+    MC_avg_y_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_y8_mmx (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{   /* 8-wide: dest averaged with the y-interpolated ref */
+    MC_avg_y_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_y_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{   /* dest = avg (ref, ref+stride) for 8 or 16 bytes per row */
+    mmx_zero_reg ();
+
+    /* every output row reads ref and the row directly below it */
+    for (;; dest += stride, ref += stride) {
+	uint8_t * below = ref + stride;
+
+	mmx_average_2_U8 (dest, ref, below);
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, below+8);
+
+	if (--height == 0)
+	    break;
+    }
+}
+
+static void MC_put_y16_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{   /* 16-wide: dest = ref averaged with the row below */
+    MC_put_y_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_y8_mmx (uint8_t * dest, uint8_t * ref,
+			   int stride, int height)
+{   /* 8-wide: dest = ref averaged with the row below */
+    MC_put_y_mmx (8, height, dest, ref, stride);
+}
+
+
+MOTION_COMP_EXTERN (mmx)	/* defines the mc_functions_mmx table */
+
+
+
+
+
+
+
+/* CPU_MMXEXT/CPU_3DNOW adaptation layer: `cpu` selects pavgb or pavgusb */
+
+#define pavg_r2r(src,dest)		\
+do {					\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_r2r (src, dest);		\
+    else				\
+	pavgusb_r2r (src, dest);	\
+} while (0)
+
+#define pavg_m2r(src,dest)		\
+do {					\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_m2r (src, dest);		\
+    else				\
+	pavgusb_m2r (src, dest);	\
+} while (0)
+
+
+/* CPU_MMXEXT code */
+
+
+static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride)
+{   /* copy `height` rows of 8 bytes from ref to dest through mm0 */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_put1_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride)
+{   /* copy `height` rows of 16 bytes from ref to dest through mm0/mm1 */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_avg1_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int cpu)
+{   /* dest = pavg (ref, dest), 8 bytes per row; `cpu` picks the opcode */
+    do {
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*dest, mm0);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_avg1_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int cpu)
+{   /* dest = pavg (ref, dest), 16 bytes per row; `cpu` picks the opcode */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*dest, mm0);
+	pavg_m2r (*(dest+8), mm1);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_put2_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int offset, int cpu)
+{   /* dest = pavg (ref, ref+offset), 8 bytes per row */
+    do {
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*(ref+offset), mm0);	/* callers pass offset 1 or stride */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int offset, int cpu)
+{   /* dest = pavg (ref, ref+offset), 16 bytes per row */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*(ref+offset), mm0);	/* callers pass offset 1 or stride */
+	pavg_m2r (*(ref+offset+8), mm1);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int offset, int cpu)
+{   /* dest = pavg (pavg (ref, ref+offset), dest), 8 bytes per row */
+    do {
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*(ref+offset), mm0);	/* callers pass offset 1 or stride */
+	pavg_m2r (*dest, mm0);		/* blend with what dest already holds */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static inline void MC_avg2_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int offset, int cpu)
+{   /* dest = pavg (pavg (ref, ref+offset), dest), 16 bytes per row */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*(ref+offset), mm0);	/* callers pass offset 1 or stride */
+	pavg_m2r (*(ref+offset+8), mm1);
+	pavg_m2r (*dest, mm0);		/* blend with what dest already holds */
+	pavg_m2r (*(dest+8), mm1);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);	/* body runs before the test: height must be >= 1 */
+}
+
+static mmx_t mask_one = {0x0101010101010101LL};	/* 0x01 in each byte */
+
+static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int cpu)
+{   /* 8-wide 2x2 interpolation; one row's results are reused by the next */
+    movq_m2r (*ref, mm0);
+    movq_m2r (*(ref+1), mm1);
+    movq_r2r (mm0, mm7);
+    pxor_r2r (mm1, mm7);	/* mm7 = ref ^ (ref+1) for the first row */
+    pavg_r2r (mm1, mm0);	/* mm0 = pavg (ref, ref+1) for the first row */
+    ref += stride;
+
+    do {
+	movq_m2r (*ref, mm2);
+	movq_r2r (mm0, mm5);	/* mm5 = previous row's pavg */
+
+	movq_m2r (*(ref+1), mm3);
+	movq_r2r (mm2, mm6);
+
+	pxor_r2r (mm3, mm6);	/* mm6 = xor of this row's pair */
+	pavg_r2r (mm3, mm2);	/* mm2 = pavg of this row's pair */
+
+	por_r2r (mm6, mm7);	/* either row's pair differed in a bit */
+	pxor_r2r (mm2, mm5);	/* bits where the two row averages differ */
+
+	pand_r2r (mm5, mm7);
+	pavg_r2r (mm2, mm0);	/* mm0 = pavg of the two row averages */
+
+	pand_m2r (mask_one, mm7);	/* keep only bit 0 of each byte */
+
+	psubusb_r2r (mm7, mm0);	/* subtract that 0/1 term, saturating */
+
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+
+	movq_r2r (mm6, mm7);	// unroll !
+	movq_r2r (mm2, mm0);	// unroll !
+    } while (--height);
+}
+
+static inline void MC_put4_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int cpu)
+{   /* 16-wide 2x2 interpolation, done as two independent 8-byte halves */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);	/* diagonal partner of ref */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride), mm3);	/* diagonal partner of ref+1 */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);	/* first diagonal average */
+	pavg_r2r (mm3, mm2);	/* second diagonal average */
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep only bit 0 of each byte */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* subtract that 0/1 term, saturating */
+	movq_r2m (mm0, *dest);
+
+	movq_m2r (*(ref+8), mm0);	/* same sequence for bytes 8..15 */
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg4_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int cpu)
+{   /* 8-wide 2x2 interpolation of ref, then pavg with the existing dest */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);	/* diagonal partner of ref */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride), mm3);	/* diagonal partner of ref+1 */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);	/* first diagonal average */
+	pavg_r2r (mm3, mm2);	/* second diagonal average */
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep only bit 0 of each byte */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* subtract that 0/1 term, saturating */
+	movq_m2r (*dest, mm1);
+	pavg_r2r (mm1, mm0);	/* blend with what dest already holds */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg4_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int cpu)
+{   /* 16-wide 2x2 interpolation of ref, then pavg with the existing dest */
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);	/* diagonal partner of ref */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride), mm3);	/* diagonal partner of ref+1 */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);	/* first diagonal average */
+	pavg_r2r (mm3, mm2);	/* second diagonal average */
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep only bit 0 of each byte */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* subtract that 0/1 term, saturating */
+	movq_m2r (*dest, mm1);
+	pavg_r2r (mm1, mm0);	/* blend with what dest already holds */
+	movq_r2m (mm0, *dest);
+
+	movq_m2r (*(ref+8), mm0);	/* same sequence for bytes 8..15 */
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	movq_m2r (*(dest+8), mm1);
+	pavg_r2r (mm1, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_16_mmxext (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide dest/ref average on the pavgb path */
+    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_8_mmxext (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide dest/ref average on the pavgb path */
+    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_16_mmxext (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide plain copy; no pavg involved, so no cpu selector */
+    MC_put1_16 (height, dest, ref, stride);
+}
+
+static void MC_put_8_mmxext (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide plain copy; no pavg involved, so no cpu selector */
+    MC_put1_8 (height, dest, ref, stride);
+}
+
+static void MC_avg_x16_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: ref averaged with ref+1 (offset 1), then with dest */
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_avg_x8_mmxext (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: ref averaged with ref+1 (offset 1), then with dest */
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_put_x16_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: dest = ref averaged with ref+1 (offset 1) */
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_put_x8_mmxext (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: dest = ref averaged with ref+1 (offset 1) */
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_avg_y16_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: ref averaged with the row below (offset stride), then dest */
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_y8_mmxext (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: ref averaged with the row below (offset stride), then dest */
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_put_y16_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: dest = ref averaged with the row below (offset stride) */
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_put_y8_mmxext (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: dest = ref averaged with the row below (offset stride) */
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_xy16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 16-wide 2x2 interpolation blended into dest, pavgb path */
+    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_xy8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 8-wide 2x2 interpolation blended into dest, pavgb path */
+    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_xy16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 16-wide 2x2 interpolation written to dest, pavgb path */
+    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_xy8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 8-wide 2x2 interpolation written to dest, pavgb path */
+    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+
+MOTION_COMP_EXTERN (mmxext)	/* defines the mc_functions_mmxext table */
+
+
+
+static void MC_avg_16_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide dest/ref average on the pavgusb path */
+    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_avg_8_3dnow (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide dest/ref average on the pavgusb path */
+    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_16_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide plain copy; no pavg involved, so no cpu selector */
+    MC_put1_16 (height, dest, ref, stride);
+}
+
+static void MC_put_8_3dnow (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide plain copy; no pavg involved, so no cpu selector */
+    MC_put1_8 (height, dest, ref, stride);
+}
+
+static void MC_avg_x16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: ref averaged with ref+1 (offset 1), then with dest */
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_avg_x8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: ref averaged with ref+1 (offset 1), then with dest */
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_put_x16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: dest = ref averaged with ref+1 (offset 1) */
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_put_x8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: dest = ref averaged with ref+1 (offset 1) */
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_avg_y16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: ref averaged with the row below (offset stride), then dest */
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_avg_y8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: ref averaged with the row below (offset stride), then dest */
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_put_y16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide: dest = ref averaged with the row below (offset stride) */
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_put_y8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide: dest = ref averaged with the row below (offset stride) */
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_avg_xy16_3dnow (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 16-wide 2x2 interpolation blended into dest, pavgusb path */
+    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_avg_xy8_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 8-wide 2x2 interpolation blended into dest, pavgusb path */
+    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_xy16_3dnow (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 16-wide 2x2 interpolation written to dest, pavgusb path */
+    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_xy8_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 8-wide 2x2 interpolation written to dest, pavgusb path */
+    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+
+MOTION_COMP_EXTERN (3dnow)	/* defines the mc_functions_3dnow table */
+
+#endif
diff --git a/src/libmpeg2/mpeg2.h b/src/libmpeg2/mpeg2.h
new file mode 100644
index 000000000..c83a61e7e
--- /dev/null
+++ b/src/libmpeg2/mpeg2.h
@@ -0,0 +1,67 @@
+/*
+ * mpeg2.h
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Decoder instance: the state taken by every mpeg2_* function in this file */
+
+typedef struct mpeg2dec_s {
+    vo_instance_t * output;	/* video output instance given to mpeg2_init */
+
+    /* this is where we keep the state of the decoder */
+    struct picture_s * picture;
+    
+    uint32_t shift;	/* NOTE(review): looks like start-code scan state -- confirm */
+    int is_display_initialized;
+    int is_sequence_needed;
+    int drop_flag;	/* presumably set through mpeg2_drop () -- confirm */
+    int drop_frame;
+    int in_slice;
+
+    /* the maximum chunk size is determined by vbv_buffer_size */
+    /* which is 224K for MP@ML streams. */
+    /* (we make no pretenses of decoding anything more than that) */
+    /* allocated in init - gcc has problems allocating such big structures */
+    uint8_t * chunk_buffer;
+    /* pointer to current position in chunk_buffer */
+    uint8_t * chunk_ptr;
+    /* last start code ? */
+    uint8_t code;
+
+    uint32_t pts;	/* presumably the pts given to mpeg2_decode_data -- confirm */
+
+    /* ONLY for 0.2.0 release - will not stay there later */
+    int frame_rate_code;
+} mpeg2dec_t ;
+
+
+
+
+
+/* initialize mpeg2dec for the given mm_accel value and output instance */
+void mpeg2_init (mpeg2dec_t * mpeg2dec, uint32_t mm_accel,
+		 vo_instance_t * output);
+
+/* destroy everything which was allocated, shutdown the output */
+void mpeg2_close (mpeg2dec_t * mpeg2dec);
+
+int mpeg2_decode_data (mpeg2dec_t * mpeg2dec,	/* TODO: document the return value */
+		       uint8_t * data_start, uint8_t * data_end, uint32_t pts);
+
+void mpeg2_drop (mpeg2dec_t * mpeg2dec, int flag);
diff --git a/src/libmpeg2/mpeg2_internal.h b/src/libmpeg2/mpeg2_internal.h
new file mode 100644
index 000000000..d3a92eb74
--- /dev/null
+++ b/src/libmpeg2/mpeg2_internal.h
@@ -0,0 +1,194 @@
+/*
+ * mpeg2_internal.h
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* macroblock modes (single-bit flags, 1 through 32) */
+#define MACROBLOCK_INTRA 1
+#define MACROBLOCK_PATTERN 2
+#define MACROBLOCK_MOTION_BACKWARD 4
+#define MACROBLOCK_MOTION_FORWARD 8
+#define MACROBLOCK_QUANT 16
+#define DCT_TYPE_INTERLACED 32
+/* motion_type (two-bit field above the flags: values 64 and 128) */
+#define MOTION_TYPE_MASK (3*64)
+#define MOTION_TYPE_BASE 64
+#define MC_FIELD (1*64)
+#define MC_FRAME (2*64)
+#define MC_16X8 (2*64)	/* same value as MC_FRAME */
+#define MC_DMV (3*64)
+
+/* picture structure */
+#define TOP_FIELD 1
+#define BOTTOM_FIELD 2
+#define FRAME_PICTURE 3	/* equals TOP_FIELD | BOTTOM_FIELD */
+
+/* picture coding type */
+#define I_TYPE 1
+#define P_TYPE 2
+#define B_TYPE 3
+#define D_TYPE 4
+
+typedef struct motion_s {	/* one per direction: see f_motion / b_motion */
+    uint8_t * ref[2][3];	/* NOTE(review): presumably [field][plane] -- confirm */
+    int pmv[2][2];	/* NOTE(review): presumably vector predictors -- confirm */
+    int f_code[2];
+} motion_t;
+
+typedef struct picture_s {	/* slice-level state first, then picture-level */
+    /* first, state that carries information from one macroblock to the */
+    /* next inside a slice, and is never used outside of slice_process() */
+
+    /* DCT coefficients - should be kept aligned ! */
+    int16_t DCTblock[64];
+
+    /* bit parsing stuff */
+    uint32_t bitstream_buf;	/* current 32 bit working set of buffer */
+    int bitstream_bits;		/* used bits in working set */
+    uint8_t * bitstream_ptr;	/* buffer with stream data */
+
+    /* Motion vectors */
+    /* The f_ and b_ correspond to the forward and backward motion */
+    /* predictors */
+    motion_t b_motion;
+    motion_t f_motion;
+
+    /* predictor for DC coefficients in intra blocks */
+    int16_t dc_dct_pred[3];
+
+    int quantizer_scale;	/* remove */
+    int current_field;		/* remove */
+    int v_offset;		/* remove */
+
+
+    /* now non-slice-specific information */
+
+    /* sequence header stuff */
+    uint8_t intra_quantizer_matrix [64];
+    uint8_t non_intra_quantizer_matrix [64];
+
+    /* The width and height of the picture snapped to macroblock units */
+    int coded_picture_width;
+    int coded_picture_height;
+
+    /* picture header stuff */
+
+    /* what type of picture this is (I, P, B, D): one of the *_TYPE values */
+    int picture_coding_type;
+	
+    /* picture coding extension stuff */
+	
+    /* quantization factor for intra dc coefficients */
+    int intra_dc_precision;
+    /* top/bottom/both fields: TOP_FIELD, BOTTOM_FIELD or FRAME_PICTURE */
+    int picture_structure;
+    /* bool to indicate all predictions are frame based */
+    int frame_pred_frame_dct;
+    /* bool to indicate whether intra blocks have motion vectors */
+    /* (for concealment) */
+    int concealment_motion_vectors;
+    /* bit to indicate which quantization table to use */
+    int q_scale_type;
+    /* bool to use different vlc tables */
+    int intra_vlc_format;
+    /* used for DMV MC */
+    int top_field_first;
+
+    /* stuff derived from bitstream */
+
+    /* pointer to the zigzag scan we're supposed to be using */
+    uint8_t * scan;
+
+    struct vo_frame_s * current_frame;
+    struct vo_frame_s * forward_reference_frame;
+    struct vo_frame_s * backward_reference_frame;
+
+    int second_field;
+
+    int mpeg1;	/* NOTE(review): presumably nonzero for MPEG-1 streams -- confirm */
+
+    /* these things are not needed by the decoder */
+    /* this is a temporary interface, we will build a better one later. */
+    int aspect_ratio_information;
+    int frame_rate_code;
+    int progressive_sequence;
+    int repeat_first_field;
+    int progressive_frame;
+    int bitrate;
+} picture_t;
+
+typedef struct mpeg2_config_s {
+    /* Bit flags that enable various things */
+    uint32_t flags;
+} mpeg2_config_t;
+
+/* The only global variable, */
+/* the config struct (declared here, defined in another file) */
+extern mpeg2_config_t config;
+
+
+
+/* header.c */
+void header_state_init (picture_t * picture);
+int header_process_picture_header (picture_t * picture, uint8_t * buffer);
+int header_process_sequence_header (picture_t * picture, uint8_t * buffer);
+int header_process_extension (picture_t * picture, uint8_t * buffer);
+
+/* idct.c */
+void idct_init (void);
+
+/* idct_mlib.c */
+void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride);
+void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride);
+
+/* idct_mmx.c */
+void idct_block_copy_mmxext (int16_t *block, uint8_t * dest, int stride);
+void idct_block_add_mmxext (int16_t *block, uint8_t * dest, int stride);
+void idct_block_copy_mmx (int16_t *block, uint8_t * dest, int stride);
+void idct_block_add_mmx (int16_t *block, uint8_t * dest, int stride);
+void idct_mmx_init (void);
+
+/* motion_comp.c */
+void motion_comp_init (void);
+
+typedef struct mc_functions_s
+{   /* slot order, per MOTION_COMP_EXTERN: 16, x16, y16, xy16, 8, x8, y8, xy8 */
+    void (* put [8]) (uint8_t *dst, uint8_t *, int32_t, int32_t);
+    void (* avg [8]) (uint8_t *dst, uint8_t *, int32_t, int32_t);
+} mc_functions_t;
+
+#define MOTION_COMP_EXTERN(x) mc_functions_t mc_functions_##x =		\
+{									\
+    {MC_put_16_##x, MC_put_x16_##x, MC_put_y16_##x, MC_put_xy16_##x,	\
+     MC_put_8_##x,  MC_put_x8_##x,  MC_put_y8_##x,  MC_put_xy8_##x},	\
+    {MC_avg_16_##x, MC_avg_x16_##x, MC_avg_y16_##x, MC_avg_xy16_##x,	\
+     MC_avg_8_##x,  MC_avg_x8_##x,  MC_avg_y8_##x,  MC_avg_xy8_##x}	\
+};	/* expands to a whole definition with its own ';' -- add none at uses */
+
+extern mc_functions_t mc_functions_c;
+extern mc_functions_t mc_functions_mmx;	/* MOTION_COMP_EXTERN (mmx) */
+extern mc_functions_t mc_functions_mmxext;	/* MOTION_COMP_EXTERN (mmxext) */
+extern mc_functions_t mc_functions_3dnow;	/* MOTION_COMP_EXTERN (3dnow) */
+extern mc_functions_t mc_functions_mlib;	/* MOTION_COMP_EXTERN (mlib) */
+
+/* slice.c */
+int slice_process (picture_t *picture, uint8_t code, uint8_t * buffer);
+
+/* stats.c */
+void stats_header (uint8_t code, uint8_t * buffer);
diff --git a/src/libmpeg2/slice.c b/src/libmpeg2/slice.c
new file mode 100644
index 000000000..727adb3d9
--- /dev/null
+++ b/src/libmpeg2/slice.c
@@ -0,0 +1,1799 @@
+/*
+ * slice.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <inttypes.h>
+
+#include "video_out.h"
+#include "mpeg2_internal.h"
+#include "attributes.h"
+/* Motion compensation table and IDCT entry points used below; only declared here, so they are defined outside this file. */
+extern mc_functions_t mc_functions;
+extern void (* idct_block_copy) (int16_t * block, uint8_t * dest, int stride);
+extern void (* idct_block_add) (int16_t * block, uint8_t * dest, int stride);
+
+#include "vlc.h"
+/* Value returned by get_quantizer_scale for each 5-bit code when picture->q_scale_type is set; never written, hence const. */
+static const int non_linear_quantizer_scale [] = {
+     0,  1,  2,  3,  4,  5,   6,   7,
+     8, 10, 12, 14, 16, 18,  20,  22,
+    24, 28, 32, 36, 40, 44,  48,  52,
+    56, 64, 72, 80, 88, 96, 104, 112
+};
+/* Decode the macroblock type code for the current picture type, plus any motion-type and DCT-type bits that follow it; returns the mode flags. */
+static inline int get_macroblock_modes (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int macroblock_modes;
+    MBtab * tab;
+
+    switch (picture->picture_coding_type) {
+    case I_TYPE:
+	/* MB_I is indexed with UBITS (bit_buf, 1); the entry supplies the code length and the mode flags */
+	tab = MB_I + UBITS (bit_buf, 1);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+	/* a DCT-type bit follows only in frame pictures that do not set frame_pred_frame_dct */
+	if ((! (picture->frame_pred_frame_dct)) &&
+	    (picture->picture_structure == FRAME_PICTURE)) {
+	    macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+	    DUMPBITS (bit_buf, bits, 1);
+	}
+
+	return macroblock_modes;
+
+    case P_TYPE:
+	/* MB_P is indexed with UBITS (bit_buf, 5) */
+	tab = MB_P + UBITS (bit_buf, 5);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+	/* non-frame pictures: a 2-bit motion type follows when forward motion is flagged */
+	if (picture->picture_structure != FRAME_PICTURE) {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (picture->frame_pred_frame_dct) {	/* frame picture with frame_pred_frame_dct: no extra bits, motion type is MC_FRAME */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
+		macroblock_modes |= MC_FRAME;
+	    return macroblock_modes;
+	} else {	/* other frame pictures: optional 2-bit motion type, then optional DCT-type bit */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case B_TYPE:
+	/* MB_B is indexed with UBITS (bit_buf, 6) */
+	tab = MB_B + UBITS (bit_buf, 6);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+	/* same three cases as P_TYPE, but the motion type is read for every macroblock that is not intra */
+	if (picture->picture_structure != FRAME_PICTURE) {
+	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (picture->frame_pred_frame_dct) {
+	    /* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
+	    macroblock_modes |= MC_FRAME;	/* set unconditionally, intra macroblocks included (the test above is disabled) */
+	    return macroblock_modes;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_INTRA)
+		goto intra;	/* intra: no motion type is read, jump straight to the DCT-type bit */
+	    macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+	    DUMPBITS (bit_buf, bits, 2);
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+	    intra:
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case D_TYPE:
+	/* a single bit is consumed and the macroblock is always reported as intra */
+	DUMPBITS (bit_buf, bits, 1);
+	return MACROBLOCK_INTRA;
+
+    default:	/* any other picture coding type: nothing is consumed, no flags are returned */
+	return 0;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Read the 5-bit quantizer scale code from the bitstream and convert it to a quantizer scale value. */
+static inline int get_quantizer_scale (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    /* fetch the code first, then account for the five bits it occupied */
+    const int code = UBITS (bit_buf, 5);
+
+    DUMPBITS (bit_buf, bits, 5);
+
+    /* q_scale_type set: table lookup; otherwise the scale is twice the code */
+    return picture->q_scale_type
+	? non_linear_quantizer_scale [code]
+	: (code << 1);
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one motion vector delta (table code, sign bit, then f_code extra bits) and return it as a signed value; 0 for the one-bit code. */
+static inline int get_motion_delta (picture_t * picture, int f_code)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    int delta;
+    int sign;
+    MVtab * tab;
+    /* top bit set: a one-bit code meaning a delta of zero */
+    if (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 1);
+	return 0;
+    } else if (bit_buf >= 0x0c000000) {
+	/* shorter codes: MV_4 lookup; code, sign and extra bits are all counted in one addition to bits */
+	tab = MV_4 + UBITS (bit_buf, 4);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + f_code + 1;
+	bit_buf <<= tab->len;
+	/* sign serves as a 0 / -1 mask: (delta ^ sign) - sign negates delta when it is -1 */
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+	/* the extra bits are only read when f_code is nonzero */
+	if (f_code)
+	    delta += UBITS (bit_buf, f_code);
+	bit_buf <<= f_code;
+
+	return (delta ^ sign) - sign;
+
+    } else {
+	/* longer codes: MV_10 lookup; only the code and the sign bit are counted here */
+	tab = MV_10 + UBITS (bit_buf, 10);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+	/* refill before reading the extra bits, which are consumed separately on this path */
+	if (f_code) {
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    delta += UBITS (bit_buf, f_code);
+	    DUMPBITS (bit_buf, bits, f_code);
+	}
+
+	return (delta ^ sign) - sign;
+
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Wrap vector back into [-limit, limit), where limit = 16 << f_code; at most one wrap of 2 * limit is applied. */
+static inline int bound_motion_vector (int vector, int f_code)
+{
+#if 1
+    const int limit = 16 << f_code;
+    const int span = 2 * limit;	/* width of the legal range */
+    /* at or above the upper bound: step down by one full range */
+    if (vector >= limit)
+	vector -= span;
+    /* below the lower bound: step up by one full range */
+    else if (vector < -limit)
+	vector += span;
+    return vector;
+#else
+    return (vector << (27 - f_code)) >> (27 - f_code);
+#endif
+}
+/* Decode one code through the DMV_2 table and return the dmv value stored in the matching entry. */
+static inline int get_dmv (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    /* the table is indexed with UBITS (bit_buf, 2) */
+    DMVtab * entry = &DMV_2 [UBITS (bit_buf, 2)];
+
+    /* drop only as many bits as the matched entry says the code uses */
+    DUMPBITS (bit_buf, bits, entry->len);
+    return entry->dmv;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode the coded block pattern code and return the cbp value of the matching table entry. */
+static inline int get_coded_block_pattern (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    CBPtab * entry;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    /*
+     * Buffers of at least 0x20000000 are looked up with 7 bits; their
+     * 7-bit index is never below 16, which is why CBP_7 is addressed
+     * with an offset of -16.  Smaller buffers use the 9-bit table.
+     */
+    if (bit_buf >= 0x20000000)
+	entry = CBP_7 - 16 + UBITS (bit_buf, 7);
+    else
+	entry = CBP_9 + UBITS (bit_buf, 9);
+
+    /* consume the matched code and hand back its pattern */
+    DUMPBITS (bit_buf, bits, entry->len);
+    return entry->cbp;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode a size code through DC_lum_5 / DC_long, then a value of that many bits; returns the signed DC difference (0 when size is 0). */
+static inline int get_luma_dc_dct_diff (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    DCtab * tab;
+    int size;
+    int dc_diff;
+    /* buffers below 0xf8000000 are resolved by the 5-bit table; the rest use DC_long with a 9-bit index */
+    if (bit_buf < 0xf8000000) {
+	tab = DC_lum_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* size code and value bits are counted together */
+	    bit_buf <<= tab->len;
+	    dc_diff =	/* presumably: leading value bit clear means negative, so 2^size - 1 is subtracted -- confirm UBITS / SBITS in vlc.h */
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff;
+	} else {
+	    DUMPBITS (bit_buf, bits, 3);	/* size 0: three bits are consumed and no value follows */
+	    return 0;
+	}
+    } else {
+	tab = DC_long - 0x1e0 + UBITS (bit_buf, 9);	/* 0x1e0 is the smallest 9-bit index that reaches this branch */
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len);
+	NEEDBITS (bit_buf, bits, bit_ptr);	/* refill before reading the value bits */
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode a size code through DC_chrom_5 / DC_long, then a value of that many bits; returns the signed DC difference (0 when size is 0). */
+static inline int get_chroma_dc_dct_diff (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    DCtab * tab;
+    int size;
+    int dc_diff;
+    /* buffers below 0xf8000000 are resolved by the 5-bit table; the rest use DC_long with a 10-bit index */
+    if (bit_buf < 0xf8000000) {
+	tab = DC_chrom_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* size code and value bits are counted together */
+	    bit_buf <<= tab->len;
+	    dc_diff =	/* presumably: leading value bit clear means negative, so 2^size - 1 is subtracted -- confirm UBITS / SBITS in vlc.h */
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff;
+	} else {
+	    DUMPBITS (bit_buf, bits, 2);	/* size 0: two bits are consumed and no value follows */
+	    return 0;
+	}
+    } else {
+	tab = DC_long - 0x3e0 + UBITS (bit_buf, 10);	/* 0x3e0 is the smallest 10-bit index that reaches this branch */
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len + 1);	/* one bit more than the shared DC_long entry records */
+	NEEDBITS (bit_buf, bits, bit_ptr);	/* refill before reading the value bits */
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Clamp val (an int lvalue, evaluated more than once) to [-2048, 2047]; a single unsigned compare tests both bounds. */
+#define SATURATE(val)				\
+do {						\
+    if ((uint32_t)((val) + 2048) > 4095)	\
+	(val) = ((val) > 0) ? 2047 : -2048;	\
+} while (0)
+/* Decode the AC coefficients of one intra block with the DCT_B14* tables, dequantize them with the intra matrix and store them in picture->DCTblock. */
+static void get_intra_block_B14 (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    dest = picture->DCTblock;
+    i = 0;
+    mismatch = ~dest[0];	/* XOR accumulator seeded from the DC term; only its bit 0 is used at the end */
+    /* decode from local copies of the bitstream state; they are stored back before returning */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+	    /* 5-bit lookup; an entry whose run takes i to 64 or beyond ends the block */
+	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i];	/* scan translates the running index i into the position written in dest */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* + 1 accounts for the sign bit handled below */
+	    val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; here an entry whose run takes i to 64 or beyond is the escape code */
+	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* the run is the 6 bits after the first 6; the - 64 undoes the run just added -- presumably 64, confirm in vlc.h */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+	    /* skip the 12 bits of escape code and run, then take the level as a signed 12-bit field */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = (SBITS (bit_buf, 12) *
+		   quantizer_scale * quant_matrix[j]) / 16;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {	/* progressively longer codes: 10-, 13-, 15- and 16-bit lookups */
+	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;	/* the 16 index bits are shifted out and GETWORD is invoked before the entry is used */
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1;	/* mismatch control: bit 0 of the last coefficient is flipped according to the accumulated parity */
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* Decode the AC coefficients of one intra block with the DCT_B15* tables, dequantize them with the intra matrix and store them in picture->DCTblock. */
+static void get_intra_block_B15 (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    dest = picture->DCTblock;
+    i = 0;
+    mismatch = ~dest[0];	/* XOR accumulator seeded from the DC term; only its bit 0 is used at the end */
+    /* decode from local copies of the bitstream state; they are stored back before returning */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup covers the shortest codes, the end of block code and the escape code */
+	    tab = DCT_B15_8 - 4 + UBITS (bit_buf, 8);
+
+	    i += tab->run;
+	    if (i < 64) {
+
+	    normal_code:
+		j = scan[i];	/* scan translates the running index i into the position written in dest */
+		bit_buf <<= tab->len;
+		bits += tab->len + 1;	/* + 1 accounts for the sign bit handled below */
+		val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+
+		/* if (bitstream_get (1)) val = -val; */
+		val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		bit_buf <<= 1;
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    } else {
+
+		/* end of block. I commented out this code because if we */
+		/* dont exit here we will still exit at the later test :) */
+
+		/* if (i >= 128) break;	*/	/* end of block */
+
+		/* escape code */
+		/* the run is the 6 bits after the first 6; the - 64 undoes the run just added -- presumably 64 for escape, confirm in vlc.h */
+		i += UBITS (bit_buf << 6, 6) - 64;
+		if (i >= 64)
+		    break;	/* illegal, check against buffer overflow */
+
+		j = scan[i];
+		/* skip the 12 bits of escape code and run, then take the level as a signed 12-bit field */
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		val = (SBITS (bit_buf, 12) *
+		       quantizer_scale * quant_matrix[j]) / 16;
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    }
+	} else if (bit_buf >= 0x02000000) {	/* progressively longer codes: 10-, 13-, 15- and 16-bit lookups */
+	    tab = DCT_B15_10 - 8 + UBITS (bit_buf, 10);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;	/* the 16 index bits are shifted out and GETWORD is invoked before the entry is used */
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1;	/* mismatch control: bit 0 of the last coefficient is flipped according to the accumulated parity */
+    DUMPBITS (bit_buf, bits, 4);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* Decode all coefficients of one non-intra block, dequantize them with the non-intra matrix and store them in picture->DCTblock. */
+static void get_non_intra_block (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = -1;	/* presumably so that the first decoded run can address coefficient 0 -- confirm against the run values in vlc.h */
+    mismatch = 1;
+    dest = picture->DCTblock;
+    /* decode from local copies of the bitstream state; they are stored back before returning */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) {	/* the first code uses DCT_B14DC_5 rather than DCT_B14AC_5, so the loop below is entered in the middle */
+	tab = DCT_B14DC_5 - 5 + UBITS (bit_buf, 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+	    /* 5-bit lookup; an entry whose run takes i to 64 or beyond ends the block */
+	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i];	/* scan translates the running index i into the position written in dest */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* + 1 accounts for the sign bit handled below */
+	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; here an entry whose run takes i to 64 or beyond is the escape code */
+	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* the run is the 6 bits after the first 6; the - 64 undoes the run just added -- presumably 64, confirm in vlc.h */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+	    /* signed 12-bit level; adding SBITS (bit_buf, 1) presumably turns 2 * level + 1 into 2 * level - 1 for negative levels -- confirm SBITS */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1;
+	    val = (val * quantizer_scale * quant_matrix[j]) / 32;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {	/* progressively longer codes: 10-, 13-, 15- and 16-bit lookups */
+	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;	/* the 16 index bits are shifted out and GETWORD is invoked before the entry is used */
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1;	/* mismatch control: bit 0 of the last coefficient is flipped according to the accumulated parity */
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* MPEG-1 intra block: decode the AC coefficients into picture->DCTblock, forcing every value odd instead of tracking a mismatch parity. */
+static void get_mpeg1_intra_block (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = 0;
+    dest = picture->DCTblock;
+    /* decode from local copies of the bitstream state; they are stored back before returning */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+	    /* 5-bit lookup; an entry whose run takes i to 64 or beyond ends the block */
+	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i];	/* scan translates the running index i into the position written in dest */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* + 1 accounts for the sign bit handled below */
+	    val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+
+	    /* oddification */
+	    val = (val - 1) | 1;	/* odd values are unchanged, even values become val - 1 */
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; here an entry whose run takes i to 64 or beyond is the escape code */
+	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* the run is the 6 bits after the first 6; the - 64 undoes the run just added -- presumably 64, confirm in vlc.h */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+	    /* signed 8-bit level; when its low seven bits are clear (0 or -128) a second byte follows and is combined with it */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;	/* yields the second byte, or the second byte - 256 */
+	    }
+	    val = (val * quantizer_scale * quant_matrix[j]) / 16;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;	/* presumably moves even values one step toward zero for either sign -- confirm SBITS on a signed int */
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {	/* progressively longer codes: 10-, 13-, 15- and 16-bit lookups */
+	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;	/* the 16 index bits are shifted out and GETWORD is invoked before the entry is used */
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* MPEG-1 non-intra block: decode all coefficients into picture->DCTblock, forcing every value odd instead of tracking a mismatch parity. */
+static void get_mpeg1_non_intra_block (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = -1;	/* presumably so that the first decoded run can address coefficient 0 -- confirm against the run values in vlc.h */
+    dest = picture->DCTblock;
+    /* decode from local copies of the bitstream state; they are stored back before returning */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) {	/* the first code uses DCT_B14DC_5 rather than DCT_B14AC_5, so the loop below is entered in the middle */
+	tab = DCT_B14DC_5 - 5 + UBITS (bit_buf, 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+	    /* 5-bit lookup; an entry whose run takes i to 64 or beyond ends the block */
+	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i];	/* scan translates the running index i into the position written in dest */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* + 1 accounts for the sign bit handled below */
+	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5;
+
+	    /* oddification */
+	    val = (val - 1) | 1;	/* odd values are unchanged, even values become val - 1 */
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; here an entry whose run takes i to 64 or beyond is the escape code */
+	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* the run is the 6 bits after the first 6; the - 64 undoes the run just added -- presumably 64, confirm in vlc.h */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+	    /* signed 8-bit level; when its low seven bits are clear (0 or -128) a second byte follows and is combined with it */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;	/* yields the second byte, or the second byte - 256 */
+	    }
+	    val = 2 * (val + SBITS (val, 1)) + 1;	/* presumably 2 * level + 1 for positive and 2 * level - 1 for negative levels -- confirm SBITS on a signed int */
+	    val = (val * quantizer_scale * quant_matrix[j]) / 32;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;	/* presumably moves even values one step toward zero for either sign -- confirm SBITS on a signed int */
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {	/* progressively longer codes: 10-, 13-, 15- and 16-bit lookups */
+	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;	/* the 16 index bits are shifted out and GETWORD is invoked before the entry is used */
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* Decode the macroblock address increment, adding 33 per escape code and skipping stuffing; returns 0 at end of slice or on error. */
+static inline int get_macroblock_address_increment (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    MBAtab * tab;
+    int mba;
+
+    mba = 0;
+    /* loop until a real increment code (or an invalid code) is found; escapes and stuffing keep the loop going */
+    while (1) {
+	if (bit_buf >= 0x10000000) {	/* shorter codes: MBA_5 lookup */
+	    tab = MBA_5 - 2 + UBITS (bit_buf, 5);
+	    DUMPBITS (bit_buf, bits, tab->len);
+	    return mba + tab->mba;
+	} else if (bit_buf >= 0x03000000) {	/* longer codes: MBA_11 lookup */
+	    tab = MBA_11 - 24 + UBITS (bit_buf, 11);
+	    DUMPBITS (bit_buf, bits, tab->len);
+	    return mba + tab->mba;
+	} else switch (UBITS (bit_buf, 11)) {	/* remaining 11-bit patterns are special codes */
+	case 8:		/* macroblock_escape */
+	    mba += 33;
+	    /* no break here on purpose */
+	case 15:	/* macroblock_stuffing (MPEG1 only) */
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    break;
+	default:	/* end of slice, or error */
+	    return 0;
+	}
+    }
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one intra block: update the DC predictor of component cc, decode the AC coefficients and pass the block to idct_block_copy for dest. */
+static inline void slice_intra_DCT (picture_t * picture, int cc,
+				    uint8_t * dest, int stride)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    /* Get the intra DC coefficient and inverse quantize it.  The scaling multiplies rather than shifts: the predictor can be negative and left-shifting a negative value is undefined behaviour in C. */
+    if (cc == 0)
+	picture->dc_dct_pred[0] += get_luma_dc_dct_diff (picture);
+    else
+	picture->dc_dct_pred[cc] += get_chroma_dc_dct_diff (picture);
+    picture->DCTblock[0] =
+	picture->dc_dct_pred[cc] * (1 << (3 - picture->intra_dc_precision));
+    memset (picture->DCTblock + 1, 0, 63 * sizeof (int16_t));
+    /* AC coefficients: MPEG-1 decoder (skipped entirely for D pictures), otherwise B15 or B14 tables depending on intra_vlc_format */
+    if (picture->mpeg1) {
+	if (picture->picture_coding_type != D_TYPE)
+	    get_mpeg1_intra_block (picture);
+    } else if (picture->intra_vlc_format)
+	get_intra_block_B15 (picture);
+    else
+	get_intra_block_B14 (picture);
+    idct_block_copy (picture->DCTblock, dest, stride);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one non-intra block into a zeroed coefficient buffer and pass it to idct_block_add for dest. */
+static inline void slice_non_intra_DCT (picture_t * picture, uint8_t * dest,
+					int stride)
+{
+    int16_t * const coeffs = picture->DCTblock;
+
+    memset (coeffs, 0, 64 * sizeof *coeffs);	/* the decoders only store the coefficients that are coded */
+    /* MPEG-1 streams use their own block decoder */
+    (picture->mpeg1 ? get_mpeg1_non_intra_block : get_non_intra_block) (picture);
+    idct_block_add (coeffs, dest, stride);
+}
+/* Luma prediction: bit 0 of each vector component picks one of table[0..3]; the remaining bits, scaled by stride, give the source offset in src[0]. */
+#define MOTION_Y(table,offset_x,offset_y,motion_x,motion_y,		\
+		 dest,src,offset_dest,offset_src,stride,height)		\
+do {									\
+    int xy_half;							\
+    int total_offset;							\
+									\
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			\
+    total_offset = ((offset_y + (motion_y >> 1)) * stride +		\
+		    offset_x + (motion_x >> 1) + (offset_src));		\
+    table[xy_half] (dest[0] + offset_x + (offset_dest),			\
+		    src[0] + total_offset, stride, height);		\
+} while (0)
+/* Chroma prediction for planes 1 and 2: position plus vector is halved, and the routine comes from table[4..7] (4 + xy_half). */
+#define MOTION_UV(table,offset_x,offset_y,motion_x,motion_y,		\
+		  dest,src,offset_dest,offset_src,stride,height)	\
+do {									\
+    int xy_half;							\
+    int total_offset;							\
+									\
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			\
+    total_offset = (((offset_y + motion_y) >> 1) * (stride) +		\
+		    ((offset_x + motion_x) >> 1) + (offset_src));	\
+    table[4+xy_half] (dest[1] + (offset_x >> 1) + (offset_dest),	\
+		      src[1] + total_offset, stride, height);		\
+    table[4+xy_half] (dest[2] + (offset_x >> 1) + (offset_dest),	\
+		      src[2] + total_offset, stride, height);		\
+} while (0)
+/* Predict one area: the luma plane with the given vector, then both chroma planes with the vector, stride and height halved. */
+static inline void motion_block (void (** table) (uint8_t *, uint8_t *,
+						  int32_t, int32_t),
+				 int x_offset, int y_offset, int mb_y_8_offset,
+				 int src_field, int dest_field,
+				 int x_pred, int y_pred,
+				 uint8_t * dest[3], uint8_t * src[3],
+				 int stride, int height)
+{
+    const int chroma_x = x_pred / 2;
+    const int chroma_y = y_pred / 2;
+    const int chroma_stride = stride >> 1;
+    const int chroma_height = height >> 1;
+    /* luma: destination offset is dest_field plus mb_y_8_offset blocks of 8 lines */
+    MOTION_Y (table, x_offset, y_offset, x_pred, y_pred, dest, src,
+	      dest_field + mb_y_8_offset*8*stride, src_field, stride, height);
+    /* chroma: the same layout with 4-line blocks, halved field offsets and the halved stride */
+    MOTION_UV (table, x_offset, y_offset, chroma_x, chroma_y, dest, src,
+	       (dest_field >> 1) + mb_y_8_offset*4*chroma_stride, src_field >> 1,
+	       chroma_stride, chroma_height);
+}
+/* MPEG-1 prediction: decode one vector (both components use f_code[0]), store it in pmv[0] and predict 16 lines from motion->ref[0]. */
+static void motion_mp1 (picture_t * picture, motion_t * motion,
+			uint8_t * dest[3], int offset, int stride,
+			void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    /* horizontal component: predictor plus decoded delta, wrapped into range */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion_x;
+    /* vertical component, handled the same way */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[0]);
+    motion->pmv[0][1] = motion_y;
+    /* f_code[1] acts as a flag that doubles the vector (presumably full-pel vectors -- confirm where it is set); multiply, because left-shifting a negative value is undefined */
+    if (motion->f_code[1]) {
+	motion_x *= 2;
+	motion_y *= 2;
+    }
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  motion_x, motion_y, dest, motion->ref[0], stride, 16);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* MPEG-1 prediction that reads nothing from the bitstream: the vector stored in pmv[0] is applied again. */
+static void motion_mp1_reuse (picture_t * picture, motion_t * motion,
+			      uint8_t * dest[3], int offset, int stride,
+			      void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    int motion_x, motion_y;
+
+    motion_x = motion->pmv[0][0];
+    motion_y = motion->pmv[0][1];
+    /* same doubling as motion_mp1; multiply, because left-shifting a negative component is undefined behaviour */
+    if (motion->f_code[1]) {
+	motion_x *= 2;
+	motion_y *= 2;
+    }
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  motion_x, motion_y, dest, motion->ref[0], stride, 16);
+}
+/* Frame prediction in a frame picture: decode one vector, store it in both pmv[0] and pmv[1], and predict 16 lines from motion->ref[0]. */
+static void motion_fr_frame (picture_t * picture, motion_t * motion,
+			     uint8_t * dest[3], int offset, int stride,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int mv_x, mv_y;
+    /* horizontal component is coded with f_code[0] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    mv_x = motion->pmv[0][0];
+    mv_x += get_motion_delta (picture, motion->f_code[0]);
+    mv_x = bound_motion_vector (mv_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion->pmv[1][0] = mv_x;
+    /* vertical component is coded with f_code[1] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    mv_y = motion->pmv[0][1];
+    mv_y += get_motion_delta (picture, motion->f_code[1]);
+    mv_y = bound_motion_vector (mv_y, motion->f_code[1]);
+    motion->pmv[0][1] = motion->pmv[1][1] = mv_y;
+    /* one prediction with the caller's table, no field offsets */
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  mv_x, mv_y, dest, motion->ref[0], stride, 16);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Field prediction in a frame picture: two vectors, each with its own field select bit, each predicting 8 lines at a doubled stride. */
+static void motion_fr_field (picture_t * picture, motion_t * motion,
+			     uint8_t * dest[3], int offset, int stride,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    int field_select;
+    /* first vector: field_select is used as a mask below (field_select & stride), i.e. a source offset of 0 or stride */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    field_select = SBITS (bit_buf, 1);
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion_x;
+    /* the vertical predictor is halved for decoding and stored back doubled; multiply, because left-shifting a negative value is undefined */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[0][1] = motion_y * 2;
+    /* destination lines starting at offset 0 */
+    motion_block (table, offset, picture->v_offset >> 1,
+		  0, (field_select & stride), 0,
+		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+    /* second vector: same steps with pmv[1] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    field_select = SBITS (bit_buf, 1);
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[1][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[1][1] >> 1) + get_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[1][1] = motion_y * 2;
+    /* destination lines starting at offset stride */
+    motion_block (table, offset, picture->v_offset >> 1,
+		  0, (field_select & stride), stride,
+		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Dual prime style prediction in a frame picture: one vector plus a dmv pair; each destination field is put once, then averaged with a second prediction.  The table argument is not used. */
+static void motion_fr_dmv (picture_t * picture, motion_t * motion,
+			   uint8_t * dest[3], int offset, int stride,
+			   void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    int dmv_x, dmv_y;
+    int m;
+    int other_x, other_y;
+    /* horizontal component; both predictors are set to it */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+    /* horizontal differential */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    dmv_x = get_dmv (picture);
+    /* the vertical predictor is halved for decoding and stored back doubled; multiply, because left-shifting a negative value is undefined */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y * 2;
+    /* vertical differential */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    dmv_y = get_dmv (picture);
+    /* destination offset 0: put from source offset 0 with the decoded vector ... */
+    motion_block (mc_functions.put, offset, picture->v_offset >> 1, 0, 0, 0,
+		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+    /* ... then average in source offset stride, with the vector scaled by m / 2 (m depends on top_field_first) plus the dmv */
+    m = picture->top_field_first ? 1 : 3;
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;
+    motion_block (mc_functions.avg, offset, picture->v_offset >> 1, 0, stride, 0,
+		  other_x, other_y, dest, motion->ref[0], stride * 2, 8);
+    /* destination offset stride: put from source offset stride with the decoded vector ... */
+    motion_block (mc_functions.put, offset, picture->v_offset >> 1,
+		  0, stride, stride,
+		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+    /* ... then average in source offset 0, with the other value of m and the opposite vertical correction */
+    m = picture->top_field_first ? 3 : 1;
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;
+    motion_block (mc_functions.avg, offset, picture->v_offset >> 1, 0, 0, stride,
+		  other_x, other_y, dest, motion->ref[0], stride * 2, 8);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+/*
+ * Frame-picture prediction that parses nothing from the bitstream: the
+ * vector stored in motion->pmv[0] is applied again to a 16-line block.
+ */
+static void motion_fr_reuse (picture_t * picture, motion_t * motion,
+			     uint8_t * dest[3], int offset, int stride,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    const int prev_x = motion->pmv[0][0];
+    const int prev_y = motion->pmv[0][1];
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  prev_x, prev_y,
+		  dest, motion->ref[0], stride, 16);
+}
+
+/*
+ * Frame-picture prediction with a (0, 0) vector: nothing is parsed and the
+ * stored predictors are left untouched.
+ */
+static void motion_fr_zero (picture_t * picture, motion_t * motion,
+			    uint8_t * dest[3], int offset, int stride,
+			    void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    const int null_x = 0;
+    const int null_y = 0;
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  null_x, null_y,
+		  dest, motion->ref[0], stride, 16);
+}
+
+/*
+ * Parse a concealment vector in a frame picture.
+ *
+ * Only the forward predictors are updated (both pmv[0] and pmv[1] receive
+ * the new vector); no motion compensation is performed.  The marker bit
+ * that follows the vector is skipped.
+ */
+static void motion_fr_conceal (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    motion_t * const fwd = &(picture->f_motion);
+    int delta;
+    int vec;
+
+    /* horizontal component */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, fwd->f_code[0]);
+    vec = bound_motion_vector (fwd->pmv[0][0] + delta, fwd->f_code[0]);
+    fwd->pmv[0][0] = vec;
+    fwd->pmv[1][0] = vec;
+
+    /* vertical component */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, fwd->f_code[1]);
+    vec = bound_motion_vector (fwd->pmv[0][1] + delta, fwd->f_code[1]);
+    fwd->pmv[0][1] = vec;
+    fwd->pmv[1][1] = vec;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+/*
+ * Field prediction inside a field picture.
+ *
+ * Reads one field_select bit followed by a horizontal and a vertical
+ * delta, stores the resulting vector in both pmv[0] and pmv[1], and
+ * predicts a 16-line block from motion->ref[field_select].
+ */
+static void motion_fi_field (picture_t * picture, motion_t * motion,
+			     uint8_t * dest[3], int offset, int stride,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int ref_index;
+    int delta;
+    int vec_x, vec_y;
+
+    /* field_select */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_index = UBITS (bit_buf, 1);
+    DUMPBITS (bit_buf, bits, 1);
+
+    /* horizontal component */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[0]);
+    vec_x = bound_motion_vector (motion->pmv[0][0] + delta,
+				 motion->f_code[0]);
+    motion->pmv[0][0] = vec_x;
+    motion->pmv[1][0] = vec_x;
+
+    /* vertical component */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[1]);
+    vec_y = bound_motion_vector (motion->pmv[0][1] + delta,
+				 motion->f_code[1]);
+    motion->pmv[0][1] = vec_y;
+    motion->pmv[1][1] = vec_y;
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  vec_x, vec_y,
+		  dest, motion->ref[ref_index], stride, 16);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+/*
+ * 16x8 prediction inside a field picture.
+ *
+ * Two independent (field_select, horizontal delta, vertical delta) groups
+ * are parsed.  The first updates pmv[0] and predicts 8 lines at
+ * picture->v_offset; the second updates pmv[1] and predicts 8 lines at
+ * picture->v_offset + 8.
+ */
+static void motion_fi_16x8 (picture_t * picture, motion_t * motion,
+			    uint8_t * dest[3], int offset, int stride,
+			    void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int ref_index;
+    int delta;
+    int vec_x, vec_y;
+
+    /* ---- upper half: predictor pmv[0] ---- */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_index = UBITS (bit_buf, 1);
+    DUMPBITS (bit_buf, bits, 1);
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[0]);
+    vec_x = bound_motion_vector (motion->pmv[0][0] + delta,
+				 motion->f_code[0]);
+    motion->pmv[0][0] = vec_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[1]);
+    vec_y = bound_motion_vector (motion->pmv[0][1] + delta,
+				 motion->f_code[1]);
+    motion->pmv[0][1] = vec_y;
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  vec_x, vec_y,
+		  dest, motion->ref[ref_index], stride, 8);
+
+    /* ---- lower half: predictor pmv[1] ---- */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_index = UBITS (bit_buf, 1);
+    DUMPBITS (bit_buf, bits, 1);
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[0]);
+    vec_x = bound_motion_vector (motion->pmv[1][0] + delta,
+				 motion->f_code[0]);
+    motion->pmv[1][0] = vec_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[1]);
+    vec_y = bound_motion_vector (motion->pmv[1][1] + delta,
+				 motion->f_code[1]);
+    motion->pmv[1][1] = vec_y;
+
+    motion_block (table, offset, picture->v_offset+8, 1, 0, 0,
+		  vec_x, vec_y,
+		  dest, motion->ref[ref_index], stride, 8);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+/*
+ * DMV prediction inside a field picture.
+ *
+ * Parses horizontal delta, dmv_x, vertical delta and dmv_y (in that
+ * order) and stores the vector in both pmv[0] and pmv[1].  The block is
+ * first written with mc_functions.put from ref[current_field], then
+ * averaged (mc_functions.avg) with a prediction from the other reference,
+ * using a vector derived from the parsed one: halved, offset by the dmv
+ * values and vertically by 2 * current_field - 1.
+ *
+ * The `table` argument is not used.
+ */
+static void motion_fi_dmv (picture_t * picture, motion_t * motion,
+			   uint8_t * dest[3], int offset, int stride,
+			   void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int delta;
+    int vec_x, vec_y;
+    int diff_x, diff_y;
+    int derived_x, derived_y;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[0]);
+    vec_x = bound_motion_vector (motion->pmv[0][0] + delta,
+				 motion->f_code[0]);
+    motion->pmv[0][0] = vec_x;
+    motion->pmv[1][0] = vec_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    diff_x = get_dmv (picture);
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, motion->f_code[1]);
+    vec_y = bound_motion_vector (motion->pmv[0][1] + delta,
+				 motion->f_code[1]);
+    motion->pmv[0][1] = vec_y;
+    motion->pmv[1][1] = vec_y;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    diff_y = get_dmv (picture);
+
+    /* prediction from the reference selected by current_field */
+    motion_block (mc_functions.put, offset, picture->v_offset, 0, 0, 0,
+		  vec_x, vec_y,
+		  dest, motion->ref[picture->current_field], stride, 16);
+
+    /* averaged with a prediction from the other reference */
+    derived_x = ((vec_x + (vec_x > 0)) >> 1) + diff_x;
+    derived_y = ((vec_y + (vec_y > 0)) >> 1) + diff_y +
+	2 * picture->current_field - 1;
+    motion_block (mc_functions.avg, offset, picture->v_offset, 0, 0, 0,
+		  derived_x, derived_y,
+		  dest, motion->ref[!picture->current_field], stride, 16);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+/*
+ * Field-picture prediction that parses nothing from the bitstream: the
+ * vector stored in motion->pmv[0] is applied again, reading from
+ * motion->ref[picture->current_field].
+ */
+static void motion_fi_reuse (picture_t * picture, motion_t * motion,
+			     uint8_t * dest[3], int offset, int stride,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    const int prev_x = motion->pmv[0][0];
+    const int prev_y = motion->pmv[0][1];
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  prev_x, prev_y,
+		  dest, motion->ref[picture->current_field], stride, 16);
+}
+
+/*
+ * Field-picture prediction with a (0, 0) vector, reading from
+ * motion->ref[picture->current_field].  Nothing is parsed.
+ */
+static void motion_fi_zero (picture_t * picture, motion_t * motion,
+			    uint8_t * dest[3], int offset, int stride,
+			    void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    const int null_x = 0;
+    const int null_y = 0;
+
+    motion_block (table, offset, picture->v_offset, 0, 0, 0,
+		  null_x, null_y,
+		  dest, motion->ref[picture->current_field], stride, 16);
+}
+
+/*
+ * Parse a concealment vector in a field picture.
+ *
+ * The leading field_select bit is discarded, the forward predictors
+ * (pmv[0] and pmv[1]) receive the new vector, and the trailing marker bit
+ * is skipped.  No motion compensation is performed.
+ */
+static void motion_fi_conceal (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    motion_t * const fwd = &(picture->f_motion);
+    int delta;
+    int vec;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    DUMPBITS (bit_buf, bits, 1); /* remove field_select */
+
+    /* horizontal component */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, fwd->f_code[0]);
+    vec = bound_motion_vector (fwd->pmv[0][0] + delta, fwd->f_code[0]);
+    fwd->pmv[0][0] = vec;
+    fwd->pmv[1][0] = vec;
+
+    /* vertical component */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    delta = get_motion_delta (picture, fwd->f_code[1]);
+    vec = bound_motion_vector (fwd->pmv[0][1] + delta, fwd->f_code[1]);
+    fwd->pmv[0][1] = vec;
+    fwd->pmv[1][1] = vec;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+/*
+ * Run `routine` for every prediction direction selected in `direction`.
+ *
+ * Forward prediction uses picture->f_motion and always writes with
+ * mc_functions.put.  Backward prediction uses picture->b_motion and writes
+ * with mc_functions.avg when forward prediction was also selected (so the
+ * two are combined), or with mc_functions.put when it stands alone.
+ *
+ * Expands in place: `picture`, `dest`, `offset` and `stride` must be
+ * visible at the point of use.
+ */
+#define MOTION(routine,direction)					\
+do {									\
+    if ((direction) & MACROBLOCK_MOTION_FORWARD)			\
+	routine (picture, &(picture->f_motion), dest, offset, stride,	\
+		 mc_functions.put);					\
+    if ((direction) & MACROBLOCK_MOTION_BACKWARD)			\
+	routine (picture, &(picture->b_motion), dest, offset, stride,	\
+		 ((direction) & MACROBLOCK_MOTION_FORWARD ?		\
+		  mc_functions.avg : mc_functions.put));		\
+} while (0)
+
+/*
+ * End-of-row handling, to be used right after `offset` has been advanced.
+ *
+ * Nothing happens until `offset` equals picture->coded_picture_width.
+ * Then:
+ *  - if the current frame has a copy callback, it is called with `dest`;
+ *  - dest[0] is advanced by 16 * stride and dest[1]/dest[2] by 4 * stride,
+ *    except for a B_TYPE picture with a copy callback, where the dest
+ *    pointers are left unchanged;
+ *  - for non-MPEG-1 streams the ENCLOSING FUNCTION RETURNS 0;
+ *  - for MPEG-1 streams picture->v_offset is advanced by 16, the enclosing
+ *    function returns 0 once v_offset reaches coded_picture_height, and
+ *    otherwise `offset` restarts at 0.
+ *
+ * Expands in place: `picture`, `dest`, `offset` and `stride` must be
+ * visible, and the enclosing function must return int.
+ */
+#define CHECK_DISPLAY							\
+do {									\
+    if (offset == picture->coded_picture_width) {			\
+	do { /* just so we can use the break statement */		\
+	    if (picture->current_frame->copy) {				\
+		picture->current_frame->copy (picture->current_frame,	\
+					      dest);			\
+		if (picture->picture_coding_type == B_TYPE)		\
+		    break;						\
+	    }								\
+	    dest[0] += 16 * stride;					\
+	    dest[1] += 4 * stride;					\
+	    dest[2] += 4 * stride;					\
+	} while (0);							\
+	if (! (picture->mpeg1))						\
+	    return 0;							\
+	picture->v_offset += 16;					\
+	if (picture->v_offset >= picture->coded_picture_height)		\
+	    return 0;							\
+	offset = 0;							\
+    }									\
+} while (0)
+
+/*
+ * Decode one slice into picture->current_frame.
+ *
+ * picture: decoder state; its bitstream fields, motion predictors, DC
+ *          predictors, quantizer scale, v_offset and (for field pictures)
+ *          current_field are all overwritten here.
+ * code:    slice start code; (code - 1) selects the vertical position
+ *          (v_offset is set to (code - 1) * 16).
+ * buffer:  slice data, handed to bitstream_init().
+ *
+ * Always returns 0.  Note that CHECK_DISPLAY may return from this function
+ * when the end of a macroblock row is reached.
+ */
+int slice_process (picture_t * picture, uint8_t code, uint8_t * buffer)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int macroblock_modes;
+    int stride;
+    uint8_t * dest[3];
+    int offset;
+    uint8_t ** forward_ref[2];
+
+    stride = picture->coded_picture_width;
+    /* offset into the dest[1]/dest[2] planes for this slice; dest[0] uses
+     * four times this value (see the dest[] setup below) */
+    offset = (code - 1) * stride * 4;
+    picture->v_offset = (code - 1) * 16;
+
+    forward_ref[0] = picture->forward_reference_frame->base;
+    if (picture->picture_structure != FRAME_PICTURE) {
+	forward_ref[1] = picture->forward_reference_frame->base;
+	offset <<= 1;
+	picture->current_field = (picture->picture_structure == BOTTOM_FIELD);
+	/* second field of a non-B picture: one of the two forward
+	 * references is taken from the frame currently being decoded */
+	if ((picture->second_field) &&
+	    (picture->picture_coding_type != B_TYPE))
+	    forward_ref[picture->picture_structure == TOP_FIELD] =
+		picture->current_frame->base;
+
+	/* ref[1] starts one line (stride, or stride/2 for dest[1]/[2])
+	 * further into the planes than ref[0] */
+	picture->f_motion.ref[1][0] = forward_ref[1][0] + stride;
+	picture->f_motion.ref[1][1] = forward_ref[1][1] + (stride >> 1);
+	picture->f_motion.ref[1][2] = forward_ref[1][2] + (stride >> 1);
+
+	picture->b_motion.ref[1][0] =
+	    picture->backward_reference_frame->base[0] + stride;
+	picture->b_motion.ref[1][1] =
+	    picture->backward_reference_frame->base[1] + (stride >> 1);
+	picture->b_motion.ref[1][2] =
+	    picture->backward_reference_frame->base[2] + (stride >> 1);
+    }
+
+    picture->f_motion.ref[0][0] = forward_ref[0][0];
+    picture->f_motion.ref[0][1] = forward_ref[0][1];
+    picture->f_motion.ref[0][2] = forward_ref[0][2];
+
+    /* motion vector predictors are reset at the start of every slice */
+    picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+
+    picture->b_motion.ref[0][0] = picture->backward_reference_frame->base[0];
+    picture->b_motion.ref[0][1] = picture->backward_reference_frame->base[1];
+    picture->b_motion.ref[0][2] = picture->backward_reference_frame->base[2];
+
+    picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
+    picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+
+    /* B picture with a copy callback: every slice is written at the very
+     * start of the frame planes (CHECK_DISPLAY does not advance dest in
+     * that case either) */
+    if ((picture->current_frame->copy) &&
+	(picture->picture_coding_type == B_TYPE))
+	offset = 0;
+
+    dest[0] = picture->current_frame->base[0] + offset * 4;
+    dest[1] = picture->current_frame->base[1] + offset;
+    dest[2] = picture->current_frame->base[2] + offset;
+
+    switch (picture->picture_structure) {
+    case BOTTOM_FIELD:
+	dest[0] += stride;
+	dest[1] += stride >> 1;
+	dest[2] += stride >> 1;
+	/* fall through */
+    case TOP_FIELD:
+	stride <<= 1;
+    }
+
+    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+	picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision + 7);
+
+    bitstream_init (picture, buffer);
+
+    picture->quantizer_scale = get_quantizer_scale (picture);
+
+    /* ignore intra_slice and all the extra data */
+    while (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 9);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+    }
+    DUMPBITS (bit_buf, bits, 1);
+
+    /* from here on `offset` is reused as the horizontal position inside
+     * the row, advancing by 16 per macroblock */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    offset = get_macroblock_address_increment (picture) << 4;
+
+    while (1) {
+	NEEDBITS (bit_buf, bits, bit_ptr);
+
+	macroblock_modes = get_macroblock_modes (picture);
+
+	/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
+	if (macroblock_modes & MACROBLOCK_QUANT)
+	    picture->quantizer_scale = get_quantizer_scale (picture);
+
+	if (macroblock_modes & MACROBLOCK_INTRA) {
+
+	    int DCT_offset, DCT_stride;
+
+	    /* intra macroblock: either parse concealment vectors or reset
+	     * all motion vector predictors */
+	    if (picture->concealment_motion_vectors) {
+		if (picture->picture_structure == FRAME_PICTURE)
+		    motion_fr_conceal (picture);
+		else
+		    motion_fi_conceal (picture);
+	    } else {
+		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+		picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
+		picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+	    }
+
+	    /* interlaced DCT: the lower pair of blocks starts one line down
+	     * and every block skips alternate lines; otherwise the lower
+	     * pair starts 8 lines down */
+	    if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		DCT_offset = stride;
+		DCT_stride = stride * 2;
+	    } else {
+		DCT_offset = stride * 8;
+		DCT_stride = stride;
+	    }
+
+	    slice_intra_DCT (picture, 0, dest[0] + offset, DCT_stride);
+	    slice_intra_DCT (picture, 0, dest[0] + offset + 8, DCT_stride);
+	    slice_intra_DCT (picture, 0, dest[0] + offset + DCT_offset,
+			     DCT_stride);
+	    slice_intra_DCT (picture, 0, dest[0] + offset + DCT_offset + 8,
+			     DCT_stride);
+
+	    slice_intra_DCT (picture, 1, dest[1] + (offset >> 1), stride >> 1);
+	    slice_intra_DCT (picture, 2, dest[2] + (offset >> 1), stride >> 1);
+
+	    /* D pictures carry one extra bit after each macroblock */
+	    if (picture->picture_coding_type == D_TYPE) {
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	} else {
+
+	    /* non-intra macroblock: motion compensation first, chosen by
+	     * stream type, picture structure and motion type */
+	    if (picture->mpeg1) {
+		if ((macroblock_modes & MOTION_TYPE_MASK) == MC_FRAME)
+		    MOTION (motion_mp1, macroblock_modes);
+		else {
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    MOTION (motion_fr_zero, MACROBLOCK_MOTION_FORWARD);
+		}
+	    } else if (picture->picture_structure == FRAME_PICTURE)
+		switch (macroblock_modes & MOTION_TYPE_MASK) {
+		case MC_FRAME:
+		    MOTION (motion_fr_frame, macroblock_modes);
+		    break;
+
+		case MC_FIELD:
+		    MOTION (motion_fr_field, macroblock_modes);
+		    break;
+
+		case MC_DMV:
+		    MOTION (motion_fr_dmv, MACROBLOCK_MOTION_FORWARD);
+		    break;
+
+		case 0:
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    MOTION (motion_fr_zero, MACROBLOCK_MOTION_FORWARD);
+		    break;
+		}
+	    else
+		switch (macroblock_modes & MOTION_TYPE_MASK) {
+		case MC_FIELD:
+		    MOTION (motion_fi_field, macroblock_modes);
+		    break;
+
+		case MC_16X8:
+		    MOTION (motion_fi_16x8, macroblock_modes);
+		    break;
+
+		case MC_DMV:
+		    MOTION (motion_fi_dmv, MACROBLOCK_MOTION_FORWARD);
+		    break;
+
+		case 0:
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    MOTION (motion_fi_zero, MACROBLOCK_MOTION_FORWARD);
+		    break;
+		}
+
+	    /* then the coded residual blocks, one bit of the pattern per
+	     * block: 0x20..0x04 for the four dest[0] blocks, 0x2 and 0x1
+	     * for dest[1] and dest[2] */
+	    if (macroblock_modes & MACROBLOCK_PATTERN) {
+		int coded_block_pattern;
+		int DCT_offset, DCT_stride;
+
+		if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		    DCT_offset = stride;
+		    DCT_stride = stride * 2;
+		} else {
+		    DCT_offset = stride * 8;
+		    DCT_stride = stride;
+		}
+
+		coded_block_pattern = get_coded_block_pattern (picture);
+
+		if (coded_block_pattern & 0x20)
+		    slice_non_intra_DCT (picture, dest[0] + offset,
+					 DCT_stride);
+		if (coded_block_pattern & 0x10)
+		    slice_non_intra_DCT (picture, dest[0] + offset + 8,
+					 DCT_stride);
+		if (coded_block_pattern & 0x08)
+		    slice_non_intra_DCT (picture,
+					 dest[0] + offset + DCT_offset,
+					 DCT_stride);
+		if (coded_block_pattern & 0x04)
+		    slice_non_intra_DCT (picture,
+					 dest[0] + offset + DCT_offset + 8,
+					 DCT_stride);
+
+		if (coded_block_pattern & 0x2)
+		    slice_non_intra_DCT (picture, dest[1] + (offset >> 1),
+					 stride >> 1);
+		if (coded_block_pattern & 0x1)
+		    slice_non_intra_DCT (picture, dest[2] + (offset >> 1),
+					 stride >> 1);
+	    }
+
+	    /* a non-intra macroblock resets the DC predictors */
+	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+		picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision+7);
+	}
+
+	offset += 16;
+	CHECK_DISPLAY;
+
+	NEEDBITS (bit_buf, bits, bit_ptr);
+
+	/* a leading 1 bit means the next macroblock follows immediately;
+	 * otherwise an address increment is decoded and that many
+	 * macroblocks are skipped */
+	if (bit_buf & 0x80000000) {
+	    DUMPBITS (bit_buf, bits, 1);
+	} else {
+	    int mba_inc;
+
+	    mba_inc = get_macroblock_address_increment (picture);
+	    /* an increment of 0 ends the slice */
+	    if (!mba_inc)
+		break;
+
+	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+		picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision+7);
+
+	    if (picture->picture_coding_type == P_TYPE) {
+		/* skipped macroblocks in a P picture: predictors are reset
+		 * and each one is predicted with a zero vector */
+		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+
+		do {
+		    if (picture->picture_structure == FRAME_PICTURE)
+			MOTION (motion_fr_zero, MACROBLOCK_MOTION_FORWARD);
+		    else
+			MOTION (motion_fi_zero, MACROBLOCK_MOTION_FORWARD);
+
+		    offset += 16;
+		    CHECK_DISPLAY;
+		} while (--mba_inc);
+	    } else {
+		/* other picture types: skipped macroblocks reuse the
+		 * directions and vectors of the last decoded macroblock */
+		do {
+		    if (picture->mpeg1)
+			MOTION (motion_mp1_reuse, macroblock_modes);
+		    else if (picture->picture_structure == FRAME_PICTURE)
+			MOTION (motion_fr_reuse, macroblock_modes);
+		    else
+			MOTION (motion_fi_reuse, macroblock_modes);
+
+		    offset += 16;
+		    CHECK_DISPLAY;
+		} while (--mba_inc);
+	    }
+	}
+    }
+
+    return 0;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
diff --git a/src/libmpeg2/stats.c b/src/libmpeg2/stats.c
new file mode 100644
index 000000000..f3456058d
--- /dev/null
+++ b/src/libmpeg2/stats.c
@@ -0,0 +1,315 @@
+/*
+ * stats.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+
+/* Cached answer: -1 until the environment has been consulted, then 0 or 1. */
+static int debug_level = -1;
+
+/*
+ * Report whether debug output is required.
+ *
+ * The MPEG2_DEBUG environment variable is looked up once, on the first
+ * call; its mere presence (whatever its value) turns debugging on.
+ * Only a single debug level exists for now, although more could be added.
+ */
+static int debug_is_on (void)
+{
+    if (debug_level >= 0)
+	return debug_level;
+
+    debug_level = (getenv ("MPEG2_DEBUG") != NULL) ? 1 : 0;
+    return debug_level;
+}
+
+/*
+ * Print the picture coding type, temporal_reference and vbv_delay found in
+ * the first four bytes of a picture header.
+ */
+static void stats_picture (uint8_t * buffer)
+{
+    static char * picture_coding_type_str [8] = {
+	"Invalid picture type",
+	"I-type",
+	"P-type",
+	"B-type",
+	"D (very bad)",
+	"Invalid",
+	"Invalid",
+	"Invalid"
+    };
+
+    /* 10 bits: all of byte 0 plus the top two bits of byte 1 */
+    const int temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
+    /* the next 3 bits of byte 1 */
+    const int coding_type = (buffer [1] >> 3) & 7;
+    /* 16 bits spanning the low bits of byte 1 through byte 3 */
+    const int vbv_delay = ((buffer[1] << 13) | (buffer[2] << 5) |
+			   (buffer[3] >> 3)) & 0xffff;
+
+    fprintf (stderr, " (picture) %s temporal_reference %d, vbv_delay %d\n",
+	     picture_coding_type_str [coding_type],
+	     temporal_reference, vbv_delay);
+}
+
+/* Note that a user_data start code was seen; the payload is not examined. */
+static void stats_user_data (uint8_t * buffer)
+{
+    fputs (" (user_data)\n", stderr);
+}
+
+/*
+ * Print a one-line summary of a sequence header: picture size, aspect
+ * ratio, frame rate, bit rate (bit_rate_value is in units of 400 bit/s),
+ * VBV buffer size and the constrained-parameters / custom-matrix flags.
+ */
+static void stats_sequence (uint8_t * buffer)
+{
+    static char * aspect_ratio_information_str[8] = {
+	"Invalid Aspect Ratio",
+	"1:1",
+	"4:3",
+	"16:9",
+	"2.21:1",
+	"Invalid Aspect Ratio",
+	"Invalid Aspect Ratio",
+	"Invalid Aspect Ratio"
+    };
+    static char * frame_rate_str[16] = {
+	"Invalid frame_rate_code",
+	"23.976", "24", "25" , "29.97",
+	"30" , "50", "59.94", "60" ,
+	"Invalid frame_rate_code", "Invalid frame_rate_code",
+	"Invalid frame_rate_code", "Invalid frame_rate_code",
+	"Invalid frame_rate_code", "Invalid frame_rate_code",
+	"Invalid frame_rate_code"
+    };
+
+    /* bytes 0-2 pack two 12-bit sizes: horizontal first, then vertical */
+    const int packed_size = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    const int width = packed_size >> 12;
+    const int height = packed_size & 0xfff;
+
+    const int aspect_code = buffer[3] >> 4;
+    const int rate_code = buffer[3] & 15;
+    const int bit_rate_value =
+	(buffer[4] << 10) | (buffer[5] << 2) | (buffer[6] >> 6);
+    const int vbv_size = ((buffer[6] << 5) | (buffer[7] >> 3)) & 0x3ff;
+    const int constrained = buffer[7] & 4;
+    const int custom_intra = buffer[7] & 2;
+
+    /* a 64-byte intra matrix, when present, pushes the non-intra flag
+     * 64 bytes further into the header */
+    const uint8_t * const tail = custom_intra ? buffer + 64 : buffer;
+    const int custom_non_intra = tail[7] & 1;
+
+    fprintf (stderr, " (seq) %dx%d %s, %s fps, %5.0f kbps, VBV %d kB%s%s%s\n",
+	     width, height,
+	     aspect_ratio_information_str [aspect_code],
+	     frame_rate_str [rate_code],
+	     bit_rate_value * 400.0 / 1000.0,
+	     2 * vbv_size,
+	     constrained ? " , CP":"",
+	     custom_intra ? " , Custom Intra Matrix":"",
+	     custom_non_intra ? " , Custom Non-Intra Matrix":"");
+}
+
+/* Note that a sequence_error start code was seen. */
+static void stats_sequence_error (uint8_t * buffer)
+{
+    fputs (" (sequence_error)\n", stderr);
+}
+
+/* Note that a sequence_end start code was seen. */
+static void stats_sequence_end (uint8_t * buffer)
+{
+    fputs (" (sequence_end)\n", stderr);
+}
+
+/*
+ * Print the flags of a group-of-pictures header.
+ *
+ * `buffer` points just past the start code.  The header begins with a
+ * 25-bit time_code, so closed_gop and broken_link are the 26th and 27th
+ * bits, i.e. masks 0x40 and 0x20 of buffer[3] (not buffer[4]).
+ */
+static void stats_group (uint8_t * buffer)
+{
+    const int closed_gop = buffer[3] & 0x40;
+    const int broken_link = buffer[3] & 0x20;
+
+    fprintf (stderr, " (group)%s%s\n",
+	     closed_gop ? " closed_gop" : "",
+	     broken_link ? " broken_link" : "");
+}
+
+/*
+ * Slice start codes are currently not reported; the disabled trace call is
+ * kept below for reference.
+ */
+static void stats_slice (uint8_t code, uint8_t * buffer)
+{
+    (void) code;
+    (void) buffer;
+    /* fprintf (stderr, " (slice %d)\n", code); */
+}
+
+/*
+ * Print progressive_sequence and the chroma format carried in byte 1 of a
+ * sequence extension.
+ */
+static void stats_sequence_extension (uint8_t * buffer)
+{
+    static char * chroma_format_str[4] = {
+	"Invalid Chroma Format",
+	"4:2:0 Chroma",
+	"4:2:2 Chroma",
+	"4:4:4 Chroma"
+    };
+
+    const int flags = buffer[1];
+    const int progressive_sequence = (flags >> 3) & 1;
+    const int chroma_format = (flags >> 1) & 3;
+
+    fprintf (stderr, " (seq_ext) progressive_sequence %d, %s\n",
+	     progressive_sequence, chroma_format_str [chroma_format]);
+}
+
+/* Note that a sequence display extension was seen. */
+static void stats_sequence_display_extension (uint8_t * buffer)
+{
+    fputs (" (sequence_display_extension)\n", stderr);
+}
+
+/* Note that a quant matrix extension was seen. */
+static void stats_quant_matrix_extension (uint8_t * buffer)
+{
+    fputs (" (quant_matrix_extension)\n", stderr);
+}
+
+/* Note that a copyright extension was seen. */
+static void stats_copyright_extension (uint8_t * buffer)
+{
+    fputs (" (copyright_extension)\n", stderr);
+}
+
+
+/* Note that a sequence scalable extension was seen. */
+static void stats_sequence_scalable_extension (uint8_t * buffer)
+{
+    fputs (" (sequence_scalable_extension)\n", stderr);
+}
+
+/* Note that a picture display extension was seen. */
+static void stats_picture_display_extension (uint8_t * buffer)
+{
+    fputs (" (picture_display_extension)\n", stderr);
+}
+
+/*
+ * Print every field of a picture coding extension, six lines in total.
+ *
+ * Byte 0 (low nibble) through byte 2 (high nibble) hold the four f_codes,
+ * the rest of byte 2 holds intra_dc_precision and picture_structure,
+ * byte 3 holds seven one-bit flags and the top bit of byte 4 is
+ * progressive_frame.
+ */
+static void stats_picture_coding_extension (uint8_t * buffer)
+{
+    static char * picture_structure_str[4] = {
+	"Invalid Picture Structure",
+	"Top field",
+	"Bottom field",
+	"Frame Picture"
+    };
+
+    /* [0] = forward, [1] = backward; [][0] = horizontal, [][1] = vertical */
+    const int f_code[2][2] = {
+	{ buffer[0] & 15, buffer[1] >> 4 },
+	{ buffer[1] & 15, buffer[2] >> 4 }
+    };
+    const int intra_dc_precision = (buffer[2] >> 2) & 3;
+    const int picture_structure = buffer[2] & 3;
+
+    const int flags = buffer[3];
+    const int top_field_first = flags >> 7;
+    const int frame_pred_frame_dct = (flags >> 6) & 1;
+    const int concealment_motion_vectors = (flags >> 5) & 1;
+    const int q_scale_type = (flags >> 4) & 1;
+    const int intra_vlc_format = (flags >> 3) & 1;
+    const int alternate_scan = (flags >> 2) & 1;
+    const int repeat_first_field = (flags >> 1) & 1;
+
+    const int progressive_frame = buffer[4] >> 7;
+
+    fprintf (stderr,
+	     " (pic_ext) %s\n", picture_structure_str [picture_structure]);
+    fprintf (stderr,
+	     " (pic_ext) forward horizontal f_code % d, forward vertical f_code % d\n",
+	     f_code[0][0], f_code[0][1]);
+    fprintf (stderr,
+	     " (pic_ext) backward horizontal f_code % d, backward vertical f_code % d\n",
+	     f_code[1][0], f_code[1][1]);
+    fprintf (stderr,
+	     " (pic_ext) intra_dc_precision %d, top_field_first %d, frame_pred_frame_dct %d\n",
+	     intra_dc_precision, top_field_first, frame_pred_frame_dct);
+    fprintf (stderr,
+	     " (pic_ext) concealment_motion_vectors %d, q_scale_type %d, intra_vlc_format %d\n",
+	     concealment_motion_vectors, q_scale_type, intra_vlc_format);
+    fprintf (stderr,
+	     " (pic_ext) alternate_scan %d, repeat_first_field %d, progressive_frame %d\n",
+	     alternate_scan, repeat_first_field, progressive_frame);
+}
+
+/*
+ * Entry point of the debug dump: print a description of the header that
+ * follows start code `code`.  `buffer` points just past the start code.
+ *
+ * Nothing is printed unless debug_is_on() reports that debugging is
+ * enabled.  Code 0x00 is a picture header, any other code below 0xb0 is a
+ * slice, and for 0xb5 the top nibble of the first byte selects the
+ * extension type.
+ */
+void stats_header (uint8_t code, uint8_t * buffer)
+{
+    /* extension printers, indexed by the 4-bit extension id */
+    static void (* const extension_stats[16]) (uint8_t *) = {
+	NULL,					/* 0: unknown */
+	stats_sequence_extension,		/* 1 */
+	stats_sequence_display_extension,	/* 2 */
+	stats_quant_matrix_extension,		/* 3 */
+	stats_copyright_extension,		/* 4 */
+	stats_sequence_scalable_extension,	/* 5 */
+	NULL,					/* 6: unknown */
+	stats_picture_display_extension,	/* 7 */
+	stats_picture_coding_extension		/* 8 */
+	/* 9-15: unknown, implicitly NULL */
+    };
+
+    if (! (debug_is_on ()))
+	return;
+
+    if (code == 0x00) {
+	stats_picture (buffer);
+	return;
+    }
+
+    if (code < 0xb0) {
+	stats_slice (code, buffer);
+	return;
+    }
+
+    switch (code) {
+    case 0xb2:
+	stats_user_data (buffer);
+	break;
+    case 0xb3:
+	stats_sequence (buffer);
+	break;
+    case 0xb4:
+	stats_sequence_error (buffer);
+	break;
+    case 0xb5:
+	if (extension_stats[buffer[0] >> 4])
+	    extension_stats[buffer[0] >> 4] (buffer);
+	else
+	    fprintf (stderr, " (unknown extension %#x)\n", buffer[0] >> 4);
+	break;
+    case 0xb7:
+	stats_sequence_end (buffer);
+	break;
+    case 0xb8:
+	stats_group (buffer);
+	break;
+    default:
+	fprintf (stderr, " (unknown start code %#02x)\n", code);
+    }
+}
diff --git a/src/libmpeg2/vlc.h b/src/libmpeg2/vlc.h
new file mode 100644
index 000000000..ed2e04f88
--- /dev/null
+++ b/src/libmpeg2/vlc.h
@@ -0,0 +1,425 @@
+/*
+ * vlc.h
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define GETWORD(bit_buf,shift,bit_ptr)	/* OR 2 big-endian bytes into bit_buf */ \
+do {	/* uint32_t cast: a set top bit must not overflow a signed int shift */ \
+    (bit_buf) |= ((uint32_t) (((bit_ptr)[0] << 8) | (bit_ptr)[1])) << (shift); \
+    (bit_ptr) += 2;	/* the two bytes just read are consumed */	\
+} while (0)
+
+static inline void bitstream_init (picture_t * picture, uint8_t * start)
+{   /* Point the picture's bit reader at start and preload its first two bytes. */
+    picture->bitstream_bits = picture->bitstream_buf = 0;
+    GETWORD (picture->bitstream_buf, 16, start);  /* buf = start[0..1] << 16, start += 2 */
+    picture->bitstream_ptr = start;		  /* so reading resumes at the third byte */
+}
+
+/* make sure that there are at least 16 valid bits in bit_buf */
+#define NEEDBITS(bit_buf,bits,bit_ptr)		\
+do {						\
+    if ((bits) > 0) {	/* positive count: a refill is due */	\
+	GETWORD (bit_buf, bits, bit_ptr);	/* OR in 2 bytes at shift bits */ \
+	(bits) -= 16;				\
+    }						\
+} while (0)
+
+/* discard num bits: shift them out of the top of bit_buf and add num to bits */
+#define DUMPBITS(bit_buf,bits,num)	\
+do {					\
+    (bit_buf) <<= (num);		\
+    (bits) += (num);			\
+} while (0)
+
+/* zero-extended top num bits of bit_buf; num == 0 would shift by 32, so use 1..32 */
+#define UBITS(bit_buf,num) ((uint32_t) (bit_buf) >> (32 - (num)))
+
+/* sign-extended top num bits of bit_buf, via a right shift of the int32_t value */
+#define SBITS(bit_buf,num) ((int32_t) (bit_buf) >> (32 - (num)))
+
+typedef struct {	/* element type of the MB_I, MB_P and MB_B tables */
+    uint8_t modes,	/* OR of MACROBLOCK_* flags, as written in those tables */
+	    len;	/* presumably the code length in bits - TODO confirm */
+} MBtab;
+
+typedef struct {	/* element type of the MV_4 and MV_10 tables */
+    uint8_t delta,	/* values 0..15 in those tables */
+	    len;	/* presumably the code length in bits - TODO confirm */
+} MVtab;
+
+typedef struct {	/* element type of the DMV_2 table */
+    int8_t dmv;		/* signed; DMV_2 holds -1, 0 and 1 */
+    uint8_t len;	/* presumably the code length in bits - TODO confirm */
+} DMVtab;
+
+typedef struct {	/* element type of the CBP_7 and CBP_9 tables */
+    uint8_t cbp,	/* values 0x00..0x3f in those tables */
+	    len;	/* presumably the code length in bits - TODO confirm */
+} CBPtab;
+
+typedef struct {	/* element type of DC_lum_5, DC_chrom_5 and DC_long */
+    uint8_t size,	/* values 0..11 in those tables */
+	    len;	/* presumably the code length in bits - TODO confirm */
+} DCtab;
+
+typedef struct {	/* element type of the DCT_* tables */
+    uint8_t run,	/* first initializer of each table entry */
+	    level,	/* second initializer of each table entry */
+	    len;	/* third initializer; presumably code length in bits - TODO confirm */
+} DCTtab;
+
+typedef struct {	/* element type of the MBA_5 and MBA_11 tables */
+    uint8_t mba,	/* values 0..32 in those tables */
+	    len;	/* presumably the code length in bits - TODO confirm */
+} MBAtab;
+
+
+#define INTRA MACROBLOCK_INTRA	/* short aliases used by the MB_* initializers; #undef'd after MB_B */
+#define QUANT MACROBLOCK_QUANT
+
+static MBtab MB_I [] = {	/* 2 {modes, len} entries; presumably the I-picture macroblock modes - confirm */
+    {INTRA|QUANT, 2}, {INTRA, 1}
+};
+
+#define MC MACROBLOCK_MOTION_FORWARD	/* more short aliases, also #undef'd after MB_B */
+#define CODED MACROBLOCK_PATTERN
+
+static MBtab MB_P [] = {	/* 32 {modes, len} entries; presumably the P-picture macroblock modes - confirm */
+    {INTRA|QUANT, 6}, {CODED|QUANT, 5}, {MC|CODED|QUANT, 5}, {INTRA,    5},
+    {MC,          3}, {MC,          3}, {MC,             3}, {MC,       3},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1}
+};
+
+#define FWD MACROBLOCK_MOTION_FORWARD	/* short aliases for the MB_B initializers; #undef'd below */
+#define BWD MACROBLOCK_MOTION_BACKWARD
+#define INTER (MACROBLOCK_MOTION_FORWARD|MACROBLOCK_MOTION_BACKWARD)	/* parenthesized so it expands as one operand */
+
+static MBtab MB_B [] = {	/* 64 {modes, len} entries; presumably the B-picture macroblock modes - confirm */
+    {0,                 0}, {INTRA|QUANT,       6},
+    {BWD|CODED|QUANT,   6}, {FWD|CODED|QUANT,   6},
+    {INTER|CODED|QUANT, 5}, {INTER|CODED|QUANT, 5},
+					{INTRA,       5}, {INTRA,       5},
+    {FWD,         4}, {FWD,         4}, {FWD,         4}, {FWD,         4},
+    {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}
+};
+
+#undef INTRA	/* the short aliases were only needed by the MB_I / MB_P / MB_B initializers */
+#undef QUANT
+#undef MC
+#undef CODED
+#undef FWD
+#undef BWD
+#undef INTER
+
+
+static MVtab MV_4 [] = {	/* 8 {delta, len} entries */
+    { 3, 6}, { 2, 4}, { 1, 3}, { 1, 3}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}
+};
+
+static MVtab MV_10 [] = {	/* 48 {delta, len} entries */
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10},
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, {15,10}, {14,10}, {13,10}, {12,10},
+    {11,10}, {10,10}, { 9, 9}, { 9, 9}, { 8, 9}, { 8, 9}, { 7, 9}, { 7, 9},
+    { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7},
+    { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7},
+    { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}
+};
+
+
+static DMVtab DMV_2 [] = {	/* 4 {dmv, len} entries */
+    { 0, 1}, { 0, 1}, { 1, 2}, {-1, 2}
+};
+
+
+static CBPtab CBP_7 [] = {	/* 112 {cbp, len} entries with len 3..7 */
+    {0x22, 7}, {0x12, 7}, {0x0a, 7}, {0x06, 7},
+    {0x21, 7}, {0x11, 7}, {0x09, 7}, {0x05, 7},
+    {0x3f, 6}, {0x3f, 6}, {0x03, 6}, {0x03, 6},
+    {0x24, 6}, {0x24, 6}, {0x18, 6}, {0x18, 6},
+    {0x3e, 5}, {0x3e, 5}, {0x3e, 5}, {0x3e, 5},
+    {0x02, 5}, {0x02, 5}, {0x02, 5}, {0x02, 5},
+    {0x3d, 5}, {0x3d, 5}, {0x3d, 5}, {0x3d, 5},
+    {0x01, 5}, {0x01, 5}, {0x01, 5}, {0x01, 5},
+    {0x38, 5}, {0x38, 5}, {0x38, 5}, {0x38, 5},
+    {0x34, 5}, {0x34, 5}, {0x34, 5}, {0x34, 5},
+    {0x2c, 5}, {0x2c, 5}, {0x2c, 5}, {0x2c, 5},
+    {0x1c, 5}, {0x1c, 5}, {0x1c, 5}, {0x1c, 5},
+    {0x28, 5}, {0x28, 5}, {0x28, 5}, {0x28, 5},
+    {0x14, 5}, {0x14, 5}, {0x14, 5}, {0x14, 5},
+    {0x30, 5}, {0x30, 5}, {0x30, 5}, {0x30, 5},
+    {0x0c, 5}, {0x0c, 5}, {0x0c, 5}, {0x0c, 5},
+    {0x20, 4}, {0x20, 4}, {0x20, 4}, {0x20, 4},
+    {0x20, 4}, {0x20, 4}, {0x20, 4}, {0x20, 4},
+    {0x10, 4}, {0x10, 4}, {0x10, 4}, {0x10, 4},
+    {0x10, 4}, {0x10, 4}, {0x10, 4}, {0x10, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3}
+};
+
+static CBPtab CBP_9 [] = {	/* 64 {cbp, len} entries; entry 0 is {0, 0}, the rest have len 8 or 9 */
+    {0,    0}, {0x00, 9}, {0x27, 9}, {0x1b, 9},
+    {0x3b, 9}, {0x37, 9}, {0x2f, 9}, {0x1f, 9},
+    {0x3a, 8}, {0x3a, 8}, {0x36, 8}, {0x36, 8},
+    {0x2e, 8}, {0x2e, 8}, {0x1e, 8}, {0x1e, 8},
+    {0x39, 8}, {0x39, 8}, {0x35, 8}, {0x35, 8},
+    {0x2d, 8}, {0x2d, 8}, {0x1d, 8}, {0x1d, 8},
+    {0x26, 8}, {0x26, 8}, {0x1a, 8}, {0x1a, 8},
+    {0x25, 8}, {0x25, 8}, {0x19, 8}, {0x19, 8},
+    {0x2b, 8}, {0x2b, 8}, {0x17, 8}, {0x17, 8},
+    {0x33, 8}, {0x33, 8}, {0x0f, 8}, {0x0f, 8},
+    {0x2a, 8}, {0x2a, 8}, {0x16, 8}, {0x16, 8},
+    {0x32, 8}, {0x32, 8}, {0x0e, 8}, {0x0e, 8},
+    {0x29, 8}, {0x29, 8}, {0x15, 8}, {0x15, 8},
+    {0x31, 8}, {0x31, 8}, {0x0d, 8}, {0x0d, 8},
+    {0x23, 8}, {0x23, 8}, {0x13, 8}, {0x13, 8},
+    {0x0b, 8}, {0x0b, 8}, {0x07, 8}, {0x07, 8}
+};
+
+
+static DCtab DC_lum_5 [] = {	/* 31 {size, len} entries; presumably for luminance DC - confirm */
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
+    {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}
+};
+
+static DCtab DC_chrom_5 [] = {	/* 31 {size, len} entries; presumably for chrominance DC - confirm */
+    {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}
+};
+
+static DCtab DC_long [] = {	/* 32 {size, len} entries covering sizes 6..11 */
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, { 7, 6}, { 7, 6},
+    {8, 7}, {8, 7}, {8, 7}, {8, 7}, {9, 8}, {9, 8}, {10, 9}, {11, 9}
+};
+
+
+static DCTtab DCT_16 [] = {	/* 32 {run, level, len} entries; every len here is 0 */
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {  2,18, 0}, {  2,17, 0}, {  2,16, 0}, {  2,15, 0},
+    {  7, 3, 0}, { 17, 2, 0}, { 16, 2, 0}, { 15, 2, 0},
+    { 14, 2, 0}, { 13, 2, 0}, { 12, 2, 0}, { 32, 1, 0},
+    { 31, 1, 0}, { 30, 1, 0}, { 29, 1, 0}, { 28, 1, 0}
+};
+
+static DCTtab DCT_15 [] = {	/* 48 {run, level, len} entries with len 14 or 15 */
+    {  1,40,15}, {  1,39,15}, {  1,38,15}, {  1,37,15},
+    {  1,36,15}, {  1,35,15}, {  1,34,15}, {  1,33,15},
+    {  1,32,15}, {  2,14,15}, {  2,13,15}, {  2,12,15},
+    {  2,11,15}, {  2,10,15}, {  2, 9,15}, {  2, 8,15},
+    {  1,31,14}, {  1,31,14}, {  1,30,14}, {  1,30,14},
+    {  1,29,14}, {  1,29,14}, {  1,28,14}, {  1,28,14},
+    {  1,27,14}, {  1,27,14}, {  1,26,14}, {  1,26,14},
+    {  1,25,14}, {  1,25,14}, {  1,24,14}, {  1,24,14},
+    {  1,23,14}, {  1,23,14}, {  1,22,14}, {  1,22,14},
+    {  1,21,14}, {  1,21,14}, {  1,20,14}, {  1,20,14},
+    {  1,19,14}, {  1,19,14}, {  1,18,14}, {  1,18,14},
+    {  1,17,14}, {  1,17,14}, {  1,16,14}, {  1,16,14}
+};
+
+static DCTtab DCT_13 [] = {	/* 48 {run, level, len} entries with len 12 or 13 */
+    { 11, 2,13}, { 10, 2,13}, {  6, 3,13}, {  4, 4,13},
+    {  3, 5,13}, {  2, 7,13}, {  2, 6,13}, {  1,15,13},
+    {  1,14,13}, {  1,13,13}, {  1,12,13}, { 27, 1,13},
+    { 26, 1,13}, { 25, 1,13}, { 24, 1,13}, { 23, 1,13},
+    {  1,11,12}, {  1,11,12}, {  9, 2,12}, {  9, 2,12},
+    {  5, 3,12}, {  5, 3,12}, {  1,10,12}, {  1,10,12},
+    {  3, 4,12}, {  3, 4,12}, {  8, 2,12}, {  8, 2,12},
+    { 22, 1,12}, { 22, 1,12}, { 21, 1,12}, { 21, 1,12},
+    {  1, 9,12}, {  1, 9,12}, { 20, 1,12}, { 20, 1,12},
+    { 19, 1,12}, { 19, 1,12}, {  2, 5,12}, {  2, 5,12},
+    {  4, 3,12}, {  4, 3,12}, {  1, 8,12}, {  1, 8,12},
+    {  7, 2,12}, {  7, 2,12}, { 18, 1,12}, { 18, 1,12}
+};
+
+static DCTtab DCT_B14_10 [] = {	/* 8 {run, level, len} entries, all with len 10 */
+    { 17, 1,10}, {  6, 2,10}, {  1, 7,10}, {  3, 3,10},
+    {  2, 4,10}, { 16, 1,10}, { 15, 1,10}, {  5, 2,10}
+};
+
+static DCTtab DCT_B14_8 [] = {	/* 36 {run, level, len} entries; run 65 with level 0 looks like a special marker - confirm */
+    { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
+    {  3, 2, 7}, {  3, 2, 7}, { 10, 1, 7}, { 10, 1, 7},
+    {  1, 4, 7}, {  1, 4, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6},
+    {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6},
+    {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    { 14, 1, 8}, {  1, 6, 8}, { 13, 1, 8}, { 12, 1, 8},
+    {  4, 2, 8}, {  2, 3, 8}, {  1, 5, 8}, { 11, 1, 8}
+};
+
+static DCTtab DCT_B14AC_5 [] = {	/* 27 {run, level, len} entries; first row is one short, so users presumably index with an offset - confirm */
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}
+};
+
+static DCTtab DCT_B14DC_5 [] = {	/* 27 entries; same first 11 as DCT_B14AC_5, then {1, 1, 1} for the remaining 16 */
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}
+};
+
+static DCTtab DCT_B15_10 [] = {	/* 8 {run, level, len} entries with len 9 or 10 */
+    {  6, 2, 9}, {  6, 2, 9}, { 15, 1, 9}, { 15, 1, 9},
+    {  3, 4,10}, { 17, 1,10}, { 16, 1, 9}, { 16, 1, 9}
+};
+
+static DCTtab DCT_B15_8 [] = {	/* 252 {run, level, len} entries with len 2..8 */
+    { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
+    {  8, 1, 7}, {  8, 1, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  7, 1, 7}, {  7, 1, 7}, {  3, 2, 7}, {  3, 2, 7},
+    {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6},
+    {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6},
+    {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    {  2, 5, 8}, { 12, 1, 8}, {  1,11, 8}, {  1,10, 8},
+    { 14, 1, 8}, { 13, 1, 8}, {  4, 2, 8}, {  2, 4, 8},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    { 10, 1, 7}, { 10, 1, 7}, {  2, 3, 7}, {  2, 3, 7},
+    { 11, 1, 7}, { 11, 1, 7}, {  1, 8, 7}, {  1, 8, 7},
+    {  1, 9, 7}, {  1, 9, 7}, {  1,12, 8}, {  1,13, 8},
+    {  3, 3, 8}, {  5, 2, 8}, {  1,14, 8}, {  1,15, 8}
+};
+
+
+static MBAtab MBA_5 [] = {	/* 30 {mba, len} entries; first row is two short, so users presumably index with an offset - confirm */
+		    {6, 5}, {5, 5}, {4, 4}, {4, 4}, {3, 4}, {3, 4},
+    {2, 3}, {2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}
+};
+
+static MBAtab MBA_11 [] = {	/* 104 {mba, len} entries covering mba 7..32 */
+    {32, 11}, {31, 11}, {30, 11}, {29, 11},
+    {28, 11}, {27, 11}, {26, 11}, {25, 11},
+    {24, 11}, {23, 11}, {22, 11}, {21, 11},
+    {20, 10}, {20, 10}, {19, 10}, {19, 10},
+    {18, 10}, {18, 10}, {17, 10}, {17, 10},
+    {16, 10}, {16, 10}, {15, 10}, {15, 10},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7}
+};