91 files changed, 43654 insertions, 0 deletions
diff --git a/src/video_dec/Makefile.am b/src/video_dec/Makefile.am
new file mode 100644
index 000000000..f20a88bad
--- /dev/null
+++ b/src/video_dec/Makefile.am
@@ -0,0 +1,45 @@
+include $(top_srcdir)/misc/Makefile.quiet
+SUBDIRS = \
+	libmpeg2 \
+	libmpeg2new \
+	libvdpau
+
+include $(top_builddir)/misc/Makefile.plugins
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS  = $(DEFAULT_OCFLAGS) $(VISIBILITY_FLAG)
+AM_LDFLAGS = $(xineplug_ldflags)
+# foovideo.c is only distributed; no plugin target in this file lists it in its _SOURCES.
+EXTRA_DIST = foovideo.c
+# Optional plugins: each module variable below is assigned only when its conditional is enabled.
+if ENABLE_IMAGEMAGICK
+image_module = xineplug_decode_image.la
+endif
+
+if ENABLE_GDK_PIXBUF
+gdkpixbuf_module = xineplug_decode_gdk_pixbuf.la
+endif
+# NOTE(review): $(theora_module) is not assigned anywhere in this file - confirm an included fragment sets it.
+xineplug_LTLIBRARIES = $(image_module) \
+	$(gdkpixbuf_module) \
+	$(theora_module) \
+	xineplug_decode_bitplane.la \
+	xineplug_decode_rgb.la \
+	xineplug_decode_yuv.la
+
+xineplug_decode_bitplane_la_SOURCES = bitplane.c
+xineplug_decode_bitplane_la_LIBADD = $(XINE_LIB) $(LTLIBINTL)
+
+xineplug_decode_rgb_la_SOURCES = rgb.c
+xineplug_decode_rgb_la_LIBADD = $(XINE_LIB)
+
+xineplug_decode_yuv_la_SOURCES = yuv.c
+xineplug_decode_yuv_la_LIBADD = $(XINE_LIB)
+
+xineplug_decode_image_la_SOURCES = image.c
+xineplug_decode_image_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS) $(WAND_LIBS)
+xineplug_decode_image_la_CFLAGS = $(AM_CFLAGS) $(WAND_CFLAGS)
+
+xineplug_decode_gdk_pixbuf_la_SOURCES = gdkpixbuf.c
+xineplug_decode_gdk_pixbuf_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS) $(GDK_PIXBUF_LIBS)
+xineplug_decode_gdk_pixbuf_la_CFLAGS = $(AM_CFLAGS) $(GDK_PIXBUF_CFLAGS)
diff --git a/src/video_dec/bitplane.c b/src/video_dec/bitplane.c
new file mode 100644
index 000000000..fa9f0ffba
--- /dev/null
+++ b/src/video_dec/bitplane.c
@@ -0,0 +1,1550 @@
+/*
+ * Copyright (C) 2004 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * Bitplane "Decoder" by Manfred Tremmel (Manfred.Tremmel@iiv.de)
+ * Converts typical Amiga bitplane pictures to a YUV2 map
+ * suitable for display under xine. It is based on the rgb decoder
+ * and the development documentation from the Amiga Developer CD.
+ *
+ * Supported formats:
+ * - uncompressed and byterun1 compressed ILBM data
+ * - IFF ANIM compression methods OPT 5, 7 (long and short) and
+ *   8 (long and short)
+ * - untested (found no testfiles) IFF-ANIM OPT 3, 4 and 6
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+
+#include "demuxers/iff.h"
+/* Swap one bitplane byte in the index buffer: for each bit (MSB first) ptr[n] loses colorindexx if set in old_data and gains it if set in new_data. */
+#define IFF_REPLACE_BYTE_SIMPLE(ptr, old_data, new_data, colorindexx ) { \
+  register uint8_t  *index_ptr = ptr; /* steps through the 8 index entries covered by this plane byte */ \
+  register uint8_t  colorindex = colorindexx; /* evaluated once; old_data/new_data are evaluated repeatedly */ \
+  *index_ptr    -= ((old_data & 0x80) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x80) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x40) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x40) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x20) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x20) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x10) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x10) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x08) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x08) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x04) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x04) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x02) ? colorindex : 0); \
+  *index_ptr++  += ((new_data & 0x02) ? colorindex : 0); \
+  *index_ptr    -= ((old_data & 0x01) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x01) ? colorindex : 0); \
+  old_data       = new_data; /* old_data must be an lvalue: it is overwritten with the new plane byte */ \
+}
+/* Like IFF_REPLACE_BYTE_SIMPLE, but after each of the 8 index entries is updated its Y, U and V bytes are reloaded from yuv_palette. */
+#define IFF_REPLACE_BYTE(ptr, yuvy, yuvu, yuvv, yuv_palette, old_data, new_data, colorindexx ) { \
+  register uint8_t  *index_ptr = ptr; \
+  register uint8_t  colorindex = colorindexx; \
+  register uint8_t  *yuv_y = yuvy; /* outputs: 8 consecutive bytes are written through each of yuv_y/yuv_u/yuv_v */ \
+  register uint8_t  *yuv_u = yuvu; \
+  register uint8_t  *yuv_v = yuvv; \
+  *index_ptr    -= ((old_data & 0x80) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x80) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; /* yuv_index is not declared here: the expanding scope must provide it, and it is clobbered */ \
+  *yuv_y++       = yuv_palette[yuv_index++]; /* palette entries are 4 bytes apart; bytes 0, 1, 2 are Y, U, V */ \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x40) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x40) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; \
+  *yuv_y++       = yuv_palette[yuv_index++]; \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x20) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x20) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; \
+  *yuv_y++       = yuv_palette[yuv_index++]; \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x10) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x10) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; \
+  *yuv_y++       = yuv_palette[yuv_index++]; \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x08) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x08) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; \
+  *yuv_y++       = yuv_palette[yuv_index++]; \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x04) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x04) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; \
+  *yuv_y++       = yuv_palette[yuv_index++]; \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x02) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x02) ? colorindex : 0); \
+  yuv_index      = *index_ptr++ * 4; \
+  *yuv_y++       = yuv_palette[yuv_index++]; \
+  *yuv_u++       = yuv_palette[yuv_index++]; \
+  *yuv_v++       = yuv_palette[yuv_index]; \
+  *index_ptr    -= ((old_data & 0x01) ? colorindex : 0); \
+  *index_ptr    += ((new_data & 0x01) ? colorindex : 0); \
+  yuv_index      = *index_ptr * 4; \
+  *yuv_y         = yuv_palette[yuv_index++]; \
+  *yuv_u         = yuv_palette[yuv_index++]; \
+  *yuv_v         = yuv_palette[yuv_index]; \
+  old_data       = new_data; /* old_data must be an lvalue: it is overwritten with the new plane byte */ \
+}
+/* Two-byte form of IFF_REPLACE_BYTE_SIMPLE (16 index entries); old_data_s and new_data_s are pointers here, not lvalues. */
+#define IFF_REPLACE_SHORT_SIMPLE(ptr_s, old_data_s, new_data_s, colorindexx_s ) { \
+  uint8_t  *xindex_ptr = (uint8_t *)ptr_s; \
+  uint8_t  *xold_data  = (uint8_t *)old_data_s; /* the data is processed byte by byte in memory order */ \
+  uint8_t  *xnew_data  = (uint8_t *)new_data_s; \
+  IFF_REPLACE_BYTE_SIMPLE(xindex_ptr, *xold_data, *xnew_data, colorindexx_s ); \
+  xindex_ptr += 8; /* each plane byte covers 8 index entries */ \
+  xold_data++; \
+  xnew_data++; \
+  IFF_REPLACE_BYTE_SIMPLE(xindex_ptr, *xold_data, *xnew_data, colorindexx_s ); \
+}
+/* Two-byte form of IFF_REPLACE_BYTE (16 pixels); old_data_s and new_data_s are pointers, and yuv_index must exist in the expanding scope. */
+#define IFF_REPLACE_SHORT(ptr_s, yuvy_s, yuvu_s, yuvv_s, yuv_palette_s, old_data_s, new_data_s, colorindexx_s ) { \
+  uint8_t  *xindex_ptr = (uint8_t *)ptr_s; /* all arguments are captured before IFF_REPLACE_BYTE clobbers yuv_index */ \
+  uint8_t  *xold_data  = (uint8_t *)old_data_s; \
+  uint8_t  *xnew_data  = (uint8_t *)new_data_s; \
+  uint8_t  *xyuv_y = yuvy_s; \
+  uint8_t  *xyuv_u = yuvu_s; \
+  uint8_t  *xyuv_v = yuvv_s; \
+  IFF_REPLACE_BYTE(xindex_ptr, xyuv_y, xyuv_u, xyuv_v, yuv_palette_s, *xold_data, *xnew_data, colorindexx_s ); \
+  xindex_ptr += 8; /* each plane byte covers 8 index entries and 8 bytes in every output plane */ \
+  xold_data++; \
+  xnew_data++; \
+  xyuv_y     += 8; \
+  xyuv_u     += 8; \
+  xyuv_v     += 8; \
+  IFF_REPLACE_BYTE(xindex_ptr, xyuv_y, xyuv_u, xyuv_v, yuv_palette_s, *xold_data, *xnew_data, colorindexx_s ); \
+}
+/* Four-byte form of IFF_REPLACE_BYTE_SIMPLE (32 index entries); old_data_l and new_data_l are pointers here, not lvalues. */
+#define IFF_REPLACE_LONG_SIMPLE(ptr_l, old_data_l, new_data_l, colorindexx_l ) { \
+  uint8_t  *xindex_ptr = (uint8_t *)ptr_l; \
+  uint8_t  *xold_data  = (uint8_t *)old_data_l; /* the data is processed byte by byte in memory order */ \
+  uint8_t  *xnew_data  = (uint8_t *)new_data_l; \
+  IFF_REPLACE_BYTE_SIMPLE(xindex_ptr, *xold_data, *xnew_data, colorindexx_l ); \
+  xindex_ptr += 8; /* each plane byte covers 8 index entries */ \
+  xold_data++; \
+  xnew_data++; \
+  IFF_REPLACE_BYTE_SIMPLE(xindex_ptr, *xold_data, *xnew_data, colorindexx_l ); \
+  xindex_ptr += 8; \
+  xold_data++; \
+  xnew_data++; \
+  IFF_REPLACE_BYTE_SIMPLE(xindex_ptr, *xold_data, *xnew_data, colorindexx_l ); \
+  xindex_ptr += 8; \
+  xold_data++; \
+  xnew_data++; \
+  IFF_REPLACE_BYTE_SIMPLE(xindex_ptr, *xold_data, *xnew_data, colorindexx_l ); \
+}
+/* Four-byte form of IFF_REPLACE_BYTE (32 pixels); old_data_l and new_data_l are pointers, and yuv_index must exist in the expanding scope. */
+#define IFF_REPLACE_LONG(ptr_l, yuvy_l, yuvu_l, yuvv_l, yuv_palette_l, old_data_l, new_data_l, colorindexx_l ) { \
+  uint8_t  *xindex_ptr = (uint8_t *)ptr_l; /* all arguments are captured before IFF_REPLACE_BYTE clobbers yuv_index */ \
+  uint8_t  *xold_data  = (uint8_t *)old_data_l; \
+  uint8_t  *xnew_data  = (uint8_t *)new_data_l; \
+  uint8_t  *xyuv_y = yuvy_l; \
+  uint8_t  *xyuv_u = yuvu_l; \
+  uint8_t  *xyuv_v = yuvv_l; \
+  IFF_REPLACE_BYTE(xindex_ptr, xyuv_y, xyuv_u, xyuv_v, yuv_palette_l, *xold_data, *xnew_data, colorindexx_l ); \
+  xindex_ptr += 8; /* each plane byte covers 8 index entries and 8 bytes in every output plane */ \
+  xold_data++; \
+  xnew_data++; \
+  xyuv_y     += 8; \
+  xyuv_u     += 8; \
+  xyuv_v     += 8; \
+  IFF_REPLACE_BYTE(xindex_ptr, xyuv_y, xyuv_u, xyuv_v, yuv_palette_l, *xold_data, *xnew_data, colorindexx_l ); \
+  xindex_ptr += 8; \
+  xold_data++; \
+  xnew_data++; \
+  xyuv_y     += 8; \
+  xyuv_u     += 8; \
+  xyuv_v     += 8; \
+  IFF_REPLACE_BYTE(xindex_ptr, xyuv_y, xyuv_u, xyuv_v, yuv_palette_l, *xold_data, *xnew_data, colorindexx_l ); \
+  xindex_ptr += 8; \
+  xold_data++; \
+  xnew_data++; \
+  xyuv_y     += 8; \
+  xyuv_u     += 8; \
+  xyuv_v     += 8; \
+  IFF_REPLACE_BYTE(xindex_ptr, xyuv_y, xyuv_u, xyuv_v, yuv_palette_l, *xold_data, *xnew_data, colorindexx_l ); \
+}
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* generic decoder class; the only member, so no extra per-class state is kept */
+} bitplane_class_t;
+
+typedef struct bitplane_decoder_s {
+  video_decoder_t   video_decoder;  /* parent video decoder structure */
+
+  bitplane_class_t *class;
+  xine_stream_t    *stream;
+
+  /* these are traditional variables in a video decoder object    */
+  uint64_t          video_step;  /* frame duration in pts units   */
+  int               decoder_ok;  /* current decoder status        */
+  int               skipframes;  /* 0 = draw picture, 1 = skip it */
+  int               framenumber; /* presumably a running frame counter - TODO confirm */
+
+  unsigned char    *buf;         /* the accumulated buffer data; the delta decoders read their input from here */
+  int               bufsize;     /* the maximum size of buf       */
+  int               size;        /* the current size of buf       */
+  int               size_uk;     /* size of uncompressed bitplane */
+
+  int               width;       /* the width of a video frame    */
+  int               height;      /* the height of a video frame   */
+  int               num_pixel;   /* number of pixels (presumably width * height - TODO confirm) */
+  double            ratio;       /* the width to height ratio     */
+  int               bytes_per_pixel; /* presumably the index_buf bytes per pixel - TODO confirm */
+  int               num_bitplanes;   /* number of bitplanes; the delta decoders loop over this many planes */
+  int               camg_mode;       /* presumably the Amiga CAMG view mode flags - TODO confirm */
+  int               is_ham;          /* non-zero: delta decoders patch index_buf only and leave the YUV planes alone */
+
+  unsigned char     yuv_palette[256 * 4]; /* 4 bytes per entry; IFF_REPLACE_BYTE reads Y, U, V from bytes 0-2 */
+  unsigned char     rgb_palette[256 * 4]; /* 4 bytes per entry; presumably R, G, B in bytes 0-2 - TODO confirm */
+  yuv_planes_t      yuv_planes;           /* Y/U/V output planes patched by the delta decoders */
+  yuv_planes_t      yuv_planes_hist;      /* presumably the planes of an earlier frame - TODO confirm */
+
+  uint8_t          *buf_uk;      /* uncompressed buffer                */
+  uint8_t          *buf_uk_hist; /* uncompressed buffer historic       */
+  uint8_t          *index_buf;   /* index buffer (for indexed pics)    */
+  uint8_t          *index_buf_hist;/* index buffer historic            */
+
+} bitplane_decoder_t;
+
+/* Decode a ByteRun1 stream into a new zero-filled buffer of size_uncompressed bytes (the caller frees it).
+ * Returns NULL if the allocation fails or a run needs more input than size_compressed provides. */
+static uint8_t *bitplane_decode_byterun1 (uint8_t *compressed,
+  int size_compressed,
+  int size_uncompressed) {
+
+  int pixel_ptr                         = 0;  /* write position in uncompressed */
+  int i                                 = 0;  /* read position in compressed    */
+  uint8_t *uncompressed                 = calloc(1, size_uncompressed );
+
+  if( !uncompressed )
+    return NULL;
+
+  while ( i < size_compressed && pixel_ptr < size_uncompressed ) {
+    int code                            = compressed[i++];  /* control byte, always consumed */
+    int room                            = size_uncompressed - pixel_ptr;
+    int count                           = 0;  /* output bytes produced by this control byte */
+    if( code <= 127 ) {
+      /* literal run: copy the next code + 1 input bytes, clipped to the room left in the output */
+      count = (code + 1 < room) ? code + 1 : room;
+      if( (i + code) > size_compressed || count > (size_compressed - i) )
+        goto truncated;
+      memcpy(&uncompressed[pixel_ptr], &compressed[i], count);
+      i += count;
+    } else if ( code > 128 ) {
+      /* replicate run: repeat the next input byte 257 - code times, clipped likewise */
+      count = (257 - code < room) ? 257 - code : room;
+      if( i >= size_compressed )
+        goto truncated;
+      memset(&uncompressed[pixel_ptr], compressed[i++], count);
+    }
+    pixel_ptr += count;  /* code == 128 is a no-op: count stays 0 but its byte was consumed above */
+  }
+  return uncompressed;
+truncated:
+  free(uncompressed);
+  return NULL;
+}
+
+/* Unpack planar bitplane data into index_buf.
+ * index_buf (height * width * bytes_per_pixel bytes) is zeroed first; each set
+ * bit of plane p then adds bitplainoffeset[p] to the pixel it belongs to.
+ * Planes 8-15 land in byte 1 and planes above 15 in byte 2 of a pixel.
+ * Only width / 8 whole bytes are read from every plane row. */
+static void bitplane_decode_bitplane (uint8_t *bitplane_buffer,
+  uint8_t *index_buf,
+  int width,
+  int height,
+  int num_bitplanes,
+  int bytes_per_pixel ) {
+
+  /* geometry, derived once up front */
+  const int bytes_per_plane_row         = width / 8;
+  const int src_row_stride              = bytes_per_plane_row * num_bitplanes;
+  const int dst_row_stride              = width * bytes_per_pixel;
+  const int dst_group_stride            = bytes_per_pixel * 8;
+  const int dst_total                   = height * dst_row_stride;
+
+  int n;
+  int y;
+  int plane;
+  int column;
+  int bit;
+
+  /* start from an all-zero picture; the planes are accumulated on top */
+  for (n = 0; n < dst_total; n++) {
+    index_buf[n] = 0;
+  }
+
+  /* the source interleaves the planes row by row */
+  for (y = 0; y < height; y++) {
+
+    const int dst_row                   = y * dst_row_stride;
+    const int src_row                   = y * src_row_stride;
+
+    for (plane = 0; plane < num_bitplanes; plane++) {
+
+      const uint8_t weight              = bitplainoffeset[plane];
+      const int src_plane               = src_row + (plane * bytes_per_plane_row);
+      int channel                       = 0;
+
+      /* pick the byte inside a pixel that this plane contributes to */
+      if (plane > 15) {
+        channel = 2;
+      } else if (plane > 7) {
+        channel = 1;
+      }
+
+      /* every source byte carries one bit for 8 neighbouring pixels */
+      for (column = 0; column < bytes_per_plane_row; column++) {
+
+        const uint8_t bits              = bitplane_buffer[src_plane + column];
+        int dst                         = dst_row + (column * dst_group_stride) + channel;
+
+        /* the most significant bit belongs to the first of the 8 pixels */
+        for (bit = 0x80; bit != 0; bit >>= 1) {
+          index_buf[dst]               += ((bits & bit) ? weight : 0);
+          dst                          += bytes_per_pixel;
+        }
+      }
+    }
+  }
+
+}
+
+/* Decode a HAM6/HAM8 buffer (one byte per pixel, width * height bytes) into the Y, U and V planes.
+ * r, g and b carry over from pixel to pixel; bytes_per_pixel is accepted but not used. */
+static void bitplane_decode_ham (uint8_t *ham_buffer,
+  yuv_planes_t *yuv_planes,
+  int width,
+  int height,
+  int num_bitplanes,
+  int bytes_per_pixel,
+  unsigned char *rgb_palette ) {
+
+  uint8_t *ham_buffer_work              = ham_buffer;
+  uint8_t *ham_buffer_end               = &ham_buffer[(width * height)];
+  uint8_t *yuv_ptr_y                    = yuv_planes->y;
+  uint8_t *yuv_ptr_u                    = yuv_planes->u;
+  uint8_t *yuv_ptr_v                    = yuv_planes->v;
+  int i                                 = 0;  /* data bits of the current pixel */
+  int j                                 = 0;  /* raw byte of the current pixel  */
+  uint8_t r                             = 0;
+  uint8_t g                             = 0;
+  uint8_t b                             = 0;
+  int hambits                           = num_bitplanes > 6 ? 6 : 4;  /* data bits per pixel: 6 for HAM8, 4 for HAM6 */
+  int maskbits                          = 8 - hambits;                /* shift that moves the data bits to the top of a byte */
+  int mask                              = ( 1 << hambits ) - 1;       /* selects the data bits below the control bits */
+
+  while (ham_buffer_work < ham_buffer_end) {
+    j                                   = *ham_buffer_work++;  /* fetch before decoding, so output pixel n comes from input byte n */
+    i                                   = (j & mask);
+    switch ( j >> hambits ) {
+      case HAMBITS_CMAP:
+        /* Take colors from palette */
+        r                               = rgb_palette[i * 4 + 0];
+        g                               = rgb_palette[i * 4 + 1];
+        b                               = rgb_palette[i * 4 + 2];
+        break;
+      case HAMBITS_BLUE:
+        /* keep red and green and modify blue */
+        b                               = i << maskbits;
+        b                              |= b >> hambits;
+        break;
+      case HAMBITS_RED:
+        /* keep green and blue and modify red */
+        r                               = i << maskbits;
+        r                              |= r >> hambits;
+        break;
+      case HAMBITS_GREEN:
+        /* keep red and blue and modify green */
+        g                               = i << maskbits;
+        g                              |= g >> hambits;
+        break;
+      default:
+        break;
+    }
+    *yuv_ptr_y++                        = COMPUTE_Y(r, g, b);
+    *yuv_ptr_u++                        = COMPUTE_U(r, g, b);
+    *yuv_ptr_v++                        = COMPUTE_V(r, g, b);
+  }
+}
+
+/* ANIM decoding method 3: apply the 16 bit delta in this->buf to buf_uk and index_buf (plus the YUV planes unless is_ham).
+ * NOTE(review): the file header lists this method as untested; data, ptr and rowworkptr are range-checked, index_buf offsets are not. */
+static void bitplane_sdelta_opt_3 (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 16;  /* 16 bit words per row of one plane */
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+
+  uint32_t palette_index                = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;  /* table of big endian offsets into buf */
+  uint16_t *ptr                         = NULL;
+  uint16_t *planeptr                    = NULL;
+  uint16_t *picture_end                 = (uint16_t *)(&this->buf_uk[(rowsize_all_planes * 2 * this->height)]);
+  uint16_t *data                        = NULL;
+  uint16_t *data_end                    = (uint16_t *)(&this->buf[this->size]);
+  uint16_t *rowworkptr                  = NULL;
+  int16_t s                             = 0;
+  int16_t size                          = 0;
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t yuv_index                    = 0;  /* also clobbered as scratch by IFF_REPLACE_SHORT */
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+
+    planeptr                            = (uint16_t *)(&this->buf_uk[(palette_index * rowsize * 2)]);
+    /* data starts at beginn of delta-Buffer + offset of the first */
+    /* 32 Bit long word in the buffer. The buffer starts with 8    */
+    /* of this Offset, for every bitplane (max 8) one              */
+    data                                = (uint16_t *)(&this->buf[_X_BE_32(&deltadata[palette_index])]);
+    if( data != (uint16_t *)this->buf ) {
+      /* This 8 Pointers are followd by another 8                    */
+      ptr                               = (uint16_t *)(&this->buf[_X_BE_32(&deltadata[(palette_index+8)])]);
+
+      /* in this case, I think big/little endian is not important ;-) */
+      while( data < data_end && *data !=  0xFFFF) {
+        row_ptr                         = 0;
+        size                            = _X_BE_16(data);
+        data++;
+        if( size >= 0 ) {
+          /* non-negative: word offset of a single word to replace */
+          rowworkptr                    = planeptr + size;
+          pixel_ptr_bit                 = size * 16;
+          if (data >= data_end || rowworkptr >= picture_end) return;
+          if( this->is_ham ) {
+            IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[pixel_ptr_bit],
+                               rowworkptr, data, bitplainoffeset[palette_index] );
+          } else {
+            IFF_REPLACE_SHORT( &this->index_buf[pixel_ptr_bit],
+                               &this->yuv_planes.y[pixel_ptr_bit], &this->yuv_planes.u[pixel_ptr_bit],
+                               &this->yuv_planes.v[pixel_ptr_bit], this->yuv_palette,
+                               rowworkptr, data, bitplainoffeset[palette_index] );
+          }
+          data++;
+        } else {
+          size                          = 0 - size + 2;
+          rowworkptr                    = planeptr + size;
+          pixel_ptr_bit                 = size * 16;
+          s                             = _X_BE_16(data);
+          data++;
+          while( s--) {
+            if (data >= data_end || rowworkptr >= picture_end) return;
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[pixel_ptr_bit],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[pixel_ptr_bit],
+                                 &this->yuv_planes.y[pixel_ptr_bit], &this->yuv_planes.u[pixel_ptr_bit],
+                                 &this->yuv_planes.v[pixel_ptr_bit], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr++;
+            data++;
+          }
+        }
+
+        /* vertical part: a negative count repeats one data word down the column, otherwise one word per row */
+        if (ptr >= data_end) return;
+        size                            = _X_BE_16(ptr);
+        ptr++;
+        if (size < 0) {
+          for (s = size; s < 0; s++) {
+            if (data >= data_end || rowworkptr >= picture_end)
+              return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+          data++;
+        }
+        else {
+          for (s = 0; s < size; s++) {
+            if (data >= data_end || rowworkptr >= picture_end)
+              return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            data++;
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+        }
+      }
+    }
+  }
+}
+
+/* ANIM decoding method 4: apply the 16 bit column delta in this->buf to buf_uk and index_buf (plus the YUV planes unless is_ham).
+ * Decoding stops silently as soon as the op list, the data words or the plane pointer would leave their buffers. */
+static void bitplane_set_dlta_short (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 16;  /* 16 bit words per row of one plane */
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+
+  uint32_t palette_index                = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;  /* table of big endian offsets into buf */
+  uint16_t *ptr                         = NULL;  /* op list: (word offset, count) pairs, ended by 0xFFFF */
+  uint16_t *planeptr                    = NULL;
+  uint16_t *picture_end                 = (uint16_t *)(&this->buf_uk[(rowsize_all_planes * 2 * this->height)]);
+  uint16_t *data                        = NULL;  /* replacement words consumed by the ops */
+  uint16_t *data_end                    = (uint16_t *)(&this->buf[this->size]);
+  uint16_t *rowworkptr                  = NULL;
+  int16_t s                             = 0;
+  int16_t size                          = 0;
+  uint16_t pixel_ptr                    = 0;
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t yuv_index                    = 0;  /* also clobbered as scratch by IFF_REPLACE_SHORT */
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+
+    planeptr                            = (uint16_t *)(&this->buf_uk[(palette_index * rowsize * 2)]);
+    /* data starts at beginn of delta-Buffer + offset of the first */
+    /* 32 Bit long word in the buffer. The buffer starts with 8    */
+    /* of this Offset, for every bitplane (max 8) one              */
+    data                                = (uint16_t *)(&this->buf[_X_BE_32(&deltadata[palette_index])]);
+    if( data != (uint16_t *)this->buf ) {
+      /* This 8 Pointers are followd by another 8                    */
+      ptr                               = (uint16_t *)(&this->buf[_X_BE_32(&deltadata[(palette_index+8)])]);
+
+      /* in this case, I think big/little endian is not important ;-) */
+      while( (ptr + 2) <= data_end && *ptr !=  0xFFFF) {
+        pixel_ptr                       = _X_BE_16(ptr);
+        pixel_ptr_bit                   = pixel_ptr * 16;
+        row_ptr                         = 0;
+        rowworkptr                      = planeptr + pixel_ptr;
+        ptr++;
+        size                            = _X_BE_16(ptr);
+        ptr++;
+        if (size < 0) {
+          for (s = size; s < 0; s++) {  /* negative count: the same data word is written to -size rows */
+            if (data >= data_end || rowworkptr >= picture_end)
+              return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+          data++;
+        } else {
+          for (s = 0; s < size; s++) {  /* positive count: every row gets its own data word */
+            if (data >= data_end || rowworkptr >= picture_end)
+              return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            data++;
+            rowworkptr                   += rowsize_all_planes;
+            row_ptr++;
+          }
+        }
+      }
+    }
+  }
+}
+
+/* ANIM decoding method 5: apply the byte-wise vertical delta in this->buf to buf_uk and index_buf (plus the YUV planes unless is_ham).
+ * Op codes, data bytes and patched plane bytes are range-checked (decoding stops silently on overrun); the offset table itself is not. */
+static void bitplane_dlta_5 (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 8;  /* bytes per row of one plane */
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+  uint32_t yuv_index                    = 0;  /* also clobbered as scratch by IFF_REPLACE_BYTE */
+  uint32_t delta_offset                 = 0;
+  uint32_t palette_index                = 0;
+  uint32_t pixel_ptr                    = 0;  /* byte column inside a plane row */
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;  /* table of big endian offsets into buf */
+  uint8_t  *planeptr                    = NULL;
+  uint8_t  *rowworkptr                  = NULL;
+  uint8_t  *picture_end                 = this->buf_uk + (rowsize_all_planes * this->height);
+  uint8_t  *data                        = NULL;
+  uint8_t  *data_end                    = this->buf + this->size;
+  uint8_t  op_count                     = 0;
+  uint8_t  op                           = 0;
+  uint8_t  count                        = 0;
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+    planeptr                            = &this->buf_uk[(palette_index * rowsize)];
+    /* data starts at beginn of delta-Buffer + offset of the first */
+    /* 32 Bit long word in the buffer. The buffer starts with 8    */
+    /* of this Offset, for every bitplane (max 8) one              */
+    delta_offset                        = _X_BE_32(&deltadata[palette_index]);
+    if (delta_offset > 0 && delta_offset < (uint32_t)this->size) {
+      data                              = this->buf + delta_offset;
+      for( pixel_ptr = 0; pixel_ptr < rowsize; pixel_ptr++) {
+        rowworkptr                      = planeptr + pixel_ptr;
+        pixel_ptr_bit                   = pixel_ptr * 8;
+        row_ptr                         = 0;
+        if (data >= data_end) return;
+        /* execute ops */
+        for( op_count = *data++; op_count; op_count--) {
+          if (data >= data_end) return;
+          op                            = *data++;
+          if (op & 0x80) {
+            /* Uniq ops */
+            count                       = op & 0x7f; /* get count */
+            while(count--) {
+              if (data >= data_end || rowworkptr >= picture_end)
+                return;
+              yuv_index                 = ((row_ptr * this->width) + pixel_ptr_bit);
+              if( this->is_ham ) {
+                IFF_REPLACE_BYTE_SIMPLE(&this->index_buf[yuv_index],
+                                  *rowworkptr, *data, bitplainoffeset[palette_index] );
+              } else {
+                IFF_REPLACE_BYTE( &this->index_buf[yuv_index],
+                                  &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                  &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                  *rowworkptr, *data, bitplainoffeset[palette_index] );
+              }
+              data++;
+              rowworkptr               += rowsize_all_planes;
+              row_ptr++;
+            }
+          } else {
+            if (op == 0) {
+              /* Same ops */
+              if (data >= data_end) return;
+              count                     = *data++;
+              while(count--) {
+                if (data >= data_end || rowworkptr >= picture_end)
+                  return;
+                yuv_index               = ((row_ptr * this->width) + pixel_ptr_bit);
+                if( this->is_ham ) {
+                  IFF_REPLACE_BYTE_SIMPLE(&this->index_buf[yuv_index],
+                                    *rowworkptr, *data, bitplainoffeset[palette_index] );
+                } else {
+                  IFF_REPLACE_BYTE( &this->index_buf[yuv_index],
+                                    &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                    &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                    *rowworkptr, *data, bitplainoffeset[palette_index] );
+                }
+                rowworkptr             += rowsize_all_planes;
+                row_ptr++;
+              }
+              data++;
+            } else {
+              /* Skip ops */
+              rowworkptr               += (rowsize_all_planes * op);
+              row_ptr                  += op;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/* ANIM op 7, "short" variant: walks the delta in this->buf per bitplane and per    */
+/* 16 pixel wide column.  this->buf begins with 32 bit offsets read via _X_BE_32(): */
+/* entry p locates the opcode bytes, entry p + 8 the data words of bitplane p.      */
+/* Decoding stops at the first access beyond the delta or the picture in buf_uk.    */
+static void bitplane_dlta_7_short (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 16;
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+  uint32_t yuv_index                    = 0;
+  uint32_t opcode_offset                = 0;
+  uint32_t data_offset                  = 0;
+  uint32_t palette_index                = 0;
+  uint32_t pixel_ptr                    = 0;
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;
+  uint8_t  *planeptr                    = NULL;
+  uint16_t *rowworkptr                  = NULL;
+  uint16_t *picture_end                 = (uint16_t *)(&this->buf_uk[(rowsize_all_planes * 2 * this->height)]);
+  uint16_t *data                        = NULL;
+  uint8_t  *data_end                    = &this->buf[this->size];
+  uint8_t  *op_ptr                      = NULL;
+  uint8_t  op_count                     = 0;
+  uint8_t  op                           = 0;
+  uint8_t  count                        = 0;
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+    if ((palette_index + 9) * 4 > (uint32_t)this->size) return; /* offset table cut short */
+    planeptr                            = &this->buf_uk[(palette_index * rowsize * 2)];
+    opcode_offset                       = _X_BE_32(&deltadata[palette_index]);
+    data_offset                         = _X_BE_32(&deltadata[palette_index + 8]);
+    /* a zero offset skips the plane, and so does one pointing behind the delta */
+    if (opcode_offset == 0 || opcode_offset >= (uint32_t)this->size) continue;
+    if (data_offset   == 0 || data_offset   >  (uint32_t)this->size) continue;
+
+    data                                = (uint16_t *)(&this->buf[data_offset]);
+    op_ptr                              = this->buf + opcode_offset;
+    for( pixel_ptr = 0; pixel_ptr < rowsize; pixel_ptr++) {
+      rowworkptr                        = (uint16_t *)(&planeptr[pixel_ptr * 2]);
+      pixel_ptr_bit                     = pixel_ptr * 16;
+      row_ptr                           = 0;
+      /* execute ops (count byte first); the *_end pointers are one past the last valid byte */
+      if (op_ptr >= data_end) return;
+      for( op_count = *op_ptr++; op_count; op_count--) {
+        if (op_ptr >= data_end) return;
+        op                              = *op_ptr++;
+        if (op & 0x80) {
+          /* Uniq ops: the low 7 bits count rows, every row takes its own data word */
+          count                         = op & 0x7f;
+          while(count--) {
+            if ((uint8_t *)(data + 1) > data_end || rowworkptr >= picture_end) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            data++;
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+        } else if (op == 0) {
+          /* Same ops: a count byte follows, all those rows share one data word */
+          if (op_ptr >= data_end) return;
+          count                         = *op_ptr++;
+          while(count--) {
+            if ((uint8_t *)(data + 1) > data_end || rowworkptr >= picture_end) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+          data++;
+        } else {
+          /* Skip ops: step over op rows of this column without touching them */
+          rowworkptr                   += (rowsize_all_planes * op);
+          row_ptr                      += op;
+        }
+      }
+    }
+  }
+}
+
+/* ANIM op 7, "long" variant: walks the delta in this->buf per bitplane and per     */
+/* 32 pixel wide column.  this->buf begins with 32 bit offsets read via _X_BE_32(): */
+/* entry p locates the opcode bytes, entry p + 8 the data words of bitplane p.      */
+/* Decoding stops at the first access beyond the delta or the picture in buf_uk.    */
+static void bitplane_dlta_7_long  (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 32;
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+  uint32_t yuv_index                    = 0;
+  uint32_t opcode_offset                = 0;
+  uint32_t data_offset                  = 0;
+  uint32_t palette_index                = 0;
+  uint32_t pixel_ptr                    = 0;
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;
+  uint8_t  *planeptr                    = NULL;
+  uint32_t *rowworkptr                  = NULL;
+  uint32_t *picture_end                 = (uint32_t *)(&this->buf_uk[(rowsize_all_planes * 4 * this->height)]);
+  uint32_t *data                        = NULL;
+  uint8_t  *data_end                    = &this->buf[this->size];
+  uint8_t  *op_ptr                      = NULL;
+  uint8_t  op_count                     = 0;
+  uint8_t  op                           = 0;
+  uint8_t  count                        = 0;
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+    if ((palette_index + 9) * 4 > (uint32_t)this->size) return; /* offset table cut short */
+    planeptr                            = &this->buf_uk[(palette_index * rowsize * 4)];
+    opcode_offset                       = _X_BE_32(&deltadata[palette_index]);
+    data_offset                         = _X_BE_32(&deltadata[palette_index + 8]);
+    /* a zero offset skips the plane, and so does one pointing behind the delta */
+    if (opcode_offset == 0 || opcode_offset >= (uint32_t)this->size) continue;
+    if (data_offset   == 0 || data_offset   >  (uint32_t)this->size) continue;
+    data                                = (uint32_t *)(&this->buf[data_offset]);
+    op_ptr                              = this->buf + opcode_offset;
+    for( pixel_ptr = 0; pixel_ptr < rowsize; pixel_ptr++) {
+      rowworkptr                        = (uint32_t *)(&planeptr[pixel_ptr * 4]);
+      pixel_ptr_bit                     = pixel_ptr * 32;
+      row_ptr                           = 0;
+      /* execute ops (count byte first); the *_end pointers are one past the last valid byte */
+      if (op_ptr >= data_end) return;
+      for( op_count = *op_ptr++; op_count; op_count--) {
+        if (op_ptr >= data_end) return;
+        op                              = *op_ptr++;
+        if (op & 0x80) {
+          /* Uniq ops: the low 7 bits count rows, every row takes its own data word */
+          count                         = op & 0x7f;
+          while(count--) {
+            if ((uint8_t *)(data + 1) > data_end || rowworkptr >= picture_end) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_LONG_SIMPLE(&this->index_buf[yuv_index],
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_LONG( &this->index_buf[yuv_index],
+                                &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            data++;
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+        } else if (op == 0) {
+          /* Same ops: a count byte follows, all those rows share one data word */
+          if (op_ptr >= data_end) return;
+          count                         = *op_ptr++;
+          while(count--) {
+            if ((uint8_t *)(data + 1) > data_end || rowworkptr >= picture_end) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_LONG_SIMPLE(&this->index_buf[yuv_index],
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_LONG( &this->index_buf[yuv_index],
+                                &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+          data++;
+        } else {
+          /* Skip ops: step over op rows of this column without touching them */
+          rowworkptr                   += (rowsize_all_planes * op);
+          row_ptr                      += op;
+        }
+      }
+    }
+  }
+}
+
+/* ANIM op 8, "short" variant: walks the delta in this->buf per bitplane and per    */
+/* 16 pixel wide column.  this->buf begins with one 32 bit offset per bitplane,     */
+/* read via _X_BE_32(), which locates that plane's stream of 16 bit words: for      */
+/* every column an op count, then the ops (both read via _X_BE_16()) and their data.*/
+/* Decoding stops at the first access beyond the delta or the picture in buf_uk.    */
+static void bitplane_dlta_8_short (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 16;
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+  uint32_t yuv_index                    = 0;
+  uint32_t delta_offset                 = 0;
+  uint32_t palette_index                = 0;
+  uint32_t pixel_ptr                    = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;
+  uint16_t *planeptr                    = NULL;
+  uint16_t *rowworkptr                  = NULL;
+  uint16_t *picture_end                 = (uint16_t *)(&this->buf_uk[(rowsize_all_planes * 2 * this->height)]);
+  uint16_t *data                        = NULL;
+  uint8_t  *data_end                    = &this->buf[this->size];
+  uint16_t op_count                     = 0;
+  uint16_t op                           = 0;
+  uint16_t count                        = 0;
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+    /* the offset table entry of this plane has to lie inside the delta */
+    if ((palette_index + 1) * 4 > (uint32_t)this->size) return;
+    planeptr                            = (uint16_t *)(&this->buf_uk[(palette_index * rowsize * 2)]);
+    delta_offset                        = _X_BE_32(&deltadata[palette_index]);
+    /* a zero offset skips the plane, and so does one pointing behind the delta */
+    if (delta_offset == 0 || delta_offset >= (uint32_t)this->size) continue;
+
+    data                                = (uint16_t *)(&this->buf[delta_offset]);
+    for( pixel_ptr = 0; pixel_ptr < rowsize; pixel_ptr++) {
+      rowworkptr                        = planeptr + pixel_ptr;
+      pixel_ptr_bit                     = pixel_ptr * 16;
+      row_ptr                           = 0;
+      /* execute ops: the column starts with the number of ops */
+      if ((uint8_t *)(data + 1) > data_end) return;
+      op_count                          = _X_BE_16(data);
+      data++;
+      for( ; op_count; op_count--) {
+        if ((uint8_t *)(data + 1) > data_end) return;
+        op                              = _X_BE_16(data);
+        data++;
+        if (op & 0x8000) {
+          /* Uniq ops: the low 15 bits count rows, every row takes its own data word */
+          count                         = op & 0x7fff;
+          while(count--) {
+            /* both *_end pointers are one past the last valid byte */
+            if ((uint8_t *)(data + 1) > data_end || rowworkptr >= picture_end) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            data++;
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+        } else if (op == 0) {
+          /* Same ops: a count word follows, all those rows share one data word */
+          if ((uint8_t *)(data + 1) > data_end) return;
+          count                         = _X_BE_16(data);
+          data++;
+          while(count--) {
+            if ((uint8_t *)(data + 1) > data_end || rowworkptr >= picture_end) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_SHORT_SIMPLE(&this->index_buf[yuv_index],
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_SHORT( &this->index_buf[yuv_index],
+                                 &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                 &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                 rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+          data++;
+        } else {
+          /* Skip ops: step over op rows of this column without touching them */
+          rowworkptr                   += (rowsize_all_planes * op);
+          row_ptr                      += op;
+        }
+      }
+    }
+  }
+}
+
+/* ANIM op 8, "long" variant: walks the delta in this->buf per bitplane and per     */
+/* 32 pixel wide column.  this->buf begins with one 32 bit offset per bitplane,     */
+/* read via _X_BE_32(), which locates that plane's stream of 32 bit words: for      */
+/* every column an op count, then the ops (both read via _X_BE_32()) and their data.*/
+/* Decoding stops at the first access beyond the delta or the picture in buf_uk.    */
+static void bitplane_dlta_8_long (bitplane_decoder_t *this) {
+  uint32_t rowsize                      = this->width / 32;
+  uint32_t rowsize_all_planes           = rowsize * this->num_bitplanes;
+  uint32_t yuv_index                    = 0;
+  uint32_t delta_offset                 = 0;
+  uint32_t palette_index                = 0;
+  uint32_t pixel_ptr                    = 0;
+  uint32_t pixel_ptr_bit                = 0;
+  uint32_t row_ptr                      = 0;
+  uint32_t *deltadata                   = (uint32_t *)this->buf;
+  uint32_t *planeptr                    = NULL;
+  uint32_t *rowworkptr                  = NULL;
+  uint32_t *picture_end                 = (uint32_t *)(&this->buf_uk[(rowsize_all_planes * 4 * this->height)]);
+  uint32_t *data                        = NULL;
+  uint8_t  *data_end                    = &this->buf[this->size];
+  uint32_t op_count                     = 0;
+  uint32_t op                           = 0;
+  uint32_t count                        = 0;
+
+  /* Repeat for each plane */
+  for(palette_index = 0; palette_index < this->num_bitplanes; palette_index++) {
+    if ((palette_index + 1) * 4 > (uint32_t)this->size) return; /* offset table cut short */
+    planeptr                            = (uint32_t *)(&this->buf_uk[(palette_index * rowsize * 4)]);
+    delta_offset                        = _X_BE_32(&deltadata[palette_index]);
+    /* a zero offset skips the plane, and so does one pointing behind the delta */
+    if (delta_offset == 0 || delta_offset >= (uint32_t)this->size) continue;
+
+    data                                = (uint32_t *)(&this->buf[delta_offset]);
+    for( pixel_ptr = 0; pixel_ptr < rowsize; pixel_ptr++) {
+      rowworkptr                        = planeptr + pixel_ptr;
+      pixel_ptr_bit                     = pixel_ptr * 32;
+      row_ptr                           = 0;
+      /* execute ops (op count first); the *_end pointers are one past the last valid byte */
+      if ((uint8_t *)(data + 1) > data_end) return;
+      op_count                          = _X_BE_32(data);
+      data++;
+      for( ; op_count; op_count--) {
+        if ((uint8_t *)(data + 1) > data_end) return;
+        op                              = _X_BE_32(data);
+        data++;
+        if (op & 0x80000000) {
+          /* Uniq ops: the low 31 bits count rows, every row takes its own data word */
+          count                         = op & 0x7fffffff;
+          while(count--) {
+            /* row_ptr is tested as well: the 32 bit product of a skip op may wrap */
+            if ((uint8_t *)(data + 1) > data_end ||
+                rowworkptr >= picture_end || row_ptr >= (uint32_t)this->height) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_LONG_SIMPLE(&this->index_buf[yuv_index],
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_LONG( &this->index_buf[yuv_index],
+                                &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            data++;
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+        } else if (op == 0) {
+          /* Same ops: a count word follows, all those rows share one data word */
+          if ((uint8_t *)(data + 1) > data_end) return;
+          count                         = _X_BE_32(data);
+          data++;
+          while(count--) {
+            if ((uint8_t *)(data + 1) > data_end ||
+                rowworkptr >= picture_end || row_ptr >= (uint32_t)this->height) return;
+            yuv_index                   = ((row_ptr * this->width) + pixel_ptr_bit);
+            if( this->is_ham ) {
+              IFF_REPLACE_LONG_SIMPLE(&this->index_buf[yuv_index],
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            } else {
+              IFF_REPLACE_LONG( &this->index_buf[yuv_index],
+                                &this->yuv_planes.y[yuv_index], &this->yuv_planes.u[yuv_index],
+                                &this->yuv_planes.v[yuv_index], this->yuv_palette,
+                                rowworkptr, data, bitplainoffeset[palette_index] );
+            }
+            rowworkptr                 += rowsize_all_planes;
+            row_ptr++;
+          }
+          data++;
+        } else {
+          /* Skip ops: step over op rows of this column without touching them */
+          rowworkptr                   += (rowsize_all_planes * op);
+          row_ptr                      += op;
+        }
+      }
+    }
+  }
+}
+
+static void bitplane_decode_data (video_decoder_t *this_gen,
+  buf_element_t *buf) {
+
+  bitplane_decoder_t *this              = (bitplane_decoder_t *) this_gen;
+  xine_bmiheader *bih                   = 0;
+  palette_entry_t *palette              = 0;
+  AnimHeader *anhd                      = NULL;
+  int i                                 = 0;
+  int j                                 = 0;
+  int buf_ptr                           = 0;
+  unsigned char r                       = 0;
+  unsigned char g                       = 0;
+  unsigned char b                       = 0;
+  uint8_t *buf_exchange                 = NULL;
+
+  vo_frame_t *img                       = 0; /* video out frame */
+
+  /* a video decoder does not care about this flag (?) */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if ((buf->decoder_flags & BUF_FLAG_SPECIAL) &&
+      (buf->decoder_info[1] == BUF_SPECIAL_PALETTE)) {
+    palette                             = (palette_entry_t *)buf->decoder_info_ptr[2];
+
+    for (i = 0; i < buf->decoder_info[2]; i++) {
+      this->yuv_palette[i * 4 + 0]      =
+        COMPUTE_Y(palette[i].r, palette[i].g, palette[i].b);
+      this->yuv_palette[i * 4 + 1]      =
+        COMPUTE_U(palette[i].r, palette[i].g, palette[i].b);
+      this->yuv_palette[i * 4 + 2]      =
+        COMPUTE_V(palette[i].r, palette[i].g, palette[i].b);
+      this->rgb_palette[i * 4 + 0]      = palette[i].r;
+      this->rgb_palette[i * 4 + 1]      = palette[i].g;
+      this->rgb_palette[i * 4 + 2]      = palette[i].b;
+    }
+
+    /* EHB Pictures not allways contain all 64 colors, sometimes only    */
+    /* the first 32 are included and sometimes all 64 colors are provide,*/
+    /* but second 32 are only stupid dirt, so recalculate them           */
+    if (((this->num_bitplanes  == 6) &&
+         (buf->decoder_info[2] == 32)) ||
+        (this->camg_mode & CAMG_EHB)) {
+      for (i = 32; i < 64; i++) {
+        this->rgb_palette[i * 4 + 0]    = palette[(i-32)].r / 2;
+        this->rgb_palette[i * 4 + 1]    = palette[(i-32)].g / 2;
+        this->rgb_palette[i * 4 + 2]    = palette[(i-32)].b / 2;
+        this->yuv_palette[i * 4 + 0]    =
+           COMPUTE_Y(this->rgb_palette[i*4+0], this->rgb_palette[i*4+1], this->rgb_palette[i*4+2]);
+        this->yuv_palette[i * 4 + 1]    =
+           COMPUTE_U(this->rgb_palette[i*4+0], this->rgb_palette[i*4+1], this->rgb_palette[i*4+2]);
+        this->yuv_palette[i * 4 + 2]    =
+           COMPUTE_V(this->rgb_palette[i*4+0], this->rgb_palette[i*4+1], this->rgb_palette[i*4+2]);
+       }
+    }
+
+    return;
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER) { /* need to initialize */
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+
+    bih                                 = (xine_bmiheader *) buf->content;
+    this->width                         = (bih->biWidth + 15) & ~0x0f;
+    this->height                        = bih->biHeight;
+    this->num_pixel                     = this->width * this->height;
+    this->ratio                         = (double)this->width/(double)this->height;
+    this->video_step                    = buf->decoder_info[1];
+    /* Palette based Formates use up to 8 Bit per pixel, always use 8 Bit if less */
+    this->bytes_per_pixel               = (bih->biBitCount + 1) / 8;
+    if( this->bytes_per_pixel < 1 )
+      this->bytes_per_pixel             = 1;
+
+    /* New Buffer for indexes (palette based formats) */
+    this->index_buf                     = calloc( this->num_pixel, this->bytes_per_pixel );
+    this->index_buf_hist                = calloc( this->num_pixel, this->bytes_per_pixel );
+
+    this->num_bitplanes                 = bih->biPlanes;
+    this->camg_mode                     = bih->biCompression;
+    if( this->camg_mode & CAMG_HAM )
+      this->is_ham                      = 1;
+    else
+      this->is_ham                      = 0;
+
+    if( buf->decoder_info[2]           != buf->decoder_info[3] &&
+        buf->decoder_info[3]            > 0 ) {
+      this->ratio                      *= buf->decoder_info[2];
+      this->ratio                      /= buf->decoder_info[3];
+    }
+
+    if( (bih->biCompression & CAMG_HIRES) &&
+        !(bih->biCompression & CAMG_LACE) ) {
+      if( (buf->decoder_info[2] * 16) > (buf->decoder_info[3] * 10) )
+        this->ratio                    /= 2.0;
+    }
+
+    if( !(bih->biCompression & CAMG_HIRES) &&
+        (bih->biCompression & CAMG_LACE) ) {
+      if( (buf->decoder_info[2] * 10) < (buf->decoder_info[3] * 16) )
+        this->ratio                    *= 2.0;
+    }
+
+    free (this->buf);
+    this->bufsize                       = VIDEOBUFSIZE;
+    this->buf                           = calloc(1, this->bufsize);
+    this->size                          = 0;
+    this->framenumber                   = 0;
+
+    init_yuv_planes(&this->yuv_planes, this->width, this->height);
+    init_yuv_planes(&this->yuv_planes_hist, this->width, this->height);
+
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+    this->decoder_ok                    = 1;
+
+    /* load the stream/meta info */
+    switch( buf->type ) {
+      case BUF_VIDEO_BITPLANE:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Uncompressed bitplane");
+        break;
+      case BUF_VIDEO_BITPLANE_BR1:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "ByteRun1 bitplane");
+        break;
+      default:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Unknown bitplane");
+        break;
+    }
+
+    return;
+  } else if (this->decoder_ok) {
+
+    this->skipframes                    = 0;
+    this->framenumber++;
+    if (this->size + buf->size > this->bufsize) {
+      this->bufsize                     = this->size + 2 * buf->size;
+      this->buf                         = realloc (this->buf, this->bufsize);
+    }
+
+    xine_fast_memcpy (&this->buf[this->size], buf->content, buf->size);
+
+    this->size                         += buf->size;
+
+    if (buf->decoder_flags & BUF_FLAG_FRAMERATE)
+      this->video_step = buf->decoder_info[0];
+
+    if (buf->decoder_flags & BUF_FLAG_FRAME_END) {
+
+      img = this->stream->video_out->get_frame (this->stream->video_out,
+                                        this->width, this->height,
+                                        this->ratio, XINE_IMGFMT_YUY2,
+                                        VO_BOTH_FIELDS);
+
+      img->duration                     = this->video_step;
+      img->pts                          = buf->pts;
+      img->bad_frame                    = 0;
+      anhd                              = (AnimHeader *)(buf->decoder_info_ptr[0]);
+
+      if( (this->buf_uk    == NULL) ||
+          (anhd            == NULL) ||
+          (anhd->operation == IFF_ANHD_ILBM) ) {
+
+        /* iterate through each row */
+        buf_ptr                         = 0;
+        this->size_uk                   = (((this->num_pixel) / 8) * this->num_bitplanes);
+
+        if( this->buf_uk_hist != NULL )
+          xine_fast_memcpy (this->buf_uk_hist, this->buf_uk, this->size_uk);
+        switch( buf->type ) {
+          case BUF_VIDEO_BITPLANE:
+            /* uncompressed Buffer, set decoded_buf pointer direct to input stream */
+            if( this->buf_uk == NULL )
+              this->buf_uk              = malloc(this->size);
+            xine_fast_memcpy (this->buf_uk, this->buf, this->size);
+            break;
+          case BUF_VIDEO_BITPLANE_BR1:
+            /* create Buffer for decompressed bitmap */
+            this->buf_uk                = bitplane_decode_byterun1(
+                                                   this->buf,          /* compressed buffer         */
+                                                   this->size,         /* size of compressed data   */
+                                                   this->size_uk );    /* size of uncompressed data */
+
+            if( this->buf_uk == NULL ) {
+              xine_log(this->stream->xine, XINE_LOG_MSG,
+                       _("bitplane: error doing ByteRun1 decompression\n"));
+              _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HANDLED, 0);
+              return;
+            }
+            /* set pointer to decompressed Buffer */
+            break;
+          default:
+            break;
+        }
+        bitplane_decode_bitplane(     this->buf_uk,              /* bitplane buffer         */
+                                      this->index_buf,           /* index buffer            */
+                                      this->width,               /* width                   */
+                                      this->height,              /* hight                   */
+                                      this->num_bitplanes,       /* number bitplanes        */
+                                      this->bytes_per_pixel);    /* used Bytes per pixel    */
+
+        if ((this->bytes_per_pixel == 1) &&
+            (this->is_ham == 0) ) {
+          buf_exchange                  = this->index_buf;
+          for (i = 0; i < (this->height * this->width); i++) {
+            j                           = *buf_exchange++ * 4;
+            this->yuv_planes.y[i]       = this->yuv_palette[j++];
+            this->yuv_planes.u[i]       = this->yuv_palette[j++];
+            this->yuv_planes.v[i]       = this->yuv_palette[j];
+          }
+        }
+        if( this->buf_uk_hist == NULL ) {
+          this->buf_uk_hist             = malloc(this->size_uk);
+          xine_fast_memcpy (this->buf_uk_hist, this->buf_uk, this->size_uk);
+          xine_fast_memcpy (this->index_buf_hist, this->index_buf,
+                            (this->num_pixel * this->bytes_per_pixel));
+          xine_fast_memcpy (this->yuv_planes_hist.y, this->yuv_planes.y, (this->num_pixel));
+          xine_fast_memcpy (this->yuv_planes_hist.u, this->yuv_planes.u, (this->num_pixel));
+          xine_fast_memcpy (this->yuv_planes_hist.v, this->yuv_planes.v, (this->num_pixel));
+        }
+      } else {
+        /* when no start-picture is given, create a empty one */
+        if( this->buf_uk_hist == NULL ) {
+          this->size_uk                 = (((this->num_pixel) / 8) * this->num_bitplanes);
+          this->buf_uk                  = calloc(this->num_bitplanes, ((this->num_pixel) / 8));
+          this->buf_uk_hist             = calloc(this->num_bitplanes, ((this->num_pixel) / 8));
+        }
+        if( this->index_buf == NULL ) {
+          this->index_buf               = calloc( this->num_pixel, this->bytes_per_pixel );
+          this->index_buf_hist          = calloc( this->num_pixel, this->bytes_per_pixel );
+        }
+
+        switch( anhd->operation ) {
+          /* also known as IFF-ANIM OPT1 (never seen in real world) */
+          case IFF_ANHD_XOR:
+            xine_log(this->stream->xine, XINE_LOG_MSG,
+                     _("bitplane: Anim Opt 1 is not supported at the moment\n"));
+            _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HANDLED, 0);
+            return;
+            break;
+          /* also known as IFF-ANIM OPT2 (never seen in real world) */
+          case IFF_ANHD_LDELTA:
+            xine_log(this->stream->xine, XINE_LOG_MSG,
+                     _("bitplane: Anim Opt 2 is not supported at the moment\n"));
+            _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HANDLED, 0);
+            return;
+            break;
+          /* also known as IFF-ANIM OPT3 */
+          case IFF_ANHD_SDELTA:
+            _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT3");
+            bitplane_sdelta_opt_3 ( this );
+            return;
+            break;
+          /* also known as IFF-ANIM OPT4 (never seen in real world) */
+          case IFF_ANHD_SLDELTA:
+            _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT4 (SLDELTA)");
+            bitplane_set_dlta_short ( this );
+            break;
+          /* also known as IFF-ANIM OPT5 */
+          case IFF_ANHD_BVDELTA:
+            _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT5 (BVDELTA)");
+            bitplane_dlta_5(this);
+            break;
+          /* IFF-ANIM OPT6 is exactly the same as OPT5, but for stereo-displays */
+          /* first picture is on the left display, second on the right, third on */
+          /* the left, forth on right, ... Only display left picture on mono display*/
+          case IFF_ANHD_STEREOO5:
+            _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT6 (BVDELTA STEREO)");
+            bitplane_dlta_5(this);
+            if( this->framenumber % 2   == 0 )
+              this->skipframes          = 1;
+            return;
+            break;
+          case IFF_ANHD_OPT7:
+            if(anhd->bits == 0) {
+              _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT7 (SHORT)");
+              bitplane_dlta_7_short(this);
+            } else {
+              _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT7 (LONG)");
+              bitplane_dlta_7_long(this);
+            }
+            break;
+          case IFF_ANHD_OPT8:
+            if(anhd->bits == 0) {
+              _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT8 (SHORT)");
+              bitplane_dlta_8_short(this);
+            } else {
+              _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Anim OPT8 (LONG)");
+              bitplane_dlta_8_long(this);
+            }
+            break;
+          case IFF_ANHD_ASCIIJ:
+            xine_log(this->stream->xine, XINE_LOG_MSG,
+                     _("bitplane: Anim ASCIIJ is not supported at the moment\n"));
+            _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HANDLED, 0);
+            return;
+            break;
+          default:
+            xine_log(this->stream->xine, XINE_LOG_MSG,
+                     _("bitplane: This anim-type is not supported at the moment\n"));
+            _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HANDLED, 0);
+            return;
+            break;
+        }
+        /* change old bitmap buffer (which now is the new one) with new buffer */
+        buf_exchange                    = this->buf_uk;
+        this->buf_uk                    = this->buf_uk_hist;
+        this->buf_uk_hist               = buf_exchange;
+        /* do the same with the index buffer */
+        buf_exchange                    = this->index_buf;
+        this->index_buf                 = this->index_buf_hist;
+        this->index_buf_hist            = buf_exchange;
+        /* and also with yuv buffer */
+        buf_exchange                    = this->yuv_planes.y;
+        this->yuv_planes.y              = this->yuv_planes_hist.y;
+        this->yuv_planes_hist.y         = buf_exchange;
+        buf_exchange                    = this->yuv_planes.u;
+        this->yuv_planes.u              = this->yuv_planes_hist.u;
+        this->yuv_planes_hist.u         = buf_exchange;
+        buf_exchange                    = this->yuv_planes.v;
+        this->yuv_planes.v              = this->yuv_planes_hist.v;
+        this->yuv_planes_hist.v         = buf_exchange;
+      }
+
+      if( this->skipframes == 0 ) {
+        switch (this->bytes_per_pixel) {
+          case 1:
+            /* HAM pictures need special handling */
+            if( this->is_ham ) {
+              /* Decode HAM-Pictures to YUV */
+              bitplane_decode_ham( this->index_buf,          /* HAM-bitplane buffer     */
+                                   &(this->yuv_planes),      /* YUV buffer              */
+                                   this->width,              /* width                   */
+                                   this->height,             /* hight                   */
+                                   this->num_bitplanes,      /* number bitplanes        */
+                                   this->bytes_per_pixel,    /* used Bytes per pixel    */
+                                   this->rgb_palette);       /* Palette (RGB)           */
+            }
+            break;
+          case 3:
+            buf_exchange                = this->index_buf;
+            for (i = 0; i < (this->height * this->width); i++) {
+              r                         = *buf_exchange++;
+              g                         = *buf_exchange++;
+              b                         = *buf_exchange++;
+
+              this->yuv_planes.y[i]     = COMPUTE_Y(r, g, b);
+              this->yuv_planes.u[i]     = COMPUTE_U(r, g, b);
+              this->yuv_planes.v[i]     = COMPUTE_V(r, g, b);
+            }
+            break;
+          default:
+            break;
+        }
+
+        yuv444_to_yuy2(&this->yuv_planes, img->base[0], img->pitches[0]);
+
+        img->draw(img, this->stream);
+      }
+      img->free(img);
+
+      this->size                        = 0;
+      if ( buf->decoder_info[1] > 90000 )
+        xine_usec_sleep(buf->decoder_info[1]);
+    }
+  }
+}
+
+/* Flush hook of the video decoder interface: this decoder has nothing
+ * to flush, so the body only marks its argument as unused. */
+static void bitplane_flush (video_decoder_t *this_gen)
+{
+  (void) this_gen;
+}
+
+/*
+ * Reset hook of the video decoder interface: sets the decoder's size
+ * counter back to zero.
+ */
+static void bitplane_reset (video_decoder_t *this_gen)
+{
+  ((bitplane_decoder_t *) this_gen)->size = 0;
+}
+
+static void bitplane_discontinuity (video_decoder_t *this_gen)
+{ (void) this_gen; /* discontinuity hook: intentionally a no-op */ }
+
+/* Frees the decoder's buffers (each exactly once), closes the stream's video
+ * output when decoder_ok is set, then frees the instance itself.
+ * NOTE(review): yuv_planes/yuv_planes_hist are not released here - confirm.
+ */
+static void bitplane_dispose (video_decoder_t *this_gen) {
+  bitplane_decoder_t *this              = (bitplane_decoder_t *) this_gen;
+
+  free (this->buf);
+  free (this->buf_uk);
+  free (this->buf_uk_hist);
+  free (this->index_buf);
+  free (this->index_buf_hist);
+
+  if (this->decoder_ok) {
+    this->decoder_ok                    = 0;
+    this->stream->video_out->close(this->stream->video_out, this->stream);
+  }
+
+  free (this_gen);
+}
+
+/* Creates a decoder instance bound to stream; returns NULL if out of memory. */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  bitplane_decoder_t  *this             = calloc(1, sizeof(*this));
+
+  if (!this)
+    return NULL;
+
+  this->video_decoder.decode_data       = bitplane_decode_data;
+  this->video_decoder.flush             = bitplane_flush;
+  this->video_decoder.reset             = bitplane_reset;
+  this->video_decoder.discontinuity     = bitplane_discontinuity;
+  this->video_decoder.dispose           = bitplane_dispose;
+  this->size                            = 0;
+  this->stream                          = stream;
+  this->class                           = (bitplane_class_t *) class_gen;
+  this->decoder_ok                      = 0;
+  /* start with every buffer pointer NULL so bitplane_dispose() can free them all */
+  this->buf                             = NULL;
+  this->buf_uk = this->buf_uk_hist      = NULL;
+  this->index_buf = this->index_buf_hist = NULL;
+  return &this->video_decoder;
+}
+
+/* Plugin entry point: builds the decoder class; returns NULL if out of memory. */
+static void *init_plugin (xine_t *xine, void *data) {
+  bitplane_class_t *this                = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+  this->decoder_class.open_plugin       = open_plugin;
+  this->decoder_class.identifier        = "bitplane";
+  this->decoder_class.description       = N_("Raw bitplane video decoder plugin");
+  this->decoder_class.dispose           = default_video_decoder_class_dispose;
+  return this;
+}
+
+/*
+ * exported plugin catalog entry
+ */
+/* buffer types handled by this decoder; the list is terminated by 0 */
+static const uint32_t video_types[] = {
+  BUF_VIDEO_BITPLANE,
+  BUF_VIDEO_BITPLANE_BR1,
+  0                    /* end of list */
+};
+/* ties the supported buffer types to this plugin's priority */
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types */
+  1                    /* priority        */
+};
+/* catalog exported by this plugin; the array ends with a PLUGIN_NONE entry */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "bitplane", XINE_VERSION_CODE, &dec_info_video, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/foovideo.c b/src/video_dec/foovideo.c
new file mode 100644
index 000000000..99ec1287b
--- /dev/null
+++ b/src/video_dec/foovideo.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (C) 2000-2003 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * foovideo.c: This is a reference video decoder for the xine multimedia
+ * player. It really works too! It will output frames of packed YUY2 data
+ * where each byte in the map is the same value, which is 3 larger than the
+ * value from the last frame. This creates a slowly rotating solid color
+ * frame when the frames are played in succession.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+
+#define VIDEOBUFSIZE (128*1024)  /* initial size of the accumulation buffer, in bytes */
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* generic class part, filled in by init_plugin() */
+} foovideo_class_t;
+
+typedef struct foovideo_decoder_s {
+  video_decoder_t   video_decoder;  /* parent video decoder structure; the callbacks cast it back to this type */
+
+  foovideo_class_t *class;          /* class handed to open_plugin() */
+  xine_stream_t    *stream;         /* stream handed to open_plugin() */
+
+  /* these are traditional variables in a video decoder object */
+  uint64_t          video_step;  /* frame duration in pts units */
+  int               decoder_ok;  /* set to 1 once a BUF_FLAG_STDHEADER buffer initialized the decoder */
+  int               skipframes;  /* not referenced anywhere else in this file */
+
+  unsigned char    *buf;         /* the accumulated buffer data */
+  int               bufsize;     /* the allocated size of buf */
+  int               size;        /* the number of bytes currently stored in buf */
+
+  int               width;       /* the width of a video frame */
+  int               height;      /* the height of a video frame */
+  double            ratio;       /* the width to height ratio */
+
+  /* these are variables exclusive to the foo video decoder */
+  unsigned char     current_yuv_byte;  /* fill byte of the next frame; advanced by 3 per frame */
+
+} foovideo_decoder_t;
+
+/**************************************************************************
+ * foovideo specific decode functions
+ *************************************************************************/
+
+/**************************************************************************
+ * xine video plugin functions
+ *************************************************************************/
+
+/*
+ * Handles one buffer from the demuxer layer according to its header flags:
+ * preview buffers are ignored, BUF_FLAG_FRAMERATE updates the frame duration,
+ * BUF_FLAG_STDHEADER (re)initializes the decoder, any other buffer is
+ * accumulated until BUF_FLAG_FRAME_END makes the decoder emit a frame.
+ */
+static void foovideo_decode_data (video_decoder_t *this_gen,
+  buf_element_t *buf) {
+  foovideo_decoder_t *this = (foovideo_decoder_t *) this_gen;
+  xine_bmiheader *bih;
+  vo_frame_t *img; /* video out frame */
+
+  /* a video decoder does not care about this flag (?) */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if (buf->decoder_flags & BUF_FLAG_FRAMERATE) {
+    this->video_step = buf->decoder_info[0];
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, this->video_step);
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER) { /* need to initialize */
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+
+    bih = (xine_bmiheader *) buf->content;
+    this->width = bih->biWidth;
+    this->height = bih->biHeight;
+    this->ratio = (double)this->width/(double)this->height;
+
+    /* release a buffer left over from an earlier header - exactly once */
+    free (this->buf);
+    this->size = 0;
+    this->bufsize = VIDEOBUFSIZE;
+    this->buf = malloc(this->bufsize);
+    if (!this->buf) {
+      /* out of memory: undo the open above, stay inactive until the next header */
+      this->stream->video_out->close(this->stream->video_out, this->stream);
+      this->decoder_ok = 0;
+      return;
+    }
+
+    /* take this opportunity to load the stream/meta info */
+    _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "foovideo");
+    /* do anything else relating to initializing this decoder */
+    this->current_yuv_byte = 0;
+    this->decoder_ok = 1;
+    return;
+  } else if (this->decoder_ok) {
+    if (this->size + buf->size > this->bufsize) {
+      int            grown_size = this->size + 2 * buf->size;
+      unsigned char *grown      = realloc (this->buf, grown_size);
+      if (!grown) {       /* old buffer is still valid; drop the pending data */
+        this->size = 0;
+        return;
+      }
+      this->buf = grown;
+      this->bufsize = grown_size;
+    }
+    xine_fast_memcpy (&this->buf[this->size], buf->content, buf->size);
+    this->size += buf->size;
+
+    if (buf->decoder_flags & BUF_FLAG_FRAME_END) {
+      img = this->stream->video_out->get_frame (this->stream->video_out,
+                                        this->width, this->height,
+                                        this->ratio,
+                                        XINE_IMGFMT_YUY2, VO_BOTH_FIELDS);
+      img->duration  = this->video_step;
+      img->pts       = buf->pts;
+      img->bad_frame = 0;
+
+      memset(img->base[0], this->current_yuv_byte,
+        this->width * this->height * 2);
+      this->current_yuv_byte += 3;
+      img->draw(img, this->stream);
+      img->free(img);
+      this->size = 0;
+    }
+  }
+}
+
+/*
+ * Flush hook of the decoder interface; this reference decoder does nothing.
+ */
+static void foovideo_flush (video_decoder_t *this_gen)
+{ (void) this_gen; }
+
+/*
+ * Reset hook: discards whatever has been accumulated so far by setting
+ * the current size of the accumulation buffer back to zero.
+ */
+static void foovideo_reset (video_decoder_t *this_gen)
+{
+  ((foovideo_decoder_t *) this_gen)->size = 0;
+}
+
+/*
+ * The decoder should forget any stored pts values here; this reference
+ * decoder stores none, so the hook only marks its argument as unused.
+ */
+static void foovideo_discontinuity (video_decoder_t *this_gen) {
+  (void) this_gen;
+}
+
+/*
+ * Tears down one decoder instance: frees the accumulation buffer, closes
+ * the video output if decoder_ok is still set, then frees the instance.
+ */
+static void foovideo_dispose (video_decoder_t *this_gen)
+{
+  foovideo_decoder_t *dec = (foovideo_decoder_t *) this_gen;
+  xine_stream_t *stream = dec->stream;
+
+  free (dec->buf);
+  if (dec->decoder_ok) {
+    dec->decoder_ok = 0;
+    stream->video_out->close(stream->video_out, stream);
+  }
+  free (this_gen);
+}
+
+/*
+ * This function allocates, initializes, and returns a private video
+ * decoder structure; it returns NULL when the allocation fails.
+ */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  foovideo_decoder_t  *this = calloc(1, sizeof(*this));
+
+  if (!this)
+    return NULL;
+
+  this->video_decoder.decode_data         = foovideo_decode_data;
+  this->video_decoder.flush               = foovideo_flush;
+  this->video_decoder.reset               = foovideo_reset;
+  this->video_decoder.discontinuity       = foovideo_discontinuity;
+  this->video_decoder.dispose             = foovideo_dispose;
+  this->size                              = 0;
+
+  this->stream                            = stream;
+  this->class                             = (foovideo_class_t *) class_gen;
+
+  this->decoder_ok    = 0;
+  this->buf           = NULL;
+
+  return &this->video_decoder;
+}
+
+/*
+ * Class destructor: releases the memory block holding the decoder class.
+ */
+static void dispose_class (video_decoder_class_t *this)
+{
+  free (this);
+}
+
+/*
+ * This function allocates a private video decoder class and initializes
+ * the class's member functions; it returns NULL when out of memory.
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+  foovideo_class_t *this = calloc(1, sizeof(*this));
+
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "foovideo";
+  this->decoder_class.description     = N_("foovideo: reference xine video decoder plugin");
+  this->decoder_class.dispose         = dispose_class;
+
+  return this;
+}
+
+/*
+ * This is a list of all of the internal xine video buffer types that
+ * this decoder is able to handle. Check src/xine-engine/buffer.h for a
+ * list of valid buffer types (and add a new one if the one you need does
+ * not exist). Terminate the list with a 0.
+ */
+static const uint32_t video_types[] = {
+  /* BUF_VIDEO_FOOVIDEO,   (presumably a placeholder for the decoder's own type) */
+  BUF_VIDEO_VQA,
+  BUF_VIDEO_SORENSON_V3,
+  0                    /* end of list */
+};
+
+/*
+ * This data structure combines the list of supported xine buffer types and
+ * the priority that the plugin should be given with respect to other
+ * plugins that handle the same buffer type. A plugin with priority (n+1)
+ * will be used instead of a plugin with priority (n).
+ */
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types */
+  5                    /* priority        */
+};
+
+/*
+ * The plugin catalog entry. This is the only information that this plugin
+ * will export to the public. The array ends with a PLUGIN_NONE entry.
+ */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* { type, API, "name", version, special_info, init_function } */
+  { PLUGIN_VIDEO_DECODER, 19, "foovideo", XINE_VERSION_CODE, &dec_info_video, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/gdkpixbuf.c b/src/video_dec/gdkpixbuf.c
new file mode 100644
index 000000000..ed88323fb
--- /dev/null
+++ b/src/video_dec/gdkpixbuf.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2006 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * a gdk-pixbuf-based image video decoder
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+
+#define LOG_MODULE "gdkpixbuf_video_decoder"
+#define LOG_VERBOSE
+/*
+#define LOG
+*/
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+
+#include <gdk-pixbuf/gdk-pixbuf.h>
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* generic class part, filled in by init_class() */
+
+  /*
+   * private variables (none so far)
+   */
+
+} image_class_t;
+
+
+typedef struct image_decoder_s {
+  video_decoder_t   video_decoder;  /* generic decoder part; the callbacks cast it back to image_decoder_t */
+
+  image_class_t    *cls;            /* class handed to open_plugin() */
+
+  xine_stream_t    *stream;         /* stream handed to open_plugin() */
+  int               video_open;     /* set to 1 once video_out->open() has been called */
+
+  GdkPixbufLoader  *loader;         /* loader fed by image_decode_data(); NULL when none is active */
+
+} image_decoder_t;
+
+
+/*
+ * Feeds every incoming buffer to a GdkPixbufLoader (created on demand) and,
+ * once BUF_FLAG_FRAME_END is seen, converts the decoded RGB image to YUY2
+ * and hands it to the video output. On a loader error the loader is
+ * released, which discards everything received so far.
+ */
+static void image_decode_data (video_decoder_t *this_gen, buf_element_t *buf) {
+  image_decoder_t *this = (image_decoder_t *) this_gen;
+  GError *error = NULL;
+
+  if (!this->video_open) {
+    lprintf("opening video\n");
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+    this->video_open = 1;
+  }
+
+  if (this->loader == NULL) {
+    this->loader = gdk_pixbuf_loader_new ();
+  }
+
+  if (gdk_pixbuf_loader_write (this->loader, buf->mem, buf->size, &error) == FALSE) {
+    lprintf("error loading image: %s\n", error->message);
+    g_error_free (error);
+    gdk_pixbuf_loader_close (this->loader, NULL);
+    g_object_unref (G_OBJECT (this->loader));
+    this->loader = NULL;
+    return;
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_FRAME_END) {
+    GdkPixbuf         *pixbuf;
+    int                width, height, x, y, rowstride, n_channels, i;
+    guchar            *img_buf;
+    yuv_planes_t       yuv_planes;
+    vo_frame_t        *img;
+
+    /* data collected by the loader -> rgb pixbuf */
+    if (gdk_pixbuf_loader_close (this->loader, &error) == FALSE) {
+      lprintf("error loading image: %s\n", error->message);
+      g_error_free (error);
+      g_object_unref (G_OBJECT (this->loader));
+      this->loader = NULL;
+      return;
+    }
+
+    pixbuf = gdk_pixbuf_loader_get_pixbuf (this->loader);
+    if (pixbuf != NULL)
+      g_object_ref (G_OBJECT (pixbuf));
+    g_object_unref (this->loader);
+    this->loader = NULL;
+
+    if (pixbuf == NULL) {
+      lprintf("error loading image\n");
+      return;
+    }
+
+    width = gdk_pixbuf_get_width (pixbuf) & ~1; /* must be even for init_yuv_planes */
+    height = gdk_pixbuf_get_height (pixbuf);
+    img_buf = gdk_pixbuf_get_pixels (pixbuf);
+    if (width <= 0 || height <= 0) { /* e.g. 1 pixel wide: width was rounded down to 0 */
+      lprintf("image has no displayable area\n");
+      g_object_unref (G_OBJECT (pixbuf));
+      return;
+    }
+
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_WIDTH, width);
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, height);
+    lprintf("image loaded successfully\n");
+
+    /* rgb data -> yuv_planes */
+    init_yuv_planes(&yuv_planes, width, height);
+    n_channels = gdk_pixbuf_get_n_channels (pixbuf);
+    rowstride = gdk_pixbuf_get_rowstride (pixbuf);
+    i = 0;
+    for (y = 0; y < height; y++) {
+      for (x = 0; x < width; x++) {
+        guchar *p = img_buf + y * rowstride + x * n_channels;
+
+        yuv_planes.y[i] = COMPUTE_Y (p[0], p[1], p[2]);
+        yuv_planes.u[i] = COMPUTE_U (p[0], p[1], p[2]);
+        yuv_planes.v[i] = COMPUTE_V (p[0], p[1], p[2]);
+        i++;
+      }
+    }
+    /* gdk_pixbuf_unref() is deprecated; a pixbuf is released like any GObject */
+    g_object_unref (G_OBJECT (pixbuf));
+
+    /* alloc and draw video frame */
+    img = this->stream->video_out->get_frame (this->stream->video_out, width,
+                                              height, (double)width/(double)height,
+                                              XINE_IMGFMT_YUY2,
+                                              VO_BOTH_FIELDS);
+    img->pts = buf->pts;
+    img->duration = 3600;
+    img->bad_frame = 0;
+
+    yuv444_to_yuy2(&yuv_planes, img->base[0], img->pitches[0]);
+    free_yuv_planes(&yuv_planes);
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, img->duration);
+    img->draw(img, this->stream);
+    img->free(img);
+  }
+}
+
+
+/*
+ * Flush hook: frames still stored in a decoder would be flushed out here.
+ */
+static void image_flush (video_decoder_t *this_gen)
+{
+  (void) this_gen;   /* this implementation does nothing */
+}
+
+
+/*
+ * Reset after an engine flush: upcoming data is unrelated to what was
+ * received so far, so a loader that is still active is closed and released.
+ */
+static void image_reset (video_decoder_t *this_gen)
+{
+  image_decoder_t *dec = (image_decoder_t *) this_gen;
+
+  if (dec->loader == NULL)
+    return;
+  gdk_pixbuf_loader_close (dec->loader, NULL);
+  g_object_unref (G_OBJECT (dec->loader));
+  dec->loader = NULL;
+}
+
+
+/*
+ * Discontinuity hook: a time reference discontinuity has happened and a
+ * decoder must forget any pts value it holds. Nothing is done here.
+ */
+static void image_discontinuity (video_decoder_t *this_gen)
+{
+  (void) this_gen;
+}
+
+/* Destroys a decoder instance: closes the video output if it was opened,
+ * releases a loader that is still pending, then frees the instance.
+ */
+static void image_dispose (video_decoder_t *this_gen) {
+  image_decoder_t *dec = (image_decoder_t *) this_gen;
+
+  if (dec->video_open) {
+    lprintf("closing video\n");
+    dec->stream->video_out->close(dec->stream->video_out, dec->stream);
+    dec->video_open = 0;
+  }
+  if (dec->loader != NULL) {
+    gdk_pixbuf_loader_close (dec->loader, NULL);
+    g_object_unref (G_OBJECT (dec->loader));
+    dec->loader = NULL;
+  }
+  lprintf("closed\n");
+  free (dec);
+}
+
+
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen,
+				     xine_stream_t *stream) {
+
+  image_class_t   *cls = (image_class_t *) class_gen;
+  image_decoder_t *this;
+
+  lprintf("opened\n");
+
+  g_type_init ();
+
+  this = (image_decoder_t *) calloc(1, sizeof(image_decoder_t));
+
+  this->video_decoder.decode_data         = image_decode_data;
+  this->video_decoder.flush               = image_flush;
+  this->video_decoder.reset               = image_reset;
+  this->video_decoder.discontinuity       = image_discontinuity;
+  this->video_decoder.dispose             = image_dispose;
+  this->cls                               = cls;
+  this->stream                            = stream;
+
+  /*
+   * initialisation of privates
+   */
+
+  return &this->video_decoder;
+}
+
+/*
+ * image plugin class: allocates the decoder class and fills in its
+ * callbacks and descriptive strings; returns NULL when out of memory.
+ */
+static void *init_class (xine_t *xine, void *data) {
+
+  image_class_t       *this;
+
+  this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "gdkpixbuf";
+  this->decoder_class.description     = N_("gdk-pixbuf image video decoder plugin");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+  /* the class has no private members to initialise */
+
+  lprintf("class opened\n");
+
+  return this;
+}
+
+/*
+ * exported plugin catalog entry
+ */
+/* buffer types handled by this decoder; the list is terminated by 0 */
+static const uint32_t supported_types[] = { BUF_VIDEO_IMAGE, BUF_VIDEO_JPEG, 0 };
+/* ties the supported buffer types to this plugin's priority */
+static const decoder_info_t dec_info_image = {
+  supported_types,     /* supported types */
+  7                    /* priority        */
+};
+/* catalog exported by this plugin; the array ends with a PLUGIN_NONE entry */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "gdkpixbuf", XINE_VERSION_CODE, &dec_info_image, init_class },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/image.c b/src/video_dec/image.c
new file mode 100644
index 000000000..e91588702
--- /dev/null
+++ b/src/video_dec/image.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (C) 2003-2005 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * a image video decoder
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+
+#define LOG_MODULE "image_video_decoder"
+#define LOG_VERBOSE
+/*
+#define LOG
+*/
+
+#include <wand/magick_wand.h>
+#ifdef PACKAGE_NAME
+#undef PACKAGE_BUGREPORT
+#undef PACKAGE_NAME
+#undef PACKAGE_STRING
+#undef PACKAGE_TARNAME
+#undef PACKAGE_VERSION
+#endif
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+
+#ifdef HAVE_GRAPHICSMAGICK /* GraphicsMagick build: always take the pre-0x671 code paths */
+# define MAGICK_VERSION 0x670
+#else
+# if !defined(MagickLibVersion) || MagickLibVersion < 0x671 /* unknown or older library version */
+#  define MAGICK_VERSION 0x670
+#else
+#  define MAGICK_VERSION MagickLibVersion /* compared against 0x671 in image_decode_data() */
+# endif
+#endif
+
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* generic class part, filled in by init_class() */
+
+  /*
+   * private variables (none so far)
+   */
+
+} image_class_t;
+
+
+typedef struct image_decoder_s {
+  video_decoder_t   video_decoder;  /* generic decoder part; the callbacks cast it back to image_decoder_t */
+
+  image_class_t    *cls;            /* class handed to open_plugin() */
+
+  xine_stream_t    *stream;         /* stream handed to open_plugin() */
+  int               video_open;     /* set to 1 once video_out->open() has been called */
+
+  unsigned char    *image;          /* buffer from xine_buffer_init() collecting the encoded image */
+  int               index;          /* number of bytes collected in image so far */
+
+} image_decoder_t;
+
+
+/*
+ * Appends every incoming buffer to this->image; once BUF_FLAG_FRAME_END is
+ * seen the collected blob is decoded with MagickWand, converted to YUY2 and
+ * handed to the video output. Undecodable or empty images and allocation
+ * failures are skipped without drawing a frame.
+ */
+static void image_decode_data (video_decoder_t *this_gen, buf_element_t *buf) {
+  image_decoder_t *this = (image_decoder_t *) this_gen;
+
+  if (!this->video_open) {
+    lprintf("opening video\n");
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+    this->video_open = 1;
+  }
+
+  xine_buffer_copyin(this->image, this->index, buf->mem, buf->size);
+  this->index += buf->size;
+
+  if (buf->decoder_flags & BUF_FLAG_FRAME_END) {
+    int                width, height, i;
+    int                status;
+    MagickWand        *wand;
+    uint8_t           *img_buf, *img_buf_ptr;
+    yuv_planes_t       yuv_planes;
+    vo_frame_t        *img;
+
+    /* this->image -> rgb data */
+#if MAGICK_VERSION < 0x671
+    InitializeMagick(NULL);
+#else
+    MagickWandGenesis();
+#endif
+    wand = NewMagickWand();
+    status = MagickReadImageBlob(wand, this->image, this->index);
+    this->index = 0;
+
+    if (!status) {
+      DestroyMagickWand(wand);
+#if MAGICK_VERSION < 0x671
+      DestroyMagick();
+#else
+      MagickWandTerminus();
+#endif
+      lprintf("error loading image\n");
+      return;
+    }
+
+    width = MagickGetImageWidth(wand) & ~1; /* must be even for init_yuv_planes */
+    height = MagickGetImageHeight(wand);
+    /* no pixel buffer for an empty image (a width of 1 was rounded down to 0) */
+    img_buf = (width > 0 && height > 0) ? malloc((size_t)width * height * 3) : NULL;
+#if MAGICK_VERSION < 0x671
+    if (img_buf)
+      MagickGetImagePixels(wand, 0, 0, width, height, "RGB", CharPixel, img_buf);
+    DestroyMagickWand(wand);
+    DestroyMagick();
+#else
+    if (img_buf)
+      MagickExportImagePixels(wand, 0, 0, width, height, "RGB", CharPixel, img_buf);
+    DestroyMagickWand(wand);
+    MagickWandTerminus();
+#endif
+    if (!img_buf) {
+      lprintf("empty image or out of memory\n");
+      return;
+    }
+
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_WIDTH, width);
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, height);
+    lprintf("image loaded successfully\n");
+    /* rgb data -> yuv_planes */
+    init_yuv_planes(&yuv_planes, width, height);
+    img_buf_ptr = img_buf;
+    for (i=0; i < width*height; i++) {
+      uint8_t r = *(img_buf_ptr++);
+      uint8_t g = *(img_buf_ptr++);
+      uint8_t b = *(img_buf_ptr++);
+
+      yuv_planes.y[i] = COMPUTE_Y(r, g, b);
+      yuv_planes.u[i] = COMPUTE_U(r, g, b);
+      yuv_planes.v[i] = COMPUTE_V(r, g, b);
+    }
+    free(img_buf);
+
+    /* alloc and draw video frame */
+    img = this->stream->video_out->get_frame (this->stream->video_out, width,
+                                              height, (double)width/(double)height,
+                                              XINE_IMGFMT_YUY2,
+                                              VO_BOTH_FIELDS);
+    img->pts = buf->pts;
+    img->duration = 3600;
+    img->bad_frame = 0;
+    yuv444_to_yuy2(&yuv_planes, img->base[0], img->pitches[0]);
+    free_yuv_planes(&yuv_planes);
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, img->duration);
+    img->draw(img, this->stream);
+    img->free(img);
+  }
+}
+
+
+/*
+ * Flush hook: frames still stored in a decoder would be flushed out here.
+ */
+static void image_flush (video_decoder_t *this_gen)
+{
+  (void) this_gen;   /* this implementation does nothing */
+}
+
+
+/*
+ * Reset after an engine flush: upcoming data is unrelated to what was
+ * received before, so the write position in the image buffer is rewound.
+ */
+static void image_reset (video_decoder_t *this_gen)
+{
+  image_decoder_t *dec = (image_decoder_t *) this_gen;
+
+  dec->index = 0;
+}
+
+
+/*
+ * Discontinuity hook: a time reference discontinuity has happened and a
+ * decoder must forget any pts value it holds. Nothing is done here.
+ */
+static void image_discontinuity (video_decoder_t *this_gen)
+{
+  (void) this_gen;
+}
+
+/* Destroys a decoder instance: closes the video output if it was opened,
+ * releases the image buffer, then frees the instance.
+ */
+static void image_dispose (video_decoder_t *this_gen) {
+  image_decoder_t *dec = (image_decoder_t *) this_gen;
+
+  if (dec->video_open) {
+    lprintf("closing video\n");
+    dec->stream->video_out->close(dec->stream->video_out, dec->stream);
+    dec->video_open = 0;
+  }
+  xine_buffer_free(dec->image);
+  lprintf("closed\n");
+  free (dec);
+}
+
+
+/* Creates a decoder instance for the given stream plus the growable buffer
+ * that collects the image data. Returns NULL when calloc() fails or
+ * xine_buffer_init() yields no buffer. */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen,
+                                     xine_stream_t *stream) {
+  image_decoder_t *this;
+
+  lprintf("opened\n");
+  this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->video_decoder.decode_data         = image_decode_data;
+  this->video_decoder.flush               = image_flush;
+  this->video_decoder.reset               = image_reset;
+  this->video_decoder.discontinuity       = image_discontinuity;
+  this->video_decoder.dispose             = image_dispose;
+  this->cls                               = (image_class_t *) class_gen;
+  this->stream                            = stream;
+  this->image = xine_buffer_init(10240);
+  if (!this->image) {
+    free (this);
+    return NULL;
+  }
+  return &this->video_decoder;
+}
+
+/*
+ * image plugin class: allocates the decoder class and fills in its
+ * callbacks and descriptive strings; returns NULL when out of memory.
+ */
+static void *init_class (xine_t *xine, void *data) {
+
+  image_class_t       *this;
+
+  this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "imagevdec";
+  this->decoder_class.description     = N_("image video decoder plugin");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+  /* the class has no private members to initialise */
+
+  lprintf("class opened\n");
+
+  return this;
+}
+
+/*
+ * exported plugin catalog entry
+ */
+/* buffer types handled by this decoder; the list is terminated by 0 */
+static const uint32_t supported_types[] = { BUF_VIDEO_IMAGE,
+                                      0 };
+/* ties the supported buffer types to this plugin's priority */
+static const decoder_info_t dec_info_image = {
+  supported_types,     /* supported types */
+  6                    /* priority        */
+};
+/* catalog exported by this plugin; the array ends with a PLUGIN_NONE entry */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "image", XINE_VERSION_CODE, &dec_info_image, init_class },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/libmpeg2/Makefile.am b/src/video_dec/libmpeg2/Makefile.am
new file mode 100644
index 000000000..a6aab2a72
--- /dev/null
+++ b/src/video_dec/libmpeg2/Makefile.am
@@ -0,0 +1,34 @@
+include $(top_srcdir)/misc/Makefile.quiet
+include $(top_builddir)/misc/Makefile.plugins
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS  = $(DEFAULT_OCFLAGS) $(VISIBILITY_FLAG)
+AM_LDFLAGS = $(xineplug_ldflags)
+
+noinst_HEADERS = vlc.h mpeg2.h xvmc.h xvmc_vld.h mpeg2_internal.h idct_mlib.h vis.h \
+	libmpeg2_accel.h
+
+xineplug_LTLIBRARIES = xineplug_decode_mpeg2.la
+
+xineplug_decode_mpeg2_la_SOURCES = \
+	cpu_state.c \
+	decode.c \
+	header.c \
+	idct.c \
+	idct_altivec.c \
+	idct_mlib.c \
+	idct_mmx.c \
+	motion_comp.c \
+	motion_comp_altivec.c \
+	motion_comp_mmx.c \
+	motion_comp_mlib.c \
+	motion_comp_vis.c \
+	slice.c \
+	slice_xvmc.c \
+	slice_xvmc_vld.c \
+	stats.c \
+	xine_mpeg2_decoder.c \
+	libmpeg2_accel.c
+
+xineplug_decode_mpeg2_la_LIBADD = $(XINE_LIB) $(MLIB_LIBS) $(LTLIBINTL) $(AVUTIL_LIBS) -lm
+xineplug_decode_mpeg2_la_CFLAGS = $(AM_CFLAGS) $(MLIB_CFLAGS) $(AVUTIL_CFLAGS)
diff --git a/src/video_dec/libmpeg2/cpu_state.c b/src/video_dec/libmpeg2/cpu_state.c
new file mode 100644
index 000000000..d1507eec1
--- /dev/null
+++ b/src/video_dec/libmpeg2/cpu_state.c
@@ -0,0 +1,184 @@
+/*
+ * cpu_state.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+#include "xine_mmx.h"
+
+void (* mpeg2_cpu_state_save) (cpu_state_t * state) = NULL;	/* save hook; stays NULL unless mpeg2_cpu_state_init installs one */
+void (* mpeg2_cpu_state_restore) (cpu_state_t * state) = NULL;	/* restore hook; stays NULL unless mpeg2_cpu_state_init installs one */
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static void state_restore_mmx (cpu_state_t * state)	/* restore hook for x86 builds; state is not used */
+{
+    emms ();	/* emms() is not defined in this file - presumably the EMMS instruction from xine_mmx.h; confirm there */
+}
+#endif
+
+#if defined (ARCH_PPC) && defined (ENABLE_ALTIVEC)
+
+#ifndef HOST_OS_DARWIN
+
+static void state_save_altivec (cpu_state_t * state)	/* save hook: stores v20..v31 at offsets 0..176 from the address in r3 */
+{
+    asm ("						\n"	/* NOTE(review): no operand or clobber list - relies on `state` arriving in r3 and on r9/r11 being free; confirm for the target ABI */
+	"	li		%r9,  16		\n"	/* r9 and r11 alternate as offset registers */
+	"	stvx		%v20, 0,    %r3		\n"	/* first store: offset 0 */
+	"	li		%r11, 32		\n"
+	"	stvx		%v21, %r9,  %r3		\n"
+	"	li		%r9,  48		\n"
+	"	stvx		%v22, %r11, %r3		\n"
+	"	li		%r11, 64		\n"
+	"	stvx		%v23, %r9,  %r3		\n"
+	"	li		%r9,  80		\n"
+	"	stvx		%v24, %r11, %r3		\n"
+	"	li		%r11, 96		\n"
+	"	stvx		%v25, %r9,  %r3		\n"
+	"	li		%r9,  112		\n"
+	"	stvx		%v26, %r11, %r3		\n"
+	"	li		%r11, 128		\n"
+	"	stvx		%v27, %r9,  %r3		\n"
+	"	li		%r9,  144		\n"
+	"	stvx		%v28, %r11, %r3		\n"
+	"	li		%r11, 160		\n"
+	"	stvx		%v29, %r9,  %r3		\n"
+	"	li		%r9,  176		\n"
+	"	stvx		%v30, %r11, %r3		\n"
+	"	stvx		%v31, %r9,  %r3		\n"	/* last store: offset 176 */
+	 );
+}
+
+static void state_restore_altivec (cpu_state_t * state)	/* restore hook: reloads v20..v31 from offsets 0..176 from the address in r3 */
+{
+    asm ("						\n"	/* NOTE(review): no operand or clobber list - relies on `state` arriving in r3 and on r9/r11 being free; confirm for the target ABI */
+	"	li		%r9,  16		\n"	/* r9 and r11 alternate as offset registers */
+	"	lvx		%v20, 0,    %r3		\n"	/* first load: offset 0 */
+	"	li		%r11, 32		\n"
+	"	lvx		%v21, %r9,  %r3		\n"
+	"	li		%r9,  48		\n"
+	"	lvx		%v22, %r11, %r3		\n"
+	"	li		%r11, 64		\n"
+	"	lvx		%v23, %r9,  %r3		\n"
+	"	li		%r9,  80		\n"
+	"	lvx		%v24, %r11, %r3		\n"
+	"	li		%r11, 96		\n"
+	"	lvx		%v25, %r9,  %r3		\n"
+	"	li		%r9,  112		\n"
+	"	lvx		%v26, %r11, %r3		\n"
+	"	li		%r11, 128		\n"
+	"	lvx		%v27, %r9,  %r3		\n"
+	"	li		%r9,  144		\n"
+	"	lvx		%v28, %r11, %r3		\n"
+	"	li		%r11, 160		\n"
+	"	lvx		%v29, %r9,  %r3		\n"
+	"	li		%r9,  176		\n"
+	"	lvx		%v30, %r11, %r3		\n"
+	"	lvx		%v31, %r9,  %r3		\n"	/* last load: offset 176 */
+	 );
+}
+
+#else /* HOST_OS_DARWIN */
+
+#define LI(a,b) "li r" #a "," #b "\n\t"	/* emits "li rA,B": load immediate B into GPR A */
+#define STVX0(a,b,c) "stvx v" #a ",0,r" #c "\n\t"	/* emits "stvx vA,0,rC"; argument b is ignored */
+#define STVX(a,b,c) "stvx v" #a ",r" #b ",r" #c "\n\t"	/* emits "stvx vA,rB,rC" */
+#define LVX0(a,b,c) "lvx v" #a ",0,r" #c "\n\t"	/* emits "lvx vA,0,rC"; argument b is ignored */
+#define LVX(a,b,c) "lvx v" #a ",r" #b ",r" #c "\n\t"	/* emits "lvx vA,rB,rC" */
+
+static void state_save_altivec (cpu_state_t * state)	/* Darwin spelling of the save hook: same sequence, register names without '%' */
+{
+    asm (LI (9, 16)	/* NOTE(review): no operand or clobber list - relies on `state` arriving in r3; confirm for the target ABI */
+	 STVX0 (20, 0, 3)	/* first store: v20 at offset 0 */
+	 LI (11, 32)
+	 STVX (21, 9, 3)
+	 LI (9, 48)
+	 STVX (22, 11, 3)
+	 LI (11, 64)
+	 STVX (23, 9, 3)
+	 LI (9, 80)
+	 STVX (24, 11, 3)
+	 LI (11, 96)
+	 STVX (25, 9, 3)
+	 LI (9, 112)
+	 STVX (26, 11, 3)
+	 LI (11, 128)
+	 STVX (27, 9, 3)
+	 LI (9, 144)
+	 STVX (28, 11, 3)
+	 LI (11, 160)
+	 STVX (29, 9, 3)
+	 LI (9, 176)
+	 STVX (30, 11, 3)
+	 STVX (31, 9, 3));	/* last store: v31 at offset 176 */
+}
+
+static void state_restore_altivec (cpu_state_t * state)	/* Darwin spelling of the restore hook: same sequence, register names without '%' */
+{
+    asm (LI (9, 16)	/* NOTE(review): no operand or clobber list - relies on `state` arriving in r3; confirm for the target ABI */
+	 LVX0 (20, 0, 3)	/* first load: v20 from offset 0 */
+	 LI (11, 32)
+	 LVX (21, 9, 3)
+	 LI (9, 48)
+	 LVX (22, 11, 3)
+	 LI (11, 64)
+	 LVX (23, 9, 3)
+	 LI (9, 80)
+	 LVX (24, 11, 3)
+	 LI (11, 96)
+	 LVX (25, 9, 3)
+	 LI (9, 112)
+	 LVX (26, 11, 3)
+	 LI (11, 128)
+	 LVX (27, 9, 3)
+	 LI (9, 144)
+	 LVX (28, 11, 3)
+	 LI (11, 160)
+	 LVX (29, 9, 3)
+	 LI (9, 176)
+	 LVX (30, 11, 3)
+	 LVX (31, 9, 3));	/* last load: v31 from offset 176 */
+}
+#endif /* HOST_OS_DARWIN */
+
+#endif /* defined (ARCH_PPC) && defined (ENABLE_ALTIVEC) */
+
+/* Install the CPU state save/restore hooks that match the mm_accel capability bits. */
+void mpeg2_cpu_state_init (uint32_t mm_accel) {
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+    /* MMX installs a restore hook only; the save hook is left as it is */
+    if ((mm_accel & MM_ACCEL_X86_MMX) != 0)
+	mpeg2_cpu_state_restore = state_restore_mmx;
+#endif
+#if defined (ARCH_PPC) && defined (ENABLE_ALTIVEC)
+    if ((mm_accel & MM_ACCEL_PPC_ALTIVEC) != 0) {
+	mpeg2_cpu_state_restore = state_restore_altivec;
+	mpeg2_cpu_state_save = state_save_altivec;
+    }
+#endif
+}
+
diff --git a/src/video_dec/libmpeg2/decode.c b/src/video_dec/libmpeg2/decode.c
new file mode 100644
index 000000000..848d111fc
--- /dev/null
+++ b/src/video_dec/libmpeg2/decode.c
@@ -0,0 +1,1006 @@
+/*
+ * decode.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * xine-specific version by G. Bartsch
+ *
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <string.h>	/* memcpy/memset, try to remove */
+#include <stdlib.h>
+#include <inttypes.h>
+#include <math.h>
+
+#define LOG_MODULE "decode"
+#define LOG_VERBOSE
+/*
+#define LOG
+*/
+
+#ifdef HAVE_FFMPEG_AVUTIL_H
+#  include <mem.h>
+#else
+#  include <libavutil/mem.h>
+#endif
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+
+#include "mpeg2.h"
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+#include "libmpeg2_accel.h"
+
+/*
+#define LOG_PAN_SCAN
+*/
+
+/* chunk buffer capacity in bytes; the older value was (224 * 1024) */
+#define BUFFER_SIZE (1194 * 1024) /* new buffer size for mpeg2dec 0.2.1 */
+/* handler for user-data (0xb2) chunks; called from parse_chunk */
+static void process_userdata(mpeg2dec_t *mpeg2dec, uint8_t *buffer);
+
+/* Prepare a decoder instance for the given output port; global tables are set up by the first call only. */
+void mpeg2_init (mpeg2dec_t * mpeg2dec, xine_video_port_t * output)
+{
+  static int do_init = 1;
+
+  if (do_init) {
+    uint32_t mm_accel = xine_mm_accel();
+    do_init = 0;
+    mpeg2_cpu_state_init (mm_accel);
+    mpeg2_idct_init (mm_accel);
+    mpeg2_mc_init (mm_accel);
+    libmpeg2_accel_scan(&mpeg2dec->accel, mpeg2_scan_norm, mpeg2_scan_alt);
+  }
+
+  /* buffers are kept across calls; allocate only what is missing */
+  if (mpeg2dec->chunk_buffer == NULL)
+    mpeg2dec->chunk_buffer = av_mallocz(BUFFER_SIZE + 4);
+  if (mpeg2dec->picture == NULL)
+    mpeg2dec->picture = av_mallocz(sizeof(picture_t));
+
+  /* chunk parser state */
+  mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+  mpeg2dec->shift = 0xffffff00;
+  mpeg2dec->code = 0xb4;
+  mpeg2dec->in_slice = 0;
+  mpeg2dec->seek_mode = 0;
+  /* sequence and frame bookkeeping */
+  mpeg2dec->output = output;
+  mpeg2dec->new_sequence = 0;
+  mpeg2dec->is_sequence_needed = 1;
+  mpeg2dec->is_wait_for_ip_frames = 2;
+  mpeg2dec->frames_to_drop = 0;
+  mpeg2dec->drop_frame = 0;
+
+  /* initialize AFD storage: "reported" starts out different from "seen" */
+  mpeg2dec->afd_value_seen = XINE_VIDEO_AFD_NOT_PRESENT;
+  mpeg2dec->afd_value_reported = (XINE_VIDEO_AFD_NOT_PRESENT - 1);
+  mpeg2_header_state_init (mpeg2dec->picture);
+
+  if (output->get_capabilities(output) & VO_CAP_XXMC) {
+    printf("libmpeg2: output port has XxMC capability\n");
+    mpeg2dec->frame_format = XINE_IMGFMT_XXMC;
+  } else if (output->get_capabilities(output) & VO_CAP_XVMC_MOCOMP) {
+    printf("libmpeg2: output port has XvMC capability\n");
+    mpeg2dec->frame_format = XINE_IMGFMT_XVMC;
+  } else {
+    mpeg2dec->frame_format = XINE_IMGFMT_YV12;
+  }
+}
+
+/* Derive frame->duration from the frame rate code and the repeat-first-field flags. */
+static inline void get_frame_duration (mpeg2dec_t * mpeg2dec, vo_frame_t *frame)
+{
+  /* nominal frame durations, indexed by frame_rate_code (3600 units at 25 fps) */
+  static const double durations[] = {
+    0,		/* invalid */
+    3753.75,	/* 23.976 fps */
+    3750,	/* 24 fps */
+    3600,	/* 25 fps */
+    3003,	/* 29.97 fps */
+    3000,	/* 30 fps */
+    1800,	/* 50 fps */
+    1501.5,	/* 59.94 fps */
+    1500,	/* 60 fps */
+  };
+  const picture_t * pic = mpeg2dec->picture;
+  const unsigned rate_code = (unsigned) pic->frame_rate_code;
+  double duration = 0;
+  int is_pulldown;
+
+  /* codes above 8 have no table entry and keep the duration at 0 */
+  if (rate_code <= 8u)
+    duration = durations[rate_code];
+  duration = duration * (pic->frame_rate_ext_n + 1.0) /
+			(pic->frame_rate_ext_d + 1.0);
+
+  /* push the newest repeat_first_field flag into the pattern history */
+  mpeg2dec->rff_pattern <<= 1;
+  mpeg2dec->rff_pattern |= !!frame->repeat_first_field;
+  is_pulldown = (mpeg2dec->rff_pattern & 0xff) == 0xaa || (mpeg2dec->rff_pattern & 0xff) == 0x55;
+
+  if (is_pulldown && !pic->progressive_sequence) {
+    /* special case for ntsc 3:2 pulldown */
+    duration *= 5.0 / 4.0;
+  } else if (frame->repeat_first_field) {
+    if (pic->progressive_sequence) {
+      /* for progressive sequences the output should repeat the
+         frame 1 or 2 times depending on top_field_first flag. */
+      duration *= (frame->top_field_first) ? 3 : 2;
+    } else if (frame->progressive_frame) {
+      /* decoder should output 3 fields, so adjust duration to
+         count on this extra field time */
+      duration *= 3.0 / 2.0;
+    }
+  }
+
+  frame->duration = (int) ceil (duration);
+  _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, frame->duration);
+}
+
+/* Display aspect ratio (width / height) of the current sequence. */
+static double get_aspect_ratio(mpeg2dec_t *mpeg2dec)
+{
+  /* MPEG-1 pel aspect ratios, indexed by aspect_ratio_information */
+  static const double mpeg1_pel_ratio[16] = {1.0 /* forbidden */,
+    1.0, 0.6735, 0.7031, 0.7615, 0.8055, 0.8437, 0.8935, 0.9157,
+    0.9815, 1.0255, 1.0695, 1.0950, 1.1575, 1.2015, 1.0 /*reserved*/ };
+  const picture_t * picture = mpeg2dec->picture;
+  unsigned int code = (unsigned int) picture->aspect_ratio_information;
+  /* shape of the coded frame: the result when pixels are square */
+  double ratio = (double)picture->coded_picture_width/(double)picture->coded_picture_height;
+
+  /* TODO: For slower machines the value of this function should be computed
+   *       once and cached!
+   */
+
+  if( picture->mpeg1 ) {
+    /* mpeg1 constants refer to pixel aspect ratio; a code outside the
+     * table is treated like the 1.0 entries instead of reading past it */
+    if (code < 16u)
+      ratio /= mpeg1_pel_ratio[code];
+    return ratio;
+  }
+
+  /* these hardcoded values are defined on mpeg2 standard for
+   * aspect ratio. other values are reserved or forbidden.  */
+  switch (code) {
+  case 2:
+    return 4.0/3.0;
+  case 3:
+    return 16.0/9.0;
+  case 4:
+    return 2.21/1.0;	/* 2.21:1 */
+  case 1:	/* square pixels */
+  default:
+    return ratio;
+  }
+}
+
+/* Publish width, height, aspect ratio, nominal frame duration and codec name as stream info. */
+static void remember_metainfo (mpeg2dec_t *mpeg2dec) {
+  picture_t * picture = mpeg2dec->picture;
+
+  _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_WIDTH, picture->display_width);
+  _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, picture->display_height);
+  _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_RATIO,
+    (int) ((double)10000 * get_aspect_ratio(mpeg2dec)));
+
+  switch (mpeg2dec->picture->frame_rate_code) {
+  case 1: /* 23.976 fps: 3753.75 rounded up, as get_frame_duration does */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 3754);
+    break;
+  case 2: /* 24 fps */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 3750);
+    break;
+  case 3: /* 25 fps */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 3600);
+    break;
+  case 4: /* 29.97 fps */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 3003);
+    break;
+  case 5: /* 30 fps */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 3000);
+    break;
+  case 6: /* 50 fps */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 1800);
+    break;
+  case 7: /* 59.94 fps: 1501.5 rounded up, as get_frame_duration does */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 1502);
+    break;
+  case 8: /* 60 fps */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 1500);
+    break;
+  default:
+    /* invalid/unknown frame rate code:
+     * report the 30 fps duration */
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_FRAME_DURATION, 3000);
+  }
+
+  _x_meta_info_set_utf8(mpeg2dec->stream, XINE_META_INFO_VIDEOCODEC, "MPEG (libmpeg2)");
+}
+
+/*
+ * parse_chunk: act on one complete chunk, i.e. the payload that followed start
+ * code `code`. buffer points at the payload and next_code is the start code
+ * that terminated it (only used to tell MPEG-1 from MPEG-2 after 0xb3).
+ * A picture ends when a non-slice code (0x00 or >= 0xb0) arrives while
+ * in_slice is set; a finished frame may then be handed to draw(). Header codes
+ * update the picture state, the other codes below 0xb0 are decoded as slices.
+ * Returns non-zero when this chunk ended a picture, 0 otherwise.
+ */
+static inline int parse_chunk (mpeg2dec_t * mpeg2dec, int code,
+			       uint8_t * buffer, int next_code)
+{
+    picture_t * picture;
+    int is_frame_done;
+    double ratio;
+
+    /* wait for sequence_header_code */
+    if (mpeg2dec->is_sequence_needed) {
+      if (code != 0xb3) {
+	/* no sequence header yet: discard the chunk and any pending pts */
+	mpeg2dec->pts = 0;
+	return 0;
+      }
+    }
+    if (mpeg2dec->is_frame_needed) {
+      /* waiting for a frame start: the frame held now is flagged as bad */
+      mpeg2dec->pts = 0;
+      if (mpeg2dec->picture->current_frame)
+        mpeg2dec->picture->current_frame->bad_frame = 1;
+    }
+
+    mpeg2_stats (code, buffer);
+
+    picture = mpeg2dec->picture;
+    is_frame_done = mpeg2dec->in_slice && ((!code) || (code >= 0xb0));
+
+    if (is_frame_done)
+	mpeg2dec->in_slice = 0;
+
+    if (is_frame_done && picture->current_frame != NULL) {
+
+	libmpeg2_accel_frame_completion(&mpeg2dec->accel, mpeg2dec->frame_format,
+					picture, code);
+
+	if (((picture->picture_structure == FRAME_PICTURE) ||
+	     (picture->second_field)) ) {
+
+	  if (mpeg2dec->drop_frame)
+	    picture->current_frame->bad_frame = 1;
+
+	  if (picture->picture_coding_type == B_TYPE) {
+	    if( picture->current_frame && !picture->current_frame->drawn ) {
+
+	      /* hack against wrong mpeg1 pts */
+	      if (picture->mpeg1)
+	        picture->current_frame->pts = 0;
+
+	      get_frame_duration(mpeg2dec, picture->current_frame);
+	      mpeg2dec->frames_to_drop = picture->current_frame->draw (picture->current_frame, mpeg2dec->stream);
+	      picture->current_frame->drawn = 1;
+	    }
+	  } else if (picture->forward_reference_frame && !picture->forward_reference_frame->drawn) {
+	    get_frame_duration(mpeg2dec, picture->forward_reference_frame);
+	    mpeg2dec->frames_to_drop = picture->forward_reference_frame->draw (picture->forward_reference_frame,
+									       mpeg2dec->stream);
+	    picture->forward_reference_frame->drawn = 1;
+	  }
+	}
+    }
+
+    switch (code) {
+    case 0x00:	/* picture_start_code */
+	if (mpeg2_header_picture (picture, buffer)) {
+	    fprintf (stderr, "bad picture header\n");
+	    abort();	/* NOTE(review): terminates the whole process on a malformed stream */
+	}
+
+	mpeg2dec->is_frame_needed=0;
+
+	if (!picture->second_field) {
+	  /* find out if we want to skip this frame */
+	  mpeg2dec->drop_frame = 0;
+
+	  switch (picture->picture_coding_type) {
+	  case B_TYPE:
+
+	    lprintf ("B-Frame\n");
+
+	    if (mpeg2dec->frames_to_drop>1) {
+	      lprintf ("dropping b-frame because frames_to_drop==%d\n",
+		       mpeg2dec->frames_to_drop);
+	      mpeg2dec->drop_frame = 1;
+	    } else if (!picture->forward_reference_frame || picture->forward_reference_frame->bad_frame
+		       || !picture->backward_reference_frame || picture->backward_reference_frame->bad_frame) {
+#ifdef LOG
+	      printf ("libmpeg2: dropping b-frame because ref is bad (");
+	      if (picture->forward_reference_frame)
+		printf ("fw ref frame %d, bad %d;", picture->forward_reference_frame->id,
+			picture->forward_reference_frame->bad_frame);
+	      else
+		printf ("fw ref frame not there;");
+	      if (picture->backward_reference_frame)
+		printf ("bw ref frame %d, bad %d)\n", picture->backward_reference_frame->id,
+			picture->backward_reference_frame->bad_frame);
+	      else
+		printf ("bw ref frame not there)\n");
+#endif
+	      mpeg2dec->drop_frame = 1;
+	    } else if (mpeg2dec->is_wait_for_ip_frames > 0) {
+	      lprintf("dropping b-frame because refs are invalid\n");
+	      mpeg2dec->drop_frame = 1;
+	    }
+	    break;
+
+	  case P_TYPE:
+
+	    lprintf ("P-Frame\n");
+
+	    if (mpeg2dec->frames_to_drop>2) {
+	      mpeg2dec->drop_frame = 1;
+	      lprintf ("dropping p-frame because frames_to_drop==%d\n",
+		       mpeg2dec->frames_to_drop);
+	    } else if (!picture->backward_reference_frame || picture->backward_reference_frame->bad_frame) {
+	      mpeg2dec->drop_frame = 1;
+#ifdef LOG
+	      if (!picture->backward_reference_frame)
+		printf ("libmpeg2: dropping p-frame because no ref frame\n");
+	      else
+		printf ("libmpeg2: dropping p-frame because ref %d is bad\n", picture->backward_reference_frame->id);
+#endif
+	    } else if (mpeg2dec->is_wait_for_ip_frames > 1) {
+	      lprintf("dropping p-frame because ref is invalid\n");
+	      mpeg2dec->drop_frame = 1;
+	    } else if (mpeg2dec->is_wait_for_ip_frames)
+	      mpeg2dec->is_wait_for_ip_frames--;
+
+	    break;
+
+	  case I_TYPE:
+	    lprintf ("I-Frame\n");
+	    /* for the sake of dvd menus, never drop i-frames */
+
+	    if (mpeg2dec->is_wait_for_ip_frames)
+	      mpeg2dec->is_wait_for_ip_frames--;
+
+	    break;
+	  }
+	}
+
+	break;
+
+    case 0xb2: /* user data code */
+        process_userdata(mpeg2dec, buffer);
+        break;
+
+    case 0xb3:	/* sequence_header_code */
+	if (mpeg2_header_sequence (picture, buffer)) {
+	    fprintf (stderr, "bad sequence header\n");
+	    break;
+	}
+
+        /* reset AFD value to detect absence */
+        mpeg2dec->afd_value_seen = XINE_VIDEO_AFD_NOT_PRESENT;
+
+        /* according to ISO/IEC 13818-2, an extension start code will follow.
+         * Otherwise the stream follows ISO/IEC 11172-2 which means MPEG1 */
+        picture->mpeg1 = (next_code != 0xb5);
+
+	if (mpeg2dec->force_aspect) picture->aspect_ratio_information = mpeg2dec->force_aspect;
+
+	if (mpeg2dec->is_sequence_needed ) {
+	    mpeg2dec->new_sequence = 1;
+	}
+
+	if (mpeg2dec->is_sequence_needed
+	    || (picture->aspect_ratio_information != picture->saved_aspect_ratio)
+	    || (picture->frame_width != picture->coded_picture_width)
+	    || (picture->frame_height != picture->coded_picture_height)) {
+	    xine_event_t event;
+	    xine_format_change_data_t data;
+
+	    remember_metainfo (mpeg2dec);
+	    event.type = XINE_EVENT_FRAME_FORMAT_CHANGE;
+	    event.stream = mpeg2dec->stream;
+	    event.data = &data;
+	    event.data_length = sizeof(data);
+	    data.width = picture->coded_picture_width;
+	    data.height = picture->coded_picture_height;
+	    data.aspect = picture->aspect_ratio_information;
+	    data.pan_scan = mpeg2dec->force_pan_scan;
+	    xine_event_send(mpeg2dec->stream, &event);
+
+            _x_stream_info_set(mpeg2dec->stream,XINE_STREAM_INFO_VIDEO_WIDTH,
+              picture->display_width);
+	    _x_stream_info_set(mpeg2dec->stream,XINE_STREAM_INFO_VIDEO_HEIGHT,
+              picture->display_height);
+
+	    if (picture->forward_reference_frame &&
+		picture->forward_reference_frame != picture->current_frame &&
+		picture->forward_reference_frame != picture->backward_reference_frame)
+	      picture->forward_reference_frame->free (picture->forward_reference_frame);
+
+	    if (picture->backward_reference_frame &&
+		picture->backward_reference_frame != picture->current_frame)
+	      picture->backward_reference_frame->free (picture->backward_reference_frame);
+
+	    mpeg2dec->is_sequence_needed = 0;
+	    picture->forward_reference_frame = NULL;
+	    picture->backward_reference_frame = NULL;
+
+	    picture->frame_width = picture->coded_picture_width;
+	    picture->frame_height = picture->coded_picture_height;
+	    picture->saved_aspect_ratio = picture->aspect_ratio_information;
+	}
+	break;
+
+    case 0xb5:	/* extension_start_code */
+	if (mpeg2_header_extension (picture, buffer)) {
+	    fprintf (stderr, "bad extension\n");
+	    abort();	/* NOTE(review): terminates the whole process on a malformed stream */
+	}
+	break;
+
+    case 0xb7:	/* sequence end code */
+        mpeg2_flush(mpeg2dec);
+        mpeg2dec->is_sequence_needed = 1;
+        break;
+
+    case 0xb8:	/* group of pictures start code */
+	if (mpeg2_header_group_of_pictures (picture, buffer)) {
+	  printf ("libmpeg2: bad group of pictures\n");
+	  abort();	/* NOTE(review): terminates the whole process on a malformed stream */
+	}	/* fall through: 0xb8 leaves via the (code >= 0xb0) break below */
+    default:
+        if ((code >= 0xb9) && (code != 0xe4)) {
+	  printf("Not multiplexed? 0x%x\n",code);
+	}
+	if (code >= 0xb0)
+	    break;
+
+        /* check for AFD change once per picture */
+        if (mpeg2dec->afd_value_reported != mpeg2dec->afd_value_seen) {
+            /* AFD data should better be stored in current_frame to have it */
+            /* ready and synchronous with other data like width or height. */
+            /* An AFD change should then be detected when a new frame is emitted */
+            /* from the decoder to report the AFD change in display order and not */
+            /* in decoding order like it happens below for now. */
+            _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_AFD, mpeg2dec->afd_value_seen);
+            lprintf ("AFD changed from %d to %d\n", mpeg2dec->afd_value_reported, mpeg2dec->afd_value_seen);
+            mpeg2dec->afd_value_reported = mpeg2dec->afd_value_seen;
+        }
+
+	if (!(mpeg2dec->in_slice)) {
+	    mpeg2dec->in_slice = 1;
+
+	    if (picture->second_field) {
+	      if (picture->current_frame)
+		picture->current_frame->field(picture->current_frame,
+					      picture->picture_structure);
+	      else
+		mpeg2dec->drop_frame = 1;
+	    } else {
+		int flags = picture->picture_structure;
+
+		if (!picture->mpeg1) flags |= VO_INTERLACED_FLAG;
+		if (mpeg2dec->force_pan_scan) flags |= VO_PAN_SCAN_FLAG;
+		if (mpeg2dec->new_sequence) flags |= VO_NEW_SEQUENCE_FLAG;
+
+		if ( picture->current_frame &&
+		     picture->current_frame != picture->backward_reference_frame &&
+		     picture->current_frame != picture->forward_reference_frame ) {
+			picture->current_frame->free (picture->current_frame);
+		}
+		if (picture->picture_coding_type == B_TYPE) {
+                    ratio = get_aspect_ratio(mpeg2dec);
+		    picture->current_frame =
+		        mpeg2dec->stream->video_out->get_frame (mpeg2dec->stream->video_out,
+						     picture->coded_picture_width,
+						     picture->coded_picture_height,
+						     ratio,
+						     mpeg2dec->frame_format,
+						     flags);
+		    libmpeg2_accel_new_frame( &mpeg2dec->accel, mpeg2dec->frame_format,
+					      picture, ratio, flags);
+		} else {
+                    ratio = get_aspect_ratio(mpeg2dec);
+		    picture->current_frame =
+		        mpeg2dec->stream->video_out->get_frame (mpeg2dec->stream->video_out,
+						     picture->coded_picture_width,
+						     picture->coded_picture_height,
+						     ratio,
+						     mpeg2dec->frame_format,
+						     flags);
+
+		    libmpeg2_accel_new_frame( &mpeg2dec->accel, mpeg2dec->frame_format,
+					      picture, ratio, flags);
+
+		    if (picture->forward_reference_frame &&
+		        picture->forward_reference_frame != picture->backward_reference_frame)
+		      picture->forward_reference_frame->free (picture->forward_reference_frame);
+
+		    picture->forward_reference_frame =
+			picture->backward_reference_frame;
+		    picture->backward_reference_frame = picture->current_frame;
+		}
+
+		if(mpeg2dec->new_sequence)
+		    mpeg2dec->new_sequence =
+			libmpeg2_accel_new_sequence(&mpeg2dec->accel, mpeg2dec->frame_format,
+						    picture);
+
+		picture->current_frame->bad_frame          = 1;
+		picture->current_frame->drawn              = 0;
+		picture->current_frame->pts                = mpeg2dec->pts;
+                picture->current_frame->top_field_first    = picture->top_field_first;
+                picture->current_frame->repeat_first_field = picture->repeat_first_field;
+                picture->current_frame->progressive_frame  = picture->progressive_frame;
+                picture->current_frame->crop_right         = picture->coded_picture_width - picture->display_width;
+                picture->current_frame->crop_bottom        = picture->coded_picture_height - picture->display_height;
+
+                switch( picture->picture_coding_type ) {
+                  case I_TYPE:
+                    picture->current_frame->picture_coding_type = XINE_PICT_I_TYPE;
+                    break;
+                  case P_TYPE:
+                    picture->current_frame->picture_coding_type = XINE_PICT_P_TYPE;
+                    break;
+                  case B_TYPE:
+                    picture->current_frame->picture_coding_type = XINE_PICT_B_TYPE;
+                    break;
+                  case D_TYPE:
+                    picture->current_frame->picture_coding_type = XINE_PICT_D_TYPE;
+                    break;
+                }
+
+		lprintf ("decoding frame %d, type %s\n",
+			 picture->current_frame->id, picture->picture_coding_type == I_TYPE ? "I" :
+			 picture->picture_coding_type == P_TYPE ? "P" : "B");
+		mpeg2dec->pts = 0;
+	    }
+	}
+
+	if (!mpeg2dec->drop_frame && picture->current_frame != NULL) {
+#ifdef DEBUG_LOG
+	  printf("slice target %p past %p future %p\n", (void *) picture->current_frame, (void *) picture->forward_reference_frame, (void *) picture->backward_reference_frame);
+	  fflush(stdout);
+#endif
+	  libmpeg2_accel_slice(&mpeg2dec->accel, picture, code, buffer, mpeg2dec->chunk_size,
+			       mpeg2dec->chunk_buffer);
+
+	  if( picture->v_offset > picture->limit_y ||
+	      picture->v_offset + 16 > picture->display_height ) {
+	    picture->current_frame->bad_frame = 0;
+	  }
+	}
+    }
+
+    return is_frame_done;
+}
+
+static inline int find_start_code (mpeg2dec_t * mpeg2dec,	/* scan [*current, limit) for a 00 00 01 start-code prefix */
+                                   uint8_t ** current, uint8_t * limit)	/* returns 1 with *current just past the prefix, else 0 */
+{
+    uint8_t * p;
+    /* shift keeps the most recent bytes across calls; 0x00000100 means 00 00 01 was just seen (assuming a 32-bit field - TODO confirm) */
+    if (*current >= limit)
+	return 0;
+    if (mpeg2dec->shift == 0x00000100)
+	return 1;
+    /* feed up to two bytes through shift, so a prefix that started in earlier input is still recognised */
+    mpeg2dec->shift = (mpeg2dec->shift | *(*current)++) << 8;
+
+    if (*current >= limit)
+	return 0;
+    if (mpeg2dec->shift == 0x00000100)
+	return 1;
+
+    mpeg2dec->shift = (mpeg2dec->shift | *(*current)++) << 8;
+
+    if (*current >= limit)
+	return 0;
+    if (mpeg2dec->shift == 0x00000100)
+	return 1;
+    /* stop the scan one byte early: after a hit *current must still point at a readable code byte */
+    limit--;
+
+    if (*current >= limit) {
+	mpeg2dec->shift = (mpeg2dec->shift | *(*current)++) << 8;
+	return 0;
+    }
+    /* bulk scan: find each 0x01 and test whether the two bytes before it are zero */
+    p = *current;
+
+    while (p < limit && (p = (uint8_t *)memchr(p, 0x01, limit - p))) {
+	if (p[-2] || p[-1])
+	    p += 3;	/* this 0x01 is non-zero itself, so no prefix can end before p + 3 */
+	else {
+	    *current = ++p;
+	    return 1;
+	}
+    }
+    /* nothing found: consume up to the original limit and reload shift from the last three bytes */
+    *current = ++limit;
+    p = limit - 3;
+    mpeg2dec->shift = (mpeg2dec->shift | *p++) << 8;
+    mpeg2dec->shift = (mpeg2dec->shift | *p++) << 8;
+    mpeg2dec->shift = (mpeg2dec->shift | *p++) << 8;
+
+    return 0;
+}
+
+static inline uint8_t * copy_chunk (mpeg2dec_t * mpeg2dec,	/* append input to chunk_buffer up to the next start code */
+				    uint8_t * current, uint8_t * end)	/* returns the new read position, or NULL when [current, end) ran out first */
+{
+    uint8_t * limit;
+    uint8_t * data = current;
+    int found, bite;
+
+    /* sequence end code 0xb7 doesn't have any data and there might be the case
+     * that no start code will follow this code for quite some time (e. g. in case
+     * of a still image).
+     * Therefore, return immediately with a chunk_size of 0. Setting code to 0xb4
+     * will eat up any trailing garbage next time.
+     */
+    if (mpeg2dec->code == 0xb7) {
+	mpeg2dec->code = 0xb4;
+	mpeg2dec->chunk_size = 0;
+	return current;
+    }
+    /* never scan further than what still fits between chunk_ptr and chunk_buffer + BUFFER_SIZE */
+    limit = current + (mpeg2dec->chunk_buffer + BUFFER_SIZE - mpeg2dec->chunk_ptr);
+    if (limit > end)
+	limit = end;
+    /* everything the scan consumed, prefix bytes included, is appended to the chunk buffer */
+    found = find_start_code(mpeg2dec, &current, limit);
+    bite = current - data;
+    if (bite) {
+	xine_fast_memcpy(mpeg2dec->chunk_ptr, data, bite);
+	mpeg2dec->chunk_ptr += bite;
+    }
+    /* chunk complete: the next byte is the new start code; chunk_size leaves out the 3 prefix bytes */
+    if (found) {
+	mpeg2dec->code = *current++;
+	mpeg2dec->chunk_size = mpeg2dec->chunk_ptr - mpeg2dec->chunk_buffer - 3;
+	mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+	mpeg2dec->shift = 0xffffff00;
+	return current;
+    }
+    /* input used up without a start code: the partial chunk stays buffered for the next call */
+    if (current == end)
+	return NULL;
+
+    /* we filled the chunk buffer without finding a start code */
+    mpeg2dec->code = 0xb4;	/* sequence_error_code */
+    mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+    return current;
+}
+
+/* Feed the bytes in [current, end) to the decoder; returns the sum of the parse_chunk results. */
+int mpeg2_decode_data (mpeg2dec_t * mpeg2dec, uint8_t * current, uint8_t * end,
+		       uint64_t pts)
+{
+    int frames_done = 0;
+
+    /* seek_mode is set by mpeg2_reset: restart chunk assembly and wait for a frame start */
+    if (mpeg2dec->seek_mode) {
+      mpeg2dec->seek_mode = 0;
+      mpeg2dec->is_frame_needed = 1;
+      mpeg2dec->shift = 0xffffff00;
+      mpeg2dec->code = 0xb4;
+      mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+    }
+
+    if (pts != 0)
+      mpeg2dec->pts = pts;
+
+    /* a pending sequence end code (0xb7) is handled even when no input is left */
+    while (current != end || mpeg2dec->code == 0xb7) {
+	const uint8_t code = mpeg2dec->code;
+	current = copy_chunk (mpeg2dec, current, end);
+	if (!current)
+	    break;
+	frames_done += parse_chunk (mpeg2dec, code, mpeg2dec->chunk_buffer, mpeg2dec->code);
+    }
+
+    libmpeg2_accel_frame_completion(&mpeg2dec->accel, mpeg2dec->frame_format,
+				    mpeg2dec->picture, 0xff);
+    return frames_done;
+}
+
+/* Clear the decoder pts and the pts of every frame still held; no-op without a picture. */
+void mpeg2_discontinuity (mpeg2dec_t * mpeg2dec) {
+  picture_t *pic = mpeg2dec->picture;
+
+  if (pic == NULL)
+    return;
+
+  mpeg2dec->pts = 0;
+  mpeg2dec->in_slice = 0;
+  if (pic->current_frame != NULL)
+    pic->current_frame->pts = 0;
+  if (pic->forward_reference_frame != NULL)
+    pic->forward_reference_frame->pts = 0;
+  if (pic->backward_reference_frame != NULL)
+    pic->backward_reference_frame->pts = 0;
+  libmpeg2_accel_discontinuity(&mpeg2dec->accel, mpeg2dec->frame_format, pic);
+}
+
+/* Reset the decoder: clear timestamps, invalidate (MPEG-2) or release (MPEG-1) frames, set seek_mode. */
+void mpeg2_reset (mpeg2dec_t * mpeg2dec) {
+  picture_t *pic = mpeg2dec->picture;
+
+  if (pic == NULL)
+    return;
+
+  mpeg2_discontinuity(mpeg2dec);
+
+  if (pic->mpeg1) {
+    /* to free reference frames one also needs to fix slice.c to
+     * abort when they are NULL. unfortunately it seems to break
+     * DVD menus.
+     *
+     * ...so let's do this for mpeg-1 only :)
+     */
+    if (pic->current_frame != NULL &&
+	pic->current_frame != pic->backward_reference_frame &&
+	pic->current_frame != pic->forward_reference_frame)
+      pic->current_frame->free (pic->current_frame);
+    pic->current_frame = NULL;
+
+    if (pic->forward_reference_frame != NULL &&
+	pic->forward_reference_frame != pic->backward_reference_frame)
+      pic->forward_reference_frame->free (pic->forward_reference_frame);
+    pic->forward_reference_frame = NULL;
+
+    if (pic->backward_reference_frame != NULL)
+      pic->backward_reference_frame->free (pic->backward_reference_frame);
+    pic->backward_reference_frame = NULL;
+  } else {
+    /* mpeg-2: the frames are kept, but two more I/P frames are awaited */
+    mpeg2dec->is_wait_for_ip_frames = 2;
+
+    /* mark current frames as bad so they won't make to screen */
+    if (pic->current_frame != NULL)
+      pic->current_frame->bad_frame = 1;
+    if (pic->forward_reference_frame != NULL)
+      pic->forward_reference_frame->bad_frame = 1;
+    if (pic->backward_reference_frame != NULL)
+      pic->backward_reference_frame->bad_frame = 1;
+  }
+
+  /* seek_mode makes the next mpeg2_decode_data call restart chunk assembly */
+  mpeg2dec->seek_mode = 1;
+  mpeg2dec->in_slice = 0;
+}
+
+/*
+ * Push the current frame to the output if it has not been drawn yet and
+ * is not flagged as bad.  The frame is marked drawn, gets its duration
+ * computed, has its pts cleared and is then drawn on the decoder's stream.
+ */
+void mpeg2_flush (mpeg2dec_t * mpeg2dec) {
+  picture_t *pic = mpeg2dec->picture;
+
+  if (pic == NULL || pic->current_frame == NULL)
+    return;
+
+  if (pic->current_frame->drawn || pic->current_frame->bad_frame)
+    return;
+
+  lprintf ("blasting out current frame %d on flush\n",
+           pic->current_frame->id);
+
+  pic->current_frame->drawn = 1;
+  get_frame_duration(mpeg2dec, pic->current_frame);
+
+  pic->current_frame->pts = 0;
+  pic->current_frame->draw(pic->current_frame, mpeg2dec->stream);
+}
+
+/*
+ * Shut the decoder down.
+ *
+ * Frames that were decoded but never drawn (current frame, backward
+ * reference frame) are drawn with a zero pts, every held frame is freed
+ * exactly once, the chunk and picture buffers are released and the closed
+ * caption decoder, if one was opened, is disposed.
+ *
+ * Like mpeg2_discontinuity(), mpeg2_reset() and mpeg2_flush(), the frame
+ * handling is skipped when the decoder has no picture.
+ */
+void mpeg2_close (mpeg2dec_t * mpeg2dec)
+{
+    picture_t *picture = mpeg2dec->picture;
+
+    /*
+    {
+	static uint8_t finalizer[] = {0,0,1,0xb4};
+	mpeg2_decode_data (mpeg2dec, finalizer, finalizer+4, 0);
+    }
+    */
+
+    /*
+      Do not remove any picture->*->free() below. Doing so will cause a
+      buffer leak, and we only have about 15 of them.
+    */
+
+    if (picture) {
+      if ( picture->current_frame ) {
+        if( !picture->current_frame->drawn ) {
+          lprintf ("blasting out current frame on close\n");
+          picture->current_frame->pts = 0;
+          get_frame_duration(mpeg2dec, picture->current_frame);
+          picture->current_frame->draw (picture->current_frame, mpeg2dec->stream);
+          picture->current_frame->drawn = 1;
+        }
+
+        /* a frame shared with a reference slot is freed through that slot */
+        if( picture->current_frame != picture->backward_reference_frame &&
+            picture->current_frame != picture->forward_reference_frame ) {
+          picture->current_frame->free (picture->current_frame);
+        }
+        picture->current_frame = NULL;
+      }
+
+      if (picture->forward_reference_frame &&
+          picture->forward_reference_frame != picture->backward_reference_frame) {
+        picture->forward_reference_frame->free (picture->forward_reference_frame);
+      }
+      /* cleared unconditionally (as in mpeg2_reset) so that no stale pointer
+       * survives when both reference slots hold the same frame */
+      picture->forward_reference_frame = NULL;
+
+      if (picture->backward_reference_frame) {
+        if( !picture->backward_reference_frame->drawn) {
+          lprintf ("blasting out backward reference frame on close\n");
+          picture->backward_reference_frame->pts = 0;
+          get_frame_duration(mpeg2dec, picture->backward_reference_frame);
+          picture->backward_reference_frame->draw (picture->backward_reference_frame, mpeg2dec->stream);
+          picture->backward_reference_frame->drawn = 1;
+        }
+        picture->backward_reference_frame->free (picture->backward_reference_frame);
+        picture->backward_reference_frame = NULL;
+      }
+    }
+
+    av_freep(&mpeg2dec->chunk_buffer);
+    av_freep(&mpeg2dec->picture_base);
+
+    if ( mpeg2dec->cc_dec) {
+      /* dispose the closed caption decoder */
+      mpeg2dec->cc_dec->dispose(mpeg2dec->cc_dec);
+      mpeg2dec->cc_dec = NULL;
+    }
+}
+
+/*
+ * Scan [current, end) for sequence headers (code 0xb3) and extensions
+ * (code 0xb5) without decoding any picture data.
+ *
+ * seek_mode is armed on entry.  Each chunk delivered by copy_chunk() is
+ * passed to mpeg2_stats(); sequence headers and extensions are parsed into
+ * the picture state.  The first good sequence header seen while
+ * is_sequence_needed is set also publishes the frame geometry: a
+ * XINE_EVENT_FRAME_FORMAT_CHANGE event is sent and the stream's video
+ * width/height info is updated.
+ */
+void mpeg2_find_sequence_header (mpeg2dec_t * mpeg2dec,
+				 uint8_t * current, uint8_t * end){
+
+  picture_t *pic = mpeg2dec->picture;
+  uint8_t chunk_code, following_code;
+  xine_event_t event;
+  xine_format_change_data_t data;
+
+  mpeg2dec->seek_mode = 1;
+
+  while (current != end) {
+    chunk_code = mpeg2dec->code;
+    current = copy_chunk (mpeg2dec, current, end);
+    if (!current)
+      return;
+    following_code = mpeg2dec->code;
+
+    mpeg2_stats (chunk_code, mpeg2dec->chunk_buffer);
+
+    if (chunk_code == 0xb5) {	/* extension_start_code */
+      if (mpeg2_header_extension (pic, mpeg2dec->chunk_buffer))
+        printf ("libmpeg2: bad extension\n");
+      continue;
+    }
+
+    if (chunk_code != 0xb3)	/* not a sequence_header_code either */
+      continue;
+
+    if (mpeg2_header_sequence (pic, mpeg2dec->chunk_buffer)) {
+      printf ("libmpeg2: bad sequence header\n");
+      continue;
+    }
+
+    /* According to ISO/IEC 13818-2 an extension start code follows the
+     * sequence header; without one the stream is ISO/IEC 11172-2,
+     * i.e. MPEG-1. */
+    pic->mpeg1 = (following_code != 0xb5);
+
+    if (mpeg2dec->force_aspect)
+      pic->aspect_ratio_information = mpeg2dec->force_aspect;
+
+    if (!mpeg2dec->is_sequence_needed)
+      continue;
+
+    mpeg2dec->new_sequence = 1;
+    mpeg2dec->is_sequence_needed = 0;
+
+    pic->frame_width  = pic->coded_picture_width;
+    pic->frame_height = pic->coded_picture_height;
+
+    remember_metainfo (mpeg2dec);
+
+    /* announce the coded frame geometry */
+    data.width    = pic->coded_picture_width;
+    data.height   = pic->coded_picture_height;
+    data.aspect   = pic->aspect_ratio_information;
+    data.pan_scan = mpeg2dec->force_pan_scan;
+
+    event.type        = XINE_EVENT_FRAME_FORMAT_CHANGE;
+    event.stream      = mpeg2dec->stream;
+    event.data        = &data;
+    event.data_length = sizeof(data);
+    xine_event_send(mpeg2dec->stream, &event);
+
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_WIDTH,
+                       pic->display_width);
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_HEIGHT,
+                       pic->display_height);
+  }
+}
+
+/*
+ * Find the end of the userdata field in an MPEG-2 stream.
+ *
+ * Returns a pointer to the first 00 00 01 byte triple at or after buffer.
+ * The scan has no upper bound: the caller must guarantee that such a
+ * triple exists in the memory following buffer.
+ */
+static uint8_t *find_end(uint8_t *buffer)
+{
+  uint8_t *p;
+
+  for (p = buffer; !(p[0] == 0 && p[1] == 0 && p[2] == 1); p++)
+    ;
+
+  return p;
+}
+
+/*
+ * Interpret one user data payload.
+ *
+ * Payloads starting with "CC" are closed captions: the CC SPU decoder is
+ * opened on first use (and told the current frame format), then handed the
+ * bytes following the "CC" tag up to the position found by find_end().
+ * Payloads starting with "DTG1" carry an Active Format Description, whose
+ * value is recorded in afd_value_seen.  Anything else is ignored.
+ */
+static void process_userdata(mpeg2dec_t *mpeg2dec, uint8_t *buffer)
+{
+  buf_element_t buf;
+
+  if (!(buffer[0] == 'C' && buffer[1] == 'C')) {
+    /* check Active Format Description ETSI TS 101 154 V1.5.1 */
+    if (buffer[0] == 0x44 && buffer[1] == 0x54 && buffer[2] == 0x47 && buffer[3] == 0x31) {
+      if (buffer[4] & 0x40)
+        mpeg2dec->afd_value_seen = buffer[5] & 0x0f;
+      else
+        mpeg2dec->afd_value_seen = XINE_VIDEO_AFD_NOT_PRESENT;
+    }
+    return;
+  }
+
+  /* user data denotes closed captions */
+  if (mpeg2dec->cc_dec == NULL) {
+    xine_event_t event;
+    xine_format_change_data_t data;
+
+    /* open the closed caption decoder first */
+    mpeg2dec->cc_dec = _x_get_spu_decoder(mpeg2dec->stream, (BUF_SPU_CC >> 16) & 0xff);
+
+    /* send a frame format event so that the CC decoder knows the initial image size */
+    data.width    = mpeg2dec->picture->coded_picture_width;
+    data.height   = mpeg2dec->picture->coded_picture_height;
+    data.aspect   = mpeg2dec->picture->aspect_ratio_information;
+    data.pan_scan = mpeg2dec->force_pan_scan;
+
+    event.type        = XINE_EVENT_FRAME_FORMAT_CHANGE;
+    event.stream      = mpeg2dec->stream;
+    event.data        = &data;
+    event.data_length = sizeof(data);
+    xine_event_send(mpeg2dec->stream, &event);
+
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_WIDTH,
+                       mpeg2dec->picture->display_width);
+    _x_stream_info_set(mpeg2dec->stream, XINE_STREAM_INFO_VIDEO_HEIGHT,
+                       mpeg2dec->picture->display_height);
+  }
+
+  /* opening the decoder may have failed */
+  if (mpeg2dec->cc_dec == NULL)
+    return;
+
+  buf.type = BUF_SPU_CC;
+  buf.content = &buffer[2];
+  buf.pts = mpeg2dec->pts;
+  buf.size = find_end(buffer) - &buffer[2];
+  buf.decoder_flags = 0;
+
+  mpeg2dec->cc_dec->decode_data(mpeg2dec->cc_dec, &buf);
+}
diff --git a/src/video_dec/libmpeg2/header.c b/src/video_dec/libmpeg2/header.c
new file mode 100644
index 000000000..0c2b76891
--- /dev/null
+++ b/src/video_dec/libmpeg2/header.c
@@ -0,0 +1,419 @@
+/*
+ * header.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+#define LOG_PAN_SCAN
+*/
+
+#include "config.h"
+
+#include <stdio.h>  /* For printf debugging */
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/attributes.h>
+
+/*
+ * Default intra quantizer matrix, listed in zig-zag scan order.
+ * mpeg2_header_sequence() copies it into the picture (through
+ * mpeg2_scan_norm[]) when the sequence header carries no intra matrix of
+ * its own.  Each source line below holds one anti-diagonal of the 8x8
+ * matrix.
+ */
+static const uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
+    8,
+    16, 16,
+    19, 16, 19,
+    22, 22, 22, 22,
+    22, 22, 26, 24, 26,
+    27, 27, 27, 26, 26, 26,
+    26, 27, 27, 27, 29, 29, 29,
+    34, 34, 34, 29, 29, 29, 27, 27,
+    29, 29, 32, 32, 34, 34, 37,
+    38, 37, 35, 35, 34, 35,
+    38, 38, 40, 40, 40,
+    48, 48, 46, 46,
+    56, 56, 58,
+    69, 69,
+    83
+};
+
+/*
+ * Zig-zag scan table: entry i is the row-major index (row * 8 + column)
+ * within the 8x8 block of the i-th coefficient in scan order.  The header
+ * parsers in this file use it to place quantizer matrix values, and it is
+ * the scan installed by mpeg2_header_state_init().
+ */
+uint8_t mpeg2_scan_norm[64] ATTR_ALIGN(16) =
+{
+    /* Zig-Zag scan pattern */
+     0, 1, 8,16, 9, 2, 3,10,
+    17,24,32,25,18,11, 4, 5,
+    12,19,26,33,40,48,41,34,
+    27,20,13, 6, 7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+/*
+ * Alternate scan table, same encoding as mpeg2_scan_norm[].
+ * picture_coding_extension() selects it when the alternate_scan bit of the
+ * extension is set.
+ */
+uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) =
+{
+    /* Alternate scan pattern */
+    0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49,
+    41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43,
+    51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45,
+    53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63
+};
+
+/*
+ * Read `count` bits, most significant bit first, starting at bit offset
+ * *bit_position of buffer, and advance *bit_position past them.
+ *
+ * count is expected to be between 1 and 32; a count of 0 yields 0 and
+ * leaves the position untouched.  Reading stops early once a byte at
+ * index 50 or beyond has been consumed.
+ */
+static uint32_t get_bits(uint8_t *buffer, uint32_t count, uint32_t *bit_position) {
+  uint32_t acc = 0;
+
+  while (count > 0) {
+    const uint32_t index = *bit_position >> 3;         /* byte holding the next bit */
+    const uint32_t avail = 8 - (*bit_position & 0x7);  /* unread bits left in that byte */
+    const uint32_t take  = (count < avail) ? count : avail;
+    const uint32_t drop  = avail - take;               /* low bits left for a later call */
+    const uint32_t bits  = (buffer[index] & ((1 << avail) - 1)) >> drop;
+
+    acc = (acc << take) | bits;
+    *bit_position += take;
+    count -= take;
+
+    if (index >= 50)
+      break;
+  }
+
+  return acc;
+}
+
+/*
+ * Read `count` bits like get_bits() and interpret them as a two's
+ * complement number of that width.
+ *
+ * count must be between 1 and 32.  A count of 0 returns 0 without touching
+ * *bit_position.
+ */
+static int32_t get_bits_signed(uint8_t *buffer, uint32_t count, uint32_t *bit_position) {
+  uint32_t value;
+  uint32_t sign_mask;
+
+  if (count == 0)
+    return 0;	/* no bits requested; also avoids a shift by -1 below */
+
+  value = get_bits(buffer, count, bit_position);
+
+  /* The sign bit and everything above it.  Built from an unsigned all-ones
+   * value because left-shifting a negative int is undefined behaviour. */
+  sign_mask = ~(uint32_t)0 << (count - 1);
+  if (value & sign_mask)
+    value |= sign_mask; /* sign-extend value */
+
+  return (int32_t)value;
+}
+
+/*
+ * Put the header-related picture state into its initial condition: both
+ * quantizer matrices are flagged for loading and the zig-zag scan is
+ * selected.
+ */
+void mpeg2_header_state_init (picture_t * picture)
+{
+    picture->load_non_intra_quantizer_matrix = 1;
+    picture->load_intra_quantizer_matrix = 1;
+    picture->scan = mpeg2_scan_norm;
+}
+
+/*
+ * Parse a sequence header chunk into `picture`.
+ *
+ * Fills in the display size, the coded size (display size rounded up to a
+ * multiple of 16), aspect ratio, frame rate code, bitrate and both
+ * quantizer matrices (from the stream when present, defaults otherwise),
+ * then presets the picture to MPEG-1 style coding parameters.
+ *
+ * Returns 0 on success, 1 when the marker bit is missing or the coded
+ * size exceeds 1920x1152.
+ */
+int mpeg2_header_sequence (picture_t * picture, uint8_t * buffer)
+{
+    int size_bits;
+    int coded_w, coded_h;
+    int n;
+
+    if (!(buffer[6] & 0x20))
+        return 1;	/* missing marker_bit */
+
+    /* 12 bits of width followed by 12 bits of height */
+    size_bits = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    coded_w = size_bits >> 12;
+    coded_h = size_bits & 0xfff;
+
+    picture->display_width = coded_w;
+    picture->display_height = coded_h;
+
+    coded_w = (coded_w + 15) & ~15;
+    coded_h = (coded_h + 15) & ~15;
+
+    if (coded_w > 1920 || coded_h > 1152)
+        return 1;	/* size restrictions for MP@HL */
+
+    picture->coded_picture_width = coded_w;
+    picture->coded_picture_height = coded_h;
+
+    /* this is not used by the decoder */
+    picture->aspect_ratio_information = buffer[3] >> 4;
+    picture->frame_rate_code = buffer[3] & 15;
+    picture->bitrate = (buffer[4]<<10)|(buffer[5]<<2)|(buffer[6]>>6);
+
+    if (buffer[7] & 2) {
+        /* intra matrix present: 64 bytes, offset by one bit in the stream */
+        for (n = 0; n < 64; n++)
+            picture->intra_quantizer_matrix[mpeg2_scan_norm[n]] =
+                (buffer[n+7] << 7) | (buffer[n+8] >> 1);
+        buffer += 64;
+    } else {
+        for (n = 0; n < 64; n++)
+            picture->intra_quantizer_matrix[mpeg2_scan_norm[n]] =
+                default_intra_quantizer_matrix [n];
+    }
+
+    if (buffer[7] & 1) {
+        /* non-intra matrix present: byte aligned */
+        for (n = 0; n < 64; n++)
+            picture->non_intra_quantizer_matrix[mpeg2_scan_norm[n]] =
+                buffer[n+8];
+    } else {
+        for (n = 0; n < 64; n++)
+            picture->non_intra_quantizer_matrix[n] = 16;
+    }
+
+    picture->load_intra_quantizer_matrix = 1;
+    picture->load_non_intra_quantizer_matrix = 1;
+
+    /* MPEG1 - for testing only */
+    picture->mpeg1 = 1;
+    picture->intra_dc_precision = 0;
+    picture->frame_pred_frame_dct = 1;
+    picture->q_scale_type = 0;
+    picture->concealment_motion_vectors = 0;
+    /* picture->alternate_scan = 0; */
+    picture->picture_structure = FRAME_PICTURE;
+    /* picture->second_field = 0; */
+
+    return 0;
+}
+
+/*
+ * Parse a sequence extension chunk (extension id 0x1) into `picture`.
+ *
+ * Only 4:2:0 chroma without size extensions is accepted.  On success the
+ * progressive_sequence and low_delay flags and the frame rate extension
+ * fields are stored, the coded height is rounded up to a multiple of 32
+ * for interlaced sequences, and the picture is marked as not MPEG-1.
+ *
+ * Returns 0 on success, 1 when the chunk is rejected.
+ */
+static int sequence_extension (picture_t * picture, uint8_t * buffer)
+{
+    /* check chroma format, size extensions, marker bit */
+    if (((buffer[1] & 0x07) != 0x02) || (buffer[2] & 0xe0) ||
+	((buffer[3] & 0x01) != 0x01))
+	return 1;
+
+    /* this is not used by the decoder */
+    picture->progressive_sequence = (buffer[1] >> 3) & 1;
+
+    picture->low_delay = buffer[5] & 0x80;
+
+    if (!picture->progressive_sequence)
+	picture->coded_picture_height =
+	    (picture->coded_picture_height + 31) & ~31;
+
+    /* printf ("libmpeg2: low_delay : %d\n", picture->low_delay); */
+
+    /*
+     * Byte 5 layout, most significant bit first:
+     *   low_delay (1) | frame_rate_extension_n (2) | frame_rate_extension_d (5)
+     */
+    picture->frame_rate_ext_n = (buffer[5] >> 5) & 0x03;
+    picture->frame_rate_ext_d = buffer[5] & 0x1f;
+
+    /* MPEG1 - for testing only */
+    picture->mpeg1 = 0;
+
+    return 0;
+}
+
+/*
+ * Parse a quant matrix extension chunk (extension id 0x3).
+ *
+ * Bit 3 of the first byte announces an intra matrix, which then occupies
+ * the next 64 bytes; the flag for the non-intra matrix (bit 2) is read
+ * from the byte where that matrix would start.  Both matrices are stored
+ * through mpeg2_scan_norm[].  Always returns 0.
+ */
+static int quant_matrix_extension (picture_t * picture, uint8_t * buffer)
+{
+    uint8_t *src = buffer;
+    int n;
+
+    if (src[0] & 8) {
+        for (n = 0; n < 64; n++)
+            picture->intra_quantizer_matrix[mpeg2_scan_norm[n]] =
+                (src[n] << 5) | (src[n+1] >> 3);
+        src += 64;
+    }
+
+    if (src[0] & 4) {
+        for (n = 0; n < 64; n++)
+            picture->non_intra_quantizer_matrix[mpeg2_scan_norm[n]] =
+                (src[n] << 6) | (src[n+1] >> 2);
+    }
+
+    return 0;
+}
+
+/*
+ * Parse a picture coding extension chunk (extension id 0x8): the four
+ * f_codes, the picture coding flags and the scan pattern selection.
+ * Always returns 0.
+ */
+static int picture_coding_extension (picture_t * picture, uint8_t * buffer)
+{
+    const uint8_t b0 = buffer[0];
+    const uint8_t b1 = buffer[1];
+    const uint8_t b2 = buffer[2];
+    const uint8_t b3 = buffer[3];
+    const uint8_t b4 = buffer[4];
+
+    /* pre subtract 1 for use later in compute_motion_vector */
+    picture->f_motion.f_code[0] = (b0 & 15) - 1;
+    picture->f_motion.f_code[1] = (b1 >> 4) - 1;
+    picture->b_motion.f_code[0] = (b1 & 15) - 1;
+    picture->b_motion.f_code[1] = (b2 >> 4) - 1;
+
+    picture->intra_dc_precision = (b2 >> 2) & 3;
+    picture->picture_structure = b2 & 3;
+
+    picture->frame_pred_frame_dct = (b3 >> 6) & 1;
+    picture->concealment_motion_vectors = (b3 >> 5) & 1;
+    picture->q_scale_type = (b3 >> 4) & 1;
+    picture->intra_vlc_format = (b3 >> 3) & 1;
+
+    /* alternate_scan flag picks the coefficient scan order */
+    picture->scan = (b3 & 4) ? mpeg2_scan_alt : mpeg2_scan_norm;
+
+    /* these are not used by the decoder */
+    picture->top_field_first = b3 >> 7;
+    picture->repeat_first_field = (b3 >> 1) & 1;
+    picture->progressive_frame = b4 >> 7;
+
+    return 0;
+}
+
+/*
+ * Parse a sequence display extension chunk (extension id 0x2, used for
+ * Pan & Scan): video format, optional colour description and the display
+ * size.  Always returns 0.
+ */
+static int sequence_display_extension (picture_t * picture, uint8_t * buffer) {
+  uint32_t pos = 0;
+
+  (void) get_bits(buffer, 4, &pos);	/* skip the 4-bit extension id */
+  picture->video_format = get_bits(buffer, 3, &pos);
+  picture->colour_description = get_bits(buffer, 1, &pos);
+
+  if (picture->colour_description) {
+    picture->colour_primatives = get_bits(buffer, 8, &pos);
+    picture->transfer_characteristics = get_bits(buffer, 8, &pos);
+    picture->matrix_coefficients = get_bits(buffer, 8, &pos);
+  }
+
+  picture->display_horizontal_size = get_bits(buffer, 14, &pos);
+  (void) get_bits(buffer, 1, &pos);	/* skip one bit between the two sizes */
+  picture->display_vertical_size = get_bits(buffer, 14, &pos);
+
+#ifdef LOG_PAN_SCAN
+  printf("Sequence_display_extension\n");
+  printf("     video_format: %u\n", picture->video_format);
+  printf("     colour_description: %u\n", picture->colour_description);
+  if (picture->colour_description) {
+    printf("     colour_primatives: %u\n", picture->colour_primatives);
+    printf("     transfer_characteristics %u\n", picture->transfer_characteristics);
+    printf("     matrix_coefficients %u\n", picture->matrix_coefficients);
+  }
+  printf("     display_horizontal_size %u\n", picture->display_horizontal_size);
+  printf("     display_vertical_size %u\n", picture->display_vertical_size);
+#endif
+
+  return 0;
+}
+
+/*
+ * Parse a picture display extension chunk (extension id 0x7, used for
+ * Pan & Scan): one pair of signed 16-bit frame centre offsets.
+ * Always returns 0.
+ */
+static int picture_display_extension (picture_t * picture, uint8_t * buffer) {
+  uint32_t pos = 0;
+
+#ifdef LOG_PAN_SCAN
+    printf ("libmpeg2: picture_display_extension\n");
+#endif
+
+  (void) get_bits(buffer, 4, &pos);	/* skip the 4-bit extension id */
+  picture->frame_centre_horizontal_offset = get_bits_signed(buffer, 16, &pos);
+  (void) get_bits(buffer, 1, &pos);	/* skip one bit after the offset */
+  picture->frame_centre_vertical_offset = get_bits_signed(buffer, 16, &pos);
+  (void) get_bits(buffer, 1, &pos);	/* skip one bit after the offset */
+
+#ifdef LOG_PAN_SCAN
+  printf("Pan & Scan centre (x,y) = (%d, %d)\n",
+    picture->frame_centre_horizontal_offset,
+    picture->frame_centre_vertical_offset);
+#endif
+
+  return 0;
+}
+
+/*
+ * Dispatch an extension chunk on the extension id held in the high nibble
+ * of its first byte.  Returns the result of the matching parser, or 0 for
+ * every extension that is accepted but ignored.
+ */
+int mpeg2_header_extension (picture_t * picture, uint8_t * buffer)
+{
+    const int extension_id = buffer[0] & 0xf0;
+
+    switch (extension_id) {
+    case 0x10:	/* sequence extension */
+        return sequence_extension (picture, buffer);
+
+    case 0x20:	/* sequence display extension for Pan & Scan */
+        return sequence_display_extension (picture, buffer);
+
+    case 0x30:	/* quant matrix extension */
+        return quant_matrix_extension (picture, buffer);
+
+    case 0x70:	/* picture display extension for Pan & Scan */
+        return picture_display_extension (picture, buffer);
+
+    case 0x80:	/* picture coding extension */
+        return picture_coding_extension (picture, buffer);
+
+    default:
+        /* Ignored ids:
+         *   0x00, 0x60, 0xD0, 0xE0, 0xF0  reserved
+         *   0x40  copyright extension
+         *   0x50  sequence scalable extension
+         *   0x90  picture spatial scalable extension
+         *   0xA0  picture temporal scalable extension
+         *   0xB0  camera parameters extension
+         *   0xC0  ITU-T extension
+         */
+        return 0;
+    }
+}
+
+/*
+ * Parse a group of pictures header: drop frame flag, the time code
+ * (hours, minutes, seconds, pictures) and the closed_gop / broken_link
+ * flags.  Always returns 0.
+ */
+int mpeg2_header_group_of_pictures (picture_t * picture, uint8_t * buffer) {
+  uint32_t pos = 0;
+
+  picture->drop_frame_flag = get_bits(buffer, 1, &pos);
+  picture->time_code_hours = get_bits(buffer, 5, &pos);
+  picture->time_code_minutes = get_bits(buffer, 6, &pos);
+  (void) get_bits(buffer, 1, &pos);	/* skip one bit between minutes and seconds */
+  picture->time_code_seconds = get_bits(buffer, 6, &pos);
+  picture->time_code_pictures = get_bits(buffer, 6, &pos);
+  picture->closed_gop = get_bits(buffer, 1, &pos);
+  picture->broken_link = get_bits(buffer, 1, &pos);
+
+#ifdef LOG_PAN_SCAN
+  printf("Group of pictures\n");
+  printf("     drop_frame_flag: %u\n", picture->drop_frame_flag);
+  printf("     time_code: HH:MM:SS:Pictures %02u:%02u:%02u:%02u\n",
+         picture->time_code_hours,
+         picture->time_code_minutes,
+         picture->time_code_seconds,
+         picture->time_code_pictures);
+  printf("     closed_gop: %u\n", picture->closed_gop);
+  printf("     bloken_link: %u\n", picture->broken_link);
+#endif
+
+  return 0;
+}
+
+/*
+ * Parse a picture header: coding type, vbv_delay and the MPEG-1 style
+ * forward/backward f_codes, then update the second_field toggle.
+ * Always returns 0.
+ */
+int mpeg2_header_picture (picture_t * picture, uint8_t * buffer)
+{
+    const int vbv_bits =
+        (buffer[1] << 13) | (buffer[2] << 5) | (buffer[3] >> 3);
+
+    picture->picture_coding_type = (buffer [1] >> 3) & 7;
+    picture->vbv_delay = vbv_bits & 0xffff;
+
+    /* forward_f_code and backward_f_code - used in mpeg1 only */
+    picture->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
+    picture->f_motion.f_code[0] =
+        (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
+    picture->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
+    picture->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
+
+    /* move in header_process_picture_header */
+    if (picture->picture_structure == FRAME_PICTURE)
+        picture->second_field = 0;	/* frame pictures have no second field */
+    else
+        picture->second_field = !(picture->second_field);
+
+    return 0;
+}
diff --git a/src/video_dec/libmpeg2/idct.c b/src/video_dec/libmpeg2/idct.c
new file mode 100644
index 000000000..9f216db58
--- /dev/null
+++ b/src/video_dec/libmpeg2/idct.c
@@ -0,0 +1,348 @@
+/*
+ * idct.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * Portions of this code are from the MPEG software simulation group
+ * idct implementation. This code will be replaced with a new
+ * implementation soon.
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/**********************************************************/
+/* inverse two dimensional DCT, Chen-Wang algorithm */
+/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */
+/* 32-bit integer arithmetic (8 bit coefficients) */
+/* 11 mults, 29 adds per DCT */
+/* sE, 18.8.91 */
+/**********************************************************/
+/* coefficients extended to 12 bit for IEEE1180-1990 */
+/* compliance sE, 2.1.94 */
+/**********************************************************/
+
+/* this code assumes >> to be a two's-complement arithmetic */
+/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+
+#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
+#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
+#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
+#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
+#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
+#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
+
+/*
+ * idct main entry points.
+ *
+ * These pointers are assigned by mpeg2_idct_init(), which picks an
+ * implementation according to the acceleration flags it is given.
+ */
+void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
+void (* mpeg2_idct_add) (int16_t * block, uint8_t * dest, int stride);
+void (* mpeg2_idct) (int16_t * block);
+void (*	mpeg2_zero_block) (int16_t * block);
+
+/*
+ * Saturation table for the C implementation: CLIP(i) yields i clamped to
+ * 0..255 for -384 <= i < 640.  The table is filled by mpeg2_idct_init()
+ * in its plain C branch only.
+ */
+static uint8_t clip_lut[1024];
+#define CLIP(i) ((clip_lut+384)[ (i)])
+
+/* row (horizontal) IDCT
+ *
+ *            7
+ * dst[k] =  sum  c[l] * src[l] * cos ( (pi/8) * (k + 1/2) * l )
+ *           l=0
+ *
+ * where: c[0] = 128
+ * c[1..7] = 128*sqrt (2)
+ *
+ * Transforms the 8 coefficients at block[0..7] in place.
+ */
+
+static void inline idct_row (int16_t * block)
+{
+    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+    x1 = block[4] << 11;
+    x2 = block[6];
+    x3 = block[2];
+    x4 = block[1];
+    x5 = block[7];
+    x6 = block[5];
+    x7 = block[3];
+
+    /* shortcut: with block[1..7] all zero every output equals block[0]<<3 */
+    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
+	block[0] = block[1] = block[2] = block[3] = block[4] =
+	    block[5] = block[6] = block[7] = block[0]<<3;
+	return;
+    }
+
+    x0 = (block[0] << 11) + 128; /* for proper rounding in the fourth stage */
+
+    /* first stage: odd coefficients (1,7) and (5,3) */
+    x8 = W7 * (x4 + x5);
+    x4 = x8 + (W1 - W7) * x4;
+    x5 = x8 - (W1 + W7) * x5;
+    x8 = W3 * (x6 + x7);
+    x6 = x8 - (W3 - W5) * x6;
+    x7 = x8 - (W3 + W5) * x7;
+ 
+    /* second stage: even coefficients (0,4) and (2,6), odd sums/differences */
+    x8 = x0 + x1;
+    x0 -= x1;
+    x1 = W6 * (x3 + x2);
+    x2 = x1 - (W2 + W6) * x2;
+    x3 = x1 + (W2 - W6) * x3;
+    x1 = x4 + x6;
+    x4 -= x6;
+    x6 = x5 + x7;
+    x5 -= x7;
+ 
+    /* third stage */
+    x7 = x8 + x3;
+    x8 -= x3;
+    x3 = x0 + x2;
+    x0 -= x2;
+    x2 = (181 * (x4 + x5) + 128) >> 8;
+    x4 = (181 * (x4 - x5) + 128) >> 8;
+ 
+    /* fourth stage: combine, dropping 8 fractional bits */
+    block[0] = (x7 + x1) >> 8;
+    block[1] = (x3 + x2) >> 8;
+    block[2] = (x0 + x4) >> 8;
+    block[3] = (x8 + x6) >> 8;
+    block[4] = (x8 - x6) >> 8;
+    block[5] = (x0 - x4) >> 8;
+    block[6] = (x3 - x2) >> 8;
+    block[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *              7
+ * dst[8*k] =  sum  c[l] * src[8*l] * cos ( (pi/8) * (k + 1/2) * l )
+ *             l=0
+ *
+ * where: c[0] = 1/1024
+ * c[1..7] = (1/1024)*sqrt (2)
+ *
+ * Transforms in place the 8 coefficients at block[0], block[8], ...,
+ * block[56], i.e. one column of a row-major 8x8 block.
+ */
+
+static void inline idct_col (int16_t *block)
+{
+    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+    /* shortcut */
+    x1 = block [8*4] << 8;
+    x2 = block [8*6];
+    x3 = block [8*2];
+    x4 = block [8*1];
+    x5 = block [8*7];
+    x6 = block [8*5];
+    x7 = block [8*3];
+
+    /* the DC-only shortcut below is compiled out */
+#if 0
+    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
+	block[8*0] = block[8*1] = block[8*2] = block[8*3] = block[8*4] =
+	    block[8*5] = block[8*6] = block[8*7] = (block[8*0] + 32) >> 6;
+	return;
+    }
+#endif
+
+    /* 8192 is the rounding term for the final >> 14 */
+    x0 = (block[8*0] << 8) + 8192;
+
+    /* first stage: odd coefficients; "+ 4" rounds the >> 3 that follows */
+    x8 = W7 * (x4 + x5) + 4;
+    x4 = (x8 + (W1 - W7) * x4) >> 3;
+    x5 = (x8 - (W1 + W7) * x5) >> 3;
+    x8 = W3 * (x6 + x7) + 4;
+    x6 = (x8 - (W3 - W5) * x6) >> 3;
+    x7 = (x8 - (W3 + W5) * x7) >> 3;
+ 
+    /* second stage */
+    x8 = x0 + x1;
+    x0 -= x1;
+    x1 = W6 * (x3 + x2) + 4;
+    x2 = (x1 - (W2 + W6) * x2) >> 3;
+    x3 = (x1 + (W2 - W6) * x3) >> 3;
+    x1 = x4 + x6;
+    x4 -= x6;
+    x6 = x5 + x7;
+    x5 -= x7;
+ 
+    /* third stage */
+    x7 = x8 + x3;
+    x8 -= x3;
+    x3 = x0 + x2;
+    x0 -= x2;
+    x2 = (181 * (x4 + x5) + 128) >> 8;
+    x4 = (181 * (x4 - x5) + 128) >> 8;
+ 
+    /* fourth stage: combine, dropping 14 fractional bits */
+    block[8*0] = (x7 + x1) >> 14;
+    block[8*1] = (x3 + x2) >> 14;
+    block[8*2] = (x0 + x4) >> 14;
+    block[8*3] = (x8 + x6) >> 14;
+    block[8*4] = (x8 - x6) >> 14;
+    block[8*5] = (x0 - x4) >> 14;
+    block[8*6] = (x3 - x2) >> 14;
+    block[8*7] = (x7 - x1) >> 14;
+}
+
+/*
+ * C implementation of mpeg2_idct_copy: inverse-transform the 8x8 `block`
+ * in place, write the result clamped through CLIP() to 8 rows of `dest`
+ * (rows are `stride` bytes apart) and leave `block` zeroed.
+ */
+static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest, int stride)
+{
+    int row, col;
+
+    for (row = 0; row < 8; row++)
+        idct_row (block + 8 * row);
+
+    for (col = 0; col < 8; col++)
+        idct_col (block + col);
+
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++)
+            dest[col] = CLIP (block[col]);
+
+        /* clear the row so the block is ready for reuse */
+        for (col = 0; col < 8; col++)
+            block[col] = 0;
+
+        dest += stride;
+        block += 8;
+    }
+}
+
+/*
+ * C implementation of mpeg2_idct_add: inverse-transform the 8x8 `block`
+ * in place, add the result to the 8 rows of `dest` (rows are `stride`
+ * bytes apart) with clamping through CLIP(), and leave `block` zeroed.
+ */
+static void mpeg2_idct_add_c (int16_t * block, uint8_t * dest, int stride)
+{
+    int row, col;
+
+    for (row = 0; row < 8; row++)
+        idct_row (block + 8 * row);
+
+    for (col = 0; col < 8; col++)
+        idct_col (block + col);
+
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++)
+            dest[col] = CLIP (block[col] + dest[col]);
+
+        /* clear the row so the block is ready for reuse */
+        for (col = 0; col < 8; col++)
+            block[col] = 0;
+
+        dest += stride;
+        block += 8;
+    }
+}
+
+/*
+ * C implementation of mpeg2_idct: inverse-transform the 8x8 `block` in
+ * place -- all eight rows first, then all eight columns.
+ */
+static void mpeg2_idct_c (int16_t * block)
+{
+    int16_t *row;
+    int col;
+
+    for (row = block; row < block + 64; row += 8)
+        idct_row (row);
+
+    for (col = 0; col < 8; col++)
+        idct_col (block + col);
+}
+
+/* C implementation of mpeg2_zero_block: clear all 64 coefficients of a block. */
+static void mpeg2_zero_block_c (int16_t * wblock)
+{
+  memset (wblock, 0, 64 * sizeof (*wblock));
+}
+
+/*
+ * Select the IDCT implementation and install it in the mpeg2_idct_copy,
+ * mpeg2_idct_add, mpeg2_idct and mpeg2_zero_block pointers.
+ *
+ * mm_accel is a bit mask of MM_ACCEL_* flags.  The candidates are tried in
+ * the order MMXEXT, MMX, AltiVec, mlib -- each only when compiled in --
+ * and the plain C code is the fallback.  The branches are chained through
+ * "else" across the preprocessor conditionals, so exactly one of them runs.
+ */
+void mpeg2_idct_init (uint32_t mm_accel)
+{
+    /* default; replaced by the MMX variant in the two x86 branches */
+    mpeg2_zero_block = mpeg2_zero_block_c;
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+    if (mm_accel & MM_ACCEL_X86_MMXEXT) {
+#ifdef LOG
+	fprintf (stderr, "Using MMXEXT for IDCT transform\n");
+#endif
+	mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
+	mpeg2_idct_add = mpeg2_idct_add_mmxext;
+	mpeg2_idct     = mpeg2_idct_mmxext;
+	mpeg2_zero_block = mpeg2_zero_block_mmx;
+	mpeg2_idct_mmx_init ();
+    } else if (mm_accel & MM_ACCEL_X86_MMX) {
+#ifdef LOG
+	fprintf (stderr, "Using MMX for IDCT transform\n");
+#endif
+	mpeg2_idct_copy = mpeg2_idct_copy_mmx;
+	mpeg2_idct_add  = mpeg2_idct_add_mmx;
+	mpeg2_idct      = mpeg2_idct_mmx;
+	mpeg2_zero_block = mpeg2_zero_block_mmx;
+	mpeg2_idct_mmx_init ();
+    } else
+#endif
+#if defined (ARCH_PPC) && defined (ENABLE_ALTIVEC)
+    if (mm_accel & MM_ACCEL_PPC_ALTIVEC) {
+#ifdef LOG
+	fprintf (stderr, "Using altivec for IDCT transform\n");
+#endif
+	mpeg2_idct_copy = mpeg2_idct_copy_altivec;
+	mpeg2_idct_add = mpeg2_idct_add_altivec;
+	mpeg2_idct_altivec_init ();
+	/* the in-place transform stays on the C implementation here */
+	mpeg2_idct       = mpeg2_idct_c;
+    } else
+#endif
+#ifdef LIBMPEG2_MLIB
+    if (mm_accel & MM_ACCEL_MLIB) {
+	char * env_var;
+
+	/* the mere presence of MLIB_NON_IEEE selects the non-IEEE add */
+	env_var = getenv ("MLIB_NON_IEEE");
+
+	mpeg2_idct = mpeg2_idct_mlib;
+	if (env_var == NULL) {
+#ifdef LOG
+	    fprintf (stderr, "Using mlib for IDCT transform\n");
+#endif
+	    mpeg2_idct_add = mpeg2_idct_add_mlib;
+	} else {
+	    fprintf (stderr, "Using non-IEEE mlib for IDCT transform\n");
+	    mpeg2_idct_add = mpeg2_idct_add_mlib_non_ieee;
+	}
+	/* the copy variant is the non-IEEE one in both cases */
+	mpeg2_idct_copy = mpeg2_idct_copy_mlib_non_ieee;
+    } else
+#endif
+    {
+	int i;
+
+#ifdef LOG
+	fprintf (stderr, "No accelerated IDCT transform found\n");
+#endif
+	mpeg2_idct_copy = mpeg2_idct_copy_c;
+	mpeg2_idct_add  = mpeg2_idct_add_c;
+	mpeg2_idct      = mpeg2_idct_c;
+	/* build the CLIP() table: clamp i in [-384, 640) to 0..255 */
+	for (i = -384; i < 640; i++)
+	    clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
+    }
+}
diff --git a/src/video_dec/libmpeg2/idct_altivec.c b/src/video_dec/libmpeg2/idct_altivec.c
new file mode 100644
index 000000000..de396560b
--- /dev/null
+++ b/src/video_dec/libmpeg2/idct_altivec.c
@@ -0,0 +1,233 @@
+/*
+ * idct_altivec.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined (ARCH_PPC) && defined (ENABLE_ALTIVEC)
+
+#include <altivec.h>
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+
+#define vector_s16_t vector signed short
+#define vector_u16_t vector unsigned short
+#define vector_s8_t vector signed char
+#define vector_u8_t vector unsigned char
+#define vector_s32_t vector signed int
+#define vector_u32_t vector unsigned int
+
+/*
+ * IDCT_HALF: one pass of the transform over the eight input vectors
+ * vx0..vx7, leaving the eight results in vy0..vy7.  The IDCT macro below
+ * expands it twice, re-interleaving the vectors in between.
+ *
+ * The macro declares nothing itself: the expanding scope must provide the
+ * temporaries t0..t8 and the splatted coefficients a0, a1, a2, ma2, c4, mc4
+ * and zero.  Additions and subtractions use the saturating vec_adds /
+ * vec_subs forms; products are formed with vec_mradds.
+ */
+#define IDCT_HALF					\
+    /* 1st stage */					\
+    t1 = vec_mradds (a1, vx7, vx1 );			\
+    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));	\
+    t7 = vec_mradds (a2, vx5, vx3);			\
+    t3 = vec_mradds (ma2, vx3, vx5);			\
+							\
+    /* 2nd stage */					\
+    t5 = vec_adds (vx0, vx4);				\
+    t0 = vec_subs (vx0, vx4);				\
+    t2 = vec_mradds (a0, vx6, vx2);			\
+    t4 = vec_mradds (a0, vx2, vec_subs (zero,vx6));	\
+    t6 = vec_adds (t8, t3);				\
+    t3 = vec_subs (t8, t3);				\
+    t8 = vec_subs (t1, t7);				\
+    t1 = vec_adds (t1, t7);				\
+							\
+    /* 3rd stage */					\
+    t7 = vec_adds (t5, t2);				\
+    t2 = vec_subs (t5, t2);				\
+    t5 = vec_adds (t0, t4);				\
+    t0 = vec_subs (t0, t4);				\
+    t4 = vec_subs (t8, t3);				\
+    t3 = vec_adds (t8, t3);				\
+							\
+    /* 4th stage */					\
+    vy0 = vec_adds (t7, t1);				\
+    vy7 = vec_subs (t7, t1);				\
+    vy1 = vec_mradds (c4, t3, t5);			\
+    vy6 = vec_mradds (mc4, t3, t5);			\
+    vy2 = vec_mradds (c4, t4, t0);			\
+    vy5 = vec_mradds (mc4, t4, t0);			\
+    vy3 = vec_adds (t2, t6);				\
+    vy4 = vec_subs (t2, t6);
+
+/*
+ * IDCT: full transform of the block, expanded at the top of a function body.
+ *
+ * Requires `block` (an array of eight vector_s16_t rows) and `constants` to be
+ * in scope.  It declares all working vectors itself, so it must appear where
+ * declarations are allowed.  Steps, in order:
+ *   - splat the scalar coefficients out of constants[0]; `bias` is the fourth
+ *     32-bit element of constants[0] replicated across the vector,
+ *   - pre-scale each input row: shift left by 4, then vec_mradds with one of
+ *     constants[1..4],
+ *   - IDCT_HALF,
+ *   - three rounds of vec_mergeh/vec_mergel to re-interleave the results
+ *     (NOTE(review): this looks like an 8x8 transpose -- confirm); bias is
+ *     added to vx0 during the last round,
+ *   - IDCT_HALF again,
+ *   - arithmetic shift right by 6.
+ * The final rows are left in vx0..vx7 for the caller to consume.
+ */
+#define IDCT								\
+    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;		\
+    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;		\
+    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;			\
+    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;			\
+    vector_u16_t shift;							\
+									\
+    c4 = vec_splat (constants[0], 0);					\
+    a0 = vec_splat (constants[0], 1);					\
+    a1 = vec_splat (constants[0], 2);					\
+    a2 = vec_splat (constants[0], 3);					\
+    mc4 = vec_splat (constants[0], 4);					\
+    ma2 = vec_splat (constants[0], 5);					\
+    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3);	\
+									\
+    zero = vec_splat_s16 (0);						\
+    shift = vec_splat_u16 (4);						\
+									\
+    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);	\
+    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);	\
+    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);	\
+    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);	\
+    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);	\
+    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);	\
+    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);	\
+    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);	\
+									\
+    IDCT_HALF								\
+									\
+    vx0 = vec_mergeh (vy0, vy4);					\
+    vx1 = vec_mergel (vy0, vy4);					\
+    vx2 = vec_mergeh (vy1, vy5);					\
+    vx3 = vec_mergel (vy1, vy5);					\
+    vx4 = vec_mergeh (vy2, vy6);					\
+    vx5 = vec_mergel (vy2, vy6);					\
+    vx6 = vec_mergeh (vy3, vy7);					\
+    vx7 = vec_mergel (vy3, vy7);					\
+									\
+    vy0 = vec_mergeh (vx0, vx4);					\
+    vy1 = vec_mergel (vx0, vx4);					\
+    vy2 = vec_mergeh (vx1, vx5);					\
+    vy3 = vec_mergel (vx1, vx5);					\
+    vy4 = vec_mergeh (vx2, vx6);					\
+    vy5 = vec_mergel (vx2, vx6);					\
+    vy6 = vec_mergeh (vx3, vx7);					\
+    vy7 = vec_mergel (vx3, vx7);					\
+									\
+    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);			\
+    vx1 = vec_mergel (vy0, vy4);					\
+    vx2 = vec_mergeh (vy1, vy5);					\
+    vx3 = vec_mergel (vy1, vy5);					\
+    vx4 = vec_mergeh (vy2, vy6);					\
+    vx5 = vec_mergel (vy2, vy6);					\
+    vx6 = vec_mergeh (vy3, vy7);					\
+    vx7 = vec_mergel (vy3, vy7);					\
+									\
+    IDCT_HALF								\
+									\
+    shift = vec_splat_u16 (6);						\
+    vx0 = vec_sra (vy0, shift);						\
+    vx1 = vec_sra (vy1, shift);						\
+    vx2 = vec_sra (vy2, shift);						\
+    vx3 = vec_sra (vy3, shift);						\
+    vx4 = vec_sra (vy4, shift);						\
+    vx5 = vec_sra (vy5, shift);						\
+    vx6 = vec_sra (vy6, shift);						\
+    vx7 = vec_sra (vy7, shift);
+
+/*
+ * VEC_S16 builds a vector_s16_t literal from eight values.  Two spellings are
+ * provided: a parenthesized list when both __APPLE_CC__ and
+ * __APPLE_ALTIVEC__ are defined, and a brace-enclosed list otherwise.
+ */
+#if defined( __APPLE_CC__ ) && defined( __APPLE_ALTIVEC__ ) /* apple */
+#define VEC_S16(a,b,c,d,e,f,g,h) (vector_s16_t) (a, b, c, d, e, f, g, h)
+#else			/* gnu */
+#define VEC_S16(a,b,c,d,e,f,g,h) (vector_s16_t) {a, b, c, d, e, f, g, h}
+#endif
+
+/*
+ * Coefficient tables read by the IDCT macro.
+ *   constants[0]    - scalar coefficients splatted individually: elements
+ *                     0..5 become c4, a0, a1, a2, mc4 and ma2; the last two
+ *                     elements (32, 31) are splatted together as `bias`.
+ *   constants[1..4] - per-row multipliers applied to the input rows in the
+ *                     pre-scale step (rows 0/4 use [1], 1/7 use [2],
+ *                     2/6 use [3], 3/5 use [4]).
+ */
+static vector_s16_t constants[5] = {
+    VEC_S16(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
+    VEC_S16(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
+    VEC_S16(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
+    VEC_S16(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
+    VEC_S16(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
+};
+
+/*
+ * Inverse-transform `block` and write the result over the 8x8 pixel area at
+ * `dest`, whose rows are `stride` bytes apart.
+ *
+ * The IDCT macro leaves the eight result rows in vx0..vx7.  Each row is
+ * packed to unsigned bytes with vec_packsu and written with two vec_ste word
+ * stores at byte offsets 0 and 4.  Afterwards the 64 coefficients in `block`
+ * are cleared.
+ */
+void mpeg2_idct_copy_altivec (vector_s16_t * block, unsigned char * dest,
+			      int stride)
+{
+    vector_u8_t tmp;
+
+    IDCT
+
+/* pack one result row to bytes and store its 8 pixels as two 32-bit words */
+#define COPY(dest,src)						\
+    tmp = vec_packsu (src, src);				\
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	\
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+
+    COPY (dest, vx0)	dest += stride;
+    COPY (dest, vx1)	dest += stride;
+    COPY (dest, vx2)	dest += stride;
+    COPY (dest, vx3)	dest += stride;
+    COPY (dest, vx4)	dest += stride;
+    COPY (dest, vx5)	dest += stride;
+    COPY (dest, vx6)	dest += stride;
+    COPY (dest, vx7)
+    /* leave the coefficient block zeroed */
+    memset (block, 0, 64 * sizeof (signed short));
+}
+
+/*
+ * Inverse-transform `block` and add the result to the 8x8 pixel area at
+ * `dest`, whose rows are `stride` bytes apart.
+ *
+ * For every row the existing pixels are loaded with vec_ld, widened to 16 bit
+ * by permuting them against a zero vector, added to the IDCT row with
+ * vec_adds, packed back to bytes with vec_packsu and stored as two words.
+ * Two permute masks are prepared once: perm0 from vec_lvsl (0, dest) and
+ * perm1 from vec_lvsl (stride, dest); even rows use perm0, odd rows perm1.
+ * NOTE(review): this presumably compensates for dest's position inside the
+ * 16-byte vector that vec_ld fetches -- confirm.
+ * Afterwards the 64 coefficients in `block` are cleared.
+ */
+void mpeg2_idct_add_altivec (vector_s16_t * block, unsigned char * dest,
+			     int stride)
+{
+    vector_u8_t tmp;
+    vector_s16_t tmp2, tmp3;
+    vector_u8_t perm0;
+    vector_u8_t perm1;
+    vector_u8_t p0, p1, p;
+
+    IDCT
+
+    p0 = vec_lvsl (0, dest);
+    p1 = vec_lvsl (stride, dest);
+    p = vec_splat_u8 (-1);
+    perm0 = vec_mergeh (p, p0);
+    perm1 = vec_mergeh (p, p1);
+
+/* load 8 pixels, widen via perm, add one IDCT row, pack and store them */
+#define ADD(dest,src,perm)						\
+    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */			\
+    tmp = vec_ld (0, dest);						\
+    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);	\
+    tmp3 = vec_adds (tmp2, src);					\
+    tmp = vec_packsu (tmp3, tmp3);					\
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);		\
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+
+    ADD (dest, vx0, perm0)	dest += stride;
+    ADD (dest, vx1, perm1)	dest += stride;
+    ADD (dest, vx2, perm0)	dest += stride;
+    ADD (dest, vx3, perm1)	dest += stride;
+    ADD (dest, vx4, perm0)	dest += stride;
+    ADD (dest, vx5, perm1)	dest += stride;
+    ADD (dest, vx6, perm0)	dest += stride;
+    ADD (dest, vx7, perm1)
+    /* leave the coefficient block zeroed */
+    memset (block, 0, 64 * sizeof (signed short));
+}
+
+/* Swap the upper and lower three bits of a 6-bit scan position. */
+static inline int altivec_transpose_pos (int pos)
+{
+    return (pos >> 3) | ((pos & 7) << 3);
+}
+
+/*
+ * The altivec idct uses a transposed input, so every entry of both scan
+ * tables is rewritten in place with its two 3-bit halves exchanged.
+ */
+void mpeg2_idct_altivec_init (void)
+{
+    int n;
+
+    for (n = 0; n < 64; n++) {
+	mpeg2_scan_norm[n] = altivec_transpose_pos (mpeg2_scan_norm[n]);
+	mpeg2_scan_alt[n] = altivec_transpose_pos (mpeg2_scan_alt[n]);
+    }
+}
+
+#endif	/* ARCH_PPC && ENABLE_ALTIVEC */
+
diff --git a/src/video_dec/libmpeg2/idct_mlib.c b/src/video_dec/libmpeg2/idct_mlib.c
new file mode 100644
index 000000000..e573c9790
--- /dev/null
+++ b/src/video_dec/libmpeg2/idct_mlib.c
@@ -0,0 +1,62 @@
+/*
+ * idct_mlib.c
+ * Copyright (C) 1999-2002 Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef LIBMPEG2_MLIB
+
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+
+/*
+ * Transform the 64-coefficient `block` in place with
+ * mlib_VideoIDCT_IEEE_S16_S16, hand the result together with `dest` and
+ * `stride` to mlib_VideoAddBlock_U8_S16, then clear the block.
+ */
+void mpeg2_idct_add_mlib (int16_t * block, uint8_t * dest, int stride)
+{
+    mlib_VideoIDCT_IEEE_S16_S16 (block, block);
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    /* size the clear from the block's own element type */
+    memset (block, 0, 64 * sizeof (*block));
+}
+
+/*
+ * Pass `block`, `dest` and `stride` to mlib_VideoIDCT8x8_U8_S16, which
+ * produces the output at `dest`, then clear the 64-coefficient block.
+ */
+void mpeg2_idct_copy_mlib_non_ieee (int16_t * block, uint8_t * dest,
+				    int stride)
+{
+    mlib_VideoIDCT8x8_U8_S16 (dest, block, stride);
+    /* size the clear from the block's own element type */
+    memset (block, 0, 64 * sizeof (*block));
+}
+
+/*
+ * Same sequence as mpeg2_idct_add_mlib, but the in-place transform is done by
+ * mlib_VideoIDCT8x8_S16_S16 instead of the IEEE-named routine.  The block is
+ * cleared afterwards.
+ */
+void mpeg2_idct_add_mlib_non_ieee (int16_t * block, uint8_t * dest, int stride)
+{
+    mlib_VideoIDCT8x8_S16_S16 (block, block);
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    /* size the clear from the block's own element type */
+    memset (block, 0, 64 * sizeof (*block));
+}
+
+/*
+ * Transform `block` in place with mlib_VideoIDCT_IEEE_S16_S16.  Unlike the
+ * add/copy variants in this file, the block is not cleared afterwards.
+ */
+void mpeg2_idct_mlib (int16_t * block)
+{
+    mlib_VideoIDCT_IEEE_S16_S16 (block, block);
+}
+
+#endif
diff --git a/src/video_dec/libmpeg2/idct_mlib.h b/src/video_dec/libmpeg2/idct_mlib.h
new file mode 100644
index 000000000..1fb0787dd
--- /dev/null
+++ b/src/video_dec/libmpeg2/idct_mlib.h
@@ -0,0 +1,25 @@
+/*
+ * idct_mlib.h
+ *
+ * Copyright (C) 1999, Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *	
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ * 
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 
+ *
+ */
+
+/*
+ * NOTE(review): these prototypes use idct_block_*_mlib names, whereas the
+ * functions defined in idct_mlib.c are named mpeg2_idct_*_mlib*.  Confirm
+ * whether this header is still included anywhere.
+ */
+void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride);
+void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride);
diff --git a/src/video_dec/libmpeg2/idct_mmx.c b/src/video_dec/libmpeg2/idct_mmx.c
new file mode 100644
index 000000000..92ae365b4
--- /dev/null
+++ b/src/video_dec/libmpeg2/idct_mmx.c
@@ -0,0 +1,741 @@
+/*
+ * idct_mmx.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+#include "xine_mmx.h"
+
+/* right shift applied to the 32-bit sums of the row pass (psrad) */
+#define ROW_SHIFT 11
+/* right shift applied to the 16-bit results of the column pass (psraw) */
+#define COL_SHIFT 6
+
+/*
+ * round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
+ * rounder(bias): initializer holding that value twice, one copy per 32-bit
+ *                lane of the quadword added in the row pass.
+ * NOTE(review): `round` is also the name of a <math.h> function; this macro
+ * would shadow it if that header were ever included here.
+ */
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
+#define rounder(bias) {round (bias), round (bias)}
+
+
+#if 0
+/* C row IDCT - it's just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+			     int16_t * table, int32_t * rounder)
+{
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+
+/*
+ * Initializer for a 32-entry int16_t coefficient table built from c1..c7.
+ * The entries are laid out as eight groups of four, in the order the MMXEXT
+ * row code loads them as quadwords from table+0, +4, +8, ... +28.
+ */
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
+						   c4,  c6,  c4,  c6,	\
+						   c1,  c3, -c1, -c5,	\
+						   c5,  c7,  c3, -c7,	\
+						   c4, -c6,  c4, -c6,	\
+						  -c4,  c2,  c4, -c2,	\
+						   c5, -c1,  c3, -c1,	\
+						   c7,  c3,  c7, -c5 }
+
+/*
+ * Start the MMXEXT row transform for the eight coefficients at row+offset.
+ * Loads the two input quadwords and the first two quadwords of `table`, and
+ * issues the first multiply-add.  Nothing is written to memory: the state is
+ * left in mm0, mm2..mm6 for mmxext_row to continue from.
+ */
+static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+
+    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+}
+
+/*
+ * Main body of the MMXEXT row transform.  Picks up the register state left by
+ * mmxext_row_head or mmxext_row_mid, uses the remaining quadwords of `table`
+ * (table+8 .. table+28) and adds *rounder to both halves of the even sums.
+ *
+ * On exit: mm1 = y1 y0 and mm3 = y6 y7, both already shifted by ROW_SHIFT;
+ * mm0 and mm4 hold the a3/a2 sums and differences, not yet shifted.  The
+ * following mmxext_row_tail / mmxext_row_mid finishes and stores them.
+ */
+static inline void mmxext_row (int16_t * table, int32_t * rounder)
+{
+    movq_m2r (*(table+8), mm1);		// mm1 = -C5 -C1 C3 C1
+    pmaddwd_r2r (mm2, mm4);		// mm4 = C4*x0+C6*x2 C4*x4+C6*x6
+
+    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x4-C6*x6 C4*x0-C6*x2
+    pshufw_r2r (mm6, mm6, 0x4e);	// mm6 = x3 x1 x7 x5
+
+    movq_m2r (*(table+12), mm7);	// mm7 = -C7 C3 C7 C5
+    pmaddwd_r2r (mm5, mm1);		// mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
+
+    paddd_m2r (*rounder, mm3);		// mm3 += rounder
+    pmaddwd_r2r (mm6, mm7);		// mm7 = C3*x1-C7*x3 C5*x5+C7*x7
+
+    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
+    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C3*x5-C1*x7 C5*x1-C1*x3
+    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C7*x1-C5*x3 C7*x5+C3*x7
+    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+
+    paddd_m2r (*rounder, mm0);		// mm0 += rounder
+    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+
+    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
+    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+
+    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
+    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+
+    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
+    movq_r2r (mm0, mm4);		// mm4 = a3 a2 + rounder
+
+    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
+    psubd_r2r (mm5, mm4);		// mm4 = a3-b3 a2-b2 + rounder
+}
+
+/*
+ * Finish the MMXEXT row transform started by mmxext_row: apply the remaining
+ * ROW_SHIFT shifts, pack the 32-bit results to 16 bit with signed saturation
+ * (packssdw), restore natural order with pshufw and store the eight outputs
+ * at row+store .. row+store+7.
+ */
+static inline void mmxext_row_tail (int16_t * row, int store)
+{
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+
+    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+
+    /* slot */
+
+    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+}
+
+/*
+ * Interleaved combination of mmxext_row_tail and mmxext_row_head: stores the
+ * finished row at row+store while loading the next row from row+offset and
+ * the first quadwords of its `table`.  Leaves the registers in the state that
+ * mmxext_row expects.
+ */
+static inline void mmxext_row_mid (int16_t * row, int store,
+				   int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+
+    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
+    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+
+    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+
+    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
+    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+}
+
+
+/* MMX row IDCT */
+
+/*
+ * Initializer for a 32-entry int16_t coefficient table built from c1..c7, in
+ * the quadword order read by the plain-MMX row code (table+0 .. table+28).
+ * The arrangement differs from mmxext_table, so the two are not
+ * interchangeable.
+ */
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
+					   c4,  c6, -c4, -c2,	\
+					   c1,  c3,  c3, -c7,	\
+					   c5,  c7, -c1, -c5,	\
+					   c4, -c6,  c4, -c2,	\
+					  -c4,  c2,  c4, -c6,	\
+					   c5, -c1,  c7, -c5,	\
+					   c7,  c3,  c3, -c1 }
+
+/*
+ * Start the plain-MMX row transform for the eight coefficients at row+offset.
+ * Loads the two input quadwords and the first three quadwords of `table`,
+ * duplicates the input halves with punpckldq/punpckhdq and issues the first
+ * multiply-add.  Nothing is stored; mmx_row continues from the register state.
+ */
+static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+
+    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+
+    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
+    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+}
+
+/*
+ * Main body of the plain-MMX row transform.  Picks up the register state left
+ * by mmx_row_head or mmx_row_mid, uses table+12 .. table+28 and adds *rounder
+ * to both halves of the even sums.
+ *
+ * On exit: mm1 = y1 y0 and mm3 = y6 y7, both already shifted by ROW_SHIFT;
+ * mm0 and mm7 hold the a3/a2 sums and differences, not yet shifted.  The
+ * following mmx_row_tail / mmx_row_mid finishes and stores them.
+ */
+static inline void mmx_row (int16_t * table, int32_t * rounder)
+{
+    pmaddwd_r2r (mm2, mm4);		// mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
+    punpckldq_r2r (mm5, mm5);		// mm5 = x3 x1 x3 x1
+
+    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x0-C2*x2 C4*x0-C6*x2
+    punpckhdq_r2r (mm6, mm6);		// mm6 = x7 x5 x7 x5
+
+    movq_m2r (*(table+12), mm7);	// mm7 = -C5 -C1 C7 C5
+    pmaddwd_r2r (mm5, mm1);		// mm1 = C3*x1-C7*x3 C1*x1+C3*x3
+
+    paddd_m2r (*rounder, mm3);		// mm3 += rounder
+    pmaddwd_r2r (mm6, mm7);		// mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
+
+    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
+    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C7*x1-C5*x3 C5*x1-C1*x3
+    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+
+    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C3*x5-C1*x7 C7*x5+C3*x7
+    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+
+    paddd_m2r (*rounder, mm0);		// mm0 += rounder
+    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+
+    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
+    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+
+    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
+    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+
+    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
+    movq_r2r (mm0, mm7);		// mm7 = a3 a2 + rounder
+
+    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
+    psubd_r2r (mm5, mm7);		// mm7 = a3-b3 a2-b2 + rounder
+}
+
+/*
+ * Finish the plain-MMX row transform started by mmx_row: apply the remaining
+ * ROW_SHIFT shifts, pack to 16 bit with signed saturation and store the eight
+ * outputs at row+store .. row+store+7.  The y7/y6 and y5/y4 word pairs are
+ * swapped with a shift-left / shift-right / or sequence where the MMXEXT
+ * version uses pshufw.
+ */
+static inline void mmx_row_tail (int16_t * row, int store)
+{
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+
+    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    movq_r2r (mm7, mm4);		// mm4 = y6 y7 y4 y5
+
+    pslld_i2r (16, mm7);		// mm7 = y7 0 y5 0
+
+    psrld_i2r (16, mm4);		// mm4 = 0 y6 0 y4
+
+    por_r2r (mm4, mm7);			// mm7 = y7 y6 y5 y4
+
+    /* slot */
+
+    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+}
+
+/*
+ * Interleaved combination of mmx_row_tail and mmx_row_head: stores the
+ * finished row at row+store while loading the next row from row+offset and
+ * the first three quadwords of its `table`.  Leaves the registers in the
+ * state that mmx_row expects.
+ */
+static inline void mmx_row_mid (int16_t * row, int store,
+				int offset, int16_t * table)
+{
+    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+
+    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
+    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+
+    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+
+    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+
+    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
+    movq_r2r (mm7, mm1);		// mm1 = y6 y7 y4 y5
+
+    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+    psrld_i2r (16, mm7);		// mm7 = 0 y6 0 y4
+
+    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
+    pslld_i2r (16, mm1);		// mm1 = y7 0 y5 0
+
+    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
+    por_r2r (mm1, mm7);			// mm7 = y7 y6 y5 y4
+
+    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
+    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+
+    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+}
+
+
+#if 0
+// C column IDCT - it's just here to document the MMXEXT and MMX versions
+static inline void idct_col (int16_t * col, int offset)
+{
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+    col += offset;
+
+    x0 = col[0*8];
+    x1 = col[1*8];
+    x2 = col[2*8];
+    x3 = col[3*8];
+    x4 = col[4*8];
+    x5 = col[5*8];
+    x6 = col[6*8];
+    x7 = col[7*8];
+
+    u04 = S (x0 + x4);
+    v04 = S (x0 - x4);
+    u26 = S (F (T2, x6) + x2);
+    v26 = S (F (T2, x2) - x6);
+
+    a0 = S (u04 + u26);
+    a1 = S (v04 + v26);
+    a2 = S (v04 - v26);
+    a3 = S (u04 - u26);
+
+    u17 = S (F (T1, x7) + x1);
+    v17 = S (F (T1, x1) - x7);
+    u35 = S (F (T3, x5) + x3);
+    v35 = S (F (T3, x3) - x5);
+
+    b0 = S (u17 + u35);
+    b3 = S (v17 - v35);
+    u12 = S (u17 - u35);
+    v12 = S (v17 + v35);
+    u12 = S (2 * F (C4, u12));
+    v12 = S (2 * F (C4, v12));
+    b1 = S (u12 + v12);
+    b2 = S (u12 - v12);
+
+    y0 = S (a0 + b0) >> COL_SHIFT;
+    y1 = S (a1 + b1) >> COL_SHIFT;
+    y2 = S (a2 + b2) >> COL_SHIFT;
+    y3 = S (a3 + b3) >> COL_SHIFT;
+
+    y4 = S (a3 - b3) >> COL_SHIFT;
+    y5 = S (a2 - b2) >> COL_SHIFT;
+    y6 = S (a1 - b1) >> COL_SHIFT;
+    y7 = S (a0 - b0) >> COL_SHIFT;
+
+    col[0*8] = y0;
+    col[1*8] = y1;
+    col[2*8] = y2;
+    col[3*8] = y3;
+    col[4*8] = y4;
+    col[5*8] = y5;
+    col[6*8] = y6;
+    col[7*8] = y7;
+}
+#endif
+
+
+// MMX column IDCT
+//
+// Transforms four adjacent columns at once: every movq moves four int16_t
+// values starting at col[offset + k*8] for row k, and the results are written
+// back to the same locations after an arithmetic right shift by COL_SHIFT.
+// declare_idct calls it with offset 0 and offset 4 to cover all eight columns.
+//
+// Rows 3 and 5 of the block double as scratch storage for b3 and b0 while the
+// other terms are computed.  All additions and subtractions use the saturating
+// paddsw/psubsw forms.
+//
+// T3 (43790) does not fit in a signed 16-bit short, so _T3 holds the wrapped
+// value -- the register comments call it "T3-1" -- and the code adds x back
+// after pmulhw to obtain T3*x.  Likewise the C4 products are formed as half
+// values and doubled afterwards with paddsw.
+static inline void idct_col (int16_t * col, int offset)
+{
+#define T1 13036
+#define T2 27146
+#define T3 43790
+#define C4 23170
+
+    /* each constant replicated into all four 16-bit lanes of a quadword */
+    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+
+    /* column code adapted from peter gubanov */
+    /* http://www.elecard.com/peter/idct.shtml */
+
+    movq_m2r (*_T1, mm0);		// mm0 = T1
+
+    movq_m2r (*(col+offset+1*8), mm1);	// mm1 = x1
+    movq_r2r (mm0, mm2);		// mm2 = T1
+
+    movq_m2r (*(col+offset+7*8), mm4);	// mm4 = x7
+    pmulhw_r2r (mm1, mm0);		// mm0 = T1*x1
+
+    movq_m2r (*_T3, mm5);		// mm5 = T3
+    pmulhw_r2r (mm4, mm2);		// mm2 = T1*x7
+
+    movq_m2r (*(col+offset+5*8), mm6);	// mm6 = x5
+    movq_r2r (mm5, mm7);		// mm7 = T3-1
+
+    movq_m2r (*(col+offset+3*8), mm3);	// mm3 = x3
+    psubsw_r2r (mm4, mm0);		// mm0 = v17
+
+    movq_m2r (*_T2, mm4);		// mm4 = T2
+    pmulhw_r2r (mm3, mm5);		// mm5 = (T3-1)*x3
+
+    paddsw_r2r (mm2, mm1);		// mm1 = u17
+    pmulhw_r2r (mm6, mm7);		// mm7 = (T3-1)*x5
+
+    /* slot */
+
+    movq_r2r (mm4, mm2);		// mm2 = T2
+    paddsw_r2r (mm3, mm5);		// mm5 = T3*x3
+
+    pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2
+    paddsw_r2r (mm6, mm7);		// mm7 = T3*x5
+
+    psubsw_r2r (mm6, mm5);		// mm5 = v35
+    paddsw_r2r (mm3, mm7);		// mm7 = u35
+
+    movq_m2r (*(col+offset+6*8), mm3);	// mm3 = x6
+    movq_r2r (mm0, mm6);		// mm6 = v17
+
+    pmulhw_r2r (mm3, mm2);		// mm2 = T2*x6
+    psubsw_r2r (mm5, mm0);		// mm0 = b3
+
+    psubsw_r2r (mm3, mm4);		// mm4 = v26
+    paddsw_r2r (mm6, mm5);		// mm5 = v12
+
+    movq_r2m (mm0, *(col+offset+3*8));	// save b3 in scratch0
+    movq_r2r (mm1, mm6);		// mm6 = u17
+
+    paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26
+    paddsw_r2r (mm7, mm6);		// mm6 = b0
+
+    psubsw_r2r (mm7, mm1);		// mm1 = u12
+    movq_r2r (mm1, mm7);		// mm7 = u12
+
+    movq_m2r (*(col+offset+0*8), mm3);	// mm3 = x0
+    paddsw_r2r (mm5, mm1);		// mm1 = u12+v12
+
+    movq_m2r (*_C4, mm0);		// mm0 = C4/2
+    psubsw_r2r (mm5, mm7);		// mm7 = u12-v12
+
+    movq_r2m (mm6, *(col+offset+5*8));	// save b0 in scratch1
+    pmulhw_r2r (mm0, mm1);		// mm1 = b1/2
+
+    movq_r2r (mm4, mm6);		// mm6 = v26
+    pmulhw_r2r (mm0, mm7);		// mm7 = b2/2
+
+    movq_m2r (*(col+offset+4*8), mm5);	// mm5 = x4
+    movq_r2r (mm3, mm0);		// mm0 = x0
+
+    psubsw_r2r (mm5, mm3);		// mm3 = v04
+    paddsw_r2r (mm5, mm0);		// mm0 = u04
+
+    paddsw_r2r (mm3, mm4);		// mm4 = a1
+    movq_r2r (mm0, mm5);		// mm5 = u04
+
+    psubsw_r2r (mm6, mm3);		// mm3 = a2
+    paddsw_r2r (mm2, mm5);		// mm5 = a0
+
+    paddsw_r2r (mm1, mm1);		// mm1 = b1
+    psubsw_r2r (mm2, mm0);		// mm0 = a3
+
+    paddsw_r2r (mm7, mm7);		// mm7 = b2
+    movq_r2r (mm3, mm2);		// mm2 = a2
+
+    movq_r2r (mm4, mm6);		// mm6 = a1
+    paddsw_r2r (mm7, mm3);		// mm3 = a2+b2
+
+    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y2
+    paddsw_r2r (mm1, mm4);		// mm4 = a1+b1
+
+    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y1
+    psubsw_r2r (mm1, mm6);		// mm6 = a1-b1
+
+    movq_m2r (*(col+offset+5*8), mm1);	// mm1 = b0
+    psubsw_r2r (mm7, mm2);		// mm2 = a2-b2
+
+    psraw_i2r (COL_SHIFT, mm6);		// mm6 = y6
+    movq_r2r (mm5, mm7);		// mm7 = a0
+
+    movq_r2m (mm4, *(col+offset+1*8));	// save y1
+    psraw_i2r (COL_SHIFT, mm2);		// mm2 = y5
+
+    movq_r2m (mm3, *(col+offset+2*8));	// save y2
+    paddsw_r2r (mm1, mm5);		// mm5 = a0+b0
+
+    movq_m2r (*(col+offset+3*8), mm4);	// mm4 = b3
+    psubsw_r2r (mm1, mm7);		// mm7 = a0-b0
+
+    psraw_i2r (COL_SHIFT, mm5);		// mm5 = y0
+    movq_r2r (mm0, mm3);		// mm3 = a3
+
+    movq_r2m (mm2, *(col+offset+5*8));	// save y5
+    psubsw_r2r (mm4, mm3);		// mm3 = a3-b3
+
+    psraw_i2r (COL_SHIFT, mm7);		// mm7 = y7
+    paddsw_r2r (mm0, mm4);		// mm4 = a3+b3
+
+    movq_r2m (mm5, *(col+offset+0*8));	// save y0
+    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y4
+
+    movq_r2m (mm6, *(col+offset+6*8));	// save y6
+    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y3
+
+    movq_r2m (mm7, *(col+offset+7*8));	// save y7
+
+    movq_r2m (mm3, *(col+offset+4*8));	// save y4
+
+    movq_r2m (mm4, *(col+offset+3*8));	// save y3
+}
+
+
+/*
+ * Per-row rounding constants for the row pass.  Each table holds the same
+ * fixed-point value twice (see the rounder() macro) and is added to both
+ * 32-bit lanes by the row code.  The digit in the name is the row the table
+ * is paired with in declare_idct.  rounder0 also folds in
+ * (1 << (COL_SHIFT - 1)) - 0.5.
+ */
+static int32_t rounder0[] ATTR_ALIGN(8) =
+    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
+static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
+static int32_t rounder1[] ATTR_ALIGN(8) =
+    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
+static int32_t rounder7[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
+static int32_t rounder2[] ATTR_ALIGN(8) =
+    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
+static int32_t rounder6[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C2 * (C6-C2)/2 */
+static int32_t rounder3[] ATTR_ALIGN(8) =
+    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
+static int32_t rounder5[] ATTR_ALIGN(8) =
+    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
+
+
+/*
+ * declare_idct: generate a static inline function `idct (int16_t * block)`
+ * that transforms one 8x8 block in place.
+ *
+ *   idct           - name of the generated function
+ *   table          - coefficient-table initializer macro (mmx_table or
+ *                    mmxext_table)
+ *   idct_row_head, idct_row, idct_row_tail, idct_row_mid
+ *                  - the matching family of row-pass building blocks
+ *
+ * The generated function runs the row pass over rows 0, 4, 1, 7, 2, 6, 3, 5
+ * in that order.  Rows 0/4, 1/7, 2/6 and 3/5 share a coefficient table and
+ * each row has its own rounder.  idct_row_mid stores one row while loading
+ * the next.  The column pass then runs as two idct_col calls, at offsets 0
+ * and 4.
+ */
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
+static inline void idct (int16_t * block)				\
+{									\
+    static int16_t table04[] ATTR_ALIGN(16) =				\
+	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
+    static int16_t table17[] ATTR_ALIGN(16) =				\
+	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
+    static int16_t table26[] ATTR_ALIGN(16) =				\
+	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
+    static int16_t table35[] ATTR_ALIGN(16) =				\
+	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
+									\
+    idct_row_head (block, 0*8, table04);				\
+    idct_row (table04, rounder0);					\
+    idct_row_mid (block, 0*8, 4*8, table04);				\
+    idct_row (table04, rounder4);					\
+    idct_row_mid (block, 4*8, 1*8, table17);				\
+    idct_row (table17, rounder1);					\
+    idct_row_mid (block, 1*8, 7*8, table17);				\
+    idct_row (table17, rounder7);					\
+    idct_row_mid (block, 7*8, 2*8, table26);				\
+    idct_row (table26, rounder2);					\
+    idct_row_mid (block, 2*8, 6*8, table26);				\
+    idct_row (table26, rounder6);					\
+    idct_row_mid (block, 6*8, 3*8, table35);				\
+    idct_row (table35, rounder3);					\
+    idct_row_mid (block, 3*8, 5*8, table35);				\
+    idct_row (table35, rounder5);					\
+    idct_row_tail (block, 5*8);						\
+									\
+    idct_col (block, 0);						\
+    idct_col (block, 4);						\
+}
+
+
+/*
+ * One pipelined step of block_copy: load the eight coefficients at
+ * block+offset into r0/r1, advance dest by one row, store the row already
+ * packed in r2, then pack r0/r1 to eight unsigned bytes in r0.
+ * Uses `block`, `dest` and `stride` from the expanding scope.
+ */
+#define COPY_MMX(offset,r0,r1,r2)	\
+do {					\
+    movq_m2r (*(block+offset), r0);	\
+    dest += stride;			\
+    movq_m2r (*(block+offset+4), r1);	\
+    movq_r2m (r2, *dest);		\
+    packuswb_r2r (r1, r0);		\
+} while (0)
+
+/*
+ * Write the 8x8 block of 16-bit values to dest as 8x8 bytes, one row every
+ * `stride` bytes.  Values are packed with packuswb (unsigned saturation).
+ * Stores run one row behind the loads, so the last row is written after the
+ * final COPY_MMX.
+ */
+static void block_copy (int16_t * block, uint8_t * dest, int stride)
+{
+    movq_m2r (*(block+0*8), mm0);
+    movq_m2r (*(block+0*8+4), mm1);
+    movq_m2r (*(block+1*8), mm2);
+    packuswb_r2r (mm1, mm0);
+    movq_m2r (*(block+1*8+4), mm3);
+    movq_r2m (mm0, *dest);
+    packuswb_r2r (mm3, mm2);
+    COPY_MMX (2*8, mm0, mm1, mm2);
+    COPY_MMX (3*8, mm2, mm3, mm0);
+    COPY_MMX (4*8, mm0, mm1, mm2);
+    COPY_MMX (5*8, mm2, mm3, mm0);
+    COPY_MMX (6*8, mm0, mm1, mm2);
+    COPY_MMX (7*8, mm2, mm3, mm0);
+    movq_r2m (mm2, *(dest+stride));
+}
+
+
+/*
+ * One pipelined step of block_add: load the destination row two strides
+ * ahead into r1, pack and store the row already summed in r3/r4, advance
+ * dest by one row, then widen the loaded pixels against mm0 (zero) and add
+ * the coefficients at block+offset with signed saturation into r1/r2.
+ * Uses `block`, `dest` and `stride` from the expanding scope and relies on
+ * mm0 being zero.
+ */
+#define ADD_MMX(offset,r1,r2,r3,r4)	\
+do {					\
+    movq_m2r (*(dest+2*stride), r1);	\
+    packuswb_r2r (r4, r3);		\
+    movq_r2r (r1, r2);			\
+    dest += stride;			\
+    movq_r2m (r3, *dest);		\
+    punpcklbw_r2r (mm0, r1);		\
+    paddsw_m2r (*(block+offset), r1);	\
+    punpckhbw_r2r (mm0, r2);		\
+    paddsw_m2r (*(block+offset+4), r2);	\
+} while (0)
+
+/*
+ * Add the 8x8 block of 16-bit values to the 8x8 bytes at dest (one row every
+ * `stride` bytes) and store the sums back, packed with packuswb (unsigned
+ * saturation).  mm0 is cleared first and serves as the zero operand for the
+ * byte-to-word unpacks throughout.
+ */
+static void block_add (int16_t * block, uint8_t * dest, int stride)
+{
+    movq_m2r (*dest, mm1);
+    pxor_r2r (mm0, mm0);
+    movq_m2r (*(dest+stride), mm3);
+    movq_r2r (mm1, mm2);
+    punpcklbw_r2r (mm0, mm1);
+    movq_r2r (mm3, mm4);
+    paddsw_m2r (*(block+0*8), mm1);
+    punpckhbw_r2r (mm0, mm2);
+    paddsw_m2r (*(block+0*8+4), mm2);
+    punpcklbw_r2r (mm0, mm3);
+    paddsw_m2r (*(block+1*8), mm3);
+    packuswb_r2r (mm2, mm1);
+    punpckhbw_r2r (mm0, mm4);
+    movq_r2m (mm1, *dest);
+    paddsw_m2r (*(block+1*8+4), mm4);
+    ADD_MMX (2*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (3*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (4*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (5*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (6*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (7*8, mm3, mm4, mm1, mm2);
+    packuswb_r2r (mm4, mm3);
+    movq_r2m (mm3, *(dest+stride));
+}
+
+/*
+ * Clear one coefficient block: zero mm0, then write it as sixteen quadword
+ * stores, each covering four int16_t values (64 values in total).
+ */
+static inline void block_zero (int16_t * block)   {       
+  pxor_r2r (mm0, mm0);
+  movq_r2m (mm0, *(block+0*4));
+  movq_r2m (mm0, *(block+1*4));       
+  movq_r2m (mm0, *(block+2*4));       
+  movq_r2m (mm0, *(block+3*4));       
+  movq_r2m (mm0, *(block+4*4));       
+  movq_r2m (mm0, *(block+5*4));       
+  movq_r2m (mm0, *(block+6*4));       
+  movq_r2m (mm0, *(block+7*4));       
+  movq_r2m (mm0, *(block+8*4));       
+  movq_r2m (mm0, *(block+9*4));       
+  movq_r2m (mm0, *(block+10*4));       
+  movq_r2m (mm0, *(block+11*4));       
+  movq_r2m (mm0, *(block+12*4));       
+  movq_r2m (mm0, *(block+13*4));       
+  movq_r2m (mm0, *(block+14*4));       
+  movq_r2m (mm0, *(block+15*4));   
+}
+
+/* instantiate mmxext_idct () from the MMXEXT table layout and row helpers */
+declare_idct (mmxext_idct, mmxext_table,
+	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+
+/*
+ * MMXEXT entry point: transform `block` in place, write the result over the
+ * 8x8 byte area at dest (rows `stride` bytes apart), then clear the block.
+ */
+void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride)
+{
+    mmxext_idct (block);
+    block_copy (block, dest, stride);
+    block_zero (block);
+}
+
+/*
+ * MMXEXT entry point: transform `block` in place, add the result to the 8x8
+ * byte area at dest (rows `stride` bytes apart), then clear the block.
+ */
+void mpeg2_idct_add_mmxext (int16_t * block, uint8_t * dest, int stride)
+{
+    mmxext_idct (block);
+    block_add (block, dest, stride);
+    block_zero (block);
+}
+
+/* MMXEXT entry point: transform `block` in place; the block is not cleared. */
+void mpeg2_idct_mmxext (int16_t * block)
+{
+    mmxext_idct (block);
+}
+
+/* instantiate mmx_idct () from the plain-MMX table layout and row helpers */
+declare_idct (mmx_idct, mmx_table,
+	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+
+/*
+ * MMX entry point: transform `block` in place, write the result over the 8x8
+ * byte area at dest (rows `stride` bytes apart), then clear the block.
+ */
+void mpeg2_idct_copy_mmx (int16_t * block, uint8_t * dest, int stride)
+{
+    mmx_idct (block);
+    block_copy (block, dest, stride);
+    block_zero (block);
+}
+
+/*
+ * MMX entry point: transform `block` in place, add the result to the 8x8
+ * byte area at dest (rows `stride` bytes apart), then clear the block.
+ */
+void mpeg2_idct_add_mmx (int16_t * block, uint8_t * dest, int stride)
+{
+    mmx_idct (block);
+    block_add (block, dest, stride);
+    block_zero (block);
+}
+
+/* MMX entry point: transform `block` in place; the block is not cleared. */
+void mpeg2_idct_mmx (int16_t * block)
+{
+    mmx_idct (block);
+}
+
+/* Exported wrapper around block_zero (): clear the 64 coefficients of `block`. */
+void mpeg2_zero_block_mmx (int16_t * block)
+{
+    block_zero (block);
+}
+
+/*
+ * Keep bits 3-5 of a scan position and rotate its low three bits right by
+ * one (b2 b1 b0 -> b0 b2 b1).
+ */
+static inline int mmx_reorder_pos (int pos)
+{
+    return (pos & 0x38) | ((pos & 6) >> 1) | ((pos & 1) << 2);
+}
+
+/*
+ * The mmx/mmxext idct uses a reordered input, so every entry of both scan
+ * tables is rewritten in place to match that order.
+ */
+void mpeg2_idct_mmx_init (void)
+{
+    int n;
+
+    for (n = 0; n < 64; n++) {
+	mpeg2_scan_norm[n] = mmx_reorder_pos (mpeg2_scan_norm[n]);
+	mpeg2_scan_alt[n] = mmx_reorder_pos (mpeg2_scan_alt[n]);
+    }
+}
+
+#endif
diff --git a/src/video_dec/libmpeg2/libmpeg2_accel.c b/src/video_dec/libmpeg2/libmpeg2_accel.c
new file mode 100644
index 000000000..92c0e280b
--- /dev/null
+++ b/src/video_dec/libmpeg2/libmpeg2_accel.c
@@ -0,0 +1,223 @@
+/*
+ * libmpeg2_accel.c
+ * Copyright (C) 2004 The Unichrome Project.
+ * Copyright (C) 2005 Thomas Hellstrom.
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify it 
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
+ */
+
+#include <xine/xine_internal.h>
+#include "mpeg2.h"
+#include "mpeg2_internal.h"
+#include "xvmc_vld.h"
+#include "libmpeg2_accel.h"
+
+
+void 
+libmpeg2_accel_scan( mpeg2dec_accel_t *accel, uint8_t *scan_norm, uint8_t *scan_alt)
+{   /* accel, scan_norm and scan_alt are not used here; the call only runs the XvMC scan table setup */
+  xvmc_setup_scan_ptable();
+}
+
+
+int
+libmpeg2_accel_discontinuity(mpeg2dec_accel_t *accel, uint32_t frame_format, picture_t *picture)
+{
+  xine_xxmc_t *xxmc;
+
+  /* Reset the remembered slice code to -1. */
+  accel->xvmc_last_slice_code = -1;
+
+  /* Only a current XxMC frame can need flushing; always returns 0. */
+  if (!picture->current_frame || frame_format != XINE_IMGFMT_XXMC)
+    return 0;
+
+  xxmc = (xine_xxmc_t *) picture->current_frame->accel_data;
+  /* Flush only when acceleration equals exactly one of the three levels. */
+  if (xxmc->acceleration == XINE_XVMC_ACCEL_VLD ||
+      xxmc->acceleration == XINE_XVMC_ACCEL_IDCT ||
+      xxmc->acceleration == XINE_XVMC_ACCEL_MOCOMP)
+    xxmc->proc_xxmc_flush (picture->current_frame);
+
+  return 0;
+}
+
+int
+libmpeg2_accel_new_sequence(mpeg2dec_accel_t *accel, uint32_t frame_format, picture_t *picture)
+{
+  /*
+   * XxMC and XvMC: take the macroblocks member of the current frame's accel
+   * data as picture->mc and return 0. Every other format returns 1.
+   */
+  if (frame_format == XINE_IMGFMT_XXMC || frame_format == XINE_IMGFMT_XVMC) {
+    xine_xvmc_t *xvmc = (xine_xvmc_t *)
+      picture->current_frame->accel_data;
+
+    picture->mc = xvmc->macroblocks;
+    return 0;
+  }
+  return 1;
+}
+
+int
+libmpeg2_accel_new_frame(mpeg2dec_accel_t *accel, uint32_t frame_format,
+			 picture_t *picture, double ratio, uint32_t flags)
+{
+  xine_xxmc_t *xxmc;
+
+  /* Nothing to request unless an XxMC frame is current; always returns 0. */
+  if (!picture->current_frame || frame_format != XINE_IMGFMT_XXMC)
+    return 0;
+
+  xxmc = (xine_xxmc_t *) picture->current_frame->accel_data;
+
+  /*
+   * Request every acceleration type from the output plugin, with YV12 as
+   * the fallback format.
+   */
+  xxmc->fallback_format = XINE_IMGFMT_YV12;
+  xxmc->acceleration = XINE_XVMC_ACCEL_VLD | XINE_XVMC_ACCEL_IDCT |
+    XINE_XVMC_ACCEL_MOCOMP;
+
+  /*
+   * The standard MOCOMP / IDCT XvMC implementation is buggy for interlaced
+   * streams (inherited from the old XvMC driver). Until that is fixed,
+   * pictures whose picture_structure is not 3 may only request VLD.
+   */
+  if (picture->picture_structure != 3) {
+    picture->top_field_first = (picture->picture_structure == 1);
+    xxmc->acceleration &= ~(XINE_XVMC_ACCEL_IDCT | XINE_XVMC_ACCEL_MOCOMP);
+  }
+
+  xxmc->mpeg = picture->mpeg1 ? XINE_XVMC_MPEG_1 : XINE_XVMC_MPEG_2;
+  xxmc->proc_xxmc_update_frame (picture->current_frame->driver,
+				picture->current_frame,
+				picture->coded_picture_width,
+				picture->coded_picture_height,
+				ratio,
+				XINE_IMGFMT_XXMC, flags);
+
+  return 0;
+}
+
+void
+libmpeg2_accel_frame_completion(mpeg2dec_accel_t * accel, uint32_t frame_format, picture_t *picture,
+				int code)
+{
+  xine_xxmc_t *xxmc;
+
+  /*
+   * Complete the accelerated decode of the current frame. Only XxMC frames
+   * that are not yet marked decoded need any work.
+   */
+  if (!picture->current_frame || frame_format != XINE_IMGFMT_XXMC)
+    return;
+
+  xxmc = (xine_xxmc_t *) picture->current_frame->accel_data;
+
+  /* The frame's own format field must say XxMC as well. */
+  if (xxmc->decoded || picture->current_frame->format != XINE_IMGFMT_XXMC)
+    return;
+
+  if (xxmc->acceleration == XINE_XVMC_ACCEL_VLD) {
+    /* VLD frames are completed by the VLD code. */
+    mpeg2_xxmc_vld_frame_complete (accel, picture, code);
+  } else if (xxmc->acceleration == XINE_XVMC_ACCEL_IDCT ||
+	     xxmc->acceleration == XINE_XVMC_ACCEL_MOCOMP) {
+    /* A frame flagged bad stays undecoded; the flush happens either way. */
+    xxmc->decoded = !picture->current_frame->bad_frame;
+    xxmc->proc_xxmc_flush (picture->current_frame);
+  }
+
+  /* Any other acceleration value needs no completion step. */
+}
+
+
+int 
+libmpeg2_accel_slice(mpeg2dec_accel_t *accel, picture_t *picture, int code, char * buffer, 
+		     uint32_t chunk_size, uint8_t *chunk_buffer)
+{
+  /*
+   * Don't reference frames of other formats. They are invalid. This may happen if the
+   * xxmc plugin suddenly falls back to software decoding.
+   */
+  /* Returns 1 when the slice is skipped, 0 after it was handed to a slice decoder. */
+  if (( picture->current_frame->picture_coding_type == XINE_PICT_P_TYPE ) ||
+      ( picture->current_frame->picture_coding_type == XINE_PICT_B_TYPE )) {
+    if (! picture->forward_reference_frame) return 1;
+    if (picture->forward_reference_frame->format != picture->current_frame->format) {
+      picture->v_offset = 0;
+      return 1;
+    }
+  }
+  /* B pictures additionally need a backward reference frame of the same format. */
+  if ( picture->current_frame->picture_coding_type == XINE_PICT_B_TYPE ) {
+    if (! picture->backward_reference_frame) return 1;
+    if (picture->backward_reference_frame->format != picture->current_frame->format) {
+      picture->v_offset = 0;
+      return 1;
+    }
+  }
+  /* Dispatch on the format of the frame being decoded. */
+  switch( picture->current_frame->format ) {
+
+  case XINE_IMGFMT_XXMC:
+    {
+      xine_xxmc_t *xxmc = (xine_xxmc_t *) 
+	picture->current_frame->accel_data;
+      /* A non-zero result from proc_xxmc_lock_valid () means this slice is skipped. */
+      if ( xxmc->proc_xxmc_lock_valid( picture->current_frame,
+				       picture->forward_reference_frame,
+				       picture->backward_reference_frame,
+				       picture->current_frame->picture_coding_type)) {
+	picture->v_offset = 0;
+	return 1;
+      }
+      /* format is tested again after the lock call; presumably it can change there - TODO confirm */
+      switch(picture->current_frame->format) {
+      case XINE_IMGFMT_XXMC:
+	switch(xxmc->acceleration) {
+	case XINE_XVMC_ACCEL_VLD:
+	  mpeg2_xxmc_slice(accel, picture, code, buffer, chunk_size, chunk_buffer);
+	  break;
+	case XINE_XVMC_ACCEL_IDCT:
+	case XINE_XVMC_ACCEL_MOCOMP:
+	  mpeg2_xvmc_slice (accel, picture, code, buffer);
+	  break;
+	default:
+	  mpeg2_slice (picture, code, buffer);
+	  break;
+	}
+	break;
+      default:
+	mpeg2_slice (picture, code, buffer);
+	break;
+      }
+      xxmc->proc_xxmc_unlock(picture->current_frame->driver);	/* reached on every path that passed the lock test above */
+      break;
+    }
+
+  case XINE_IMGFMT_XVMC:
+    mpeg2_xvmc_slice (accel, picture, code, buffer);
+    break;
+
+  default:
+    mpeg2_slice (picture, code, buffer);
+    break;
+  }
+  return 0;
+}
diff --git a/src/video_dec/libmpeg2/libmpeg2_accel.h b/src/video_dec/libmpeg2/libmpeg2_accel.h
new file mode 100644
index 000000000..5d0b37a78
--- /dev/null
+++ b/src/video_dec/libmpeg2/libmpeg2_accel.h
@@ -0,0 +1,48 @@
+/*
+ * libmpeg2_accel.h
+ * Copyright (C) 2004 The Unichrome Project.
+ * Copyright (C) 2005 Thomas Hellstrom.
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify it 
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
+ */
+
+#ifndef LIBMPEG2_ACCEL_H
+#define LIBMPEG2_ACCEL_H
+
+#include "mpeg2_internal.h"
+
+/*
+ * Internal context data type.
+ */
+
+typedef struct {
+  int xvmc_last_slice_code;	/* set to -1 by libmpeg2_accel_discontinuity () */
+  int slices_per_row;
+  int row_slice_count;
+  unsigned xxmc_mb_pic_height;
+} mpeg2dec_accel_t;	/* passed as "accel" to every libmpeg2_accel_* function declared below */
+
+extern int libmpeg2_accel_discontinuity(mpeg2dec_accel_t *accel, uint32_t frame_format, picture_t *picture);
+extern int libmpeg2_accel_new_sequence(mpeg2dec_accel_t *accel, uint32_t frame_format, picture_t *picture);
+extern int libmpeg2_accel_new_frame(mpeg2dec_accel_t *accel, uint32_t frame_format, picture_t *picture, double ratio, uint32_t flags);
+extern void libmpeg2_accel_frame_completion(mpeg2dec_accel_t *accel, uint32_t frame_format, picture_t *picture, int code);
+/* libmpeg2_accel_slice () returns 1 when the slice was skipped and 0 after it was passed to a slice decoder */
+extern int libmpeg2_accel_slice(mpeg2dec_accel_t *accel, picture_t *picture, int code, 
+				char * buffer, uint32_t chunk_size, uint8_t *chunk_buffer);
+extern void libmpeg2_accel_scan( mpeg2dec_accel_t *accel, uint8_t *scan_norm, uint8_t *scan_alt);
+
+#endif
diff --git a/src/video_dec/libmpeg2/motion_comp.c b/src/video_dec/libmpeg2/motion_comp.c
new file mode 100644
index 000000000..4a324b8ea
--- /dev/null
+++ b/src/video_dec/libmpeg2/motion_comp.c
@@ -0,0 +1,154 @@
+/*
+ * motion_comp.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+
+mpeg2_mc_t mpeg2_mc;	/* assigned by mpeg2_mc_init () from one of the mpeg2_mc_<impl> objects */
+
+void mpeg2_mc_init (uint32_t mm_accel)
+{   /* Point mpeg2_mc at the best implementation mm_accel allows: arch SIMD, then mediaLib, then plain C. */
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+    if (mm_accel & MM_ACCEL_X86_MMXEXT) {
+#ifdef LOG
+	fprintf (stderr, "Using MMXEXT for motion compensation\n");
+#endif
+	mpeg2_mc = mpeg2_mc_mmxext;
+    } else if (mm_accel & MM_ACCEL_X86_3DNOW) {
+#ifdef LOG
+	fprintf (stderr, "Using 3DNOW for motion compensation\n");
+#endif
+	mpeg2_mc = mpeg2_mc_3dnow;
+    } else if (mm_accel & MM_ACCEL_X86_MMX) {
+#ifdef LOG
+	fprintf (stderr, "Using MMX for motion compensation\n");
+#endif
+	mpeg2_mc = mpeg2_mc_mmx;
+    } else
+#endif
+#if defined (ARCH_PPC) && defined (ENABLE_ALTIVEC)
+    if (mm_accel & MM_ACCEL_PPC_ALTIVEC) {
+#ifdef LOG
+	fprintf (stderr, "Using altivec for motion compensation\n");
+#endif
+	mpeg2_mc = mpeg2_mc_altivec;
+    } else
+#endif
+#if defined(ARCH_SPARC) && defined(ENABLE_VIS)
+    if (mm_accel & MM_ACCEL_SPARC_VIS) {
+#ifdef LOG
+	fprintf (stderr, "Using VIS for motion compensation\n");
+#endif
+	mpeg2_mc = mpeg2_mc_vis;
+    } else
+#endif
+#ifdef LIBMPEG2_MLIB
+    /* Must be a link of the else chain: as a free-standing test its choice was always overwritten below. */
+    if (mm_accel & MM_ACCEL_MLIB) {
+#ifdef LOG
+	fprintf (stderr, "Using mediaLib for motion compensation\n");
+#endif
+	mpeg2_mc = mpeg2_mc_mlib;
+    } else
+#endif
+    {
+#ifdef LOG
+	fprintf (stderr, "No accelerated motion compensation found\n");
+#endif
+	mpeg2_mc = mpeg2_mc_c;
+    }
+}
+
+#define avg2(a,b) (((a)+(b)+1)>>1)
+#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
+/* predictors: o = sample itself, x / y = rounded mean with the right / lower neighbour, xy = rounded mean of the 2x2 group */
+#define predict_o(i) (ref[(i)])
+#define predict_x(i) (avg2 (ref[(i)], ref[(i)+1]))
+#define predict_y(i) (avg2 (ref[(i)], (ref+stride)[(i)]))
+#define predict_xy(i) (avg4 (ref[(i)], ref[(i)+1], \
+			     (ref+stride)[(i)], (ref+stride)[(i)+1]))
+/* put stores the prediction; avg stores its rounded mean with what dest already holds */
+#define put(predictor,i) dest[(i)] = predictor (i)
+#define avg(predictor,i) dest[(i)] = avg2 (predictor (i), dest[(i)])
+
+/*
+ * mc function template
+ *
+ * MC_FUNC (op, xy) emits two static functions, MC_<op>_<xy>_16_c and
+ * MC_<op>_<xy>_8_c. Each applies op with predictor predict_<xy> to the
+ * first 16 resp. 8 pixels of a row, then advances ref and dest by stride.
+ *
+ * The row loop is do { ... } while (--height): the body runs before
+ * height is tested, so callers must pass a height of at least 1.
+ */
+
+/* one run of eight pixels: columns o .. o+7 of the current row, in order */
+#define MC_ROW8(op,xy,o)						\
+    op (predict_##xy, (o) + 0);						\
+    op (predict_##xy, (o) + 1);						\
+    op (predict_##xy, (o) + 2);						\
+    op (predict_##xy, (o) + 3);						\
+    op (predict_##xy, (o) + 4);						\
+    op (predict_##xy, (o) + 5);						\
+    op (predict_##xy, (o) + 6);						\
+    op (predict_##xy, (o) + 7)
+
+#define MC_FUNC(op,xy)							\
+static void MC_##op##_##xy##_16_c (uint8_t * dest, uint8_t * ref,	\
+				 int stride, int height)		\
+{									\
+    do {								\
+	MC_ROW8 (op, xy, 0);						\
+	MC_ROW8 (op, xy, 8);						\
+	ref += stride;							\
+	dest += stride;							\
+    } while (--height);							\
+}									\
+static void MC_##op##_##xy##_8_c (uint8_t * dest, uint8_t * ref,	\
+				int stride, int height)			\
+{									\
+    do {								\
+	MC_ROW8 (op, xy, 0);						\
+	ref += stride;							\
+	dest += stride;							\
+    } while (--height);							\
+}
+/* end of mc function template */
+
+/* definitions of the actual mc functions: one 16-wide and one 8-wide function per (op, predictor) pair */
+
+MC_FUNC (put,o)
+MC_FUNC (avg,o)
+MC_FUNC (put,x)
+MC_FUNC (avg,x)
+MC_FUNC (put,y)
+MC_FUNC (avg,y)
+MC_FUNC (put,xy)
+MC_FUNC (avg,xy)
+/* presumably gathers the sixteen MC_*_c functions above into mpeg2_mc_c - TODO confirm against MPEG2_MC_EXTERN */
+MPEG2_MC_EXTERN (c)
diff --git a/src/video_dec/libmpeg2/motion_comp_altivec.c b/src/video_dec/libmpeg2/motion_comp_altivec.c
new file mode 100644
index 000000000..99719b7fb
--- /dev/null
+++ b/src/video_dec/libmpeg2/motion_comp_altivec.c
@@ -0,0 +1,2031 @@
+/*
+ * motion_comp_altivec.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifndef HOST_OS_DARWIN
+
+#if defined (ARCH_PPC) && defined (ENABLE_ALTIVEC)
+
+#include "mpeg2_internal.h"
+
+#include <inttypes.h>
+
+/*
+ * The asm code is generated with:
+ *
+ * gcc-2.95 -fvec -DHOST_OS_DARWIN -O9 -fomit-frame-pointer -mregnames -S
+ *      motion_comp_altivec.c
+ *
+ * sed 's/.L/._L/g' motion_comp_altivec.s |
+ * awk '{args=""; len=split ($2, arg, ",");
+ *      for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
+ *                               args = args sprintf ("%-6s", a) }
+ *      printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' |
+ * unexpand -a
+ */
+
+static void MC_put_o_16_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* 16-wide put, no interpolation: each source row is realigned (lvsl/vperm) and written with stvx */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	srawi		%r6,  %r6,  1		\n"
+	"	li		%r9,  15		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	lvsl		%v12, 0,    %r4		\n"
+	"	mtctr		%r6			\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	add		%r0,  %r5,  %r5		\n"
+	"	vperm		%v13, %v1,  %v0,  %v12	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"._L6:						\n"
+	"	li		%r9,  15		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	stvx		%v13, 0,    %r3		\n"
+	"	vperm		%v13, %v1,  %v0,  %v12	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	stvx		%v13, %r5,  %r3		\n"
+	"	vperm		%v13, %v1,  %v0,  %v12	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	add		%r3,  %r3,  %r0		\n"
+	"	bdnz		._L6			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	stvx		%v13, 0,    %r3		\n"
+	"	vperm		%v13, %v1,  %v0,  %v12	\n"
+	"	stvx		%v13, %r5,  %r3		\n"
+	 );
+}
+
+static void MC_put_o_8_altivec (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 8-wide put, no interpolation: each row is written as two stvewx words; v10 and v11 are the permutes for alternating rows */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v12, 0,    %r4		\n"
+	"	lvsl		%v1,  %r5,  %r4		\n"
+	"	vmrghb		%v12, %v12, %v12	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	li		%r9,  7			\n"
+	"	vmrghb		%v1,  %v1,  %v1		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vpkuhum		%v10, %v12, %v12	\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	mtctr		%r6			\n"
+	"	vpkuhum		%v11, %v1,  %v1		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v12, %v13, %v0,  %v10	\n"
+	"._L11:						\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	stvewx		%v12, 0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v1,  %v13, %v0,  %v11	\n"
+	"	stvewx		%v12, %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	vperm		%v12, %v13, %v0,  %v10	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	bdnz		._L11			\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	stvewx		%v12, 0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v1,  %v13, %v0,  %v11	\n"
+	"	stvewx		%v12, %r9,  %r3		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	 );
+}
+
+static void MC_put_x_16_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* 16-wide put, x predictor: each output row is the vavgub of the source row and the same row taken one byte further right */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v11, 0,    %r4		\n"
+	"	vspltisb	%v0,  1			\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	vaddubm		%v10, %v11, %v0		\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v1,  %v12, %v13, %v10	\n"
+	"	vperm		%v0,  %v12, %v13, %v11	\n"
+	"	mtctr		%r6			\n"
+	"	add		%r0,  %r5,  %r5		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"._L16:						\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	stvx		%v0,  0,    %r3		\n"
+	"	vperm		%v1,  %v12, %v13, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v0,  %v12, %v13, %v11	\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	stvx		%v0,  %r5,  %r3		\n"
+	"	vperm		%v1,  %v12, %v13, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v0,  %v12, %v13, %v11	\n"
+	"	add		%r3,  %r3,  %r0		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	bdnz		._L16			\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	stvx		%v0,  0,    %r3		\n"
+	"	vperm		%v1,  %v12, %v13, %v10	\n"
+	"	vperm		%v0,  %v12, %v13, %v11	\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	stvx		%v0,  %r5,  %r3		\n"
+	 );
+}
+
+static void MC_put_x_8_altivec (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 8-wide put, x predictor: vavgub of the source row and the row one byte further right, written as two stvewx words per row */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v0,  0,    %r4		\n"
+	"	vspltisb	%v13, 1			\n"
+	"	lvsl		%v10, %r5,  %r4		\n"
+	"	vmrghb		%v0,  %v0,  %v0		\n"
+	"	li		%r9,  8			\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	vmrghb		%v10, %v10, %v10	\n"
+	"	vpkuhum		%v8,  %v0,  %v0		\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vpkuhum		%v9,  %v10, %v10	\n"
+	"	vaddubm		%v7,  %v8,  %v13	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v1,  %v11, %v12, %v8	\n"
+	"	mtctr		%r6			\n"
+	"	vaddubm		%v13, %v9,  %v13	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v0,  %v11, %v12, %v7	\n"
+	"	vavgub		%v0,  %v1,  %v0		\n"
+	"._L21:						\n"
+	"	li		%r9,  8			\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	stvewx		%v0,  0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v1,  %v11, %v12, %v13	\n"
+	"	stvewx		%v0,  %r9,  %r3		\n"
+	"	vperm		%v0,  %v11, %v12, %v9	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	li		%r9,  8			\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	vavgub		%v10, %v0,  %v1		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	stvewx		%v10, 0,    %r3		\n"
+	"	vperm		%v1,  %v11, %v12, %v7	\n"
+	"	vperm		%v0,  %v11, %v12, %v8	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v10, %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	bdnz		._L21			\n"
+	"	li		%r9,  8			\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	stvewx		%v0,  0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v1,  %v11, %v12, %v13	\n"
+	"	stvewx		%v0,  %r9,  %r3		\n"
+	"	vperm		%v0,  %v11, %v12, %v9	\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v10, %v0,  %v1		\n"
+	"	stvewx		%v10, 0,    %r3		\n"
+	"	stvewx		%v10, %r9,  %r3		\n"
+	 );
+}
+
+static void MC_put_y_16_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* 16-wide put, y predictor: each output row is the vavgub of a source row and the source row after it (v12 / v11 hold the pair) */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	li		%r9,  15		\n"
+	"	lvsl		%v10, 0,    %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v12, %v13, %v1,  %v10	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v11, %v13, %v1,  %v10	\n"
+	"	mtctr		%r6			\n"
+	"	add		%r0,  %r5,  %r5		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v12, %v11	\n"
+	"._L26:						\n"
+	"	li		%r9,  15		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	stvx		%v0,  0,    %r3		\n"
+	"	vperm		%v12, %v13, %v1,  %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	vavgub		%v0,  %v12, %v11	\n"
+	"	stvx		%v0,  %r5,  %r3		\n"
+	"	vperm		%v11, %v13, %v1,  %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	add		%r3,  %r3,  %r0		\n"
+	"	vavgub		%v0,  %v12, %v11	\n"
+	"	bdnz		._L26			\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	stvx		%v0,  0,    %r3		\n"
+	"	vperm		%v12, %v13, %v1,  %v10	\n"
+	"	vavgub		%v0,  %v12, %v11	\n"
+	"	stvx		%v0,  %r5,  %r3		\n"
+	 );
+}
+
+static void MC_put_y_8_altivec (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 8-wide put, y predictor: vavgub of a source row and the row after it, written as two stvewx words; v9 and v10 are the permutes for alternating rows */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v13, 0,    %r4		\n"
+	"	lvsl		%v11, %r5,  %r4		\n"
+	"	vmrghb		%v13, %v13, %v13	\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	vmrghb		%v11, %v11, %v11	\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	vpkuhum		%v9,  %v13, %v13	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vpkuhum		%v10, %v11, %v11	\n"
+	"	vperm		%v13, %v12, %v1,  %v9	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v11, %v12, %v1,  %v10	\n"
+	"	mtctr		%r6			\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v13, %v11	\n"
+	"._L31:						\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	stvewx		%v0,  0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v13, %v12, %v1,  %v9	\n"
+	"	stvewx		%v0,  %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v13, %v11	\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	stvewx		%v0,  0,    %r3		\n"
+	"	vperm		%v11, %v12, %v1,  %v10	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v0,  %r9,  %r3		\n"
+	"	vavgub		%v0,  %v13, %v11	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	bdnz		._L31			\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v12, 0,    %r4		\n"
+	"	stvewx		%v0,  0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v13, %v12, %v1,  %v9	\n"
+	"	stvewx		%v0,  %r9,  %r3		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v0,  %v13, %v11	\n"
+	"	stvewx		%v0,  0,    %r3		\n"
+	"	stvewx		%v0,  %r9,  %r3		\n"
+	 );
+}
+
+static void MC_put_xy_16_altivec (uint8_t * dest, uint8_t * ref,
+				  int stride, int height)
+{   /* 16-wide put, xy predictor: vavgub of the horizontal averages of two consecutive rows, minus a one-bit correction (vxor/vor/vand/vsububm), presumably to match (a+b+c+d+2)>>2 */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v5,  0,    %r4		\n"
+	"	vspltisb	%v3,  1			\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	vaddubm		%v4,  %v5,  %v3		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v10, %v1,  %v0,  %v4	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	mtctr		%r6			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	add		%r0,  %r5,  %r5		\n"
+	"	vperm		%v10, %v1,  %v0,  %v4	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	vxor		%v6,  %v11, %v10	\n"
+	"	vavgub		%v7,  %v11, %v10	\n"
+	"	vor		%v0,  %v8,  %v6		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vand		%v0,  %v3,  %v0		\n"
+	"	vavgub		%v1,  %v9,  %v7		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vsububm		%v13, %v1,  %v0		\n"
+	"._L36:						\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	stvx		%v13, 0,    %r3		\n"
+	"	vperm		%v10, %v1,  %v0,  %v4	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v10, %v1,  %v0,  %v4	\n"
+	"	vavgub		%v12, %v9,  %v7		\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	vor		%v13, %v8,  %v6		\n"
+	"	vxor		%v0,  %v9,  %v7		\n"
+	"	vxor		%v6,  %v11, %v10	\n"
+	"	vand		%v13, %v3,  %v13	\n"
+	"	vavgub		%v7,  %v11, %v10	\n"
+	"	vor		%v1,  %v8,  %v6		\n"
+	"	vand		%v13, %v13, %v0		\n"
+	"	vxor		%v0,  %v9,  %v7		\n"
+	"	vand		%v1,  %v3,  %v1		\n"
+	"	vsububm		%v13, %v12, %v13	\n"
+	"	vand		%v1,  %v1,  %v0		\n"
+	"	stvx		%v13, %r5,  %r3		\n"
+	"	vavgub		%v0,  %v9,  %v7		\n"
+	"	add		%r3,  %r3,  %r0		\n"
+	"	vsububm		%v13, %v0,  %v1		\n"
+	"	bdnz		._L36			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	stvx		%v13, 0,    %r3		\n"
+	"	vperm		%v10, %v1,  %v0,  %v4	\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vor		%v0,  %v8,  %v6		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vand		%v0,  %v3,  %v0		\n"
+	"	vavgub		%v1,  %v9,  %v7		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vsububm		%v13, %v1,  %v0		\n"
+	"	stvx		%v13, %r5,  %r3		\n"
+	 );
+}
+
+static void MC_put_xy_8_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* 8-wide put, xy predictor: same averaging and one-bit correction as the 16-wide version, written as two stvewx words; v4/v2 and v5/v19 are the permute pairs for alternating rows */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v4,  0,    %r4		\n"
+	"	vspltisb	%v3,  1			\n"
+	"	lvsl		%v5,  %r5,  %r4		\n"
+	"	vmrghb		%v4,  %v4,  %v4		\n"
+	"	li		%r9,  16		\n"
+	"	vmrghb		%v5,  %v5,  %v5		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	vpkuhum		%v4,  %v4,  %v4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	vpkuhum		%v5,  %v5,  %v5		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vaddubm		%v2,  %v4,  %v3		\n"
+	"	vperm		%v11, %v1,  %v0,  %v4	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vaddubm		%v19, %v5,  %v3		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v10, %v1,  %v0,  %v2	\n"
+	"	mtctr		%r6			\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v10, %v1,  %v0,  %v19	\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	vxor		%v6,  %v11, %v10	\n"
+	"	vavgub		%v7,  %v11, %v10	\n"
+	"	vor		%v0,  %v8,  %v6		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vand		%v0,  %v3,  %v0		\n"
+	"	vavgub		%v1,  %v9,  %v7		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vsububm		%v13, %v1,  %v0		\n"
+	"._L41:						\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v10, %v1,  %v0,  %v2	\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	"	vperm		%v11, %v1,  %v0,  %v4	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	li		%r9,  16		\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	vavgub		%v12, %v9,  %v7		\n"
+	"	vor		%v13, %v8,  %v6		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vperm		%v10, %v1,  %v0,  %v19	\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v11, %v1,  %v0,  %v5	\n"
+	"	vand		%v13, %v3,  %v13	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vxor		%v0,  %v9,  %v7		\n"
+	"	vxor		%v6,  %v11, %v10	\n"
+	"	vavgub		%v7,  %v11, %v10	\n"
+	"	vor		%v1,  %v8,  %v6		\n"
+	"	vand		%v13, %v13, %v0		\n"
+	"	vxor		%v0,  %v9,  %v7		\n"
+	"	vand		%v1,  %v3,  %v1		\n"
+	"	vsububm		%v13, %v12, %v13	\n"
+	"	vand		%v1,  %v1,  %v0		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	vavgub		%v0,  %v9,  %v7		\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vsububm		%v13, %v0,  %v1		\n"
+	"	bdnz		._L41			\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	vperm		%v10, %v1,  %v0,  %v2	\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v11, %v1,  %v0,  %v4	\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vor		%v0,  %v8,  %v6		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vand		%v0,  %v3,  %v0		\n"
+	"	vavgub		%v1,  %v9,  %v7		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vsububm		%v13, %v1,  %v0		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	 );
+}
+
+static void MC_avg_o_16_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* 16-wide avg, no interpolation: each output row is the vavgub of the realigned source row and the row already at the destination (lvx from r3) */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	li		%r9,  15		\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvsl		%v11, 0,    %r4		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v0,  %v1,  %v0,  %v11	\n"
+	"	lvx		%v13, 0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	add		%r9,  %r5,  %r5		\n"
+	"	vavgub		%v12, %v13, %v0		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"._L46:						\n"
+	"	li		%r11, 15		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r11, %r4		\n"
+	"	lvx		%v13, %r5,  %r3		\n"
+	"	vperm		%v0,  %v1,  %v0,  %v11	\n"
+	"	stvx		%v12, 0,    %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v12, %v13, %v0		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v0,  %r11, %r4		\n"
+	"	lvx		%v13, %r9,  %r3		\n"
+	"	vperm		%v0,  %v1,  %v0,  %v11	\n"
+	"	stvx		%v12, %r5,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v12, %v13, %v0		\n"
+	"	add		%r3,  %r3,  %r9		\n"
+	"	bdnz		._L46			\n"
+	"	lvx		%v0,  %r11, %r4		\n"
+	"	lvx		%v1,  0,    %r4		\n"
+	"	lvx		%v13, %r5,  %r3		\n"
+	"	vperm		%v0,  %v1,  %v0,  %v11	\n"
+	"	stvx		%v12, 0,    %r3		\n"
+	"	vavgub		%v12, %v13, %v0		\n"
+	"	stvx		%v12, %r5,  %r3		\n"
+	 );
+}
+
+static void MC_avg_o_8_altivec (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 8-wide avg, no interpolation: vavgub of the source row and the row already at the destination, written as two stvewx words; v9 and v10 are the permutes for alternating rows */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v12, 0,    %r4		\n"
+	"	li		%r9,  7			\n"
+	"	vmrghb		%v12, %v12, %v12	\n"
+	"	lvsl		%v1,  %r5,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	vpkuhum		%v9,  %v12, %v12	\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vmrghb		%v1,  %v1,  %v1		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v0,  %v13, %v0,  %v9	\n"
+	"	lvx		%v11, 0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	vpkuhum		%v10, %v1,  %v1		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v12, %v11, %v0		\n"
+	"._L51:						\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v11, %r5,  %r3		\n"
+	"	stvewx		%v12, 0,    %r3		\n"
+	"	vperm		%v0,  %v13, %v0,  %v10	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v12, %r9,  %r3		\n"
+	"	vavgub		%v1,  %v11, %v0		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v11, %r5,  %r3		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	vperm		%v0,  %v13, %v0,  %v9	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	"	vavgub		%v12, %v11, %v0		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	bdnz		._L51			\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v0,  %r9,  %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v11, %r5,  %r3		\n"
+	"	stvewx		%v12, 0,    %r3		\n"
+	"	vperm		%v0,  %v13, %v0,  %v10	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v12, %r9,  %r3		\n"
+	"	vavgub		%v1,  %v11, %v0		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	 );
+}
+
+static void MC_avg_x_16_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* 16-wide avg, x predictor: the source row is vavgub-ed with itself one byte further right, and that result with the row already at the destination */
+    asm ("						\n"	/* no operand list: r3-r6 are read as they arrive, presumably dest, ref, stride, height - TODO confirm; two rows per loop pass */
+	"	lvsl		%v8,  0,    %r4		\n"
+	"	vspltisb	%v0,  1			\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	vaddubm		%v7,  %v8,  %v0		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vperm		%v1,  %v11, %v12, %v7	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v0,  %v11, %v12, %v8	\n"
+	"	lvx		%v9,  0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	add		%r9,  %r5,  %r5		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	"._L56:						\n"
+	"	li		%r11, 16		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v12, %r11, %r4		\n"
+	"	lvx		%v9,  %r5,  %r3		\n"
+	"	stvx		%v10, 0,    %r3		\n"
+	"	vperm		%v0,  %v11, %v12, %v7	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v1,  %v11, %v12, %v8	\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v12, %r11, %r4		\n"
+	"	vavgub		%v1,  %v1,  %v0		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v13, %v11, %v12, %v7	\n"
+	"	vavgub		%v10, %v9,  %v1		\n"
+	"	vperm		%v0,  %v11, %v12, %v8	\n"
+	"	lvx		%v9,  %r9,  %r3		\n"
+	"	stvx		%v10, %r5,  %r3		\n"
+	"	vavgub		%v0,  %v0,  %v13	\n"
+	"	add		%r3,  %r3,  %r9		\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	"	bdnz		._L56			\n"
+	"	lvx		%v12, %r11, %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v9,  %r5,  %r3		\n"
+	"	vperm		%v1,  %v11, %v12, %v7	\n"
+	"	stvx		%v10, 0,    %r3		\n"
+	"	vperm		%v0,  %v11, %v12, %v8	\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	"	stvx		%v10, %r5,  %r3		\n"
+	 );
+}
+
+/*
+ * Hand-scheduled AltiVec assembly: 8-byte-wide variant of the horizontal
+ * half-sample average.  For each row two views of the source, one byte
+ * apart, are extracted from the loads at %r4 + 0 and %r4 + 8, averaged
+ * together, averaged again with the vector loaded from %r3, and written
+ * back with two word stores (stvewx at offsets 0 and 4).
+ *
+ * The asm block has no operand or clobber list and names %r3-%r6 directly.
+ * NOTE(review): presumably these hold dest, ref, stride and height as
+ * passed in by the PowerPC calling convention -- confirm before changing
+ * the prototype or letting the compiler inline this function.
+ *
+ * Permute vectors: %v7 / %v6 (= %v7 + 1) are derived from %r4 and
+ * %v8 / %v13 (= %v8 + 1) from %r4 + %r5; the two pairs are used on
+ * alternating rows.  CTR is loaded with (%r6 >> 1) - 1; every loop pass
+ * stores two rows and the code after bdnz stores the final two.
+ */
+static void MC_avg_x_8_altivec (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{
+    asm ("						\n"
+	"	lvsl		%v10, 0,    %r4		\n"
+	"	vspltisb	%v13, 1			\n"
+	"	li		%r9,  8			\n"
+	"	vmrghb		%v10, %v10, %v10	\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	vpkuhum		%v7,  %v10, %v10	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	lvsl		%v10, %r5,  %r4		\n"
+	"	vaddubm		%v6,  %v7,  %v13	\n"
+	"	vperm		%v0,  %v11, %v12, %v7	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vmrghb		%v10, %v10, %v10	\n"
+	"	lvx		%v9,  0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	vperm		%v1,  %v11, %v12, %v6	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vpkuhum		%v8,  %v10, %v10	\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	vaddubm		%v13, %v8,  %v13	\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	/* main loop: two rows stored per pass */
+	"._L61:						\n"
+	"	li		%r9,  8			\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v9,  %r5,  %r3		\n"
+	"	stvewx		%v10, 0,    %r3		\n"
+	"	vperm		%v1,  %v11, %v12, %v13	\n"
+	"	vperm		%v0,  %v11, %v12, %v8	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v10, %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	li		%r9,  8			\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vperm		%v1,  %v11, %v12, %v6	\n"
+	"	lvx		%v9,  %r5,  %r3		\n"
+	"	vperm		%v0,  %v11, %v12, %v7	\n"
+	"	stvewx		%v10, 0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	stvewx		%v10, %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	"	bdnz		._L61			\n"
+	/* epilogue: one more source row is loaded, the last two rows are stored */
+	"	li		%r9,  8			\n"
+	"	lvx		%v12, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v9,  %r5,  %r3		\n"
+	"	vperm		%v1,  %v11, %v12, %v13	\n"
+	"	stvewx		%v10, 0,    %r3		\n"
+	"	vperm		%v0,  %v11, %v12, %v8	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v10, %r9,  %r3		\n"
+	"	vavgub		%v0,  %v0,  %v1		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v10, %v9,  %v0		\n"
+	"	stvewx		%v10, 0,    %r3		\n"
+	"	stvewx		%v10, %r9,  %r3		\n"
+	 );
+}
+
+/*
+ * Hand-scheduled AltiVec assembly: vertical half-sample average, 16 bytes
+ * wide.  Each output row is the vavgub of two consecutive source rows
+ * (held alternately in %v11 and %v10, each assembled from the loads at
+ * %r4 + 0 and %r4 + 15 through permute vector %v9), averaged again with
+ * the vector loaded from %r3 and stored back with a full-vector stvx.
+ *
+ * The asm block has no operand or clobber list and names %r3-%r6 directly.
+ * NOTE(review): presumably these hold dest, ref, stride and height as
+ * passed in by the PowerPC calling convention -- confirm before changing
+ * the prototype or letting the compiler inline this function.
+ *
+ * CTR is loaded with (%r6 >> 1) - 1; every loop pass stores two rows
+ * (offsets 0 and %r5 from %r3, then %r3 += 2 * %r5) and the code after
+ * bdnz stores the final two.  %r11 (= 15) is set inside the loop and
+ * reused by the epilogue.
+ */
+static void MC_avg_y_16_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{
+    asm ("						\n"
+	"	li		%r9,  15		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvsl		%v9,  0,    %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v11, %v13, %v1,  %v9	\n"
+	"	li		%r11, 15		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vperm		%v10, %v13, %v1,  %v9	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	lvx		%v12, 0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	vavgub		%v0,  %v11, %v10	\n"
+	"	add		%r9,  %r5,  %r5		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v12, %v0		\n"
+	/* main loop: two rows stored per pass */
+	"._L66:						\n"
+	"	li		%r11, 15		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	lvx		%v12, %r5,  %r3		\n"
+	"	vperm		%v11, %v13, %v1,  %v9	\n"
+	"	stvx		%v0,  0,    %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v11, %v10	\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	vavgub		%v0,  %v12, %v0		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	lvx		%v12, %r9,  %r3		\n"
+	"	vperm		%v10, %v13, %v1,  %v9	\n"
+	"	stvx		%v0,  %r5,  %r3		\n"
+	"	vavgub		%v0,  %v11, %v10	\n"
+	"	add		%r3,  %r3,  %r9		\n"
+	"	vavgub		%v0,  %v12, %v0		\n"
+	"	bdnz		._L66			\n"
+	/* epilogue: one more source row is loaded, the last two rows are stored */
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v12, %r5,  %r3		\n"
+	"	vperm		%v11, %v13, %v1,  %v9	\n"
+	"	stvx		%v0,  0,    %r3		\n"
+	"	vavgub		%v0,  %v11, %v10	\n"
+	"	vavgub		%v0,  %v12, %v0		\n"
+	"	stvx		%v0,  %r5,  %r3		\n"
+	 );
+}
+
+/*
+ * Hand-scheduled AltiVec assembly: vertical half-sample average, 8 bytes
+ * wide.  Each output row is the vavgub of two consecutive source rows
+ * (held alternately in %v12 and %v9, each assembled from the loads at
+ * %r4 + 0 and %r4 + 7), averaged again with the vector loaded from %r3
+ * and written back with two word stores (stvewx at offsets 0 and 4).
+ *
+ * The asm block has no operand or clobber list and names %r3-%r6 directly.
+ * NOTE(review): presumably these hold dest, ref, stride and height as
+ * passed in by the PowerPC calling convention -- confirm before changing
+ * the prototype or letting the compiler inline this function.
+ *
+ * Two permute vectors are built (%v7 from %r4, %v8 from %r4 + %r5) and
+ * used on alternating rows.  CTR is loaded with (%r6 >> 1) - 1; every
+ * loop pass stores two rows and the code after bdnz stores the final two.
+ */
+static void MC_avg_y_8_altivec (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{
+    asm ("						\n"
+	"	lvsl		%v12, 0,    %r4		\n"
+	"	lvsl		%v9,  %r5,  %r4		\n"
+	"	vmrghb		%v12, %v12, %v12	\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	vmrghb		%v9,  %v9,  %v9		\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	vpkuhum		%v7,  %v12, %v12	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vpkuhum		%v8,  %v9,  %v9		\n"
+	"	vperm		%v12, %v11, %v13, %v7	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v9,  %v11, %v13, %v8	\n"
+	"	lvx		%v10, 0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v0,  %v12, %v9		\n"
+	"	vavgub		%v1,  %v10, %v0		\n"
+	/* main loop: two rows stored per pass */
+	"._L71:						\n"
+	"	li		%r9,  7			\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v10, %r5,  %r3		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	vperm		%v12, %v11, %v13, %v7	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	"	vavgub		%v0,  %v12, %v9		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	li		%r9,  7			\n"
+	"	vavgub		%v1,  %v10, %v0		\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vperm		%v9,  %v11, %v13, %v8	\n"
+	"	lvx		%v10, %r5,  %r3		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	vavgub		%v0,  %v12, %v9		\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vavgub		%v1,  %v10, %v0		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	bdnz		._L71			\n"
+	/* epilogue: one more source row is loaded, the last two rows are stored */
+	"	li		%r9,  7			\n"
+	"	lvx		%v13, %r9,  %r4		\n"
+	"	lvx		%v11, 0,    %r4		\n"
+	"	lvx		%v10, %r5,  %r3		\n"
+	"	vperm		%v12, %v11, %v13, %v7	\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	li		%r9,  4			\n"
+	"	vavgub		%v0,  %v12, %v9		\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v1,  %v10, %v0		\n"
+	"	stvewx		%v1,  0,    %r3		\n"
+	"	stvewx		%v1,  %r9,  %r3		\n"
+	 );
+}
+
+/*
+ * Hand-scheduled AltiVec assembly: combined horizontal + vertical
+ * half-sample average, 16 bytes wide.  For every source row the two
+ * one-byte-apart views (permute vectors %v4 and %v3 = %v4 + 1, loads at
+ * %r4 + 0 and %r4 + 16) are reduced to a vavgub and a vxor.  Two
+ * consecutive rows are then merged as
+ *     vavgub(avgA, avgB) - ((1 & (xorA | xorB)) & (avgA ^ avgB))
+ * and the result is averaged with the vector loaded from %r3 before the
+ * full-vector stvx.
+ * NOTE(review): the subtracted term is presumably a rounding correction
+ * for the nested vavgub so the 4-tap result rounds like (a+b+c+d+2)>>2
+ * -- confirm against the reference C implementation.
+ *
+ * The asm block has no operand or clobber list and names %r3-%r6 directly.
+ * NOTE(review): presumably these hold dest, ref, stride and height as
+ * passed in by the PowerPC calling convention -- confirm before changing
+ * the prototype or letting the compiler inline this function.
+ *
+ * CTR is loaded with (%r6 >> 1) - 1; every loop pass stores two rows and
+ * the code after bdnz stores the final two.  %r11 (= 16) is set inside
+ * the loop and reused by the epilogue.
+ */
+static void MC_avg_xy_16_altivec (uint8_t * dest, uint8_t * ref,
+				  int stride, int height)
+{
+    asm ("						\n"
+	"	lvsl		%v4,  0,    %r4		\n"
+	"	vspltisb	%v2,  1			\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	vaddubm		%v3,  %v4,  %v2		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v10, %v13, %v1,  %v3	\n"
+	"	li		%r11, 16		\n"
+	"	vperm		%v11, %v13, %v1,  %v4	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v10, %v13, %v1,  %v3	\n"
+	"	lvx		%v6,  0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	vperm		%v11, %v13, %v1,  %v4	\n"
+	"	add		%r9,  %r5,  %r5		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vxor		%v5,  %v11, %v10	\n"
+	"	vavgub		%v7,  %v11, %v10	\n"
+	"	vor		%v1,  %v8,  %v5		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vand		%v1,  %v2,  %v1		\n"
+	"	vavgub		%v0,  %v9,  %v7		\n"
+	"	vand		%v1,  %v1,  %v13	\n"
+	"	vsububm		%v0,  %v0,  %v1		\n"
+	"	vavgub		%v12, %v6,  %v0		\n"
+	/* main loop: two rows stored per pass */
+	"._L76:						\n"
+	"	li		%r11, 16		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	lvx		%v6,  %r5,  %r3		\n"
+	"	stvx		%v12, 0,    %r3		\n"
+	"	vperm		%v10, %v13, %v1,  %v3	\n"
+	"	vperm		%v11, %v13, %v1,  %v4	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v10, %v13, %v1,  %v3	\n"
+	"	vavgub		%v12, %v9,  %v7		\n"
+	"	vperm		%v11, %v13, %v1,  %v4	\n"
+	"	vor		%v0,  %v8,  %v5		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vxor		%v5,  %v11, %v10	\n"
+	"	vand		%v0,  %v2,  %v0		\n"
+	"	vavgub		%v7,  %v11, %v10	\n"
+	"	vor		%v1,  %v8,  %v5		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vand		%v1,  %v2,  %v1		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vsububm		%v12, %v12, %v0		\n"
+	"	vand		%v1,  %v1,  %v13	\n"
+	"	vavgub		%v0,  %v9,  %v7		\n"
+	"	vavgub		%v12, %v6,  %v12	\n"
+	"	lvx		%v6,  %r9,  %r3		\n"
+	"	vsububm		%v0,  %v0,  %v1		\n"
+	"	stvx		%v12, %r5,  %r3		\n"
+	"	vavgub		%v12, %v6,  %v0		\n"
+	"	add		%r3,  %r3,  %r9		\n"
+	"	bdnz		._L76			\n"
+	/* epilogue: one more source row is loaded, the last two rows are stored */
+	"	lvx		%v1,  %r11, %r4		\n"
+	"	lvx		%v13, 0,    %r4		\n"
+	"	lvx		%v6,  %r5,  %r3		\n"
+	"	vperm		%v10, %v13, %v1,  %v3	\n"
+	"	stvx		%v12, 0,    %r3		\n"
+	"	vperm		%v11, %v13, %v1,  %v4	\n"
+	"	vxor		%v8,  %v11, %v10	\n"
+	"	vavgub		%v9,  %v11, %v10	\n"
+	"	vor		%v0,  %v8,  %v5		\n"
+	"	vxor		%v13, %v9,  %v7		\n"
+	"	vand		%v0,  %v2,  %v0		\n"
+	"	vavgub		%v1,  %v9,  %v7		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vsububm		%v1,  %v1,  %v0		\n"
+	"	vavgub		%v12, %v6,  %v1		\n"
+	"	stvx		%v12, %r5,  %r3		\n"
+	 );
+}
+
+/*
+ * Hand-scheduled AltiVec assembly: combined horizontal + vertical
+ * half-sample average, 8 bytes wide.  For every source row the two
+ * one-byte-apart views (loads at %r4 + 0 and %r4 + 16) are reduced to a
+ * vavgub and a vxor.  Two consecutive rows are then merged as
+ *     vavgub(avgA, avgB) - ((1 & (xorA | xorB)) & (avgA ^ avgB))
+ * and the result is averaged with the vector loaded from %r3 before being
+ * written with two word stores (stvewx at offsets 0 and 4).
+ * NOTE(review): the subtracted term is presumably a rounding correction
+ * for the nested vavgub so the 4-tap result rounds like (a+b+c+d+2)>>2
+ * -- confirm against the reference C implementation.
+ *
+ * The asm block has no operand or clobber list and names %r3-%r6 directly.
+ * NOTE(review): presumably these hold dest, ref, stride and height as
+ * passed in by the PowerPC calling convention -- confirm before changing
+ * the prototype or letting the compiler inline this function.
+ *
+ * Permute vectors: %v2 / %v18 (= %v2 + 1) are derived from %r4 and
+ * %v3 / %v17 (= %v3 + 1) from %r4 + %r5; the two pairs are used on
+ * alternating rows, and %v19 holds the splatted constant 1.  CTR is
+ * loaded with (%r6 >> 1) - 1; every loop pass stores two rows and the
+ * code after bdnz stores the final two.
+ */
+static void MC_avg_xy_8_altivec (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{
+    asm ("						\n"
+	"	lvsl		%v2,  0,    %r4		\n"
+	"	vspltisb	%v19, 1			\n"
+	"	lvsl		%v3,  %r5,  %r4		\n"
+	"	vmrghb		%v2,  %v2,  %v2		\n"
+	"	li		%r9,  16		\n"
+	"	vmrghb		%v3,  %v3,  %v3		\n"
+	"	lvx		%v9,  0,    %r4		\n"
+	"	vpkuhum		%v2,  %v2,  %v2		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	vpkuhum		%v3,  %v3,  %v3		\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vaddubm		%v18, %v2,  %v19	\n"
+	"	vperm		%v11, %v9,  %v1,  %v2	\n"
+	"	srawi		%r6,  %r6,  1		\n"
+	"	vaddubm		%v17, %v3,  %v19	\n"
+	"	addi		%r6,  %r6,  -1		\n"
+	"	vperm		%v10, %v9,  %v1,  %v18	\n"
+	"	lvx		%v4,  0,    %r3		\n"
+	"	mtctr		%r6			\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v9,  0,    %r4		\n"
+	"	vavgub		%v8,  %v11, %v10	\n"
+	"	vxor		%v7,  %v11, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vperm		%v10, %v9,  %v1,  %v17	\n"
+	"	vperm		%v11, %v9,  %v1,  %v3	\n"
+	"	vxor		%v5,  %v11, %v10	\n"
+	"	vavgub		%v6,  %v11, %v10	\n"
+	"	vor		%v1,  %v7,  %v5		\n"
+	"	vxor		%v13, %v8,  %v6		\n"
+	"	vand		%v1,  %v19, %v1		\n"
+	"	vavgub		%v0,  %v8,  %v6		\n"
+	"	vand		%v1,  %v1,  %v13	\n"
+	"	vsububm		%v0,  %v0,  %v1		\n"
+	"	vavgub		%v13, %v4,  %v0		\n"
+	/* main loop: two rows stored per pass */
+	"._L81:						\n"
+	"	li		%r9,  16		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v9,  0,    %r4		\n"
+	"	lvx		%v4,  %r5,  %r3		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	vperm		%v10, %v9,  %v1,  %v18	\n"
+	"	vperm		%v11, %v9,  %v1,  %v2	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	"	vxor		%v7,  %v11, %v10	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	li		%r9,  16		\n"
+	"	vavgub		%v8,  %v11, %v10	\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	vor		%v0,  %v7,  %v5		\n"
+	"	lvx		%v9,  0,    %r4		\n"
+	"	vxor		%v12, %v8,  %v6		\n"
+	"	vand		%v0,  %v19, %v0		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vperm		%v10, %v9,  %v1,  %v17	\n"
+	"	vavgub		%v13, %v8,  %v6		\n"
+	"	li		%r9,  4			\n"
+	"	vperm		%v11, %v9,  %v1,  %v3	\n"
+	"	vand		%v0,  %v0,  %v12	\n"
+	"	add		%r4,  %r4,  %r5		\n"
+	"	vxor		%v5,  %v11, %v10	\n"
+	"	vavgub		%v6,  %v11, %v10	\n"
+	"	vor		%v1,  %v7,  %v5		\n"
+	"	vsububm		%v13, %v13, %v0		\n"
+	"	vxor		%v0,  %v8,  %v6		\n"
+	"	vand		%v1,  %v19, %v1		\n"
+	"	vavgub		%v13, %v4,  %v13	\n"
+	"	vand		%v1,  %v1,  %v0		\n"
+	"	lvx		%v4,  %r5,  %r3		\n"
+	"	vavgub		%v0,  %v8,  %v6		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	"	vsububm		%v0,  %v0,  %v1		\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v13, %v4,  %v0		\n"
+	"	bdnz		._L81			\n"
+	/* epilogue: one more source row is loaded, the last two rows are stored */
+	"	li		%r9,  16		\n"
+	"	lvx		%v1,  %r9,  %r4		\n"
+	"	lvx		%v9,  0,    %r4		\n"
+	"	lvx		%v4,  %r5,  %r3		\n"
+	"	vperm		%v10, %v9,  %v1,  %v18	\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	vperm		%v11, %v9,  %v1,  %v2	\n"
+	"	li		%r9,  4			\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	"	vxor		%v7,  %v11, %v10	\n"
+	"	add		%r3,  %r3,  %r5		\n"
+	"	vavgub		%v8,  %v11, %v10	\n"
+	"	vor		%v0,  %v7,  %v5		\n"
+	"	vxor		%v13, %v8,  %v6		\n"
+	"	vand		%v0,  %v19, %v0		\n"
+	"	vavgub		%v1,  %v8,  %v6		\n"
+	"	vand		%v0,  %v0,  %v13	\n"
+	"	vsububm		%v1,  %v1,  %v0		\n"
+	"	vavgub		%v13, %v4,  %v1		\n"
+	"	stvewx		%v13, 0,    %r3		\n"
+	"	stvewx		%v13, %r9,  %r3		\n"
+	 );
+}
+
+/*
+ * NOTE(review): MPEG2_MC_EXTERN is defined outside this part of the file;
+ * presumably it expands to the exported table that collects the static
+ * MC_*_altivec functions above -- confirm against its definition.
+ */
+MPEG2_MC_EXTERN (altivec)
+
+#endif	/* ARCH_PPC */
+
+#else	/* HOST_OS_DARWIN */
+
+#ifdef ENABLE_ALTIVEC
+
+#include "mpeg2_internal.h"
+
+/*
+ * Short aliases for the AltiVec vector types used by the intrinsics code
+ * that follows.  They are macros, not typedefs, so each name expands to the
+ * literal "vector <element type>" spelling at its point of use.
+ */
+#define vector_s16_t vector signed short
+#define vector_u16_t vector unsigned short
+#define vector_s8_t vector signed char
+#define vector_u8_t vector unsigned char
+#define vector_s32_t vector signed int
+#define vector_u32_t vector unsigned int
+
+/*
+ * Copy a 16-byte-wide block of rows from ref to dest, no interpolation.
+ *
+ * Each source row is assembled from two vec_ld loads (offsets 0 and 15)
+ * merged through the vec_lvsl shuffle computed from the initial ref; each
+ * destination row is written with one whole-vector vec_st.
+ *
+ * The loop is software-pipelined: a row is loaded one step before it is
+ * stored.  Two rows are stored per pass and two more after the loop, with
+ * a do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_put_o_16_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t shuf, lo, hi, row;
+    int pairs;
+
+    shuf = vec_lvsl (0, ref);
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline: row 0 is assembled before anything is stored */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    ref += stride;
+    row = vec_perm (lo, hi, shuf);
+
+    do {
+	/* first row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (15, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	row = vec_perm (lo, hi, shuf);
+
+	/* second row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (15, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	row = vec_perm (lo, hi, shuf);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    vec_st (row, 0, dest);
+    row = vec_perm (lo, hi, shuf);
+    vec_st (row, stride, dest);
+}
+
+/*
+ * Copy an 8-byte-wide block of rows from ref to dest, no interpolation.
+ *
+ * Each source row is assembled from two vec_ld loads (offsets 0 and 7).
+ * Two shuffles are prepared: one derived from ref (used for rows 0, 2, ...)
+ * and one derived from ref + stride (used for rows 1, 3, ...).  Each is
+ * built as lvsl -> mergeh -> pack.  NOTE(review): that sequence presumably
+ * replicates the 8 wanted bytes into both halves of the vector so the two
+ * vec_ste word stores work for either 8-byte position of dest -- confirm.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_put_o_8_altivec (unsigned char * dest, unsigned char * ref,
+			 int stride, int height)
+{
+    vector_u8_t shuf_even, shuf_odd, lo, hi, row;
+    int pairs;
+
+    /* "row" doubles as scratch while the two shuffles are being built */
+    row = vec_lvsl (0, ref);
+    row = vec_mergeh (row, row);
+    shuf_even = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+    row = vec_lvsl (stride, ref);
+    row = vec_mergeh (row, row);
+    shuf_odd = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    ref += stride;
+    row = vec_perm (lo, hi, shuf_even);
+
+    do {
+	/* store an even row, assemble the following odd row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	row = vec_perm (lo, hi, shuf_odd);
+
+	/* store that odd row, assemble the following even row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	row = vec_perm (lo, hi, shuf_even);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+    dest += stride;
+    row = vec_perm (lo, hi, shuf_odd);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+}
+
+/*
+ * Horizontal half-sample interpolation, 16 bytes wide: every output byte
+ * is vec_avg of a source byte and its right-hand neighbour.
+ *
+ * The two views of a row come from the same pair of vec_ld loads
+ * (offsets 0 and 16), extracted with the vec_lvsl shuffle and with that
+ * shuffle plus one.  Rows are written with whole-vector vec_st stores.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_put_x_16_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t shuf_left, shuf_right, lo, hi, left, right, row;
+    int pairs;
+
+    shuf_left = vec_lvsl (0, ref);
+    shuf_right = vec_add (shuf_left, vec_splat_u8 (1));
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    ref += stride;
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    row = vec_avg (left, right);
+
+    do {
+	/* first row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	left = vec_perm (lo, hi, shuf_left);
+	right = vec_perm (lo, hi, shuf_right);
+	row = vec_avg (left, right);
+
+	/* second row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	left = vec_perm (lo, hi, shuf_left);
+	right = vec_perm (lo, hi, shuf_right);
+	row = vec_avg (left, right);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    vec_st (row, 0, dest);
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    row = vec_avg (left, right);
+    vec_st (row, stride, dest);
+}
+
+/*
+ * Horizontal half-sample interpolation, 8 bytes wide: every output byte
+ * is vec_avg of a source byte and its right-hand neighbour.
+ *
+ * Source rows are read with two vec_ld loads (offsets 0 and 8).  Separate
+ * shuffle pairs are prepared for even rows (derived from ref) and odd rows
+ * (derived from ref + stride); within each pair the second shuffle is the
+ * first plus one.  Rows are written with two vec_ste word stores.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_put_x_8_altivec (unsigned char * dest, unsigned char * ref,
+			 int stride, int height)
+{
+    vector_u8_t even_left, even_right, odd_left, odd_right;
+    vector_u8_t one, lo, hi, left, right, row;
+    int pairs;
+
+    one = vec_splat_u8 (1);
+
+    /* "row" doubles as scratch while the shuffles are being built */
+    row = vec_lvsl (0, ref);
+    row = vec_mergeh (row, row);
+    even_left = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+    even_right = vec_add (even_left, one);
+    row = vec_lvsl (stride, ref);
+    row = vec_mergeh (row, row);
+    odd_left = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+    odd_right = vec_add (odd_left, one);
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (8, ref);
+    ref += stride;
+    left = vec_perm (lo, hi, even_left);
+    right = vec_perm (lo, hi, even_right);
+    row = vec_avg (left, right);
+
+    do {
+	/* store an even row, compute the following odd row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (8, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	left = vec_perm (lo, hi, odd_left);
+	right = vec_perm (lo, hi, odd_right);
+	row = vec_avg (left, right);
+
+	/* store that odd row, compute the following even row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (8, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	left = vec_perm (lo, hi, even_left);
+	right = vec_perm (lo, hi, even_right);
+	row = vec_avg (left, right);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (8, ref);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+    dest += stride;
+    left = vec_perm (lo, hi, odd_left);
+    right = vec_perm (lo, hi, odd_right);
+    row = vec_avg (left, right);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+}
+
+/*
+ * Vertical half-sample interpolation, 16 bytes wide: every output row is
+ * vec_avg of two consecutive source rows.
+ *
+ * Source rows are assembled from two vec_ld loads (offsets 0 and 15) and
+ * the vec_lvsl shuffle of the initial ref.  Even-numbered source rows are
+ * kept in "even", odd-numbered ones in "odd", so each new row only needs
+ * one fresh load pair.  Rows are written with whole-vector vec_st stores.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4; height + 1 source rows are read.
+ */
+void MC_put_y_16_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t shuf, lo, hi, even, odd, row;
+    int pairs;
+
+    shuf = vec_lvsl (0, ref);
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with source rows 0 and 1 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    ref += stride;
+    even = vec_perm (lo, hi, shuf);
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    ref += stride;
+    odd = vec_perm (lo, hi, shuf);
+    row = vec_avg (even, odd);
+
+    do {
+	/* bring in the next even source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (15, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	even = vec_perm (lo, hi, shuf);
+	row = vec_avg (even, odd);
+
+	/* bring in the next odd source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (15, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	odd = vec_perm (lo, hi, shuf);
+	row = vec_avg (even, odd);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    vec_st (row, 0, dest);
+    even = vec_perm (lo, hi, shuf);
+    row = vec_avg (even, odd);
+    vec_st (row, stride, dest);
+}
+
+/*
+ * Vertical half-sample interpolation, 8 bytes wide: every output row is
+ * vec_avg of two consecutive source rows.
+ *
+ * Source rows are read with two vec_ld loads (offsets 0 and 7).  Even
+ * source rows use a shuffle derived from ref, odd source rows one derived
+ * from ref + stride; the most recent row of each parity is kept in "even"
+ * and "odd".  Rows are written with two vec_ste word stores.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4; height + 1 source rows are read.
+ */
+void MC_put_y_8_altivec (unsigned char * dest, unsigned char * ref,
+			 int stride, int height)
+{
+    vector_u8_t shuf_even, shuf_odd, even, odd, row, lo, hi;
+    int pairs;
+
+    /* "even" / "odd" double as scratch while the shuffles are being built */
+    even = vec_lvsl (0, ref);
+    even = vec_mergeh (even, even);
+    shuf_even = vec_pack ((vector_u16_t)even, (vector_u16_t)even);
+    odd = vec_lvsl (stride, ref);
+    odd = vec_mergeh (odd, odd);
+    shuf_odd = vec_pack ((vector_u16_t)odd, (vector_u16_t)odd);
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with source rows 0 and 1 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    ref += stride;
+    even = vec_perm (lo, hi, shuf_even);
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    ref += stride;
+    odd = vec_perm (lo, hi, shuf_odd);
+    row = vec_avg (even, odd);
+
+    do {
+	/* bring in the next even source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	even = vec_perm (lo, hi, shuf_even);
+	row = vec_avg (even, odd);
+
+	/* bring in the next odd source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	odd = vec_perm (lo, hi, shuf_odd);
+	row = vec_avg (even, odd);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+    dest += stride;
+    even = vec_perm (lo, hi, shuf_even);
+    row = vec_avg (even, odd);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+}
+
+/*
+ * Combined horizontal + vertical half-sample interpolation, 16 bytes wide.
+ *
+ * For every source row the byte/right-neighbour pair is reduced to a
+ * vec_avg ("havg") and a vec_xor ("hxor"); the values of the latest even
+ * and odd source rows are kept.  An output row is then
+ *     vec_avg (havg_even, havg_odd)
+ *       - ((1 & (hxor_even | hxor_odd)) & (havg_even ^ havg_odd))
+ * NOTE(review): the subtracted term is presumably the correction that
+ * turns the doubly rounded nested average into (a+b+c+d+2)>>2 -- confirm.
+ *
+ * Source rows are read with vec_ld at offsets 0 and 16; output rows are
+ * written with whole-vector vec_st stores.  Two rows are stored per pass
+ * and two more after the loop, with a do/while counter of
+ * (height >> 1) - 1, so the code as written expects an even height of at
+ * least 4; height + 1 source rows are read.
+ */
+void MC_put_xy_16_altivec (unsigned char * dest, unsigned char * ref,
+			   int stride, int height)
+{
+    vector_u8_t shuf_left, shuf_right, lo, hi, left, right;
+    vector_u8_t havg_even, havg_odd, hxor_even, hxor_odd, fix, row;
+    vector_u8_t one;
+    int pairs;
+
+    one = vec_splat_u8 (1);
+    shuf_left = vec_lvsl (0, ref);
+    shuf_right = vec_add (shuf_left, one);
+
+    pairs = (height >> 1) - 1;
+
+    /* source row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    ref += stride;
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    havg_even = vec_avg (left, right);
+    hxor_even = vec_xor (left, right);
+
+    /* source row 1, giving output row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    ref += stride;
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    havg_odd = vec_avg (left, right);
+    hxor_odd = vec_xor (left, right);
+    fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		   vec_xor (havg_even, havg_odd));
+    row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+
+    do {
+	/* next even source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	left = vec_perm (lo, hi, shuf_left);
+	right = vec_perm (lo, hi, shuf_right);
+	havg_even = vec_avg (left, right);
+	hxor_even = vec_xor (left, right);
+	fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		       vec_xor (havg_even, havg_odd));
+	row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+
+	/* next odd source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	vec_st (row, 0, dest);
+	dest += stride;
+	left = vec_perm (lo, hi, shuf_left);
+	right = vec_perm (lo, hi, shuf_right);
+	havg_odd = vec_avg (left, right);
+	hxor_odd = vec_xor (left, right);
+	fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		       vec_xor (havg_even, havg_odd));
+	row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    vec_st (row, 0, dest);
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    havg_even = vec_avg (left, right);
+    hxor_even = vec_xor (left, right);
+    fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		   vec_xor (havg_even, havg_odd));
+    row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+    vec_st (row, stride, dest);
+}
+
+/*
+ * Combined horizontal + vertical half-sample interpolation, 8 bytes wide.
+ *
+ * For every source row the byte/right-neighbour pair is reduced to a
+ * vec_avg ("havg") and a vec_xor ("hxor"); the values of the latest even
+ * and odd source rows are kept.  An output row is then
+ *     vec_avg (havg_even, havg_odd)
+ *       - ((1 & (hxor_even | hxor_odd)) & (havg_even ^ havg_odd))
+ * NOTE(review): the subtracted term is presumably the correction that
+ * turns the doubly rounded nested average into (a+b+c+d+2)>>2 -- confirm.
+ *
+ * Source rows are read with vec_ld at offsets 0 and 16.  Even source rows
+ * use shuffles derived from ref, odd source rows shuffles derived from
+ * ref + stride.  Output rows are written with two vec_ste word stores.
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4; height + 1 source rows are read.
+ */
+void MC_put_xy_8_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t even_left, even_right, odd_left, odd_right;
+    vector_u8_t lo, hi, left, right;
+    vector_u8_t havg_even, havg_odd, hxor_even, hxor_odd, fix, row, one;
+    int pairs;
+
+    one = vec_splat_u8 (1);
+    even_left = vec_lvsl (0, ref);
+    even_left = vec_mergeh (even_left, even_left);
+    even_left = vec_pack ((vector_u16_t)even_left, (vector_u16_t)even_left);
+    even_right = vec_add (even_left, one);
+    odd_left = vec_lvsl (stride, ref);
+    odd_left = vec_mergeh (odd_left, odd_left);
+    odd_left = vec_pack ((vector_u16_t)odd_left, (vector_u16_t)odd_left);
+    odd_right = vec_add (odd_left, one);
+
+    pairs = (height >> 1) - 1;
+
+    /* source row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    ref += stride;
+    left = vec_perm (lo, hi, even_left);
+    right = vec_perm (lo, hi, even_right);
+    havg_even = vec_avg (left, right);
+    hxor_even = vec_xor (left, right);
+
+    /* source row 1, giving output row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    ref += stride;
+    left = vec_perm (lo, hi, odd_left);
+    right = vec_perm (lo, hi, odd_right);
+    havg_odd = vec_avg (left, right);
+    hxor_odd = vec_xor (left, right);
+    fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		   vec_xor (havg_even, havg_odd));
+    row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+
+    do {
+	/* next even source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	left = vec_perm (lo, hi, even_left);
+	right = vec_perm (lo, hi, even_right);
+	havg_even = vec_avg (left, right);
+	hxor_even = vec_xor (left, right);
+	fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		       vec_xor (havg_even, havg_odd));
+	row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+
+	/* next odd source row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	left = vec_perm (lo, hi, odd_left);
+	right = vec_perm (lo, hi, odd_right);
+	havg_odd = vec_avg (left, right);
+	hxor_odd = vec_xor (left, right);
+	fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		       vec_xor (havg_even, havg_odd));
+	row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+    dest += stride;
+    left = vec_perm (lo, hi, even_left);
+    right = vec_perm (lo, hi, even_right);
+    havg_even = vec_avg (left, right);
+    hxor_even = vec_xor (left, right);
+    fix = vec_and (vec_and (one, vec_or (hxor_even, hxor_odd)),
+		   vec_xor (havg_even, havg_odd));
+    row = vec_sub (vec_avg (havg_even, havg_odd), fix);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+}
+
+/*
+ * Compiled out by the "#if 0" below: an alternative MC_put_xy_8_altivec
+ * that widens the four neighbouring samples (A, B from one row and C, D
+ * from the next) to 16 bits with vec_mergeh (zero, x), sums them, adds 2
+ * and shifts right by 2 before packing back to bytes.
+ *
+ * NOTE(review): as written this looks unfinished -- it stores a whole
+ * 16-byte vector with vec_st, runs one row per pass for "height" passes,
+ * and the vec_avg assigned to tmp at the end of the loop body is
+ * overwritten by vec_pack on the next pass without being used.
+ */
+#if 0
+void MC_put_xy_8_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t permA, permB, ref0, ref1, A, B, C, D, tmp, zero, ones;
+    vector_u16_t splat2, temp;
+
+    ones = vec_splat_u8 (1);
+    permA = vec_lvsl (0, ref);
+    permB = vec_add (permA, ones);
+
+    zero = vec_splat_u8 (0);
+    splat2 = vec_splat_u16 (2);
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	C = vec_perm (ref0, ref1, permA);
+	D = vec_perm (ref0, ref1, permB);
+
+	temp = vec_add (vec_add ((vector_u16_t)vec_mergeh (zero, A),
+				(vector_u16_t)vec_mergeh (zero, B)),
+		       vec_add ((vector_u16_t)vec_mergeh (zero, C),
+				(vector_u16_t)vec_mergeh (zero, D)));
+	temp = vec_sr (vec_add (temp, splat2), splat2);
+	tmp = vec_pack (temp, temp);
+
+	vec_st (tmp, 0, dest);
+	dest += stride;
+	tmp = vec_avg (vec_perm (ref0, ref1, permA),
+		       vec_perm (ref0, ref1, permB));
+    } while (--height);
+}
+#endif
+
+/*
+ * Average a 16-byte-wide block read from ref (no interpolation) into the
+ * data already present at dest: each output row is vec_avg of the old
+ * dest row and the source row.
+ *
+ * Source rows are assembled from two vec_ld loads (offsets 0 and 15) and
+ * the vec_lvsl shuffle of the initial ref.  The old contents of the next
+ * dest row are always loaded before the current row is stored.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_avg_o_16_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t shuf, lo, hi, old, row;
+    int pairs;
+
+    shuf = vec_lvsl (0, ref);
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    ref += stride;
+    old = vec_ld (0, dest);
+    row = vec_avg (old, vec_perm (lo, hi, shuf));
+
+    do {
+	/* first row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (15, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_st (row, 0, dest);
+	dest += stride;
+	row = vec_avg (old, vec_perm (lo, hi, shuf));
+
+	/* second row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (15, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_st (row, 0, dest);
+	dest += stride;
+	row = vec_avg (old, vec_perm (lo, hi, shuf));
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (15, ref);
+    old = vec_ld (stride, dest);
+    vec_st (row, 0, dest);
+    row = vec_avg (old, vec_perm (lo, hi, shuf));
+    vec_st (row, stride, dest);
+}
+
+/*
+ * Average an 8-byte-wide block read from ref (no interpolation) into the
+ * data already present at dest: each output row is vec_avg of the old
+ * dest vector and the source row.
+ *
+ * Source rows are read with two vec_ld loads (offsets 0 and 7).  Even
+ * rows use a shuffle derived from ref, odd rows one derived from
+ * ref + stride.  The old contents of the next dest row are always loaded
+ * before the current row is written with two vec_ste word stores.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_avg_o_8_altivec (unsigned char * dest, unsigned char * ref,
+			 int stride, int height)
+{
+    vector_u8_t shuf_even, shuf_odd, lo, hi, old, row;
+    int pairs;
+
+    /* "row" doubles as scratch while the two shuffles are being built */
+    row = vec_lvsl (0, ref);
+    row = vec_mergeh (row, row);
+    shuf_even = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+    row = vec_lvsl (stride, ref);
+    row = vec_mergeh (row, row);
+    shuf_odd = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    ref += stride;
+    old = vec_ld (0, dest);
+    row = vec_avg (old, vec_perm (lo, hi, shuf_even));
+
+    do {
+	/* store an even row, compute the following odd row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (7, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	row = vec_avg (old, vec_perm (lo, hi, shuf_odd));
+
+	/* store that odd row, compute the following even row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (7, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	row = vec_avg (old, vec_perm (lo, hi, shuf_even));
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (7, ref);
+    old = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+    dest += stride;
+    row = vec_avg (old, vec_perm (lo, hi, shuf_odd));
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+}
+
+/*
+ * Horizontal half-sample interpolation, 16 bytes wide, averaged into the
+ * data already present at dest: each output row is vec_avg of the old
+ * dest row and the vec_avg of a source byte with its right neighbour.
+ *
+ * The two views of a source row come from the same pair of vec_ld loads
+ * (offsets 0 and 16), extracted with the vec_lvsl shuffle and with that
+ * shuffle plus one.  The old contents of the next dest row are always
+ * loaded before the current row is stored.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_avg_x_16_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t shuf_left, shuf_right, lo, hi, left, right, old, row;
+    int pairs;
+
+    shuf_left = vec_lvsl (0, ref);
+    shuf_right = vec_add (shuf_left, vec_splat_u8 (1));
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    old = vec_ld (0, dest);
+    ref += stride;
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    row = vec_avg (old, vec_avg (left, right));
+
+    do {
+	/* first row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_st (row, 0, dest);
+	dest += stride;
+	left = vec_perm (lo, hi, shuf_left);
+	right = vec_perm (lo, hi, shuf_right);
+	row = vec_avg (old, vec_avg (left, right));
+
+	/* second row of the pair */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (16, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_st (row, 0, dest);
+	dest += stride;
+	left = vec_perm (lo, hi, shuf_left);
+	right = vec_perm (lo, hi, shuf_right);
+	row = vec_avg (old, vec_avg (left, right));
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (16, ref);
+    old = vec_ld (stride, dest);
+    vec_st (row, 0, dest);
+    left = vec_perm (lo, hi, shuf_left);
+    right = vec_perm (lo, hi, shuf_right);
+    row = vec_avg (old, vec_avg (left, right));
+    vec_st (row, stride, dest);
+}
+
+/*
+ * Horizontal half-sample interpolation, 8 bytes wide, averaged into the
+ * data already present at dest: each output row is vec_avg of the old
+ * dest vector and the vec_avg of a source byte with its right neighbour.
+ *
+ * Source rows are read with two vec_ld loads (offsets 0 and 8).  Separate
+ * shuffle pairs are prepared for even rows (derived from ref) and odd rows
+ * (derived from ref + stride); within each pair the second shuffle is the
+ * first plus one.  The old contents of the next dest row are always
+ * loaded before the current row is written with two vec_ste word stores.
+ *
+ * Two rows are stored per pass and two more after the loop, with a
+ * do/while counter of (height >> 1) - 1, so the code as written expects
+ * an even height of at least 4.
+ */
+void MC_avg_x_8_altivec (unsigned char * dest, unsigned char * ref,
+			 int stride, int height)
+{
+    vector_u8_t even_left, even_right, odd_left, odd_right;
+    vector_u8_t one, lo, hi, left, right, old, row;
+    int pairs;
+
+    one = vec_splat_u8 (1);
+
+    /* "row" doubles as scratch while the shuffles are being built */
+    row = vec_lvsl (0, ref);
+    row = vec_mergeh (row, row);
+    even_left = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+    even_right = vec_add (even_left, one);
+    row = vec_lvsl (stride, ref);
+    row = vec_mergeh (row, row);
+    odd_left = vec_pack ((vector_u16_t)row, (vector_u16_t)row);
+    odd_right = vec_add (odd_left, one);
+
+    pairs = (height >> 1) - 1;
+
+    /* prime the pipeline with row 0 */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (8, ref);
+    old = vec_ld (0, dest);
+    ref += stride;
+    left = vec_perm (lo, hi, even_left);
+    right = vec_perm (lo, hi, even_right);
+    row = vec_avg (old, vec_avg (left, right));
+
+    do {
+	/* store an even row, compute the following odd row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (8, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	left = vec_perm (lo, hi, odd_left);
+	right = vec_perm (lo, hi, odd_right);
+	row = vec_avg (old, vec_avg (left, right));
+
+	/* store that odd row, compute the following even row */
+	lo = vec_ld (0, ref);
+	hi = vec_ld (8, ref);
+	ref += stride;
+	old = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+	dest += stride;
+	left = vec_perm (lo, hi, even_left);
+	right = vec_perm (lo, hi, even_right);
+	row = vec_avg (old, vec_avg (left, right));
+    } while (--pairs != 0);
+
+    /* drain: load the last source row, store the last two output rows */
+    lo = vec_ld (0, ref);
+    hi = vec_ld (8, ref);
+    old = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+    dest += stride;
+    left = vec_perm (lo, hi, odd_left);
+    right = vec_perm (lo, hi, odd_right);
+    row = vec_avg (old, vec_avg (left, right));
+    vec_ste ((vector_u32_t)row, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)row, 4, (unsigned int *)dest);
+}
+/* dest = avg (dest, avg (ref row y, ref row y+1)), 16 bytes per row: y half-pel prediction averaged into dest. */
+void MC_avg_y_16_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp, prev;
+    /* One permute is reused for every row -- assumes stride keeps ref's 16-byte offset (TODO confirm). */
+    perm = vec_lvsl (0, ref);
+
+    height = (height >> 1) - 1;	/* two rows per loop pass; count must end up >= 1, i.e. height >= 4 */
+    /* Prologue: realign ref rows 0 and 1 and compute (not yet store) output row 0. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);	/* quadword holding the last of the 16 wanted bytes */
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm);
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    tmp1 = vec_perm (ref0, ref1, perm);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    /* tmp0 and tmp1 alternate as "older row" / "newer row" so each ref row is loaded once. */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* next dest row, read before the store below */
+	vec_st (tmp, 0, dest);
+	tmp0 = vec_perm (ref0, ref1, perm);
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	prev = vec_ld (2*stride, dest);
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp1 = vec_perm (ref0, ref1, perm);
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    } while (--height);
+    /* Epilogue: store the pending row, then compute and store the last row. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    prev = vec_ld (stride, dest);
+    vec_st (tmp, 0, dest);
+    tmp0 = vec_perm (ref0, ref1, perm);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    vec_st (tmp, stride, dest);
+}
+/* dest = avg (dest, avg (ref row y, ref row y+1)), 8 bytes per row: y half-pel prediction averaged into dest. */
+void MC_avg_y_8_altivec (unsigned char * dest, unsigned char * ref,
+			 int stride, int height)
+{
+    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1, prev;
+    /* perm0 is built from ref (even rows), perm1 from ref+stride (odd rows). */
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);
+    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);	/* first 8 lvsl indices, repeated in both halves */
+    tmp1 = vec_lvsl (stride, ref);
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
+
+    height = (height >> 1) - 1;	/* two rows per loop pass; count must end up >= 1, i.e. height >= 4 */
+    /* Prologue: realign ref rows 0 and 1 and compute (not yet store) output row 0. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);	/* quadword holding the last of the 8 wanted bytes */
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    tmp1 = vec_perm (ref0, ref1, perm1);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    /* tmp0 and tmp1 alternate as "older row" / "newer row" so each ref row is loaded once. */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* next dest row, read before this row is stored */
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	/* two 32-bit element stores */
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);	/* write the 8 result bytes */
+	dest += stride;
+	tmp0 = vec_perm (ref0, ref1, perm0);
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_perm (ref0, ref1, perm1);
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    } while (--height);
+    /* Epilogue: store the pending row, then compute and store the last row. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    prev = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+}
+/* dest = avg (dest, 4-tap average of ref[x], ref[x+1] on rows y and y+1), 16 bytes per row. */
+void MC_avg_xy_16_altivec (unsigned char * dest, unsigned char * ref,
+			   int stride, int height)
+{
+    vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
+    vector_u8_t ones, prev;
+    /* permA realigns ref[x], permB ref[x+1]; both are reused for every row. */
+    ones = vec_splat_u8 (1);
+    permA = vec_lvsl (0, ref);
+    permB = vec_add (permA, ones);
+
+    height = (height >> 1) - 1;	/* two rows per loop pass; count must end up >= 1, i.e. height >= 4 */
+    /* Prologue, ref row 0: horizontal average and xor of the two taps. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    /* Prologue, ref row 1, then output row 0 (computed, not yet stored). */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg1 = vec_avg (A, B);
+    xor1 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					   vec_xor (avg0, avg1))));
+    /* vec_avg rounds up at both stages; the subtracted bit undoes the excess so the */
+    /* 4-tap value equals (a + b + c + d + 2) >> 2.  avg0/xor0 and avg1/xor1 alternate rows. */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* next dest row, read before the store below */
+	vec_st (tmp, 0, dest);
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	avg0 = vec_avg (A, B);
+	xor0 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+	/* Second half of the pass: the new row goes into avg1/xor1. */
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (2*stride, dest);
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	avg1 = vec_avg (A, B);
+	xor1 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+    } while (--height);
+    /* Epilogue: store the pending row, then compute and store the last row. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    prev = vec_ld (stride, dest);
+    vec_st (tmp, 0, dest);
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					   vec_xor (avg0, avg1))));
+    vec_st (tmp, stride, dest);
+}
+/* dest = avg (dest, 4-tap average of ref[x], ref[x+1] on rows y and y+1), 8 bytes per row. */
+void MC_avg_xy_8_altivec (unsigned char * dest, unsigned char * ref,
+			  int stride, int height)
+{
+    vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
+    vector_u8_t avg0, avg1, xor0, xor1, tmp, ones, prev;
+    /* perm0x are built from ref (even rows), perm1x from ref+stride (odd rows). */
+    ones = vec_splat_u8 (1);
+    perm0A = vec_lvsl (0, ref);
+    perm0A = vec_mergeh (perm0A, perm0A);
+    perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);	/* first 8 lvsl indices, in both halves */
+    perm0B = vec_add (perm0A, ones);	/* same selection one byte further along: ref[x+1] */
+    perm1A = vec_lvsl (stride, ref);
+    perm1A = vec_mergeh (perm1A, perm1A);
+    perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
+    perm1B = vec_add (perm1A, ones);
+
+    height = (height >> 1) - 1;	/* two rows per loop pass; count must end up >= 1, i.e. height >= 4 */
+    /* NOTE(review): MC_avg_x_8_altivec loads at +8 here, this routine at +16 -- confirm the wider read is intended. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    A = vec_perm (ref0, ref1, perm0A);
+    B = vec_perm (ref0, ref1, perm0B);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    /* Prologue, ref row 1, then output row 0 (computed, not yet stored). */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    A = vec_perm (ref0, ref1, perm1A);
+    B = vec_perm (ref0, ref1, perm1B);
+    avg1 = vec_avg (A, B);
+    xor1 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					   vec_xor (avg0, avg1))));
+    /* vec_avg rounds up at both stages; the subtracted bit undoes the excess so the */
+    /* 4-tap value equals (a + b + c + d + 2) >> 2.  avg0/xor0 and avg1/xor1 alternate rows. */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* next dest row, read before this row is stored */
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	/* two 32-bit element stores */
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);	/* write the 8 result bytes */
+	dest += stride;
+	A = vec_perm (ref0, ref1, perm0A);
+	B = vec_perm (ref0, ref1, perm0B);
+	avg0 = vec_avg (A, B);
+	xor0 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+	/* Second half of the pass: odd-row permutes, new row goes into avg1/xor1. */
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	A = vec_perm (ref0, ref1, perm1A);
+	B = vec_perm (ref0, ref1, perm1B);
+	avg1 = vec_avg (A, B);
+	xor1 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+    } while (--height);
+    /* Epilogue: store the pending row, then compute and store the last row. */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    prev = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    dest += stride;
+    A = vec_perm (ref0, ref1, perm0A);
+    B = vec_perm (ref0, ref1, perm0B);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					   vec_xor (avg0, avg1))));
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+}
+
+MPEG2_MC_EXTERN (altivec)	/* NOTE(review): macro defined outside this chunk; presumably publishes the MC_*_altivec routines -- confirm */
+
+#endif /* ENABLE_ALTIVEC */
+
+#endif	/* HOST_OS_DARWIN */
+
diff --git a/src/video_dec/libmpeg2/motion_comp_mlib.c b/src/video_dec/libmpeg2/motion_comp_mlib.c
new file mode 100644
index 000000000..1a37070ae
--- /dev/null
+++ b/src/video_dec/libmpeg2/motion_comp_mlib.c
@@ -0,0 +1,181 @@
+/*
+ * motion_comp_mlib.c
+ * Copyright (C) 2000-2002 Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef LIBMPEG2_MLIB
+
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+/* put_o, 16 wide: hands off to mlib_VideoCopyRef (16x16 if height == 16, otherwise 16x8). */
+static void MC_put_o_16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16) 
+	mlib_VideoCopyRef_U8_U8_16x16 (dest, ref, stride);
+    else
+	mlib_VideoCopyRef_U8_U8_16x8 (dest, ref, stride);
+}
+/* put_x, 16 wide: hands off to mlib_VideoInterpX (16x16 if height == 16, otherwise 16x8). */
+static void MC_put_x_16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoInterpX_U8_U8_16x16 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpX_U8_U8_16x8 (dest, ref, stride, stride);
+}
+/* put_y, 16 wide: hands off to mlib_VideoInterpY (16x16 if height == 16, otherwise 16x8). */
+static void MC_put_y_16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoInterpY_U8_U8_16x16 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+/* put_xy, 16 wide: hands off to mlib_VideoInterpXY (16x16 if height == 16, otherwise 16x8). */
+static void MC_put_xy_16_mlib (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoInterpXY_U8_U8_16x16 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpXY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+/* put_o, 8 wide: hands off to mlib_VideoCopyRef (8x8 if height == 8, otherwise 8x4). */
+static void MC_put_o_8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoCopyRef_U8_U8_8x8 (dest, ref, stride);
+    else
+	mlib_VideoCopyRef_U8_U8_8x4 (dest, ref, stride);
+}
+/* put_x, 8 wide: hands off to mlib_VideoInterpX (8x8 if height == 8, otherwise 8x4). */
+static void MC_put_x_8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoInterpX_U8_U8_8x8 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpX_U8_U8_8x4 (dest, ref, stride, stride);
+}
+/* put_y, 8 wide: hands off to mlib_VideoInterpY (8x8 if height == 8, otherwise 8x4). */
+static void MC_put_y_8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoInterpY_U8_U8_8x8 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+/* put_xy, 8 wide: hands off to mlib_VideoInterpXY (8x8 if height == 8, otherwise 8x4). */
+static void MC_put_xy_8_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 8) 
+	mlib_VideoInterpXY_U8_U8_8x8 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpXY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+/* avg_o, 16 wide: hands off to mlib_VideoCopyRefAve (16x16 if height == 16, otherwise 16x8). */
+static void MC_avg_o_16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoCopyRefAve_U8_U8_16x16 (dest, ref, stride);
+    else
+	mlib_VideoCopyRefAve_U8_U8_16x8 (dest, ref, stride);
+}
+/* avg_x, 16 wide: hands off to mlib_VideoInterpAveX (16x16 if height == 16, otherwise 16x8). */
+static void MC_avg_x_16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoInterpAveX_U8_U8_16x16 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpAveX_U8_U8_16x8 (dest, ref, stride, stride);
+}
+/* avg_y, 16 wide: hands off to mlib_VideoInterpAveY (16x16 if height == 16, otherwise 16x8). */
+static void MC_avg_y_16_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoInterpAveY_U8_U8_16x16 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpAveY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+/* avg_xy, 16 wide: hands off to mlib_VideoInterpAveXY (16x16 if height == 16, otherwise 16x8). */
+static void MC_avg_xy_16_mlib (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{
+    if (height == 16)
+	mlib_VideoInterpAveXY_U8_U8_16x16 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpAveXY_U8_U8_16x8 (dest, ref, stride, stride);
+}
+/* avg_o, 8 wide: hands off to mlib_VideoCopyRefAve (8x8 if height == 8, otherwise 8x4). */
+static void MC_avg_o_8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoCopyRefAve_U8_U8_8x8 (dest, ref, stride);
+    else
+	mlib_VideoCopyRefAve_U8_U8_8x4 (dest, ref, stride);
+}
+/* avg_x, 8 wide: hands off to mlib_VideoInterpAveX (8x8 if height == 8, otherwise 8x4). */
+static void MC_avg_x_8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoInterpAveX_U8_U8_8x8 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpAveX_U8_U8_8x4 (dest, ref, stride, stride);
+}
+/* avg_y, 8 wide: hands off to mlib_VideoInterpAveY (8x8 if height == 8, otherwise 8x4). */
+static void MC_avg_y_8_mlib (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoInterpAveY_U8_U8_8x8 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpAveY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+/* avg_xy, 8 wide: hands off to mlib_VideoInterpAveXY (8x8 if height == 8, otherwise 8x4). */
+static void MC_avg_xy_8_mlib (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    if (height == 8)
+	mlib_VideoInterpAveXY_U8_U8_8x8 (dest, ref, stride, stride);	/* one stride serves both stride arguments */
+    else
+	mlib_VideoInterpAveXY_U8_U8_8x4 (dest, ref, stride, stride);
+}
+
+MPEG2_MC_EXTERN (mlib)	/* NOTE(review): macro defined outside this chunk; presumably publishes the MC_*_mlib routines -- confirm */
+
+#endif
diff --git a/src/video_dec/libmpeg2/motion_comp_mmx.c b/src/video_dec/libmpeg2/motion_comp_mmx.c
new file mode 100644
index 000000000..9c5ab455d
--- /dev/null
+++ b/src/video_dec/libmpeg2/motion_comp_mmx.c
@@ -0,0 +1,1013 @@
+/*
+ * motion_comp_mmx.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+#include "xine_mmx.h"
+
+#define CPU_MMXEXT 0	/* `cpu` value for which the pavg_* macros emit pavgb */
+#define CPU_3DNOW 1	/* any other `cpu` value makes the pavg_* macros emit pavgusb */
+
+
+/* MMX code - needs a rewrite */
+
+/* rounding constants: four 16-bit lanes each, added before the right shift */
+static mmx_t round1 = {0x0001000100010001LL};	/* +1 per lane, used before >> 1 */
+static mmx_t round4 = {0x0002000200020002LL};	/* +2 per lane, used before >> 2 */
+
+/*
+ * This code should probably be compiled with loop unrolling
+ * (i.e., -funroll-loops in gcc) because some of the loops
+ * use a small static number of iterations. This was written
+ * with the assumption that the compiler knows best about when
+ * unrolling will help.
+ */
+/* Clear mm0; the unpack-based helpers below use mm0 as their all-zero operand. */
+static inline void mmx_zero_reg (void)
+{
+    /* load 0 into mm0 */
+    pxor_r2r (mm0, mm0);
+}
+/* dest[0..7] = rounded average of src1[0..7] and src2[0..7].  Needs mm0 == 0; overwrites mm1-mm4. */
+static inline void mmx_average_2_U8 (uint8_t * dest,
+				     uint8_t * src1, uint8_t * src2)
+{
+    /* per byte: *dest = (*src1 + *src2 + 1) / 2, computed in 16-bit lanes */
+
+    movq_m2r (*src1, mm1);	// load 8 src1 bytes
+    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+
+    movq_m2r (*src2, mm3);	// load 8 src2 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows to mm1
+    paddw_m2r (round1, mm1);	// +1 so the shift rounds up
+    psraw_i2r (1, mm1);		// /2
+
+    paddw_r2r (mm4, mm2);	// add highs to mm2
+    paddw_m2r (round1, mm2);	// +1 so the shift rounds up
+    psraw_i2r (1, mm2);		// /2
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1, *dest);	// store result in dest
+}
+/* dest[0..7] = rounded average of dest and avg (src1, src2).  Needs mm0 == 0; overwrites mm1-mm6. */
+static inline void mmx_interp_average_2_U8 (uint8_t * dest,
+					    uint8_t * src1, uint8_t * src2)
+{
+    /* per byte: *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2, computed in 16-bit lanes */
+
+    movq_m2r (*dest, mm1);	// load 8 dest bytes
+    movq_r2r (mm1, mm2);	// copy 8 dest bytes
+
+    movq_m2r (*src1, mm3);	// load 8 src1 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src1 bytes
+
+    movq_m2r (*src2, mm5);	// load 8 src2 bytes
+    movq_r2r (mm5, mm6);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low dest bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high dest bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src1 bytes
+
+    punpcklbw_r2r (mm0, mm5);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm6);	// unpack high src2 bytes
+
+    paddw_r2r (mm5, mm3);	// add lows
+    paddw_m2r (round1, mm3);	// +1 so the shift rounds up
+    psraw_i2r (1, mm3);		// /2
+
+    paddw_r2r (mm6, mm4);	// add highs
+    paddw_m2r (round1, mm4);	// +1 so the shift rounds up
+    psraw_i2r (1, mm4);		// /2
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_m2r (round1, mm1);	// second rounding stage, against dest
+    psraw_i2r (1, mm1);		// /2
+
+    paddw_r2r (mm4, mm2);	// add highs
+    paddw_m2r (round1, mm2);	// second rounding stage, against dest
+    psraw_i2r (1, mm2);		// /2
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1, *dest);	// store result in dest
+}
+/* dest[0..7] = rounded average of the four sources.  Needs mm0 == 0; overwrites mm1-mm6. */
+static inline void mmx_average_4_U8 (uint8_t * dest,
+				     uint8_t * src1, uint8_t * src2,
+				     uint8_t * src3, uint8_t * src4)
+{
+    /* per byte: *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4, computed in 16-bit lanes */
+
+    movq_m2r (*src1, mm1);	// load 8 src1 bytes
+    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+
+    movq_m2r (*src2, mm3);	// load 8 src2 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    /* now have partials in mm1 and mm2 */
+
+    movq_m2r (*src3, mm3);	// load 8 src3 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src3 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src3 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src3 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    movq_m2r (*src4, mm5);	// load 8 src4 bytes
+    movq_r2r (mm5, mm6);	// copy 8 src4 bytes
+
+    punpcklbw_r2r (mm0, mm5);	// unpack low src4 bytes
+    punpckhbw_r2r (mm0, mm6);	// unpack high src4 bytes
+
+    paddw_r2r (mm5, mm1);	// add lows
+    paddw_r2r (mm6, mm2);	// add highs
+
+    /* now have subtotal in mm1 and mm2 */
+
+    paddw_m2r (round4, mm1);	// +2 so the shift rounds to nearest
+    psraw_i2r (2, mm1);		// /4
+    paddw_m2r (round4, mm2);	// +2 so the shift rounds to nearest
+    psraw_i2r (2, mm2);		// /4
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1, *dest);	// store result in dest
+}
+/* dest[0..7] = rounded average of dest and the 4-source average.  Needs mm0 == 0; overwrites mm1-mm6. */
+static inline void mmx_interp_average_4_U8 (uint8_t * dest,
+					    uint8_t * src1, uint8_t * src2,
+					    uint8_t * src3, uint8_t * src4)
+{
+    /* per byte: *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2 */
+
+    movq_m2r (*src1, mm1);	// load 8 src1 bytes
+    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+
+    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
+    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+
+    movq_m2r (*src2, mm3);	// load 8 src2 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    /* now have partials in mm1 and mm2 */
+
+    movq_m2r (*src3, mm3);	// load 8 src3 bytes
+    movq_r2r (mm3, mm4);	// copy 8 src3 bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low src3 bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high src3 bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    movq_m2r (*src4, mm5);	// load 8 src4 bytes
+    movq_r2r (mm5, mm6);	// copy 8 src4 bytes
+
+    punpcklbw_r2r (mm0, mm5);	// unpack low src4 bytes
+    punpckhbw_r2r (mm0, mm6);	// unpack high src4 bytes
+
+    paddw_r2r (mm5, mm1);	// add lows
+    paddw_r2r (mm6, mm2);	// add highs
+
+    paddw_m2r (round4, mm1);	// +2 so the shift rounds to nearest
+    psraw_i2r (2, mm1);		// /4
+    paddw_m2r (round4, mm2);	// +2 so the shift rounds to nearest
+    psraw_i2r (2, mm2);		// /4
+
+    /* now have subtotal/4 in mm1 and mm2 */
+
+    movq_m2r (*dest, mm3);	// load 8 dest bytes
+    movq_r2r (mm3, mm4);	// copy 8 dest bytes
+
+    punpcklbw_r2r (mm0, mm3);	// unpack low dest bytes
+    punpckhbw_r2r (mm0, mm4);	// unpack high dest bytes
+
+    paddw_r2r (mm3, mm1);	// add lows
+    paddw_r2r (mm4, mm2);	// add highs
+
+    paddw_m2r (round1, mm1);	// +1 so the shift rounds up
+    psraw_i2r (1, mm1);		// /2
+    paddw_m2r (round1, mm2);	// +1 so the shift rounds up
+    psraw_i2r (1, mm2);		// /2
+
+    /* now have end value in mm1 and mm2 */
+
+    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
+    movq_r2m (mm1,*dest);	// store result in dest
+}
+
+/*-----------------------------------------------------------------------*/
+/* dest = rounded average of dest and ref, 8 bytes per row (16 when width == 16); height must be >= 1. */
+static inline void MC_avg_mmx (int width, int height,
+			       uint8_t * dest, uint8_t * ref, int stride)
+{
+    mmx_zero_reg ();	/* mmx_average_2_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_average_2_U8 (dest, dest, ref);
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, dest+8, ref+8);	/* right-hand 8 bytes of the row */
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_avg_mmx with width 16. */
+static void MC_avg_o_16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_avg_mmx with width 8. */
+static void MC_avg_o_8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    MC_avg_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+/* Copy ref to dest, 8 bytes per row (16 when width == 16), for `height` rows; height must be >= 1. */
+static inline void MC_put_mmx (int width, int height,
+			       uint8_t * dest, uint8_t * ref, int stride)
+{
+    mmx_zero_reg ();	/* NOTE(review): mm0 is not read anywhere in this function */
+
+    do {
+	movq_m2r (* ref, mm1);	// load 8 ref bytes
+	movq_r2m (mm1,* dest);	// store 8 bytes at curr
+
+	if (width == 16)
+	    {
+		movq_m2r (* (ref+8), mm1);	// load 8 ref bytes
+		movq_r2m (mm1,* (dest+8));	// store 8 bytes at curr
+	    }
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_put_mmx with width 16. */
+static void MC_put_o_16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_put_mmx with width 8. */
+static void MC_put_o_8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    MC_put_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+/* Half pixel interpolation in the x direction: dest = avg (dest, avg (ref[x], ref[x+1])); height >= 1. */
+static inline void MC_avg_x_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{
+    mmx_zero_reg ();	/* mmx_interp_average_2_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_interp_average_2_U8 (dest, ref, ref+1);
+
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);	/* right-hand 8 bytes of the row */
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_avg_x_mmx with width 16. */
+static void MC_avg_x_16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_x_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_avg_x_mmx with width 8. */
+static void MC_avg_x_8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    MC_avg_x_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+/* dest = rounded average of ref[x] and ref[x+1], 8 bytes per row (16 when width == 16); height >= 1. */
+static inline void MC_put_x_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{
+    mmx_zero_reg ();	/* mmx_average_2_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_average_2_U8 (dest, ref, ref+1);
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, ref+9);	/* right-hand 8 bytes of the row */
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_put_x_mmx with width 16. */
+static void MC_put_x_16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_x_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_put_x_mmx with width 8. */
+static void MC_put_x_8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    MC_put_x_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+/* dest = avg (dest, 4-sample average of ref[x], ref[x+1] on this row and the next); height >= 1. */
+static inline void MC_avg_xy_mmx (int width, int height,
+				  uint8_t * dest, uint8_t * ref, int stride)
+{
+    uint8_t * ref_next = ref+stride;	/* row below the current ref row */
+
+    mmx_zero_reg ();	/* mmx_interp_average_4_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
+
+	if (width == 16)
+	    mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
+				     ref_next+8, ref_next+9);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_avg_xy_mmx with width 16. */
+static void MC_avg_xy_16_mmx (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    MC_avg_xy_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_avg_xy_mmx with width 8. */
+static void MC_avg_xy_8_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_xy_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+/* dest = 4-sample rounded average of ref[x], ref[x+1] on this row and the next; height >= 1. */
+static inline void MC_put_xy_mmx (int width, int height,
+				  uint8_t * dest, uint8_t * ref, int stride)
+{
+    uint8_t * ref_next = ref+stride;	/* row below the current ref row */
+
+    mmx_zero_reg ();	/* mmx_average_4_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
+
+	if (width == 16)
+	    mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_put_xy_mmx with width 16. */
+static void MC_put_xy_16_mmx (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{
+    MC_put_xy_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_put_xy_mmx with width 8. */
+static void MC_put_xy_8_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_xy_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+/* dest = avg (dest, avg (ref row, next ref row)), 8 bytes per row (16 when width == 16); height >= 1. */
+static inline void MC_avg_y_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{
+    uint8_t * ref_next = ref+stride;	/* row below the current ref row */
+
+    mmx_zero_reg ();	/* mmx_interp_average_2_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_interp_average_2_U8 (dest, ref, ref_next);
+
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_avg_y_mmx with width 16. */
+static void MC_avg_y_16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_y_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_avg_y_mmx with width 8. */
+static void MC_avg_y_8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    MC_avg_y_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+/* dest = rounded average of the ref row and the next ref row, 8 bytes per row (16 when width == 16). */
+static inline void MC_put_y_mmx (int width, int height,
+				 uint8_t * dest, uint8_t * ref, int stride)
+{
+    uint8_t * ref_next = ref+stride;	/* row below the current ref row */
+
+    mmx_zero_reg ();	/* mmx_average_2_U8 unpacks against a zeroed mm0 */
+
+    do {
+	mmx_average_2_U8 (dest, ref, ref_next);
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+/* 16-wide entry point: forwards to MC_put_y_mmx with width 16. */
+static void MC_put_y_16_mmx (uint8_t * dest, uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_y_mmx (16, height, dest, ref, stride);
+}
+/* 8-wide entry point: forwards to MC_put_y_mmx with width 8. */
+static void MC_put_y_8_mmx (uint8_t * dest, uint8_t * ref,
+			    int stride, int height)
+{
+    MC_put_y_mmx (8, height, dest, ref, stride);
+}
+
+
+MPEG2_MC_EXTERN (mmx)	/* NOTE(review): macro defined outside this chunk; presumably publishes the MC_*_mmx routines -- confirm */
+
+
+
+
+
+
+
+/* CPU_MMXEXT/CPU_3DNOW adaptation layer */
+/* Register-to-register byte average; reads a variable named `cpu` from the enclosing scope. */
+#define pavg_r2r(src,dest)		\
+do {					\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_r2r (src, dest);		\
+    else				\
+	pavgusb_r2r (src, dest);	\
+} while (0)
+/* Memory-to-register byte average; reads a variable named `cpu` from the enclosing scope. */
+#define pavg_m2r(src,dest)		\
+do {					\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_m2r (src, dest);		\
+    else				\
+	pavgusb_m2r (src, dest);	\
+} while (0)
+
+
+/* CPU_MMXEXT code */
+
+/* Copy 8 bytes per row from ref to dest for `height` rows (height >= 1); uses mm0 as scratch. */
+static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+/* Copy 16 bytes per row from ref to dest for `height` rows (height >= 1); uses mm0/mm1 as scratch. */
+static inline void MC_put1_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+/* dest = pavg (ref, dest), 8 bytes per row; `cpu` selects the instruction via pavg_m2r. */
+static inline void MC_avg1_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int cpu)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*dest, mm0);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+/* dest = pavg (ref, dest), 16 bytes per row; `cpu` selects the instruction via pavg_m2r. */
+static inline void MC_avg1_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int cpu)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*dest, mm0);
+	pavg_m2r (*(dest+8), mm1);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+/* dest = pavg (ref[i], ref[i+offset]), 8 bytes per row; `offset` is the byte distance to the second sample. */
+static inline void MC_put2_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int offset, int cpu)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*(ref+offset), mm0);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+/* dest = pavg (ref[i], ref[i+offset]), 16 bytes per row; `offset` is the byte distance to the second sample. */
+static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int offset, int cpu)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*(ref+offset), mm0);
+	pavg_m2r (*(ref+offset+8), mm1);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+/* dest = pavg (pavg (ref[i], ref[i+offset]), dest), 8 bytes per row. */
+static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int offset, int cpu)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*(ref+offset), mm0);
+	pavg_m2r (*dest, mm0);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+/* dest = pavg (pavg (ref[i], ref[i+offset]), dest), 16 bytes per row. */
+static inline void MC_avg2_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int offset, int cpu)
+{
+    do {
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*(ref+offset), mm0);
+	pavg_m2r (*(ref+offset+8), mm1);
+	pavg_m2r (*dest, mm0);
+	pavg_m2r (*(dest+8), mm1);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static mmx_t mask_one = {0x0101010101010101LL};	/* 0x01 in each of the 8 bytes; pand'ed into mm7 by the MC_*4_* helpers */
+
+static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int cpu)
+{   /* 8-byte-wide 4-sample interpolation over ref[x], ref[x+1] and the same pair one row down. */
+    movq_m2r (*ref, mm0);
+    movq_m2r (*(ref+1), mm1);
+    movq_r2r (mm0, mm7);
+    pxor_r2r (mm1, mm7);	/* mm7 = row[x] ^ row[x+1] for the first row */
+    pavg_r2r (mm1, mm0);	/* mm0 = pavg (row[x], row[x+1]) for the first row */
+    ref += stride;
+
+    do {	/* each pass reads one new row and reuses mm0/mm7 carried over from the previous row */
+	movq_m2r (*ref, mm2);
+	movq_r2r (mm0, mm5);
+
+	movq_m2r (*(ref+1), mm3);
+	movq_r2r (mm2, mm6);
+
+	pxor_r2r (mm3, mm6);	/* mm6 = xor of the new row's horizontal pair */
+	pavg_r2r (mm3, mm2);	/* mm2 = pavg of the new row's horizontal pair */
+
+	por_r2r (mm6, mm7);
+	pxor_r2r (mm2, mm5);
+
+	pand_r2r (mm5, mm7);
+	pavg_r2r (mm2, mm0);	/* pavg of the two per-row averages */
+
+	pand_m2r (mask_one, mm7);	/* keep only bit 0 of each byte as a 0/1 correction */
+
+	psubusb_r2r (mm7, mm0);	/* presumably undoes the double rounding of the nested pavg - TODO confirm */
+
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+
+	movq_r2r (mm6, mm7);	// unroll !
+	movq_r2r (mm2, mm0);	// unroll !
+    } while (--height);	/* height must be >= 1 on entry; height+1 ref rows are read in total */
+}
+
+static inline void MC_put4_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int cpu)
+{   /* 16-byte-wide 4-sample interpolation; each row is processed as two independent 8-byte halves. */
+    do {	/* unlike MC_put4_8, nothing is carried between rows: both ref rows are reloaded each pass */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);	/* xor of the diagonal pair ref[0], ref[stride+1] */
+	movq_m2r (*(ref+stride), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);	/* xor of the other diagonal pair ref[1], ref[stride] */
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* 0/1 per-byte correction term */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* presumably undoes the double rounding of the nested pavg - TODO confirm */
+	movq_r2m (mm0, *dest);
+
+	movq_m2r (*(ref+8), mm0);	/* same sequence again for bytes 8..15 */
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);	/* height must be >= 1 on entry */
+}
+
+static inline void MC_avg4_8 (int height, uint8_t * dest, uint8_t * ref,
+			      int stride, int cpu)
+{   /* 8-byte-wide 4-sample interpolation of ref, then pavg'd with the existing dest row. */
+    do {	/* both ref rows are reloaded on every pass */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);	/* xor of the diagonal pair ref[0], ref[stride+1] */
+	movq_m2r (*(ref+stride), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);	/* xor of the other diagonal pair ref[1], ref[stride] */
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* 0/1 per-byte correction term */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* presumably undoes the double rounding of the nested pavg - TODO confirm */
+	movq_m2r (*dest, mm1);
+	pavg_r2r (mm1, mm0);	/* combine the interpolated row with the current dest row */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);	/* height must be >= 1 on entry */
+}
+
+static inline void MC_avg4_16 (int height, uint8_t * dest, uint8_t * ref,
+			       int stride, int cpu)
+{   /* 16-byte-wide form of MC_avg4_8, processed as two independent 8-byte halves per row. */
+    do {	/* both ref rows are reloaded on every pass */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);	/* xor of the diagonal pair ref[0], ref[stride+1] */
+	movq_m2r (*(ref+stride), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);	/* xor of the other diagonal pair ref[1], ref[stride] */
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* 0/1 per-byte correction term */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* presumably undoes the double rounding of the nested pavg - TODO confirm */
+	movq_m2r (*dest, mm1);
+	pavg_r2r (mm1, mm0);	/* combine with the current dest bytes 0..7 */
+	movq_r2m (mm0, *dest);
+
+	movq_m2r (*(ref+8), mm0);	/* same sequence again for bytes 8..15 */
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	movq_m2r (*(dest+8), mm1);
+	pavg_r2r (mm1, mm0);	/* combine with the current dest bytes 8..15 */
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);	/* height must be >= 1 on entry */
+}
+
+static void MC_avg_o_16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 16-wide, no interpolation, ref averaged into dest. */
+    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_o_8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* MMXEXT entry point: 8-wide, no interpolation, ref averaged into dest. */
+    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_o_16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 16-wide plain copy (MC_put1_16 takes no cpu argument). */
+    MC_put1_16 (height, dest, ref, stride);
+}
+
+static void MC_put_o_8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* MMXEXT entry point: 8-wide variant; forwards to MC_put1_8 with no cpu argument. */
+    MC_put1_8 (height, dest, ref, stride);
+}
+
+static void MC_avg_x_16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 16-wide, offset 1 (byte paired with its right neighbour), averaged into dest. */
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_avg_x_8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* MMXEXT entry point: 8-wide, offset 1 (byte paired with its right neighbour), averaged into dest. */
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_put_x_16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 16-wide, offset 1 (byte paired with its right neighbour), written to dest. */
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_put_x_8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* MMXEXT entry point: 8-wide, offset 1 (byte paired with its right neighbour), written to dest. */
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_avg_y_16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 16-wide, offset stride (byte paired with the one a row below), averaged into dest. */
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_y_8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* MMXEXT entry point: 8-wide, offset stride (byte paired with the one a row below), averaged into dest. */
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_put_y_16_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 16-wide, offset stride (byte paired with the one a row below), written to dest. */
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_put_y_8_mmxext (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* MMXEXT entry point: 8-wide, offset stride (byte paired with the one a row below), written to dest. */
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_xy_16_mmxext (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* MMXEXT entry point: 16-wide 4-sample interpolation, averaged into dest. */
+    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_xy_8_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 8-wide 4-sample interpolation, averaged into dest. */
+    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_xy_16_mmxext (uint8_t * dest, uint8_t * ref,
+				 int stride, int height)
+{   /* MMXEXT entry point: 16-wide 4-sample interpolation, written to dest. */
+    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_xy_8_mmxext (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* MMXEXT entry point: 8-wide 4-sample interpolation, written to dest. */
+    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+
+MPEG2_MC_EXTERN (mmxext)	/* macro defined outside this file; presumably builds the table of the 16 MC_*_mmxext functions - confirm */
+
+
+
+static void MC_avg_o_16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 16-wide, no interpolation, ref averaged into dest. */
+    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_avg_o_8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 3DNow! entry point: 8-wide, no interpolation, ref averaged into dest. */
+    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_o_16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 16-wide plain copy (MC_put1_16 takes no cpu argument). */
+    MC_put1_16 (height, dest, ref, stride);
+}
+
+static void MC_put_o_8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 3DNow! entry point: 8-wide variant; forwards to MC_put1_8 with no cpu argument. */
+    MC_put1_8 (height, dest, ref, stride);
+}
+
+static void MC_avg_x_16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 16-wide, offset 1 (byte paired with its right neighbour), averaged into dest. */
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_avg_x_8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 3DNow! entry point: 8-wide, offset 1 (byte paired with its right neighbour), averaged into dest. */
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_put_x_16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 16-wide, offset 1 (byte paired with its right neighbour), written to dest. */
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_put_x_8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 3DNow! entry point: 8-wide, offset 1 (byte paired with its right neighbour), written to dest. */
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_avg_y_16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 16-wide, offset stride (byte paired with the one a row below), averaged into dest. */
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_avg_y_8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 3DNow! entry point: 8-wide, offset stride (byte paired with the one a row below), averaged into dest. */
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_put_y_16_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 16-wide, offset stride (byte paired with the one a row below), written to dest. */
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_put_y_8_3dnow (uint8_t * dest, uint8_t * ref,
+			      int stride, int height)
+{   /* 3DNow! entry point: 8-wide, offset stride (byte paired with the one a row below), written to dest. */
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_avg_xy_16_3dnow (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 3DNow! entry point: 16-wide 4-sample interpolation, averaged into dest. */
+    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_avg_xy_8_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 8-wide 4-sample interpolation, averaged into dest. */
+    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_xy_16_3dnow (uint8_t * dest, uint8_t * ref,
+				int stride, int height)
+{   /* 3DNow! entry point: 16-wide 4-sample interpolation, written to dest. */
+    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_xy_8_3dnow (uint8_t * dest, uint8_t * ref,
+			       int stride, int height)
+{   /* 3DNow! entry point: 8-wide 4-sample interpolation, written to dest. */
+    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+
+MPEG2_MC_EXTERN (3dnow)	/* macro defined outside this file; presumably builds the table of the 16 MC_*_3dnow functions - confirm */
+
+#endif
diff --git a/src/video_dec/libmpeg2/motion_comp_vis.c b/src/video_dec/libmpeg2/motion_comp_vis.c
new file mode 100644
index 000000000..d0a6673d6
--- /dev/null
+++ b/src/video_dec/libmpeg2/motion_comp_vis.c
@@ -0,0 +1,2059 @@
+/*
+ * motion_comp_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined(ARCH_SPARC) && defined(ENABLE_VIS)
+
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+#include "vis.h"
+
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16.  So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ *	fxor		f0, f2, f10
+ *	fand		f10, f4, f10
+ *	fmul8x16	f8, f10, f10
+ *	fand		f10, f6, f10
+ *	for		f0, f2, f12
+ *	fpsub16		f12, f10, f10
+ */
+
+#define DUP4(x) {x, x, x, x}	/* initializer: x repeated 4 times (four 16-bit lanes = 8 bytes) */
+#define DUP8(x) {x, x, x, x, x, x, x, x}	/* initializer: x repeated 8 times (eight bytes) */
+static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);	/* 8-byte constant tables, fetched with vis_ld64 */
+static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
+static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
+static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
+static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);	/* NOTE(review): 0xfe is outside int8_t range; relies on the narrowing conversion keeping the bit pattern */
+static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
+static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);	/* NOTE(review): 128 is likewise outside int8_t range */
+static const int16_t constants256_512[] ATTR_ALIGN(8) =
+	{256, 512, 256, 512};	/* loaded once, then used under both the CONST_256 and CONST_512 names */
+static const int16_t constants256_1024[] ATTR_ALIGN(8) =
+	{256, 1024, 256, 1024};
+
+#define REF_0		0	/* numbers passed as register operands to the vis_* macros; presumably FP register numbers - confirm in vis.h */
+#define REF_0_1		1	/* the *_1 names are the odd number following each even one */
+#define REF_2		2
+#define REF_2_1		3
+#define REF_4		4
+#define REF_4_1		5
+#define REF_6		6
+#define REF_6_1		7
+#define REF_S0		8
+#define REF_S0_1	9
+#define REF_S2		10
+#define REF_S2_1	11
+#define REF_S4		12
+#define REF_S4_1	13
+#define REF_S6		14
+#define REF_S6_1	15
+#define DST_0		16
+#define DST_1		17
+#define DST_2		18
+#define DST_3		19
+#define CONST_1		20	/* CONST_1/2/3/6 and MASK_fe all name 20: only one of them can be live at a time */
+#define CONST_2		20
+#define CONST_3		20
+#define CONST_6		20
+#define MASK_fe		20
+#define CONST_128	22	/* CONST_128/256/512/1024 all name 22 */
+#define CONST_256	22
+#define CONST_512	22
+#define CONST_1024	22
+#define TMP0		24
+#define TMP1		25
+#define TMP2		26
+#define TMP3		27
+#define TMP4		28
+#define TMP5		29
+#define ZERO		30	/* ZERO and MASK_7f both name 30 */
+#define MASK_7f		30
+
+#define TMP6		32	/* from here on only even numbers are defined */
+#define TMP8		34
+#define TMP10		36
+#define TMP12		38
+#define TMP14		40
+#define TMP16		42
+#define TMP18		44
+#define TMP20		46
+#define TMP22		48
+#define TMP24		50
+#define TMP26		52
+#define TMP28		54
+#define TMP30		56
+#define TMP32		58
+
+static void MC_put_o_16_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* Copy a 16-byte-wide block of `height' rows from _ref to dest, no interpolation. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+
+	ref = vis_alignaddr(ref);	/* presumably rounds ref down to 8 bytes and records the shift for vis_faligndata - confirm in vis.h */
+	offset = (ref != _ref) ? 16 : 0;	/* if ref came back unchanged, the third load re-reads ref+0 instead of ref+16 */
+	do {	/* 5 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_0);	/* becomes dest bytes 0..7 */
+		vis_st64(REF_0, dest[0]);
+
+		vis_faligndata(TMP2, TMP4, REF_2);	/* becomes dest bytes 8..15 */
+		vis_st64_2(REF_2, dest, 8);
+		dest += stride;
+	} while (--height);	/* height must be >= 1 on entry */
+}
+
+static void MC_put_o_8_vis (uint8_t * dest, uint8_t * _ref,
+			    int stride, int height)
+{	/* Copy an 8-byte-wide block of `height' rows from _ref to dest, no interpolation. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+
+	ref = vis_alignaddr(ref);	/* presumably rounds ref down to 8 bytes and records the shift for vis_faligndata - confirm in vis.h */
+	offset = (ref != _ref) ? 8 : 0;	/* if ref came back unchanged, the second load re-reads ref+0 instead of ref+8 */
+	do {	/* 4 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+
+		/* stall */
+
+		vis_faligndata(TMP0, TMP2, REF_0);	/* becomes the 8 dest bytes of this row */
+		vis_st64(REF_0, dest[0]);
+		dest += stride;
+	} while (--height);	/* height must be >= 1 on entry */
+}
+
+
+static void MC_avg_o_16_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* Average a 16-byte-wide block of _ref into dest using the (x|y)-((x^y)>>1) trick from the comment at the top of this file. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;	/* offset of bytes 8..15 of the next dest row */
+	int offset;
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;	/* if ref came back unchanged, the third load re-reads ref+0 instead of ref+16 */
+
+	vis_ld64(ref[0], TMP0);	/* prologue: first ref row and first dest row */
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(dest[8], DST_2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_ld64(constants128[0], CONST_128);
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* loop does 2 rows per pass, epilogue does the last 2: height must be even and >= 4 */
+
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP6);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP6, TMP6);	/* the ">>1" step, done as a multiply per the file header comment */
+		vis_xor(DST_2, REF_2, TMP8);
+
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_or(DST_0, REF_0, TMP10);
+		vis_ld64_2(dest, stride, DST_0);	/* fetch the next dest row before the current one is stored */
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+
+		vis_or(DST_2, REF_2, TMP12);
+		vis_ld64_2(dest, stride_8, DST_2);
+
+		vis_ld64(ref[0], TMP14);
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+
+		dest += stride;	/* second row of this pass starts here */
+		vis_ld64_2(ref, 8, TMP16);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP18);
+		vis_faligndata(TMP2, TMP4, REF_2);
+		ref += stride;
+
+		vis_xor(DST_0, REF_0, TMP20);
+
+		vis_and(TMP20, MASK_fe, TMP20);
+
+		vis_xor(DST_2, REF_2, TMP22);
+		vis_mul8x16(CONST_128, TMP20, TMP20);
+
+		vis_and(TMP22, MASK_fe, TMP22);
+
+		vis_or(DST_0, REF_0, TMP24);
+		vis_mul8x16(CONST_128, TMP22, TMP22);
+
+		vis_or(DST_2, REF_2, TMP26);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_faligndata(TMP14, TMP16, REF_0);	/* REF_0/REF_2 now hold the ref row for the next pass */
+
+		vis_ld64_2(dest, stride_8, DST_2);
+		vis_faligndata(TMP16, TMP18, REF_2);
+
+		vis_and(TMP20, MASK_7f, TMP20);
+
+		vis_and(TMP22, MASK_7f, TMP22);
+
+		vis_psub16(TMP24, TMP20, TMP20);
+		vis_st64(TMP20, dest[0]);
+
+		vis_psub16(TMP26, TMP22, TMP22);
+		vis_st64_2(TMP22, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* epilogue: last two rows, with no further look-ahead beyond them */
+	vis_xor(DST_0, REF_0, TMP6);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_ld64_2(ref, offset, TMP4);
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_xor(DST_2, REF_2, TMP8);
+
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_or(DST_0, REF_0, TMP10);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+
+	vis_or(DST_2, REF_2, TMP12);
+	vis_ld64_2(dest, stride_8, DST_2);
+
+	vis_ld64(ref[0], TMP14);	/* NOTE(review): TMP14 is not read again in this epilogue */
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+
+	dest += stride;
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_xor(DST_0, REF_0, TMP20);
+
+	vis_and(TMP20, MASK_fe, TMP20);
+
+	vis_xor(DST_2, REF_2, TMP22);
+	vis_mul8x16(CONST_128, TMP20, TMP20);
+
+	vis_and(TMP22, MASK_fe, TMP22);
+
+	vis_or(DST_0, REF_0, TMP24);
+	vis_mul8x16(CONST_128, TMP22, TMP22);
+
+	vis_or(DST_2, REF_2, TMP26);
+
+	vis_and(TMP20, MASK_7f, TMP20);
+
+	vis_and(TMP22, MASK_7f, TMP22);
+
+	vis_psub16(TMP24, TMP20, TMP20);
+	vis_st64(TMP20, dest[0]);
+
+	vis_psub16(TMP26, TMP22, TMP22);
+	vis_st64_2(TMP22, dest, 8);
+}
+
+static void MC_avg_o_8_vis (uint8_t * dest, uint8_t * _ref,
+			    int stride, int height)
+{	/* Average an 8-byte-wide block of _ref into dest using the (x|y)-((x^y)>>1) trick from the comment at the top of this file. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;	/* if ref came back unchanged, the second load re-reads ref+0 instead of ref+8 */
+
+	vis_ld64(ref[0], TMP0);	/* prologue: first ref row and first dest row */
+
+	vis_ld64_2(ref, offset, TMP2);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* loop does 2 rows per pass, epilogue does the last 2: height must be even and >= 4 */
+
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP4);
+
+		vis_ld64_2(ref, offset, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);	/* fetch the next dest row before the current one is stored */
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP4, TMP4);	/* the ">>1" step, done as a multiply per the file header comment */
+
+		vis_ld64(ref[0], TMP12);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		vis_xor(DST_0, REF_0, TMP0);
+		ref += stride;
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;	/* second row of this pass starts here */
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+
+		vis_faligndata(TMP12, TMP2, REF_0);	/* REF_0 now holds the ref row for the next pass */
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_psub16(TMP6, TMP0, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* epilogue: last two rows, with no further look-ahead beyond them */
+	vis_xor(DST_0, REF_0, TMP4);
+
+	vis_ld64_2(ref, offset, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(DST_0, REF_0, TMP6);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(DST_0, REF_0, TMP0);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, TMP4);
+	vis_st64(TMP4, dest[0]);
+	dest += stride;
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_or(DST_0, REF_0, TMP6);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_psub16(TMP6, TMP0, TMP4);
+	vis_st64(TMP4, dest[0]);
+}
+
+static void MC_put_x_16_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* 16-byte-wide put: each dest byte is the average of a ref byte and its right neighbour, via the (x|y)-((x^y)>>1) trick. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of _ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* offset of the right-hand neighbour */
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0],    TMP0);	/* prologue: first ref row, three doublewords */
+
+	vis_ld64_2(ref, 8,  TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0/REF_4: row extracted at offset off */
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {	/* REF_2/REF_6: the same row extracted at offset off+1 */
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {	/* off+1 == 8: the shifted data is exactly the next doubleword, so it is copied instead */
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* loop does 2 rows per pass, epilogue does the last 2: height must be even and >= 4 */
+
+	do {	/* 34 cycles */
+		vis_ld64(ref[0],    TMP0);
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_ld64_2(ref, 8,  TMP2);
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_ld64_2(ref, 16, TMP4);
+		vis_and(TMP6, MASK_fe, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],    TMP14);
+		vis_mul8x16(CONST_128, TMP6, TMP6);	/* the ">>1" step, done as a multiply per the file header comment */
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_ld64_2(ref, 8,  TMP16);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_ld64_2(ref, 16, TMP18);
+		ref += stride;
+		vis_or(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);	/* switch the alignment back to off before re-extracting REF_0/REF_4 */
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;	/* second row of this pass starts here */
+
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_or(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP14, TMP16, REF_0);	/* REF_* now hold the ref row for the next pass */
+
+		vis_faligndata(TMP16, TMP18, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP14, TMP16, REF_2);
+			vis_faligndata(TMP16, TMP18, REF_6);
+		} else {
+			vis_src1(TMP16, REF_2);
+			vis_src1(TMP18, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0],    TMP0);	/* epilogue: last two rows */
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_ld64_2(ref, 8,  TMP2);
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+}
+
+static void MC_put_x_8_vis (uint8_t * dest, uint8_t * _ref,
+			    int stride, int height)
+{	/* 8-byte-wide put: each dest byte is the average of a ref byte and its right neighbour, via the (x|y)-((x^y)>>1) trick. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of _ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* offset of the right-hand neighbour */
+
+	ref = vis_alignaddr(ref);
+
+	vis_ld64(ref[0], TMP0);	/* prologue: first ref row, two doublewords */
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0: row extracted at offset off */
+
+	if (off != 0x7) {	/* REF_2: the same row extracted at offset off+1 */
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {	/* off+1 == 8: the shifted data is exactly the next doubleword, so it is copied instead */
+		vis_src1(TMP2, REF_2);
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* loop does 2 rows per pass, epilogue does the last 2: height must be even and >= 4 */
+
+	do {	/* 20 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+		ref += stride;
+
+		vis_ld64(ref[0], TMP8);
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);	/* the ">>1" step, done as a multiply per the file header comment */
+
+		vis_alignaddr_g0((void *)off);	/* switch the alignment back to off before re-extracting REF_0 */
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+		} else {
+			vis_src1(TMP2, REF_2);
+		}
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;	/* second row of this pass starts here */
+
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_or(REF_0, REF_2, TMP14);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+		vis_faligndata(TMP8, TMP10, REF_0);	/* REF_0/REF_2 now hold the ref row for the next pass */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP8, TMP10, REF_2);
+		} else {
+			vis_src1(TMP10, REF_2);
+		}
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* epilogue: last two rows */
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_or(REF_0, REF_2, TMP14);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;	/* NOTE(review): dest is not used after this point, so this increment has no effect */
+}
+
+static void MC_avg_x_16_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* 16-byte-wide avg: combines each ref byte, its right neighbour and the dest byte in 16-bit lanes, one row per pass. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of _ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* offset of the right-hand neighbour */
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);	/* presumably the scale factor used by vis_pack16 - confirm in vis.h */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);	/* the same table also serves the CONST_512 operands below */
+
+	ref = vis_alignaddr(ref);
+	do {	/* 26 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);
+
+		vis_alignaddr_g0((void *)off);	/* re-select offset off; it was left at off+1 by the previous pass */
+
+		vis_ld64(ref[16], TMP4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0/REF_4: row extracted at offset off */
+
+		vis_ld64(dest[8], DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {	/* REF_2/REF_6: the same row extracted at offset off+1 */
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {	/* off+1 == 8: the shifted data is exactly the next doubleword */
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_mul8x16au(REF_0,   CONST_256, TMP0);	/* widen ref bytes 0..3 to 16-bit lanes */
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO, REF_2_1, TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);	/* ref + right neighbour */
+
+		vis_mul8x16al(DST_0,   CONST_512, TMP4);	/* dest contribution; presumably weighted x2 by the 512 factor - TODO confirm */
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_mul8x16al(DST_1,   CONST_512, TMP6);
+
+		vis_mul8x16au(REF_6,   CONST_256, TMP12);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4,   CONST_256, TMP16);
+
+		vis_padd16(TMP0, CONST_3, TMP8);	/* add the constant 3 to each lane before packing */
+		vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+		vis_padd16(TMP2, CONST_3, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_padd16(TMP16, TMP12, TMP0);
+
+		vis_st64(DST_0, dest[0]);	/* dest bytes 0..7 done; bytes 8..15 follow */
+		vis_mul8x16al(DST_2,   CONST_512, TMP4);
+		vis_padd16(TMP18, TMP14, TMP2);
+
+		vis_mul8x16al(DST_3,   CONST_512, TMP6);
+		vis_padd16(TMP0, CONST_3, TMP0);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[8]);
+
+		ref += stride;
+		dest += stride;
+	} while (--height);	/* height must be >= 1 on entry */
+}
+
+static void MC_avg_x_8_vis (uint8_t * dest, uint8_t * _ref,
+			    int stride, int height)
+{	/* 8-byte-wide avg: combines each ref byte, its right neighbour and the dest byte in 16-bit lanes, four rows per pass. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of _ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* offset of the right-hand neighbour */
+	int stride_times_2 = stride << 1;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);	/* presumably the scale factor used by vis_pack16 - confirm in vis.h */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);	/* the same table also serves the CONST_512 operands below */
+
+	ref = vis_alignaddr(ref);
+	height >>= 2;	/* four rows per pass: height must be a multiple of 4 and >= 4 */
+	do {	/* 47 cycles */
+		vis_ld64(ref[0],   TMP0);	/* load four consecutive ref rows, two doublewords each */
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+
+		vis_alignaddr_g0((void *)off);	/* re-select offset off; it was left at off+1 by the previous pass */
+
+		vis_ld64(ref[0],   TMP4);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, 8, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],   TMP8);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP4, TMP6, REF_4);
+
+		vis_ld64(ref[0],   TMP12);
+
+		vis_ld64_2(ref, 8, TMP14);
+		ref += stride;
+		vis_faligndata(TMP8, TMP10, REF_S0);
+
+		vis_faligndata(TMP12, TMP14, REF_S4);
+
+		if (off != 0x7) {	/* REF_2/REF_6/REF_S2/REF_S6: the four rows extracted at offset off+1 */
+			vis_alignaddr_g0((void *)off_plus_1);
+
+			vis_ld64(dest[0], DST_0);
+			vis_faligndata(TMP0, TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_faligndata(TMP4, TMP6, REF_6);
+
+			vis_faligndata(TMP8, TMP10, REF_S2);
+
+			vis_faligndata(TMP12, TMP14, REF_S6);
+		} else {	/* off+1 == 8: the shifted data is exactly the second doubleword of each row */
+			vis_ld64(dest[0], DST_0);
+			vis_src1(TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_src1(TMP6, REF_6);
+
+			vis_src1(TMP10, REF_S2);
+
+			vis_src1(TMP14, REF_S6);
+		}
+
+		vis_pmerge(ZERO,     REF_0,     TMP0);	/* rows 0 and 1 of this pass */
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);	/* dest contribution; presumably weighted x2 by the 512 factor - TODO confirm */
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP16, TMP0);
+		vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);	/* row 0 stored */
+		dest += stride;
+		vis_padd16(TMP10, CONST_3, TMP10);
+
+		vis_ld64_2(dest, stride, DST_0);	/* dest row 2 of this pass */
+		vis_padd16(TMP8, TMP16, TMP8);
+
+		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);	/* dest row 3 goes to TMP4 because DST_2 is still being produced for row 1 */
+		vis_padd16(TMP10, TMP18, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);	/* row 1 stored */
+		dest += stride;
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);	/* rows 2 and 3 of this pass */
+		vis_pmerge(ZERO,     REF_S0,     TMP0);
+
+		vis_pmerge(ZERO,     REF_S2,     TMP24);
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP24, TMP0);
+		vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP10, CONST_3, TMP10);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+		vis_padd16(TMP0, TMP16, TMP0);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);	/* row 2 stored */
+		dest += stride;
+
+		vis_padd16(TMP8, TMP20, TMP8);
+
+		vis_padd16(TMP10, TMP22, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);	/* row 3 stored */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_y_16_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* Writes `height` rows of 16 bytes to dest; output row r is built from ref row r and the ref row `stride` bytes below it. dest is never read. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;	/* displacement of the third 8-byte load of each source row */
+	/* NOTE(review): vis_alignaddr presumably returns ref rounded down to 8 bytes and records the misalignment for vis_faligndata -- confirm in vis.h. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;	/* 0 when _ref was already aligned: the third load then re-reads ref+0 instead of touching ref+16 */
+	/* Prologue: fetch source row 0 ... */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+	ref += stride;
+	/* ... and source row 1. */
+	vis_ld64(ref[0], TMP6);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64_2(ref, 8, TMP8);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_ld64_2(ref, offset, TMP10);
+	ref += stride;
+	/* REF_0/REF_4 hold row 0 (left/right 8 bytes); REF_2/REF_6 receive row 1 while the constants are loaded. */
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP6, TMP8, REF_2);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP8, TMP10, REF_6);
+	/* NOTE(review): height must be even and >= 4; with height == 2 the counter below starts at 0 and the do/while decrements it to -1 instead of stopping. */
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* the loop emits two rows per pass; the last pair is emitted after the loop */
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP12);
+		/* xor/or of the current row pair are taken before REF_0/REF_4 are overwritten with the next source row */
+		vis_ld64_2(ref, 8, TMP2);
+		vis_xor(REF_4, REF_6, TMP16);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_ld64(ref[0], TMP6);
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_ld64_2(ref, 8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+		/* (a ^ b) & MASK_fe, times CONST_128, & MASK_7f: presumably a per-byte (a ^ b) >> 1 -- confirm against the constant tables */
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_and(TMP16, MASK_fe, TMP16);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_mul8x16(CONST_128, TMP16, TMP16);
+		vis_xor(REF_0, REF_2, TMP0);
+
+		vis_xor(REF_4, REF_6, TMP2);
+
+		vis_or(REF_0, REF_2, TMP20);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_and(TMP16, MASK_7f, TMP16);
+		/* first output row of the pass: (a | b) - ((a ^ b) >> 1), i.e. the byte average rounded up, if the constants are what their names say */
+		vis_psub16(TMP14, TMP12, TMP12);
+		vis_st64(TMP12, dest[0]);
+
+		vis_psub16(TMP18, TMP16, TMP16);
+		vis_st64_2(TMP16, dest, 8);
+		dest += stride;
+		/* second output row: the old REF_2/REF_6 row paired with the row just loaded into REF_0/REF_4 */
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP2, MASK_fe, TMP2);
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+		/* REF_2/REF_6 are refilled only now, after their last use above */
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16(CONST_128, TMP2, TMP2);
+
+		vis_faligndata(TMP8, TMP10, REF_6);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_and(TMP2, MASK_7f, TMP2);
+
+		vis_psub16(TMP20, TMP0, TMP0);
+		vis_st64(TMP0, dest[0]);
+
+		vis_psub16(TMP18, TMP2, TMP2);
+		vis_st64_2(TMP2, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* Epilogue: last two output rows. Only one more source row is loaded, so height + 1 source rows are read in total. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_xor(REF_4, REF_6, TMP16);
+
+	vis_ld64_2(ref, offset, TMP4);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_and(TMP16, MASK_fe, TMP16);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_mul8x16(CONST_128, TMP16, TMP16);
+	vis_xor(REF_0, REF_2, TMP0);
+
+	vis_xor(REF_4, REF_6, TMP2);
+
+	vis_or(REF_0, REF_2, TMP20);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_and(TMP16, MASK_7f, TMP16);
+
+	vis_psub16(TMP14, TMP12, TMP12);
+	vis_st64(TMP12, dest[0]);
+
+	vis_psub16(TMP18, TMP16, TMP16);
+	vis_st64_2(TMP16, dest, 8);
+	dest += stride;
+	/* final output row; no further source loads or REF_2/REF_6 refills are needed */
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP2, MASK_fe, TMP2);
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_mul8x16(CONST_128, TMP2, TMP2);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_and(TMP2, MASK_7f, TMP2);
+
+	vis_psub16(TMP20, TMP0, TMP0);
+	vis_st64(TMP0, dest[0]);
+
+	vis_psub16(TMP18, TMP2, TMP2);
+	vis_st64_2(TMP2, dest, 8);
+}
+
+static void MC_put_y_8_vis (uint8_t * dest, uint8_t * _ref,
+			    int stride, int height)
+{	/* 8-byte-wide counterpart of MC_put_y_16_vis: output row r is built from ref row r and the ref row `stride` bytes below it. dest is never read. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;	/* displacement of the second 8-byte load of each source row */
+	/* NOTE(review): vis_alignaddr presumably returns ref rounded down to 8 bytes and records the misalignment for vis_faligndata -- confirm in vis.h. */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;	/* 0 when _ref was already aligned: the second load then re-reads ref+0 instead of touching ref+8 */
+	/* Prologue: fetch source rows 0 and 1. */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, offset, TMP2);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP4);
+
+	vis_ld64_2(ref, offset, TMP6);
+	ref += stride;
+	/* REF_0 receives row 0 and REF_2 row 1 while the constants are loaded. */
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP4, TMP6, REF_2);
+	/* NOTE(review): height must be even and >= 4; with height == 2 the counter below starts at 0 and the do/while decrements it to -1 instead of stopping. */
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* the loop emits two rows per pass; the last pair is emitted after the loop */
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+		vis_and(TMP4, MASK_fe, TMP4);
+		/* (a ^ b) & MASK_fe, times CONST_128, & MASK_7f: presumably a per-byte (a ^ b) >> 1 -- confirm against the constant tables */
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+		/* REF_0 takes the next source row; TMP0/TMP2 are then reused for the row after that */
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+		vis_or(REF_0, REF_2, TMP14);
+		/* first output row of the pass: (a | b) - ((a ^ b) >> 1), if the constants are what their names say */
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		/* REF_2 is refilled only after its last use above */
+		vis_faligndata(TMP0, TMP2, REF_2);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+		/* second output row of the pass */
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* Epilogue: last two output rows. Only one more source row is loaded, so height + 1 source rows are read in total. */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, offset, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);
+}
+
+static void MC_avg_y_16_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* Read-modify-write of `height` rows of 16 bytes: each dest row is replaced by a blend of its old contents with ref row r and the ref row below it. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;	/* displacement of the second doubleword of the next source row */
+	int stride_16;	/* displacement of the third doubleword of the next source row (set once offset is known) */
+	int offset;	/* 16, or 0 when _ref was already 8-byte aligned */
+	/* Program the GSR scale-factor field with 5; presumably this governs the vis_pack16 narrowing below -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;
+	/* Prologue: fetch source row 0 into REF_2/REF_6. */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+	stride_16 = stride + offset;	/* with offset 0 the third load re-reads the first doubleword instead of touching ref+16 */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_6);
+	height >>= 1;	/* two rows per pass; NOTE(review): height must be even and >= 2 or the do/while counter never reaches 0 as intended */
+
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP12);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+		/* TMP12..TMP18: the row carried in REF_2/REF_6 widened to 16-bit lanes; the next source row is loaded meanwhile */
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_pmerge(ZERO,       REF_6,     TMP16);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		/* current contents of the first destination row */
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_pmerge(ZERO,     REF_4,     TMP4);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		/* second destination row is parked in REF_S0/REF_S2 (the inline DST_4/DST_6 notes are the original author's) */
+		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
+		vis_faligndata(TMP6, TMP8, REF_2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+		vis_faligndata(TMP8, TMP10, REF_6);
+		vis_mul8x16al(DST_0,   CONST_512, TMP20);
+		/* middle row + CONST_3 is kept in TMP0..TMP6 because it is shared by both output rows of this pass */
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_1,   CONST_512, TMP22);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP4, CONST_3, TMP4);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_padd16(TMP6, CONST_3, TMP6);
+		/* first output row: upper ref row + scaled dest (+ middle row below); TMP20..TMP26 are then reused for the second dest row */
+		vis_padd16(TMP12, TMP20, TMP12);
+		vis_mul8x16al(REF_S0,   CONST_512, TMP20);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_mul8x16al(REF_S2,   CONST_512, TMP24);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_2,   CONST_256, TMP28);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+
+		vis_padd16(TMP16, TMP4, TMP16);
+		vis_mul8x16au(REF_6,   CONST_256, REF_S4);
+
+		vis_padd16(TMP18, TMP6, TMP18);
+		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+		/* narrow and store the first row while the second row's sums (new lower row + middle row) are started */
+		vis_pack16(TMP12, DST_0);
+		vis_padd16(TMP28, TMP0, TMP12);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP30, TMP2, TMP14);
+
+		vis_pack16(TMP16, DST_2);
+		vis_padd16(REF_S4, TMP4, TMP16);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(REF_S6, TMP6, TMP18);
+		/* second output row: add its scaled dest contribution, narrow and store */
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_pack16(TMP16, DST_2);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_y_8_vis (uint8_t * dest, uint8_t * _ref,
+			    int stride, int height)
+{	/* 8-byte-wide counterpart of MC_avg_y_16_vis: each dest row is replaced by a blend of its old contents with ref row r and the ref row below it. */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8;	/* displacement of the second doubleword of the next source row (set once offset is known) */
+	int offset;	/* 8, or 0 when _ref was already 8-byte aligned */
+	/* Program the GSR scale-factor field with 5; presumably this governs the vis_pack16 narrowing below -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;
+	/* Prologue: fetch source row 0 into REF_2. */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, offset, TMP2);
+	stride_8 = stride + offset;	/* with offset 0 the second load re-reads the first doubleword instead of touching ref+8 */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	/* two rows per pass; NOTE(review): height must be even and >= 2 or the do/while counter never reaches 0 as intended */
+	height >>= 1;
+	do {	/* 20 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP8);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+		/* TMP8/TMP10: the row carried in REF_2 widened to 16-bit lanes */
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		/* current contents of both destination rows of this pass */
+		vis_ld64(dest[0], DST_0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+		vis_pmerge(ZERO,       REF_0,     TMP12);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+		vis_pmerge(ZERO,       REF_0_1,   TMP14);
+		/* middle row + CONST_3 is kept in TMP12/TMP14 because it is shared by both output rows */
+		vis_padd16(TMP12, CONST_3, TMP12);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP14, CONST_3, TMP14);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+		/* REF_2 takes the lower row; it is also the carried row for the next pass */
+		vis_faligndata(TMP4, TMP6, REF_2);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_mul8x16au(REF_2,   CONST_256, TMP20);
+
+		vis_padd16(TMP8, TMP16, TMP0);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+		/* first output row: upper row + middle row + CONST_3 + scaled dest, narrowed by vis_pack16 */
+		vis_padd16(TMP10, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP12, TMP20, TMP12);
+		/* second output row: middle row + lower row + CONST_3 + scaled dest */
+		vis_padd16(TMP14, TMP22, TMP14);
+
+		vis_padd16(TMP12, TMP24, TMP0);
+
+		vis_padd16(TMP14, TMP26, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_xy_16_vis (uint8_t * dest, uint8_t * _ref,
+			      int stride, int height)
+{	/* Writes `height` rows of 16 bytes to dest; each output byte sums a 2x2 neighbourhood of ref (byte offsets off and off+1, rows r and r+1) plus CONST_2 before narrowing. dest is never read. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* misalignment of ref within its doubleword */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the data starting one byte further right */
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;	/* NOTE: unlike the _y_ variants, the third doubleword of every row is loaded unconditionally */
+	/* Program the GSR scale-factor field with 5; presumably this governs the vis_pack16 narrowing below -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	/* Prologue: fetch source row 0 (24 bytes). */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants2[0], CONST_2);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+	/* REF_S2/REF_S6: row 0 again, one byte further right. For off == 7 that data is exactly the next doubleword, so it is copied instead of re-aligned (off + 1 == 8 presumably does not fit the align field). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+	/* two rows per pass; NOTE(review): height must be even and >= 2 or the do/while counter never reaches 0 as intended */
+	height >>= 1;
+	do {
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+		/* switch the alignment back to off; the off+1 setting may still be active from the previous extraction */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+		/* TMP12..TMP26 now hold the carried row widened to 16-bit lanes; REF_0/REF_4 take the middle row, REF_S0/REF_S4 the lower row */
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_faligndata(TMP8, TMP10, REF_S4);
+		/* the same two rows, one byte further right (REF_2/REF_6 and REF_S2/REF_S6) */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+		/* middle row: widen, add CONST_2 and its right neighbour; the result (TMP8/TMP10, later TMP12/TMP14) is shared by both output rows */
+		vis_mul8x16au(REF_0, CONST_256, TMP0);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		vis_padd16(TMP0, CONST_2, TMP8);
+		vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+		vis_padd16(TMP2, CONST_2, TMP10);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+		vis_padd16(TMP8, TMP4, TMP8);
+		vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+		vis_padd16(TMP10, TMP6, TMP10);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+		/* first output row, left 8 bytes: carried row + middle row sums */
+		vis_padd16(TMP12, TMP8, TMP12);
+
+		vis_padd16(TMP14, TMP10, TMP14);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP0, CONST_2, TMP12);
+		/* TMP12/TMP14 are reused for the middle-row sums of the right 8 bytes; TMP0..TMP6 start widening the lower row */
+		vis_mul8x16au(REF_S0, CONST_256, TMP0);
+		vis_padd16(TMP2, CONST_2, TMP14);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_padd16(TMP12, TMP4, TMP12);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP4);
+		vis_padd16(TMP14, TMP6, TMP14);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+		vis_padd16(TMP20, TMP12, TMP20);
+		/* first output row, right 8 bytes */
+		vis_padd16(TMP22, TMP14, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		vis_padd16(TMP0, TMP4, TMP24);
+		/* second output row, left 8 bytes: lower row + the middle-row sums kept in TMP8/TMP10 */
+		vis_mul8x16au(REF_S4, CONST_256, TMP0);
+		vis_padd16(TMP2, TMP6, TMP26);
+
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+		vis_padd16(TMP24, TMP8, TMP24);
+
+		vis_padd16(TMP26, TMP10, TMP26);
+		vis_pack16(TMP24, DST_0);
+
+		vis_pack16(TMP26, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_pmerge(ZERO, REF_S6, TMP4);
+		/* second output row, right 8 bytes: lower row + the middle-row sums kept in TMP12/TMP14 */
+		vis_pmerge(ZERO,      REF_S6_1,  TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_padd16(TMP0, TMP12, TMP0);
+
+		vis_padd16(TMP2, TMP14, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_xy_8_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* 8-byte-wide counterpart of MC_put_xy_16_vis: each output byte sums a 2x2 neighbourhood of ref (byte offsets off and off+1, rows r and r+1) plus CONST_2 before narrowing. dest is never read. */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* misalignment of ref within its doubleword */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the data starting one byte further right */
+	int stride_8 = stride + 8;	/* NOTE: the second doubleword of every row is loaded unconditionally */
+	/* Program the GSR scale-factor field with 5; presumably this governs the vis_pack16 narrowing below -- confirm in vis.h. */
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	/* Prologue: fetch source row 0 (16 bytes). */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants2[0], CONST_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+	/* REF_S2: row 0 again, one byte further right. For off == 7 that data is exactly the next doubleword, so it is copied instead of re-aligned (off + 1 == 8 presumably does not fit the align field). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+	/* two rows per pass; NOTE(review): height must be even and >= 2 or the do/while counter never reaches 0 as intended */
+	height >>= 1;
+	do {	/* 26 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0,   CONST_256, TMP8);
+		vis_pmerge(ZERO,        REF_S2,    TMP12);
+		/* switch the alignment back to off; the off+1 setting may still be active from the previous extraction */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+		vis_pmerge(ZERO,        REF_S2_1,  TMP14);
+		/* TMP8..TMP14 hold the carried row (both byte offsets) widened to 16-bit lanes */
+		vis_ld64_2(ref, stride, TMP4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_S4);
+		/* REF_S4: middle row at offset off, widened into TMP18/TMP20 */
+		vis_pmerge(ZERO, REF_S4, TMP18);
+
+		vis_pmerge(ZERO, REF_S4_1, TMP20);
+		/* REF_S0: lower row at offset off; it is also the carried row for the next pass */
+		vis_faligndata(TMP4, TMP6, REF_S0);
+		/* REF_S6 / REF_S2: middle and lower rows one byte further right */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+		/* middle row sums (both offsets + CONST_2) end up in TMP18/TMP20 and are shared by both output rows */
+		vis_padd16(TMP18, CONST_2, TMP18);
+		vis_mul8x16au(REF_S6,   CONST_256, TMP22);
+
+		vis_padd16(TMP20, CONST_2, TMP20);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+		vis_mul8x16au(REF_S0,   CONST_256, TMP26);
+		vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+		vis_mul8x16au(REF_S2,   CONST_256, TMP30);
+		vis_padd16(TMP18, TMP22, TMP18);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+		vis_padd16(TMP20, TMP24, TMP20);
+		/* first output row: carried row + middle row sums */
+		vis_padd16(TMP8,  TMP18, TMP8);
+
+		vis_padd16(TMP10, TMP20, TMP10);
+
+		vis_padd16(TMP8,  TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP8,  DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		vis_padd16(TMP18, TMP26, TMP18);
+		/* second output row: middle row sums + lower row (both offsets) */
+		vis_padd16(TMP20, TMP28, TMP20);
+
+		vis_padd16(TMP18, TMP30, TMP18);
+
+		vis_padd16(TMP20, TMP32, TMP20);
+		vis_pack16(TMP18, DST_2);
+
+		vis_pack16(TMP20, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_16_vis (uint8_t * dest, uint8_t * _ref,
+			      int stride, int height)
+{	/* Read-modify-write of `height` rows of 16 bytes: each dest byte is replaced by a blend of its old value with a 2x2 neighbourhood of ref (byte offsets off and off+1, rows r and r+1). */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* misalignment of ref within its doubleword */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the data starting one byte further right */
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;	/* NOTE: the third doubleword of every row is loaded unconditionally */
+	/* Scale factor 4 here (the put_xy variants use 5), paired with CONST_6 and the 256/1024 constant table below. */
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	/* Prologue: fetch source row 0 (24 bytes). */
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants6[0], CONST_6);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+	/* REF_S2/REF_S6: row 0 again, one byte further right. For off == 7 that data is exactly the next doubleword, so it is copied instead of re-aligned (off + 1 == 8 presumably does not fit the align field). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);
+		vis_src1(TMP4, REF_S6);
+	}
+	/* two rows per pass; NOTE(review): height must be even and >= 2 or the do/while counter never reaches 0 as intended */
+	height >>= 1;
+	do {	/* 55 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+		/* switch the alignment back to off; the off+1 setting may still be active from the previous extraction */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+		/* TMP12..TMP26 now hold the carried row widened to 16-bit lanes; REF_0/REF_4 take the middle row, REF_S0/REF_S4 the lower row */
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+		/* current contents of the first destination row */
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP6, TMP8, REF_S0);
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP8, TMP10, REF_S4);
+		/* the same two rows, one byte further right (REF_2/REF_6 and REF_S2/REF_S6) */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+		/* scaled dest contribution for the left 8 bytes (TMP30/TMP32) and the widened middle row (TMP0..TMP6) */
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_0, TMP0);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+		/* from here on REF_0/REF_2 are reused as 16-bit scratch (scaled dest, right 8 bytes); their pixel contents were consumed above */
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP0, CONST_6, TMP0);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP2, CONST_6, TMP2);
+		/* TMP0/TMP2 become the middle-row sums (both offsets + CONST_6) for the left 8 bytes; they are reused for the second output row */
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+		/* REF_4/REF_6 are likewise reused as scratch for the widened lower row */
+		vis_padd16(TMP12, TMP16, TMP12);
+		vis_mul8x16au(REF_S0, CONST_256, REF_4);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+		/* first output row, left 8 bytes: add the scaled dest, narrow and store */
+		vis_padd16(TMP12, TMP30, TMP12);
+
+		vis_padd16(TMP14, TMP32, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);
+		vis_padd16(TMP4, CONST_6, TMP4);
+		/* DST_0 is reloaded with the left 8 bytes of the second destination row */
+		vis_ld64_2(dest, stride, DST_0);
+		vis_padd16(TMP6, CONST_6, TMP6);
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+		vis_padd16(TMP4, TMP8, TMP4);
+		vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
+		/* TMP4/TMP6: middle-row sums for the right 8 bytes, shared by both output rows */
+		vis_padd16(TMP6, TMP10, TMP6);
+
+		vis_padd16(TMP20, TMP4, TMP20);
+
+		vis_padd16(TMP22, TMP6, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+		/* first output row, right 8 bytes: add the scaled dest parked in REF_0/REF_2 */
+		vis_padd16(TMP20, REF_0, TMP20);
+		vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+		vis_padd16(TMP22, REF_2, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+		/* second output row: dest has advanced, so dest+8 is the right half of the second destination row */
+		vis_ld64_2(dest, 8, DST_2);
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO,      REF_S4_1,  REF_2);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_padd16(REF_4, TMP0, TMP8);
+
+		vis_mul8x16au(REF_S6, CONST_256, REF_4);
+		vis_padd16(REF_6, TMP2, TMP10);
+
+		vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		/* second output row, right 8 bytes: lower row (REF_0/REF_2 + REF_4/REF_6 scratch) + middle-row sums + scaled dest */
+		vis_padd16(REF_0, TMP4, REF_0);
+
+		vis_mul8x16al(DST_2,   CONST_1024, TMP30);
+		vis_padd16(REF_2, TMP6, REF_2);
+
+		vis_mul8x16al(DST_3,   CONST_1024, TMP32);
+		vis_padd16(REF_0, REF_4, REF_0);
+
+		vis_padd16(REF_2, REF_6, REF_2);
+
+		vis_padd16(REF_0, TMP30, REF_0);
+
+		/* stall */
+
+		vis_padd16(REF_2, TMP32, REF_2);
+		vis_pack16(REF_0, DST_2);
+
+		vis_pack16(REF_2, DST_3);
+		vis_st64_2(DST_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_8_vis (uint8_t * dest, uint8_t * _ref,
+			     int stride, int height)
+{	/* 8-byte-wide counterpart of MC_avg_xy_16_vis: each dest byte is replaced by a blend of its old value with a 2x2 neighbourhood of ref (byte offsets off and off+1, rows r and r+1). */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* misalignment of ref within its doubleword */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the data starting one byte further right */
+	int stride_8 = stride + 8;	/* NOTE: the second doubleword of every row is loaded unconditionally */
+	/* Scale factor 4 here (the put_xy variants use 5), paired with CONST_6 and the 256/1024 constant table below. */
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	/* Prologue: fetch source row 0 (16 bytes). */
+	vis_ld64(ref[0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64(constants6[0], CONST_6);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);
+	/* REF_S2: row 0 again, one byte further right. For off == 7 that data is exactly the next doubleword, so it is copied instead of re-aligned (off + 1 == 8 presumably does not fit the align field). */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);
+	} else {
+		vis_src1(TMP2, REF_S2);
+	}
+	/* two rows per pass; NOTE(review): height must be even and >= 2 or the do/while counter never reaches 0 as intended */
+	height >>= 1;
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP8);
+		vis_pmerge(ZERO,      REF_S0_1,  TMP10);
+		/* TMP8..TMP14 hold the carried row (both byte offsets) widened to 16-bit lanes */
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP14);
+		/* switch the alignment back to off before extracting the next rows */
+		vis_alignaddr_g0((void *)off);
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_faligndata(TMP0, TMP2, REF_S4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		/* current contents of both destination rows of this pass */
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP4, TMP6, REF_S0);
+
+		vis_ld64_2(dest, stride, DST_2);
+		/* REF_S6 / REF_S2: middle and lower rows one byte further right */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+		/* scaled dest contribution of the first row (TMP30/TMP32) and the widened middle row (TMP22..TMP28) */
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_S4, TMP22);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP24);
+
+		vis_mul8x16au(REF_S6, CONST_256, TMP26);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP28);
+		/* REF_S4/REF_S6 are reused as 16-bit scratch for the widened lower row; their pixel contents were consumed above */
+		vis_mul8x16au(REF_S0, CONST_256, REF_S4);
+		vis_padd16(TMP22, CONST_6, TMP22);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+		vis_padd16(TMP24, CONST_6, TMP24);
+		/* REF_0/REF_2 are used purely as scratch for the scaled second destination row */
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP24, TMP28, TMP24);
+		/* TMP22/TMP24 now hold the middle-row sums shared by both output rows; TMP26/TMP28 are reused for the lower row at off+1 */
+		vis_mul8x16au(REF_S2, CONST_256, TMP26);
+		vis_padd16(TMP8, TMP22, TMP8);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+		vis_padd16(TMP10, TMP24, TMP10);
+		/* first output row: carried row + middle-row sums + scaled dest */
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+		/* second output row: lower row + middle-row sums + scaled dest */
+		vis_padd16(REF_S4, TMP22, TMP12);
+
+		vis_padd16(REF_S6, TMP24, TMP14);
+
+		vis_padd16(TMP12, TMP26, TMP12);
+
+		vis_padd16(TMP14, TMP28, TMP14);
+
+		vis_padd16(TMP12, REF_0, TMP12);
+
+		vis_padd16(TMP14, REF_2, TMP14);
+		vis_pack16(TMP12, DST_2);
+
+		vis_pack16(TMP14, DST_3);
+		vis_st64(DST_2, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+MPEG2_MC_EXTERN(vis);	/* NOTE(review): presumably expands to the table that exports the static MC_*_vis routines above -- confirm against the macro definition */
+
+#endif  /* defined(ARCH_SPARC) && defined(ENABLE_VIS) */
diff --git a/src/video_dec/libmpeg2/mpeg2.h b/src/video_dec/libmpeg2/mpeg2.h
new file mode 100644
index 000000000..ae69688f5
--- /dev/null
+++ b/src/video_dec/libmpeg2/mpeg2.h
@@ -0,0 +1,100 @@
+/*
+ * mpeg2.h
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Structure for the mpeg2dec decoder */
+
+#ifndef MPEG2_H
+#define MPEG2_H
+
+#include "libmpeg2_accel.h"
+
+typedef struct mpeg2dec_s {	/* decoder instance state; a pointer to it is the first argument of every mpeg2_* function declared in this header */
+    xine_video_port_t * output;
+    uint32_t frame_format;
+
+    /* this is where we keep the state of the decoder */
+    struct picture_s * picture;
+    void *picture_base;	/* NOTE(review): presumably the raw allocation backing picture -- confirm in decode.c */
+    
+    uint32_t shift;
+    int new_sequence;
+    int is_sequence_needed;
+    int is_wait_for_ip_frames;
+    int frames_to_drop, drop_frame;
+    int in_slice;
+    int seek_mode, is_frame_needed;
+
+    /* the maximum chunk size is determined by vbv_buffer_size */
+    /* which is 224K for MP@ML streams. */
+    /* (we make no pretenses of decoding anything more than that) */
+    /* allocated in init - gcc has problems allocating such big structures */
+    uint8_t * chunk_buffer;
+    void *chunk_base;	/* NOTE(review): presumably the raw allocation backing chunk_buffer -- confirm in decode.c */
+    /* pointer to current position in chunk_buffer */
+    uint8_t * chunk_ptr;
+    /* last start code ? */
+    uint8_t code;
+    uint32_t chunk_size;
+
+    int64_t pts;	/* NOTE(review): signed here, while mpeg2_decode_data() takes a uint64_t pts */
+    uint32_t rff_pattern; 
+    int force_aspect;
+    int force_pan_scan;
+
+    /* AFD data can be found after a sequence, group or picture start code */
+    /* and will be stored in afd_value_seen. Later it will be transferred to */
+    /* a stream property and stored into afd_value_reported to detect changes */
+    int afd_value_seen;
+    int afd_value_reported;
+
+    xine_stream_t *stream;
+    
+    /* a spu decoder for possible closed captions */
+    spu_decoder_t *cc_dec;
+    mpeg2dec_accel_t accel;
+
+} mpeg2dec_t ;
+
+
+/* initialize mpeg2dec for the given video output port */
+void mpeg2_init (mpeg2dec_t * mpeg2dec, 
+		 xine_video_port_t * output);
+
+/* destroy everything which was allocated, shutdown the output */
+void mpeg2_close (mpeg2dec_t * mpeg2dec);
+/* process the buffer delimited by data_start and data_end; NOTE(review): pts is uint64_t here but int64_t in mpeg2dec_t */
+int mpeg2_decode_data (mpeg2dec_t * mpeg2dec,
+		       uint8_t * data_start, uint8_t * data_end, 
+		       uint64_t pts);
+/* NOTE(review): by its name, only looks for a sequence header in the given buffer -- confirm in decode.c */
+void mpeg2_find_sequence_header (mpeg2dec_t * mpeg2dec,
+				 uint8_t * data_start, uint8_t * data_end);
+/* decoder state control; the exact semantics of each call are defined in decode.c, not in this header */
+void mpeg2_flush (mpeg2dec_t * mpeg2dec);
+void mpeg2_reset (mpeg2dec_t * mpeg2dec);
+void mpeg2_discontinuity (mpeg2dec_t * mpeg2dec);
+
+/* Not needed: it is defined as static in decode.c, and nothing else calls it
+ * currently.
+ */
+/* void process_userdata(mpeg2dec_t *mpeg2dec, uint8_t *buffer); */
+
+#endif
diff --git a/src/video_dec/libmpeg2/mpeg2_internal.h b/src/video_dec/libmpeg2/mpeg2_internal.h
new file mode 100644
index 000000000..eeaa16227
--- /dev/null
+++ b/src/video_dec/libmpeg2/mpeg2_internal.h
@@ -0,0 +1,294 @@
+/*
+ * mpeg2_internal.h
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef MPEG2_INTERNAL_H
+#define MPEG2_INTERNAL_H
+
+#include <xine/video_out.h>
+#include "accel_xvmc.h"
+
+#ifdef ENABLE_ALTIVEC
+#include <altivec.h>
+#endif
+
+/* macroblock modes (aliases of the XINE_MACROBLOCK_* flags) */
+#define MACROBLOCK_INTRA            XINE_MACROBLOCK_INTRA
+#define MACROBLOCK_PATTERN          XINE_MACROBLOCK_PATTERN
+#define MACROBLOCK_MOTION_BACKWARD  XINE_MACROBLOCK_MOTION_BACKWARD
+#define MACROBLOCK_MOTION_FORWARD   XINE_MACROBLOCK_MOTION_FORWARD
+#define MACROBLOCK_QUANT            XINE_MACROBLOCK_QUANT
+#define DCT_TYPE_INTERLACED         XINE_MACROBLOCK_DCT_TYPE_INTERLACED
+
+/* motion_type: a 2-bit code multiplied by MOTION_TYPE_BASE and OR-ed into macroblock_modes */
+#define MOTION_TYPE_MASK (3*64)
+#define MOTION_TYPE_BASE 64
+#define MC_FIELD (1*64)
+#define MC_FRAME (2*64)
+#define MC_16X8 (2*64) /* same numeric value as MC_FRAME */
+#define MC_DMV (3*64)
+
+/* picture structure (aliases of the video_out VO_* field flags) */
+#define TOP_FIELD     VO_TOP_FIELD
+#define BOTTOM_FIELD  VO_BOTTOM_FIELD
+#define FRAME_PICTURE VO_BOTH_FIELDS
+
+/* picture coding type (mpeg2 header) */
+#define I_TYPE 1
+#define P_TYPE 2
+#define B_TYPE 3
+#define D_TYPE 4
+               
+typedef struct motion_s { /* per-direction motion state; picture_t holds one as f_motion and one as b_motion */
+    uint8_t * ref[2][3]; /* presumably reference picture plane pointers - TODO confirm in slice.c */
+    uint8_t ** ref2[2];
+    int pmv[2][2]; /* presumably the motion vector predictors - TODO confirm */
+    int f_code[2]; /* presumably the f_code values handed to get_motion_delta() - TODO confirm */
+} motion_t;
+
+typedef struct picture_s {
+    /* first, state that carries information from one macroblock to the */
+    /* next inside a slice, and is never used outside of mpeg2_slice() */
+
+    /* DCT coefficients (written by the get_*_block parsers) - should be kept aligned ! */
+    int16_t DCTblock[64];
+
+    /* XvMC DCT block and macroblock data for XvMC acceleration */
+    xine_macroblocks_t *mc;
+    int XvMC_mb_type;
+    int XvMC_mv_field_sel[2][2];
+    int XvMC_x;
+    int XvMC_y;
+    int XvMC_motion_type;
+    int XvMC_dmvector[2];
+    int XvMC_cbp;
+    int XvMC_dct_type;
+
+    /* bit parsing stuff */
+    uint32_t bitstream_buf;	/* current 32 bit working set of buffer */
+    int bitstream_bits;		/* used bits in working set */
+    uint8_t * bitstream_ptr;	/* buffer with stream data */
+
+    uint8_t * dest[3];
+    int pitches[3];
+    int offset;
+    unsigned int limit_x;
+    unsigned int limit_y_16;
+    unsigned int limit_y_8;
+    unsigned int limit_y;
+
+    /* Motion vectors */
+    /* The f_ and b_ correspond to the forward and backward motion */
+    /* predictors */
+    motion_t b_motion;
+    motion_t f_motion;
+
+    /* predictor for DC coefficients in intra blocks */
+    int16_t dc_dct_pred[3];
+
+    int quantizer_scale;	/* remove (still read by the get_*_block parsers in slice.c) */
+    int current_field;		/* remove */
+    int dmv_offset;		/* remove */
+    unsigned int v_offset;		/* remove */
+
+
+    /* now non-slice-specific information */
+
+    /* sequence header stuff */
+    uint8_t intra_quantizer_matrix [64];
+    uint8_t non_intra_quantizer_matrix [64];
+    int load_intra_quantizer_matrix;
+    int load_non_intra_quantizer_matrix;
+
+    /* The width and height of the picture snapped to macroblock units */
+    int coded_picture_width;
+    int coded_picture_height;
+    
+    /* The width and height as it appears on header sequence */
+    unsigned int display_width, display_height;
+
+    /* picture header stuff */
+
+    /* what type of picture this is (I, P, B, D) */
+    int picture_coding_type;
+
+    int vbv_delay;
+    int low_delay;
+	
+    /* picture coding extension stuff */
+	
+    /* quantization factor for intra dc coefficients */
+    int intra_dc_precision;
+    /* top/bottom/both fields */
+    int picture_structure;
+    /* bool to indicate all predictions are frame based */
+    int frame_pred_frame_dct;
+    /* bool to indicate whether intra blocks have motion vectors */
+    /* (for concealment) */
+    int concealment_motion_vectors;
+    /* nonzero: get_quantizer_scale() maps the code through the non-linear table */
+    int q_scale_type;
+    /* bool to use different vlc tables (slice_intra_DCT picks get_intra_block_B15) */
+    int intra_vlc_format;
+    /* used for DMV MC */
+    int top_field_first;
+
+    /* stuff derived from bitstream */
+
+    /* pointer to the zigzag scan we're supposed to be using */
+    uint8_t * scan;
+
+    struct vo_frame_s * current_frame;
+    struct vo_frame_s * forward_reference_frame;
+    struct vo_frame_s * backward_reference_frame;
+
+    int frame_width, frame_height;
+
+    int second_field;
+
+    int mpeg1; /* nonzero: slice_intra_DCT() uses get_mpeg1_intra_block() */
+
+    int skip_non_intra_dct;
+
+    /* these things are not needed by the decoder */
+    /* this is a temporary interface, we will build a better one later. */
+    int aspect_ratio_information;
+    int saved_aspect_ratio;
+    int frame_rate_code;
+    int progressive_sequence;
+    int repeat_first_field;
+    int progressive_frame;
+    int32_t frame_centre_horizontal_offset;
+    int32_t frame_centre_vertical_offset;
+    uint32_t video_format;
+    uint32_t colour_description;
+    uint32_t colour_primatives; /* NOTE(review): likely a misspelling of colour_primaries */
+    uint32_t transfer_characteristics;
+    uint32_t matrix_coefficients;
+    uint32_t display_horizontal_size;
+    uint32_t display_vertical_size;
+    uint32_t drop_frame_flag;
+    uint32_t time_code_hours;
+    uint32_t time_code_minutes;
+    uint32_t time_code_seconds;
+    uint32_t time_code_pictures;
+    uint32_t closed_gop;
+    uint32_t broken_link;
+
+    int bitrate;
+    int frame_rate_ext_n;
+    int frame_rate_ext_d;
+
+} picture_t;
+
+typedef struct cpu_state_s { /* argument type of mpeg2_cpu_state_save / mpeg2_cpu_state_restore */
+#ifdef ARCH_PPC
+    uint8_t regv[12*16]; /* 12 * 16 bytes; presumably a register save area - confirm in cpu_state.c */
+#endif
+    int dummy; /* only member when ARCH_PPC is not defined, so the struct is never empty */
+} cpu_state_t;
+
+/* cpu_state.c - save/restore are function pointers, not functions */
+extern void (* mpeg2_cpu_state_save) (cpu_state_t * state);
+extern void (* mpeg2_cpu_state_restore) (cpu_state_t * state);
+void mpeg2_cpu_state_init (uint32_t mm_accel);
+
+/* header.c */
+extern uint8_t mpeg2_scan_norm[64];
+extern uint8_t mpeg2_scan_alt[64];
+void mpeg2_header_state_init (picture_t * picture);
+int mpeg2_header_picture (picture_t * picture, uint8_t * buffer);
+int mpeg2_header_sequence (picture_t * picture, uint8_t * buffer);
+int mpeg2_header_extension (picture_t * picture, uint8_t * buffer);
+int mpeg2_header_group_of_pictures (picture_t * picture, uint8_t * buffer);
+
+/* idct.c - the four entry points below are function pointers */
+extern void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
+extern void (* mpeg2_idct_add) (int16_t * block, uint8_t * dest, int stride);
+extern void (* mpeg2_idct) (int16_t * block);
+extern void (* mpeg2_zero_block) (int16_t * block);
+void mpeg2_idct_init (uint32_t mm_accel);
+
+/* idct_mlib.c */
+void mpeg2_idct_add_mlib (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_copy_mlib_non_ieee (int16_t * block, uint8_t * dest,
+				    int stride);
+void mpeg2_idct_add_mlib_non_ieee (int16_t * block, uint8_t * dest,
+				   int stride);
+void mpeg2_idct_mlib (int16_t * block);
+
+/* idct_mmx.c */
+void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmxext (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_mmxext (int16_t * block);
+void mpeg2_idct_copy_mmx (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmx (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_mmx (int16_t * block);
+void mpeg2_zero_block_mmx (int16_t * block);
+void mpeg2_idct_mmx_init (void);
+
+/* idct_altivec.c - block is a vector type only when ENABLE_ALTIVEC is defined */
+# ifdef ENABLE_ALTIVEC
+void mpeg2_idct_copy_altivec (vector signed short * block, unsigned char * dest,
+			      int stride);
+void mpeg2_idct_add_altivec (vector signed short * block, unsigned char * dest,
+			     int stride);
+# else /* ! ENABLE_ALTIVEC */
+void mpeg2_idct_copy_altivec (signed short * block, unsigned char * dest,
+			      int stride);
+void mpeg2_idct_add_altivec (signed short * block, unsigned char * dest,
+			     int stride);
+# endif /* ENABLE_ALTIVEC */
+void mpeg2_idct_altivec_init (void);
+
+/* motion_comp.c */
+void mpeg2_mc_init (uint32_t mm_accel);
+
+typedef struct mpeg2_mc_s { /* slots 0-3: o, x, y, xy _16 functions; 4-7: the _8 ones (see MPEG2_MC_EXTERN) */
+    void (* put [8]) (uint8_t * dst, uint8_t *, int32_t, int32_t);
+    void (* avg [8]) (uint8_t * dst, uint8_t *, int32_t, int32_t);
+} mpeg2_mc_t;
+/* Define the global mpeg2_mc_<x> from the MC_put_* / MC_avg_* functions carrying suffix <x>; the expansion already ends in ';'. */
+#define MPEG2_MC_EXTERN(x) mpeg2_mc_t mpeg2_mc_##x = {			  \
+    {MC_put_o_16_##x, MC_put_x_16_##x, MC_put_y_16_##x, MC_put_xy_16_##x, \
+     MC_put_o_8_##x,  MC_put_x_8_##x,  MC_put_y_8_##x,  MC_put_xy_8_##x}, \
+    {MC_avg_o_16_##x, MC_avg_x_16_##x, MC_avg_y_16_##x, MC_avg_xy_16_##x, \
+     MC_avg_o_8_##x,  MC_avg_x_8_##x,  MC_avg_y_8_##x,  MC_avg_xy_8_##x}  \
+};
+
+extern mpeg2_mc_t mpeg2_mc; /* presumably the table chosen by mpeg2_mc_init() - confirm in motion_comp.c */
+extern mpeg2_mc_t mpeg2_mc_c; /* the mpeg2_mc_<x> names below match what MPEG2_MC_EXTERN(<x>) defines */
+extern mpeg2_mc_t mpeg2_mc_mmx;
+extern mpeg2_mc_t mpeg2_mc_mmxext;
+extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_altivec;
+extern mpeg2_mc_t mpeg2_mc_mlib;
+extern mpeg2_mc_t mpeg2_mc_vis;
+
+/* slice.c */
+void mpeg2_slice (picture_t * picture, int code, uint8_t * buffer);
+
+/* stats.c */
+void mpeg2_stats (int code, uint8_t * buffer);
+
+
+#endif
diff --git a/src/video_dec/libmpeg2/slice.c b/src/video_dec/libmpeg2/slice.c
new file mode 100644
index 000000000..8247a9a24
--- /dev/null
+++ b/src/video_dec/libmpeg2/slice.c
@@ -0,0 +1,1833 @@
+/*
+ * slice.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include "mpeg2_internal.h"
+#include <xine/attributes.h>
+
+#include "vlc.h"
+
+static const int non_linear_quantizer_scale [] = { /* indexed by the 5-bit code read in get_quantizer_scale() */
+     0,  1,  2,  3,  4,  5,   6,   7,
+     8, 10, 12, 14, 16, 18,  20,  22,
+    24, 28, 32, 36, 40, 44,  48,  52,
+    56, 64, 72, 80, 88, 96, 104, 112
+};
+/* Decode the macroblock_modes VLC for the current picture type, plus any motion_type / dct_type bits that follow it. */
+static inline int get_macroblock_modes (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int macroblock_modes;
+    const MBtab * tab;
+
+    switch (picture->picture_coding_type) { /* no NEEDBITS in this function: presumably the caller refills first */
+    case I_TYPE:
+
+	tab = MB_I + UBITS (bit_buf, 1);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if ((! (picture->frame_pred_frame_dct)) &&
+	    (picture->picture_structure == FRAME_PICTURE)) { /* only then is a dct_type bit read */
+	    macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+	    DUMPBITS (bit_buf, bits, 1);
+	}
+
+	return macroblock_modes;
+
+    case P_TYPE:
+
+	tab = MB_P + UBITS (bit_buf, 5);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (picture->picture_structure != FRAME_PICTURE) { /* field picture: motion_type only, no dct_type */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (picture->frame_pred_frame_dct) { /* nothing read: motion type is forced to MC_FRAME */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
+		macroblock_modes |= MC_FRAME;
+	    return macroblock_modes;
+	} else { /* frame picture: 2-bit motion_type and/or 1-bit dct_type */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case B_TYPE:
+
+	tab = MB_B + UBITS (bit_buf, 6);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (picture->picture_structure != FRAME_PICTURE) { /* field picture: motion_type unless intra */
+	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (picture->frame_pred_frame_dct) {
+	    /* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
+	    macroblock_modes |= MC_FRAME; /* set unconditionally, intra macroblocks included */
+	    return macroblock_modes;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_INTRA)
+		goto intra; /* intra: skip motion_type, read dct_type only */
+	    macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+	    DUMPBITS (bit_buf, bits, 2);
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+	    intra:
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case D_TYPE:
+
+	DUMPBITS (bit_buf, bits, 1); /* one bit consumed, result is always intra */
+	return MACROBLOCK_INTRA;
+
+    default:
+	return 0; /* any other picture_coding_type: no bits consumed */
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Read the 5-bit quantizer_scale_code and return the quantizer scale it selects. */
+static inline int get_quantizer_scale (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    int quantizer_scale_code;
+
+    quantizer_scale_code = UBITS (bit_buf, 5);
+    DUMPBITS (bit_buf, bits, 5);
+
+    if (picture->q_scale_type)
+	return non_linear_quantizer_scale [quantizer_scale_code]; /* table lookup */
+    else
+	return quantizer_scale_code << 1; /* linear: twice the code */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one motion vector delta; f_code is the number of residual bits that follow the VLC and sign. */
+static inline int get_motion_delta (picture_t * picture, int f_code)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    int delta;
+    int sign;
+    const MVtab * tab;
+
+    if (bit_buf & 0x80000000) { /* leading 1 bit: delta is zero */
+	DUMPBITS (bit_buf, bits, 1);
+	return 0;
+    } else if (bit_buf >= 0x0c000000) { /* shorter codes: 4-bit lookup in MV_4 */
+
+	tab = MV_4 + UBITS (bit_buf, 4);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + f_code + 1; /* VLC + sign + residual accounted at once, no refill in this branch */
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code)
+	    delta += UBITS (bit_buf, f_code);
+	bit_buf <<= f_code;
+
+	return (delta ^ sign) - sign; /* negates delta when sign is -1 (presumably SBITS yields 0 or -1 - see vlc.h) */
+
+    } else { /* longer codes: 10-bit lookup in MV_10 */
+
+	tab = MV_10 + UBITS (bit_buf, 10);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code) {
+	    NEEDBITS (bit_buf, bits, bit_ptr); /* refill before reading the residual */
+	    delta += UBITS (bit_buf, f_code);
+	    DUMPBITS (bit_buf, bits, f_code);
+	}
+
+	return (delta ^ sign) - sign;
+
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Return vec unchanged if it lies in [-limit, limit) with limit = 16 << f_code; otherwise shift it once by 2 * limit towards zero. */
+static inline int bound_motion_vector (int vec, int f_code)
+{
+#if 1
+    unsigned int limit;
+    int sign;
+
+    limit = 16 << f_code;
+
+    if ((unsigned int)(vec + limit) < 2 * limit) /* single unsigned compare covers both bounds */
+	return vec;
+    else {
+	sign = ((int32_t)vec) >> 31; /* 0 for non-negative vec, -1 for negative (arithmetic shift assumed) */
+	return vec - ((2 * limit) ^ sign) + sign; /* vec - 2*limit when sign is 0, vec + 2*limit when sign is -1 */
+    }
+#else
+    return ((int32_t)vector << (27 - f_code)) >> (27 - f_code); /* NOTE(review): disabled; names `vector`, but the parameter is `vec` */
+#endif
+}
+/* Decode one dmv value through a 2-bit lookup in the DMV_2 table. */
+static inline int get_dmv (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    const DMVtab * tab;
+
+    tab = DMV_2 + UBITS (bit_buf, 2);
+    DUMPBITS (bit_buf, bits, tab->len); /* the table entry says how many bits the code really used */
+    return tab->dmv;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode the coded block pattern, using the CBP_7 table for shorter codes and CBP_9 for longer ones. */
+static inline int get_coded_block_pattern (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    const CBPtab * tab;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    if (bit_buf >= 0x20000000) { /* top 7 bits are >= 16 here, hence the - 16 index bias */
+
+	tab = CBP_7 + (UBITS (bit_buf, 7) - 16);
+	DUMPBITS (bit_buf, bits, tab->len);
+	return tab->cbp;
+
+    } else { /* remaining codes: 9-bit lookup */
+
+	tab = CBP_9 + UBITS (bit_buf, 9);
+	DUMPBITS (bit_buf, bits, tab->len);
+	return tab->cbp;
+    }
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode the DC differential of a luma block: a size code (DC_lum_5 or DC_long) followed by `size` value bits. */
+static inline int get_luma_dc_dct_diff (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+
+    if (bit_buf < 0xf8000000) { /* top 5 bits not all ones: 5-bit lookup */
+	tab = DC_lum_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size; /* size code and value bits accounted together */
+	    bit_buf <<= tab->len;
+	    dc_diff = /* value bits; a leading 0 bit makes it negative (assuming UBITS/SBITS take the top bits - see vlc.h) */
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff;
+	} else {
+	    DUMPBITS (bit_buf, bits, 3); /* size 0: drop 3 bits, no value bits follow */
+	    return 0;
+	}
+    } else { /* long size codes: 9-bit lookup in DC_long, index biased by 0x1e0 */
+	tab = DC_long + (UBITS (bit_buf, 9) - 0x1e0);
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode the DC differential of a chroma block: a size code (DC_chrom_5 or DC_long) followed by `size` value bits. */
+static inline int get_chroma_dc_dct_diff (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+
+    if (bit_buf < 0xf8000000) { /* top 5 bits not all ones: 5-bit lookup */
+	tab = DC_chrom_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size; /* size code and value bits accounted together */
+	    bit_buf <<= tab->len;
+	    dc_diff = /* value bits; a leading 0 bit makes it negative (assuming UBITS/SBITS take the top bits - see vlc.h) */
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff;
+	} else {
+	    DUMPBITS (bit_buf, bits, 2); /* size 0: drop 2 bits (the luma variant drops 3) */
+	    return 0;
+	}
+    } else { /* long size codes: 10-bit lookup in the shared DC_long table, index biased by 0x3e0 */
+	tab = DC_long + (UBITS (bit_buf, 10) - 0x3e0);
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len + 1); /* one bit more than the luma variant consumes */
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Clamp val to [-2048, 2047] in place; val is expanded unparenthesized and more than once, so pass a plain int lvalue. */
+#define SATURATE(val)			\
+do {					\
+    if ((uint32_t)(val + 2048) > 4095)	\
+	val = (val > 0) ? 2047 : -2048;	\
+} while (0)
+/* Parse the run/level codes of an intra block with the DCT_B14* tables and store dequantized values in picture->DCTblock. */
+static void get_intra_block_B14 (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    dest = picture->DCTblock;
+    i = 0;
+    mismatch = ~dest[0]; /* dest[0] was already set by slice_intra_DCT(); it seeds the parity tracking */
+
+    bit_buf = picture->bitstream_buf; /* work on local copies, written back at the end */
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i]; /* scan order position -> index into the block */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1; /* + 1 for the sign bit handled below */
+	    val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64; /* run comes from the 6 bits after the first 6 */
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = (SBITS (bit_buf, 12) * /* signed 12-bit level, so / 16 is used instead of >> 4 */
+		   quantizer_scale * quant_matrix[j]) / 16;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1; /* fold the accumulated parity into the last coefficient's low bit */
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* Same job as get_intra_block_B14 but with the DCT_B15* tables; slice_intra_DCT() calls it when intra_vlc_format is set. */
+static void get_intra_block_B15 (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    dest = picture->DCTblock;
+    i = 0;
+    mismatch = ~dest[0]; /* dest[0] was already set by slice_intra_DCT(); it seeds the parity tracking */
+
+    bit_buf = picture->bitstream_buf; /* work on local copies, written back at the end */
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B15_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64) {
+
+	    normal_code:
+		j = scan[i]; /* scan order position -> index into the block */
+		bit_buf <<= tab->len;
+		bits += tab->len + 1; /* + 1 for the sign bit handled below */
+		val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+
+		/* if (bitstream_get (1)) val = -val; */
+		val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		bit_buf <<= 1;
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    } else {
+
+		/* end of block. I commented out this code because if we */
+		/* don't exit here we will still exit at the later test :) */
+
+		/* if (i >= 128) break;	*/	/* end of block */
+
+		/* escape code */
+
+		i += UBITS (bit_buf << 6, 6) - 64; /* run comes from the 6 bits after the first 6 */
+		if (i >= 64)
+		    break;	/* illegal, check against buffer overflow */
+
+		j = scan[i];
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		val = (SBITS (bit_buf, 12) * /* signed 12-bit level, so / 16 is used instead of >> 4 */
+		       quantizer_scale * quant_matrix[j]) / 16;
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    }
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B15_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1; /* fold the accumulated parity into the last coefficient's low bit */
+    DUMPBITS (bit_buf, bits, 4);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* Parse the run/level codes of a non-intra block (non_intra_quantizer_matrix) and store dequantized values in picture->DCTblock. */
+static void get_non_intra_block (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = -1; /* unlike the intra parsers, which start at i = 0 */
+    mismatch = 1;
+    dest = picture->DCTblock;
+
+    bit_buf = picture->bitstream_buf; /* work on local copies, written back at the end */
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) { /* the first code uses DCT_B14DC_5, later ones DCT_B14AC_5 */
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+
+    while (1) { /* only ever entered through the entry_1 / entry_2 labels */
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i]; /* scan order position -> index into the block */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1; /* + 1 for the sign bit handled below */
+	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64; /* run comes from the 6 bits after the first 6 */
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1; /* 2*level+1 for level >= 0, 2*level-1 for level < 0 */
+	    val = (val * quantizer_scale * quant_matrix[j]) / 32;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1; /* fold the accumulated parity into the last coefficient's low bit */
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* MPEG-1 intra block parser: like get_intra_block_B14 but with 8/16-bit escape levels and oddification, and no dest[63] parity fix-up. */
+static void get_mpeg1_intra_block (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = 0;
+    dest = picture->DCTblock;
+
+    bit_buf = picture->bitstream_buf; /* work on local copies, written back at the end */
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i]; /* scan order position -> index into the block */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1; /* + 1 for the sign bit handled below */
+	    val = (tab->level * quantizer_scale * quant_matrix[j]) >> 4;
+
+	    /* oddification */
+	    val = (val - 1) | 1; /* an even value becomes the odd value just below it; odd values are unchanged */
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64; /* run comes from the 6 bits after the first 6 */
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8); /* 8-bit signed level ... */
+	    if (! (val & 0x7f)) { /* ... unless its low 7 bits are zero: then 8 more bits carry the level */
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = (val * quantizer_scale * quant_matrix[j]) / 16;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+/* MPEG-1 non-intra block parser: like get_non_intra_block but with 8/16-bit escape levels and oddification, and no dest[63] parity fix-up. */
+static void get_mpeg1_non_intra_block (picture_t * picture)
+{
+    int i;
+    int j;
+    int val;
+    uint8_t * scan = picture->scan;
+    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = -1; /* unlike the intra parsers, which start at i = 0 */
+    dest = picture->DCTblock;
+
+    bit_buf = picture->bitstream_buf; /* work on local copies, written back at the end */
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) { /* the first code uses DCT_B14DC_5, later ones DCT_B14AC_5 */
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+
+    while (1) { /* only ever entered through the entry_1 / entry_2 labels */
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    j = scan[i]; /* scan order position -> index into the block */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1; /* + 1 for the sign bit handled below */
+	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[j]) >> 5;
+
+	    /* oddification */
+	    val = (val - 1) | 1; /* an even value becomes the odd value just below it; odd values are unchanged */
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64; /* run comes from the 6 bits after the first 6 */
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8); /* 8-bit signed level ... */
+	    if (! (val & 0x7f)) { /* ... unless its low 7 bits are zero: then 8 more bits carry the level */
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = 2 * (val + SBITS (val, 1)) + 1;
+	    val = (val * quantizer_scale * quant_matrix[j]) / 32;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+
+static inline void slice_intra_DCT (picture_t * picture, int cc,
+				    uint8_t * dest, int stride)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    /* Add the coded differential to the DC predictor of component cc
+     * (luma parser for cc == 0, chroma parser otherwise) and scale the
+     * result into DCTblock[0]. */
+    picture->dc_dct_pred[cc] += (cc == 0) ? get_luma_dc_dct_diff (picture)
+					  : get_chroma_dc_dct_diff (picture);
+    picture->DCTblock[0] =
+	picture->dc_dct_pred[cc] << (3 - picture->intra_dc_precision);
+
+    /* Parse the remaining coefficients; no parser runs for MPEG-1 D pictures. */
+    if (!picture->mpeg1) {
+	if (picture->intra_vlc_format)
+	    get_intra_block_B15 (picture);
+	else
+	    get_intra_block_B14 (picture);
+    } else if (picture->picture_coding_type != D_TYPE)
+	get_mpeg1_intra_block (picture);
+    mpeg2_idct_copy (picture->DCTblock, dest, stride);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline void slice_non_intra_DCT (picture_t * picture, uint8_t * dest,
+					int stride)
+{   /* parse one coded residual block, then add its inverse DCT onto dest */
+    if (!picture->mpeg1)
+	get_non_intra_block (picture);
+    else
+	get_mpeg1_non_intra_block (picture);
+    mpeg2_idct_add (picture->DCTblock, dest, stride);
+}
+
+#define MOTION(table,ref,motion_x,motion_y,size,y) /* predict one block (luma `size`, chroma size/2) at row offset y from ref[0..2]; uses the caller's picture, pos_x, pos_y, xy_half and may modify motion_x/motion_y */ \
+    pos_x = 2 * picture->offset + motion_x;	/* source position, half-pel units */ \
+    pos_y = 2 * picture->v_offset + motion_y + 2 * y;			      \
+    if (pos_x > picture->limit_x) { /* callers declare pos_x unsigned, so negative values also land here */ \
+      pos_x = ((int)pos_x < 0) ? 0 : picture->limit_x;			      \
+      motion_x = pos_x - 2 * picture->offset; /* keep the vector consistent with the clamped position */ \
+    }									      \
+    if (pos_y > picture->limit_y_ ## size){ /* limit_y_16 or limit_y_8, chosen by token pasting */ \
+      pos_y = ((int)pos_y < 0) ? 0 : picture->limit_y_ ## size;		      \
+      motion_y = pos_y - 2 * picture->v_offset - 2 * y;			      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1); /* half-pel flags select the table entry */ \
+    table[xy_half] (picture->dest[0] + y * picture->pitches[0] +	      \
+		    picture->offset, ref[0] + (pos_x >> 1) +		      \
+		    (pos_y >> 1) * picture->pitches[0], picture->pitches[0],  \
+		    size);						      \
+    motion_x /= 2;	motion_y /= 2;	/* halved (possibly clamped) vector for planes 1 and 2 */ \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (picture->dest[1] + y/2 * picture->pitches[1] +	      \
+		      (picture->offset >> 1), ref[1] +			      \
+		      (((picture->offset + motion_x) >> 1) +		      \
+		       ((((picture->v_offset + motion_y) >> 1) + y/2) *	      \
+		        picture->pitches[1])), picture->pitches[1], size/2);  \
+    table[4+xy_half] (picture->dest[2] + y/2 * picture->pitches[2] +	      \
+		      (picture->offset >> 1), ref[2] +			      \
+		      (((picture->offset + motion_x) >> 1) +		      \
+		       ((((picture->v_offset + motion_y) >> 1) + y/2) *	      \
+		        picture->pitches[2])), picture->pitches[2], size/2)   \
+
+#define MOTION_FIELD(table,ref,motion_x,motion_y,dest_field,op,src_field) /* predict alternate destination lines starting at line dest_field (8 luma / 4 chroma rows, pitch doubled) from reference line ((pos_y op) + src_field); uses the caller's picture, pos_x, pos_y, xy_half */ \
+    pos_x = 2 * picture->offset + motion_x;				      \
+    pos_y = picture->v_offset + motion_y;				      \
+    if (pos_x > picture->limit_x) { /* callers declare pos_x unsigned, so negative values also land here */ \
+      pos_x = ((int)pos_x < 0) ? 0 : picture->limit_x;			      \
+      motion_x = pos_x - 2 * picture->offset;				      \
+    }									      \
+    if (pos_y > picture->limit_y){					      \
+      pos_y = ((int)pos_y < 0) ? 0 : picture->limit_y;			      \
+      motion_y = pos_y - picture->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1); /* half-pel flags select the table entry */ \
+    table[xy_half] (picture->dest[0] + dest_field * picture->pitches[0] +     \
+		    picture->offset,					      \
+		    (ref[0] + (pos_x >> 1) +				      \
+		     ((pos_y op) + src_field) * picture->pitches[0]),	      \
+		    2 * picture->pitches[0], 8);			      \
+    motion_x /= 2;	motion_y /= 2;	/* halved (possibly clamped) vector for planes 1 and 2 */ \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (picture->dest[1] + dest_field * picture->pitches[1] +   \
+		      (picture->offset >> 1), ref[1] +			      \
+		      (((picture->offset + motion_x) >> 1) +		      \
+		       (((picture->v_offset >> 1) +			      \
+		         (motion_y op) + src_field) * picture->pitches[1])),  \
+		      2 * picture->pitches[1], 4);			      \
+    table[4+xy_half] (picture->dest[2] + dest_field * picture->pitches[2] +   \
+		      (picture->offset >> 1), ref[2] +			      \
+		      (((picture->offset + motion_x) >> 1) +		      \
+		       (((picture->v_offset >> 1) +			      \
+		         (motion_y op) + src_field) * picture->pitches[2])),  \
+		      2 * picture->pitches[2], 4)
+
+static void motion_mp1 (picture_t * picture, motion_t * motion,
+			void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* MPEG-1 prediction: decodes one vector, stores it in motion->pmv[0] and runs MOTION (size 16) on motion->ref[0] with the given table. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    unsigned int pos_x, pos_y, xy_half;	/* scratch variables the MOTION macro refers to by name */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = (motion->pmv[0][0] +
+		(get_motion_delta (picture,
+				   motion->f_code[0]) << motion->f_code[1]));	/* both components use f_code[0]; the delta is shifted left by f_code[1] */
+    motion_x = bound_motion_vector (motion_x,
+				    motion->f_code[0] + motion->f_code[1]);
+    motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[0][1] +
+		(get_motion_delta (picture,
+				   motion->f_code[0]) << motion->f_code[1]));
+    motion_y = bound_motion_vector (motion_y,
+				    motion->f_code[0] + motion->f_code[1]);
+    motion->pmv[0][1] = motion_y;
+
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fr_frame (picture_t * picture, motion_t * motion,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* Frame picture, frame prediction: decodes one vector, stores it in both pmv[0] and pmv[1], and runs MOTION (size 16) on motion->ref[0]. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    unsigned int pos_x, pos_y, xy_half;	/* scratch variables the MOTION macro refers to by name */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);	/* horizontal: predictor plus decoded delta */
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+						     motion->f_code[1]);	/* vertical: uses f_code[1] */
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fr_field (picture_t * picture, motion_t * motion,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* Frame picture, field prediction: decodes two vectors, each preceded by a one-bit field select, and predicts destination lines 0,2,.. and 1,3,.. separately with MOTION_FIELD. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y, field;
+    unsigned int pos_x, pos_y, xy_half;	/* scratch variables the MOTION_FIELD macro refers to by name */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    field = UBITS (bit_buf, 1);	/* passed as src_field: line offset (0 or 1) added in the reference */
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (picture,
+							    motion->f_code[1]);	/* the stored vertical predictor is doubled (see the store below), so halve it first */
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[0][1] = motion_y << 1;
+
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 0, & ~1, field);	/* first vector: destination lines starting at line 0 */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    field = UBITS (bit_buf, 1);
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[1][0] + get_motion_delta (picture,
+						     motion->f_code[0]);	/* second vector is predicted from pmv[1] */
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[1][1] >> 1) + get_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[1][1] = motion_y << 1;
+
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 1, & ~1, field);	/* second vector: destination lines starting at line 1 */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fr_dmv (picture_t * picture, motion_t * motion,
+			   void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* Frame picture, MC_DMV mode: decodes one vector plus a differential per axis, writes two derived-vector predictions with mpeg2_mc.put and averages the decoded-vector prediction on top with mpeg2_mc.avg. `table` is not used. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y, dmv_x, dmv_y, m, other_x, other_y;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* pos_x, pos_y, xy_half are also used by name inside MOTION_FIELD */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    dmv_x = get_dmv (picture);	/* horizontal differential */
+
+    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (picture,
+							    motion->f_code[1]);	/* the stored vertical predictor is doubled (see the store below), so halve it first */
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1;
+    dmv_y = get_dmv (picture);	/* vertical differential */
+
+    m = picture->top_field_first ? 1 : 3;	/* scale factor for the first derived vector: (v * m) / 2 with rounding */
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 0, | 1, 0);	/* destination lines from line 0, read from odd reference lines (pos_y | 1) */
+
+    m = picture->top_field_first ? 3 : 1;	/* the other scale factor for the second derived vector */
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 1, & ~1, 0);	/* destination lines from line 1, read from even reference lines (pos_y & ~1) */
+
+    pos_x = 2 * picture->offset + motion_x;	/* from here: same clamping as MOTION_FIELD, applied to the decoded vector */
+    pos_y = picture->v_offset + motion_y;
+    if(pos_x > picture->limit_x){
+      pos_x = ((int)pos_x < 0) ? 0 : picture->limit_x;
+      motion_x = pos_x - 2 * picture->offset;
+    }
+    if(pos_y > picture->limit_y){
+      pos_y = ((int)pos_y < 0) ? 0 : picture->limit_y;
+      motion_y = pos_y - picture->v_offset;
+    }
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);
+    offset = (pos_x >> 1) + (pos_y & ~1) * picture->pitches[0];
+    mpeg2_mc.avg[xy_half]	/* luma: average onto destination lines 0,2,.. */
+	(picture->dest[0] + picture->offset,
+	 motion->ref[0][0] + offset, 2 * picture->pitches[0], 8);
+    mpeg2_mc.avg[xy_half]	/* luma: same for destination lines 1,3,.., with the reference one line further down */
+	(picture->dest[0] + picture->pitches[0] + picture->offset,
+	 motion->ref[0][0] + picture->pitches[0] + offset,
+	 2 * picture->pitches[0], 8);
+    motion_x /= 2;	motion_y /= 2;	/* halved vector for planes 1 and 2 */
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);
+    offset = (((picture->offset + motion_x) >> 1) +
+	      (((picture->v_offset >> 1) + (motion_y & ~1)) *
+	       picture->pitches[1]));
+    mpeg2_mc.avg[4+xy_half]	/* plane 1, alternate lines from line 0 */
+	(picture->dest[1] + (picture->offset >> 1),
+	 motion->ref[0][1] + offset, 2 * picture->pitches[1], 4);
+    mpeg2_mc.avg[4+xy_half]	/* plane 1, alternate lines from line 1 */
+	(picture->dest[1] + picture->pitches[1] + (picture->offset >> 1),
+	 motion->ref[0][1] + picture->pitches[1] + offset,
+	 2 * picture->pitches[1], 4);
+    offset = (((picture->offset + motion_x) >> 1) +
+	      (((picture->v_offset >> 1) + (motion_y & ~1)) *
+	       picture->pitches[2]));	/* recomputed because pitches[2] is used instead of pitches[1] */
+    mpeg2_mc.avg[4+xy_half]	/* plane 2, alternate lines from line 0 */
+	(picture->dest[2] + (picture->offset >> 1),
+	 motion->ref[0][2] + offset, 2 * picture->pitches[2], 4);
+    mpeg2_mc.avg[4+xy_half]	/* plane 2, alternate lines from line 1 */
+	(picture->dest[2] + picture->pitches[2] + (picture->offset >> 1),
+	 motion->ref[0][2] + picture->pitches[2] + offset,
+	 2 * picture->pitches[2], 4);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_reuse (picture_t * picture, motion_t * motion,
+			  void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    /* Predict from motion->ref[0] with the vector already stored in pmv[0];
+     * nothing is read from the bitstream and pmv is left untouched. */
+    unsigned int pos_x, pos_y, xy_half;	/* names required by MOTION */
+    int mv_x = motion->pmv[0][0];
+    int mv_y = motion->pmv[0][1];
+
+    MOTION (table, motion->ref[0], mv_x, mv_y, 16, 0);
+}
+
+static void motion_zero (picture_t * picture, motion_t * motion,
+			 void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    int plane;
+    /* zero-vector prediction: plane 0 through table[0] with size 16 ... */
+    table[0] (picture->dest[0] + picture->offset,
+	      (motion->ref[0][0] + picture->offset +
+	       picture->v_offset * picture->pitches[0]),
+	      picture->pitches[0], 16);
+
+    /* ... planes 1 and 2 through table[4] with size 8, offsets halved */
+    for (plane = 1; plane <= 2; plane++)
+	table[4] (picture->dest[plane] + (picture->offset >> 1),
+		  motion->ref[0][plane] + (picture->offset >> 1) +
+		  (picture->v_offset >> 1) * picture->pitches[plane],
+		  picture->pitches[plane], 8);
+}
+
+/* same parsing as motion_fr_frame on f_motion, without motion compensation */
+static void motion_fr_conceal (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    motion_t * const fwd = &picture->f_motion;
+    int mv;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);	/* horizontal component */
+    mv = fwd->pmv[0][0] + get_motion_delta (picture, fwd->f_code[0]);
+    mv = bound_motion_vector (mv, fwd->f_code[0]);
+    fwd->pmv[0][0] = mv;
+    fwd->pmv[1][0] = mv;
+    NEEDBITS (bit_buf, bits, bit_ptr);	/* vertical component */
+    mv = fwd->pmv[0][1] + get_motion_delta (picture, fwd->f_code[1]);
+    mv = bound_motion_vector (mv, fwd->f_code[1]);
+    fwd->pmv[0][1] = mv;
+    fwd->pmv[1][1] = mv;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_field (picture_t * picture, motion_t * motion,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* Field picture, field prediction: one bit selects the reference field from motion->ref2, then one vector is decoded, stored in both pmv[0] and pmv[1], and MOTION (size 16) is run. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    uint8_t ** ref_field;	/* the three plane pointers of the selected reference field */
+    unsigned int pos_x, pos_y, xy_half;	/* scratch variables the MOTION macro refers to by name */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];	/* field-select bit indexes ref2[] (filled in by slice_init) */
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+
+    MOTION (table, ref_field, motion_x, motion_y, 16, 0);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_16x8 (picture_t * picture, motion_t * motion,
+			    void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* Field picture, 16x8 mode: decodes two (field select, vector) pairs; the first predicts the rows at offset 0, the second the rows at offset 8, each with MOTION size 8. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    uint8_t ** ref_field;	/* the three plane pointers of the selected reference field */
+    unsigned int pos_x, pos_y, xy_half;	/* scratch variables the MOTION macro refers to by name */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];	/* field-select bit for the first half */
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);	/* first vector is predicted from and stored to pmv[0] */
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[0][1] = motion_y;
+
+    MOTION (table, ref_field, motion_x, motion_y, 8, 0);	/* size 8 at row offset 0 (clamped against limit_y_8) */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];	/* field-select bit for the second half */
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[1][0] + get_motion_delta (picture,
+						     motion->f_code[0]);	/* second vector is predicted from and stored to pmv[1] */
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[1][1] + get_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion_y;
+
+    MOTION (table, ref_field, motion_x, motion_y, 8, 8);	/* size 8 at row offset 8 */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_dmv (picture_t * picture, motion_t * motion,
+			   void (** table) (uint8_t *, uint8_t *, int, int))
+{	/* Field picture, MC_DMV mode: decodes one vector plus a differential per axis, writes the ref[0] prediction with mpeg2_mc.put and averages the derived-vector prediction from ref[1] on top. `table` is not used. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y, other_x, other_y;
+    unsigned int pos_x, pos_y, xy_half;	/* scratch variables the MOTION macro refers to by name */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    other_x = ((motion_x + (motion_x > 0)) >> 1) + get_dmv (picture);	/* derived vector: half the decoded one (rounded) plus the differential */
+
+    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+    other_y = (((motion_y + (motion_y > 0)) >> 1) + get_dmv (picture) +
+	       picture->dmv_offset);	/* dmv_offset is set to +1 or -1 by slice_init depending on the field */
+
+    MOTION (mpeg2_mc.put, motion->ref[0], motion_x, motion_y, 16, 0);
+    MOTION (mpeg2_mc.avg, motion->ref[1], other_x, other_y, 16, 0);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_conceal (picture_t * picture)
+{   /* field-picture concealment vector: parsed into f_motion.pmv only */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    motion_t * const fwd = &picture->f_motion;
+    int mv;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    DUMPBITS (bit_buf, bits, 1); /* remove field_select */
+
+    /* horizontal component */
+    mv = fwd->pmv[0][0] + get_motion_delta (picture, fwd->f_code[0]);
+    mv = bound_motion_vector (mv, fwd->f_code[0]);
+    fwd->pmv[0][0] = mv;
+    fwd->pmv[1][0] = mv;
+    NEEDBITS (bit_buf, bits, bit_ptr);	/* vertical component */
+    mv = fwd->pmv[0][1] + get_motion_delta (picture, fwd->f_code[1]);
+    mv = bound_motion_vector (mv, fwd->f_code[1]);
+    fwd->pmv[0][1] = mv;
+    fwd->pmv[1][1] = mv;
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+#define MOTION_CALL(routine,direction) /* run `routine` once per prediction direction flagged in `direction`; expects `picture` in scope */ \
+do {								\
+    if ((direction) & MACROBLOCK_MOTION_FORWARD) /* forward prediction always overwrites (put) */ \
+	routine (picture, &(picture->f_motion), mpeg2_mc.put);	\
+    if ((direction) & MACROBLOCK_MOTION_BACKWARD) /* backward: averaged in if forward already ran, else put */ \
+	routine (picture, &(picture->b_motion),			\
+		 ((direction) & MACROBLOCK_MOTION_FORWARD ?	\
+		  mpeg2_mc.avg : mpeg2_mc.put));		\
+} while (0)
+
+#define NEXT_MACROBLOCK /* advance picture->offset by one macroblock; at the end of a row move to the next row, or RETURN from the enclosing function once the last row is passed. Needs `picture` and `cpu_state` in scope. */ \
+do {									    \
+    picture->offset += 16;						    \
+    if (picture->offset == picture->coded_picture_width) { /* row complete */ \
+	do { /* just so we can use the break statement */		    \
+	    if (picture->current_frame->proc_slice) { /* optional per-row callback, given the row's dest pointers */ \
+		picture->current_frame->proc_slice (picture->current_frame, \
+						    picture->dest);	    \
+	    }								    \
+	    picture->dest[0] += 16 * picture->pitches[0]; /* 16 lines down in plane 0, 8 in planes 1 and 2 */ \
+	    picture->dest[1] += 8 * picture->pitches[1];		    \
+	    picture->dest[2] += 8 * picture->pitches[2];		    \
+	} while (0);							    \
+	picture->v_offset += 16;					    \
+	if (picture->v_offset > picture->limit_y) { /* past the last row: leave the caller */ \
+	    if (mpeg2_cpu_state_restore)				    \
+		mpeg2_cpu_state_restore (&cpu_state);			    \
+	    return;							    \
+	}								    \
+	picture->offset = 0;						    \
+    }									    \
+} while (0)
+
+static inline int slice_init (picture_t * picture, int code)
+{	/* Prepares per-slice state (reference and destination pointers, clamping limits, predictors, quantizer scale) and parses the first macroblock address increment. Returns 0 on success, 1 if the slice cannot be decoded. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int offset, height;	/* offset is reused: bottom-field flag, then macroblock row, then address increment */
+    struct vo_frame_s * forward_reference_frame;
+    struct vo_frame_s * backward_reference_frame;
+    const MBAtab * mba;
+
+    offset = picture->picture_structure == BOTTOM_FIELD;	/* 1 for a bottom field: ref[0] pointers below start one line down */
+    picture->pitches[0] = picture->current_frame->pitches[0];
+    picture->pitches[1] = picture->current_frame->pitches[1];
+    picture->pitches[2] = picture->current_frame->pitches[2];
+
+    if( picture->forward_reference_frame ) {
+        forward_reference_frame = picture->forward_reference_frame;
+    }
+    else {
+        /* return 1; */
+        forward_reference_frame = picture->current_frame;	/* no forward reference: fall back to the frame being decoded */
+    }
+    
+    if( picture->backward_reference_frame ) {
+        backward_reference_frame = picture->backward_reference_frame;
+    }
+    else {
+        /* return 1; */
+        backward_reference_frame = picture->current_frame;	/* no backward reference: fall back to the frame being decoded */
+    }
+    
+    picture->f_motion.ref[0][0] =
+        forward_reference_frame->base[0] + (offset ? picture->pitches[0] : 0);
+    picture->f_motion.ref[0][1] =
+        forward_reference_frame->base[1] + (offset ? picture->pitches[1] : 0);
+    picture->f_motion.ref[0][2] =
+        forward_reference_frame->base[2] + (offset ? picture->pitches[2] : 0);
+    
+    picture->b_motion.ref[0][0] =
+	backward_reference_frame->base[0] + (offset ? picture->pitches[0] : 0);
+    picture->b_motion.ref[0][1] =
+	backward_reference_frame->base[1] + (offset ? picture->pitches[1] : 0);
+    picture->b_motion.ref[0][2] =
+	backward_reference_frame->base[2] + (offset ? picture->pitches[2] : 0);
+    
+    if (picture->picture_structure != FRAME_PICTURE) {	/* field picture: also set up the opposite-parity references in ref[1] */
+	uint8_t ** forward_ref;
+	int bottom_field;
+
+	bottom_field = (picture->picture_structure == BOTTOM_FIELD);
+	picture->dmv_offset = bottom_field ? 1 : -1;
+	picture->f_motion.ref2[0] = picture->f_motion.ref[bottom_field];	/* ref2[] is indexed by the field-select bit read in motion_fi_field / motion_fi_16x8 */
+	picture->f_motion.ref2[1] = picture->f_motion.ref[!bottom_field];
+	picture->b_motion.ref2[0] = picture->b_motion.ref[bottom_field];
+	picture->b_motion.ref2[1] = picture->b_motion.ref[!bottom_field];
+
+	forward_ref = forward_reference_frame->base;
+	if (picture->second_field && (picture->picture_coding_type != B_TYPE))
+	    forward_ref = picture->current_frame->base;	/* second field of a non-B picture: the opposite-parity reference comes from the current frame */
+
+	picture->f_motion.ref[1][0] = forward_ref[0] + (bottom_field ? 0 : picture->pitches[0]);
+	picture->f_motion.ref[1][1] = forward_ref[1] + (bottom_field ? 0 : picture->pitches[1]);
+	picture->f_motion.ref[1][2] = forward_ref[2] + (bottom_field ? 0 : picture->pitches[2]);
+
+	picture->b_motion.ref[1][0] =
+	    backward_reference_frame->base[0] + (bottom_field ? 0 : picture->pitches[0]);
+	picture->b_motion.ref[1][1] =
+	    backward_reference_frame->base[1] + (bottom_field ? 0 : picture->pitches[1]);
+	picture->b_motion.ref[1][2] =
+	    backward_reference_frame->base[2] + (bottom_field ? 0 : picture->pitches[2]);
+    }
+
+    picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;	/* all motion vector predictors start at zero in every slice */
+    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+    picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
+    picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+
+    picture->v_offset = (code - 1) * 16;	/* code - 1 is used as the macroblock row of this slice */
+    offset = (code - 1);
+    if (picture->picture_structure != FRAME_PICTURE)
+	offset = 2 * offset;	/* the un-doubled frame pitch is still in use here, so a field row spans twice as many frame lines */
+
+    picture->dest[0] = picture->current_frame->base[0] + picture->pitches[0] * offset * 16;
+    picture->dest[1] = picture->current_frame->base[1] + picture->pitches[1] * offset * 8;
+    picture->dest[2] = picture->current_frame->base[2] + picture->pitches[2] * offset * 8;
+
+    height = picture->coded_picture_height;
+    switch (picture->picture_structure) {
+    case BOTTOM_FIELD:
+	picture->dest[0] += picture->pitches[0];	/* a bottom field starts on the second line of each plane */
+	picture->dest[1] += picture->pitches[1];
+	picture->dest[2] += picture->pitches[2];
+	/* fall through */
+    case TOP_FIELD:
+	picture->pitches[0] <<= 1;	/* from here on a pitch step skips the other field's line */
+	picture->pitches[1] <<= 1;
+	picture->pitches[2] <<= 1;
+	height >>= 1;
+    }
+    picture->limit_x = 2 * picture->coded_picture_width - 32;	/* compared with 2 * offset + motion_x in MOTION / MOTION_FIELD */
+    picture->limit_y_16 = 2 * height - 32;	/* MOTION with size 16 */
+    picture->limit_y_8 = 2 * height - 16;	/* MOTION with size 8 */
+    picture->limit_y = height - 16;	/* compared with v_offset (+ motion_y in MOTION_FIELD) */
+
+    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+	picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision + 7);
+
+    picture->quantizer_scale = get_quantizer_scale (picture);
+
+    /* ignore intra_slice and all the extra data */
+    while (bit_buf & 0x80000000) {	/* while the top bit is set, skip 9 bits at a time */
+	DUMPBITS (bit_buf, bits, 9);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+    }
+
+    /* decode initial macroblock address increment */
+    offset = 0;
+    while (1) {
+	if (bit_buf >= 0x08000000) {
+	    mba = MBA_5 + (UBITS (bit_buf, 6) - 2);
+	    break;
+	} else if (bit_buf >= 0x01800000) {
+	    mba = MBA_11 + (UBITS (bit_buf, 12) - 24);
+	    break;
+	} else switch (UBITS (bit_buf, 12)) {
+	case 8:		/* macroblock_escape */
+	    offset += 33;	/* each escape adds 33 to the increment */
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	case 15:	/* macroblock_stuffing (MPEG1 only) */
+	    bit_buf &= 0xfffff;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	default:	/* error */
+	    return 1;
+	}
+    }
+    DUMPBITS (bit_buf, bits, mba->len + 1);	/* the loop above indexed the tables with one bit more (6/12) than mpeg2_slice uses (5/11) */
+    picture->offset = (offset + mba->mba) << 4;	/* horizontal position, 16 per macroblock */
+
+    while (picture->offset - picture->coded_picture_width >= 0) {	/* an increment past the row end wraps onto the following rows */
+	picture->offset -= picture->coded_picture_width;
+	if ((picture->current_frame->proc_slice == NULL) ||
+	    (picture->picture_coding_type != B_TYPE)) {	/* NOTE(review): dest is not advanced for B pictures when proc_slice is set - reason not visible here, confirm */
+	    picture->dest[0] += 16 * picture->pitches[0];
+	    picture->dest[1] += 8 * picture->pitches[1];
+	    picture->dest[2] += 8 * picture->pitches[2];
+	}
+	picture->v_offset += 16;
+    }
+    if (picture->v_offset > picture->limit_y)
+	return 1;	/* the slice starts below the last macroblock row */
+
+    return 0;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+void mpeg2_slice (picture_t * picture, int code, uint8_t * buffer)
+{	/* Decodes one slice into picture->current_frame: `buffer` is handed to bitstream_init and `code` to slice_init. Returns at the end of the slice, on a bitstream error, or once the last macroblock row has been passed. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    cpu_state_t cpu_state;	/* filled by mpeg2_cpu_state_save and restored on every later exit path */
+
+    bitstream_init (picture, buffer);
+
+    if (slice_init (picture, code))
+	return;	/* slice_init returned nonzero: nothing to decode */
+
+    if (mpeg2_cpu_state_save)	/* optional hook; may be NULL */
+	mpeg2_cpu_state_save (&cpu_state);
+
+    while (1) {	/* one iteration per coded macroblock, plus any skipped macroblocks that follow it */
+	int macroblock_modes;
+	int mba_inc;	/* number of skipped macroblocks before the next coded one */
+	const MBAtab * mba;
+
+	NEEDBITS (bit_buf, bits, bit_ptr);
+
+	macroblock_modes = get_macroblock_modes (picture);
+
+	/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
+	if (macroblock_modes & MACROBLOCK_QUANT)
+	    picture->quantizer_scale = get_quantizer_scale (picture);
+
+	if (macroblock_modes & MACROBLOCK_INTRA) {
+
+	    int DCT_offset, DCT_stride;
+	    int offset;
+	    uint8_t * dest_y;
+
+	    if (picture->concealment_motion_vectors) {	/* vectors are parsed into f_motion.pmv; no prediction is performed */
+		if (picture->picture_structure == FRAME_PICTURE)
+		    motion_fr_conceal (picture);
+		else
+		    motion_fi_conceal (picture);
+	    } else {	/* intra macroblock without concealment vectors: reset all motion predictors */
+		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+		picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
+		picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+	    }
+
+	    if (macroblock_modes & DCT_TYPE_INTERLACED) {	/* lower luma blocks start one line down; every block steps two lines at a time */
+		DCT_offset = picture->pitches[0];
+		DCT_stride = picture->pitches[0] * 2;
+	    } else {	/* lower luma blocks start 8 lines down */
+		DCT_offset = picture->pitches[0] * 8;
+		DCT_stride = picture->pitches[0];
+	    }
+
+	    offset = picture->offset;
+	    dest_y = picture->dest[0] + offset;
+	    slice_intra_DCT (picture, 0, dest_y, DCT_stride);	/* four plane-0 blocks, then one block each for planes 1 and 2 */
+	    slice_intra_DCT (picture, 0, dest_y + 8, DCT_stride);
+	    slice_intra_DCT (picture, 0, dest_y + DCT_offset, DCT_stride);
+	    slice_intra_DCT (picture, 0, dest_y + DCT_offset + 8, DCT_stride);
+	    slice_intra_DCT (picture, 1, picture->dest[1] + (offset >> 1),
+			     picture->pitches[1]);
+	    slice_intra_DCT (picture, 2, picture->dest[2] + (offset >> 1),
+			     picture->pitches[2]);
+
+	    if (picture->picture_coding_type == D_TYPE) {	/* D pictures: one extra bit is skipped after the macroblock (presumably end_of_macroblock - confirm) */
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	} else {
+
+	    if (picture->picture_structure == FRAME_PICTURE)
+		switch (macroblock_modes & MOTION_TYPE_MASK) {
+		case MC_FRAME:
+		    if (picture->mpeg1)
+			MOTION_CALL (motion_mp1, macroblock_modes);
+		    else
+			MOTION_CALL (motion_fr_frame, macroblock_modes);
+		    break;
+
+		case MC_FIELD:
+		    MOTION_CALL (motion_fr_field, macroblock_modes);
+		    break;
+
+		case MC_DMV:
+		    MOTION_CALL (motion_fr_dmv, MACROBLOCK_MOTION_FORWARD);	/* always run as forward-only */
+		    break;
+
+		case 0:
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    break;
+		}
+	    else	/* field picture: the field-picture motion routines are used instead */
+		switch (macroblock_modes & MOTION_TYPE_MASK) {
+		case MC_FIELD:
+		    MOTION_CALL (motion_fi_field, macroblock_modes);
+		    break;
+
+		case MC_16X8:
+		    MOTION_CALL (motion_fi_16x8, macroblock_modes);
+		    break;
+
+		case MC_DMV:
+		    MOTION_CALL (motion_fi_dmv, MACROBLOCK_MOTION_FORWARD);	/* always run as forward-only */
+		    break;
+
+		case 0:
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    break;
+		}
+
+	    if (macroblock_modes & MACROBLOCK_PATTERN) {	/* residual blocks are coded: add them onto the prediction */
+		int coded_block_pattern;
+		int DCT_offset, DCT_stride;
+		int offset;
+		uint8_t * dest_y;
+
+		if (macroblock_modes & DCT_TYPE_INTERLACED) {	/* same block layout selection as in the intra branch */
+		    DCT_offset = picture->pitches[0];
+		    DCT_stride = picture->pitches[0] * 2;
+		} else {
+		    DCT_offset = picture->pitches[0] * 8;
+		    DCT_stride = picture->pitches[0];
+		}
+
+		coded_block_pattern = get_coded_block_pattern (picture);
+
+		offset = picture->offset;
+		dest_y = picture->dest[0] + offset;
+		if (coded_block_pattern & 0x20)	/* bits 0x20..0x04: the four plane-0 blocks; 0x2 and 0x1: planes 1 and 2 */
+		    slice_non_intra_DCT (picture, dest_y, DCT_stride);
+		if (coded_block_pattern & 0x10)
+		    slice_non_intra_DCT (picture, dest_y + 8, DCT_stride);
+		if (coded_block_pattern & 0x08)
+		    slice_non_intra_DCT (picture, dest_y + DCT_offset,
+					 DCT_stride);
+		if (coded_block_pattern & 0x04)
+		    slice_non_intra_DCT (picture, dest_y + DCT_offset + 8,
+					 DCT_stride);
+		if (coded_block_pattern & 0x2)
+		    slice_non_intra_DCT (picture,
+					 picture->dest[1] + (offset >> 1),
+					 picture->pitches[1]);
+		if (coded_block_pattern & 0x1)
+		    slice_non_intra_DCT (picture,
+					 picture->dest[2] + (offset >> 1),
+					 picture->pitches[2]);
+	    }
+
+	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+		picture->dc_dct_pred[2] = 128 << picture->intra_dc_precision;	/* a non-intra macroblock resets the DC predictors */
+	}
+
+	NEXT_MACROBLOCK;	/* may return from this function after the last row */
+
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	mba_inc = 0;
+	while (1) {	/* decode the macroblock address increment of the next coded macroblock */
+	    if (bit_buf >= 0x10000000) {
+		mba = MBA_5 + (UBITS (bit_buf, 5) - 2);
+		break;
+	    } else if (bit_buf >= 0x03000000) {
+		mba = MBA_11 + (UBITS (bit_buf, 11) - 24);
+		break;
+	    } else switch (UBITS (bit_buf, 11)) {
+	    case 8:		/* macroblock_escape */
+		mba_inc += 33;
+		/* fall through */
+	    case 15:	/* macroblock_stuffing (MPEG1 only) */
+		DUMPBITS (bit_buf, bits, 11);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		continue;
+	    default:	/* end of slice, or error */
+		if (mpeg2_cpu_state_restore)
+		    mpeg2_cpu_state_restore (&cpu_state);
+		return;
+	    }
+	}
+	DUMPBITS (bit_buf, bits, mba->len);
+	mba_inc += mba->mba;
+
+	if (mba_inc) {	/* mba_inc macroblocks were skipped before the next coded one */
+	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+		picture->dc_dct_pred[2] = 128 << picture->intra_dc_precision;
+
+	    if (picture->picture_coding_type == P_TYPE) {	/* P picture: predictors are reset and each skipped macroblock is filled by motion_zero */
+		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+
+		do {
+		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    } else {	/* otherwise: repeat the prediction with the previous macroblock's modes and stored vectors */
+		do {
+		    MOTION_CALL (motion_reuse, macroblock_modes);
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    }
+	}
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
diff --git a/src/video_dec/libmpeg2/slice_xvmc.c b/src/video_dec/libmpeg2/slice_xvmc.c
new file mode 100644
index 000000000..014ae7924
--- /dev/null
+++ b/src/video_dec/libmpeg2/slice_xvmc.c
@@ -0,0 +1,1988 @@
+/*
+ * slice_xvmc.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>	/* memcpy/memset, try to remove */
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include "mpeg2_internal.h"
+#include <xine/xineutils.h>
+
+#include <xine/attributes.h>
+#include "accel_xvmc.h"
+#include "xvmc.h"
+
+
+#define MOTION_ACCEL   XINE_VO_MOTION_ACCEL	/* short alias for the xine video-out flag */
+#define IDCT_ACCEL     XINE_VO_IDCT_ACCEL	/* short alias for the xine video-out flag */
+#define SIGNED_INTRA   XINE_VO_SIGNED_INTRA	/* short alias for the xine video-out flag */
+#define ACCEL          (MOTION_ACCEL | IDCT_ACCEL)	/* mask covering both acceleration bits */
+
+#include "vlc.h"
+/* original (non-patched) scan tables */
+
+static const uint8_t mpeg2_scan_norm_orig[64] ATTR_ALIGN(16) =
+{
+    /* Zig-zag scan: entry i is the raster index (row * 8 + column) of the i-th coefficient */
+     0, 1, 8,16, 9, 2, 3,10,
+    17,24,32,25,18,11, 4, 5,
+    12,19,26,33,40,48,41,34,
+    27,20,13, 6, 7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+static const uint8_t mpeg2_scan_alt_orig[64] ATTR_ALIGN(16) =
+{
+    /* Alternate scan: entry i is the raster index (row * 8 + column) of the i-th coefficient */
+    0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49,
+    41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43,
+    51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45,
+    53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63
+};
+
+static uint8_t mpeg2_scan_alt_ptable[64] ATTR_ALIGN(16);	/* filled by xvmc_setup_scan_ptable() */
+static uint8_t mpeg2_scan_norm_ptable[64] ATTR_ALIGN(16);	/* filled by xvmc_setup_scan_ptable() */
+static uint8_t mpeg2_scan_orig_ptable[64] ATTR_ALIGN(16);	/* identity map once set up */
+
+void xvmc_setup_scan_ptable( void )
+{   /* Build the tables that map an unpatched scan position to the current mpeg2_scan_* entry. */
+    int pos = 64;
+    while (pos-- > 0) {		/* every entry is written independently, so order is irrelevant */
+	mpeg2_scan_orig_ptable[pos] = pos;	/* identity map */
+	mpeg2_scan_alt_ptable[mpeg2_scan_alt_orig[pos]] = mpeg2_scan_alt[pos];
+	mpeg2_scan_norm_ptable[mpeg2_scan_norm_orig[pos]] = mpeg2_scan_norm[pos];
+    }
+}
+    
+
+static const int non_linear_quantizer_scale [] = {	/* indexed by the 5-bit quantiser scale code */
+    0,  1,  2,  3,  4,  5,   6,   7,
+    8, 10, 12, 14, 16, 18,  20,  22,
+    24, 28, 32, 36, 40, 44,  48,  52,
+    56, 64, 72, 80, 88, 96, 104, 112
+};
+
+static inline int get_xvmc_macroblock_modes (picture_t * picture)
+{   /* Decode the macroblock-modes code for the current picture type; returns the mode flag word. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int macroblock_modes;
+    const MBtab * tab;
+
+    switch (picture->picture_coding_type) {
+    case I_TYPE:
+	/* 1-bit table lookup; tab->len is the number of bits actually consumed */
+	tab = MB_I + UBITS (bit_buf, 1);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+	/* frame pictures without frame_pred_frame_dct carry one extra DCT-type bit */
+	if ((! (picture->frame_pred_frame_dct)) &&
+	    (picture->picture_structure == FRAME_PICTURE)) {
+	    macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+	    DUMPBITS (bit_buf, bits, 1);
+	}
+
+	return macroblock_modes;
+
+    case P_TYPE:
+	/* 5-bit table lookup */
+	tab = MB_P + UBITS (bit_buf, 5);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (picture->picture_structure != FRAME_PICTURE) {	/* not a frame picture */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (picture->frame_pred_frame_dct) {	/* no motion-type bits are read: MC_FRAME */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
+		macroblock_modes |= MC_FRAME;
+	    return macroblock_modes;
+	} else {	/* 2-bit motion type, then 1-bit DCT type, each only when applicable */
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case B_TYPE:
+	/* 6-bit table lookup */
+	tab = MB_B + UBITS (bit_buf, 6);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (picture->picture_structure != FRAME_PICTURE) {	/* not a frame picture */
+	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
+		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (picture->frame_pred_frame_dct) {
+	    /* MC_FRAME is set unconditionally here, i.e. for intra macroblocks too */
+	    macroblock_modes |= MC_FRAME;
+	    return macroblock_modes;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_INTRA)
+		goto intra;	/* intra: skip the motion type, read only the DCT-type bit */
+	    macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
+	    DUMPBITS (bit_buf, bits, 2);
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+	    intra:
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case D_TYPE:
+	/* drop one bit; the macroblock is always reported as intra */
+	DUMPBITS (bit_buf, bits, 1);
+	return MACROBLOCK_INTRA;
+
+    default:
+	return 0;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_xvmc_quantizer_scale (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    /*
+     * Read the 5-bit quantiser scale code and turn it into a scale:
+     * a lookup in non_linear_quantizer_scale[] when q_scale_type is set,
+     * otherwise twice the code.
+     */
+    const int code = UBITS (bit_buf, 5);
+
+    DUMPBITS (bit_buf, bits, 5);
+    return picture->q_scale_type ? non_linear_quantizer_scale [code]
+				 : code << 1;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_xvmc_motion_delta (picture_t * picture, int f_code)
+{   /* Decode one signed motion-vector delta; f_code is the number of residual bits read. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    int delta;
+    int sign;
+    const MVtab * tab;
+
+    if (bit_buf & 0x80000000) {	/* leading bit set: delta is 0, one bit consumed */
+	DUMPBITS (bit_buf, bits, 1);
+	return 0;
+    } else if (bit_buf >= 0x0c000000) {
+	/* short code: 4-bit lookup; code, sign and residual are taken without a refill */
+	tab = MV_4 + UBITS (bit_buf, 4);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + f_code + 1;	/* account for code + residual + sign in one go */
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code)
+	    delta += UBITS (bit_buf, f_code);
+	bit_buf <<= f_code;
+
+	return (delta ^ sign) - sign;	/* negates delta when sign is -1 (presumably SBITS yields 0 or -1) */
+
+    } else {
+	/* long code: 10-bit lookup; the buffer is refilled before the residual is read */
+	tab = MV_10 + UBITS (bit_buf, 10);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code) {
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    delta += UBITS (bit_buf, f_code);
+	    DUMPBITS (bit_buf, bits, f_code);
+	}
+
+	return (delta ^ sign) - sign;	/* same conditional negation as in the short-code branch */
+
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int bound_motion_vector (int vec, int f_code)
+{   /* Keep vec if it lies in [-limit, limit) with limit = 16 << f_code, else shift it by 2 * limit. */
+#if 1
+    unsigned int limit;
+    int sign;
+
+    limit = 16 << f_code;
+    /* single unsigned compare tests -limit <= vec < limit */
+    if ((unsigned int)(vec + limit) < 2 * limit)
+	return vec;
+    else {
+	sign = ((int32_t)vec) >> 31;	/* -1 for negative vec, 0 otherwise (arithmetic shift assumed) */
+	return vec - ((2 * limit) ^ sign) + sign;	/* vec - 2*limit if positive, vec + 2*limit if negative */
+    }
+#else	/* disabled alternative: sign-extend from the low (5 + f_code) bits */
+    return ((int32_t)vec << (27 - f_code)) >> (27 - f_code);
+#endif
+}
+
+static inline int get_xvmc_dmv (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    /* Decode one dmv value: the top two buffered bits index DMV_2, which */
+    /* supplies both the code length to drop and the value to return. */
+    const DMVtab * const entry = &DMV_2[UBITS (bit_buf, 2)];
+
+    DUMPBITS (bit_buf, bits, entry->len);
+    return entry->dmv;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_xvmc_coded_block_pattern (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+
+    /*
+     * Decode the coded-block-pattern code and return the table's cbp value.
+     * Buffers of at least 0x20000000 are looked up with 7 bits in CBP_7
+     * (offset by 16, the smallest such 7-bit prefix); everything else is
+     * looked up with 9 bits in CBP_9.
+     */
+    const CBPtab * entry;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    if (bit_buf < 0x20000000)
+	entry = CBP_9 + UBITS (bit_buf, 9);
+    else
+	entry = CBP_7 + (UBITS (bit_buf, 7) - 16);
+
+    DUMPBITS (bit_buf, bits, entry->len);
+    return entry->cbp;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_xvmc_luma_dc_dct_diff (picture_t * picture)
+{   /* Decode the DC differential of a luma block and return it as a signed value. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+
+    if (bit_buf < 0xf8000000) {	/* top five bits are not all ones: 5-bit lookup */
+	tab = DC_lum_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* code and differential accounted for together */
+	    bit_buf <<= tab->len;
+	    dc_diff =	/* presumably: raw value, minus (2^size - 1) when its leading bit is 0 — see UBITS/SBITS */
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff;
+	} else {	/* size 0: drop the 3-bit code, differential is 0 */
+	    DUMPBITS (bit_buf, bits, 3);
+	    return 0;
+	}
+    } else {	/* long code: 9-bit lookup in DC_long, refill before reading the differential */
+	tab = DC_long + (UBITS (bit_buf, 9) - 0x1e0);
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_xvmc_chroma_dc_dct_diff (picture_t * picture)
+{   /* Decode the DC differential of a chroma block and return it as a signed value. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+
+    if (bit_buf < 0xf8000000) {	/* top five bits are not all ones: 5-bit lookup */
+	tab = DC_chrom_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* code and differential accounted for together */
+	    bit_buf <<= tab->len;
+	    dc_diff =	/* presumably: raw value, minus (2^size - 1) when its leading bit is 0 — see UBITS/SBITS */
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff;
+	} else {	/* size 0: drop the 2-bit code, differential is 0 */
+	    DUMPBITS (bit_buf, bits, 2);
+	    return 0;
+	}
+    } else {	/* long code: 10-bit lookup in DC_long; one bit more than tab->len is dropped */
+	tab = DC_long + (UBITS (bit_buf, 10) - 0x3e0);
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len + 1);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+#define SATURATE(val)				\
+do {						\
+    if ((uint32_t)((val) + 2048) > 4095)	\
+	(val) = ((val) > 0) ? 2047 : -2048;	\
+} while (0)	/* clamp val to [-2048, 2047]; val must be an lvalue and is evaluated more than once */
+
+static void get_xvmc_intra_block_B14 (picture_t * picture)
+{   /* Decode the AC coefficients of one intra block with the B14 tables into picture->mc->blockptr. */
+    int i;
+    int j;
+    int l;
+    int val;
+    const uint8_t * scan = picture->scan;
+    uint8_t * scan_ptable = mpeg2_scan_orig_ptable;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    dest = picture->mc->blockptr;
+    /* With IDCT acceleration, store through the unpatched scan and map back via the ptable */
+    if( picture->mc->xvmc_accel & IDCT_ACCEL ) {
+	if ( scan == mpeg2_scan_norm ) {
+	    scan =  mpeg2_scan_norm_orig; 
+	    scan_ptable = mpeg2_scan_norm_ptable;
+	} else {
+	    scan = mpeg2_scan_alt_orig;
+	    scan_ptable = mpeg2_scan_alt_ptable;
+	}
+    }
+    /* i: scan index, starting at the DC slot; mismatch: parity accumulator seeded from dest[0] */
+    i = 0;
+    mismatch = ~dest[0];
+
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    l = scan_ptable[j = scan[i]];
+	    /* j: where the coefficient is stored; l: index used for the quantiser matrix */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = (tab->level * quantizer_scale * quant_matrix[l]) >> 4;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* bits 7..12 give the run; 64 is subtracted (presumably undoing the table's out-of-range run — confirm in vlc.h) */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    l = scan_ptable[j = scan[i]];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = (SBITS (bit_buf, 12) *
+		   quantizer_scale * quant_matrix[l]) / 16;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    /* mismatch control: flip the LSB of dest[63] according to the accumulated parity */
+    dest[63] ^= mismatch & 1;
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+
+static void get_xvmc_intra_block_B15 (picture_t * picture)
+{   /* Decode the AC coefficients of one intra block with the B15 tables into picture->mc->blockptr. */
+    int i;
+    int j;
+    int l;
+    int val;
+    const uint8_t * scan = picture->scan;
+    uint8_t * scan_ptable = mpeg2_scan_orig_ptable;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    dest = picture->mc->blockptr;
+    /* With IDCT acceleration, store through the unpatched scan and map back via the ptable */
+    if( picture->mc->xvmc_accel & IDCT_ACCEL ) {
+	if ( scan == mpeg2_scan_norm ) {
+	    scan =  mpeg2_scan_norm_orig; 
+	    scan_ptable = mpeg2_scan_norm_ptable;
+	} else {
+	    scan = mpeg2_scan_alt_orig;
+	    scan_ptable = mpeg2_scan_alt_ptable;
+	}
+    }
+    /* i: scan index, starting at the DC slot; mismatch: parity accumulator seeded from dest[0] */
+    i = 0;
+    mismatch = ~dest[0];
+
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B15_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64) {
+
+	    normal_code:
+		l = scan_ptable[j = scan[i]];	/* j: store position; l: quantiser matrix index */
+		bit_buf <<= tab->len;
+		bits += tab->len + 1;
+		val = (tab->level * quantizer_scale * quant_matrix[l]) >> 4;
+
+		/* if (bitstream_get (1)) val = -val; */
+		val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		bit_buf <<= 1;
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    } else {
+
+		/* end of block. I commented out this code because if we */
+		/* don't exit here we will still exit at the later test :) */
+
+		/* if (i >= 128) break;	*/	/* end of block */
+
+		/* escape code */
+		/* bits 7..12 give the run; 64 is subtracted (presumably undoing the table's out-of-range run — confirm in vlc.h) */
+                i += UBITS (bit_buf << 6, 6) - 64;
+		if (i >= 64)
+		    break;	/* illegal, check against buffer overflow */
+
+		l = scan_ptable[j = scan[i]];
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		val = (SBITS (bit_buf, 12) *
+		       quantizer_scale * quant_matrix[l]) / 16;
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    }
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B15_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    /* mismatch control: flip the LSB of dest[63] according to the accumulated parity */
+    dest[63] ^= mismatch & 1;
+    DUMPBITS (bit_buf, bits, 4);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+
+static void get_xvmc_non_intra_block (picture_t * picture)
+{   /* Decode all coefficients of one non-intra block into picture->mc->blockptr. */
+    int i;
+    int j;
+    int l;
+    int val;
+    const uint8_t * scan = picture->scan;
+    uint8_t * scan_ptable = mpeg2_scan_orig_ptable;
+    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = -1;	/* one before the first coefficient: there is no separately coded DC term here */
+    mismatch = 1;
+
+    dest = picture->mc->blockptr;
+    /* With IDCT acceleration, store through the unpatched scan and map back via the ptable */
+    if( picture->mc->xvmc_accel & IDCT_ACCEL ) {
+	if ( scan == mpeg2_scan_norm ) {
+	    scan =  mpeg2_scan_norm_orig; 
+	    scan_ptable = mpeg2_scan_norm_ptable;
+	} else {
+	    scan = mpeg2_scan_alt_orig;
+	    scan_ptable = mpeg2_scan_alt_ptable;
+	}
+    }
+    /* work on local copies of the bitstream state; they are written back at the end */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) {	/* the first coefficient uses DCT_B14DC_5 instead of DCT_B14AC_5 */
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    l = scan_ptable[j = scan[i]];	/* j: store position; l: quantiser matrix index */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[l]) >> 5;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* bits 7..12 give the run; 64 is subtracted (presumably undoing the table's out-of-range run — confirm in vlc.h) */
+            i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    l = scan_ptable[j = scan[i]];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1;
+	    val = (val * quantizer_scale * quant_matrix[l]) / 32;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 1;	/* mismatch control: flip the LSB of dest[63] by the accumulated parity */
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+
+static void get_xvmc_mpeg1_intra_block (picture_t * picture)
+{   /* Decode the AC coefficients of one MPEG-1 intra block into picture->mc->blockptr. */
+    int i;
+    int j;
+    int l;
+    int val;
+    const uint8_t * scan = picture->scan;
+    uint8_t * scan_ptable = mpeg2_scan_orig_ptable;
+    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = 0;	/* scan index, starting at the DC slot */
+
+    dest = picture->mc->blockptr;
+    /* With IDCT acceleration, store through the unpatched scan and map back via the ptable */
+    if( picture->mc->xvmc_accel & IDCT_ACCEL ) {
+	if ( scan == mpeg2_scan_norm ) {
+	    scan =  mpeg2_scan_norm_orig; 
+	    scan_ptable = mpeg2_scan_norm_ptable;
+	} else {
+	    scan = mpeg2_scan_alt_orig;
+	    scan_ptable = mpeg2_scan_alt_ptable;
+	}
+    }
+    /* work on local copies of the bitstream state; they are written back at the end */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    l = scan_ptable[j = scan[i]];	/* j: store position; l: quantiser matrix index */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = (tab->level * quantizer_scale * quant_matrix[l]) >> 4;
+
+	    /* oddification */
+	    val = (val - 1) | 1;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* bits 7..12 give the run; 64 is subtracted (presumably undoing the table's out-of-range run — confirm in vlc.h) */
+            i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    l = scan_ptable[j = scan[i]];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {	/* low 7 bits zero: the level continues in the next 8 bits */
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = (val * quantizer_scale * quant_matrix[l]) / 16;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+
+static void get_xvmc_mpeg1_non_intra_block (picture_t * picture)
+{   /* Decode all coefficients of one MPEG-1 non-intra block into picture->mc->blockptr. */
+    int i;
+    int j;
+    int l;
+    int val;
+    const uint8_t * scan = picture->scan;
+    uint8_t * scan_ptable = mpeg2_scan_orig_ptable;
+    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
+    int quantizer_scale = picture->quantizer_scale;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    uint8_t * bit_ptr;
+    int16_t * dest;
+
+    i = -1;	/* one before the first coefficient: there is no separately coded DC term here */
+
+    dest = picture->mc->blockptr;
+    /* With IDCT acceleration, store through the unpatched scan and map back via the ptable */
+    if( picture->mc->xvmc_accel & IDCT_ACCEL ) {
+	if ( scan == mpeg2_scan_norm ) {
+	    scan =  mpeg2_scan_norm_orig; 
+	    scan_ptable = mpeg2_scan_norm_ptable;
+	} else {
+	    scan = mpeg2_scan_alt_orig;
+	    scan_ptable = mpeg2_scan_alt_ptable;
+	}
+    }
+    /* work on local copies of the bitstream state; they are written back at the end */
+    bit_buf = picture->bitstream_buf;
+    bits = picture->bitstream_bits;
+    bit_ptr = picture->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) {	/* the first coefficient uses DCT_B14DC_5 instead of DCT_B14AC_5 */
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:
+	    l = scan_ptable[j = scan[i]];	/* j: store position; l: quantiser matrix index */
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = ((2*tab->level+1) * quantizer_scale * quant_matrix[l]) >> 5;
+
+	    /* oddification */
+	    val = (val - 1) | 1;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* bits 7..12 give the run; 64 is subtracted (presumably undoing the table's out-of-range run — confirm in vlc.h) */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    l = scan_ptable[j = scan[i]];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {	/* low 7 bits zero: the level continues in the next 8 bits */
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = 2 * (val + SBITS (val, 1)) + 1;
+	    val = (val * quantizer_scale * quant_matrix[l]) / 32;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
+    picture->bitstream_buf = bit_buf;
+    picture->bitstream_bits = bits;
+    picture->bitstream_ptr = bit_ptr;
+}
+
+static inline void slice_xvmc_intra_DCT (picture_t * picture, int cc,
+				    uint8_t * dest, int stride)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)  
+#define bit_ptr (picture->bitstream_ptr)
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    /* Get the intra DC coefficient and inverse quantize it */
+
+    /* cc selects the DC predictor: 0 uses the luma decoder, any other value the chroma one */
+    if (cc == 0)
+	picture->dc_dct_pred[0] += get_xvmc_luma_dc_dct_diff (picture);
+    else
+	picture->dc_dct_pred[cc] += get_xvmc_chroma_dc_dct_diff (picture);
+    //TODO conversion to signed format 
+    /* Clear the current block, then store the DC term shifted left by */
+    /* (3 - intra_dc_precision); dest and stride are not used in this function. */
+
+    mpeg2_zero_block(picture->mc->blockptr);
+
+    picture->mc->blockptr[0] = picture->dc_dct_pred[cc] << (3 - picture->intra_dc_precision);
+
+    if (picture->mpeg1) {
+	if (picture->picture_coding_type != D_TYPE)	/* D pictures: no AC coefficients are decoded */
+	    get_xvmc_mpeg1_intra_block (picture);
+    } else if (picture->intra_vlc_format)
+	get_xvmc_intra_block_B15 (picture);
+    else
+	get_xvmc_intra_block_B14 (picture);
+
+    if((picture->mc->xvmc_accel & ACCEL) == MOTION_ACCEL) {
+        //motion_comp only no idct acceleration so do it in software
+        mpeg2_idct (picture->mc->blockptr);
+    }
+    picture->mc->blockptr += 64;	/* advance to the next 64-coefficient block slot */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline void slice_xvmc_non_intra_DCT (picture_t * picture, uint8_t * dest,
+					int stride)
+{
+    /* Decode one non-intra block into the current 64-coefficient slot of the */
+    /* XvMC block buffer, then advance; dest and stride are not used here. */
+    int16_t * const block = picture->mc->blockptr;
+    const int sw_idct = (picture->mc->xvmc_accel & ACCEL) == MOTION_ACCEL;
+    mpeg2_zero_block (block);
+    if (! picture->mpeg1)
+	get_xvmc_non_intra_block (picture);
+    else
+	get_xvmc_mpeg1_non_intra_block (picture);
+    if (sw_idct)	/* motion compensation only: the IDCT is done in software */
+	mpeg2_idct (block);
+    picture->mc->blockptr += 64;
+}
+
+static void motion_mp1 (picture_t * picture, motion_t * motion,
+			void (** table) (uint8_t *, uint8_t *, int, int))
+{   /* Parse one vector and update motion->pmv[0]; only the predictors change, table is unused. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    /* both components decode with f_code[0]; the delta is then scaled by 1 << f_code[1] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = (motion->pmv[0][0] +
+		(get_xvmc_motion_delta (picture,
+				   motion->f_code[0]) << motion->f_code[1]));
+    motion_x = bound_motion_vector (motion_x,
+				    motion->f_code[0] + motion->f_code[1]);
+    motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[0][1] +
+		(get_xvmc_motion_delta (picture,
+				   motion->f_code[0]) << motion->f_code[1]));
+    motion_y = bound_motion_vector (motion_y,
+				    motion->f_code[0] + motion->f_code[1]);
+    motion->pmv[0][1] = motion_y;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fr_frame (picture_t * picture, motion_t * motion,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{   /* Parse one vector and store it in both predictors, pmv[0] and pmv[1]; table is unused. */
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    /* horizontal component: decoded and bounded with f_code[0] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+    /* vertical component: decoded and bounded with f_code[1] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_xvmc_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fr_field (picture_t * picture, motion_t * motion,
+			     void (** table) (uint8_t *, uint8_t *, int, int),
+			     int dir)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y, field;
+    /* Parse two vectors, each preceded by a 1-bit field select stored in XvMC_mv_field_sel[n][dir]. */
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    field = UBITS (bit_buf, 1);
+    picture->XvMC_mv_field_sel[0][dir] = field;
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion_x;
+    /* vertical: the predictor is halved before the delta is added and the sum is stored doubled */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[0][1] >> 1) + get_xvmc_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[0][1] = motion_y << 1;
+    /* second vector: same sequence, using pmv[1] and XvMC_mv_field_sel[1][dir] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    field = UBITS (bit_buf, 1);
+    //TODO look at field select need bob  (weave ok)
+    picture->XvMC_mv_field_sel[1][dir] = field;
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[1][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = (motion->pmv[1][1] >> 1) + get_xvmc_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[1][1] = motion_y << 1;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fr_dmv (picture_t * picture, motion_t * motion,
+			   void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    /* Parses one motion vector and stores it in both pmv[0] and pmv[1]; `table` is unused. */
+    // TODO: confirm field select handling; both forward entries are forced to 0 here
+    picture->XvMC_mv_field_sel[0][0] = picture->XvMC_mv_field_sel[1][0] = 0;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    /* NOTE(review): no dmvector bits are consumed in this function - confirm this is intended */
+    motion_y = (motion->pmv[0][1] >> 1) + get_xvmc_motion_delta (picture,
+							    motion->f_code[1]);
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1; /* predictor was halved above, so store it doubled */
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_reuse (picture_t * picture, motion_t * motion,
+			  void (** table) (uint8_t *, uint8_t *, int, int))
+{
+    int motion_x, motion_y;
+    /* Reads pmv[0] into locals that are never used; nothing is parsed or written here. */
+    motion_x = motion->pmv[0][0];
+    motion_y = motion->pmv[0][1];
+    /* `picture` and `table` are not referenced; the signature matches what MOTION_CALL passes. */
+}
+
+/* like motion_frame, but parsing only (no motion compensation): updates f_motion.pmv, then drops the marker bit */
+static void motion_fr_conceal (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int tmp;
+    /* horizontal component: predictor + delta, bounded, stored in both pmv entries */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    tmp = (picture->f_motion.pmv[0][0] +
+	   get_xvmc_motion_delta (picture, picture->f_motion.f_code[0]));
+    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[0]);
+    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[0][0] = tmp;
+    /* vertical component, same treatment */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    tmp = (picture->f_motion.pmv[0][1] +
+	   get_xvmc_motion_delta (picture, picture->f_motion.f_code[1]));
+    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[1]);
+    picture->f_motion.pmv[1][1] = picture->f_motion.pmv[0][1] = tmp;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_field (picture_t * picture, motion_t * motion,
+			     void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    uint8_t ** ref_field;
+    /* Parses a field select bit and one motion vector (stored in both pmv entries); `table` is unused. */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)]; /* assigned but not used afterwards */
+
+    // TODO: field select may need handling here for bob (weave ok); forced to 0 for now
+    picture->XvMC_mv_field_sel[0][0] = picture->XvMC_mv_field_sel[1][0] = 0;
+
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_xvmc_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_16x8 (picture_t * picture, motion_t * motion,
+			    void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    uint8_t ** ref_field;
+    /* Parses two (field select, x, y) groups: the first updates pmv[0], the second pmv[1]; `table` is unused. */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)]; /* assigned but not used afterwards */
+
+    // TODO: field select may need handling here for bob (weave ok); forced to 0 for now
+    picture->XvMC_mv_field_sel[0][0] = picture->XvMC_mv_field_sel[1][0] = 0;
+
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[0][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[0][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[0][1] + get_xvmc_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[0][1] = motion_y;
+
+    /* second group: same steps, using and updating pmv[1] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)]; /* assigned but not used afterwards */
+
+    // TODO: field select may need handling here for bob (weave ok); forced to 0 for now
+    picture->XvMC_mv_field_sel[0][0] = picture->XvMC_mv_field_sel[1][0] = 0;
+
+    DUMPBITS (bit_buf, bits, 1);
+
+    motion_x = motion->pmv[1][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = motion->pmv[1][1] + get_xvmc_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion_y;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static void motion_fi_dmv (picture_t * picture, motion_t * motion,
+			   void (** table) (uint8_t *, uint8_t *, int, int))
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int motion_x, motion_y;
+    /* Parses one motion vector and stores it in both pmv[0] and pmv[1]; `table` is unused. */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = motion->pmv[0][0] + get_xvmc_motion_delta (picture,
+						     motion->f_code[0]);
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    /* NOTE(review): no dmvector bits are consumed in this function - confirm this is intended */
+    motion_y = motion->pmv[0][1] + get_xvmc_motion_delta (picture,
+						     motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+
+    // TODO: field select may need handling here for bob (weave ok); forced to 0 for now
+    picture->XvMC_mv_field_sel[0][0] = picture->XvMC_mv_field_sel[1][0] = 0;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+
+static void motion_fi_conceal (picture_t * picture)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int tmp;
+    /* Parsing only: skips the field select bit, updates f_motion.pmv, then drops the marker bit. */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    DUMPBITS (bit_buf, bits, 1); /* remove field_select */
+
+    tmp = (picture->f_motion.pmv[0][0] +
+	   get_xvmc_motion_delta (picture, picture->f_motion.f_code[0]));
+    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[0]);
+    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[0][0] = tmp;
+    /* vertical component, same treatment */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    tmp = (picture->f_motion.pmv[0][1] +
+	   get_xvmc_motion_delta (picture, picture->f_motion.f_code[1]));
+    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[1]);
+    picture->f_motion.pmv[1][1] = picture->f_motion.pmv[0][1] = tmp;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+#define MOTION_CALL(routine,direction)				\
+do {								\
+    if ((direction) & MACROBLOCK_MOTION_FORWARD)		\
+	routine (picture, &(picture->f_motion), mpeg2_mc.put);	\
+    if ((direction) & MACROBLOCK_MOTION_BACKWARD)		\
+	routine (picture, &(picture->b_motion),			\
+		 ((direction) & MACROBLOCK_MOTION_FORWARD ?	\
+		  mpeg2_mc.avg : mpeg2_mc.put));		\
+} while (0) /* calls routine for f_motion and/or b_motion as flagged in direction; backward gets avg only if forward also ran */
+
+#define NEXT_MACROBLOCK							    \
+do {									    \
+    picture->offset += 16;						    \
+    if (picture->offset == picture->coded_picture_width) {		    \
+	do { /* just so we can use the break statement */		    \
+	    if (picture->current_frame->proc_slice) {			    \
+		picture->current_frame->proc_slice (picture->current_frame, \
+					      picture->dest);		    \
+		if (picture->picture_coding_type == B_TYPE)		    \
+		    break;						    \
+	    }								    \
+	    picture->dest[0] += 16 * picture->pitches[0];		    \
+	    picture->dest[1] += 8 * picture->pitches[1];		    \
+	    picture->dest[2] += 8 * picture->pitches[2];		    \
+	} while (0);							    \
+	picture->v_offset += 16;					    \
+	if (picture->v_offset > picture->limit_y) {			    \
+	    if (mpeg2_cpu_state_restore)				    \
+		mpeg2_cpu_state_restore (&cpu_state);			    \
+	    return;							    \
+	}								    \
+	picture->offset = 0;						    \
+    }									    \
+} while (0) /* steps to the next macroblock; needs a local cpu_state and RETURNS from the caller once v_offset passes limit_y */
+
+static inline int slice_xvmc_init (picture_t * picture, int code)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    int offset, height;
+    struct vo_frame_s * forward_reference_frame;
+    struct vo_frame_s * backward_reference_frame;
+    const MBAtab * mba;
+    /* Sets up per-slice state for slice row `code`; returns 0 when ready, 1 on a bad address code or row out of range. */
+    offset = picture->picture_structure == BOTTOM_FIELD;
+    picture->pitches[0] = picture->current_frame->pitches[0];
+    picture->pitches[1] = picture->current_frame->pitches[1];
+    picture->pitches[2] = picture->current_frame->pitches[2];
+
+    if( picture->forward_reference_frame ) {
+        forward_reference_frame = picture->forward_reference_frame;
+    }
+    else {
+        /* no forward reference: use the current frame instead (was: return 1) */
+        forward_reference_frame = picture->current_frame;
+    }
+    /* same fallback for a missing backward reference */
+    if( picture->backward_reference_frame ) {
+        backward_reference_frame = picture->backward_reference_frame;
+    }
+    else {
+        /* no backward reference: use the current frame instead (was: return 1) */
+        backward_reference_frame = picture->current_frame;
+    }
+    /* ref[0] plane pointers; for a bottom field they start one line (one pitch) down */
+    picture->f_motion.ref[0][0] =
+        forward_reference_frame->base[0] + (offset ? picture->pitches[0] : 0);
+    picture->f_motion.ref[0][1] =
+        forward_reference_frame->base[1] + (offset ? picture->pitches[1] : 0);
+    picture->f_motion.ref[0][2] =
+        forward_reference_frame->base[2] + (offset ? picture->pitches[2] : 0);
+    /* backward reference planes, same layout */
+    picture->b_motion.ref[0][0] =
+	backward_reference_frame->base[0] + (offset ? picture->pitches[0] : 0);
+    picture->b_motion.ref[0][1] =
+	backward_reference_frame->base[1] + (offset ? picture->pitches[1] : 0);
+    picture->b_motion.ref[0][2] =
+	backward_reference_frame->base[2] + (offset ? picture->pitches[2] : 0);
+    /* field pictures additionally get ref[1] (the other line parity) and the ref2 lookup */
+    if (picture->picture_structure != FRAME_PICTURE) {
+	uint8_t ** forward_ref;
+	int bottom_field;
+
+	bottom_field = (picture->picture_structure == BOTTOM_FIELD);
+	picture->dmv_offset = bottom_field ? 1 : -1;
+	picture->f_motion.ref2[0] = picture->f_motion.ref[bottom_field];
+	picture->f_motion.ref2[1] = picture->f_motion.ref[!bottom_field];
+	picture->b_motion.ref2[0] = picture->b_motion.ref[bottom_field];
+	picture->b_motion.ref2[1] = picture->b_motion.ref[!bottom_field];
+
+	forward_ref = forward_reference_frame->base;
+	if (picture->second_field && (picture->picture_coding_type != B_TYPE))
+	    forward_ref = picture->current_frame->base; /* second field of a non-B picture reads from the current frame */
+
+	picture->f_motion.ref[1][0] = forward_ref[0] + (bottom_field ? 0 : picture->pitches[0]);
+	picture->f_motion.ref[1][1] = forward_ref[1] + (bottom_field ? 0 : picture->pitches[1]);
+	picture->f_motion.ref[1][2] = forward_ref[2] + (bottom_field ? 0 : picture->pitches[2]);
+
+	picture->b_motion.ref[1][0] =
+	    backward_reference_frame->base[0] + (bottom_field ? 0 : picture->pitches[0]);
+	picture->b_motion.ref[1][1] =
+	    backward_reference_frame->base[1] + (bottom_field ? 0 : picture->pitches[1]);
+	picture->b_motion.ref[1][2] =
+	    backward_reference_frame->base[2] + (bottom_field ? 0 : picture->pitches[2]);
+    }
+    /* motion vector predictors start at zero for every slice */
+    picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+    picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
+    picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+
+    picture->v_offset = (code - 1) * 16;
+    offset = (code - 1);
+    if (picture->current_frame->proc_slice && picture->picture_coding_type == B_TYPE)
+	offset = 0;
+    else if (picture->picture_structure != FRAME_PICTURE)
+	offset = 2 * offset;
+
+    picture->dest[0] = picture->current_frame->base[0] + picture->pitches[0] * offset * 16;
+    picture->dest[1] = picture->current_frame->base[1] + picture->pitches[1] * offset * 8;
+    picture->dest[2] = picture->current_frame->base[2] + picture->pitches[2] * offset * 8;
+
+    height = picture->coded_picture_height;
+    switch (picture->picture_structure) {
+    case BOTTOM_FIELD:
+	picture->dest[0] += picture->pitches[0];
+	picture->dest[1] += picture->pitches[1];
+	picture->dest[2] += picture->pitches[2];
+	/* fall through */
+    case TOP_FIELD:
+	picture->pitches[0] <<= 1;
+	picture->pitches[1] <<= 1;
+	picture->pitches[2] <<= 1;
+	height >>= 1;
+    }
+    picture->limit_x = 2 * picture->coded_picture_width - 32;
+    picture->limit_y_16 = 2 * height - 32;
+    picture->limit_y_8 = 2 * height - 16;
+    picture->limit_y = height - 16;
+
+    // TODO: conversion to signed format
+    if((picture->mc->xvmc_accel & ACCEL) == MOTION_ACCEL &&
+       !(picture->mc->xvmc_accel & SIGNED_INTRA)) {
+      // motion compensation only, unsigned intra
+      // original value:
+      picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+	picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision + 7);
+    } else {
+      // otherwise (e.g. MOTION_ACCEL+SIGNED_INTRA): signed intra, predictors start at 0
+      picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+	picture->dc_dct_pred[2] = 0;
+    }
+
+    picture->quantizer_scale = get_xvmc_quantizer_scale (picture);
+
+    /* ignore intra_slice and all the extra data */
+    while (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 9);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+    }
+
+    /* decode initial macroblock address increment */
+    offset = 0;
+    while (1) {
+	if (bit_buf >= 0x08000000) {
+	    mba = MBA_5 + (UBITS (bit_buf, 6) - 2);
+	    break;
+	} else if (bit_buf >= 0x01800000) {
+	    mba = MBA_11 + (UBITS (bit_buf, 12) - 24);
+	    break;
+	} else switch (UBITS (bit_buf, 12)) {
+	case 8:		/* macroblock_escape */
+	    offset += 33;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	case 15:	/* macroblock_stuffing (MPEG1 only) */
+	    bit_buf &= 0xfffff;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	default:	/* error */
+	    return 1;
+	}
+    }
+    DUMPBITS (bit_buf, bits, mba->len + 1);
+    picture->offset = (offset + mba->mba) << 4; /* macroblock count -> horizontal offset in units of 16 */
+    /* an offset past the row width wraps onto the following macroblock rows */
+    while (picture->offset - picture->coded_picture_width >= 0) {
+	picture->offset -= picture->coded_picture_width;
+	if ((picture->current_frame->proc_slice == NULL) ||
+	    (picture->picture_coding_type != B_TYPE)) {
+	    picture->dest[0] += 16 * picture->pitches[0];
+	    picture->dest[1] += 8 * picture->pitches[1];
+	    picture->dest[2] += 8 * picture->pitches[2];
+	}
+	picture->v_offset += 16;
+    }
+    if (picture->v_offset > picture->limit_y)
+	return 1;
+
+    return 0;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+void mpeg2_xvmc_slice (mpeg2dec_accel_t *accel, picture_t * picture, int code, uint8_t * buffer)
+{
+#define bit_buf (picture->bitstream_buf)
+#define bits (picture->bitstream_bits)
+#define bit_ptr (picture->bitstream_ptr)
+    cpu_state_t cpu_state;
+    xine_xvmc_t *xvmc = (xine_xvmc_t *) picture->current_frame->accel_data;
+    /* Parses the slice in `buffer` and hands every macroblock to xvmc->proc_macro_block(). */
+    if (1 == code) {
+      accel->xvmc_last_slice_code = 0;
+    }
+    if ((code != accel->xvmc_last_slice_code + 1) &&
+	(code != accel->xvmc_last_slice_code))
+	return;
+    /* only a slice in the same row as the last one, or in the next row, gets this far */
+    bitstream_init (picture, buffer);
+
+    if (slice_xvmc_init (picture, code))
+	return;
+
+    if (mpeg2_cpu_state_save)
+	mpeg2_cpu_state_save (&cpu_state);
+
+    while (1) {
+	int macroblock_modes;
+	int mba_inc;
+	const MBAtab * mba;
+
+	NEEDBITS (bit_buf, bits, bit_ptr);
+
+	macroblock_modes = get_xvmc_macroblock_modes (picture); //macroblock_modes()
+	picture->XvMC_mb_type = macroblock_modes & 0x1F;
+	picture->XvMC_dct_type = (macroblock_modes & DCT_TYPE_INTERLACED)>>5;
+	picture->XvMC_motion_type = (macroblock_modes & MOTION_TYPE_MASK)>>6;
+
+	picture->XvMC_x = picture->offset/16;
+	picture->XvMC_y = picture->v_offset/16;
+
+	if((picture->XvMC_x == 0) && (picture->XvMC_y == 0)) {
+	  picture->XvMC_mv_field_sel[0][0] = 
+	    picture->XvMC_mv_field_sel[1][0] = 
+	    picture->XvMC_mv_field_sel[0][1] = 
+	    picture->XvMC_mv_field_sel[1][1] = 0;
+	}
+
+	picture->XvMC_cbp = 0x3f;  // TODO: set for intra 4:2:0, 6 blocks (yyyyuv) all enabled
+
+	/* maybe integrate MACROBLOCK_QUANT test into get_xvmc_macroblock_modes ? */
+	if (macroblock_modes & MACROBLOCK_QUANT)
+	    picture->quantizer_scale = get_xvmc_quantizer_scale (picture);
+	if (macroblock_modes & MACROBLOCK_INTRA) {
+
+	    int DCT_offset, DCT_stride;
+	    int offset;
+	    uint8_t * dest_y;
+
+	    if (picture->concealment_motion_vectors) {
+		if (picture->picture_structure == FRAME_PICTURE)
+		    motion_fr_conceal (picture);
+		else
+		    motion_fi_conceal (picture);
+	    } else {
+		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+		picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
+		picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+	    }
+
+	    if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		DCT_offset = picture->pitches[0];
+		DCT_stride = picture->pitches[0] * 2;
+	    } else {
+		DCT_offset = picture->pitches[0] * 8;
+		DCT_stride = picture->pitches[0];
+	    }
+	    offset = picture->offset;
+	    dest_y = picture->dest[0] + offset;
+	    // unrolled loop of 6 block(i) calls in macroblock()
+	    slice_xvmc_intra_DCT (picture, 0, dest_y, DCT_stride);
+	    slice_xvmc_intra_DCT (picture, 0, dest_y + 8, DCT_stride);
+	    slice_xvmc_intra_DCT (picture, 0, dest_y + DCT_offset, DCT_stride);
+	    slice_xvmc_intra_DCT (picture, 0, dest_y + DCT_offset + 8, DCT_stride);
+	    slice_xvmc_intra_DCT (picture, 1, picture->dest[1] + (offset >> 1),
+			     picture->pitches[1]);
+	    slice_xvmc_intra_DCT (picture, 2, picture->dest[2] + (offset >> 1),
+			     picture->pitches[2]);
+
+	    if (picture->picture_coding_type == D_TYPE) {
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	} else {
+	    picture->XvMC_cbp = 0;
+
+	    if (picture->picture_structure == FRAME_PICTURE)
+		switch (macroblock_modes & MOTION_TYPE_MASK) {
+		case MC_FRAME:
+		    if (picture->mpeg1) {
+			MOTION_CALL (motion_mp1, macroblock_modes);
+		    } else {
+			MOTION_CALL (motion_fr_frame, macroblock_modes);
+		    }
+		    break;
+
+		case MC_FIELD:
+		    // written out by hand instead of MOTION_CALL: motion_fr_field takes an extra dir argument
+
+		    if ((macroblock_modes) & MACROBLOCK_MOTION_FORWARD)
+		      motion_fr_field(picture, &(picture->f_motion),
+				       mpeg2_mc.put,0);
+		    if ((macroblock_modes) & MACROBLOCK_MOTION_BACKWARD)
+		      motion_fr_field(picture, &(picture->b_motion),
+			     ((macroblock_modes) & MACROBLOCK_MOTION_FORWARD ?
+				mpeg2_mc.avg : mpeg2_mc.put),1);
+
+		    break;
+
+		case MC_DMV:
+		    MOTION_CALL (motion_fr_dmv, MACROBLOCK_MOTION_FORWARD);
+		    break;
+
+		case 0:
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    //	 MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    break;
+		}
+	    else
+		switch (macroblock_modes & MOTION_TYPE_MASK) {
+		case MC_FIELD:
+		    MOTION_CALL (motion_fi_field, macroblock_modes);
+		    break;
+
+		case MC_16X8:
+		    MOTION_CALL (motion_fi_16x8, macroblock_modes);
+		    break;
+
+		case MC_DMV:
+		    MOTION_CALL (motion_fi_dmv, MACROBLOCK_MOTION_FORWARD);
+		    break;
+
+		case 0:
+		    /* non-intra mb without forward mv in a P picture */
+		    picture->f_motion.pmv[0][0] = 0;
+		    picture->f_motion.pmv[0][1] = 0;
+		    picture->f_motion.pmv[1][0] = 0;
+		    picture->f_motion.pmv[1][1] = 0;
+		    //	 MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    break;
+		}
+
+	    if (macroblock_modes & MACROBLOCK_PATTERN) {
+		int coded_block_pattern;
+		int DCT_offset, DCT_stride;
+		int offset;
+		uint8_t * dest_y;
+
+		if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		    DCT_offset = picture->pitches[0];
+		    DCT_stride = picture->pitches[0] * 2;
+		} else {
+		    DCT_offset = picture->pitches[0] * 8;
+		    DCT_stride = picture->pitches[0];
+		}
+
+		picture->XvMC_cbp = coded_block_pattern = get_xvmc_coded_block_pattern (picture);
+		offset = picture->offset;
+		dest_y = picture->dest[0] + offset;
+		// TODO  optimize not fully used for idct accel only mc.
+		if (coded_block_pattern & 0x20)
+		    slice_xvmc_non_intra_DCT (picture, dest_y, DCT_stride); // cc0 luma 0
+		if (coded_block_pattern & 0x10)
+		    slice_xvmc_non_intra_DCT (picture, dest_y + 8, DCT_stride); // cc0 luma 1
+		if (coded_block_pattern & 0x08)
+		    slice_xvmc_non_intra_DCT (picture, dest_y + DCT_offset,
+					 DCT_stride); // cc0 luma 2
+		if (coded_block_pattern & 0x04)
+		    slice_xvmc_non_intra_DCT (picture, dest_y + DCT_offset + 8,
+					 DCT_stride); // cc0 luma 3
+		if (coded_block_pattern & 0x2)
+		    slice_xvmc_non_intra_DCT (picture,
+					 picture->dest[1] + (offset >> 1),
+					 picture->pitches[1]); // cc1 chroma
+		if (coded_block_pattern & 0x1)
+		    slice_xvmc_non_intra_DCT (picture,
+					 picture->dest[2] + (offset >> 1),
+					 picture->pitches[2]); // cc2 chroma
+	    }
+
+            if((picture->mc->xvmc_accel & ACCEL) == MOTION_ACCEL &&
+	       !(picture->mc->xvmc_accel & SIGNED_INTRA)) {
+	        // original value:
+	        picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+		    picture->dc_dct_pred[2] = 128 << picture->intra_dc_precision;
+
+	    } else { // otherwise (e.g. MOTION_ACCEL+SIGNED_INTRA): predictors reset to 0
+	        picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+		    picture->dc_dct_pred[2] = 0;
+	    }
+
+	}
+        xvmc->proc_macro_block(picture->XvMC_x, picture->XvMC_y,
+					 picture->XvMC_mb_type,
+					 picture->XvMC_motion_type,
+					 picture->XvMC_mv_field_sel,
+					 picture->XvMC_dmvector,
+					 picture->XvMC_cbp,
+					 picture->XvMC_dct_type,
+					 picture->current_frame,
+					 picture->forward_reference_frame,
+					 picture->backward_reference_frame,
+					 picture->picture_structure,
+					 picture->second_field,
+				         picture->f_motion.pmv,
+				         picture->b_motion.pmv);
+
+
+	NEXT_MACROBLOCK;
+
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	mba_inc = 0;
+	while (1) {
+	    if (bit_buf >= 0x10000000) {
+		mba = MBA_5 + (UBITS (bit_buf, 5) - 2);
+		break;
+	    } else if (bit_buf >= 0x03000000) {
+		mba = MBA_11 + (UBITS (bit_buf, 11) - 24);
+		break;
+	    } else switch (UBITS (bit_buf, 11)) {
+	    case 8:		/* macroblock_escape */
+		mba_inc += 33;
+		/* fall through */
+	    case 15:	/* macroblock_stuffing (MPEG1 only) */
+		DUMPBITS (bit_buf, bits, 11);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		continue;
+	    default:	/* end of slice, or error */
+		if (mpeg2_cpu_state_restore)
+		    mpeg2_cpu_state_restore (&cpu_state);
+		accel->xvmc_last_slice_code = code;
+		return;
+	    }
+	}
+	DUMPBITS (bit_buf, bits, mba->len);
+	mba_inc += mba->mba;
+	if (mba_inc) {
+	    // TODO: conversion to signed format
+          if((picture->mc->xvmc_accel & ACCEL) == MOTION_ACCEL &&
+	     !(picture->mc->xvmc_accel & SIGNED_INTRA)) {
+	    // original value:
+	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+	      picture->dc_dct_pred[2] = 128 << picture->intra_dc_precision;
+	  } else { // otherwise (e.g. MOTION_ACCEL+SIGNED_INTRA): predictors reset to 0
+	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
+	      picture->dc_dct_pred[2] = 0;
+	  }
+
+	    picture->XvMC_cbp = 0; /* the mba_inc macroblocks emitted below carry no coded blocks */
+	    if (picture->picture_coding_type == P_TYPE) {
+		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
+		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+
+		do {
+		    if(picture->mc->xvmc_accel) {
+
+		        /* derive motion_type */
+		        if(picture->picture_structure == FRAME_PICTURE) {
+			  picture->XvMC_motion_type = XINE_MC_FRAME;
+			} else {
+			  picture->XvMC_motion_type = XINE_MC_FIELD;
+			  /* predict from field of same parity */
+			  picture->XvMC_mv_field_sel[0][0] =
+			    picture->XvMC_mv_field_sel[0][1] =
+			      (picture->picture_structure==BOTTOM_FIELD);
+			}
+			picture->XvMC_mb_type = macroblock_modes & 0x1E; /* mb type of the last parsed macroblock with bit 0 cleared */
+			picture->XvMC_x = picture->offset/16;
+			picture->XvMC_y = picture->v_offset/16;
+
+			xvmc->proc_macro_block(picture->XvMC_x,picture->XvMC_y,
+					 picture->XvMC_mb_type,
+					 picture->XvMC_motion_type,
+					 picture->XvMC_mv_field_sel,
+					 picture->XvMC_dmvector,
+					 picture->XvMC_cbp,
+					 picture->XvMC_dct_type,
+					 picture->current_frame,
+					 picture->forward_reference_frame,
+					 picture->backward_reference_frame,
+					 picture->picture_structure,
+					 picture->second_field,
+				         picture->f_motion.pmv,
+				         picture->b_motion.pmv);
+		    } else {
+		      // MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    }
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    } else {
+		do {
+		    if(picture->mc->xvmc_accel) {
+
+		        /* derive motion_type */
+		        if(picture->picture_structure == FRAME_PICTURE) {
+			  picture->XvMC_motion_type = XINE_MC_FRAME;
+			} else {
+			  picture->XvMC_motion_type = XINE_MC_FIELD;
+			  /* predict from field of same parity */
+			  picture->XvMC_mv_field_sel[0][0] =
+			    picture->XvMC_mv_field_sel[0][1] =
+			      (picture->picture_structure==BOTTOM_FIELD);
+			}
+
+			picture->XvMC_mb_type = macroblock_modes & 0x1E; /* mb type of the last parsed macroblock with bit 0 cleared */
+			picture->XvMC_x = picture->offset/16;
+			picture->XvMC_y = picture->v_offset/16;
+
+			xvmc->proc_macro_block(picture->XvMC_x,picture->XvMC_y,
+					 picture->XvMC_mb_type,
+					 picture->XvMC_motion_type,
+					 picture->XvMC_mv_field_sel,
+					 picture->XvMC_dmvector,
+					 picture->XvMC_cbp,
+					 picture->XvMC_dct_type,
+					 picture->current_frame,
+					 picture->forward_reference_frame,
+					 picture->backward_reference_frame,
+					 picture->picture_structure,
+					 picture->second_field,
+				         picture->f_motion.pmv,
+				         picture->b_motion.pmv);
+		    } else {
+		        MOTION_CALL (motion_reuse, macroblock_modes);
+		    }
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    }
+	}
+    }
+    accel->xvmc_last_slice_code = code; /* NOTE(review): unreachable - the while (1) above is only left via return */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
diff --git a/src/video_dec/libmpeg2/slice_xvmc_vld.c b/src/video_dec/libmpeg2/slice_xvmc_vld.c
new file mode 100644
index 000000000..60fa8b4f8
--- /dev/null
+++ b/src/video_dec/libmpeg2/slice_xvmc_vld.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2004 The Unichrome project. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTIES OR REPRESENTATIONS; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include "mpeg2.h"
+#include "mpeg2_internal.h"
+#include "xvmc_vld.h"
+
+static const uint8_t zig_zag_scan[64] ATTR_ALIGN(16) =
+{
+    /* Zig-zag scan pattern; mpeg2_xxmc_slice() uses it as the destination index when copying quantizer matrices */
+     0, 1, 8,16, 9, 2, 3,10,
+    17,24,32,25,18,11, 4, 5,
+    12,19,26,33,40,48,41,34,
+    27,20,13, 6, 7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+static const uint8_t alternate_scan [64] ATTR_ALIGN(16) =
+{
+    /* Alternate scan pattern; mpeg2_xxmc_slice() selects it instead of zig_zag_scan when vft->scan is set */
+    0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49,
+    41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43,
+    51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45,
+    53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63
+};
+
+void mpeg2_xxmc_slice( mpeg2dec_accel_t *accel, picture_t *picture, 
+		       int code, uint8_t *buffer, uint32_t chunk_size, 
+		       uint8_t *chunk_buffer)
+/* Fills xxmc->vld_frame when slice code 1 arrives after a different code, then hands each slice chunk to proc_xxmc_slice(); `buffer` is unused. */
+{
+  vo_frame_t
+    *frame = picture->current_frame;
+  xine_xxmc_t 
+    *xxmc = (xine_xxmc_t *) frame->accel_data;
+  xine_vld_frame_t 
+    *vft = &xxmc->vld_frame;
+  unsigned
+    mb_frame_height;
+  int 
+    i;
+  const uint8_t *
+    scan_pattern;
+  float
+    ms_per_slice;
+
+  if (1 == code && accel->xvmc_last_slice_code != 1) {
+    frame->bad_frame = 1;
+    accel->slices_per_row = 1;
+    accel->row_slice_count = 1;
+
+    /*
+     * Check that first field went through OK. Otherwise,
+     * indicate bad frame.
+     */
+    
+    if (picture->second_field) {
+      accel->xvmc_last_slice_code = (xxmc->decoded) ? 0 : -1;
+      xxmc->decoded = 0;
+    } else {
+      accel->xvmc_last_slice_code = 0;
+    }
+
+    mb_frame_height =
+      (!(picture->mpeg1) && (picture->progressive_sequence)) ?
+      2*((picture->coded_picture_height+31) >> 5) :
+      (picture->coded_picture_height+15) >> 4;
+    accel->xxmc_mb_pic_height = (picture->picture_structure == FRAME_PICTURE ) ?
+      mb_frame_height : mb_frame_height >> 1;
+    /* assumes frame->duration is in 90 kHz ticks - TODO confirm; xxmc->sleep is clamped to at least 1 */
+    ms_per_slice = 1000. / (90000. * mb_frame_height) * frame->duration;
+    xxmc->sleep = 1. / (ms_per_slice * 0.45); 
+    if (xxmc->sleep < 1.) xxmc->sleep = 1.;
+
+    if (picture->mpeg1) {
+      vft->mv_ranges[0][0] = picture->b_motion.f_code[0];
+      vft->mv_ranges[0][1] = picture->b_motion.f_code[0];
+      vft->mv_ranges[1][0] = picture->f_motion.f_code[0];
+      vft->mv_ranges[1][1] = picture->f_motion.f_code[0];
+    } else {
+      vft->mv_ranges[0][0] = picture->b_motion.f_code[0];
+      vft->mv_ranges[0][1] = picture->b_motion.f_code[1];
+      vft->mv_ranges[1][0] = picture->f_motion.f_code[0];
+      vft->mv_ranges[1][1] = picture->f_motion.f_code[1];
+    }
+
+    vft->picture_structure = picture->picture_structure;
+    vft->picture_coding_type = picture->picture_coding_type;
+    vft->mpeg_coding = (picture->mpeg1) ? 0 : 1;
+    vft->progressive_sequence = picture->progressive_sequence;
+    vft->scan = (picture->scan == mpeg2_scan_alt);
+    vft->pred_dct_frame = picture->frame_pred_frame_dct;
+    vft->concealment_motion_vectors = 
+      picture->concealment_motion_vectors;
+    vft->q_scale_type = picture->q_scale_type;
+    vft->intra_vlc_format = picture->intra_vlc_format;
+    vft->intra_dc_precision = picture->intra_dc_precision;
+    vft->second_field = picture->second_field;
+
+    /*
+     * Translation of libmpeg2's Q-matrix layout to VLD XvMC's.
+     * Errors here will give
+     * blocky artifacts and sometimes wrong colors.
+     */
+
+    scan_pattern = (vft->scan) ? alternate_scan : zig_zag_scan;
+
+    if ((vft->load_intra_quantizer_matrix = picture->load_intra_quantizer_matrix)) {
+      for (i=0; i<64; ++i) {
+	vft->intra_quantizer_matrix[scan_pattern[i]] = 
+	  picture->intra_quantizer_matrix[picture->scan[i]]; 
+      }
+    }      
+
+    if ((vft->load_non_intra_quantizer_matrix = picture->load_non_intra_quantizer_matrix)) {
+      for (i=0; i<64; ++i) {
+	vft->non_intra_quantizer_matrix[scan_pattern[i]] = 
+	  picture->non_intra_quantizer_matrix[picture->scan[i]];
+      }
+    }
+    /* the load flags were copied into vft above; clear them on the picture */
+    picture->load_intra_quantizer_matrix = 0;
+    picture->load_non_intra_quantizer_matrix = 0;
+    vft->forward_reference_frame = picture->forward_reference_frame;
+    vft->backward_reference_frame = picture->backward_reference_frame;
+    xxmc->proc_xxmc_begin( frame ); 
+    if (xxmc->result != 0) {
+      accel->xvmc_last_slice_code=-1;
+    }
+  }
+  
+  if (((code == accel->xvmc_last_slice_code + 1) || 
+       (code == accel->xvmc_last_slice_code))) {
+
+    /*
+     * Send this slice to the output plugin. May stall for a long
+     * time in proc_slice;
+     */
+
+    frame->bad_frame = 1;
+    xxmc->slice_data_size = chunk_size;
+    xxmc->slice_data = chunk_buffer;
+    xxmc->slice_code = code;
+    
+    xxmc->proc_xxmc_slice( frame );
+    
+    if (xxmc->result != 0) {
+	accel->xvmc_last_slice_code=-1;
+	return;
+    }
+    /*
+     * Keep track of slices.
+     */
+
+    accel->row_slice_count = (accel->xvmc_last_slice_code == code) ? 
+      accel->row_slice_count + 1 : 1;
+    accel->slices_per_row = (accel->row_slice_count > accel->slices_per_row) ? 
+      accel->row_slice_count:accel->slices_per_row;
+    accel->xvmc_last_slice_code = code;
+
+  } else  {
+
+    /*
+     * An error has occurred.
+     */
+
+    lprintf("libmpeg2: VLD XvMC: Slice error.\n");
+    accel->xvmc_last_slice_code = -1;
+    return;
+  }
+}
+
+void mpeg2_xxmc_vld_frame_complete(mpeg2dec_accel_t *accel, picture_t *picture, int code) 
+{
+  vo_frame_t
+    *frame = picture->current_frame;
+  xine_xxmc_t 
+    *xxmc = (xine_xxmc_t *) frame->accel_data;
+  /* Flushes the frame through proc_xxmc_flush() and records the outcome in xxmc->decoded and frame->bad_frame. */
+  if (xxmc->decoded) return;
+  if (accel->xvmc_last_slice_code == -1) {
+    xxmc->proc_xxmc_flush( frame );
+    return;
+  }
+  /* proceed when code is not 0xff, or when the last row and all of its slices have been seen */
+  if ((code != 0xff) || ((accel->xvmc_last_slice_code == 
+			  accel->xxmc_mb_pic_height) && 
+			 accel->slices_per_row == accel->row_slice_count)) {
+
+    xxmc->proc_xxmc_flush( frame );
+    
+    if (xxmc->result) {
+      accel->xvmc_last_slice_code=-1;
+      frame->bad_frame = 1;
+      return;
+    }
+    xxmc->decoded = 1;
+    accel->xvmc_last_slice_code = 0;
+    if (picture->picture_structure == 3 || picture->second_field) { /* 3 is presumably FRAME_PICTURE - confirm */
+      if (xxmc->result == 0) 
+	frame->bad_frame = 0;
+    } 
+  }
+}
diff --git a/src/video_dec/libmpeg2/stats.c b/src/video_dec/libmpeg2/stats.c
new file mode 100644
index 000000000..63c701179
--- /dev/null
+++ b/src/video_dec/libmpeg2/stats.c
@@ -0,0 +1,317 @@
+/*
+ * stats.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2_internal.h"
+
+static int debug_level = -1;
+
+/* Determine if debug output is required. */
+/* We could potentially have multiple levels of debug info */
+static int debug_is_on (void)
+{
+    char * env_var;
+    /* debug_level starts at -1: the environment has not been checked yet. */
+    if (debug_level < 0) {
+	env_var = getenv ("MPEG2_DEBUG");
+	/* Only the variable's presence matters; its value is never read. */
+	if (env_var)
+	    debug_level = 1;
+	else
+	    debug_level = 0;
+    }
+    /* Cached answer: 1 if MPEG2_DEBUG was set at the first call, else 0. */
+    return debug_level;
+}
+
+static void stats_picture (uint8_t * buffer)
+{
+    static const char *const picture_coding_type_str [8] = {
+	"Invalid picture type",
+	"I-type",
+	"P-type",
+	"B-type",
+	"D (very bad)",
+	"Invalid","Invalid","Invalid"
+    };
+    /* Print the picture header fields decoded from buffer to stderr. */
+    int picture_coding_type;
+    int temporal_reference;
+    int vbv_delay;
+    /* Bit-packed from buffer[0] on: 10 bits, then 3 bits, then 16 bits. */
+    temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
+    picture_coding_type = (buffer [1] >> 3) & 7;
+    vbv_delay = ((buffer[1] << 13) | (buffer[2] << 5) |
+		 (buffer[3] >> 3)) & 0xffff;
+    /* picture_coding_type is masked to 0..7, matching the 8-entry table. */
+    fprintf (stderr, " (picture) %s temporal_reference %d, vbv_delay %d\n",
+	     picture_coding_type_str [picture_coding_type],
+	     temporal_reference, vbv_delay);
+}
+
+static void stats_user_data (uint8_t * buffer)
+{   /* Only announces the chunk on stderr; buffer is not read. */
+    fprintf (stderr, " (user_data)\n");
+}
+
+static void stats_sequence (uint8_t * buffer)
+{   /* Dump the sequence header fields parsed from buffer to stderr. */
+    static const char *const aspect_ratio_information_str[8] = {
+	"Invalid Aspect Ratio",
+	"1:1",
+	"4:3",
+	"16:9",
+	"2.21:1",
+	"Invalid Aspect Ratio",
+	"Invalid Aspect Ratio",
+	"Invalid Aspect Ratio"
+    };
+    static const char *const frame_rate_str[16] = {
+	"Invalid frame_rate_code",
+	"23.976", "24", "25" , "29.97",
+	"30" , "50", "59.94", "60" ,
+	"Invalid frame_rate_code", "Invalid frame_rate_code",
+	"Invalid frame_rate_code", "Invalid frame_rate_code",
+	"Invalid frame_rate_code", "Invalid frame_rate_code",
+	"Invalid frame_rate_code"
+    };
+    int horizontal_size;
+    int vertical_size;
+    int aspect_ratio_information;
+    int frame_rate_code;
+    int bit_rate_value;
+    int vbv_buffer_size_value;
+    int constrained_parameters_flag;
+    int load_intra_quantizer_matrix;
+    int load_non_intra_quantizer_matrix;
+
+    vertical_size = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    horizontal_size = vertical_size >> 12;
+    vertical_size &= 0xfff;
+    aspect_ratio_information = buffer[3] >> 4;
+    if (aspect_ratio_information > 7)	/* 4-bit field but only 8 names: */
+	aspect_ratio_information = 0;	/* print codes 8..15 as invalid */
+    frame_rate_code = buffer[3] & 15;
+    bit_rate_value = (buffer[4] << 10) | (buffer[5] << 2) | (buffer[6] >> 6);
+    vbv_buffer_size_value = ((buffer[6] << 5) | (buffer[7] >> 3)) & 0x3ff;
+    constrained_parameters_flag = buffer[7] & 4;
+    load_intra_quantizer_matrix = buffer[7] & 2;
+    if (load_intra_quantizer_matrix)
+	buffer += 64;	/* step over the loaded intra matrix */
+    load_non_intra_quantizer_matrix = buffer[7] & 1;
+    fprintf (stderr, " (seq) %dx%d %s, %s fps, %5.0f kbps, VBV %d kB%s%s%s\n",
+	     horizontal_size, vertical_size,
+	     aspect_ratio_information_str [aspect_ratio_information],
+	     frame_rate_str [frame_rate_code],
+	     bit_rate_value * 400.0 / 1000.0,
+	     2 * vbv_buffer_size_value,
+	     constrained_parameters_flag ? " , CP":"",
+	     load_intra_quantizer_matrix ? " , Custom Intra Matrix":"",
+	     load_non_intra_quantizer_matrix ? " , Custom Non-Intra Matrix":"");
+}
+
+static void stats_sequence_error (uint8_t * buffer)
+{   /* Only announces the chunk on stderr; buffer is not read. */
+    fprintf (stderr, " (sequence_error)\n");
+}
+
+static void stats_sequence_end (uint8_t * buffer)
+{   /* Only announces the chunk on stderr; buffer is not read. */
+    fprintf (stderr, " (sequence_end)\n");
+}
+
+static void stats_group (uint8_t * buffer)
+{
+    /* The 25-bit time_code ends in bit 7 of buffer[3]; both flags follow it. */
+    fprintf (stderr, " (group)%s%s\n", (buffer[3] & 0x40) ? " closed_gop" : "",
+	     (buffer[3] & 0x20) ? " broken_link" : "");
+}
+
+static void stats_slice (int code, uint8_t * buffer)
+{   /* Prints nothing: the per-slice trace below is commented out. */
+    /* fprintf (stderr, " (slice %d)\n", code); */
+}
+
+static void stats_sequence_extension (uint8_t * buffer)
+{
+    static const char *const chroma_format_str[4] = {
+	"Invalid Chroma Format",
+	"4:2:0 Chroma",
+	"4:2:2 Chroma",
+	"4:4:4 Chroma"
+    };
+    /* Print the two sequence extension fields decoded below to stderr. */
+    int progressive_sequence;
+    int chroma_format;
+    /* Both live in buffer[1]: bit 3, and bits 2-1 (0..3, fits the table). */
+    progressive_sequence = (buffer[1] >> 3) & 1;
+    chroma_format = (buffer[1] >> 1) & 3;
+
+    fprintf (stderr, " (seq_ext) progressive_sequence %d, %s\n",
+	     progressive_sequence, chroma_format_str [chroma_format]);
+}
+
+static void stats_sequence_display_extension (uint8_t * buffer)
+{   /* Only announces the extension on stderr; buffer is not read. */
+    fprintf (stderr, " (sequence_display_extension)\n");
+}
+
+static void stats_quant_matrix_extension (uint8_t * buffer)
+{   /* Only announces the extension on stderr; buffer is not read. */
+    fprintf (stderr, " (quant_matrix_extension)\n");
+}
+
+static void stats_copyright_extension (uint8_t * buffer)
+{   /* Only announces the extension on stderr; buffer is not read. */
+    fprintf (stderr, " (copyright_extension)\n");
+}
+
+
+static void stats_sequence_scalable_extension (uint8_t * buffer)
+{   /* Only announces the extension on stderr; buffer is not read. */
+    fprintf (stderr, " (sequence_scalable_extension)\n");
+}
+
+static void stats_picture_display_extension (uint8_t * buffer)
+{   /* Only announces the extension on stderr; buffer is not read. */
+    fprintf (stderr, " (picture_display_extension)\n");
+}
+
+static void stats_picture_coding_extension (uint8_t * buffer)
+{
+    static const char *const picture_structure_str[4] = {
+	"Invalid Picture Structure",
+	"Top field",
+	"Bottom field",
+	"Frame Picture"
+    };
+    /* Print every picture coding extension field decoded below to stderr. */
+    int f_code[2][2];
+    int intra_dc_precision;
+    int picture_structure;
+    int top_field_first;
+    int frame_pred_frame_dct;
+    int concealment_motion_vectors;
+    int q_scale_type;
+    int intra_vlc_format;
+    int alternate_scan;
+    int repeat_first_field;
+    int progressive_frame;
+    /* The first f_code is the low nibble of buffer[0]; the rest pack after it. */
+    f_code[0][0] = buffer[0] & 15;
+    f_code[0][1] = buffer[1] >> 4;
+    f_code[1][0] = buffer[1] & 15;
+    f_code[1][1] = buffer[2] >> 4;
+    intra_dc_precision = (buffer[2] >> 2) & 3;
+    picture_structure = buffer[2] & 3;
+    top_field_first = buffer[3] >> 7;
+    frame_pred_frame_dct = (buffer[3] >> 6) & 1;
+    concealment_motion_vectors = (buffer[3] >> 5) & 1;
+    q_scale_type = (buffer[3] >> 4) & 1;
+    intra_vlc_format = (buffer[3] >> 3) & 1;
+    alternate_scan = (buffer[3] >> 2) & 1;
+    repeat_first_field = (buffer[3] >> 1) & 1;
+    progressive_frame = buffer[4] >> 7;
+    /* picture_structure is masked to 0..3, so it fits the 4-entry table. */
+    fprintf (stderr,
+	     " (pic_ext) %s\n", picture_structure_str [picture_structure]);
+    fprintf (stderr,
+	     " (pic_ext) forward horizontal f_code % d, forward vertical f_code % d\n",
+	     f_code[0][0], f_code[0][1]);
+    fprintf (stderr,
+	     " (pic_ext) backward horizontal f_code % d, backward vertical f_code % d\n", 
+	     f_code[1][0], f_code[1][1]);
+    fprintf (stderr,
+	     " (pic_ext) intra_dc_precision %d, top_field_first %d, frame_pred_frame_dct %d\n",
+	     intra_dc_precision, top_field_first, frame_pred_frame_dct);
+    fprintf (stderr,
+	     " (pic_ext) concealment_motion_vectors %d, q_scale_type %d, intra_vlc_format %d\n",
+	     concealment_motion_vectors, q_scale_type, intra_vlc_format);
+    fprintf (stderr,
+	     " (pic_ext) alternate_scan %d, repeat_first_field %d, progressive_frame %d\n",
+	     alternate_scan, repeat_first_field, progressive_frame);
+}
+
+void mpeg2_stats (int code, uint8_t * buffer)
+{
+    if (! (debug_is_on ()))
+	return;
+    /* Debugging is on: print a summary of the chunk, chosen by start code. */
+    switch (code) {
+    case 0x00:
+	stats_picture (buffer);
+	break;
+    case 0xb2:
+	stats_user_data (buffer);
+	break;
+    case 0xb3:
+	stats_sequence (buffer);
+	break;
+    case 0xb4:
+	stats_sequence_error (buffer);
+	break;
+    case 0xb5:
+	switch (buffer[0] >> 4) {	/* extension id: top nibble of buffer[0] */
+	case 1:
+	    stats_sequence_extension (buffer);
+	    break;
+	case 2:
+	    stats_sequence_display_extension (buffer);
+	    break;
+	case 3:
+	    stats_quant_matrix_extension (buffer);
+	    break;
+	case 4:
+	    stats_copyright_extension (buffer);
+	    break;
+	case 5:
+	    stats_sequence_scalable_extension (buffer);
+	    break;
+	case 7:
+	    stats_picture_display_extension (buffer);
+	    break;
+	case 8:
+	    stats_picture_coding_extension (buffer);
+	    break;
+	default:
+	    fprintf (stderr, " (unknown extension %#x)\n", buffer[0] >> 4);
+	}
+	break;
+    case 0xb7:
+	stats_sequence_end (buffer);
+	break;
+    case 0xb8:
+	stats_group (buffer);
+	break;
+    default:
+	if (code < 0xb0)	/* every other code below 0xb0 goes to stats_slice */
+	    stats_slice (code, buffer);
+	else
+	    fprintf (stderr, " (unknown start code %#02x)\n", code);
+    }
+}
diff --git a/src/video_dec/libmpeg2/vis.h b/src/video_dec/libmpeg2/vis.h
new file mode 100644
index 000000000..69dd49075
--- /dev/null
+++ b/src/video_dec/libmpeg2/vis.h
@@ -0,0 +1,328 @@
+/*
+ * vis.h
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* You may be asking why I hard-code the instruction opcodes and don't
+ * use the normal VIS assembler mnemonics for the VIS instructions.
+ *
+ * The reason is that Sun, in their infinite wisdom, decided that a binary
+ * using a VIS instruction will cause it to be marked (in the ELF headers)
+ * as doing so, and this prevents the OS from loading such binaries if the
+ * current cpu doesn't have VIS.  There is no way to easily override this
+ * behavior of the assembler that I am aware of.
+ *
+ * This totally defeats what libmpeg2 is trying to do which is allow a
+ * single binary to be created, and then detect the availability of VIS
+ * at runtime.
+ *
+ * I'm not saying that tainting the binary by default is bad, rather I'm
+ * saying that not providing a way to override this easily unnecessarily
+ * ties people's hands.
+ *
+ * Thus, we do the opcode encoding by hand and output 32-bit words in
+ * the assembler to keep the binary from becoming tainted.
+ */
+
+#define vis_opc_base	((0x1U << 31) | (0x36 << 19))	/* unsigned: 1 << 31 overflows int */
+#define vis_opf(X)	((X) << 5)
+#define vis_sreg(X)	(X)
+#define vis_dreg(X)	(((X)&0x1f)|((X)>>5))
+#define vis_rs1_s(X)	(vis_sreg(X) << 14)
+#define vis_rs1_d(X)	(vis_dreg(X) << 14)
+#define vis_rs2_s(X)	(vis_sreg(X) << 0)
+#define vis_rs2_d(X)	(vis_dreg(X) << 0)
+#define vis_rd_s(X)	(vis_sreg(X) << 25)
+#define vis_rd_d(X)	(vis_dreg(X) << 25)
+
+#define vis_ss2s(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_dd2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_d(rs1) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_ss2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_sd2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d2s(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s2d(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d12d(opf,rs1,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_d(rs1) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d22d(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_s12s(opf,rs1,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s22s(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s(opf,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rd_s(rd)))
+
+#define vis_d(opf,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rd_d(rd)))
+
+#define vis_r2m(op,rd,mem) \
+	__asm__ __volatile__ (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) )
+
+#define vis_r2m_2(op,rd,mem1,mem2) \
+	__asm__ __volatile__ (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) )
+
+#define vis_m2r(op,mem,rd) \
+	__asm__ __volatile__ (#op "\t[%0], %%f" #rd : : "r" (&(mem)) )
+
+#define vis_m2r_2(op,mem1,mem2,rd) \
+	__asm__ __volatile__ (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) )
+
+static inline void vis_set_gsr(unsigned int _val)
+{
+	register unsigned int val asm("g1");
+	/* The raw opcode word below has rs1 = 1 (g1), so the value must sit in g1. */
+	val = _val;
+	__asm__ __volatile__(".word 0xa7804000"
+			     : : "r" (val));
+}
+
+#define VIS_GSR_ALIGNADDR_MASK	0x0000007
+#define VIS_GSR_ALIGNADDR_SHIFT	0
+#define VIS_GSR_SCALEFACT_MASK	0x0000078
+#define VIS_GSR_SCALEFACT_SHIFT	3
+
+#define vis_ld32(mem,rs1)		vis_m2r(ld, mem, rs1)
+#define vis_ld32_2(mem1,mem2,rs1)	vis_m2r_2(ld, mem1, mem2, rs1)
+#define vis_st32(rs1,mem)		vis_r2m(st, rs1, mem)
+#define vis_st32_2(rs1,mem1,mem2)	vis_r2m_2(st, rs1, mem1, mem2)
+#define vis_ld64(mem,rs1)		vis_m2r(ldd, mem, rs1)
+#define vis_ld64_2(mem1,mem2,rs1)	vis_m2r_2(ldd, mem1, mem2, rs1)
+#define vis_st64(rs1,mem)		vis_r2m(std, rs1, mem)
+#define vis_st64_2(rs1,mem1,mem2)	vis_r2m_2(std, rs1, mem1, mem2)
+
+#define vis_ldblk(mem, rd) \
+do {	register void *__mem asm("g1"); \
+	__mem = &(mem); \
+	__asm__ __volatile__(".word 0xc1985e00 | %1" \
+			     : \
+			     : "r" (__mem), \
+			       "i" (vis_rd_d(rd)) \
+			     : "memory"); \
+} while (0)
+
+#define vis_stblk(rd, mem) \
+do {	register void *__mem asm("g1"); \
+	__mem = &(mem); \
+	__asm__ __volatile__(".word 0xc1b85e00 | %1" \
+			     : \
+			     : "r" (__mem), \
+			       "i" (vis_rd_d(rd)) \
+			     : "memory"); \
+} while (0)
+
+#define vis_membar_storestore()	\
+	__asm__ __volatile__(".word 0x8143e008" : : : "memory")
+
+#define vis_membar_sync()	\
+	__asm__ __volatile__(".word 0x8143e040" : : : "memory")
+
+/* 16 and 32 bit partitioned addition and subtraction.  The normal
+ * versions perform 4 16-bit or 2 32-bit additions or subtractions.
+ * The 's' versions perform 2 16-bit additions/subtractions or a single
+ * 32-bit one.
+ */
+
+#define vis_padd16(rs1,rs2,rd)		vis_dd2d(0x50, rs1, rs2, rd)
+#define vis_padd16s(rs1,rs2,rd)		vis_ss2s(0x51, rs1, rs2, rd)
+#define vis_padd32(rs1,rs2,rd)		vis_dd2d(0x52, rs1, rs2, rd)
+#define vis_padd32s(rs1,rs2,rd)		vis_ss2s(0x53, rs1, rs2, rd)
+#define vis_psub16(rs1,rs2,rd)		vis_dd2d(0x54, rs1, rs2, rd)
+#define vis_psub16s(rs1,rs2,rd)		vis_ss2s(0x55, rs1, rs2, rd)
+#define vis_psub32(rs1,rs2,rd)		vis_dd2d(0x56, rs1, rs2, rd)
+#define vis_psub32s(rs1,rs2,rd)		vis_ss2s(0x57, rs1, rs2, rd)
+
+/* Pixel formatting instructions.  */
+
+#define vis_pack16(rs2,rd)		vis_d2s( 0x3b,      rs2, rd)
+#define vis_pack32(rs1,rs2,rd)		vis_dd2d(0x3a, rs1, rs2, rd)
+#define vis_packfix(rs2,rd)		vis_d2s( 0x3d,      rs2, rd)
+#define vis_expand(rs2,rd)		vis_s2d( 0x4d,      rs2, rd)
+#define vis_pmerge(rs1,rs2,rd)		vis_ss2d(0x4b, rs1, rs2, rd)
+
+/* Partitioned multiply instructions.  */
+
+#define vis_mul8x16(rs1,rs2,rd)		vis_sd2d(0x31, rs1, rs2, rd)
+#define vis_mul8x16au(rs1,rs2,rd)	vis_ss2d(0x33, rs1, rs2, rd)
+#define vis_mul8x16al(rs1,rs2,rd)	vis_ss2d(0x35, rs1, rs2, rd)
+#define vis_mul8sux16(rs1,rs2,rd)	vis_dd2d(0x36, rs1, rs2, rd)
+#define vis_mul8ulx16(rs1,rs2,rd)	vis_dd2d(0x37, rs1, rs2, rd)
+#define vis_muld8sux16(rs1,rs2,rd)	vis_ss2d(0x38, rs1, rs2, rd)
+#define vis_muld8ulx16(rs1,rs2,rd)	vis_ss2d(0x39, rs1, rs2, rd)
+
+/* Alignment instructions.  */
+
+static inline void *vis_alignaddr(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Registers are hard-coded in the opcode word (rs1 = rd = 1, rs2 = 0), */
+	ptr = _ptr;
+	/* so the pointer is pinned to g1 going in and read back from g1 after. */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x18) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(1)));
+
+	return ptr;
+}
+
+static inline void vis_alignaddr_g0(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Same encoding as vis_alignaddr (opf 0x18, rs1 = 1) but with rd = 0; */
+	ptr = _ptr;
+	/* nothing is returned. NOTE(review): presumably used for its GSR effect. */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x18) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(0)));
+}
+
+static inline void *vis_alignaddrl(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Like vis_alignaddr but with opf 0x19; registers are again hard-coded */
+	ptr = _ptr;
+	/* (rs1 = rd = 1, rs2 = 0), hence the pointer is pinned to g1. */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x19) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(1)));
+
+	return ptr;
+}
+
+static inline void vis_alignaddrl_g0(void *_ptr)
+{
+	register void *ptr asm("g1");
+	/* Same encoding as vis_alignaddrl (opf 0x19, rs1 = 1) but with rd = 0; */
+	ptr = _ptr;
+	/* nothing is returned. NOTE(review): presumably used for its GSR effect. */
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x19) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(0)));
+}
+
+#define vis_faligndata(rs1,rs2,rd)	vis_dd2d(0x48, rs1, rs2, rd)
+
+/* Logical operate instructions.  */
+
+#define vis_fzero(rd)			vis_d(   0x60,           rd)
+#define vis_fzeros(rd)			vis_s(   0x61,           rd)
+#define vis_fone(rd)			vis_d(   0x7e,           rd)
+#define vis_fones(rd)			vis_s(   0x7f,           rd)
+#define vis_src1(rs1,rd)		vis_d12d(0x74, rs1,      rd)
+#define vis_src1s(rs1,rd)		vis_s12s(0x75, rs1,      rd)
+#define vis_src2(rs2,rd)		vis_d22d(0x78,      rs2, rd)
+#define vis_src2s(rs2,rd)		vis_s22s(0x79,      rs2, rd)
+#define vis_not1(rs1,rd)		vis_d12d(0x6a, rs1,      rd)
+#define vis_not1s(rs1,rd)		vis_s12s(0x6b, rs1,      rd)
+#define vis_not2(rs2,rd)		vis_d22d(0x66,      rs2, rd)
+#define vis_not2s(rs2,rd)		vis_s22s(0x67,      rs2, rd)
+#define vis_or(rs1,rs2,rd)		vis_dd2d(0x7c, rs1, rs2, rd)
+#define vis_ors(rs1,rs2,rd)		vis_ss2s(0x7d, rs1, rs2, rd)
+#define vis_nor(rs1,rs2,rd)		vis_dd2d(0x62, rs1, rs2, rd)
+#define vis_nors(rs1,rs2,rd)		vis_ss2s(0x63, rs1, rs2, rd)
+#define vis_and(rs1,rs2,rd)		vis_dd2d(0x70, rs1, rs2, rd)
+#define vis_ands(rs1,rs2,rd)		vis_ss2s(0x71, rs1, rs2, rd)
+#define vis_nand(rs1,rs2,rd)		vis_dd2d(0x6e, rs1, rs2, rd)
+#define vis_nands(rs1,rs2,rd)		vis_ss2s(0x6f, rs1, rs2, rd)
+#define vis_xor(rs1,rs2,rd)		vis_dd2d(0x6c, rs1, rs2, rd)
+#define vis_xors(rs1,rs2,rd)		vis_ss2s(0x6d, rs1, rs2, rd)
+#define vis_xnor(rs1,rs2,rd)		vis_dd2d(0x72, rs1, rs2, rd)
+#define vis_xnors(rs1,rs2,rd)		vis_ss2s(0x73, rs1, rs2, rd)
+#define vis_ornot1(rs1,rs2,rd)		vis_dd2d(0x7a, rs1, rs2, rd)
+#define vis_ornot1s(rs1,rs2,rd)		vis_ss2s(0x7b, rs1, rs2, rd)
+#define vis_ornot2(rs1,rs2,rd)		vis_dd2d(0x76, rs1, rs2, rd)
+#define vis_ornot2s(rs1,rs2,rd)		vis_ss2s(0x77, rs1, rs2, rd)
+#define vis_andnot1(rs1,rs2,rd)		vis_dd2d(0x68, rs1, rs2, rd)
+#define vis_andnot1s(rs1,rs2,rd)	vis_ss2s(0x69, rs1, rs2, rd)
+#define vis_andnot2(rs1,rs2,rd)		vis_dd2d(0x64, rs1, rs2, rd)
+#define vis_andnot2s(rs1,rs2,rd)	vis_ss2s(0x65, rs1, rs2, rd)
+
+/* Pixel component distance.  */
+
+#define vis_pdist(rs1,rs2,rd)		vis_dd2d(0x3e, rs1, rs2, rd)
diff --git a/src/video_dec/libmpeg2/vlc.h b/src/video_dec/libmpeg2/vlc.h
new file mode 100644
index 000000000..65de9a840
--- /dev/null
+++ b/src/video_dec/libmpeg2/vlc.h
@@ -0,0 +1,428 @@
+/*
+ * vlc.h
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define GETWORD(bit_buf,shift,bit_ptr)				\
+do {								\
+    bit_buf |= ((bit_ptr[0] << 8) | bit_ptr[1]) << (shift);	\
+    bit_ptr += 2;						\
+} while (0)
+
+static inline void bitstream_init (picture_t * picture, uint8_t * start)
+{
+    picture->bitstream_buf =	/* cast: start[0] << 24 would overflow a signed int */
+	((uint32_t) start[0] << 24) | (start[1] << 16) | (start[2] << 8) | start[3];
+    picture->bitstream_ptr = start + 4;	/* first four bytes are now in the buffer */
+    picture->bitstream_bits = -16;
+}
+
+/* make sure that there are at least 16 valid bits in bit_buf */
+#define NEEDBITS(bit_buf,bits,bit_ptr)		\
+do {						\
+    if (bits > 0) {				\
+	GETWORD (bit_buf, bits, bit_ptr);	\
+	bits -= 16;				\
+    }						\
+} while (0)
+
+/* remove num valid bits from bit_buf */
+#define DUMPBITS(bit_buf,bits,num)	\
+do {					\
+    bit_buf <<= (num);			\
+    bits += (num);			\
+} while (0)
+
+/* take num bits from the high part of bit_buf and zero extend them */
+#define UBITS(bit_buf,num) (((uint32_t)(bit_buf)) >> (32 - (num)))
+
+/* take num bits from the high part of bit_buf and sign extend them */
+#define SBITS(bit_buf,num) (((int32_t)(bit_buf)) >> (32 - (num)))
+
+typedef struct {
+    uint8_t modes;
+    uint8_t len;
+} MBtab;
+
+typedef struct {
+    uint8_t delta;
+    uint8_t len;
+} MVtab;
+
+typedef struct {
+    int8_t dmv;
+    uint8_t len;
+} DMVtab;
+
+typedef struct {
+    uint8_t cbp;
+    uint8_t len;
+} CBPtab;
+
+typedef struct {
+    uint8_t size;
+    uint8_t len;
+} DCtab;
+
+typedef struct {
+    uint8_t run;
+    uint8_t level;
+    uint8_t len;
+} DCTtab;
+
+typedef struct {
+    uint8_t mba;
+    uint8_t len;
+} MBAtab;
+
+
+#define INTRA MACROBLOCK_INTRA
+#define QUANT MACROBLOCK_QUANT
+
+static const MBtab MB_I [] = {
+    {INTRA|QUANT, 2}, {INTRA, 1}
+};
+
+#define MC MACROBLOCK_MOTION_FORWARD
+#define CODED MACROBLOCK_PATTERN
+
+static const MBtab MB_P [] = {
+    {INTRA|QUANT, 6}, {CODED|QUANT, 5}, {MC|CODED|QUANT, 5}, {INTRA,    5},
+    {MC,          3}, {MC,          3}, {MC,             3}, {MC,       3},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1}
+};
+
+#define FWD MACROBLOCK_MOTION_FORWARD
+#define BWD MACROBLOCK_MOTION_BACKWARD
+#define INTER MACROBLOCK_MOTION_FORWARD|MACROBLOCK_MOTION_BACKWARD
+
+static const MBtab MB_B [] = {
+    {0,                 0}, {INTRA|QUANT,       6},
+    {BWD|CODED|QUANT,   6}, {FWD|CODED|QUANT,   6},
+    {INTER|CODED|QUANT, 5}, {INTER|CODED|QUANT, 5},
+					{INTRA,       5}, {INTRA,       5},
+    {FWD,         4}, {FWD,         4}, {FWD,         4}, {FWD,         4},
+    {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}
+};
+
+#undef INTRA
+#undef QUANT
+#undef MC
+#undef CODED
+#undef FWD
+#undef BWD
+#undef INTER
+
+
+static const MVtab MV_4 [] = {
+    { 3, 6}, { 2, 4}, { 1, 3}, { 1, 3}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}
+};
+
+static const MVtab MV_10 [] = {
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10},
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, {15,10}, {14,10}, {13,10}, {12,10},
+    {11,10}, {10,10}, { 9, 9}, { 9, 9}, { 8, 9}, { 8, 9}, { 7, 9}, { 7, 9},
+    { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7},
+    { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7},
+    { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}
+};
+
+
+static const DMVtab DMV_2 [] = {
+    { 0, 1}, { 0, 1}, { 1, 2}, {-1, 2}
+};
+
+
+static const CBPtab CBP_7 [] = {
+    {0x22, 7}, {0x12, 7}, {0x0a, 7}, {0x06, 7},
+    {0x21, 7}, {0x11, 7}, {0x09, 7}, {0x05, 7},
+    {0x3f, 6}, {0x3f, 6}, {0x03, 6}, {0x03, 6},
+    {0x24, 6}, {0x24, 6}, {0x18, 6}, {0x18, 6},
+    {0x3e, 5}, {0x3e, 5}, {0x3e, 5}, {0x3e, 5},
+    {0x02, 5}, {0x02, 5}, {0x02, 5}, {0x02, 5},
+    {0x3d, 5}, {0x3d, 5}, {0x3d, 5}, {0x3d, 5},
+    {0x01, 5}, {0x01, 5}, {0x01, 5}, {0x01, 5},
+    {0x38, 5}, {0x38, 5}, {0x38, 5}, {0x38, 5},
+    {0x34, 5}, {0x34, 5}, {0x34, 5}, {0x34, 5},
+    {0x2c, 5}, {0x2c, 5}, {0x2c, 5}, {0x2c, 5},
+    {0x1c, 5}, {0x1c, 5}, {0x1c, 5}, {0x1c, 5},
+    {0x28, 5}, {0x28, 5}, {0x28, 5}, {0x28, 5},
+    {0x14, 5}, {0x14, 5}, {0x14, 5}, {0x14, 5},
+    {0x30, 5}, {0x30, 5}, {0x30, 5}, {0x30, 5},
+    {0x0c, 5}, {0x0c, 5}, {0x0c, 5}, {0x0c, 5},
+    {0x20, 4}, {0x20, 4}, {0x20, 4}, {0x20, 4},
+    {0x20, 4}, {0x20, 4}, {0x20, 4}, {0x20, 4},
+    {0x10, 4}, {0x10, 4}, {0x10, 4}, {0x10, 4},
+    {0x10, 4}, {0x10, 4}, {0x10, 4}, {0x10, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3},
+    {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3}
+};
+
+static const CBPtab CBP_9 [] = {
+    {0,    0}, {0x00, 9}, {0x27, 9}, {0x1b, 9},
+    {0x3b, 9}, {0x37, 9}, {0x2f, 9}, {0x1f, 9},
+    {0x3a, 8}, {0x3a, 8}, {0x36, 8}, {0x36, 8},
+    {0x2e, 8}, {0x2e, 8}, {0x1e, 8}, {0x1e, 8},
+    {0x39, 8}, {0x39, 8}, {0x35, 8}, {0x35, 8},
+    {0x2d, 8}, {0x2d, 8}, {0x1d, 8}, {0x1d, 8},
+    {0x26, 8}, {0x26, 8}, {0x1a, 8}, {0x1a, 8},
+    {0x25, 8}, {0x25, 8}, {0x19, 8}, {0x19, 8},
+    {0x2b, 8}, {0x2b, 8}, {0x17, 8}, {0x17, 8},
+    {0x33, 8}, {0x33, 8}, {0x0f, 8}, {0x0f, 8},
+    {0x2a, 8}, {0x2a, 8}, {0x16, 8}, {0x16, 8},
+    {0x32, 8}, {0x32, 8}, {0x0e, 8}, {0x0e, 8},
+    {0x29, 8}, {0x29, 8}, {0x15, 8}, {0x15, 8},
+    {0x31, 8}, {0x31, 8}, {0x0d, 8}, {0x0d, 8},
+    {0x23, 8}, {0x23, 8}, {0x13, 8}, {0x13, 8},
+    {0x0b, 8}, {0x0b, 8}, {0x07, 8}, {0x07, 8}
+};
+
+
+static const DCtab DC_lum_5 [] = {
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
+    {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}
+};
+
+static const DCtab DC_chrom_5 [] = {
+    {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}
+};
+
+static const DCtab DC_long [] = {
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, { 7, 6}, { 7, 6},
+    {8, 7}, {8, 7}, {8, 7}, {8, 7}, {9, 8}, {9, 8}, {10, 9}, {11, 9}
+};
+
+
+static const DCTtab DCT_16 [] = {
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {  2,18, 0}, {  2,17, 0}, {  2,16, 0}, {  2,15, 0},
+    {  7, 3, 0}, { 17, 2, 0}, { 16, 2, 0}, { 15, 2, 0},
+    { 14, 2, 0}, { 13, 2, 0}, { 12, 2, 0}, { 32, 1, 0},
+    { 31, 1, 0}, { 30, 1, 0}, { 29, 1, 0}, { 28, 1, 0}
+};
+
+static const DCTtab DCT_15 [] = {
+    {  1,40,15}, {  1,39,15}, {  1,38,15}, {  1,37,15},
+    {  1,36,15}, {  1,35,15}, {  1,34,15}, {  1,33,15},
+    {  1,32,15}, {  2,14,15}, {  2,13,15}, {  2,12,15},
+    {  2,11,15}, {  2,10,15}, {  2, 9,15}, {  2, 8,15},
+    {  1,31,14}, {  1,31,14}, {  1,30,14}, {  1,30,14},
+    {  1,29,14}, {  1,29,14}, {  1,28,14}, {  1,28,14},
+    {  1,27,14}, {  1,27,14}, {  1,26,14}, {  1,26,14},
+    {  1,25,14}, {  1,25,14}, {  1,24,14}, {  1,24,14},
+    {  1,23,14}, {  1,23,14}, {  1,22,14}, {  1,22,14},
+    {  1,21,14}, {  1,21,14}, {  1,20,14}, {  1,20,14},
+    {  1,19,14}, {  1,19,14}, {  1,18,14}, {  1,18,14},
+    {  1,17,14}, {  1,17,14}, {  1,16,14}, {  1,16,14}
+};
+
+static const DCTtab DCT_13 [] = {  /* 48 entries: 16 single entries with last field 13, then 16 doubled entries with last field 12 */
+    { 11, 2,13}, { 10, 2,13}, {  6, 3,13}, {  4, 4,13},
+    {  3, 5,13}, {  2, 7,13}, {  2, 6,13}, {  1,15,13},
+    {  1,14,13}, {  1,13,13}, {  1,12,13}, { 27, 1,13},
+    { 26, 1,13}, { 25, 1,13}, { 24, 1,13}, { 23, 1,13},
+    {  1,11,12}, {  1,11,12}, {  9, 2,12}, {  9, 2,12},
+    {  5, 3,12}, {  5, 3,12}, {  1,10,12}, {  1,10,12},
+    {  3, 4,12}, {  3, 4,12}, {  8, 2,12}, {  8, 2,12},
+    { 22, 1,12}, { 22, 1,12}, { 21, 1,12}, { 21, 1,12},
+    {  1, 9,12}, {  1, 9,12}, { 20, 1,12}, { 20, 1,12},
+    { 19, 1,12}, { 19, 1,12}, {  2, 5,12}, {  2, 5,12},
+    {  4, 3,12}, {  4, 3,12}, {  1, 8,12}, {  1, 8,12},
+    {  7, 2,12}, {  7, 2,12}, { 18, 1,12}, { 18, 1,12}
+};
+
+static const DCTtab DCT_B14_10 [] = {  /* 8 entries, last field always 10 */
+    { 17, 1,10}, {  6, 2,10}, {  1, 7,10}, {  3, 3,10},
+    {  2, 4,10}, { 16, 1,10}, { 15, 1,10}, {  5, 2,10}
+};
+
+static const DCTtab DCT_B14_8 [] = {  /* 36 entries; an entry is repeated 4x, 2x or 1x where its last field is 6, 7 or 8 */
+    { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
+    {  3, 2, 7}, {  3, 2, 7}, { 10, 1, 7}, { 10, 1, 7},
+    {  1, 4, 7}, {  1, 4, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6},
+    {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6},
+    {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    { 14, 1, 8}, {  1, 6, 8}, { 13, 1, 8}, { 12, 1, 8},
+    {  4, 2, 8}, {  2, 3, 8}, {  1, 5, 8}, { 11, 1, 8}
+};
+
+static const DCTtab DCT_B14AC_5 [] = {  /* 27 entries; the short first row suggests the table is indexed with an offset -- TODO confirm at the use site */
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}
+};
+
+static const DCTtab DCT_B14DC_5 [] = {  /* 27 entries; same first 11 entries as DCT_B14AC_5, the remaining 16 are all { 1, 1, 1} */
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}
+};
+
+static const DCTtab DCT_B15_10 [] = {  /* 8 entries; entries with last field 9 appear twice, those with 10 once */
+    {  6, 2, 9}, {  6, 2, 9}, { 15, 1, 9}, { 15, 1, 9},
+    {  3, 4,10}, { 17, 1,10}, { 16, 1, 9}, { 16, 1, 9}
+};
+
+static const DCTtab DCT_B15_8 [] = {  /* 252 entries; repeat count halves as the last field grows (64x for 2 ... 1x for 8) */
+    { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
+    {  8, 1, 7}, {  8, 1, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  7, 1, 7}, {  7, 1, 7}, {  3, 2, 7}, {  3, 2, 7},
+    {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6},
+    {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6},
+    {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    {  2, 5, 8}, { 12, 1, 8}, {  1,11, 8}, {  1,10, 8},
+    { 14, 1, 8}, { 13, 1, 8}, {  4, 2, 8}, {  2, 4, 8},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    { 10, 1, 7}, { 10, 1, 7}, {  2, 3, 7}, {  2, 3, 7},
+    { 11, 1, 7}, { 11, 1, 7}, {  1, 8, 7}, {  1, 8, 7},
+    {  1, 9, 7}, {  1, 9, 7}, {  1,12, 8}, {  1,13, 8},
+    {  3, 3, 8}, {  5, 2, 8}, {  1,14, 8}, {  1,15, 8}
+};
+
+
+static const MBAtab MBA_5 [] = {  /* 30 entries; the short first row suggests offset indexing -- TODO confirm at the use site */
+		    {6, 5}, {5, 5}, {4, 4}, {4, 4}, {3, 4}, {3, 4},
+    {2, 3}, {2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}
+};
+
+static const MBAtab MBA_11 [] = {  /* 104 entries; first field runs from 32 down to 7, repeated 1x, 2x, 8x or 16x */
+    {32, 11}, {31, 11}, {30, 11}, {29, 11},
+    {28, 11}, {27, 11}, {26, 11}, {25, 11},
+    {24, 11}, {23, 11}, {22, 11}, {21, 11},
+    {20, 10}, {20, 10}, {19, 10}, {19, 10},
+    {18, 10}, {18, 10}, {17, 10}, {17, 10},
+    {16, 10}, {16, 10}, {15, 10}, {15, 10},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7}
+};
diff --git a/src/video_dec/libmpeg2/xine_mpeg2_decoder.c b/src/video_dec/libmpeg2/xine_mpeg2_decoder.c
new file mode 100644
index 000000000..c4c7fac2d
--- /dev/null
+++ b/src/video_dec/libmpeg2/xine_mpeg2_decoder.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2000-2003 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * stuff needed to turn libmpeg2 into a xine decoder plugin
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define LOG_MODULE "mpeg2_decoder"
+#define LOG_VERBOSE
+/*
+#define LOG
+*/
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include "mpeg2.h"
+#include "mpeg2_internal.h"
+#include <xine/buffer.h>
+
+typedef struct {  /* plugin class object, allocated by init_plugin() */
+  video_decoder_class_t   decoder_class;  /* keep first: open_plugin() casts the generic class pointer back to mpeg2_class_t */
+} mpeg2_class_t;
+
+
+typedef struct mpeg2dec_decoder_s {  /* one decoder instance, created by open_plugin() */
+  video_decoder_t  video_decoder;  /* keep first: every callback casts video_decoder_t* back to this struct */
+  mpeg2dec_t       mpeg2;          /* embedded libmpeg2 state handed to the mpeg2_*() calls */
+  mpeg2_class_t   *class;          /* class object this instance was opened from */
+  xine_stream_t   *stream;         /* owning stream; its video_out is opened in open_plugin() and closed in dispose */
+} mpeg2dec_decoder_t;
+
+static void mpeg2dec_decode_data (video_decoder_t *this_gen, buf_element_t *buf) {
+  /* Feed one buffer to libmpeg2, or apply the aspect hint it carries. */
+  mpeg2dec_decoder_t *const dec   = (mpeg2dec_decoder_t *) this_gen;
+  mpeg2dec_t         *const mpeg2 = &dec->mpeg2;
+
+  lprintf ("decode_data, flags=0x%08x ...\n", buf->decoder_flags);
+
+  if (buf->decoder_flags & BUF_FLAG_SPECIAL) {
+    /* Special buffers are never decoded; only the aspect hint
+     * (sent by xine-dvdnav) is acted upon. */
+    if (buf->decoder_info[1] == BUF_SPECIAL_ASPECT) {
+      mpeg2->force_aspect = buf->decoder_info[2];
+      /* letterboxing is denied, so pan&scan has to be forced */
+      mpeg2->force_pan_scan =
+        (buf->decoder_info[3] == 0x1 && buf->decoder_info[2] == 3) ? 1 : 0;
+    }
+    return;
+  }
+
+  if (!(buf->decoder_flags & BUF_FLAG_PREVIEW)) {
+    mpeg2_decode_data (mpeg2, buf->content, buf->content + buf->size, buf->pts);
+  } else {
+    /* preview buffers go to the sequence-header search instead of the decoder */
+    mpeg2_find_sequence_header (mpeg2, buf->content, buf->content + buf->size);
+  }
+
+  lprintf ("decode_data...done\n");
+}
+
+/* flush callback: forwards the request to mpeg2_flush() for this instance. */
+static void mpeg2dec_flush (video_decoder_t *this_gen) {
+  mpeg2dec_decoder_t *const dec = (mpeg2dec_decoder_t *) this_gen;
+
+  lprintf ("flush\n");
+  mpeg2_flush (&dec->mpeg2);
+}
+
+/* reset callback: hands the embedded libmpeg2 state to mpeg2_reset(). */
+static void mpeg2dec_reset (video_decoder_t *this_gen) {
+  mpeg2dec_decoder_t *const dec = (mpeg2dec_decoder_t *) this_gen;
+  mpeg2_reset (&dec->mpeg2);
+}
+
+/* discontinuity callback: passes the event on to mpeg2_discontinuity(). */
+static void mpeg2dec_discontinuity (video_decoder_t *this_gen) {
+  mpeg2dec_decoder_t *const dec = (mpeg2dec_decoder_t *) this_gen;
+  mpeg2_discontinuity (&dec->mpeg2);
+}
+
+/* dispose callback: closes the libmpeg2 state, closes the stream's
+ * video-out port, then frees the instance itself. */
+static void mpeg2dec_dispose (video_decoder_t *this_gen) {
+  mpeg2dec_decoder_t *const dec    = (mpeg2dec_decoder_t *) this_gen;
+  xine_stream_t      *const stream = dec->stream;
+
+  lprintf ("close\n");
+
+  mpeg2_close (&dec->mpeg2);
+  stream->video_out->close (stream->video_out, stream);
+  free (dec);
+}
+
+/* Allocate and wire up one decoder instance for `stream`; returns NULL when out of memory. */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  mpeg2dec_decoder_t *this = calloc (1, sizeof (*this));
+
+  if (!this) return NULL;  /* allocation failed: nothing has been opened yet */
+
+  this->video_decoder.decode_data         = mpeg2dec_decode_data;
+  this->video_decoder.flush               = mpeg2dec_flush;
+  this->video_decoder.reset               = mpeg2dec_reset;
+  this->video_decoder.discontinuity       = mpeg2dec_discontinuity;
+  this->video_decoder.dispose             = mpeg2dec_dispose;
+  this->stream                            = stream;
+  this->class                             = (mpeg2_class_t *) class_gen;
+  this->mpeg2.stream                      = stream;
+  mpeg2_init (&this->mpeg2, stream->video_out);
+  (stream->video_out->open) (stream->video_out, stream);
+  this->mpeg2.force_aspect = this->mpeg2.force_pan_scan = 0;  /* no aspect hint received yet */
+
+  return &this->video_decoder;
+}
+
+/*
+ * mpeg2 plugin class
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+  /* Builds the class object; `xine` and `data` are not used. Returns NULL when out of memory. */
+  mpeg2_class_t *this = calloc (1, sizeof (*this));
+
+  if (!this) return NULL;  /* allocation failed: do not touch the fields */
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "mpeg2dec";
+  this->decoder_class.description     = N_("mpeg2 based video decoder plugin");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+/*
+ * exported plugin catalog entry
+ */
+
+static const uint32_t supported_types[] = { BUF_VIDEO_MPEG, 0 };  /* buffer types this decoder accepts; the list ends with 0 */
+
+static const decoder_info_t dec_info_mpeg2 = {  /* passed as special_info in xine_plugin_info[] */
+  supported_types,     /* supported types */
+  7                    /* priority        */
+};
+
+const plugin_info_t xine_plugin_info[] EXPORTED = {  /* the only symbol this file exports */
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "mpeg2", XINE_VERSION_CODE, &dec_info_mpeg2, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }  /* PLUGIN_NONE entry: last element of the catalog */
+};
diff --git a/src/video_dec/libmpeg2/xvmc.h b/src/video_dec/libmpeg2/xvmc.h
new file mode 100644
index 000000000..5d61bcf83
--- /dev/null
+++ b/src/video_dec/libmpeg2/xvmc.h
@@ -0,0 +1,32 @@
+/*
+ * xvmc.h
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _XVMC_H
+#define _XVMC_H  /* completes the include guard so repeated inclusion is a no-op */
+
+#include "libmpeg2_accel.h"
+
+/* implemented in slice_xvmc.c */
+void mpeg2_xvmc_slice (mpeg2dec_accel_t *accel, picture_t * picture, int code, uint8_t * buffer);
+void xvmc_setup_scan_ptable( void );
+#endif /* _XVMC_H */
diff --git a/src/video_dec/libmpeg2/xvmc_vld.h b/src/video_dec/libmpeg2/xvmc_vld.h
new file mode 100644
index 000000000..561d1789d
--- /dev/null
+++ b/src/video_dec/libmpeg2/xvmc_vld.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2004 The Unichrome project. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTIES OR REPRESENTATIONS; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ *
+ */
+
+#ifndef _XVMC_VLD_H
+#define _XVMC_VLD_H
+
+#include "accel_xvmc.h"
+#include "xvmc.h"
+
+/* Prototypes of the mpeg2_xxmc_* routines; their definitions live outside this header. */
+void mpeg2_xxmc_slice (mpeg2dec_accel_t *accel, picture_t *picture, int code,
+                       uint8_t *buffer, uint32_t chunk_size, uint8_t *chunk_buffer);
+void mpeg2_xxmc_vld_frame_complete (mpeg2dec_accel_t *accel, picture_t *picture,
+                                    int code);
+
+#endif /* _XVMC_VLD_H */
diff --git a/src/video_dec/libmpeg2new/Makefile.am b/src/video_dec/libmpeg2new/Makefile.am
new file mode 100644
index 000000000..2ff66d089
--- /dev/null
+++ b/src/video_dec/libmpeg2new/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/misc/Makefile.quiet
+include $(top_builddir)/misc/Makefile.plugins
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS  = $(DEFAULT_OCFLAGS) $(VISIBILITY_FLAG)
+AM_LDFLAGS = $(xineplug_ldflags)
+## libmpeg2/ is a subdirectory; the plugin below links libmpeg2/libmpeg2.la from it.
+SUBDIRS = include libmpeg2
+## The plugin is only added to the build when ENABLE_MPEG2NEW is set.
+if ENABLE_MPEG2NEW
+mpeg2new_module = xineplug_decode_mpeg2new.la
+endif
+
+xineplug_LTLIBRARIES = $(mpeg2new_module)
+
+xineplug_decode_mpeg2new_la_SOURCES = \
+	xine_mpeg2new_decoder.c
+
+xineplug_decode_mpeg2new_la_LIBADD = $(XINE_LIB) libmpeg2/libmpeg2.la 
+xineplug_decode_mpeg2new_la_CFLAGS = $(AM_CFLAGS) $(MLIB_CFLAGS)
diff --git a/src/video_dec/libmpeg2new/include/Makefile.am b/src/video_dec/libmpeg2new/include/Makefile.am
new file mode 100644
index 000000000..d9c7a4df6
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/misc/Makefile.quiet
+include $(top_builddir)/misc/Makefile.plugins
+include $(top_srcdir)/misc/Makefile.common
+## The headers of this directory are listed for distribution only.
+EXTRA_DIST = video_out.h mmx.h alpha_asm.h vis.h attributes.h tendra.h mpeg2.h mpeg2convert.h
diff --git a/src/video_dec/libmpeg2new/include/alpha_asm.h b/src/video_dec/libmpeg2new/include/alpha_asm.h
new file mode 100644
index 000000000..bf1081f24
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/alpha_asm.h
@@ -0,0 +1,181 @@
+/*
+ * Alpha assembly macros
+ * Copyright (c) 2002-2003 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307	 USA
+ */
+
+#ifndef ALPHA_ASM_H
+#define ALPHA_ASM_H
+
+#include <inttypes.h>
+
+/* GNUC_PREREQ(maj, min): non-zero when the compiler is GCC maj.min or newer. */
+#ifndef __GNUC__
+# define GNUC_PREREQ(maj, min) 0
+#else
+# define GNUC_PREREQ(maj, min) (__GNUC__ > (maj) || (__GNUC__ == (maj) && __GNUC_MINOR__ >= (min)))
+#endif
+
+#define AMASK_BWX (1 << 0)  /* feature bits handed to amask() by the HAVE_*() macros */
+#define AMASK_FIX (1 << 1)
+#define AMASK_CIX (1 << 2)
+#define AMASK_MVI (1 << 8)
+
+#ifndef __alpha_bwx__  /* macro not predefined: test the feature bit with amask() */
+# define HAVE_BWX() (amask(AMASK_BWX) == 0)
+#else
+# define HAVE_BWX() 1
+#endif
+#ifndef __alpha_fix__
+# define HAVE_FIX() (amask(AMASK_FIX) == 0)
+#else
+# define HAVE_FIX() 1
+#endif
+#ifndef __alpha_max__  /* note: HAVE_MVI() is keyed on __alpha_max__ */
+# define HAVE_MVI() (amask(AMASK_MVI) == 0)
+#else
+# define HAVE_MVI() 1
+#endif
+#ifndef __alpha_cix__
+# define HAVE_CIX() (amask(AMASK_CIX) == 0)
+#else
+# define HAVE_CIX() 1
+#endif
+
+static inline uint64_t BYTE_VEC(uint64_t x)
+{
+    /* Doubling spread: 1 -> 2 -> 4 -> 8 copies of the input pattern. */
+    uint64_t pair = x | (x << 8);
+    uint64_t quad = pair | (pair << 16);
+    return quad | (quad << 32);
+}
+static inline uint64_t WORD_VEC(uint64_t x)
+{
+    /* Spread the input pattern into all four 16-bit lanes. */
+    uint64_t pair = x | (x << 16);
+    return pair | (pair << 32);
+}
+
+#define ldq(p) (*(const uint64_t *) (p))  /* read 64 bits through a plain pointer cast */
+#define ldl(p) (*(const int32_t *) (p))  /* read a signed 32-bit value */
+#define stl(l, p) do { *(uint32_t *) (p) = (l); } while (0)  /* store 32 bits of l at p */
+#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)  /* store 64 bits of l at p */
+#define sextw(x) ((int16_t) (x))  /* convert to int16_t */
+
+#ifdef __GNUC__
+struct unaligned_long { uint64_t l; } __attribute__((packed));  /* packed wrapper used by uldq() */
+#define ldq_u(p)     (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))  /* load from p with its low three address bits cleared */
+#define uldq(a)	     (((const struct unaligned_long *) (a))->l)  /* 64-bit load through the packed struct */
+
+#if GNUC_PREREQ(3,3)  /* GCC >= 3.3: map the names onto compiler builtins */
+#define prefetch(p)     __builtin_prefetch((p), 0, 1)
+#define prefetch_en(p)  __builtin_prefetch((p), 0, 0)
+#define prefetch_m(p)   __builtin_prefetch((p), 1, 1)
+#define prefetch_men(p) __builtin_prefetch((p), 1, 0)
+#define cmpbge	__builtin_alpha_cmpbge
+/* Cast the second operand to uint64_t to avoid warnings.  */
+#define extql(a, b)	__builtin_alpha_extql(a, (uint64_t) (b))
+#define extwl(a, b)	__builtin_alpha_extwl(a, (uint64_t) (b))
+#define extqh(a, b)	__builtin_alpha_extqh(a, (uint64_t) (b))
+#define zap	__builtin_alpha_zap
+#define zapnot	__builtin_alpha_zapnot
+#define amask	__builtin_alpha_amask
+#define implver	__builtin_alpha_implver
+#define rpcc	__builtin_alpha_rpcc
+#else  /* older GCC: same names implemented with inline asm statement expressions */
+#define prefetch(p)     asm volatile("ldl $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_en(p)  asm volatile("ldq $31,%0"  : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_m(p)   asm volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_men(p) asm volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extql(a, b)  ({ uint64_t __r; asm ("extql   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extwl(a, b)  ({ uint64_t __r; asm ("extwl   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define extqh(a, b)  ({ uint64_t __r; asm ("extqh   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zap(a, b)    ({ uint64_t __r; asm ("zap     %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
+#define amask(a)     ({ uint64_t __r; asm ("amask   %1,%0"      : "=r" (__r) : "rI"  (a));	     __r; })
+#define implver()    ({ uint64_t __r; asm ("implver %0"         : "=r" (__r));			     __r; })
+#define rpcc()	     ({ uint64_t __r; asm volatile ("rpcc %0"   : "=r" (__r));			     __r; })
+#endif
+#define wh64(p) asm volatile("wh64 (%0)" : : "r"(p) : "memory")  /* emits a wh64 instruction for address p; declared to clobber memory */
+
+#if GNUC_PREREQ(3,3) && defined(__alpha_max__)  /* builtins usable only when __alpha_max__ is predefined */
+#define minub8	__builtin_alpha_minub8
+#define minsb8	__builtin_alpha_minsb8
+#define minuw4	__builtin_alpha_minuw4
+#define minsw4	__builtin_alpha_minsw4
+#define maxub8	__builtin_alpha_maxub8
+#define maxsb8	__builtin_alpha_maxsb8
+#define maxuw4	__builtin_alpha_maxuw4	
+#define maxsw4	__builtin_alpha_maxsw4
+#define perr	__builtin_alpha_perr
+#define pklb	__builtin_alpha_pklb
+#define pkwb	__builtin_alpha_pkwb
+#define unpkbl	__builtin_alpha_unpkbl
+#define unpkbw	__builtin_alpha_unpkbw
+#else  /* otherwise emit the instructions by hand; ".arch ev6" is set in each asm string */
+#define minub8(a, b) ({ uint64_t __r; asm (".arch ev6; minub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; asm (".arch ev6; minsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; asm (".arch ev6; minuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; asm (".arch ev6; minsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; asm (".arch ev6; maxub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; asm (".arch ev6; maxsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b)   ({ uint64_t __r; asm (".arch ev6; perr    %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a)      ({ uint64_t __r; asm (".arch ev6; pklb    %r1,%0"     : "=r" (__r) : "rJ"  (a));	     __r; })
+#define pkwb(a)      ({ uint64_t __r; asm (".arch ev6; pkwb    %r1,%0"     : "=r" (__r) : "rJ"  (a));	     __r; })
+#define unpkbl(a)    ({ uint64_t __r; asm (".arch ev6; unpkbl  %r1,%0"     : "=r" (__r) : "rJ"  (a));	     __r; })
+#define unpkbw(a)    ({ uint64_t __r; asm (".arch ev6; unpkbw  %r1,%0"     : "=r" (__r) : "rJ"  (a));	     __r; })
+#endif
+
+#elif defined(__DECC)		/* Digital/Compaq/hp "ccc" compiler */
+
+#include <c_asm.h>  /* presumably supplies the asm() form used below -- verify against the DEC C documentation */
+#define ldq_u(a)     asm ("ldq_u   %v0,0(%a0)", a)
+#define uldq(a)	     (*(const __unaligned uint64_t *) (a))
+#define cmpbge(a, b) asm ("cmpbge  %a0,%a1,%v0", a, b)
+#define extql(a, b)  asm ("extql   %a0,%a1,%v0", a, b)
+#define extwl(a, b)  asm ("extwl   %a0,%a1,%v0", a, b)
+#define extqh(a, b)  asm ("extqh   %a0,%a1,%v0", a, b)
+#define zap(a, b)    asm ("zap     %a0,%a1,%v0", a, b)
+#define zapnot(a, b) asm ("zapnot  %a0,%a1,%v0", a, b)
+#define amask(a)     asm ("amask   %a0,%v0", a)
+#define implver()    asm ("implver %v0")
+#define rpcc()	     asm ("rpcc	   %v0")
+#define minub8(a, b) asm ("minub8  %a0,%a1,%v0", a, b)
+#define minsb8(a, b) asm ("minsb8  %a0,%a1,%v0", a, b)
+#define minuw4(a, b) asm ("minuw4  %a0,%a1,%v0", a, b)
+#define minsw4(a, b) asm ("minsw4  %a0,%a1,%v0", a, b)
+#define maxub8(a, b) asm ("maxub8  %a0,%a1,%v0", a, b)
+#define maxsb8(a, b) asm ("maxsb8  %a0,%a1,%v0", a, b)
+#define maxuw4(a, b) asm ("maxuw4  %a0,%a1,%v0", a, b)
+#define maxsw4(a, b) asm ("maxsw4  %a0,%a1,%v0", a, b)
+#define perr(a, b)   asm ("perr    %a0,%a1,%v0", a, b)
+#define pklb(a)      asm ("pklb    %a0,%v0", a)
+#define pkwb(a)      asm ("pkwb    %a0,%v0", a)
+#define unpkbl(a)    asm ("unpkbl  %a0,%v0", a)
+#define unpkbw(a)    asm ("unpkbw  %a0,%v0", a)
+#define wh64(a)      asm ("wh64    %a0", a)  /* note: no prefetch*() macros are defined in this branch */
+
+#else
+#error "Unknown compiler!"
+#endif
+
+#endif /* ALPHA_ASM_H */
diff --git a/src/video_dec/libmpeg2new/include/attributes.h b/src/video_dec/libmpeg2new/include/attributes.h
new file mode 100644
index 000000000..83f1364a2
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/attributes.h
@@ -0,0 +1,33 @@
+/*
+ * attributes.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* use gcc attribs to align critical data structures */
+#include <xine/attributes.h>
+
+#ifdef HAVE_BUILTIN_EXPECT  /* branch hints; both variants evaluate to 0 or 1 */
+#define likely(x) __builtin_expect ((x) != 0, 1)
+#define unlikely(x) __builtin_expect ((x) != 0, 0)
+#else  /* no hint available: still normalise so the value matches the builtin form */
+#define likely(x) ((x) != 0)
+#define unlikely(x) ((x) != 0)
+#endif
diff --git a/src/video_dec/libmpeg2new/include/mmx.h b/src/video_dec/libmpeg2new/include/mmx.h
new file mode 100644
index 000000000..08b4d4776
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/mmx.h
@@ -0,0 +1,263 @@
+/*
+ * mmx.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * The type of a value that fits in an MMX register (note that long
+ * long constant values MUST be suffixed by LL and unsigned long long
+ * values by ULL, lest they be truncated by the compiler)
+ */
+
+typedef	union {	/* alternative views of one MMX operand */
+	long long		q;	/* Quadword (64-bit) value */
+	unsigned long long	uq;	/* Unsigned Quadword */
+	int			d[2];	/* 2 Doubleword (32-bit) values */
+	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
+	short			w[4];	/* 4 Word (16-bit) values */
+	unsigned short		uw[4];	/* 4 Unsigned Word */
+	char			b[8];	/* 8 Byte (8-bit) values */
+	unsigned char		ub[8];	/* 8 Unsigned Byte */
+	float			s[2];	/* 2 Single-precision (32-bit) values */
+} ATTR_ALIGN(8) mmx_t;	/* On an 8-byte (64-bit) boundary */
+
+
+#define	mmx_i2r(op,imm,reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* no outputs */ \
+			      : "i" (imm) )	/* "i": imm must be a constant */
+/* op mem, %reg -- the source is a memory operand ("m") */
+#define	mmx_m2r(op,mem,reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* no outputs */ \
+			      : "m" (mem))
+/* op %reg, mem -- mem is written, so it is declared as an output ("=m") */
+#define	mmx_r2m(op,reg,mem) \
+	__asm__ __volatile__ (#op " %%" #reg ", %0" \
+			      : "=m" (mem) \
+			      : /* no inputs */ )
+/* op %regs, %regd -- no operand list here, so '%' is written once, not "%%" */
+#define	mmx_r2r(op,regs,regd) \
+	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
+
+/* EMMS: empty the MMX state (see the EMMS entry in the Intel SDM) */
+#define	emms() __asm__ __volatile__ ("emms")
+
+#define	movd_m2r(var,reg)	mmx_m2r (movd, var, reg)	/* 32-bit moves */
+#define	movd_r2m(reg,var)	mmx_r2m (movd, reg, var)
+#define	movd_v2r(var,reg)	__asm__ __volatile__ ("movd %0, %%" #reg \
+						      : /* no outputs */ \
+						      : "rm" (var))	/* "rm": var may be in a register or in memory */
+#define	movd_r2v(reg,var)	__asm__ __volatile__ ("movd %%" #reg ", %0" \
+						      : "=rm" (var) \
+						      : /* no inputs */ )
+/* 64-bit moves */
+#define	movq_m2r(var,reg)	mmx_m2r (movq, var, reg)
+#define	movq_r2m(reg,var)	mmx_r2m (movq, reg, var)
+#define	movq_r2r(regs,regd)	mmx_r2r (movq, regs, regd)
+/* pack with signed saturation */
+#define	packssdw_m2r(var,reg)	mmx_m2r (packssdw, var, reg)
+#define	packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
+#define	packsswb_m2r(var,reg)	mmx_m2r (packsswb, var, reg)
+#define	packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
+/* pack with unsigned saturation */
+#define	packuswb_m2r(var,reg)	mmx_m2r (packuswb, var, reg)
+#define	packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
+/* packed add, wrap-around */
+#define	paddb_m2r(var,reg)	mmx_m2r (paddb, var, reg)
+#define	paddb_r2r(regs,regd)	mmx_r2r (paddb, regs, regd)
+#define	paddd_m2r(var,reg)	mmx_m2r (paddd, var, reg)
+#define	paddd_r2r(regs,regd)	mmx_r2r (paddd, regs, regd)
+#define	paddw_m2r(var,reg)	mmx_m2r (paddw, var, reg)
+#define	paddw_r2r(regs,regd)	mmx_r2r (paddw, regs, regd)
+/* packed add, signed saturation */
+#define	paddsb_m2r(var,reg)	mmx_m2r (paddsb, var, reg)
+#define	paddsb_r2r(regs,regd)	mmx_r2r (paddsb, regs, regd)
+#define	paddsw_m2r(var,reg)	mmx_m2r (paddsw, var, reg)
+#define	paddsw_r2r(regs,regd)	mmx_r2r (paddsw, regs, regd)
+/* packed add, unsigned saturation */
+#define	paddusb_m2r(var,reg)	mmx_m2r (paddusb, var, reg)
+#define	paddusb_r2r(regs,regd)	mmx_r2r (paddusb, regs, regd)
+#define	paddusw_m2r(var,reg)	mmx_m2r (paddusw, var, reg)
+#define	paddusw_r2r(regs,regd)	mmx_r2r (paddusw, regs, regd)
+/* bitwise AND */
+#define	pand_m2r(var,reg)	mmx_m2r (pand, var, reg)
+#define	pand_r2r(regs,regd)	mmx_r2r (pand, regs, regd)
+/* bitwise AND-NOT */
+#define	pandn_m2r(var,reg)	mmx_m2r (pandn, var, reg)
+#define	pandn_r2r(regs,regd)	mmx_r2r (pandn, regs, regd)
+/* packed compare for equality */
+#define	pcmpeqb_m2r(var,reg)	mmx_m2r (pcmpeqb, var, reg)
+#define	pcmpeqb_r2r(regs,regd)	mmx_r2r (pcmpeqb, regs, regd)
+#define	pcmpeqd_m2r(var,reg)	mmx_m2r (pcmpeqd, var, reg)
+#define	pcmpeqd_r2r(regs,regd)	mmx_r2r (pcmpeqd, regs, regd)
+#define	pcmpeqw_m2r(var,reg)	mmx_m2r (pcmpeqw, var, reg)
+#define	pcmpeqw_r2r(regs,regd)	mmx_r2r (pcmpeqw, regs, regd)
+/* packed signed compare for greater-than */
+#define	pcmpgtb_m2r(var,reg)	mmx_m2r (pcmpgtb, var, reg)
+#define	pcmpgtb_r2r(regs,regd)	mmx_r2r (pcmpgtb, regs, regd)
+#define	pcmpgtd_m2r(var,reg)	mmx_m2r (pcmpgtd, var, reg)
+#define	pcmpgtd_r2r(regs,regd)	mmx_r2r (pcmpgtd, regs, regd)
+#define	pcmpgtw_m2r(var,reg)	mmx_m2r (pcmpgtw, var, reg)
+#define	pcmpgtw_r2r(regs,regd)	mmx_r2r (pcmpgtw, regs, regd)
+/* multiply words and add adjacent pairs */
+#define	pmaddwd_m2r(var,reg)	mmx_m2r (pmaddwd, var, reg)
+#define	pmaddwd_r2r(regs,regd)	mmx_r2r (pmaddwd, regs, regd)
+/* multiply signed words, keep the high halves */
+#define	pmulhw_m2r(var,reg)	mmx_m2r (pmulhw, var, reg)
+#define	pmulhw_r2r(regs,regd)	mmx_r2r (pmulhw, regs, regd)
+/* multiply words, keep the low halves */
+#define	pmullw_m2r(var,reg)	mmx_m2r (pmullw, var, reg)
+#define	pmullw_r2r(regs,regd)	mmx_r2r (pmullw, regs, regd)
+/* bitwise OR */
+#define	por_m2r(var,reg)	mmx_m2r (por, var, reg)
+#define	por_r2r(regs,regd)	mmx_r2r (por, regs, regd)
+/* packed shift left logical */
+#define	pslld_i2r(imm,reg)	mmx_i2r (pslld, imm, reg)
+#define	pslld_m2r(var,reg)	mmx_m2r (pslld, var, reg)
+#define	pslld_r2r(regs,regd)	mmx_r2r (pslld, regs, regd)
+#define	psllq_i2r(imm,reg)	mmx_i2r (psllq, imm, reg)
+#define	psllq_m2r(var,reg)	mmx_m2r (psllq, var, reg)
+#define	psllq_r2r(regs,regd)	mmx_r2r (psllq, regs, regd)
+#define	psllw_i2r(imm,reg)	mmx_i2r (psllw, imm, reg)
+#define	psllw_m2r(var,reg)	mmx_m2r (psllw, var, reg)
+#define	psllw_r2r(regs,regd)	mmx_r2r (psllw, regs, regd)
+/* packed shift right arithmetic */
+#define	psrad_i2r(imm,reg)	mmx_i2r (psrad, imm, reg)
+#define	psrad_m2r(var,reg)	mmx_m2r (psrad, var, reg)
+#define	psrad_r2r(regs,regd)	mmx_r2r (psrad, regs, regd)
+#define	psraw_i2r(imm,reg)	mmx_i2r (psraw, imm, reg)
+#define	psraw_m2r(var,reg)	mmx_m2r (psraw, var, reg)
+#define	psraw_r2r(regs,regd)	mmx_r2r (psraw, regs, regd)
+/* packed shift right logical */
+#define	psrld_i2r(imm,reg)	mmx_i2r (psrld, imm, reg)
+#define	psrld_m2r(var,reg)	mmx_m2r (psrld, var, reg)
+#define	psrld_r2r(regs,regd)	mmx_r2r (psrld, regs, regd)
+#define	psrlq_i2r(imm,reg)	mmx_i2r (psrlq, imm, reg)
+#define	psrlq_m2r(var,reg)	mmx_m2r (psrlq, var, reg)
+#define	psrlq_r2r(regs,regd)	mmx_r2r (psrlq, regs, regd)
+#define	psrlw_i2r(imm,reg)	mmx_i2r (psrlw, imm, reg)
+#define	psrlw_m2r(var,reg)	mmx_m2r (psrlw, var, reg)
+#define	psrlw_r2r(regs,regd)	mmx_r2r (psrlw, regs, regd)
+/* packed subtract, wrap-around */
+#define	psubb_m2r(var,reg)	mmx_m2r (psubb, var, reg)
+#define	psubb_r2r(regs,regd)	mmx_r2r (psubb, regs, regd)
+#define	psubd_m2r(var,reg)	mmx_m2r (psubd, var, reg)
+#define	psubd_r2r(regs,regd)	mmx_r2r (psubd, regs, regd)
+#define	psubw_m2r(var,reg)	mmx_m2r (psubw, var, reg)
+#define	psubw_r2r(regs,regd)	mmx_r2r (psubw, regs, regd)
+/* packed subtract, signed saturation */
+#define	psubsb_m2r(var,reg)	mmx_m2r (psubsb, var, reg)
+#define	psubsb_r2r(regs,regd)	mmx_r2r (psubsb, regs, regd)
+#define	psubsw_m2r(var,reg)	mmx_m2r (psubsw, var, reg)
+#define	psubsw_r2r(regs,regd)	mmx_r2r (psubsw, regs, regd)
+/* packed subtract, unsigned saturation */
+#define	psubusb_m2r(var,reg)	mmx_m2r (psubusb, var, reg)
+#define	psubusb_r2r(regs,regd)	mmx_r2r (psubusb, regs, regd)
+#define	psubusw_m2r(var,reg)	mmx_m2r (psubusw, var, reg)
+#define	psubusw_r2r(regs,regd)	mmx_r2r (psubusw, regs, regd)
+/* interleave the high-order elements */
+#define	punpckhbw_m2r(var,reg)		mmx_m2r (punpckhbw, var, reg)
+#define	punpckhbw_r2r(regs,regd)	mmx_r2r (punpckhbw, regs, regd)
+#define	punpckhdq_m2r(var,reg)		mmx_m2r (punpckhdq, var, reg)
+#define	punpckhdq_r2r(regs,regd)	mmx_r2r (punpckhdq, regs, regd)
+#define	punpckhwd_m2r(var,reg)		mmx_m2r (punpckhwd, var, reg)
+#define	punpckhwd_r2r(regs,regd)	mmx_r2r (punpckhwd, regs, regd)
+/* interleave the low-order elements */
+#define	punpcklbw_m2r(var,reg) 		mmx_m2r (punpcklbw, var, reg)
+#define	punpcklbw_r2r(regs,regd)	mmx_r2r (punpcklbw, regs, regd)
+#define	punpckldq_m2r(var,reg)		mmx_m2r (punpckldq, var, reg)
+#define	punpckldq_r2r(regs,regd)	mmx_r2r (punpckldq, regs, regd)
+#define	punpcklwd_m2r(var,reg)		mmx_m2r (punpcklwd, var, reg)
+#define	punpcklwd_r2r(regs,regd)	mmx_r2r (punpcklwd, regs, regd)
+/* bitwise XOR */
+#define	pxor_m2r(var,reg)	mmx_m2r (pxor, var, reg)
+#define	pxor_r2r(regs,regd)	mmx_r2r (pxor, regs, regd)
+
+
+/* 3DNOW extensions */
+
+#define pavgusb_m2r(var,reg)	mmx_m2r (pavgusb, var, reg)	/* packed average of unsigned bytes */
+#define pavgusb_r2r(regs,regd)	mmx_r2r (pavgusb, regs, regd)
+
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define mmx_m2ri(op,mem,reg,imm) \
+	__asm__ __volatile__ (#op " %1, %0, %%" #reg \
+			      : /* no outputs; emitted as: op imm, mem, %reg */ \
+			      : "m" (mem), "i" (imm))
+/* op imm, %regs, %regd -- imm must be a constant ("i") */
+#define mmx_r2ri(op,regs,regd,imm) \
+	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+			      : /* no outputs */ \
+			      : "i" (imm) )
+/* prefetch<hint> mem -- hint is pasted onto the mnemonic (t0, t1, t2, nta) */
+#define	mmx_fetch(mem,hint) \
+	__asm__ __volatile__ ("prefetch" #hint " %0" \
+			      : /* no outputs */ \
+			      : "m" (mem))
+
+
+#define	maskmovq(regs,maskreg)		mmx_r2r (maskmovq, regs, maskreg)	/* two registers, no immediate; NOTE(review): confirm operand order (AT&T: mask, data) */
+
+#define	movntq_r2m(mmreg,var)		mmx_r2m (movntq, mmreg, var)	/* non-temporal 64-bit store */
+/* packed unsigned average */
+#define	pavgb_m2r(var,reg)		mmx_m2r (pavgb, var, reg)
+#define	pavgb_r2r(regs,regd)		mmx_r2r (pavgb, regs, regd)
+#define	pavgw_m2r(var,reg)		mmx_m2r (pavgw, var, reg)
+#define	pavgw_r2r(regs,regd)		mmx_r2r (pavgw, regs, regd)
+/* extract the word selected by imm into a general register */
+#define	pextrw_r2r(mmreg,reg,imm)	mmx_r2ri (pextrw, mmreg, reg, imm)
+/* insert a word from a general register at the position selected by imm */
+#define	pinsrw_r2r(reg,mmreg,imm)	mmx_r2ri (pinsrw, reg, mmreg, imm)
+/* packed signed-word maximum */
+#define	pmaxsw_m2r(var,reg)		mmx_m2r (pmaxsw, var, reg)
+#define	pmaxsw_r2r(regs,regd)		mmx_r2r (pmaxsw, regs, regd)
+/* packed unsigned-byte maximum */
+#define	pmaxub_m2r(var,reg)		mmx_m2r (pmaxub, var, reg)
+#define	pmaxub_r2r(regs,regd)		mmx_r2r (pmaxub, regs, regd)
+/* packed signed-word minimum */
+#define	pminsw_m2r(var,reg)		mmx_m2r (pminsw, var, reg)
+#define	pminsw_r2r(regs,regd)		mmx_r2r (pminsw, regs, regd)
+/* packed unsigned-byte minimum */
+#define	pminub_m2r(var,reg)		mmx_m2r (pminub, var, reg)
+#define	pminub_r2r(regs,regd)		mmx_r2r (pminub, regs, regd)
+
+#define	pmovmskb(mmreg,reg) \
+	__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)	/* byte sign mask of mmreg -> general register reg */
+
+#define	pmulhuw_m2r(var,reg)		mmx_m2r (pmulhuw, var, reg)	/* multiply unsigned words, keep the high halves */
+#define	pmulhuw_r2r(regs,regd)		mmx_r2r (pmulhuw, regs, regd)
+/* prefetch hints: the second argument is pasted onto "prefetch" by mmx_fetch */
+#define	prefetcht0(mem)			mmx_fetch (mem, t0)
+#define	prefetcht1(mem)			mmx_fetch (mem, t1)
+#define	prefetcht2(mem)			mmx_fetch (mem, t2)
+#define	prefetchnta(mem)		mmx_fetch (mem, nta)
+/* sum of absolute differences of unsigned bytes */
+#define	psadbw_m2r(var,reg)		mmx_m2r (psadbw, var, reg)
+#define	psadbw_r2r(regs,regd)		mmx_r2r (psadbw, regs, regd)
+/* shuffle words; imm selects the source word for each destination word */
+#define	pshufw_m2r(var,reg,imm)		mmx_m2ri(pshufw, var, reg, imm)
+#define	pshufw_r2r(regs,regd,imm)	mmx_r2ri(pshufw, regs, regd, imm)
+/* store fence */
+#define	sfence() __asm__ __volatile__ ("sfence\n\t")
diff --git a/src/video_dec/libmpeg2new/include/mpeg2.h b/src/video_dec/libmpeg2new/include/mpeg2.h
new file mode 100644
index 000000000..6c1a3805b
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/mpeg2.h
@@ -0,0 +1,202 @@
+/*
+ * mpeg2.h
+ * Copyright (C) 2000-2004 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef MPEG2_H
+#define MPEG2_H
+
+#define MPEG2_VERSION(a,b,c) (((a)<<16)|((b)<<8)|(c))	/* compares numerically as long as b and c stay below 256 */
+#define MPEG2_RELEASE MPEG2_VERSION (0, 4, 1)	/* 0.4.1 */
+
+#define SEQ_FLAG_MPEG2 1	/* SEQ_FLAG_*: distinct single-bit values */
+#define SEQ_FLAG_CONSTRAINED_PARAMETERS 2
+#define SEQ_FLAG_PROGRESSIVE_SEQUENCE 4
+#define SEQ_FLAG_LOW_DELAY 8
+#define SEQ_FLAG_COLOUR_DESCRIPTION 16
+/* video format: one value in the 3-bit field (bits 5-7) selected by the mask */
+#define SEQ_MASK_VIDEO_FORMAT 0xe0
+#define SEQ_VIDEO_FORMAT_COMPONENT 0
+#define SEQ_VIDEO_FORMAT_PAL 0x20
+#define SEQ_VIDEO_FORMAT_NTSC 0x40
+#define SEQ_VIDEO_FORMAT_SECAM 0x60
+#define SEQ_VIDEO_FORMAT_MAC 0x80
+#define SEQ_VIDEO_FORMAT_UNSPECIFIED 0xa0
+
+typedef struct mpeg2_sequence_s {	/* sequence-level parameters; exposed via mpeg2_info_t.sequence */
+    unsigned int width, height;
+    unsigned int chroma_width, chroma_height;
+    unsigned int byte_rate;	/* NOTE(review): unit not visible in this header */
+    unsigned int vbv_buffer_size;
+    uint32_t flags;	/* presumably SEQ_FLAG_* and SEQ_VIDEO_FORMAT_* bits -- confirm */
+
+    unsigned int picture_width, picture_height;
+    unsigned int display_width, display_height;
+    unsigned int pixel_width, pixel_height;
+    unsigned int frame_period;	/* NOTE(review): unit not visible in this header */
+
+    uint8_t profile_level_id;
+    uint8_t colour_primaries;
+    uint8_t transfer_characteristics;
+    uint8_t matrix_coefficients;
+} mpeg2_sequence_t;
+
+#define GOP_FLAG_DROP_FRAME 1	/* GOP_FLAG_*: distinct single-bit values */
+#define GOP_FLAG_BROKEN_LINK 2
+#define GOP_FLAG_CLOSED_GOP 4
+
+typedef struct mpeg2_gop_s {	/* group-of-pictures data; exposed via mpeg2_info_t.gop */
+    uint8_t hours;
+    uint8_t minutes;
+    uint8_t seconds;
+    uint8_t pictures;
+    uint32_t flags;	/* presumably GOP_FLAG_* bits -- confirm */
+} mpeg2_gop_t;
+
+#define PIC_MASK_CODING_TYPE 7	/* low three bits: a value field, not independent bits (B == 3 == I|P) */
+#define PIC_FLAG_CODING_TYPE_I 1
+#define PIC_FLAG_CODING_TYPE_P 2
+#define PIC_FLAG_CODING_TYPE_B 3
+#define PIC_FLAG_CODING_TYPE_D 4
+/* the remaining PIC_FLAG_* values are distinct single bits above the coding type */
+#define PIC_FLAG_TOP_FIELD_FIRST 8
+#define PIC_FLAG_PROGRESSIVE_FRAME 16
+#define PIC_FLAG_COMPOSITE_DISPLAY 32
+#define PIC_FLAG_SKIP 64
+#define PIC_FLAG_TAGS 128
+#define PIC_MASK_COMPOSITE_DISPLAY 0xfffff000	/* upper 20 bits */
+
+typedef struct mpeg2_picture_s {	/* per-picture data; exposed via the mpeg2_info_t picture pointers */
+    unsigned int temporal_reference;
+    unsigned int nb_fields;
+    uint32_t tag, tag2;	/* presumably the values given to mpeg2_tag_picture() -- confirm */
+    uint32_t flags;	/* presumably PIC_FLAG_* / PIC_MASK_* bits -- confirm */
+    struct {
+	int x, y;
+    } display_offset[3];
+} mpeg2_picture_t;
+
+typedef struct mpeg2_fbuf_s {	/* one frame buffer: three byte pointers plus a caller-defined id */
+    uint8_t * buf[3];	/* presumably one pointer per plane (Y, U, V) -- confirm */
+    void * id;
+} mpeg2_fbuf_t;
+
+typedef struct mpeg2_info_s {	/* what mpeg2_info() returns a pointer to; every member points to const data */
+    const mpeg2_sequence_t * sequence;
+    const mpeg2_gop_t * gop;
+    const mpeg2_picture_t * current_picture;
+    const mpeg2_picture_t * current_picture_2nd;
+    const mpeg2_fbuf_t * current_fbuf;
+    const mpeg2_picture_t * display_picture;
+    const mpeg2_picture_t * display_picture_2nd;
+    const mpeg2_fbuf_t * display_fbuf;
+    const mpeg2_fbuf_t * discard_fbuf;
+    const uint8_t * user_data;
+    unsigned int user_data_len;	/* presumably the byte length of user_data -- confirm */
+} mpeg2_info_t;
+
+typedef struct mpeg2dec_s mpeg2dec_t;	/* incomplete type here: used only through pointers in this header */
+typedef struct mpeg2_decoder_s mpeg2_decoder_t;	/* likewise incomplete in this header */
+
+typedef enum {	/* return type of mpeg2_parse(); values are fixed explicitly */
+    STATE_BUFFER = 0,
+    STATE_SEQUENCE = 1,
+    STATE_SEQUENCE_REPEATED = 2,
+    STATE_SEQUENCE_MODIFIED = 3,
+    STATE_GOP = 4,
+    STATE_PICTURE = 5,
+    STATE_SLICE_1ST = 6,
+    STATE_PICTURE_2ND = 7,
+    STATE_SLICE = 8,
+    STATE_END = 9,
+    STATE_INVALID = 10,
+    STATE_INVALID_END = 11
+} mpeg2_state_t;
+
+typedef struct mpeg2_convert_init_s {	/* filled in through the result argument of an mpeg2_convert_t function */
+    unsigned int id_size;
+    unsigned int buf_size[3];
+    void (* start) (void * id, const mpeg2_fbuf_t * fbuf,
+		    const mpeg2_picture_t * picture, const mpeg2_gop_t * gop);
+    void (* copy) (void * id, uint8_t * const * src, unsigned int v_offset);
+} mpeg2_convert_init_t;
+typedef enum {	/* presumably the values passed as the stage argument below -- confirm */
+    MPEG2_CONVERT_SET = 0,
+    MPEG2_CONVERT_STRIDE = 1,
+    MPEG2_CONVERT_START = 2
+} mpeg2_convert_stage_t;
+typedef int mpeg2_convert_t (int stage, void * id,	/* a function type, not a pointer type */
+			     const mpeg2_sequence_t * sequence, int stride,
+			     uint32_t accel, void * arg,
+			     mpeg2_convert_init_t * result);
+int mpeg2_convert (mpeg2dec_t * mpeg2dec, mpeg2_convert_t convert, void * arg);	/* convert is adjusted to a function pointer */
+int mpeg2_stride (mpeg2dec_t * mpeg2dec, int stride);
+void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id);
+void mpeg2_custom_fbuf (mpeg2dec_t * mpeg2dec, int custom_fbuf);
+
+#define MPEG2_ACCEL_X86_MMX 1	/* bit values are reused across architectures (x86, PPC, Alpha, SPARC) */
+#define MPEG2_ACCEL_X86_3DNOW 2
+#define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_X86_SSE2 8
+#define MPEG2_ACCEL_X86_SSE3 16
+#define MPEG2_ACCEL_PPC_ALTIVEC 1
+#define MPEG2_ACCEL_ALPHA 1
+#define MPEG2_ACCEL_ALPHA_MVI 2
+#define MPEG2_ACCEL_SPARC_VIS 1
+#define MPEG2_ACCEL_SPARC_VIS2 2
+#define MPEG2_ACCEL_DETECT 0x80000000	/* top bit of a 32-bit accel word */
+
+uint32_t mpeg2_accel (uint32_t accel);	/* presumably takes and returns MPEG2_ACCEL_* bits -- confirm */
+mpeg2dec_t * mpeg2_init (void);	/* declarations only: the implementations are not in this header */
+const mpeg2_info_t * mpeg2_info (mpeg2dec_t * mpeg2dec);
+void mpeg2_close (mpeg2dec_t * mpeg2dec);
+
+void mpeg2_buffer (mpeg2dec_t * mpeg2dec, uint8_t * start, uint8_t * end);	/* presumably feeds the bytes in [start, end) -- confirm */
+int mpeg2_getpos (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec);
+
+void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset);
+void mpeg2_skip (mpeg2dec_t * mpeg2dec, int skip);
+void mpeg2_slice_region (mpeg2dec_t * mpeg2dec, int start, int end);
+
+void mpeg2_tag_picture (mpeg2dec_t * mpeg2dec, uint32_t tag, uint32_t tag2);
+/* the functions below take the mpeg2_decoder_t or a bare sequence instead of the mpeg2dec_t */
+void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
+		      uint8_t * forward_fbuf[3], uint8_t * backward_fbuf[3]);
+void mpeg2_slice (mpeg2_decoder_t * decoder, int code, const uint8_t * buffer);
+int mpeg2_guess_aspect (const mpeg2_sequence_t * sequence,
+			unsigned int * pixel_width,
+			unsigned int * pixel_height);
+
+typedef enum {	/* the reason argument of mpeg2_malloc() */
+    MPEG2_ALLOC_MPEG2DEC = 0,
+    MPEG2_ALLOC_CHUNK = 1,
+    MPEG2_ALLOC_YUV = 2,
+    MPEG2_ALLOC_CONVERT_ID = 3,
+    MPEG2_ALLOC_CONVERTED = 4
+} mpeg2_alloc_t;
+/* size is a plain unsigned, not size_t */
+void * mpeg2_malloc (unsigned size, mpeg2_alloc_t reason);
+void mpeg2_free (void * buf);
+void mpeg2_malloc_hooks (void * malloc (unsigned, mpeg2_alloc_t),	/* function-typed parameters, adjusted to pointers */
+			 int free (void *));	/* note: this hook returns int, unlike mpeg2_free() */
+
+#endif /* MPEG2_H */
diff --git a/src/video_dec/libmpeg2new/include/mpeg2convert.h b/src/video_dec/libmpeg2new/include/mpeg2convert.h
new file mode 100644
index 000000000..aac5d1991
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/mpeg2convert.h
@@ -0,0 +1,48 @@
+/*
+ * mpeg2convert.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef MPEG2CONVERT_H
+#define MPEG2CONVERT_H
+
+mpeg2_convert_t mpeg2convert_rgb32;	/* mpeg2_convert_t is a function type, so each line declares a function */
+mpeg2_convert_t mpeg2convert_rgb24;
+mpeg2_convert_t mpeg2convert_rgb16;
+mpeg2_convert_t mpeg2convert_rgb15;
+mpeg2_convert_t mpeg2convert_rgb8;
+mpeg2_convert_t mpeg2convert_bgr32;
+mpeg2_convert_t mpeg2convert_bgr24;
+mpeg2_convert_t mpeg2convert_bgr16;
+mpeg2_convert_t mpeg2convert_bgr15;
+mpeg2_convert_t mpeg2convert_bgr8;
+
+typedef enum {
+    MPEG2CONVERT_RGB = 0,
+    MPEG2CONVERT_BGR = 1
+} mpeg2convert_rgb_order_t;
+/* presumably returns the converter above matching order and bpp -- confirm */
+mpeg2_convert_t * mpeg2convert_rgb (mpeg2convert_rgb_order_t order,
+				    unsigned int bpp);
+
+mpeg2_convert_t mpeg2convert_uyvy;
+
+#endif /* MPEG2CONVERT_H */
diff --git a/src/video_dec/libmpeg2new/include/sse.h b/src/video_dec/libmpeg2new/include/sse.h
new file mode 100644
index 000000000..4bd853f8b
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/sse.h
@@ -0,0 +1,256 @@
+/*
+ * sse.h
+ * Copyright (C) 1999-2003 R. Fisher
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+typedef	union {	/* one 128-bit SSE operand viewed as floats */
+	float			sf[4];	/* 4 Single-precision (32-bit) values */
+} ATTR_ALIGN(16) sse_t;	/* On a 16 byte (128-bit) boundary */
+
+
+#define	sse_i2r(op, imm, reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* no outputs */ \
+			      : "X" (imm) )	/* "X": any operand kind is accepted */
+/* op mem, %reg */
+#define	sse_m2r(op, mem, reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* no outputs */ \
+			      : "X" (mem))
+/* op %reg, mem -- mem is written, so it is declared as an output */
+#define	sse_r2m(op, reg, mem) \
+	__asm__ __volatile__ (#op " %%" #reg ", %0" \
+			      : "=X" (mem) \
+			      : /* no inputs */ )
+/* op %regs, %regd -- no operand list here, so '%' is written once, not "%%" */
+#define	sse_r2r(op, regs, regd) \
+	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
+/* op imm, %regs, %regd */
+#define	sse_r2ri(op, regs, regd, imm) \
+	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+			      : /* no outputs */ \
+			      : "X" (imm) )
+
+#define	sse_m2ri(op, mem, reg, subop) \
+	__asm__ __volatile__ (#op " $" #subop ", %0, %%" #reg \
+			      : /* no outputs; emitted as: op $subop, mem, %reg (immediate first, as in sse_r2ri) */ \
+			      : "X" (mem))
+
+
+#define	movaps_m2r(var, reg)	sse_m2r(movaps, var, reg)	/* aligned 128-bit moves */
+#define	movaps_r2m(reg, var)	sse_r2m(movaps, reg, var)
+#define	movaps_r2r(regs, regd)	sse_r2r(movaps, regs, regd)
+/* non-temporal 128-bit store */
+#define	movntps_r2m(xmmreg, var)	sse_r2m(movntps, xmmreg, var)
+/* unaligned 128-bit moves */
+#define	movups_m2r(var, reg)	sse_m2r(movups, var, reg)
+#define	movups_r2m(reg, var)	sse_r2m(movups, reg, var)
+#define	movups_r2r(regs, regd)	sse_r2r(movups, regs, regd)
+/* high pair of regs -> low pair of regd */
+#define	movhlps_r2r(regs, regd)	sse_r2r(movhlps, regs, regd)
+/* low pair of regs -> high pair of regd */
+#define	movlhps_r2r(regs, regd)	sse_r2r(movlhps, regs, regd)
+/* 64-bit moves to/from the high half of the register */
+#define	movhps_m2r(var, reg)	sse_m2r(movhps, var, reg)
+#define	movhps_r2m(reg, var)	sse_r2m(movhps, reg, var)
+/* 64-bit moves to/from the low half of the register */
+#define	movlps_m2r(var, reg)	sse_m2r(movlps, var, reg)
+#define	movlps_r2m(reg, var)	sse_r2m(movlps, reg, var)
+/* scalar (lowest element) moves */
+#define	movss_m2r(var, reg)	sse_m2r(movss, var, reg)
+#define	movss_r2m(reg, var)	sse_r2m(movss, reg, var)
+#define	movss_r2r(regs, regd)	sse_r2r(movss, regs, regd)
+/* shuffle; index selects the elements */
+#define	shufps_m2r(var, reg, index)	sse_m2ri(shufps, var, reg, index)
+#define	shufps_r2r(regs, regd, index)	sse_r2ri(shufps, regs, regd, index)
+
+#define	cvtpi2ps_m2r(var, xmmreg)	sse_m2r(cvtpi2ps, var, xmmreg)
+#define	cvtpi2ps_r2r(mmreg, xmmreg)	sse_r2r(cvtpi2ps, mmreg, xmmreg)
+/* float -> int: the XMM register is the source, the MMX register the destination */
+#define	cvtps2pi_m2r(var, mmreg)	sse_m2r(cvtps2pi, var, mmreg)
+#define	cvtps2pi_r2r(xmmreg, mmreg)	sse_r2r(cvtps2pi, xmmreg, mmreg)
+/* same direction, truncating instead of rounding per MXCSR */
+#define	cvttps2pi_m2r(var, mmreg)	sse_m2r(cvttps2pi, var, mmreg)
+#define	cvttps2pi_r2r(xmmreg, mmreg)	sse_r2r(cvttps2pi, xmmreg, mmreg)
+
+#define	cvtsi2ss_m2r(var, xmmreg)	sse_m2r(cvtsi2ss, var, xmmreg)
+#define	cvtsi2ss_r2r(reg, xmmreg)	sse_r2r(cvtsi2ss, reg, xmmreg)
+/* scalar float -> int, rounding per MXCSR */
+#define	cvtss2si_m2r(var, reg)		sse_m2r(cvtss2si, var, reg)
+#define	cvtss2si_r2r(xmmreg, reg)	sse_r2r(cvtss2si, xmmreg, reg)
+/* scalar float -> int, truncating: must emit cvttss2si, not cvtss2si */
+#define	cvttss2si_m2r(var, reg)		sse_m2r(cvttss2si, var, reg)
+#define	cvttss2si_r2r(xmmreg, reg)	sse_r2r(cvttss2si, xmmreg, reg)
+
+#define	movmskps(xmmreg, reg) \
+	__asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg)	/* sign bits of the four floats -> reg */
+/* arithmetic: the ps forms work on all four floats, the ss forms on the lowest one */
+#define	addps_m2r(var, reg)		sse_m2r(addps, var, reg)
+#define	addps_r2r(regs, regd)		sse_r2r(addps, regs, regd)
+
+#define	addss_m2r(var, reg)		sse_m2r(addss, var, reg)
+#define	addss_r2r(regs, regd)		sse_r2r(addss, regs, regd)
+
+#define	subps_m2r(var, reg)		sse_m2r(subps, var, reg)
+#define	subps_r2r(regs, regd)		sse_r2r(subps, regs, regd)
+
+#define	subss_m2r(var, reg)		sse_m2r(subss, var, reg)
+#define	subss_r2r(regs, regd)		sse_r2r(subss, regs, regd)
+
+#define	mulps_m2r(var, reg)		sse_m2r(mulps, var, reg)
+#define	mulps_r2r(regs, regd)		sse_r2r(mulps, regs, regd)
+
+#define	mulss_m2r(var, reg)		sse_m2r(mulss, var, reg)
+#define	mulss_r2r(regs, regd)		sse_r2r(mulss, regs, regd)
+
+#define	divps_m2r(var, reg)		sse_m2r(divps, var, reg)
+#define	divps_r2r(regs, regd)		sse_r2r(divps, regs, regd)
+
+#define	divss_m2r(var, reg)		sse_m2r(divss, var, reg)
+#define	divss_r2r(regs, regd)		sse_r2r(divss, regs, regd)
+/* approximate reciprocal and reciprocal square root */
+#define	rcpps_m2r(var, reg)		sse_m2r(rcpps, var, reg)
+#define	rcpps_r2r(regs, regd)		sse_r2r(rcpps, regs, regd)
+
+#define	rcpss_m2r(var, reg)		sse_m2r(rcpss, var, reg)
+#define	rcpss_r2r(regs, regd)		sse_r2r(rcpss, regs, regd)
+
+#define	rsqrtps_m2r(var, reg)		sse_m2r(rsqrtps, var, reg)
+#define	rsqrtps_r2r(regs, regd)		sse_r2r(rsqrtps, regs, regd)
+
+#define	rsqrtss_m2r(var, reg)		sse_m2r(rsqrtss, var, reg)
+#define	rsqrtss_r2r(regs, regd)		sse_r2r(rsqrtss, regs, regd)
+/* square root */
+#define	sqrtps_m2r(var, reg)		sse_m2r(sqrtps, var, reg)
+#define	sqrtps_r2r(regs, regd)		sse_r2r(sqrtps, regs, regd)
+
+#define	sqrtss_m2r(var, reg)		sse_m2r(sqrtss, var, reg)
+#define	sqrtss_r2r(regs, regd)		sse_r2r(sqrtss, regs, regd)
+/* bitwise logic on the full 128 bits */
+#define	andps_m2r(var, reg)		sse_m2r(andps, var, reg)
+#define	andps_r2r(regs, regd)		sse_r2r(andps, regs, regd)
+
+#define	andnps_m2r(var, reg)		sse_m2r(andnps, var, reg)
+#define	andnps_r2r(regs, regd)		sse_r2r(andnps, regs, regd)
+
+#define	orps_m2r(var, reg)		sse_m2r(orps, var, reg)
+#define	orps_r2r(regs, regd)		sse_r2r(orps, regs, regd)
+
+#define	xorps_m2r(var, reg)		sse_m2r(xorps, var, reg)
+#define	xorps_r2r(regs, regd)		sse_r2r(xorps, regs, regd)
+/* element-wise maximum / minimum */
+#define	maxps_m2r(var, reg)		sse_m2r(maxps, var, reg)
+#define	maxps_r2r(regs, regd)		sse_r2r(maxps, regs, regd)
+
+#define	maxss_m2r(var, reg)		sse_m2r(maxss, var, reg)
+#define	maxss_r2r(regs, regd)		sse_r2r(maxss, regs, regd)
+
+#define	minps_m2r(var, reg)		sse_m2r(minps, var, reg)
+#define	minps_r2r(regs, regd)		sse_r2r(minps, regs, regd)
+
+#define	minss_m2r(var, reg)		sse_m2r(minss, var, reg)
+#define	minss_r2r(regs, regd)		sse_r2r(minss, regs, regd)
+
+#define	cmpps_m2r(var, reg, op)		sse_m2ri(cmpps, var, reg, op)	/* packed compare; op is the predicate number */
+#define	cmpps_r2r(regs, regd, op)	sse_r2ri(cmpps, regs, regd, op)
+/* named predicates: 0 eq, 1 lt, 2 le, 3 unord, 4 neq, 5 nlt, 6 nle, 7 ord */
+#define	cmpeqps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 0)
+#define	cmpeqps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 0)
+
+#define	cmpltps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 1)
+#define	cmpltps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 1)
+
+#define	cmpleps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 2)
+#define	cmpleps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 2)
+
+#define	cmpunordps_m2r(var, reg)	sse_m2ri(cmpps, var, reg, 3)
+#define	cmpunordps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 3)
+
+#define	cmpneqps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 4)
+#define	cmpneqps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 4)
+
+#define	cmpnltps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 5)
+#define	cmpnltps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 5)
+
+#define	cmpnleps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 6)
+#define	cmpnleps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 6)
+
+#define	cmpordps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 7)
+#define	cmpordps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 7)
+/* scalar compares: same predicate numbers, lowest element only */
+#define	cmpss_m2r(var, reg, op)		sse_m2ri(cmpss, var, reg, op)
+#define	cmpss_r2r(regs, regd, op)	sse_r2ri(cmpss, regs, regd, op)
+
+#define	cmpeqss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 0)
+#define	cmpeqss_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 0)
+
+#define	cmpltss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 1)
+#define	cmpltss_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 1)
+
+#define	cmpless_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 2)
+#define	cmpless_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 2)
+
+#define	cmpunordss_m2r(var, reg)	sse_m2ri(cmpss, var, reg, 3)
+#define	cmpunordss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 3)
+
+#define	cmpneqss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 4)
+#define	cmpneqss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 4)
+
+#define	cmpnltss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 5)
+#define	cmpnltss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 5)
+
+#define	cmpnless_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 6)
+#define	cmpnless_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 6)
+
+#define	cmpordss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 7)
+#define	cmpordss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 7)
+/* scalar compares that report the result in EFLAGS */
+#define	comiss_m2r(var, reg)		sse_m2r(comiss, var, reg)
+#define	comiss_r2r(regs, regd)		sse_r2r(comiss, regs, regd)
+
+#define	ucomiss_m2r(var, reg)		sse_m2r(ucomiss, var, reg)
+#define	ucomiss_r2r(regs, regd)		sse_r2r(ucomiss, regs, regd)
+/* interleave the low / high elements */
+#define	unpcklps_m2r(var, reg)		sse_m2r(unpcklps, var, reg)
+#define	unpcklps_r2r(regs, regd)	sse_r2r(unpcklps, regs, regd)
+
+#define	unpckhps_m2r(var, reg)		sse_m2r(unpckhps, var, reg)
+#define	unpckhps_r2r(regs, regd)	sse_r2r(unpckhps, regs, regd)
+
+#define	fxrstor(mem) \
+	__asm__ __volatile__ ("fxrstor %0" \
+			      : /* no outputs: mem is only read */ \
+			      : "X" (mem))
+/* fxsave stores into mem, so mem must be an output operand in memory */
+#define	fxsave(mem) \
+	__asm__ __volatile__ ("fxsave %0" \
+			      : "=m" (mem) \
+			      : /* no inputs */ )
+/* stmxcsr stores MXCSR into mem, so mem must be an output operand in memory */
+#define	stmxcsr(mem) \
+	__asm__ __volatile__ ("stmxcsr %0" \
+			      : "=m" (mem) \
+			      : /* no inputs */ )
+
+#define	ldmxcsr(mem) \
+	__asm__ __volatile__ ("ldmxcsr %0" \
+			      : /* no outputs: mem is only read */ \
+			      : "X" (mem))
+
diff --git a/src/video_dec/libmpeg2new/include/tendra.h b/src/video_dec/libmpeg2new/include/tendra.h
new file mode 100644
index 000000000..09900916a
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/tendra.h
@@ -0,0 +1,35 @@
+/*
+ * tendra.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#pragma TenDRA begin
+#pragma TenDRA longlong type warning
+
+#ifdef TenDRA_check
+/* the settings below apply only when TenDRA_check is defined */
+#pragma TenDRA conversion analysis (pointer-int explicit) off
+#pragma TenDRA implicit function declaration off
+
+/* avoid the "No declarations in translation unit" problem: gives every including file one file-scope declaration */
+int TenDRA;
+
+#endif /* TenDRA_check */
diff --git a/src/video_dec/libmpeg2new/include/video_out.h b/src/video_dec/libmpeg2new/include/video_out.h
new file mode 100644
index 000000000..342c55197
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/video_out.h
@@ -0,0 +1,58 @@
+/*
+ * video_out.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+struct mpeg2_sequence_s;
+struct mpeg2_convert_init_s;
+typedef struct {
+    int (* convert) (int stage, void * id,
+		     const struct mpeg2_sequence_s * sequence,
+		     int stride, uint32_t accel, void * arg,
+		     struct mpeg2_convert_init_s * result);
+} vo_setup_result_t;
+
+typedef struct vo_instance_s vo_instance_t;
+struct vo_instance_s {
+    int (* setup) (vo_instance_t * instance, unsigned int width,
+		   unsigned int height, unsigned int chroma_width,
+		   unsigned int chroma_height, vo_setup_result_t * result);
+    void (* setup_fbuf) (vo_instance_t * instance, uint8_t ** buf, void ** id);
+    void (* set_fbuf) (vo_instance_t * instance, uint8_t ** buf, void ** id);
+    void (* start_fbuf) (vo_instance_t * instance,
+			 uint8_t * const * buf, void * id);
+    void (* draw) (vo_instance_t * instance, uint8_t * const * buf, void * id);
+    void (* discard) (vo_instance_t * instance,
+		      uint8_t * const * buf, void * id);
+    void (* close) (vo_instance_t * instance);
+};
+
+typedef vo_instance_t * vo_open_t (void);
+
+typedef struct {
+    char * name;
+    vo_open_t * open;
+} vo_driver_t;
+
+void vo_accel (uint32_t accel);
+
+/* return NULL terminated array of all drivers */
+vo_driver_t const * vo_drivers (void);
diff --git a/src/video_dec/libmpeg2new/include/vis.h b/src/video_dec/libmpeg2new/include/vis.h
new file mode 100644
index 000000000..69dd49075
--- /dev/null
+++ b/src/video_dec/libmpeg2new/include/vis.h
@@ -0,0 +1,328 @@
+/*
+ * vis.h
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* You may be asking why I hard-code the instruction opcodes and don't
+ * use the normal VIS assembler mnemonics for the VIS instructions.
+ *
+ * The reason is that Sun, in their infinite wisdom, decided that a binary
+ * using a VIS instruction will cause it to be marked (in the ELF headers)
+ * as doing so, and this prevents the OS from loading such binaries if the
+ * current cpu doesn't have VIS.  There is no way to easily override this
+ * behavior of the assembler that I am aware of.
+ *
+ * This totally defeats what libmpeg2 is trying to do which is allow a
+ * single binary to be created, and then detect the availability of VIS
+ * at runtime.
+ *
+ * I'm not saying that tainting the binary by default is bad, rather I'm
+ * saying that not providing a way to override this easily unnecessarily
+ * ties people's hands.
+ *
+ * Thus, we do the opcode encoding by hand and output 32-bit words in
+ * the assembler to keep the binary from becoming tainted.
+ */
+
+#define vis_opc_base	((0x1 << 31) | (0x36 << 19))
+#define vis_opf(X)	((X) << 5)
+#define vis_sreg(X)	(X)
+#define vis_dreg(X)	(((X)&0x1f)|((X)>>5))
+#define vis_rs1_s(X)	(vis_sreg(X) << 14)
+#define vis_rs1_d(X)	(vis_dreg(X) << 14)
+#define vis_rs2_s(X)	(vis_sreg(X) << 0)
+#define vis_rs2_d(X)	(vis_dreg(X) << 0)
+#define vis_rd_s(X)	(vis_sreg(X) << 25)
+#define vis_rd_d(X)	(vis_dreg(X) << 25)
+
+#define vis_ss2s(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_dd2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_d(rs1) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_ss2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_sd2d(opf,rs1,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d2s(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s2d(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d12d(opf,rs1,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_d(rs1) | \
+                                       vis_rd_d(rd)))
+
+#define vis_d22d(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_d(rs2) | \
+                                       vis_rd_d(rd)))
+
+#define vis_s12s(opf,rs1,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs1_s(rs1) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s22s(opf,rs2,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rs2_s(rs2) | \
+                                       vis_rd_s(rd)))
+
+#define vis_s(opf,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rd_s(rd)))
+
+#define vis_d(opf,rd) \
+	__asm__ __volatile__ (".word %0" \
+			      : : "i" (vis_opc_base | vis_opf(opf) | \
+                                       vis_rd_d(rd)))
+
+#define vis_r2m(op,rd,mem) \
+	__asm__ __volatile__ (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) )
+
+#define vis_r2m_2(op,rd,mem1,mem2) \
+	__asm__ __volatile__ (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) )
+
+#define vis_m2r(op,mem,rd) \
+	__asm__ __volatile__ (#op "\t[%0], %%f" #rd : : "r" (&(mem)) )
+
+#define vis_m2r_2(op,mem1,mem2,rd) \
+	__asm__ __volatile__ (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) )
+
+static inline void vis_set_gsr(unsigned int _val)
+{
+	register unsigned int val asm("g1");
+	/* pinned to %g1: the raw word below decodes to "wr %g1, %g0, %gsr" */
+	val = _val;
+	__asm__ __volatile__(".word 0xa7804000"
+			     : : "r" (val));
+}
+
+#define VIS_GSR_ALIGNADDR_MASK	0x0000007
+#define VIS_GSR_ALIGNADDR_SHIFT	0
+#define VIS_GSR_SCALEFACT_MASK	0x0000078
+#define VIS_GSR_SCALEFACT_SHIFT	3
+
+#define vis_ld32(mem,rs1)		vis_m2r(ld, mem, rs1)
+#define vis_ld32_2(mem1,mem2,rs1)	vis_m2r_2(ld, mem1, mem2, rs1)
+#define vis_st32(rs1,mem)		vis_r2m(st, rs1, mem)
+#define vis_st32_2(rs1,mem1,mem2)	vis_r2m_2(st, rs1, mem1, mem2)
+#define vis_ld64(mem,rs1)		vis_m2r(ldd, mem, rs1)
+#define vis_ld64_2(mem1,mem2,rs1)	vis_m2r_2(ldd, mem1, mem2, rs1)
+#define vis_st64(rs1,mem)		vis_r2m(std, rs1, mem)
+#define vis_st64_2(rs1,mem1,mem2)	vis_r2m_2(std, rs1, mem1, mem2)
+
+#define vis_ldblk(mem, rd) \
+do {	register void *__mem asm("g1"); \
+	__mem = &(mem); \
+	__asm__ __volatile__(".word 0xc1985e00 | %1" \
+			     : \
+			     : "r" (__mem), \
+			       "i" (vis_rd_d(rd)) \
+			     : "memory"); \
+} while (0)
+
+#define vis_stblk(rd, mem) \
+do {	register void *__mem asm("g1"); \
+	__mem = &(mem); \
+	__asm__ __volatile__(".word 0xc1b85e00 | %1" \
+			     : \
+			     : "r" (__mem), \
+			       "i" (vis_rd_d(rd)) \
+			     : "memory"); \
+} while (0)
+
+#define vis_membar_storestore()	\
+	__asm__ __volatile__(".word 0x8143e008" : : : "memory")
+
+#define vis_membar_sync()	\
+	__asm__ __volatile__(".word 0x8143e040" : : : "memory")
+
+/* 16 and 32 bit partitioned addition and subtraction.  The normal
+ * versions perform 4 16-bit or 2 32-bit additions or subtractions.
+ * The 's' versions perform 2 16-bit or 1 32-bit additions or
+ * subtractions.
+ */
+
+#define vis_padd16(rs1,rs2,rd)		vis_dd2d(0x50, rs1, rs2, rd)
+#define vis_padd16s(rs1,rs2,rd)		vis_ss2s(0x51, rs1, rs2, rd)
+#define vis_padd32(rs1,rs2,rd)		vis_dd2d(0x52, rs1, rs2, rd)
+#define vis_padd32s(rs1,rs2,rd)		vis_ss2s(0x53, rs1, rs2, rd)
+#define vis_psub16(rs1,rs2,rd)		vis_dd2d(0x54, rs1, rs2, rd)
+#define vis_psub16s(rs1,rs2,rd)		vis_ss2s(0x55, rs1, rs2, rd)
+#define vis_psub32(rs1,rs2,rd)		vis_dd2d(0x56, rs1, rs2, rd)
+#define vis_psub32s(rs1,rs2,rd)		vis_ss2s(0x57, rs1, rs2, rd)
+
+/* Pixel formatting instructions.  */
+
+#define vis_pack16(rs2,rd)		vis_d2s( 0x3b,      rs2, rd)
+#define vis_pack32(rs1,rs2,rd)		vis_dd2d(0x3a, rs1, rs2, rd)
+#define vis_packfix(rs2,rd)		vis_d2s( 0x3d,      rs2, rd)
+#define vis_expand(rs2,rd)		vis_s2d( 0x4d,      rs2, rd)
+#define vis_pmerge(rs1,rs2,rd)		vis_ss2d(0x4b, rs1, rs2, rd)
+
+/* Partitioned multiply instructions.  */
+
+#define vis_mul8x16(rs1,rs2,rd)		vis_sd2d(0x31, rs1, rs2, rd)
+#define vis_mul8x16au(rs1,rs2,rd)	vis_ss2d(0x33, rs1, rs2, rd)
+#define vis_mul8x16al(rs1,rs2,rd)	vis_ss2d(0x35, rs1, rs2, rd)
+#define vis_mul8sux16(rs1,rs2,rd)	vis_dd2d(0x36, rs1, rs2, rd)
+#define vis_mul8ulx16(rs1,rs2,rd)	vis_dd2d(0x37, rs1, rs2, rd)
+#define vis_muld8sux16(rs1,rs2,rd)	vis_ss2d(0x38, rs1, rs2, rd)
+#define vis_muld8ulx16(rs1,rs2,rd)	vis_ss2d(0x39, rs1, rs2, rd)
+
+/* Alignment instructions.  */
+
+static inline void *vis_alignaddr(void *_ptr)
+{
+	register void *ptr asm("g1");
+
+	ptr = _ptr;
+
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x18) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(1)));
+
+	return ptr;
+}
+
+static inline void vis_alignaddr_g0(void *_ptr)
+{
+	register void *ptr asm("g1");
+
+	ptr = _ptr;
+
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x18) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(0)));
+}
+
+static inline void *vis_alignaddrl(void *_ptr)
+{
+	register void *ptr asm("g1");
+
+	ptr = _ptr;
+
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x19) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(1)));
+
+	return ptr;
+}
+
+static inline void vis_alignaddrl_g0(void *_ptr)
+{
+	register void *ptr asm("g1");
+
+	ptr = _ptr;
+
+	__asm__ __volatile__(".word %2"
+			     : "=&r" (ptr)
+			     : "0" (ptr),
+			       "i" (vis_opc_base | vis_opf(0x19) |
+				    vis_rs1_s(1) |
+				    vis_rs2_s(0) |
+				    vis_rd_s(0)));
+}
+
+#define vis_faligndata(rs1,rs2,rd)	vis_dd2d(0x48, rs1, rs2, rd)
+
+/* Logical operate instructions.  */
+
+#define vis_fzero(rd)			vis_d(   0x60,           rd)
+#define vis_fzeros(rd)			vis_s(   0x61,           rd)
+#define vis_fone(rd)			vis_d(   0x7e,           rd)
+#define vis_fones(rd)			vis_s(   0x7f,           rd)
+#define vis_src1(rs1,rd)		vis_d12d(0x74, rs1,      rd)
+#define vis_src1s(rs1,rd)		vis_s12s(0x75, rs1,      rd)
+#define vis_src2(rs2,rd)		vis_d22d(0x78,      rs2, rd)
+#define vis_src2s(rs2,rd)		vis_s22s(0x79,      rs2, rd)
+#define vis_not1(rs1,rd)		vis_d12d(0x6a, rs1,      rd)
+#define vis_not1s(rs1,rd)		vis_s12s(0x6b, rs1,      rd)
+#define vis_not2(rs2,rd)		vis_d22d(0x66,      rs2, rd)
+#define vis_not2s(rs2,rd)		vis_s22s(0x67,      rs2, rd)
+#define vis_or(rs1,rs2,rd)		vis_dd2d(0x7c, rs1, rs2, rd)
+#define vis_ors(rs1,rs2,rd)		vis_ss2s(0x7d, rs1, rs2, rd)
+#define vis_nor(rs1,rs2,rd)		vis_dd2d(0x62, rs1, rs2, rd)
+#define vis_nors(rs1,rs2,rd)		vis_ss2s(0x63, rs1, rs2, rd)
+#define vis_and(rs1,rs2,rd)		vis_dd2d(0x70, rs1, rs2, rd)
+#define vis_ands(rs1,rs2,rd)		vis_ss2s(0x71, rs1, rs2, rd)
+#define vis_nand(rs1,rs2,rd)		vis_dd2d(0x6e, rs1, rs2, rd)
+#define vis_nands(rs1,rs2,rd)		vis_ss2s(0x6f, rs1, rs2, rd)
+#define vis_xor(rs1,rs2,rd)		vis_dd2d(0x6c, rs1, rs2, rd)
+#define vis_xors(rs1,rs2,rd)		vis_ss2s(0x6d, rs1, rs2, rd)
+#define vis_xnor(rs1,rs2,rd)		vis_dd2d(0x72, rs1, rs2, rd)
+#define vis_xnors(rs1,rs2,rd)		vis_ss2s(0x73, rs1, rs2, rd)
+#define vis_ornot1(rs1,rs2,rd)		vis_dd2d(0x7a, rs1, rs2, rd)
+#define vis_ornot1s(rs1,rs2,rd)		vis_ss2s(0x7b, rs1, rs2, rd)
+#define vis_ornot2(rs1,rs2,rd)		vis_dd2d(0x76, rs1, rs2, rd)
+#define vis_ornot2s(rs1,rs2,rd)		vis_ss2s(0x77, rs1, rs2, rd)
+#define vis_andnot1(rs1,rs2,rd)		vis_dd2d(0x68, rs1, rs2, rd)
+#define vis_andnot1s(rs1,rs2,rd)	vis_ss2s(0x69, rs1, rs2, rd)
+#define vis_andnot2(rs1,rs2,rd)		vis_dd2d(0x64, rs1, rs2, rd)
+#define vis_andnot2s(rs1,rs2,rd)	vis_ss2s(0x65, rs1, rs2, rd)
+
+/* Pixel component distance.  */
+
+#define vis_pdist(rs1,rs2,rd)		vis_dd2d(0x3e, rs1, rs2, rd)
diff --git a/src/video_dec/libmpeg2new/libmpeg2/Makefile.am b/src/video_dec/libmpeg2new/libmpeg2/Makefile.am
new file mode 100644
index 000000000..3a69cd1b4
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/Makefile.am
@@ -0,0 +1,23 @@
+include $(top_srcdir)/misc/Makefile.quiet
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS = $(DEFAULT_OCFLAGS) $(VISIBILITY_FLAG)
+
+if ENABLE_MPEG2NEW
+mpeg2new_libs = libmpeg2.la libmpeg2arch.la
+endif
+
+noinst_LTLIBRARIES = $(mpeg2new_libs)
+
+libmpeg2_la_SOURCES = alloc.c header.c decode.c slice.c motion_comp.c idct.c
+libmpeg2_la_LIBADD = libmpeg2arch.la
+
+AM_CPPFLAGS = -I$(srcdir)/../include
+
+libmpeg2arch_la_SOURCES = motion_comp_mmx.c idct_mmx.c \
+                          motion_comp_altivec.c idct_altivec.c \
+                          motion_comp_alpha.c idct_alpha.c \
+                          motion_comp_vis.c \
+                          cpu_accel.c cpu_state.c
+
+EXTRA_DIST = mpeg2_internal.h vlc.h
diff --git a/src/video_dec/libmpeg2new/libmpeg2/alloc.c b/src/video_dec/libmpeg2new/libmpeg2/alloc.c
new file mode 100644
index 000000000..f1a7afa1c
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/alloc.c
@@ -0,0 +1,70 @@
+/*
+ * alloc.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+
+static void * (* malloc_hook) (unsigned size, mpeg2_alloc_t reason) = NULL;
+static int (* free_hook) (void * buf) = NULL;
+
+void * mpeg2_malloc (unsigned size, mpeg2_alloc_t reason)
+{
+    char * buf;
+    /* hook's buffer if it gives one, else a 64-byte aligned block, else NULL */
+    if (malloc_hook) {
+	buf = (char *) malloc_hook (size, reason);
+	if (buf)
+	    return buf;
+    }
+    /* over-allocate: 63 bytes of slack + one slot for the raw pointer */
+    if (size && size <= (size_t) -1 - 63 - sizeof (void **)) {
+	buf = (char *) malloc (size + 63 + sizeof (void **));
+	if (buf) {
+	    char * align_buf;
+
+	    align_buf = buf + 63 + sizeof (void **);
+	    align_buf -= (uintptr_t) align_buf & 63;	/* round down to 64 */
+	    *(((void **)align_buf) - 1) = buf;	/* read back by mpeg2_free */
+	    return align_buf;
+	}
+    }
+    return NULL;
+}
+
+void mpeg2_free (void * buf)
+{
+    if (free_hook && free_hook (buf))	/* nonzero skips the default free */
+	return;
+    /* mpeg2_malloc stored the raw malloc() pointer in the slot below buf */
+    if (buf)
+	free (*(((void **)buf) - 1));
+}
+
+void mpeg2_malloc_hooks (void * malloc (unsigned, mpeg2_alloc_t),
+			 int free (void *))
+{
+    malloc_hook = malloc;
+    free_hook = free;
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/configure.incl b/src/video_dec/libmpeg2new/libmpeg2/configure.incl
new file mode 100644
index 000000000..f8dbd5aef
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/configure.incl
@@ -0,0 +1,11 @@
+AC_SUBST([LIBMPEG2_CFLAGS])
+
+dnl avoid -fPIC when possible
+AC_LIBTOOL_NON_PIC([LIBMPEG2_CFLAGS="$LIBMPEG2_CFLAGS -prefer-non-pic"])
+
+dnl check for cpudetect
+AC_ARG_ENABLE([accel-detect],
+    [  --disable-accel-detect  make a version without accel detection code])
+if test x"$enable_accel_detect" != x"no"; then
+    AC_DEFINE([ACCEL_DETECT],,[autodetect accelerations])
+fi
diff --git a/src/video_dec/libmpeg2new/libmpeg2/convert_internal.h b/src/video_dec/libmpeg2new/libmpeg2/convert_internal.h
new file mode 100644
index 000000000..d1e63d5e3
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/convert_internal.h
@@ -0,0 +1,42 @@
+/*
+ * convert_internal.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+typedef struct {
+    uint8_t * rgb_ptr;
+    int width;
+    int field;
+    int y_stride, rgb_stride, y_increm, uv_increm, rgb_increm, rgb_slice;
+    int chroma420, convert420;
+    int dither_offset, dither_stride;
+    int y_stride_frame, uv_stride_frame, rgb_stride_frame, rgb_stride_min;
+} convert_rgb_t;
+
+typedef void mpeg2convert_copy_t (void * id, uint8_t * const * src,
+				  unsigned int v_offset);
+
+mpeg2convert_copy_t * mpeg2convert_rgb_mmxext (int bpp, int mode,
+					       const mpeg2_sequence_t * seq);
+mpeg2convert_copy_t * mpeg2convert_rgb_mmx (int bpp, int mode,
+					    const mpeg2_sequence_t * seq);
+mpeg2convert_copy_t * mpeg2convert_rgb_vis (int bpp, int mode,
+					    const mpeg2_sequence_t * seq);
diff --git a/src/video_dec/libmpeg2new/libmpeg2/cpu_accel.c b/src/video_dec/libmpeg2new/libmpeg2/cpu_accel.c
new file mode 100644
index 000000000..7846f1e88
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/cpu_accel.c
@@ -0,0 +1,258 @@
+/*
+ * cpu_accel.c
+ * Copyright (C) 2000-2004 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+
+#ifdef ARCH_X86
+static inline uint32_t arch_accel (uint32_t accel)
+{
+    if (accel & (MPEG2_ACCEL_X86_3DNOW | MPEG2_ACCEL_X86_MMXEXT))
+	accel |= MPEG2_ACCEL_X86_MMX;
+	
+    if (accel & (MPEG2_ACCEL_X86_SSE2 | MPEG2_ACCEL_X86_SSE3))
+	accel |= MPEG2_ACCEL_X86_MMXEXT;
+	
+    if (accel & (MPEG2_ACCEL_X86_SSE3))
+	accel |= MPEG2_ACCEL_X86_SSE2;
+
+#ifdef ACCEL_DETECT
+    if (accel & MPEG2_ACCEL_DETECT) {
+	uint32_t eax, ebx, ecx, edx;
+	int AMD;
+
+#if !defined(PIC) && !defined(__PIC__)
+#define cpuid(op,eax,ebx,ecx,edx)	\
+    __asm__ ("cpuid"			\
+	     : "=a" (eax),		\
+	       "=b" (ebx),		\
+	       "=c" (ecx),		\
+	       "=d" (edx)		\
+	     : "a" (op)			\
+	     : "cc")
+#else	/* PIC version : save ebx */
+#define cpuid(op,eax,ebx,ecx,edx)	\
+    __asm__ ("push %%ebx\n\t"		\
+	     "cpuid\n\t"		\
+	     "movl %%ebx,%1\n\t"	\
+	     "pop %%ebx"		\
+	     : "=a" (eax),		\
+	       "=r" (ebx),		\
+	       "=c" (ecx),		\
+	       "=d" (edx)		\
+	     : "a" (op)			\
+	     : "cc")
+#endif
+
+	__asm__ ("pushf\n\t"
+		 "pushf\n\t"
+		 "pop %0\n\t"
+		 "movl %0,%1\n\t"
+		 "xorl $0x200000,%0\n\t"
+		 "push %0\n\t"
+		 "popf\n\t"
+		 "pushf\n\t"
+		 "pop %0\n\t"
+		 "popf"
+		 : "=r" (eax),
+		 "=r" (ebx)
+		 :
+		 : "cc");
+
+	if (eax == ebx)			/* no cpuid */
+	    return accel;
+
+	cpuid (0x00000000, eax, ebx, ecx, edx);
+	if (!eax)			/* vendor string only */
+	    return accel;
+
+	AMD = (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65);
+
+	cpuid (0x00000001, eax, ebx, ecx, edx);
+	if (! (edx & 0x00800000))	/* no MMX */
+	    return accel;
+
+	accel |= MPEG2_ACCEL_X86_MMX;
+	if (edx & 0x02000000)	/* SSE - identical to AMD MMX extensions */
+	    accel |= MPEG2_ACCEL_X86_MMXEXT;
+
+	if (edx & 0x04000000)	/* SSE2 */
+	    accel |= MPEG2_ACCEL_X86_SSE2;
+	    
+	if (ecx & 0x00000001)	/* SSE3 */
+	    accel |= MPEG2_ACCEL_X86_SSE3;
+	    
+	cpuid (0x80000000, eax, ebx, ecx, edx);
+	if (eax < 0x80000001)		/* no extended capabilities */
+	    return accel;
+
+	cpuid (0x80000001, eax, ebx, ecx, edx);
+
+	if (edx & 0x80000000)
+	    accel |= MPEG2_ACCEL_X86_3DNOW;
+
+	if (AMD && (edx & 0x00400000))	/* AMD MMX extensions */
+	    accel |= MPEG2_ACCEL_X86_MMXEXT;
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
+}
+#endif /* ARCH_X86 */
+
+#if defined(ACCEL_DETECT) && (defined(ARCH_PPC) || defined(ARCH_SPARC))
+#include <signal.h>
+#include <setjmp.h>
+
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static RETSIGTYPE sigill_handler (int sig)
+{
+    if (!canjump) {	/* no probe armed: re-raise with the default action */
+	signal (sig, SIG_DFL);
+	raise (sig);
+    }
+    /* disarm and unwind to the sigsetjmp (jmpbuf, 1) call site */
+    canjump = 0;
+    siglongjmp (jmpbuf, 1);
+}
+#endif /* ACCEL_DETECT && (ARCH_PPC || ARCH_SPARC) */
+
+#ifdef ARCH_PPC
+static inline uint32_t arch_accel (uint32_t accel)
+{
+#ifdef ACCEL_DETECT
+    if ((accel & (MPEG2_ACCEL_PPC_ALTIVEC | MPEG2_ACCEL_DETECT)) ==
+	MPEG2_ACCEL_DETECT) {	/* detect asked for, AltiVec not yet set */
+	static RETSIGTYPE (* oldsig) (int);
+	/* execute AltiVec code with SIGILL trapped; a trap means none */
+	oldsig = signal (SIGILL, sigill_handler);
+	if (sigsetjmp (jmpbuf, 1)) {
+	    signal (SIGILL, oldsig);
+	    return accel;
+	}
+
+	canjump = 1;
+
+#ifdef HAVE_ALTIVEC_H	/* gnu */
+#define VAND(a,b,c) "vand " #a "," #b "," #c "\n\t"
+#else			/* apple */
+#define VAND(a,b,c) "vand v" #a ",v" #b ",v" #c "\n\t"
+#endif
+	asm volatile ("mtspr 256, %0\n\t"
+		      VAND (0, 0, 0)
+		      :
+		      : "r" (-1));
+	/* no trap: the instructions executed, so report AltiVec */
+	canjump = 0;
+	accel |= MPEG2_ACCEL_PPC_ALTIVEC;
+
+	signal (SIGILL, oldsig);
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
+}
+#endif /* ARCH_PPC */
+
+#ifdef ARCH_SPARC
+static inline uint32_t arch_accel (uint32_t accel)
+{
+    if (accel & MPEG2_ACCEL_SPARC_VIS2)	/* VIS2 implies VIS */
+	accel |= MPEG2_ACCEL_SPARC_VIS;
+
+#ifdef ACCEL_DETECT
+    if ((accel & (MPEG2_ACCEL_SPARC_VIS2 | MPEG2_ACCEL_DETECT)) ==
+	MPEG2_ACCEL_DETECT) {	/* detect asked for, VIS2 not yet set */
+	static RETSIGTYPE (* oldsig) (int);
+	/* first probe: a VIS instruction with SIGILL trapped */
+	oldsig = signal (SIGILL, sigill_handler);
+	if (sigsetjmp (jmpbuf, 1)) {
+	    signal (SIGILL, oldsig);
+	    return accel;
+	}
+
+	canjump = 1;
+
+	/* pdist %f0, %f0, %f0 */
+	__asm__ __volatile__(".word\t0x81b007c0");
+
+	canjump = 0;
+	accel |= MPEG2_ACCEL_SPARC_VIS;
+	/* second probe: a trap here still returns with VIS set */
+	if (sigsetjmp (jmpbuf, 1)) {
+	    signal (SIGILL, oldsig);
+	    return accel;
+	}
+
+	canjump = 1;
+
+	/* edge8n %g0, %g0, %g0 */
+	__asm__ __volatile__(".word\t0x81b00020");
+
+	canjump = 0;
+	accel |= MPEG2_ACCEL_SPARC_VIS2;
+
+	signal (SIGILL, oldsig);
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
+}
+#endif /* ARCH_SPARC */
+
+#ifdef ARCH_ALPHA
+static inline uint32_t arch_accel (uint32_t accel)
+{
+    if (accel & MPEG2_ACCEL_ALPHA_MVI)
+	accel |= MPEG2_ACCEL_ALPHA;
+
+#ifdef ACCEL_DETECT
+    if (accel & MPEG2_ACCEL_DETECT) {
+	uint64_t no_mvi;
+
+	asm volatile ("amask %1, %0"
+		      : "=r" (no_mvi)
+		      : "rI" (256));	/* AMASK_MVI */
+	accel |= no_mvi ? MPEG2_ACCEL_ALPHA : (MPEG2_ACCEL_ALPHA |
+					       MPEG2_ACCEL_ALPHA_MVI);
+    }
+#endif /* ACCEL_DETECT */
+
+    return accel;
+}
+#endif /* ARCH_ALPHA */
+
+uint32_t mpeg2_detect_accel (uint32_t accel)
+{
+#if defined (ARCH_X86) || defined (ARCH_PPC) || defined (ARCH_ALPHA) || defined (ARCH_SPARC)
+    accel = arch_accel (accel);
+#endif
+    return accel;
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/cpu_state.c b/src/video_dec/libmpeg2new/libmpeg2/cpu_state.c
new file mode 100644
index 000000000..edbf2dd28
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/cpu_state.c
@@ -0,0 +1,129 @@
+/*
+ * cpu_state.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+#ifdef ARCH_X86
+#include "../include/mmx.h"
+#endif
+
+void (* mpeg2_cpu_state_save) (cpu_state_t * state) = NULL;
+void (* mpeg2_cpu_state_restore) (cpu_state_t * state) = NULL;
+
+#ifdef ARCH_X86
+static void state_restore_mmx (cpu_state_t * state)
+{
+    emms ();
+}
+#endif
+
+#ifdef ARCH_PPC
+#ifdef HAVE_ALTIVEC_H	/* gnu */
+#define LI(a,b) "li " #a "," #b "\n\t"
+#define STVX0(a,b,c) "stvx " #a ",0," #c "\n\t"
+#define STVX(a,b,c) "stvx " #a "," #b "," #c "\n\t"
+#define LVX0(a,b,c) "lvx " #a ",0," #c "\n\t"
+#define LVX(a,b,c) "lvx " #a "," #b "," #c "\n\t"
+#else			/* apple */
+#define LI(a,b) "li r" #a "," #b "\n\t"
+#define STVX0(a,b,c) "stvx v" #a ",0,r" #c "\n\t"
+#define STVX(a,b,c) "stvx v" #a ",r" #b ",r" #c "\n\t"
+#define LVX0(a,b,c) "lvx v" #a ",0,r" #c "\n\t"
+#define LVX(a,b,c) "lvx v" #a ",r" #b ",r" #c "\n\t"
+#endif
+
+static void state_save_altivec (cpu_state_t * state)
+{
+    asm (LI (9, 16)
+	 STVX0 (20, 0, 3)
+	 LI (11, 32)
+	 STVX (21, 9, 3)
+	 LI (9, 48)
+	 STVX (22, 11, 3)
+	 LI (11, 64)
+	 STVX (23, 9, 3)
+	 LI (9, 80)
+	 STVX (24, 11, 3)
+	 LI (11, 96)
+	 STVX (25, 9, 3)
+	 LI (9, 112)
+	 STVX (26, 11, 3)
+	 LI (11, 128)
+	 STVX (27, 9, 3)
+	 LI (9, 144)
+	 STVX (28, 11, 3)
+	 LI (11, 160)
+	 STVX (29, 9, 3)
+	 LI (9, 176)
+	 STVX (30, 11, 3)
+	 STVX (31, 9, 3));
+}
+
+static void state_restore_altivec (cpu_state_t * state)
+{
+    asm (LI (9, 16)
+	 LVX0 (20, 0, 3)
+	 LI (11, 32)
+	 LVX (21, 9, 3)
+	 LI (9, 48)
+	 LVX (22, 11, 3)
+	 LI (11, 64)
+	 LVX (23, 9, 3)
+	 LI (9, 80)
+	 LVX (24, 11, 3)
+	 LI (11, 96)
+	 LVX (25, 9, 3)
+	 LI (9, 112)
+	 LVX (26, 11, 3)
+	 LI (11, 128)
+	 LVX (27, 9, 3)
+	 LI (9, 144)
+	 LVX (28, 11, 3)
+	 LI (11, 160)
+	 LVX (29, 9, 3)
+	 LI (9, 176)
+	 LVX (30, 11, 3)
+	 LVX (31, 9, 3));
+}
+#endif
+
+void mpeg2_cpu_state_init (uint32_t accel)
+{
+#ifdef ARCH_X86
+    if (accel & MPEG2_ACCEL_X86_MMX) {
+	mpeg2_cpu_state_restore = state_restore_mmx;
+    }
+#endif
+#ifdef ARCH_PPC
+    if (accel & MPEG2_ACCEL_PPC_ALTIVEC) {
+	mpeg2_cpu_state_save = state_save_altivec;
+	mpeg2_cpu_state_restore = state_restore_altivec;
+    }
+#endif
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/decode.c b/src/video_dec/libmpeg2new/libmpeg2/decode.c
new file mode 100644
index 000000000..337ba4466
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/decode.c
@@ -0,0 +1,439 @@
+/*
+ * decode.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <string.h>	/* memcmp/memset, try to remove */
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+
+static int mpeg2_accels = 0;
+
+#define BUFFER_SIZE (1194 * 1024)
+/* Accessor: hand back a read-only pointer to the info structure embedded in the decoder. */
+const mpeg2_info_t * mpeg2_info (mpeg2dec_t * mpeg2dec)
+{
+    return &mpeg2dec->info;
+}
+/* Consume up to `bytes` input bytes looking for a 00 00 01 start-code prefix. Returns the number of bytes consumed (through the code byte that follows the prefix) when one is found, or 0 when the range is exhausted first; the shift state is saved so a prefix split across calls is still seen. */
+static inline int skip_chunk (mpeg2dec_t * mpeg2dec, int bytes)
+{
+    uint8_t * current;
+    uint32_t shift;
+    uint8_t * limit;
+    uint8_t byte;
+
+    if (!bytes)
+	return 0;
+
+    current = mpeg2dec->buf_start;
+    shift = mpeg2dec->shift;
+    limit = current + bytes;
+    /* shift == 0x00000100 exactly when the three bytes before `byte` were 00 00 01 */
+    do {
+	byte = *current++;
+	if (shift == 0x00000100) {
+	    int skipped;
+
+	    mpeg2dec->shift = 0xffffff00;	/* restart the prefix search */
+	    skipped = current - mpeg2dec->buf_start;
+	    mpeg2dec->buf_start = current;	/* buf_start[-1] is now the code byte */
+	    return skipped;
+	}
+	shift = (shift | byte) << 8;
+    } while (current < limit);
+
+    mpeg2dec->shift = shift;
+    mpeg2dec->buf_start = current;
+    return 0;
+}
+/* Consume up to `bytes` input bytes, appending each one at chunk_ptr, until a 00 00 01 start-code prefix has been seen. Returns the bytes consumed (through the code byte) when a start code is found, else 0; in the 0 case mpeg2dec->chunk_ptr is NOT advanced here (the callers add the byte count themselves). */
+static inline int copy_chunk (mpeg2dec_t * mpeg2dec, int bytes)
+{
+    uint8_t * current;
+    uint32_t shift;
+    uint8_t * chunk_ptr;
+    uint8_t * limit;
+    uint8_t byte;
+
+    if (!bytes)
+	return 0;
+
+    current = mpeg2dec->buf_start;
+    shift = mpeg2dec->shift;
+    chunk_ptr = mpeg2dec->chunk_ptr;
+    limit = current + bytes;
+    /* shift == 0x00000100 exactly when the three bytes before `byte` were 00 00 01 */
+    do {
+	byte = *current++;
+	if (shift == 0x00000100) {
+	    int copied;
+
+	    mpeg2dec->shift = 0xffffff00;	/* restart the prefix search */
+	    mpeg2dec->chunk_ptr = chunk_ptr + 1;	/* code byte is counted but not stored */
+	    copied = current - mpeg2dec->buf_start;
+	    mpeg2dec->buf_start = current;
+	    return copied;
+	}
+	shift = (shift | byte) << 8;
+	*chunk_ptr++ = byte;
+    } while (current < limit);
+
+    mpeg2dec->shift = shift;
+    mpeg2dec->buf_start = current;
+    return 0;
+}
+/* Point the decoder at a new input window [start, end); the bytes themselves are not copied here. */
+void mpeg2_buffer (mpeg2dec_t * mpeg2dec, uint8_t * start, uint8_t * end)
+{
+    mpeg2dec->buf_end = end;		/* one past the last input byte */
+    mpeg2dec->buf_start = start;	/* next byte to be consumed */
+}
+/* Report how many bytes of the current input window have not been consumed yet (buf_end - buf_start). */
+int mpeg2_getpos (mpeg2dec_t * mpeg2dec)
+{
+    return (int) (mpeg2dec->buf_end - mpeg2dec->buf_start);
+}
+/* Skip input up to and including the next start code and latch its code byte; STATE_BUFFER means the input ran out first. */
+static inline mpeg2_state_t seek_chunk (mpeg2dec_t * mpeg2dec)
+{
+    const int avail = mpeg2dec->buf_end - mpeg2dec->buf_start;
+    const int used = skip_chunk (mpeg2dec, avail);
+
+    if (used != 0) {
+	/* the byte just before buf_start is the start code value */
+	mpeg2dec->code = mpeg2dec->buf_start[-1];
+	mpeg2dec->bytes_since_tag += used;
+	return STATE_INTERNAL_NORETURN;
+    }
+    mpeg2dec->bytes_since_tag += avail;
+    return STATE_BUFFER;
+}
+/* Skip input until a start code parsing may resume on: 0xb3 always, or 0xb7 / 0xb8 / 0x00 once a sequence is known (sequence.width != (unsigned)-1). Returns STATE_BUFFER when input runs out; otherwise hands over to mpeg2_header_end() for 0xb7 or mpeg2_parse_header() for the rest. */
+mpeg2_state_t mpeg2_seek_header (mpeg2dec_t * mpeg2dec)
+{
+    while (!(mpeg2dec->code == 0xb3 ||
+	     ((mpeg2dec->code == 0xb7 || mpeg2dec->code == 0xb8 ||
+	       !mpeg2dec->code) && mpeg2dec->sequence.width != (unsigned)-1)))
+	if (seek_chunk (mpeg2dec) == STATE_BUFFER)
+	    return STATE_BUFFER;
+    mpeg2dec->chunk_start = mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;	/* collect the header at the front of the chunk buffer */
+    mpeg2dec->user_data_len = 0;
+    return ((mpeg2dec->code == 0xb7) ?
+	    mpeg2_header_end (mpeg2dec) : mpeg2_parse_header (mpeg2dec));
+}
+
+#define RECEIVED(code,state) (((state) << 8) + (code))
+/* Main parsing step. Runs any pending action, then decodes slice chunks (or skips slices outside the decode window) until the input is exhausted (STATE_BUFFER) or a start code of 0x00 or >= 0xb0 is reached, and picks the next action and return state from that code. */
+mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec)
+{
+    int size_buffer, size_chunk, copied;
+
+    if (mpeg2dec->action) {
+	mpeg2_state_t state;
+
+	state = mpeg2dec->action (mpeg2dec);
+	if ((int)state > (int)STATE_INTERNAL_NORETURN)
+	    return state;
+    }
+    /* states up to STATE_INTERNAL_NORETURN are not reported; parsing continues below */
+    while (1) {
+	while ((unsigned) (mpeg2dec->code - mpeg2dec->first_decode_slice) <
+	       mpeg2dec->nb_decode_slices) {
+	    size_buffer = mpeg2dec->buf_end - mpeg2dec->buf_start;
+	    size_chunk = (mpeg2dec->chunk_buffer + BUFFER_SIZE -
+			  mpeg2dec->chunk_ptr);
+	    if (size_buffer <= size_chunk) {
+		copied = copy_chunk (mpeg2dec, size_buffer);
+		if (!copied) {
+		    mpeg2dec->bytes_since_tag += size_buffer;
+		    mpeg2dec->chunk_ptr += size_buffer;	/* keep the partial chunk for the next call */
+		    return STATE_BUFFER;
+		}
+	    } else {
+		copied = copy_chunk (mpeg2dec, size_chunk);
+		if (!copied) {
+		    /* filled the chunk buffer without finding a start code */
+		    mpeg2dec->bytes_since_tag += size_chunk;
+		    mpeg2dec->action = seek_chunk;
+		    return STATE_INVALID;
+		}
+	    }
+	    mpeg2dec->bytes_since_tag += copied;
+	    /* a complete slice is buffered: decode it, then latch the next start code */
+	    mpeg2_slice (&(mpeg2dec->decoder), mpeg2dec->code,
+			 mpeg2dec->chunk_start);
+	    mpeg2dec->code = mpeg2dec->buf_start[-1];
+	    mpeg2dec->chunk_ptr = mpeg2dec->chunk_start;
+	}
+	if ((unsigned) (mpeg2dec->code - 1) >= 0xb0 - 1)	/* code is 0x00 or >= 0xb0 */
+	    break;
+	if (seek_chunk (mpeg2dec) == STATE_BUFFER)	/* slice outside the decode window: skip it */
+	    return STATE_BUFFER;
+    }
+
+    mpeg2dec->action = mpeg2_seek_header;
+    switch (mpeg2dec->code) {
+    case 0x00:
+	return mpeg2dec->state;
+    case 0xb3:
+    case 0xb7:
+    case 0xb8:
+	return (mpeg2dec->state == STATE_SLICE) ? STATE_SLICE : STATE_INVALID;
+    default:
+	mpeg2dec->action = seek_chunk;
+	return STATE_INVALID;
+    }
+}
+/* Collect header chunks into the chunk buffer and parse them one start code at a time. Returns STATE_BUFFER when input runs out, STATE_INVALID on a parse error, an over-long chunk or an illegal code/state pair, and otherwise the state reached once a header group has been finalized. */
+mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec)
+{
+    static int (* process_header[]) (mpeg2dec_t * mpeg2dec) = {
+	mpeg2_header_picture, mpeg2_header_extension, mpeg2_header_user_data,
+	mpeg2_header_sequence, NULL, NULL, NULL, NULL, mpeg2_header_gop
+    };
+    int size_buffer, size_chunk, copied;
+    /* process_header is indexed by (code & 0x0b); the mask clears bit 2, so the NULL slots 4..7 are never selected. NOTE(review): the mask can yield 9..11 (codes 0xb9..0xbb), past the 9-entry table - presumably never reached because this function is only entered with 0x00/0xb2/0xb3/0xb5/0xb8; confirm. */
+    mpeg2dec->action = mpeg2_parse_header;
+    mpeg2dec->info.user_data = NULL;	mpeg2dec->info.user_data_len = 0;
+    while (1) {
+	size_buffer = mpeg2dec->buf_end - mpeg2dec->buf_start;
+	size_chunk = (mpeg2dec->chunk_buffer + BUFFER_SIZE -
+		      mpeg2dec->chunk_ptr);
+	if (size_buffer <= size_chunk) {
+	    copied = copy_chunk (mpeg2dec, size_buffer);
+	    if (!copied) {
+		mpeg2dec->bytes_since_tag += size_buffer;
+		mpeg2dec->chunk_ptr += size_buffer;	/* keep the partial chunk for the next call */
+		return STATE_BUFFER;
+	    }
+	} else {
+	    copied = copy_chunk (mpeg2dec, size_chunk);
+	    if (!copied) {
+		/* filled the chunk buffer without finding a start code */
+		mpeg2dec->bytes_since_tag += size_chunk;
+		mpeg2dec->code = 0xb4;
+		mpeg2dec->action = mpeg2_seek_header;
+		return STATE_INVALID;
+	    }
+	}
+	mpeg2dec->bytes_since_tag += copied;
+	/* mpeg2dec->code still holds the start code of the chunk just collected; a nonzero handler result is a parse error */
+	if (process_header[mpeg2dec->code & 0x0b] (mpeg2dec)) {
+	    mpeg2dec->code = mpeg2dec->buf_start[-1];
+	    mpeg2dec->action = mpeg2_seek_header;
+	    return STATE_INVALID;
+	}
+	/* now latch the start code that terminated this chunk and check it against the current state */
+	mpeg2dec->code = mpeg2dec->buf_start[-1];
+	switch (RECEIVED (mpeg2dec->code, mpeg2dec->state)) {
+
+	/* state transition after a sequence header */
+	case RECEIVED (0x00, STATE_SEQUENCE):
+	case RECEIVED (0xb8, STATE_SEQUENCE):
+	    mpeg2_header_sequence_finalize (mpeg2dec);
+	    break;
+
+	/* other legal state transitions */
+	case RECEIVED (0x00, STATE_GOP):
+	    mpeg2_header_gop_finalize (mpeg2dec);
+	    break;
+	case RECEIVED (0x01, STATE_PICTURE):
+	case RECEIVED (0x01, STATE_PICTURE_2ND):
+	    mpeg2_header_picture_finalize (mpeg2dec, mpeg2_accels);
+	    mpeg2dec->action = mpeg2_header_slice_start;
+	    break;
+
+	/* legal headers within a given state */
+	case RECEIVED (0xb2, STATE_SEQUENCE):
+	case RECEIVED (0xb2, STATE_GOP):
+	case RECEIVED (0xb2, STATE_PICTURE):
+	case RECEIVED (0xb2, STATE_PICTURE_2ND):
+	case RECEIVED (0xb5, STATE_SEQUENCE):
+	case RECEIVED (0xb5, STATE_PICTURE):
+	case RECEIVED (0xb5, STATE_PICTURE_2ND):
+	    mpeg2dec->chunk_ptr = mpeg2dec->chunk_start;	/* reuse the chunk area for the next header */
+	    continue;
+
+	default:
+	    mpeg2dec->action = mpeg2_seek_header;
+	    return STATE_INVALID;
+	}
+	/* a header group was finalized: rewind the chunk buffer and report the new state */
+	mpeg2dec->chunk_start = mpeg2dec->chunk_ptr = mpeg2dec->chunk_buffer;
+	mpeg2dec->user_data_len = 0;
+	return mpeg2dec->state;
+    }
+}
+
+int mpeg2_convert (mpeg2dec_t * mpeg2dec, mpeg2_convert_t convert, void * arg)
+{
+    mpeg2_convert_init_t convert_init;
+    int error;
+
+    error = convert (MPEG2_CONVERT_SET, NULL, &(mpeg2dec->sequence), 0,
+		     mpeg2_accels, arg, &convert_init);
+    if (!error) {
+	mpeg2dec->convert = convert;
+	mpeg2dec->convert_arg = arg;
+	mpeg2dec->convert_id_size = convert_init.id_size;
+	mpeg2dec->convert_stride = 0;
+    }
+    return error;
+}
+/* Negotiate the output stride: raise it to the coded width for direct output, or let the conversion callback choose; returns the stride in effect. */
+int mpeg2_stride (mpeg2dec_t * mpeg2dec, int stride)
+{
+    mpeg2_convert_init_t init;
+    if (mpeg2dec->convert) {
+	/* the conversion callback picks the stride and reports id_size */
+	stride = mpeg2dec->convert (MPEG2_CONVERT_STRIDE, NULL,
+				    &mpeg2dec->sequence, stride, mpeg2_accels,
+				    mpeg2dec->convert_arg, &init);
+	mpeg2dec->convert_stride = stride;
+	mpeg2dec->convert_id_size = init.id_size;
+	return stride;
+    }
+    /* no converter: the stride may not be narrower than sequence.width */
+    if (stride < (int) mpeg2dec->sequence.width)
+	stride = mpeg2dec->sequence.width;
+    mpeg2dec->decoder.stride_frame = stride;
+    return stride;
+}
+/* Attach three caller-supplied plane pointers and an opaque id to the selected frame buffer slot. */
+void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id)
+{
+    mpeg2_fbuf_t * slot;
+    int plane;
+    if (!mpeg2dec->custom_fbuf) {
+	/* take the next fbuf_alloc entry and record it as user-supplied */
+	slot = &mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index].fbuf;
+	mpeg2dec->alloc_index_user = ++mpeg2dec->alloc_index;
+    } else {
+	const int is_b = (mpeg2dec->decoder.coding_type == PIC_FLAG_CODING_TYPE_B);
+	if (mpeg2dec->state == STATE_SEQUENCE) {
+	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
+	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
+	}
+	mpeg2_set_fbuf (mpeg2dec, is_b);
+	slot = mpeg2dec->fbuf[0];
+    }
+    for (plane = 0; plane < 3; plane++)
+	slot->buf[plane] = buf[plane];
+    slot->id = id;
+}
+/* Record the caller's custom_fbuf setting; mpeg2_set_buf() and mpeg2_header_state_init() branch on it. */
+void mpeg2_custom_fbuf (mpeg2dec_t * mpeg2dec, int custom_fbuf)
+{
+    mpeg2dec->custom_fbuf = custom_fbuf;
+}
+/* Nonzero skip makes the slice window empty so mpeg2_parse() decodes no slice; zero restores the window to slice codes 1..0xaf. */
+void mpeg2_skip (mpeg2dec_t * mpeg2dec, int skip)
+{
+    mpeg2dec->nb_decode_slices = (skip != 0) ? 0 : 0xaf;
+    mpeg2dec->first_decode_slice = 1;
+}
+/* Restrict decoding to slice codes in [start, end): start is clamped to 1..0xb0 and end to start..0xb0. */
+void mpeg2_slice_region (mpeg2dec_t * mpeg2dec, int start, int end)
+{
+    if (start < 1) start = 1; else if (start > 0xb0) start = 0xb0;
+    if (end < start) end = start; else if (end > 0xb0) end = 0xb0;
+    mpeg2dec->nb_decode_slices = end - start;
+    mpeg2dec->first_decode_slice = start;
+}
+/* Push a new (tag, tag2) pair: the old current pair becomes the previous pair, num_tags grows by one and bytes_since_tag restarts at 0. */
+void mpeg2_tag_picture (mpeg2dec_t * mpeg2dec, uint32_t tag, uint32_t tag2)
+{
+    mpeg2dec->bytes_since_tag = 0;
+    mpeg2dec->num_tags += 1;
+    mpeg2dec->tag2_previous = mpeg2dec->tag2_current;
+    mpeg2dec->tag2_current = tag2;
+    mpeg2dec->tag_previous = mpeg2dec->tag_current;
+    mpeg2dec->tag_current = tag;
+}
+/* One-time selection of acceleration flags: the first call detects and initialises, later calls ignore accel and return the cached flags (without MPEG2_ACCEL_DETECT). */
+uint32_t mpeg2_accel (uint32_t accel)
+{
+    if (mpeg2_accels)	/* nonzero once the first call has stored its result */
+	return mpeg2_accels & ~MPEG2_ACCEL_DETECT;
+    mpeg2_accels = MPEG2_ACCEL_DETECT | mpeg2_detect_accel (accel);
+    mpeg2_cpu_state_init (mpeg2_accels);
+    mpeg2_idct_init (mpeg2_accels);
+    mpeg2_mc_init (mpeg2_accels);
+    return mpeg2_accels & ~MPEG2_ACCEL_DETECT;
+}
+/* Return the parser to its start-up state; a nonzero full_reset additionally forgets the sequence and re-initialises header state. */
+void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset)
+{
+    mpeg2_info_t * const info = &mpeg2dec->info;
+    /* empty input window, and a code (0xb4) that makes mpeg2_seek_header() search */
+    mpeg2dec->buf_end = mpeg2dec->buf_start = NULL;
+    mpeg2dec->shift = 0xffffff00;
+    mpeg2dec->code = 0xb4;
+    mpeg2dec->action = mpeg2_seek_header;
+    mpeg2dec->state = STATE_INVALID;
+    mpeg2dec->num_tags = 0;
+    mpeg2dec->first = 1;
+    mpeg2_reset_info (info);
+    info->gop = NULL;
+    info->user_data = NULL;
+    info->user_data_len = 0;
+    if (full_reset) {
+	info->sequence = NULL;
+	mpeg2_header_state_init (mpeg2dec);
+    }
+}
+/* Allocate and initialise a decoder instance. Returns NULL when either the decoder structure or its chunk buffer cannot be allocated; release a non-NULL result with mpeg2_close(). */
+mpeg2dec_t * mpeg2_init (void)
+{
+    mpeg2dec_t * mpeg2dec;
+
+    mpeg2_accel (MPEG2_ACCEL_DETECT);
+    mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
+					    MPEG2_ALLOC_MPEG2DEC);
+    if (mpeg2dec == NULL)
+	return NULL;
+
+    mpeg2dec->chunk_buffer = (uint8_t *) mpeg2_malloc (BUFFER_SIZE + 4,
+						       MPEG2_ALLOC_CHUNK);
+    if (mpeg2dec->chunk_buffer == NULL) {	/* never hand out a decoder that cannot buffer chunks */
+	mpeg2_free (mpeg2dec);
+	return NULL;
+    }
+    memset (mpeg2dec->decoder.DCTblock, 0, 64 * sizeof (int16_t));
+    memset (mpeg2dec->quantizer_matrix, 0, 4 * 64 * sizeof (uint8_t));
+    mpeg2dec->sequence.width = (unsigned)-1;	/* "no sequence yet" marker; must be set before mpeg2_reset() */
+    mpeg2_reset (mpeg2dec, 1);
+    return mpeg2dec;
+}
+/* Destroy a decoder: release buffers tied to the current sequence (if any), then the chunk buffer, then the decoder structure itself. */
+void mpeg2_close (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_header_state_init (mpeg2dec);	/* frees frame/conversion buffers when a sequence is active */
+    mpeg2_free (mpeg2dec->chunk_buffer);
+    mpeg2_free (mpeg2dec);	/* must come last: the calls above read through mpeg2dec */
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/header.c b/src/video_dec/libmpeg2new/libmpeg2/header.c
new file mode 100644
index 000000000..935a50aa3
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/header.c
@@ -0,0 +1,961 @@
+/*
+ * header.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Regis Duchesne <hpreg@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+#include <stdlib.h>	/* defines NULL */
+#include <string.h>	/* memcmp */
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+
+#define SEQ_EXT 2
+#define SEQ_DISPLAY_EXT 4
+#define QUANT_MATRIX_EXT 8
+#define COPYRIGHT_EXT 0x10
+#define PIC_DISPLAY_EXT 0x80
+#define PIC_CODING_EXT 0x100
+
+/* Default intra quantizer matrix, in zig-zag order; mpeg2_header_sequence() installs it when the sequence header carries no intra matrix of its own. */
+static const uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
+    8,
+    16, 16,
+    19, 16, 19,
+    22, 22, 22, 22,
+    22, 22, 26, 24, 26,
+    27, 27, 27, 26, 26, 26,
+    26, 27, 27, 27, 29, 29, 29,
+    34, 34, 34, 29, 29, 29, 27, 27,
+    29, 29, 32, 32, 34, 34, 37,
+    38, 37, 35, 35, 34, 35,
+    38, 38, 40, 40, 40,
+    48, 48, 46, 46,
+    56, 56, 58,
+    69, 69,
+    83
+};
+/* Entry i gives the position inside the 64-entry block of the i-th coefficient in scan order; mpeg2_header_sequence() uses it to place transmitted matrix entries, and mpeg2_header_picture() selects it as the default decoder scan. */
+uint8_t mpeg2_scan_norm[64] ATTR_ALIGN(16) = {
+    /* Zig-Zag scan pattern */
+     0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+/* Same layout as mpeg2_scan_norm (entry i = block position of the i-th coefficient in scan order), for the alternate scan. */
+uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) = {
+    /* Alternate scan pattern */
+     0, 8,  16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
+    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
+    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
+    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
+};
+/* Tear down everything tied to the current sequence (if one exists) and return header-level decoder state to its defaults. */
+void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec)
+{
+    int n, plane;
+
+    if (mpeg2dec->sequence.width != (unsigned)-1) {
+	/* (unsigned)-1 in sequence.width marks "no sequence" from here on */
+	mpeg2dec->sequence.width = (unsigned)-1;
+	/* only entries [alloc_index_user, alloc_index) are released here */
+	if (!mpeg2dec->custom_fbuf)
+	    for (n = mpeg2dec->alloc_index_user; n < mpeg2dec->alloc_index; n++)
+		for (plane = 0; plane < 3; plane++)
+		    mpeg2_free (mpeg2dec->fbuf_alloc[n].fbuf.buf[plane]);
+	if (mpeg2dec->convert_start)
+	    for (n = 0; n < 3; n++)
+		for (plane = 0; plane < 3; plane++)
+		    mpeg2_free (mpeg2dec->yuv_buf[n][plane]);
+	if (mpeg2dec->decoder.convert_id)
+	    mpeg2_free (mpeg2dec->decoder.convert_id);
+    }
+
+    /* decoder sub-state */
+    mpeg2dec->decoder.convert_id = NULL;
+    mpeg2dec->decoder.convert = NULL;
+    mpeg2dec->decoder.coding_type = I_TYPE;
+    /* frame buffer bookkeeping */
+    for (n = 0; n < 3; n++)
+	mpeg2dec->fbuf[n] = &mpeg2dec->fbuf_alloc[n].fbuf;
+    mpeg2dec->alloc_index_user = 0;
+    mpeg2dec->alloc_index = 0;
+    mpeg2dec->custom_fbuf = 0;
+    mpeg2dec->yuv_index = 0;
+    /* picture pointer, slice window and conversion hooks */
+    mpeg2dec->picture = mpeg2dec->pictures;
+    mpeg2dec->first = 1;
+    mpeg2dec->first_decode_slice = 1;
+    mpeg2dec->nb_decode_slices = 0xb0 - 1;
+    mpeg2dec->convert_start = NULL;
+    mpeg2dec->convert = NULL;
+}
+/* Clear the picture and frame-buffer pointers published through info; sequence, gop and user_data fields are not touched. */
+void mpeg2_reset_info (mpeg2_info_t * info)
+{
+    info->current_picture = info->display_picture = NULL;
+    info->current_picture_2nd = info->display_picture_2nd = NULL;
+    info->discard_fbuf = info->display_fbuf = info->current_fbuf = NULL;
+}
+/* Expose buffered user data through info; nothing is touched when user_data_len is 0. The published length is the internal count minus 3. */
+static void info_user_data (mpeg2dec_t * mpeg2dec)
+{
+    if (mpeg2dec->user_data_len == 0)
+	return;
+    mpeg2dec->info.user_data_len = mpeg2dec->user_data_len - 3;
+    mpeg2dec->info.user_data = mpeg2dec->chunk_buffer;
+}
+/* Parse a sequence header chunk from chunk_start into new_sequence and new_quantizer_matrix[0..1]. Returns 1 on a missing marker bit or a zero picture dimension, 0 otherwise; on success state becomes STATE_SEQUENCE and ext_state SEQ_EXT. */
+int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    mpeg2_sequence_t * sequence = &(mpeg2dec->new_sequence);
+    static unsigned int frame_period[16] = {
+	0, 1126125, 1125000, 1080000, 900900, 900000, 540000, 450450, 450000,
+	/* unofficial: xing 15 fps */
+	1800000,
+	/* unofficial: libmpeg3 "Unofficial economy rates" 5/10/12/15 fps */
+	5400000, 2700000, 2250000, 1800000, 0, 0
+    };
+    int i;
+
+    if ((buffer[6] & 0x20) != 0x20)	/* missing marker_bit */
+	return 1;
+    /* bytes 0..2: 12-bit picture width followed by 12-bit picture height */
+    i = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    if (! (sequence->display_width = sequence->picture_width = i >> 12))
+	return 1;
+    if (! (sequence->display_height = sequence->picture_height = i & 0xfff))
+	return 1;
+    sequence->width = (sequence->picture_width + 15) & ~15;	/* round up to a multiple of 16 */
+    sequence->height = (sequence->picture_height + 15) & ~15;
+    sequence->chroma_width = sequence->width >> 1;
+    sequence->chroma_height = sequence->height >> 1;
+
+    sequence->flags = (SEQ_FLAG_PROGRESSIVE_SEQUENCE |
+		       SEQ_VIDEO_FORMAT_UNSPECIFIED);
+    /* pixel_width holds the raw 4-bit aspect code for now; finalize_sequence() converts it */
+    sequence->pixel_width = buffer[3] >> 4;	/* aspect ratio */
+    sequence->frame_period = frame_period[buffer[3] & 15];
+
+    sequence->byte_rate = (buffer[4]<<10) | (buffer[5]<<2) | (buffer[6]>>6);
+
+    sequence->vbv_buffer_size = ((buffer[6]<<16)|(buffer[7]<<8))&0x1ff800;
+
+    if (buffer[7] & 4)
+	sequence->flags |= SEQ_FLAG_CONSTRAINED_PARAMETERS;
+    /* copy_matrix = 3: finalize_matrix() is to pick up both matrix 0 and matrix 1 */
+    mpeg2dec->copy_matrix = 3;
+    if (buffer[7] & 2) {
+	for (i = 0; i < 64; i++)
+	    mpeg2dec->new_quantizer_matrix[0][mpeg2_scan_norm[i]] =
+		(buffer[i+7] << 7) | (buffer[i+8] >> 1);	/* each entry straddles two bytes */
+	buffer += 64;	/* so the flag test below reads the byte after the matrix */
+    } else
+	for (i = 0; i < 64; i++)
+	    mpeg2dec->new_quantizer_matrix[0][mpeg2_scan_norm[i]] =
+		default_intra_quantizer_matrix[i];
+
+    if (buffer[7] & 1)
+	for (i = 0; i < 64; i++)
+	    mpeg2dec->new_quantizer_matrix[1][mpeg2_scan_norm[i]] =
+		buffer[i+8];
+    else
+	memset (mpeg2dec->new_quantizer_matrix[1], 16, 64);	/* default: every entry is 16 */
+
+    sequence->profile_level_id = 0x80;
+    sequence->colour_primaries = 0;
+    sequence->transfer_characteristics = 0;
+    sequence->matrix_coefficients = 0;
+
+    mpeg2dec->ext_state = SEQ_EXT;
+    mpeg2dec->state = STATE_SEQUENCE;
+    mpeg2dec->display_offset_x = mpeg2dec->display_offset_y = 0;
+
+    return 0;
+}
+/* Parse a sequence extension from chunk_start and fold it into new_sequence (size extension bits, MPEG-2 flags, chroma format, rate and buffer extensions, frame period). Returns 1 on a missing marker bit or an invalid chroma format, 0 otherwise; on success ext_state becomes SEQ_DISPLAY_EXT. */
+static int sequence_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    mpeg2_sequence_t * sequence = &(mpeg2dec->new_sequence);
+    uint32_t flags;
+
+    if (!(buffer[3] & 1))	/* missing marker_bit */
+	return 1;
+
+    sequence->profile_level_id = (buffer[0] << 4) | (buffer[1] >> 4);
+    /* the 2-bit size extensions supply bits 12..13 of the picture size */
+    sequence->display_width = sequence->picture_width +=
+	((buffer[1] << 13) | (buffer[2] << 5)) & 0x3000;
+    sequence->display_height = sequence->picture_height +=
+	(buffer[2] << 7) & 0x3000;
+    sequence->width = (sequence->picture_width + 15) & ~15;
+    sequence->height = (sequence->picture_height + 15) & ~15;
+    flags = sequence->flags | SEQ_FLAG_MPEG2;
+    if (!(buffer[1] & 8)) {
+	flags &= ~SEQ_FLAG_PROGRESSIVE_SEQUENCE;
+	sequence->height = (sequence->height + 31) & ~31;
+    }
+    if (buffer[5] & 0x80)
+	flags |= SEQ_FLAG_LOW_DELAY;
+    sequence->flags = flags;
+    sequence->chroma_width = sequence->width;
+    sequence->chroma_height = sequence->height;
+    switch (buffer[1] & 6) {
+    case 0:	/* invalid */
+	return 1;
+    case 2:	/* 4:2:0 */
+	sequence->chroma_height >>= 1;	/* fall through: 4:2:0 also halves the width */
+    case 4:	/* 4:2:2 */
+	sequence->chroma_width >>= 1;
+    }
+    /* shift as uint32_t: buffer[2] << 25 would overflow a signed int */
+    sequence->byte_rate += ((((uint32_t) buffer[2]) << 25) |
+			    (((uint32_t) buffer[3]) << 17)) & 0x3ffc0000;
+    sequence->vbv_buffer_size |= buffer[4] << 21;
+    /* buffer[5]: low_delay (bit 7), frame_rate_extension_n (bits 6..5),
+     * frame_rate_extension_d (bits 4..0). The frame rate scales by
+     * (n+1)/(d+1), so the frame period scales by (d+1)/(n+1). */
+    sequence->frame_period =
+	sequence->frame_period * ((buffer[5]&31)+1) / (((buffer[5]>>5)&3)+1);
+    mpeg2dec->ext_state = SEQ_DISPLAY_EXT;
+    return 0;
+}
+/* Parse a sequence display extension from chunk_start: video format, optional colour description and the display size. Returns 1 when the marker bit is missing, else 0. NOTE(review): display_width / display_height are stored without a zero check. */
+static int sequence_display_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    mpeg2_sequence_t * sequence = &(mpeg2dec->new_sequence);
+
+    sequence->flags = ((sequence->flags & ~SEQ_MASK_VIDEO_FORMAT) |
+		       ((buffer[0]<<4) & SEQ_MASK_VIDEO_FORMAT));
+    if (buffer[0] & 1) {
+	sequence->flags |= SEQ_FLAG_COLOUR_DESCRIPTION;
+	sequence->colour_primaries = buffer[1];
+	sequence->transfer_characteristics = buffer[2];
+	sequence->matrix_coefficients = buffer[3];
+	buffer += 3;	/* the size fields below follow the three colour bytes */
+    }
+
+    if (!(buffer[2] & 2))	/* missing marker_bit */
+	return 1;
+    /* 14-bit width, marker bit, 14-bit height, packed across bytes 1..4 */
+    sequence->display_width = (buffer[1] << 6) | (buffer[2] >> 2);
+    sequence->display_height =
+	((buffer[2]& 1 ) << 13) | (buffer[3] << 5) | (buffer[4] >> 3);
+
+    return 0;
+}
+/* Reduce the fraction *u / *v to lowest terms in place; a 0/0 pair is left untouched instead of dividing by zero. */
+static inline void simplify (unsigned int * u, unsigned int * v)
+{
+    unsigned int a = *u, b = *v, tmp;
+    while (a) {	/* Euclid: on exit b holds gcd (*u, *v) */
+	tmp = a;	a = b % tmp;	b = tmp;
+    }
+    if (b) {	/* b == 0 only when both inputs were 0 */
+	*u /= b;	*v /= b;
+    }
+}
+/* Post-process a parsed sequence: scale byte_rate by 50 and turn the raw aspect code held in pixel_width into a reduced pixel_width : pixel_height ratio (0 : 0 for illegal codes). */
+static inline void finalize_sequence (mpeg2_sequence_t * sequence)
+{
+    int width;
+    int height;
+
+    sequence->byte_rate *= 50;	/* NOTE(review): presumably 400 bit/s header units -> bytes/s; confirm */
+    /* on entry pixel_width still holds the aspect code stored by mpeg2_header_sequence() */
+    if (sequence->flags & SEQ_FLAG_MPEG2) {
+	switch (sequence->pixel_width) {
+	case 1:		/* square pixels */
+	    sequence->pixel_width = sequence->pixel_height = 1;	return;
+	case 2:		/* 4:3 aspect ratio */
+	    width = 4; height = 3;	break;
+	case 3:		/* 16:9 aspect ratio */
+	    width = 16; height = 9;	break;
+	case 4:		/* 2.21:1 aspect ratio */
+	    width = 221; height = 100;	break;
+	default:	/* illegal */
+	    sequence->pixel_width = sequence->pixel_height = 0;	return;
+	}
+	width *= sequence->display_height;	/* pixel ratio = (ratio_w * display_height) : (ratio_h * display_width) */
+	height *= sequence->display_width;
+
+    } else {
+	if (sequence->byte_rate == 50 * 0x3ffff) 
+	    sequence->byte_rate = 0;        /* mpeg-1 VBR */ 
+
+	switch (sequence->pixel_width) {
+	case 0:	case 15:	/* illegal */
+	    sequence->pixel_width = sequence->pixel_height = 0;		return;
+	case 1:	/* square pixels */
+	    sequence->pixel_width = sequence->pixel_height = 1;		return;
+	case 3:	/* 720x576 16:9 */
+	    sequence->pixel_width = 64;	sequence->pixel_height = 45;	return;
+	case 6:	/* 720x480 16:9 */
+	    sequence->pixel_width = 32;	sequence->pixel_height = 27;	return;
+	case 8: /* BT.601 625 lines 4:3 */
+	    sequence->pixel_width = 59;	sequence->pixel_height = 54;	return;
+	case 12: /* BT.601 525 lines 4:3 */
+	    sequence->pixel_width = 10;	sequence->pixel_height = 11;	return;
+	default:
+	    height = 88 * sequence->pixel_width + 1171;
+	    width = 2000;
+	}
+    }
+    /* store the computed ratio and reduce it to lowest terms */
+    sequence->pixel_width = width;
+    sequence->pixel_height = height;
+    simplify (&sequence->pixel_width, &sequence->pixel_height);
+}
+/* Guess a pixel aspect ratio for pictures whose size matches a known video mode. The sequence's own pixel_width / pixel_height are always stored first; returns 0 when no guess is made (leaving those values), otherwise overwrites them with the reduced guess and returns 1 when the scaled height is 576, else 2. */
+int mpeg2_guess_aspect (const mpeg2_sequence_t * sequence,
+			unsigned int * pixel_width,
+			unsigned int * pixel_height)
+{
+    static struct {
+	unsigned int width, height;
+    } video_modes[] = {
+	{720, 576}, /* 625 lines, 13.5 MHz (D1, DV, DVB, DVD) */
+	{704, 576}, /* 625 lines, 13.5 MHz (1/1 D1, DVB, DVD, 4CIF) */
+	{544, 576}, /* 625 lines, 10.125 MHz (DVB, laserdisc) */
+	{528, 576}, /* 625 lines, 10.125 MHz (3/4 D1, DVB, laserdisc) */
+	{480, 576}, /* 625 lines, 9 MHz (2/3 D1, DVB, SVCD) */
+	{352, 576}, /* 625 lines, 6.75 MHz (D2, 1/2 D1, CVD, DVB, DVD) */
+	{352, 288}, /* 625 lines, 6.75 MHz, 1 field (D4, VCD, DVB, DVD, CIF) */
+	{176, 144}, /* 625 lines, 3.375 MHz, half field (QCIF) */
+	{720, 486}, /* 525 lines, 13.5 MHz (D1) */
+	{704, 486}, /* 525 lines, 13.5 MHz */
+	{720, 480}, /* 525 lines, 13.5 MHz (DV, DSS, DVD) */
+	{704, 480}, /* 525 lines, 13.5 MHz (1/1 D1, ATSC, DVD) */
+	{544, 480}, /* 525 lines. 10.125 MHz (DSS, laserdisc) */
+	{528, 480}, /* 525 lines. 10.125 MHz (3/4 D1, laserdisc) */
+	{480, 480}, /* 525 lines, 9 MHz (2/3 D1, SVCD) */
+	{352, 480}, /* 525 lines, 6.75 MHz (D2, 1/2 D1, CVD, DVD) */
+	{352, 240}  /* 525  lines. 6.75 MHz, 1 field (D4, VCD, DSS, DVD) */
+    };
+    unsigned int width, height, pix_width, pix_height, i, DAR_16_9;
+    /* default answer: the ratio already stored in the sequence */
+    *pixel_width = sequence->pixel_width;
+    *pixel_height = sequence->pixel_height;
+    width = sequence->picture_width;
+    height = sequence->picture_height;
+    for (i = 0; i < sizeof (video_modes) / sizeof (video_modes[0]); i++)
+	if (width == video_modes[i].width && height == video_modes[i].height)
+	    break;
+    if (i == sizeof (video_modes) / sizeof (video_modes[0]) ||
+	(sequence->pixel_width == 1 && sequence->pixel_height == 1) ||
+	width != sequence->display_width || height != sequence->display_height)
+	return 0;	/* unknown mode, square pixels, or display size differs from picture size */
+    /* double the height until it reaches 480 and the width until it exceeds 352, remembering the factors */
+    for (pix_height = 1; height * pix_height < 480; pix_height <<= 1);
+    height *= pix_height;
+    for (pix_width = 1; width * pix_width <= 352; pix_width <<= 1);
+    width *= pix_width;
+
+    if (! (sequence->flags & SEQ_FLAG_MPEG2)) {
+	static unsigned int mpeg1_check[2][2] = {{11, 54}, {27, 45}};
+	DAR_16_9 = (sequence->pixel_height == 27 ||
+		    sequence->pixel_height == 45);
+	if (width < 704 ||
+	    sequence->pixel_height != mpeg1_check[DAR_16_9][height == 576])
+	    return 0;	/* stored ratio is not the one expected for this line count */
+    } else {
+	DAR_16_9 = (3 * sequence->picture_width * sequence->pixel_width >
+		    4 * sequence->picture_height * sequence->pixel_height);	/* wider than 4:3 */
+	switch (width) {
+	case 528: case 544:	pix_width *= 4; pix_height *= 3; break;
+	case 480:		pix_width *= 3; pix_height *= 2; break;
+	}
+    }
+    if (DAR_16_9) {
+	pix_width *= 4; pix_height *= 3;
+    }
+    if (height == 576) {
+	pix_width *= 59; pix_height *= 54;
+    } else {
+	pix_width *= 10; pix_height *= 11;
+    }
+    *pixel_width = pix_width;
+    *pixel_height = pix_height;
+    simplify (pixel_width, pixel_height);
+    return (height == 576) ? 1 : 2;
+}
+/* Adopt new_quantizer_matrix[index] if it differs from the active matrix, and mark the slot with scaled[index] = -1. */
+static void copy_matrix (mpeg2dec_t * mpeg2dec, int index)
+{
+    void * const active = mpeg2dec->quantizer_matrix[index];
+    const void * const pending = mpeg2dec->new_quantizer_matrix[index];
+    if (memcmp (active, pending, 64) == 0)
+	return;		/* identical: keep the matrix and its scaled[] entry */
+    memcpy (active, pending, 64);
+    mpeg2dec->scaled[index] = -1;
+}
+/* Apply the matrix updates requested in copy_matrix. For i = 0, 1: bit i refreshes quantizer_matrix[i]; bit i+2 adopts new matrix i+2 when it differs from matrix i and sets chroma_quantizer[i] to quantizer_prescale[i+2], otherwise chroma_quantizer[i] is set to quantizer_prescale[i]. */
+static void finalize_matrix (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+    int i;
+
+    for (i = 0; i < 2; i++) {
+	if (mpeg2dec->copy_matrix & (1 << i))
+	    copy_matrix (mpeg2dec, i);
+	if ((mpeg2dec->copy_matrix & (4 << i)) &&
+	    memcmp (mpeg2dec->quantizer_matrix[i],
+		    mpeg2dec->new_quantizer_matrix[i+2], 64)) {
+	    copy_matrix (mpeg2dec, i + 2);
+	    decoder->chroma_quantizer[i] = decoder->quantizer_prescale[i+2];
+	} else if (mpeg2dec->copy_matrix & (5 << i))	/* 5 << i covers bit i and bit i+2 */
+	    decoder->chroma_quantizer[i] = decoder->quantizer_prescale[i];
+    }
+}
+/* Deferred action queued by mpeg2_header_sequence_finalize() after an incompatible sequence change: reset header state and restart on new_sequence. */
+static mpeg2_state_t invalid_end_action (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_info_t * const info = &mpeg2dec->info;
+    mpeg2_reset_info (info);
+    info->gop = NULL;
+    info_user_data (mpeg2dec);
+    mpeg2_header_state_init (mpeg2dec);	/* must run before the copy: it overwrites sequence.width */
+    mpeg2dec->sequence = mpeg2dec->new_sequence;
+    mpeg2dec->action = mpeg2_seek_header;
+    return (mpeg2dec->state = STATE_SEQUENCE);
+}
+/* Finish a sequence header group: finalize new_sequence and the matrices, configure the slice decoder, then either adopt the sequence (first, repeated or modified) or, on an incompatible change, schedule invalid_end_action and report STATE_INVALID_END. */
+void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_sequence_t * sequence = &(mpeg2dec->new_sequence);
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+
+    finalize_sequence (sequence);
+    finalize_matrix (mpeg2dec);
+
+    decoder->mpeg1 = !(sequence->flags & SEQ_FLAG_MPEG2);
+    decoder->width = sequence->width;
+    decoder->height = sequence->height;
+    decoder->vertical_position_extension = (sequence->picture_height > 2800);
+    decoder->chroma_format = ((sequence->chroma_width == sequence->width) +
+			      (sequence->chroma_height == sequence->height));	/* 0, 1 or 2 full-size chroma dimensions */
+
+    if (mpeg2dec->sequence.width != (unsigned)-1) {
+	/*
+	 * According to 6.1.1.6, repeat sequence headers should be
+	 * identical to the original. However some encoders don't
+	 * respect that and change various fields (including bitrate
+	 * and aspect ratio) in the repeat sequence headers. So we
+	 * choose to be as conservative as possible and only restart
+	 * the decoder if the width, height, chroma_width,
+	 * chroma_height or low_delay flag are modified.
+	 */
+	if (sequence->width != mpeg2dec->sequence.width ||
+	    sequence->height != mpeg2dec->sequence.height ||
+	    sequence->chroma_width != mpeg2dec->sequence.chroma_width ||
+	    sequence->chroma_height != mpeg2dec->sequence.chroma_height ||
+	    ((sequence->flags ^ mpeg2dec->sequence.flags) &
+	     SEQ_FLAG_LOW_DELAY)) {
+	    decoder->stride_frame = sequence->width;
+	    mpeg2_header_end (mpeg2dec);
+	    mpeg2dec->action = invalid_end_action;	/* the new sequence is adopted on the next parse call */
+	    mpeg2dec->state = STATE_INVALID_END;
+	    return;
+	}
+	mpeg2dec->state = (memcmp (&(mpeg2dec->sequence), sequence,
+				   sizeof (mpeg2_sequence_t)) ?
+			   STATE_SEQUENCE_MODIFIED : STATE_SEQUENCE_REPEATED);
+    } else
+	decoder->stride_frame = sequence->width;	/* first sequence: default stride is the coded width */
+    mpeg2dec->sequence = *sequence;
+    mpeg2_reset_info (&(mpeg2dec->info));
+    mpeg2dec->info.sequence = &(mpeg2dec->sequence);
+    mpeg2dec->info.gop = NULL;
+    info_user_data (mpeg2dec);
+}
+/* Parse a GOP header from chunk_start into new_gop and set STATE_GOP; returns 1 (leaving everything untouched) when bit 3 of byte 1 is clear, else 0. */
+int mpeg2_header_gop (mpeg2dec_t * mpeg2dec)
+{
+    const uint8_t * const hdr = mpeg2dec->chunk_start;
+    mpeg2_gop_t * const out = &mpeg2dec->new_gop;
+
+    if ((hdr[1] & 8) == 0)	/* presumably the time-code marker bit */
+	return 1;
+    out->flags = (hdr[0] >> 7) | ((hdr[3] >> 4) & 6);
+    out->hours = (hdr[0] >> 2) & 31;
+    out->minutes = ((hdr[0] << 4) | (hdr[1] >> 4)) & 63;
+    out->seconds = ((hdr[1] << 3) | (hdr[2] >> 5)) & 63;
+    out->pictures = ((hdr[2] << 1) | (hdr[3] >> 7)) & 63;
+    mpeg2dec->state = STATE_GOP;
+    return 0;
+}
+/* Commit the parsed GOP header and publish it, plus any pending user data, through info. */
+void mpeg2_header_gop_finalize (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_reset_info (&mpeg2dec->info);
+    mpeg2dec->gop = mpeg2dec->new_gop;
+    mpeg2dec->info.gop = &mpeg2dec->gop;
+    info_user_data (mpeg2dec);
+}
+/* Make fbuf[0] the first fbuf_alloc entry that neither fbuf[1] nor fbuf[2] points at, and publish it as info.current_fbuf; for B pictures or low-delay sequences it is also published as display_fbuf (and as discard_fbuf when b_type or a converter is set). */
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int b_type)
+{
+    int i;
+
+    for (i = 0; i < 3; i++)
+	if (mpeg2dec->fbuf[1] != &mpeg2dec->fbuf_alloc[i].fbuf &&
+	    mpeg2dec->fbuf[2] != &mpeg2dec->fbuf_alloc[i].fbuf) {
+	    mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[i].fbuf;
+	    mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
+	    if (b_type || (mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
+		if (b_type || mpeg2dec->convert)
+		    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
+		mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
+	    }
+	    break;	/* only the first free entry is used */
+	}
+}
+/* Parse a picture header from chunk_start into new_picture, attach a pending tag pair if there is one, and reset the per-picture decoder settings to their defaults. Always returns 0. */
+int mpeg2_header_picture (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    mpeg2_picture_t * picture = &(mpeg2dec->new_picture);
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+    int type;
+    /* a picture header arriving in STATE_SLICE_1ST is treated as the second picture (STATE_PICTURE_2ND) */
+    mpeg2dec->state = ((mpeg2dec->state != STATE_SLICE_1ST) ?
+		       STATE_PICTURE : STATE_PICTURE_2ND);
+    mpeg2dec->ext_state = PIC_CODING_EXT;
+
+    picture->temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
+
+    type = (buffer [1] >> 3) & 7;
+    if (type == PIC_FLAG_CODING_TYPE_P || type == PIC_FLAG_CODING_TYPE_B) {
+	/* forward_f_code and backward_f_code - used in mpeg1 only */
+	decoder->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
+	decoder->f_motion.f_code[0] =
+	    (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
+	decoder->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
+	decoder->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
+    }
+
+    picture->flags = PIC_FLAG_PROGRESSIVE_FRAME | type;
+    picture->tag = picture->tag2 = 0;
+    if (mpeg2dec->num_tags) {
+	if (mpeg2dec->bytes_since_tag >= mpeg2dec->chunk_ptr - buffer + 4) {
+	    mpeg2dec->num_tags = 0;	/* enough bytes since the latest tag: use the current pair */
+	    picture->tag = mpeg2dec->tag_current;
+	    picture->tag2 = mpeg2dec->tag2_current;
+	    picture->flags |= PIC_FLAG_TAGS;
+	} else if (mpeg2dec->num_tags > 1) {
+	    mpeg2dec->num_tags = 1;	/* otherwise fall back to the previous pair, keeping the latest pending */
+	    picture->tag = mpeg2dec->tag_previous;
+	    picture->tag2 = mpeg2dec->tag2_previous;
+	    picture->flags |= PIC_FLAG_TAGS;
+	}
+    }
+    picture->nb_fields = 2;
+    picture->display_offset[0].x = picture->display_offset[1].x =
+	picture->display_offset[2].x = mpeg2dec->display_offset_x;
+    picture->display_offset[0].y = picture->display_offset[1].y =
+	picture->display_offset[2].y = mpeg2dec->display_offset_y;
+
+    /* XXXXXX decode extra_information_picture as well */
+    /* defaults below stay in force unless a later extension changes them */
+    mpeg2dec->q_scale_type = 0;
+    decoder->intra_dc_precision = 7;
+    decoder->frame_pred_frame_dct = 1;
+    decoder->concealment_motion_vectors = 0;
+    decoder->scan = mpeg2_scan_norm;
+    decoder->picture_structure = FRAME_PICTURE;
+    mpeg2dec->copy_matrix = 0;
+
+    return 0;
+}
+
+/*
+ * Parse a picture coding extension from the current chunk.
+ *
+ * Updates the motion f_codes and other per-picture decoder settings,
+ * sets new_picture.nb_fields and new_picture.flags, and enables the
+ * picture display, copyright and quant matrix extensions in ext_state.
+ *
+ * Returns 1 if the picture_structure field holds a value other than
+ * TOP_FIELD, BOTTOM_FIELD or FRAME_PICTURE, 0 otherwise.
+ */
+static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    mpeg2_picture_t * picture = &(mpeg2dec->new_picture);
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+    uint32_t flags;
+
+    /* pre subtract 1 for use later in compute_motion_vector */
+    decoder->f_motion.f_code[0] = (buffer[0] & 15) - 1;
+    decoder->f_motion.f_code[1] = (buffer[1] >> 4) - 1;
+    decoder->b_motion.f_code[0] = (buffer[1] & 15) - 1;
+    decoder->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
+
+    flags = picture->flags;
+    decoder->intra_dc_precision = 7 - ((buffer[2] >> 2) & 3);
+    decoder->picture_structure = buffer[2] & 3;
+    switch (decoder->picture_structure) {
+    case TOP_FIELD:
+	flags |= PIC_FLAG_TOP_FIELD_FIRST;
+	/* fallthrough: both field structures carry a single field */
+    case BOTTOM_FIELD:
+	picture->nb_fields = 1;
+	break;
+    case FRAME_PICTURE:
+	if (!(mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)) {
+	    picture->nb_fields = (buffer[3] & 2) ? 3 : 2;
+	    flags |= (buffer[3] & 128) ? PIC_FLAG_TOP_FIELD_FIRST : 0;
+	} else
+	    picture->nb_fields = (buffer[3]&2) ? ((buffer[3]&128) ? 6 : 4) : 2;
+	break;
+    default:
+	return 1;
+    }
+    decoder->top_field_first = buffer[3] >> 7;
+    decoder->frame_pred_frame_dct = (buffer[3] >> 6) & 1;
+    decoder->concealment_motion_vectors = (buffer[3] >> 5) & 1;
+    mpeg2dec->q_scale_type = buffer[3] & 16;
+    decoder->intra_vlc_format = (buffer[3] >> 3) & 1;
+    decoder->scan = (buffer[3] & 4) ? mpeg2_scan_alt : mpeg2_scan_norm;
+    if (!(buffer[4] & 0x80))
+	flags &= ~PIC_FLAG_PROGRESSIVE_FRAME;
+    if (buffer[4] & 0x40)
+	/*
+	 * buffer[4] has bit 6 set here, so shifting it by 26 as a plain
+	 * int would overflow; do the top shift in unsigned arithmetic.
+	 */
+	flags |= ((((uint32_t) buffer[4] << 26) | (buffer[5] << 18) |
+		   (buffer[6] << 10)) & PIC_MASK_COMPOSITE_DISPLAY) |
+	    PIC_FLAG_COMPOSITE_DISPLAY;
+    picture->flags = flags;
+
+    mpeg2dec->ext_state = PIC_DISPLAY_EXT | COPYRIGHT_EXT | QUANT_MATRIX_EXT;
+
+    return 0;
+}
+
+/*
+ * Parse a picture display extension: one (x, y) display offset pair per
+ * position, where the number of positions is nb_fields (halved for
+ * progressive sequences).
+ *
+ * Each offset is followed by a bit that must be set; if either bit of a
+ * pair is clear the function returns 1 immediately.  Parsed offsets are
+ * stored in new_picture.display_offset[] and remembered in
+ * mpeg2dec->display_offset_x/y; remaining entries (up to 3) are filled
+ * with the last remembered values.  Returns 0 on success.
+ */
+static int picture_display_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    mpeg2_picture_t * picture = &(mpeg2dec->new_picture);
+    int i, nb_pos;
+
+    nb_pos = picture->nb_fields;
+    if (mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)
+	nb_pos >>= 1;
+
+    for (i = 0; i < nb_pos; i++) {
+	const uint8_t * p = buffer + 4 * i;
+	int32_t x, y;
+
+	/*
+	 * Assemble the 32-bit window as unsigned (a byte >= 0x80 shifted
+	 * by 24 would overflow a signed int), then reinterpret it as
+	 * signed so the right shift sign-extends the offset.
+	 */
+	x = (int32_t) (((uint32_t) p[0] << 24) | (p[1] << 16) |
+		       (p[2] << 8) | p[3]) >> (11 - 2 * i);
+	y = (int32_t) (((uint32_t) p[2] << 24) | (p[3] << 16) |
+		       (p[4] << 8) | p[5]) >> (10 - 2 * i);
+	/* the lowest bit of each value must be set */
+	if (! (x & y & 1))
+	    return 1;
+	picture->display_offset[i].x = mpeg2dec->display_offset_x = x >> 1;
+	picture->display_offset[i].y = mpeg2dec->display_offset_y = y >> 1;
+    }
+    for (; i < 3; i++) {
+	picture->display_offset[i].x = mpeg2dec->display_offset_x;
+	picture->display_offset[i].y = mpeg2dec->display_offset_y;
+    }
+    return 0;
+}
+
+/*
+ * Commit the picture header parsed into mpeg2dec->new_picture.
+ *
+ * For the first picture of a frame (state == STATE_PICTURE) this picks a
+ * slot in mpeg2dec->pictures, rotates the frame buffer references,
+ * fills mpeg2dec->info, and lazily allocates the convert state and/or
+ * the internal frame buffers.  Otherwise the picture is stored as the
+ * second field picture right after the current one.
+ *
+ * accels is passed through unchanged to the convert callback.
+ *
+ * NOTE(review): none of the mpeg2_malloc() results are checked here.
+ */
+void mpeg2_header_picture_finalize (mpeg2dec_t * mpeg2dec, uint32_t accels)
+{
+    mpeg2_decoder_t * decoder = &(mpeg2dec->decoder);
+    /* coding type of the previous picture, sampled before it is updated */
+    int old_type_b = (decoder->coding_type == B_TYPE);
+    int low_delay = mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY;
+
+    finalize_matrix (mpeg2dec);
+    decoder->coding_type = mpeg2dec->new_picture.flags & PIC_MASK_CODING_TYPE;
+
+    if (mpeg2dec->state == STATE_PICTURE) {
+	mpeg2_picture_t * picture;
+	mpeg2_picture_t * other;
+
+	decoder->second_field = 0;
+
+	/* pictures[] is used as two pairs (index 0 and index 2); one */
+	/* becomes the current picture, the other is kept as "other" */
+	picture = other = mpeg2dec->pictures;
+	if (old_type_b ^ (mpeg2dec->picture < mpeg2dec->pictures + 2))
+	    picture += 2;
+	else
+	    other += 2;
+	mpeg2dec->picture = picture;
+	*picture = mpeg2dec->new_picture;
+
+	/* shift the buffer references unless the previous picture was B */
+	if (!old_type_b) {
+	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
+	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
+	}
+	mpeg2dec->fbuf[0] = NULL;
+	mpeg2_reset_info (&(mpeg2dec->info));
+	mpeg2dec->info.current_picture = picture;
+	mpeg2dec->info.display_picture = picture;
+	if (decoder->coding_type != B_TYPE) {
+	    if (!low_delay) {
+		if (mpeg2dec->first) {
+		    /* nothing to display yet for the very first picture */
+		    mpeg2dec->info.display_picture = NULL;
+		    mpeg2dec->first = 0;
+		} else {
+		    mpeg2dec->info.display_picture = other;
+		    if (other->nb_fields == 1)
+			mpeg2dec->info.display_picture_2nd = other + 1;
+		    mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
+		}
+	    }
+	    /* the sum is 0, 1 or 2; index 0 means nothing to discard */
+	    if (!low_delay + !mpeg2dec->convert)
+		mpeg2dec->info.discard_fbuf =
+		    mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert];
+	}
+	if (mpeg2dec->convert) {
+	    /* NOTE(review): convert_init is only written by the */
+	    /* MPEG2_CONVERT_START call below; the buf_size reads further */
+	    /* down assume that call ran in this invocation - confirm */
+	    mpeg2_convert_init_t convert_init;
+	    if (!mpeg2dec->convert_start) {
+		int y_size, uv_size;
+
+		mpeg2dec->decoder.convert_id =
+		    mpeg2_malloc (mpeg2dec->convert_id_size,
+				  MPEG2_ALLOC_CONVERT_ID);
+		mpeg2dec->convert (MPEG2_CONVERT_START,
+				   mpeg2dec->decoder.convert_id,
+				   &(mpeg2dec->sequence),
+				   mpeg2dec->convert_stride, accels,
+				   mpeg2dec->convert_arg, &convert_init);
+		mpeg2dec->convert_start = convert_init.start;
+		mpeg2dec->decoder.convert = convert_init.copy;
+
+		/* two sets of planes sized for a whole frame */
+		y_size = decoder->stride_frame * mpeg2dec->sequence.height;
+		uv_size = y_size >> (2 - mpeg2dec->decoder.chroma_format);
+		mpeg2dec->yuv_buf[0][0] =
+		    (uint8_t *) mpeg2_malloc (y_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[0][1] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[0][2] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[1][0] =
+		    (uint8_t *) mpeg2_malloc (y_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[1][1] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[1][2] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		/* third set holds only 32 lines of luma */
+		y_size = decoder->stride_frame * 32;
+		uv_size = y_size >> (2 - mpeg2dec->decoder.chroma_format);
+		mpeg2dec->yuv_buf[2][0] =
+		    (uint8_t *) mpeg2_malloc (y_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[2][1] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+		mpeg2dec->yuv_buf[2][2] =
+		    (uint8_t *) mpeg2_malloc (uv_size, MPEG2_ALLOC_YUV);
+	    }
+	    if (!mpeg2dec->custom_fbuf) {
+		/* allocate whichever of the 3 output buffers are missing */
+		while (mpeg2dec->alloc_index < 3) {
+		    mpeg2_fbuf_t * fbuf;
+
+		    fbuf = &mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf;
+		    fbuf->id = NULL;
+		    fbuf->buf[0] =
+			(uint8_t *) mpeg2_malloc (convert_init.buf_size[0],
+						  MPEG2_ALLOC_CONVERTED);
+		    fbuf->buf[1] =
+			(uint8_t *) mpeg2_malloc (convert_init.buf_size[1],
+						  MPEG2_ALLOC_CONVERTED);
+		    fbuf->buf[2] =
+			(uint8_t *) mpeg2_malloc (convert_init.buf_size[2],
+						  MPEG2_ALLOC_CONVERTED);
+		}
+		mpeg2_set_fbuf (mpeg2dec, (decoder->coding_type == B_TYPE));
+	    }
+	} else if (!mpeg2dec->custom_fbuf) {
+	    /* no conversion: decode straight into internal YUV buffers */
+	    while (mpeg2dec->alloc_index < 3) {
+		mpeg2_fbuf_t * fbuf;
+		int y_size, uv_size;
+
+		fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf);
+		fbuf->id = NULL;
+		y_size = decoder->stride_frame * mpeg2dec->sequence.height;
+		uv_size = y_size >> (2 - decoder->chroma_format);
+		fbuf->buf[0] = (uint8_t *) mpeg2_malloc (y_size,
+							 MPEG2_ALLOC_YUV);
+		fbuf->buf[1] = (uint8_t *) mpeg2_malloc (uv_size,
+							 MPEG2_ALLOC_YUV);
+		fbuf->buf[2] = (uint8_t *) mpeg2_malloc (uv_size,
+							 MPEG2_ALLOC_YUV);
+	    }
+	    mpeg2_set_fbuf (mpeg2dec, (decoder->coding_type == B_TYPE));
+	}
+    } else {
+	decoder->second_field = 1;
+	mpeg2dec->picture++;	/* second field picture */
+	*(mpeg2dec->picture) = mpeg2dec->new_picture;
+	mpeg2dec->info.current_picture_2nd = mpeg2dec->picture;
+	if (low_delay || decoder->coding_type == B_TYPE)
+	    mpeg2dec->info.display_picture_2nd = mpeg2dec->picture;
+    }
+
+    info_user_data (mpeg2dec);
+}
+
+/*
+ * Handler for the copyright extension: the payload is not parsed.
+ * Always returns 0.
+ */
+static int copyright_ext (mpeg2dec_t * mpeg2dec)
+{
+    (void) mpeg2dec;	/* intentionally unused */
+
+    return 0;
+}
+
+/*
+ * Parse a quant matrix extension.
+ *
+ * Up to four matrices may be present.  The flag for matrix m is tested
+ * in the first byte of the not-yet-consumed data, at bit (8 >> m); each
+ * matrix that is present supplies 64 values, stored through
+ * mpeg2_scan_norm into new_quantizer_matrix[m], and marks bit m of
+ * copy_matrix.  Always returns 0.
+ */
+static int quant_matrix_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * src = mpeg2dec->chunk_start;
+    int m;
+
+    for (m = 0; m < 4; m++) {
+	int k;
+
+	if (!(src[0] & (8 >> m)))
+	    continue;
+
+	/* values are not byte aligned: merge two neighbouring bytes */
+	for (k = 0; k < 64; k++)
+	    mpeg2dec->new_quantizer_matrix[m][mpeg2_scan_norm[k]] =
+		(src[k] << (m + 5)) | (src[k + 1] >> (3 - m));
+	mpeg2dec->copy_matrix |= 1 << m;
+	src += 64;
+    }
+
+    return 0;
+}
+
+/*
+ * Dispatch an extension chunk to its parser.
+ *
+ * The extension id is the high nibble of the first chunk byte.  An
+ * extension is parsed only if its bit is currently set in
+ * mpeg2dec->ext_state; the bit is cleared before the parser runs, so
+ * each extension is accepted at most once until ext_state is set again.
+ *
+ * Returns the parser's result, or 0 when the extension is ignored.
+ */
+int mpeg2_header_extension (mpeg2dec_t * mpeg2dec)
+{
+    static int (* const parser[]) (mpeg2dec_t *) = {
+	0, sequence_ext, sequence_display_ext, quant_matrix_ext,
+	copyright_ext, 0, 0, picture_display_ext, picture_coding_ext
+    };
+    const unsigned int ext = mpeg2dec->chunk_start[0] >> 4;
+    const int ext_bit = 1 << ext;
+
+    if (!(mpeg2dec->ext_state & ext_bit))
+	return 0;	/* ignore illegal extensions */
+    /*
+     * The id is 4 bits wide but the table has fewer entries and some
+     * empty slots; never call through a missing entry even if ext_state
+     * unexpectedly has such a bit set.
+     */
+    if (ext >= sizeof (parser) / sizeof (parser[0]) || !parser[ext])
+	return 0;
+    mpeg2dec->ext_state &= ~ext_bit;
+    return parser[ext] (mpeg2dec);
+}
+
+/*
+ * Account for a user data chunk: grow user_data_len by the distance
+ * from chunk_start to one byte before chunk_ptr, then move chunk_start
+ * to that position.  Always returns 0.
+ */
+int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * const tail = mpeg2dec->chunk_ptr - 1;
+
+    mpeg2dec->user_data_len += tail - mpeg2dec->chunk_start;
+    mpeg2dec->chunk_start = tail;
+
+    return 0;
+}
+
+/*
+ * Rebuild decoder->quantizer_prescale[index] from
+ * mpeg2dec->quantizer_matrix[index] for all 32 quantizer scale codes.
+ *
+ * The work is skipped when mpeg2dec->scaled[index] already equals the
+ * current q_scale_type.  A non-zero q_scale_type selects the
+ * non-linear scale table, otherwise the scale is twice the code.
+ */
+static void prescale (mpeg2dec_t * mpeg2dec, int index)
+{
+    static const int non_linear_scale [] = {
+	 0,  1,  2,  3,  4,  5,   6,   7,
+	 8, 10, 12, 14, 16, 18,  20,  22,
+	24, 28, 32, 36, 40, 44,  48,  52,
+	56, 64, 72, 80, 88, 96, 104, 112
+    };
+    mpeg2_decoder_t * const decoder = &mpeg2dec->decoder;
+    int code, pos;
+
+    if (mpeg2dec->scaled[index] == mpeg2dec->q_scale_type)
+	return;		/* table is already up to date */
+    mpeg2dec->scaled[index] = mpeg2dec->q_scale_type;
+
+    for (code = 0; code < 32; code++) {
+	int scale;
+
+	if (mpeg2dec->q_scale_type)
+	    scale = non_linear_scale[code];
+	else
+	    scale = code << 1;
+
+	for (pos = 0; pos < 64; pos++)
+	    decoder->quantizer_prescale[index][code][pos] =
+		scale * mpeg2dec->quantizer_matrix[index][pos];
+    }
+}
+
+/*
+ * Prepare for decoding the slices of the current picture.
+ *
+ * Clears the pending user data in info, advances the state to
+ * STATE_SLICE (or STATE_SLICE_1ST for the first of two field pictures),
+ * refreshes the quantizer prescale tables the coding type needs, and
+ * hands the destination and reference buffers to mpeg2_init_fbuf().
+ * When nb_decode_slices is zero the picture is only flagged
+ * PIC_FLAG_SKIP.  Always returns STATE_INTERNAL_NORETURN.
+ */
+mpeg2_state_t mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_decoder_t * decoder = &mpeg2dec->decoder;
+
+    mpeg2dec->info.user_data = NULL;
+    mpeg2dec->info.user_data_len = 0;
+
+    if (mpeg2dec->picture->nb_fields > 1 ||
+	mpeg2dec->state == STATE_PICTURE_2ND)
+	mpeg2dec->state = STATE_SLICE;
+    else
+	mpeg2dec->state = STATE_SLICE_1ST;
+
+    if (decoder->coding_type != D_TYPE) {
+	prescale (mpeg2dec, 0);
+	if (decoder->chroma_quantizer[0] == decoder->quantizer_prescale[2])
+	    prescale (mpeg2dec, 2);
+	if (decoder->coding_type != I_TYPE) {
+	    prescale (mpeg2dec, 1);
+	    if (decoder->chroma_quantizer[1] == decoder->quantizer_prescale[3])
+		prescale (mpeg2dec, 3);
+	}
+    }
+
+    if (!mpeg2dec->nb_decode_slices) {
+	mpeg2dec->picture->flags |= PIC_FLAG_SKIP;
+    } else if (mpeg2dec->convert_start) {
+	/* converting: decode into the private yuv buffers */
+	mpeg2dec->convert_start (decoder->convert_id, mpeg2dec->fbuf[0],
+				 mpeg2dec->picture, mpeg2dec->info.gop);
+
+	if (decoder->coding_type == B_TYPE) {
+	    mpeg2_init_fbuf (decoder, mpeg2dec->yuv_buf[2],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index]);
+	} else {
+	    mpeg2_init_fbuf (decoder,
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index]);
+	    /* swap the two full-size buffers once the frame is complete */
+	    if (mpeg2dec->state == STATE_SLICE)
+		mpeg2dec->yuv_index ^= 1;
+	}
+    } else {
+	/* B pictures reference fbuf[2] and fbuf[1], others fbuf[1]/[0] */
+	const int is_b = (decoder->coding_type == B_TYPE);
+	const int first_ref = is_b ? 2 : 1;
+	const int second_ref = is_b ? 1 : 0;
+
+	mpeg2_init_fbuf (decoder, mpeg2dec->fbuf[0]->buf,
+			 mpeg2dec->fbuf[first_ref]->buf,
+			 mpeg2dec->fbuf[second_ref]->buf);
+    }
+
+    mpeg2dec->action = NULL;
+    return STATE_INTERNAL_NORETURN;
+}
+
+/*
+ * Action installed by mpeg2_header_end(): forget the current sequence
+ * and GOP, reinitialise the header state and continue with
+ * mpeg2_seek_header(), whose result is returned.
+ */
+static mpeg2_state_t seek_sequence (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2_reset_info (&mpeg2dec->info);
+    mpeg2dec->info.gop = NULL;
+    mpeg2dec->info.sequence = NULL;
+    mpeg2_header_state_init (mpeg2dec);
+
+    mpeg2dec->action = mpeg2_seek_header;
+    return mpeg2dec->action (mpeg2dec);
+}
+
+/*
+ * Handle the end of a sequence: report the picture and frame buffers
+ * that are still pending through mpeg2dec->info, then arrange for
+ * seek_sequence() to run next.  Always returns STATE_END.
+ */
+mpeg2_state_t mpeg2_header_end (mpeg2dec_t * mpeg2dec)
+{
+    const int b_type = (mpeg2dec->decoder.coding_type == B_TYPE);
+    const int upper_pair = (mpeg2dec->picture >= mpeg2dec->pictures + 2);
+    mpeg2_picture_t * picture = mpeg2dec->pictures;
+
+    /* both operands are 0 or 1, so "differ" is the exclusive or */
+    if (upper_pair != b_type)
+	picture += 2;
+
+    mpeg2_reset_info (&mpeg2dec->info);
+    if (mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY) {
+	if (!mpeg2dec->convert)
+	    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type];
+    } else {
+	mpeg2dec->info.display_picture = picture;
+	if (picture->nb_fields == 1)
+	    mpeg2dec->info.display_picture_2nd = picture + 1;
+	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[b_type];
+	if (!mpeg2dec->convert)
+	    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type + 1];
+    }
+
+    mpeg2dec->action = seek_sequence;
+    return STATE_END;
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/idct.c b/src/video_dec/libmpeg2new/libmpeg2/idct.c
new file mode 100644
index 000000000..8b982bb33
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/idct.c
@@ -0,0 +1,287 @@
+/*
+ * idct.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+
+/* fixed-point cosine weights shared by the row and column passes */
+#define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
+#define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
+#define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
+#define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
+#define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
+#define W7 565  /* 2048 * sqrt (2) * cos (7 * pi / 16) */
+
+/* idct main entry point  */
+/* both pointers are assigned by mpeg2_idct_init () */
+void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
+void (* mpeg2_idct_add) (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+
+/*
+ * In legal streams, the IDCT output should be between -384 and +384.
+ * In corrupted streams, it is possible to force the IDCT output to go
+ * to +-3826 - this is the worst case for a column IDCT where the
+ * column inputs are 16-bit values.
+ */
+/* saturation lookup table; CLIP(i) accepts i from -3840 to 3840 + 255 */
+uint8_t mpeg2_clip[3840 * 2 + 256];
+#define CLIP(i) ((mpeg2_clip + 3840)[i])
+
+/*
+ * BUTTERFLY computes t0 = W0 * d0 + W1 * d1 and t1 = W0 * d1 - W1 * d0.
+ * The disabled variant spells that out directly; the active variant is
+ * the same result rearranged around the shared product W0 * (d0 + d1).
+ */
+#if 0
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    t0 = W0 * d0 + W1 * d1;		\
+    t1 = W0 * d1 - W1 * d0;		\
+} while (0)
+#else
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    int tmp = W0 * (d0 + d1);		\
+    t0 = tmp + (W1 - W0) * d1;		\
+    t1 = tmp - (W1 + W0) * d0;		\
+} while (0)
+#endif
+
+/*
+ * In-place one-dimensional inverse DCT over one row of eight
+ * coefficients (block[0] .. block[7]).
+ *
+ * When only block[0] is non-zero every output equals block[0] >> 1, so
+ * the butterflies are skipped.
+ *
+ * The row is accessed only through int16_t lvalues: reading or writing
+ * it through int32_t pointers would break the strict aliasing rule and
+ * assume 4-byte alignment.  Scaling by 2048 is written as a
+ * multiplication because left-shifting a negative value is undefined.
+ */
+static inline void idct_row (int16_t * const block)
+{
+    int d0, d1, d2, d3;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+    int t0, t1, t2, t3;
+
+    /* shortcut */
+    if (likely (!(block[1] | block[2] | block[3] | block[4] |
+		  block[5] | block[6] | block[7]))) {
+	const int16_t dc = block[0] >> 1;
+	int k;
+
+	for (k = 0; k < 8; k++)
+	    block[k] = dc;
+	return;
+    }
+
+    /* even part; the added 2048 rounds the final >> 12 */
+    d0 = block[0] * 2048 + 2048;
+    d1 = block[1];
+    d2 = block[2] * 2048;
+    d3 = block[3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);
+    a0 = t0 + t2;
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    /* odd part */
+    d0 = block[4];
+    d1 = block[5];
+    d2 = block[6];
+    d3 = block[7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);
+    b0 = t0 + t2;
+    b3 = t1 + t3;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
+
+    block[0] = (a0 + b0) >> 12;
+    block[1] = (a1 + b1) >> 12;
+    block[2] = (a2 + b2) >> 12;
+    block[3] = (a3 + b3) >> 12;
+    block[4] = (a3 - b3) >> 12;
+    block[5] = (a2 - b2) >> 12;
+    block[6] = (a1 - b1) >> 12;
+    block[7] = (a0 - b0) >> 12;
+}
+
+/*
+ * In-place one-dimensional inverse DCT over one column of an 8x8 block;
+ * block points at the top element and rows are 8 entries apart.
+ *
+ * Scaling by 2048 is written as a multiplication because left-shifting
+ * a negative value is undefined behaviour.
+ */
+static inline void idct_col (int16_t * const block)
+{
+    int d0, d1, d2, d3;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+    int t0, t1, t2, t3;
+
+    /* even part; the added 65536 rounds the final >> 17 */
+    d0 = block[8*0] * 2048 + 65536;
+    d1 = block[8*1];
+    d2 = block[8*2] * 2048;
+    d3 = block[8*3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);
+    a0 = t0 + t2;
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    /* odd part */
+    d0 = block[8*4];
+    d1 = block[8*5];
+    d2 = block[8*6];
+    d3 = block[8*7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);
+    b0 = t0 + t2;
+    b3 = t1 + t3;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
+
+    block[8*0] = (a0 + b0) >> 17;
+    block[8*1] = (a1 + b1) >> 17;
+    block[8*2] = (a2 + b2) >> 17;
+    block[8*3] = (a3 + b3) >> 17;
+    block[8*4] = (a3 - b3) >> 17;
+    block[8*5] = (a2 - b2) >> 17;
+    block[8*6] = (a1 - b1) >> 17;
+    block[8*7] = (a0 - b0) >> 17;
+}
+
+/*
+ * Inverse-transform an 8x8 coefficient block in place, write the
+ * result clipped through CLIP() to dest (rows are stride bytes apart),
+ * and leave the block zeroed.
+ *
+ * The block is cleared through int16_t stores; clearing it through
+ * int32_t pointers would violate the strict aliasing rule.
+ */
+static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
+			       const int stride)
+{
+    int i, k;
+
+    for (i = 0; i < 8; i++)
+	idct_row (block + 8 * i);
+    for (i = 0; i < 8; i++)
+	idct_col (block + i);
+
+    for (i = 0; i < 8; i++) {
+	for (k = 0; k < 8; k++) {
+	    dest[k] = CLIP (block[k]);
+	    block[k] = 0;
+	}
+	dest += stride;
+	block += 8;
+    }
+}
+
+/*
+ * Inverse-transform an 8x8 coefficient block and add it to the pixels
+ * at dest (rows are stride bytes apart), clipping through CLIP().
+ *
+ * When last == 129 and bits 4-6 of block[0] are not 100b, only a
+ * rounded DC value is added and just block[0] and block[63] are
+ * cleared; otherwise the full transform runs and the whole block is
+ * zeroed.
+ *
+ * The block is cleared through int16_t stores; clearing it through
+ * int32_t pointers would violate the strict aliasing rule.
+ */
+static void mpeg2_idct_add_c (const int last, int16_t * block,
+			      uint8_t * dest, const int stride)
+{
+    int i, k;
+
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
+	for (i = 0; i < 8; i++)
+	    idct_row (block + 8 * i);
+	for (i = 0; i < 8; i++)
+	    idct_col (block + i);
+
+	for (i = 0; i < 8; i++) {
+	    for (k = 0; k < 8; k++) {
+		dest[k] = CLIP (block[k] + dest[k]);
+		block[k] = 0;
+	    }
+	    dest += stride;
+	    block += 8;
+	}
+    } else {
+	int DC;
+
+	DC = (block[0] + 64) >> 7;
+	block[0] = block[63] = 0;
+	for (i = 0; i < 8; i++) {
+	    for (k = 0; k < 8; k++)
+		dest[k] = CLIP (DC + dest[k]);
+	    dest += stride;
+	}
+    }
+}
+
+/*
+ * Select the IDCT implementation for the given acceleration flags by
+ * assigning mpeg2_idct_copy and mpeg2_idct_add, then run the matching
+ * one-time setup.
+ *
+ * The architecture-specific branches are compiled in only when the
+ * corresponding ARCH_* macro is defined; each ends in a dangling
+ * "else" so that the first matching branch wins and the plain C
+ * implementation is the final fallback.
+ *
+ * NOTE(review): the C fallback rewrites the global scan tables in
+ * place, so calling this function a second time would permute them
+ * again - confirm that callers invoke it only once.
+ */
+void mpeg2_idct_init (uint32_t accel)
+{
+#ifdef ARCH_X86
+    if (accel & MPEG2_ACCEL_X86_MMXEXT) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
+	mpeg2_idct_add = mpeg2_idct_add_mmxext;
+	mpeg2_idct_mmx_init ();
+    } else if (accel & MPEG2_ACCEL_X86_MMX) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mmx;
+	mpeg2_idct_add = mpeg2_idct_add_mmx;
+	mpeg2_idct_mmx_init ();
+    } else
+#endif
+#ifdef ARCH_PPC
+    if (accel & MPEG2_ACCEL_PPC_ALTIVEC) {
+	mpeg2_idct_copy = mpeg2_idct_copy_altivec;
+	mpeg2_idct_add = mpeg2_idct_add_altivec;
+	mpeg2_idct_altivec_init ();
+    } else
+#endif
+#ifdef ARCH_ALPHA
+    if (accel & MPEG2_ACCEL_ALPHA_MVI) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mvi;
+	mpeg2_idct_add = mpeg2_idct_add_mvi;
+	mpeg2_idct_alpha_init ();
+    } else if (accel & MPEG2_ACCEL_ALPHA) {
+	int i;
+
+	mpeg2_idct_copy = mpeg2_idct_copy_alpha;
+	mpeg2_idct_add = mpeg2_idct_add_alpha;
+	mpeg2_idct_alpha_init ();
+	/* this variant uses CLIP (), so the clip table is filled here */
+	for (i = -3840; i < 3840 + 256; i++)
+	    CLIP(i) = (i < 0) ? 0 : ((i > 255) ? 255 : i);
+    } else
+#endif
+    {
+	extern uint8_t mpeg2_scan_norm[64];
+	extern uint8_t mpeg2_scan_alt[64];
+	int i, j;
+
+	mpeg2_idct_copy = mpeg2_idct_copy_c;
+	mpeg2_idct_add = mpeg2_idct_add_c;
+	/* saturating lookup: below 0 maps to 0, above 255 maps to 255 */
+	for (i = -3840; i < 3840 + 256; i++)
+	    CLIP(i) = (i < 0) ? 0 : ((i > 255) ? 255 : i);
+	/* reorder the bits of every scan table entry, in place */
+	for (i = 0; i < 64; i++) {
+	    j = mpeg2_scan_norm[i];
+	    mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
+	    j = mpeg2_scan_alt[i];
+	    mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
+	}
+    }
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/idct_alpha.c b/src/video_dec/libmpeg2new/libmpeg2/idct_alpha.c
new file mode 100644
index 000000000..1d8fd08ee
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/idct_alpha.c
@@ -0,0 +1,379 @@
+/*
+ * idct_alpha.c
+ * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org>
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_ALPHA
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include <xine/attributes.h>
+#include "mpeg2_internal.h"
+#include "alpha_asm.h"
+
+/* fixed-point cosine weights shared by the row and column passes */
+#define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
+#define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
+#define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
+#define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
+#define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
+#define W7 565  /* 2048 * sqrt (2) * cos (7 * pi / 16) */
+
+/* saturation lookup table defined elsewhere; CLIP(i) indexes it with a */
+/* bias of 3840 so that negative i is allowed */
+extern uint8_t mpeg2_clip[3840 * 2 + 256];
+#define CLIP(i) ((mpeg2_clip + 3840)[i])
+
+/*
+ * BUTTERFLY computes t0 = W0 * d0 + W1 * d1 and t1 = W0 * d1 - W1 * d0.
+ * The disabled variant spells that out directly; the active variant is
+ * the same result rearranged around the shared product W0 * (d0 + d1).
+ */
+#if 0
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    t0 = W0 * d0 + W1 * d1;			\
+    t1 = W0 * d1 - W1 * d0;			\
+} while (0)
+#else
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    int_fast32_t tmp = W0 * (d0 + d1);	\
+    t0 = tmp + (W1 - W0) * d1;		\
+    t1 = tmp - (W1 + W0) * d0;		\
+} while (0)
+#endif
+
+/*
+ * In-place one-dimensional inverse DCT over one row of eight
+ * coefficients, loading the row as two 64-bit words.
+ *
+ * When only the first coefficient is non-zero every output equals that
+ * coefficient shifted right by one, so the butterflies are skipped.
+ */
+static void inline idct_row (int16_t * const block)
+{
+    uint64_t l, r;
+    int_fast32_t d0, d1, d2, d3;
+    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int_fast32_t t0, t1, t2, t3;
+
+    l = ldq (block);
+    r = ldq (block + 4);
+
+    /* shortcut */
+    if (likely (!((l & ~0xffffUL) | r))) {
+	/*
+	 * l holds the first coefficient zero-extended to 64 bits here, so
+	 * a plain l >> 1 would shift a zero into its sign bit and turn a
+	 * negative value into a large positive one.  Sign-extend first so
+	 * the result matches the generic C row pass (block[0] >> 1).
+	 */
+	uint64_t tmp = (uint16_t) (sextw (l) >> 1);
+	tmp |= tmp << 16;
+	tmp |= tmp << 32;
+	((int32_t *)block)[0] = tmp;
+	((int32_t *)block)[1] = tmp;
+	((int32_t *)block)[2] = tmp;
+	((int32_t *)block)[3] = tmp;
+	return;
+    }
+
+    /* even part; the added 2048 rounds the final >> 12 */
+    d0 = (sextw (l) << 11) + 2048;
+    d1 = sextw (extwl (l, 2));
+    d2 = sextw (extwl (l, 4)) << 11;
+    d3 = sextw (extwl (l, 6));
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);
+    a0 = t0 + t2;
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    /* odd part */
+    d0 = sextw (r);
+    d1 = sextw (extwl (r, 2));
+    d2 = sextw (extwl (r, 4));
+    d3 = sextw (extwl (r, 6));
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);
+    b0 = t0 + t2;
+    b3 = t1 + t3;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
+
+    block[0] = (a0 + b0) >> 12;
+    block[1] = (a1 + b1) >> 12;
+    block[2] = (a2 + b2) >> 12;
+    block[3] = (a3 + b3) >> 12;
+    block[4] = (a3 - b3) >> 12;
+    block[5] = (a2 - b2) >> 12;
+    block[6] = (a1 - b1) >> 12;
+    block[7] = (a0 - b0) >> 12;
+}
+
+/*
+ * In-place one-dimensional inverse DCT over one column of an 8x8 block;
+ * block points at the top element and rows are 8 entries apart.
+ *
+ * Scaling by 2048 is written as a multiplication because left-shifting
+ * a negative value is undefined behaviour.
+ */
+static void inline idct_col (int16_t * const block)
+{
+    int_fast32_t d0, d1, d2, d3;
+    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int_fast32_t t0, t1, t2, t3;
+
+    /* even part; the added 65536 rounds the final >> 17 */
+    d0 = block[8*0] * 2048 + 65536;
+    d1 = block[8*1];
+    d2 = block[8*2] * 2048;
+    d3 = block[8*3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);
+    a0 = t0 + t2;
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    /* odd part */
+    d0 = block[8*4];
+    d1 = block[8*5];
+    d2 = block[8*6];
+    d3 = block[8*7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);
+    b0 = t0 + t2;
+    b3 = t1 + t3;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) >> 8) * 181;
+    b2 = ((t0 - t1) >> 8) * 181;
+
+    block[8*0] = (a0 + b0) >> 17;
+    block[8*1] = (a1 + b1) >> 17;
+    block[8*2] = (a2 + b2) >> 17;
+    block[8*3] = (a3 + b3) >> 17;
+    block[8*4] = (a3 - b3) >> 17;
+    block[8*5] = (a2 - b2) >> 17;
+    block[8*6] = (a1 - b1) >> 17;
+    block[8*7] = (a0 - b0) >> 17;
+}
+
+/*
+ * Inverse-transform an 8x8 coefficient block in place, store the result
+ * to dest (rows are stride bytes apart) and zero the block.
+ *
+ * Each row is handled as two 64-bit words of four coefficients.
+ * NOTE(review): maxsw4/minsw4/pkwb come from alpha_asm.h; they are
+ * presumably per-16-bit-lane signed max/min and a word-to-byte pack,
+ * i.e. a clamp to 0..255 - confirm against that header.
+ */
+void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, const int stride)
+{
+    uint64_t clampmask;
+    int i;
+
+    for (i = 0; i < 8; i++)
+	idct_row (block + 8 * i);
+
+    for (i = 0; i < 8; i++)
+	idct_col (block + i);
+
+    clampmask = zap (-1, 0xaa);	/* 0x00ff00ff00ff00ff */
+    /* i is 8 again here and serves as the row counter */
+    do {
+	uint64_t shorts0, shorts1;
+
+	shorts0 = ldq (block);
+	shorts0 = maxsw4 (shorts0, 0);
+	shorts0 = minsw4 (shorts0, clampmask);
+	stl (pkwb (shorts0), dest);
+
+	shorts1 = ldq (block + 4);
+	shorts1 = maxsw4 (shorts1, 0);
+	shorts1 = minsw4 (shorts1, clampmask);
+	stl (pkwb (shorts1), dest + 4);
+
+	/* clear the row that was just consumed */
+	stq (0, block);
+	stq (0, block + 4);
+
+	dest += stride;
+	block += 8;
+    } while (--i);
+}
+
+/*
+ * Inverse-transform an 8x8 coefficient block and add it to the pixels
+ * at dest (rows are stride bytes apart).
+ *
+ * When last == 129 and bits 4-6 of block[0] are not 100b, only a
+ * rounded DC value is added to (or subtracted from) all 64 pixels and
+ * just block[0] and block[63] are cleared; otherwise the full transform
+ * runs and the whole block is zeroed.
+ *
+ * NOTE(review): the helper semantics (unpkbw, pkwb, maxsw4, minsw4,
+ * minub8, BYTE_VEC) are defined in alpha_asm.h and are only presumed
+ * from their use here - confirm against that header.
+ */
+void mpeg2_idct_add_mvi (const int last, int16_t * block,
+			 uint8_t * dest, const int stride)
+{
+    uint64_t clampmask;
+    uint64_t signmask;
+    int i;
+
+    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
+	for (i = 0; i < 8; i++)
+	    idct_row (block + 8 * i);
+	for (i = 0; i < 8; i++)
+	    idct_col (block + i);
+	clampmask = zap (-1, 0xaa);	/* 0x00ff00ff00ff00ff */
+	signmask = zap (-1, 0x33);
+	signmask ^= signmask >> 1;	/* 0x8000800080008000 */
+
+	/* i is 8 again here and serves as the row counter */
+	do {
+	    uint64_t shorts0, pix0, signs0;
+	    uint64_t shorts1, pix1, signs1;
+
+	    shorts0 = ldq (block);
+	    shorts1 = ldq (block + 4);
+
+	    pix0 = unpkbw (ldl (dest));
+	    /* signed subword add (MMX paddw).  */
+	    /* the sign bits are masked off before the 64-bit add and */
+	    /* restored with xor afterwards */
+	    signs0 = shorts0 & signmask;
+	    shorts0 &= ~signmask;
+	    shorts0 += pix0;
+	    shorts0 ^= signs0;
+	    /* clamp. */
+	    shorts0 = maxsw4 (shorts0, 0);
+	    shorts0 = minsw4 (shorts0, clampmask);	
+
+	    /* next 4.  */
+	    pix1 = unpkbw (ldl (dest + 4));
+	    signs1 = shorts1 & signmask;
+	    shorts1 &= ~signmask;
+	    shorts1 += pix1;
+	    shorts1 ^= signs1;
+	    shorts1 = maxsw4 (shorts1, 0);
+	    shorts1 = minsw4 (shorts1, clampmask);
+
+	    stl (pkwb (shorts0), dest);
+	    stl (pkwb (shorts1), dest + 4);
+	    /* clear the row that was just consumed */
+	    stq (0, block);
+	    stq (0, block + 4);
+
+	    dest += stride;
+	    block += 8;
+	} while (--i);
+    } else {
+	int DC;
+	uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
+	uint64_t DCs;
+
+	DC = (block[0] + 64) >> 7;
+	block[0] = block[63] = 0;
+
+	/* load all eight rows, eight pixels each */
+	p0 = ldq (dest + 0 * stride);
+	p1 = ldq (dest + 1 * stride);
+	p2 = ldq (dest + 2 * stride);
+	p3 = ldq (dest + 3 * stride);
+	p4 = ldq (dest + 4 * stride);
+	p5 = ldq (dest + 5 * stride);
+	p6 = ldq (dest + 6 * stride);
+	p7 = ldq (dest + 7 * stride);
+
+	if (DC > 0) {
+	    /* add, limiting each addend to the headroom left in ~p */
+	    DCs = BYTE_VEC (likely (DC <= 255) ? DC : 255);
+	    p0 += minub8 (DCs, ~p0);
+	    p1 += minub8 (DCs, ~p1);
+	    p2 += minub8 (DCs, ~p2);
+	    p3 += minub8 (DCs, ~p3);
+	    p4 += minub8 (DCs, ~p4);
+	    p5 += minub8 (DCs, ~p5);
+	    p6 += minub8 (DCs, ~p6);
+	    p7 += minub8 (DCs, ~p7);
+	} else {
+	    /* subtract, limiting each subtrahend to the pixel value */
+	    DCs = BYTE_VEC (likely (-DC <= 255) ? -DC : 255);
+	    p0 -= minub8 (DCs, p0);
+	    p1 -= minub8 (DCs, p1);
+	    p2 -= minub8 (DCs, p2);
+	    p3 -= minub8 (DCs, p3);
+	    p4 -= minub8 (DCs, p4);
+	    p5 -= minub8 (DCs, p5);
+	    p6 -= minub8 (DCs, p6);
+	    p7 -= minub8 (DCs, p7);
+	}
+
+	stq (p0, dest + 0 * stride);
+	stq (p1, dest + 1 * stride);
+	stq (p2, dest + 2 * stride);
+	stq (p3, dest + 3 * stride);
+	stq (p4, dest + 4 * stride);
+	stq (p5, dest + 5 * stride);
+	stq (p6, dest + 6 * stride);
+	stq (p7, dest + 7 * stride);
+    }
+}
+
+/*
+ * Inverse-transform an 8x8 coefficient block in place, write the result
+ * clipped through CLIP() to dest (rows are stride bytes apart), and
+ * zero each row of the block with two 64-bit stores.
+ */
+void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, const int stride)
+{
+    int row, col;
+
+    for (row = 0; row < 8; row++)
+	idct_row (block + 8 * row);
+    for (col = 0; col < 8; col++)
+	idct_col (block + col);
+
+    for (row = 0; row < 8; row++) {
+	for (col = 0; col < 8; col++)
+	    dest[col] = CLIP (block[col]);
+
+	stq(0, block);
+	stq(0, block + 4);
+
+	block += 8;
+	dest += stride;
+    }
+}
+
+/*
+ * Inverse-transform an 8x8 coefficient block and add it to the pixels
+ * at dest (rows are stride bytes apart), clipping through CLIP().
+ *
+ * When last == 129 and bits 4-6 of block[0] are not 100b, only a
+ * rounded DC value is added and just block[0] and block[63] are
+ * cleared; otherwise the full transform runs and the block is zeroed.
+ */
+void mpeg2_idct_add_alpha (const int last, int16_t * block,
+			   uint8_t * dest, const int stride)
+{
+    int row, col;
+
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4)) {
+	/* DC-only block */
+	const int DC = (block[0] + 64) >> 7;
+
+	block[63] = 0;
+	block[0] = 0;
+	for (row = 0; row < 8; row++) {
+	    for (col = 0; col < 8; col++)
+		dest[col] = CLIP (DC + dest[col]);
+	    dest += stride;
+	}
+	return;
+    }
+
+    for (row = 0; row < 8; row++)
+	idct_row (block + 8 * row);
+    for (col = 0; col < 8; col++)
+	idct_col (block + col);
+
+    for (row = 0; row < 8; row++) {
+	for (col = 0; col < 8; col++)
+	    dest[col] = CLIP (block[col] + dest[col]);
+
+	stq(0, block);
+	stq(0, block + 4);
+
+	block += 8;
+	dest += stride;
+    }
+}
+
+/*
+ * Rewrite every entry of the global normal and alternate scan tables in
+ * place, moving bits 1, 2, 4 and 5 of each entry down by one position
+ * and bits 0 and 3 up by two.
+ */
+void mpeg2_idct_alpha_init (void)
+{
+    extern uint8_t mpeg2_scan_norm[64];
+    extern uint8_t mpeg2_scan_alt[64];
+    int idx;
+
+    for (idx = 0; idx < 64; idx++) {
+	const int norm = mpeg2_scan_norm[idx];
+	const int alt = mpeg2_scan_alt[idx];
+
+	mpeg2_scan_norm[idx] = ((norm & 0x36) >> 1) | ((norm & 0x09) << 2);
+	mpeg2_scan_alt[idx] = ((alt & 0x36) >> 1) | ((alt & 0x09) << 2);
+    }
+}
+
+#endif /* ARCH_ALPHA */
diff --git a/src/video_dec/libmpeg2new/libmpeg2/idct_altivec.c b/src/video_dec/libmpeg2new/libmpeg2/idct_altivec.c
new file mode 100644
index 000000000..f15bca165
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/idct_altivec.c
@@ -0,0 +1,288 @@
+/*
+ * idct_altivec.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_PPC
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include <xine/attributes.h>
+#include "mpeg2_internal.h"
+
+typedef vector signed char vector_s8_t;
+typedef vector unsigned char vector_u8_t;	/* packed pixels, permute masks */
+typedef vector signed short vector_s16_t;	/* 8 x int16 coefficient vectors */
+typedef vector unsigned short vector_u16_t;	/* shift counts for vec_sra */
+typedef vector signed int vector_s32_t;		/* used to splat a 32-bit lane */
+typedef vector unsigned int vector_u32_t;	/* 32-bit view for vec_ste stores */
+
+#if defined(HAVE_ALTIVEC_H) && (__GNUC__ * 100 + __GNUC_MINOR__ < 303)
+/* work around gcc <3.3 vec_mergel bug: route vec_mergel through vec_perm */
+static inline vector_s16_t my_vec_mergel (vector_s16_t const A,
+					  vector_s16_t const B)
+{
+    static const vector_u8_t mergel = {	/* byte selectors handed to vec_perm */
+	0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b,
+	0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f
+    };
+    return vec_perm (A, B, mergel);
+}
+#undef vec_mergel
+#define vec_mergel my_vec_mergel	/* later vec_mergel uses hit the workaround */
+#endif
+
+#ifdef HAVE_ALTIVEC_H	/* gnu: brace-enclosed initializer list */
+#define VEC_S16(a,b,c,d,e,f,g,h) {a, b, c, d, e, f, g, h}
+#else			/* apple: parenthesized vector literal with a cast */
+#define VEC_S16(a,b,c,d,e,f,g,h) (vector_s16_t) (a, b, c, d, e, f, g, h)
+#endif
+
+static const vector_s16_t constants ATTR_ALIGN(16) =	/* c4 a0 a1 a2 mc4 ma2 | bias */
+    VEC_S16 (23170, 13573, 6518, 21895, -23170, -21895, 32, 31);
+static const vector_s16_t constants_1 ATTR_ALIGN(16) =	/* scales block[0]+-block[4] */
+    VEC_S16 (16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725);
+static const vector_s16_t constants_2 ATTR_ALIGN(16) =	/* scales the block[1]/block[7] terms */
+    VEC_S16 (16069, 22289, 20995, 18895, 16069, 18895, 20995, 22289);
+static const vector_s16_t constants_3 ATTR_ALIGN(16) =	/* scales the block[2]/block[6] terms */
+    VEC_S16 (21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692);
+static const vector_s16_t constants_4 ATTR_ALIGN(16) =	/* scales block[3] and block[5] */
+    VEC_S16 (13623, 18895, 17799, 16019, 13623, 16019, 17799, 18895);
+
+#define IDCT								\
+    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;		\
+    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;		\
+    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;			\
+    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;			\
+    vector_u16_t shift;							\
+    /* splat the scalar multipliers out of `constants` */		\
+    c4 = vec_splat (constants, 0);					\
+    a0 = vec_splat (constants, 1);					\
+    a1 = vec_splat (constants, 2);					\
+    a2 = vec_splat (constants, 3);					\
+    mc4 = vec_splat (constants, 4);					\
+    ma2 = vec_splat (constants, 5);					\
+    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants, 3);	\
+									\
+    zero = vec_splat_s16 (0);						\
+    /* pass 1: scale by constants_1..4, then add/subtract pairs */	\
+    vx0 = vec_adds (block[0], block[4]);				\
+    vx4 = vec_subs (block[0], block[4]);				\
+    t5 = vec_mradds (vx0, constants_1, zero);				\
+    t0 = vec_mradds (vx4, constants_1, zero);				\
+									\
+    vx1 = vec_mradds (a1, block[7], block[1]);				\
+    vx7 = vec_mradds (a1, block[1], vec_subs (zero, block[7]));		\
+    t1 = vec_mradds (vx1, constants_2, zero);				\
+    t8 = vec_mradds (vx7, constants_2, zero);				\
+									\
+    vx2 = vec_mradds (a0, block[6], block[2]);				\
+    vx6 = vec_mradds (a0, block[2], vec_subs (zero, block[6]));		\
+    t2 = vec_mradds (vx2, constants_3, zero);				\
+    t4 = vec_mradds (vx6, constants_3, zero);				\
+									\
+    vx3 = vec_mradds (block[3], constants_4, zero);			\
+    vx5 = vec_mradds (block[5], constants_4, zero);			\
+    t7 = vec_mradds (a2, vx5, vx3);					\
+    t3 = vec_mradds (ma2, vx3, vx5);					\
+									\
+    t6 = vec_adds (t8, t3);						\
+    t3 = vec_subs (t8, t3);						\
+    t8 = vec_subs (t1, t7);						\
+    t1 = vec_adds (t1, t7);						\
+    t6 = vec_mradds (a0, t6, t6);	/* a0+1 == 2*c4 */		\
+    t1 = vec_mradds (a0, t1, t1);	/* a0+1 == 2*c4 */		\
+									\
+    t7 = vec_adds (t5, t2);						\
+    t2 = vec_subs (t5, t2);						\
+    t5 = vec_adds (t0, t4);						\
+    t0 = vec_subs (t0, t4);						\
+    t4 = vec_subs (t8, t3);						\
+    t3 = vec_adds (t8, t3);						\
+									\
+    vy0 = vec_adds (t7, t1);						\
+    vy7 = vec_subs (t7, t1);						\
+    vy1 = vec_adds (t5, t3);						\
+    vy6 = vec_subs (t5, t3);						\
+    vy2 = vec_adds (t0, t4);						\
+    vy5 = vec_subs (t0, t4);						\
+    vy3 = vec_adds (t2, t6);						\
+    vy4 = vec_subs (t2, t6);						\
+    /* three mergeh/mergel rounds (presumably an 8x8 transpose) */	\
+    vx0 = vec_mergeh (vy0, vy4);					\
+    vx1 = vec_mergel (vy0, vy4);					\
+    vx2 = vec_mergeh (vy1, vy5);					\
+    vx3 = vec_mergel (vy1, vy5);					\
+    vx4 = vec_mergeh (vy2, vy6);					\
+    vx5 = vec_mergel (vy2, vy6);					\
+    vx6 = vec_mergeh (vy3, vy7);					\
+    vx7 = vec_mergel (vy3, vy7);					\
+									\
+    vy0 = vec_mergeh (vx0, vx4);					\
+    vy1 = vec_mergel (vx0, vx4);					\
+    vy2 = vec_mergeh (vx1, vx5);					\
+    vy3 = vec_mergel (vx1, vx5);					\
+    vy4 = vec_mergeh (vx2, vx6);					\
+    vy5 = vec_mergel (vx2, vx6);					\
+    vy6 = vec_mergeh (vx3, vx7);					\
+    vy7 = vec_mergel (vx3, vx7);					\
+									\
+    vx0 = vec_mergeh (vy0, vy4);					\
+    vx1 = vec_mergel (vy0, vy4);					\
+    vx2 = vec_mergeh (vy1, vy5);					\
+    vx3 = vec_mergel (vy1, vy5);					\
+    vx4 = vec_mergeh (vy2, vy6);					\
+    vx5 = vec_mergel (vy2, vy6);					\
+    vx6 = vec_mergeh (vy3, vy7);					\
+    vx7 = vec_mergel (vy3, vy7);					\
+    /* pass 2 on the reordered data; bias is added to vx0 first */	\
+    vx0 = vec_adds (vx0, bias);						\
+    t5 = vec_adds (vx0, vx4);						\
+    t0 = vec_subs (vx0, vx4);						\
+									\
+    t1 = vec_mradds (a1, vx7, vx1);					\
+    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));			\
+									\
+    t2 = vec_mradds (a0, vx6, vx2);					\
+    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));			\
+									\
+    t7 = vec_mradds (a2, vx5, vx3);					\
+    t3 = vec_mradds (ma2, vx3, vx5);					\
+									\
+    t6 = vec_adds (t8, t3);						\
+    t3 = vec_subs (t8, t3);						\
+    t8 = vec_subs (t1, t7);						\
+    t1 = vec_adds (t1, t7);						\
+									\
+    t7 = vec_adds (t5, t2);						\
+    t2 = vec_subs (t5, t2);						\
+    t5 = vec_adds (t0, t4);						\
+    t0 = vec_subs (t0, t4);						\
+    t4 = vec_subs (t8, t3);						\
+    t3 = vec_adds (t8, t3);						\
+									\
+    vy0 = vec_adds (t7, t1);						\
+    vy7 = vec_subs (t7, t1);						\
+    vy1 = vec_mradds (c4, t3, t5);					\
+    vy6 = vec_mradds (mc4, t3, t5);					\
+    vy2 = vec_mradds (c4, t4, t0);					\
+    vy5 = vec_mradds (mc4, t4, t0);					\
+    vy3 = vec_adds (t2, t6);						\
+    vy4 = vec_subs (t2, t6);						\
+    /* results: vx0..vx7 = vy0..vy7 shifted right by 6 (vec_sra) */	\
+    shift = vec_splat_u16 (6);						\
+    vx0 = vec_sra (vy0, shift);						\
+    vx1 = vec_sra (vy1, shift);						\
+    vx2 = vec_sra (vy2, shift);						\
+    vx3 = vec_sra (vy3, shift);						\
+    vx4 = vec_sra (vy4, shift);						\
+    vx5 = vec_sra (vy5, shift);						\
+    vx6 = vec_sra (vy6, shift);						\
+    vx7 = vec_sra (vy7, shift);
+
+void mpeg2_idct_copy_altivec (int16_t * const _block, uint8_t * dest,
+			      const int stride)
+{
+    vector_s16_t * const block = (vector_s16_t *)_block;
+    vector_u8_t tmp;
+    /* run IDCT on block, write vx0..vx7 to 8 dest rows, then zero block */
+    IDCT
+    /* COPY: vec_packsu src to bytes, vec_ste the words at offsets 0 and 4 */
+#define COPY(dest,src)						\
+    tmp = vec_packsu (src, src);				\
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	\
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+
+    COPY (dest, vx0)	dest += stride;
+    COPY (dest, vx1)	dest += stride;
+    COPY (dest, vx2)	dest += stride;
+    COPY (dest, vx3)	dest += stride;
+    COPY (dest, vx4)	dest += stride;
+    COPY (dest, vx5)	dest += stride;
+    COPY (dest, vx6)	dest += stride;
+    COPY (dest, vx7)
+    /* leave the coefficient block zeroed */
+    block[0] = block[1] = block[2] = block[3] = zero;
+    block[4] = block[5] = block[6] = block[7] = zero;
+}
+
+void mpeg2_idct_add_altivec (const int last, int16_t * const _block,
+			     uint8_t * dest, const int stride)
+{
+    vector_s16_t * const block = (vector_s16_t *)_block;
+    vector_u8_t tmp;
+    vector_s16_t tmp2, tmp3;
+    vector_u8_t perm0;
+    vector_u8_t perm1;
+    vector_u8_t p0, p1, p;
+    /* IDCT block, add vx0..vx7 to 8 dest rows, zero block; `last` is unused */
+    IDCT
+    /* perm0 (from dest) serves rows 0,2,4,6; perm1 (dest+stride) rows 1,3,5,7 */
+    p0 = vec_lvsl (0, dest);
+    p1 = vec_lvsl (stride, dest);
+    p = vec_splat_u8 (-1);
+    perm0 = vec_mergeh (p, p0);
+    perm1 = vec_mergeh (p, p1);
+
+#define ADD(dest,src,perm)						\
+    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */			\
+    tmp = vec_ld (0, dest);						\
+    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);	\
+    tmp3 = vec_adds (tmp2, src);					\
+    tmp = vec_packsu (tmp3, tmp3);					\
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);		\
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+
+    ADD (dest, vx0, perm0)	dest += stride;
+    ADD (dest, vx1, perm1)	dest += stride;
+    ADD (dest, vx2, perm0)	dest += stride;
+    ADD (dest, vx3, perm1)	dest += stride;
+    ADD (dest, vx4, perm0)	dest += stride;
+    ADD (dest, vx5, perm1)	dest += stride;
+    ADD (dest, vx6, perm0)	dest += stride;
+    ADD (dest, vx7, perm1)
+    /* leave the coefficient block zeroed */
+    block[0] = block[1] = block[2] = block[3] = zero;
+    block[4] = block[5] = block[6] = block[7] = zero;
+}
+
+void mpeg2_idct_altivec_init (void)
+{
+    extern uint8_t mpeg2_scan_norm[64];
+    extern uint8_t mpeg2_scan_alt[64];
+    uint8_t * const tables[2] = { mpeg2_scan_norm, mpeg2_scan_alt };
+    int t, k, j;
+
+    /* the altivec idct uses a transposed input, so we patch scan tables */
+    for (k = 0; k < 64; k++)
+	for (t = 0; t < 2; t++) {
+	    j = tables[t][k];	/* (j >> 3) and (j & 7) trade places below */
+	    tables[t][k] = (j >> 3) | ((j & 7) << 3);
+	}
+}
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/idct_mlib.c b/src/video_dec/libmpeg2new/libmpeg2/idct_mlib.c
new file mode 100644
index 000000000..55a2e9b64
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/idct_mlib.c
@@ -0,0 +1,60 @@
+/*
+ * idct_mlib.c
+ * Copyright (C) 1999-2003 Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef LIBMPEG2_MLIB
+
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "mpeg2_internal.h"
+
+void mpeg2_idct_add_mlib (const int last, int16_t * const block,
+			  uint8_t * const dest, const int stride)
+{   /* in-place mlib IDCT, add the result to dest, zero block; last unused */
+    mlib_VideoIDCT_IEEE_S16_S16 (block, block);
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (*block));	/* sized from block's own type */
+}
+
+void mpeg2_idct_copy_mlib_non_ieee (int16_t * const block,
+				    uint8_t * const dest, const int stride)
+{   /* mlib 8x8 IDCT of block into dest (presumably as bytes), zero block */
+    mlib_VideoIDCT8x8_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (*block));	/* sized from block's own type */
+}
+
+void mpeg2_idct_add_mlib_non_ieee (const int last, int16_t * const block,
+				   uint8_t * const dest, const int stride)
+{   /* in-place mlib 8x8 IDCT, add the result to dest, zero block; last unused */
+    mlib_VideoIDCT8x8_S16_S16 (block, block);
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (*block));	/* sized from block's own type */
+}
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/idct_mmx.c b/src/video_dec/libmpeg2new/libmpeg2/idct_mmx.c
new file mode 100644
index 000000000..d5a5c08a4
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/idct_mmx.c
@@ -0,0 +1,814 @@
+/*
+ * idct_mmx.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_X86
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+#include "../include/mmx.h"
+
+#define ROW_SHIFT 15	/* right shift applied to the row-pass sums */
+#define COL_SHIFT 6	/* right shift applied to the column-pass outputs */
+/* round: (bias + 0.5) * 2^ROW_SHIFT as int; rounder: that value twice, braced */
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
+#define rounder(bias) {round (bias), round (bias)}
+
+
+#if 0
+/* C row IDCT - it's just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+			     int16_t * table, int32_t * rounder)
+{
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+    /* even inputs row[0,2,4,6]; the rounder is folded into each sum */
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+    /* odd inputs row[1,3,5,7] */
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+    /* sums give row[0..3]; differences give row[4..7] in mirrored order */
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	/* table+0  */ \
+						   c4,  c6,  c4,  c6,	/* table+4  */ \
+						   c1,  c3, -c1, -c5,	/* table+8  */ \
+						   c5,  c7,  c3, -c7,	/* table+12 */ \
+						   c4, -c6,  c4, -c6,	/* table+16 */ \
+						  -c4,  c2,  c4, -c2,	/* table+20 */ \
+						   c5, -c1,  c3, -c1,	/* table+24 */ \
+						   c7,  c3,  c7, -c5 }	/* table+28 */
+
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+				    const int16_t * const table)
+{   /* load the row at row+offset and issue the first products on it */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
+}
+
+static inline void mmxext_row (const int16_t * const table,
+			       const int32_t * const rounder)
+{   /* row body; leaves mm1 = y1 y0, mm3 = y6 y7, mm0/mm4 not yet shifted */
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C5 -C1 C3 C1 */
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
+
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
+    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */
+
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C7 C3 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
+
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
+
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
+
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
+
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
+
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
+
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm4);		/* mm4 = a3 a2 + rounder */
+
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm4);		/* mm4 = a3-b3 a2-b2 + rounder */
+}
+
+static inline void mmxext_row_tail (int16_t * const row, const int store)
+{   /* finish the pending row: shift, pack and store it at row+store */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
+
+    /* slot */
+
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
+}
+
+static inline void mmxext_row_mid (int16_t * const row, const int store,
+				   const int offset,
+				   const int16_t * const table)
+{   /* tail + head interleaved: store row+store while loading row+offset */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
+
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
+
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
+}
+
+
+/* MMX row IDCT */
+
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	/* table+0  */ \
+					   c4,  c6, -c4, -c2,	/* table+4  */ \
+					   c1,  c3,  c3, -c7,	/* table+8  */ \
+					   c5,  c7, -c1, -c5,	/* table+12 */ \
+					   c4, -c6,  c4, -c2,	/* table+16 */ \
+					  -c4,  c2,  c4, -c6,	/* table+20 */ \
+					   c5, -c1,  c7, -c5,	/* table+24 */ \
+					   c7,  c3,  c3, -c1 }	/* table+28 */
+
+static inline void mmx_row_head (int16_t * const row, const int offset,
+				 const int16_t * const table)
+{   /* load the row at row+offset and issue the first products on it */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
+}
+
+static inline void mmx_row (const int16_t * const table,
+			    const int32_t * const rounder)
+{   /* row body; leaves mm1 = y1 y0, mm3 = y6 y7, mm0/mm7 not yet shifted */
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
+    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */
+
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
+    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */
+
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C5 -C1 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
+
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
+
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
+
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
+
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
+
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
+
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm7);		/* mm7 = a3 a2 + rounder */
+
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm7);		/* mm7 = a3-b3 a2-b2 + rounder */
+}
+
+static inline void mmx_row_tail (int16_t * const row, const int store)
+{   /* finish the pending row: shift, pack and store it at row+store */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm4);		/* mm4 = y6 y7 y4 y5 */
+
+    pslld_i2r (16, mm7);		/* mm7 = y7 0 y5 0 */
+
+    psrld_i2r (16, mm4);		/* mm4 = 0 y6 0 y4 */
+
+    por_r2r (mm4, mm7);			/* mm7 = y7 y6 y5 y4 */
+
+    /* slot */
+
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
+}
+
+static inline void mmx_row_mid (int16_t * const row, const int store,
+				const int offset, const int16_t * const table)
+{   /* tail + head interleaved: store row+store while loading row+offset */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm1);		/* mm1 = y6 y7 y4 y5 */
+
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
+    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */
+
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */
+
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
+
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+}
+
+
+#if 0
+/* C column IDCT - it's just here to document the MMXEXT and MMX versions */
+static inline void idct_col (int16_t * col, int offset)
+{
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+    col += offset;
+
+    x0 = col[0*8];
+    x1 = col[1*8];
+    x2 = col[2*8];
+    x3 = col[3*8];
+    x4 = col[4*8];
+    x5 = col[5*8];
+    x6 = col[6*8];
+    x7 = col[7*8];
+    /* even inputs x0, x4, x2, x6 -> a0..a3 */
+    u04 = S (x0 + x4);
+    v04 = S (x0 - x4);
+    u26 = S (F (T2, x6) + x2);
+    v26 = S (F (T2, x2) - x6);
+
+    a0 = S (u04 + u26);
+    a1 = S (v04 + v26);
+    a2 = S (v04 - v26);
+    a3 = S (u04 - u26);
+    /* odd inputs x1, x7, x3, x5 -> b0..b3 */
+    u17 = S (F (T1, x7) + x1);
+    v17 = S (F (T1, x1) - x7);
+    u35 = S (F (T3, x5) + x3);
+    v35 = S (F (T3, x3) - x5);
+
+    b0 = S (u17 + u35);
+    b3 = S (v17 - v35);
+    u12 = S (u17 - u35);
+    v12 = S (v17 + v35);
+    u12 = S (2 * F (C4, u12));
+    v12 = S (2 * F (C4, v12));
+    b1 = S (u12 + v12);
+    b2 = S (u12 - v12);
+    /* y0..y3 come from the sums, y4..y7 from the differences */
+    y0 = S (a0 + b0) >> COL_SHIFT;
+    y1 = S (a1 + b1) >> COL_SHIFT;
+    y2 = S (a2 + b2) >> COL_SHIFT;
+    y3 = S (a3 + b3) >> COL_SHIFT;
+
+    y4 = S (a3 - b3) >> COL_SHIFT;
+    y5 = S (a2 - b2) >> COL_SHIFT;
+    y6 = S (a1 - b1) >> COL_SHIFT;
+    y7 = S (a0 - b0) >> COL_SHIFT;
+
+    col[0*8] = y0;
+    col[1*8] = y1;
+    col[2*8] = y2;
+    col[3*8] = y3;
+    col[4*8] = y4;
+    col[5*8] = y5;
+    col[6*8] = y6;
+    col[7*8] = y7;
+}
+#endif
+
+
+/* MMX column IDCT */
+static inline void idct_col (int16_t * const col, const int offset)
+{   /* in-place column pass over the four int16 columns at col+offset */
+#define T1 13036
+#define T2 27146
+#define T3 43790
+#define C4 23170
+    /* NOTE(review): T3 > 32767, so _T3 holds a wrapped value ("T3-1" below) */
+    static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+    static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+
+    /* column code adapted from peter gubanov */
+    /* http://www.elecard.com/peter/idct.shtml */
+
+    movq_m2r (*_T1, mm0);		/* mm0 = T1 */
+
+    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
+    movq_r2r (mm0, mm2);		/* mm2 = T1 */
+
+    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
+    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */
+
+    movq_m2r (*_T3, mm5);		/* mm5 = T3-1 */
+    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */
+
+    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
+    movq_r2r (mm5, mm7);		/* mm7 = T3-1 */
+
+    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
+    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */
+
+    movq_m2r (*_T2, mm4);		/* mm4 = T2 */
+    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */
+
+    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
+    pmulhw_r2r (mm6, mm7);		/* mm7 = (T3-1)*x5 */
+
+    /* slot */
+
+    movq_r2r (mm4, mm2);		/* mm2 = T2 */
+    paddsw_r2r (mm3, mm5);		/* mm5 = T3*x3 */
+
+    pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
+    paddsw_r2r (mm6, mm7);		/* mm7 = T3*x5 */
+
+    psubsw_r2r (mm6, mm5);		/* mm5 = v35 */
+    paddsw_r2r (mm3, mm7);		/* mm7 = u35 */
+
+    movq_m2r (*(col+offset+6*8), mm3);	/* mm3 = x6 */
+    movq_r2r (mm0, mm6);		/* mm6 = v17 */
+
+    pmulhw_r2r (mm3, mm2);		/* mm2 = T2*x6 */
+    psubsw_r2r (mm5, mm0);		/* mm0 = b3 */
+
+    psubsw_r2r (mm3, mm4);		/* mm4 = v26 */
+    paddsw_r2r (mm6, mm5);		/* mm5 = v12 */
+
+    movq_r2m (mm0, *(col+offset+3*8));	/* save b3 in scratch0 */
+    movq_r2r (mm1, mm6);		/* mm6 = u17 */
+
+    paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
+    paddsw_r2r (mm7, mm6);		/* mm6 = b0 */
+
+    psubsw_r2r (mm7, mm1);		/* mm1 = u12 */
+    movq_r2r (mm1, mm7);		/* mm7 = u12 */
+
+    movq_m2r (*(col+offset+0*8), mm3);	/* mm3 = x0 */
+    paddsw_r2r (mm5, mm1);		/* mm1 = u12+v12 */
+
+    movq_m2r (*_C4, mm0);		/* mm0 = C4/2 */
+    psubsw_r2r (mm5, mm7);		/* mm7 = u12-v12 */
+
+    movq_r2m (mm6, *(col+offset+5*8));	/* save b0 in scratch1 */
+    pmulhw_r2r (mm0, mm1);		/* mm1 = b1/2 */
+
+    movq_r2r (mm4, mm6);		/* mm6 = v26 */
+    pmulhw_r2r (mm0, mm7);		/* mm7 = b2/2 */
+
+    movq_m2r (*(col+offset+4*8), mm5);	/* mm5 = x4 */
+    movq_r2r (mm3, mm0);		/* mm0 = x0 */
+
+    psubsw_r2r (mm5, mm3);		/* mm3 = v04 */
+    paddsw_r2r (mm5, mm0);		/* mm0 = u04 */
+
+    paddsw_r2r (mm3, mm4);		/* mm4 = a1 */
+    movq_r2r (mm0, mm5);		/* mm5 = u04 */
+
+    psubsw_r2r (mm6, mm3);		/* mm3 = a2 */
+    paddsw_r2r (mm2, mm5);		/* mm5 = a0 */
+
+    paddsw_r2r (mm1, mm1);		/* mm1 = b1 */
+    psubsw_r2r (mm2, mm0);		/* mm0 = a3 */
+
+    paddsw_r2r (mm7, mm7);		/* mm7 = b2 */
+    movq_r2r (mm3, mm2);		/* mm2 = a2 */
+
+    movq_r2r (mm4, mm6);		/* mm6 = a1 */
+    paddsw_r2r (mm7, mm3);		/* mm3 = a2+b2 */
+
+    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y2 */
+    paddsw_r2r (mm1, mm4);		/* mm4 = a1+b1 */
+
+    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y1 */
+    psubsw_r2r (mm1, mm6);		/* mm6 = a1-b1 */
+
+    movq_m2r (*(col+offset+5*8), mm1);	/* mm1 = b0 */
+    psubsw_r2r (mm7, mm2);		/* mm2 = a2-b2 */
+
+    psraw_i2r (COL_SHIFT, mm6);		/* mm6 = y6 */
+    movq_r2r (mm5, mm7);		/* mm7 = a0 */
+
+    movq_r2m (mm4, *(col+offset+1*8));	/* save y1 */
+    psraw_i2r (COL_SHIFT, mm2);		/* mm2 = y5 */
+
+    movq_r2m (mm3, *(col+offset+2*8));	/* save y2 */
+    paddsw_r2r (mm1, mm5);		/* mm5 = a0+b0 */
+
+    movq_m2r (*(col+offset+3*8), mm4);	/* mm4 = b3 */
+    psubsw_r2r (mm1, mm7);		/* mm7 = a0-b0 */
+
+    psraw_i2r (COL_SHIFT, mm5);		/* mm5 = y0 */
+    movq_r2r (mm0, mm3);		/* mm3 = a3 */
+
+    movq_r2m (mm2, *(col+offset+5*8));	/* save y5 */
+    psubsw_r2r (mm4, mm3);		/* mm3 = a3-b3 */
+
+    psraw_i2r (COL_SHIFT, mm7);		/* mm7 = y7 */
+    paddsw_r2r (mm0, mm4);		/* mm4 = a3+b3 */
+
+    movq_r2m (mm5, *(col+offset+0*8));	/* save y0 */
+    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y4 */
+
+    movq_r2m (mm6, *(col+offset+6*8));	/* save y6 */
+    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y3 */
+
+    movq_r2m (mm7, *(col+offset+7*8));	/* save y7 */
+
+    movq_r2m (mm3, *(col+offset+4*8));	/* save y4 */
+
+    movq_r2m (mm4, *(col+offset+3*8));	/* save y3 */
+}
+
+
+static const int32_t rounder0[] ATTR_ALIGN(8) =	/* rounderN pairs with row N */
+    rounder ((1 << (COL_SHIFT - 1)) - 0.5);	/* bias 31.5 when COL_SHIFT is 6 */
+static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
+static const int32_t rounder1[] ATTR_ALIGN(8) =
+    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
+static const int32_t rounder7[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
+static const int32_t rounder2[] ATTR_ALIGN(8) =
+    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
+static const int32_t rounder6[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C2 * (C6-C2)/2 */
+static const int32_t rounder3[] ATTR_ALIGN(8) =
+    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
+static const int32_t rounder5[] ATTR_ALIGN(8) =
+    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
+
+
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
+static inline void idct (int16_t * const block)				\
+{									\
+    static const int16_t table04[] ATTR_ALIGN(16) =			\
+	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
+    static const int16_t table17[] ATTR_ALIGN(16) =			\
+	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
+    static const int16_t table26[] ATTR_ALIGN(16) =			\
+	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
+    static const int16_t table35[] ATTR_ALIGN(16) =			\
+	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
+    /* rows run in the order 0,4,1,7,2,6,3,5; each _mid stores one, loads next */ \
+    idct_row_head (block, 0*8, table04);				\
+    idct_row (table04, rounder0);					\
+    idct_row_mid (block, 0*8, 4*8, table04);				\
+    idct_row (table04, rounder4);					\
+    idct_row_mid (block, 4*8, 1*8, table17);				\
+    idct_row (table17, rounder1);					\
+    idct_row_mid (block, 1*8, 7*8, table17);				\
+    idct_row (table17, rounder7);					\
+    idct_row_mid (block, 7*8, 2*8, table26);				\
+    idct_row (table26, rounder2);					\
+    idct_row_mid (block, 2*8, 6*8, table26);				\
+    idct_row (table26, rounder6);					\
+    idct_row_mid (block, 6*8, 3*8, table35);				\
+    idct_row (table35, rounder3);					\
+    idct_row_mid (block, 3*8, 5*8, table35);				\
+    idct_row (table35, rounder5);					\
+    idct_row_tail (block, 5*8);						\
+    /* then two column passes, at column offsets 0 and 4 */		\
+    idct_col (block, 0);						\
+    idct_col (block, 4);						\
+}
+
+/* One pipelined step of block_copy.  Loads the eight int16 values at block+offset into r0/r1, advances the caller's `dest` by one row, stores the already packed row held in r2, then packs r0/r1 into eight unsigned-saturated bytes in r0.  Refers to the caller's `block` and `stride`. */
+#define COPY_MMX(offset,r0,r1,r2)	\
+do {					\
+    movq_m2r (*(block+offset), r0);	/* coefficients 0..3 of this row */ \
+    dest += stride;			\
+    movq_m2r (*(block+offset+4), r1);	/* coefficients 4..7 of this row */ \
+    movq_r2m (r2, *dest);		/* store the row packed by the previous step */ \
+    packuswb_r2r (r1, r0);		/* r0 = this row as 8 saturated bytes */ \
+} while (0)
+/* Writes the 64 int16 values of `block` to `dest` as eight rows of eight bytes, `stride` bytes apart, clamping each value to 0..255 (packuswb).  Loads of a later row are interleaved with the store of an earlier one, so the statement order matters. */
+static inline void block_copy (int16_t * const block, uint8_t * dest,
+			       const int stride)
+{
+    movq_m2r (*(block+0*8), mm0);
+    movq_m2r (*(block+0*8+4), mm1);
+    movq_m2r (*(block+1*8), mm2);
+    packuswb_r2r (mm1, mm0);	/* mm0 = row 0 packed */
+    movq_m2r (*(block+1*8+4), mm3);
+    movq_r2m (mm0, *dest);	/* row 0 goes out; mm0 is free again */
+    packuswb_r2r (mm3, mm2);	/* mm2 = row 1 packed, stored by the next COPY_MMX */
+    COPY_MMX (2*8, mm0, mm1, mm2);
+    COPY_MMX (3*8, mm2, mm3, mm0);
+    COPY_MMX (4*8, mm0, mm1, mm2);
+    COPY_MMX (5*8, mm2, mm3, mm0);
+    COPY_MMX (6*8, mm0, mm1, mm2);
+    COPY_MMX (7*8, mm2, mm3, mm0);
+    movq_r2m (mm2, *(dest+stride));	/* row 7, packed by the last COPY_MMX */
+}
+
+/* One pipelined step of block_add.  Fetches the destination row two strides ahead into r1 (copied to r2), packs the finished row in r3/r4 and stores it one stride ahead (advancing the caller's `dest`), then widens the fetched bytes to 16 bits and adds the coefficients at block+offset with signed saturation.  mm0 must hold zero. */
+#define ADD_MMX(offset,r1,r2,r3,r4)	\
+do {					\
+    movq_m2r (*(dest+2*stride), r1);	\
+    packuswb_r2r (r4, r3);		/* r3 = previous row, clamped to bytes */ \
+    movq_r2r (r1, r2);			\
+    dest += stride;			\
+    movq_r2m (r3, *dest);		\
+    punpcklbw_r2r (mm0, r1);		/* low four pixels widened to words */ \
+    paddsw_m2r (*(block+offset), r1);	\
+    punpckhbw_r2r (mm0, r2);		/* high four pixels widened to words */ \
+    paddsw_m2r (*(block+offset+4), r2);	\
+} while (0)
+/* Adds the 64 int16 values of `block` to the 8x8 bytes at `dest` (rows `stride` apart): each pixel is widened to 16 bits, summed with signed saturation and packed back with unsigned saturation.  Work on neighbouring rows is interleaved, so the statement order matters. */
+static inline void block_add (int16_t * const block, uint8_t * dest,
+			      const int stride)
+{
+    movq_m2r (*dest, mm1);
+    pxor_r2r (mm0, mm0);	/* mm0 = 0, the high byte used when widening */
+    movq_m2r (*(dest+stride), mm3);
+    movq_r2r (mm1, mm2);
+    punpcklbw_r2r (mm0, mm1);
+    movq_r2r (mm3, mm4);
+    paddsw_m2r (*(block+0*8), mm1);
+    punpckhbw_r2r (mm0, mm2);
+    paddsw_m2r (*(block+0*8+4), mm2);
+    punpcklbw_r2r (mm0, mm3);
+    paddsw_m2r (*(block+1*8), mm3);
+    packuswb_r2r (mm2, mm1);	/* row 0 finished */
+    punpckhbw_r2r (mm0, mm4);
+    movq_r2m (mm1, *dest);
+    paddsw_m2r (*(block+1*8+4), mm4);	/* row 1 left unpacked in mm3/mm4 for the first ADD_MMX */
+    ADD_MMX (2*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (3*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (4*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (5*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (6*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (7*8, mm3, mm4, mm1, mm2);
+    packuswb_r2r (mm4, mm3);	/* row 7, summed by the last ADD_MMX */
+    movq_r2m (mm3, *(dest+stride));
+}
+
+/* Clears all 64 int16 coefficients of `block` with sixteen 8-byte stores of a zeroed mm0. */
+static inline void block_zero (int16_t * const block)
+{
+    pxor_r2r (mm0, mm0);	/* mm0 = 0 */
+    movq_r2m (mm0, *(block+0*4));
+    movq_r2m (mm0, *(block+1*4));
+    movq_r2m (mm0, *(block+2*4));
+    movq_r2m (mm0, *(block+3*4));
+    movq_r2m (mm0, *(block+4*4));
+    movq_r2m (mm0, *(block+5*4));
+    movq_r2m (mm0, *(block+6*4));
+    movq_r2m (mm0, *(block+7*4));
+    movq_r2m (mm0, *(block+8*4));
+    movq_r2m (mm0, *(block+9*4));
+    movq_r2m (mm0, *(block+10*4));
+    movq_r2m (mm0, *(block+11*4));
+    movq_r2m (mm0, *(block+12*4));
+    movq_r2m (mm0, *(block+13*4));
+    movq_r2m (mm0, *(block+14*4));
+    movq_r2m (mm0, *(block+15*4));
+}
+
+/* Values for the `cpu` argument of block_add_DC, tested by dup4 to choose the instruction sequence. */
+#define CPU_MMXEXT 0
+#define CPU_MMX 1
+/* Replicates the low 16-bit word of `reg` into all four words.  Reads a `cpu` variable from the enclosing scope: CPU_MMXEXT uses a single pshufw, anything else uses two unpack instructions. */
+#define dup4(reg)			\
+do {					\
+    if (cpu != CPU_MMXEXT) {		\
+	punpcklwd_r2r (reg, reg);	\
+	punpckldq_r2r (reg, reg);	\
+    } else				\
+	pshufw_r2r (reg, reg, 0x00);	\
+} while (0)
+/* DC-only shortcut: adds the rounded DC term (block[0] + 64) >> 7 to all 64 pixels at `dest` (rows `stride` apart) with saturation, then clears block[0] and block[63].  The signed DC is applied as an unsigned saturating add of its positive part (mm0) followed by an unsigned saturating subtract of its negative part (mm1).  `cpu` selects the dup4 variant. */
+static inline void block_add_DC (int16_t * const block, uint8_t * dest,
+				 const int stride, const int cpu)
+{
+    movd_v2r ((block[0] + 64) >> 7, mm0);	/* rounded DC term */
+    pxor_r2r (mm1, mm1);	/* mm1 = 0 */
+    movq_m2r (*dest, mm2);
+    dup4 (mm0);	/* DC in all four words of mm0 */
+    psubsw_r2r (mm0, mm1);	/* mm1 = -DC in every word */
+    packuswb_r2r (mm0, mm0);	/* mm0 = eight bytes of DC clamped to 0..255 (0 when DC < 0) */
+    paddusb_r2r (mm0, mm2);
+    packuswb_r2r (mm1, mm1);	/* mm1 = eight bytes of -DC clamped to 0..255 (0 when DC > 0) */
+    movq_m2r (*(dest + stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    block[0] = 0;	/* the DC coefficient has been consumed */
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    dest += stride;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    dest += stride;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    block[63] = 0;	/* NOTE(review): why block[63] is cleared is not visible here - confirm with the code that fills the block */
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *(dest + stride));
+    psubusb_r2r (mm1, mm3);
+    movq_r2m (mm3, *(dest + 2*stride));
+}
+
+/* Instantiates mmxext_idct (int16_t * const block) from the mmxext_* table and row helpers. */
+declare_idct (mmxext_idct, mmxext_table,
+	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+/* Transforms `block` in place with mmxext_idct, writes the clamped result to the 8x8 bytes at `dest` (rows `stride` apart), then zeroes `block`. */
+void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
+			     const int stride)
+{
+    mmxext_idct (block);
+    block_copy (block, dest, stride);
+    block_zero (block);
+}
+/* Adds the inverse transform of `block` to the 8x8 bytes at `dest` (rows `stride` apart).  When last == 129 and bits 4..6 of block[0] are not equal to 4, only the DC term is applied through block_add_DC; in every other case the full mmxext_idct runs, its result is added and `block` is zeroed. */
+void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
+			    uint8_t * const dest, const int stride)
+{
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4))
+	block_add_DC (block, dest, stride, CPU_MMXEXT);
+    else {
+	mmxext_idct (block);
+	block_add (block, dest, stride);
+	block_zero (block);
+    }
+}
+
+declare_idct (mmx_idct, mmx_table,	/* instantiates mmx_idct (int16_t * const block) from the mmx_* helpers */
+	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+/* Transforms `block` in place with mmx_idct, writes the clamped result to the 8x8 bytes at `dest` (rows `stride` apart), then zeroes `block`. */
+void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
+			  const int stride)
+{
+    mmx_idct (block);
+    block_copy (block, dest, stride);
+    block_zero (block);
+}
+/* Adds the inverse transform of `block` to the 8x8 bytes at `dest` (rows `stride` apart).  When last == 129 and bits 4..6 of block[0] are not equal to 4, only the DC term is applied through block_add_DC; in every other case the full mmx_idct runs, its result is added and `block` is zeroed. */
+void mpeg2_idct_add_mmx (const int last, int16_t * const block,
+			 uint8_t * const dest, const int stride)
+{
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4))
+	block_add_DC (block, dest, stride, CPU_MMX);
+    else {
+	mmx_idct (block);
+	block_add (block, dest, stride);
+	block_zero (block);
+    }
+}
+
+void mpeg2_idct_mmx_init (void)
+{
+    extern uint8_t mpeg2_scan_norm[64];
+    extern uint8_t mpeg2_scan_alt[64];
+    uint8_t * const scan[2] = { mpeg2_scan_norm, mpeg2_scan_alt };
+    int t, k;
+
+    /* the mmx/mmxext idct uses a reordered input, so both scan tables are patched in place: bits 3..5 of each entry are kept, bits 2..1 move down to bits 1..0 and bit 0 moves up to bit 2; a second call would permute the tables again */
+    for (t = 0; t < 2; t++)
+	for (k = 0; k < 64; k++) {
+	    const int pos = scan[t][k];
+	    const int low = ((pos & 6) >> 1) | ((pos & 1) << 2);
+	    scan[t][k] = (pos & 0x38) | low;
+	}
+}
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/libmpeg2.pc.in b/src/video_dec/libmpeg2new/libmpeg2/libmpeg2.pc.in
new file mode 100644
index 000000000..d54500b0e
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/libmpeg2.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libmpeg2
+Description: A decoding library for MPEG-1 and MPEG-2 streams.
+Version: @VERSION@
+Libs: -L${libdir} -lmpeg2
+Cflags: -I${includedir}/@PACKAGE@
diff --git a/src/video_dec/libmpeg2new/libmpeg2/libmpeg2convert.pc.in b/src/video_dec/libmpeg2new/libmpeg2/libmpeg2convert.pc.in
new file mode 100644
index 000000000..42383a6e2
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/libmpeg2convert.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libmpeg2convert
+Description: libmpeg2 helper functions for converting to various formats.
+Version: @VERSION@
+Libs: -L${libdir} -lmpeg2convert
+Cflags: -I${includedir}/@PACKAGE@
diff --git a/src/video_dec/libmpeg2new/libmpeg2/motion_comp.c b/src/video_dec/libmpeg2new/libmpeg2/motion_comp.c
new file mode 100644
index 000000000..d5a265d5c
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/motion_comp.c
@@ -0,0 +1,130 @@
+/*
+ * motion_comp.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+/* Motion-compensation function table in use; assigned by mpeg2_mc_init(). */
+mpeg2_mc_t mpeg2_mc;
+/* Copies one implementation table into mpeg2_mc according to the `accel` flag bits.  Only the sections for architectures enabled at compile time exist; each section ends in a dangling `else`, so the chain continues into the next compiled section and ends at the portable mpeg2_mc_c. */
+void mpeg2_mc_init (uint32_t accel)
+{
+#ifdef ARCH_X86
+    if (accel & MPEG2_ACCEL_X86_MMXEXT)	/* x86 order of preference: MMXEXT, 3DNOW, MMX */
+	mpeg2_mc = mpeg2_mc_mmxext;
+    else if (accel & MPEG2_ACCEL_X86_3DNOW)
+	mpeg2_mc = mpeg2_mc_3dnow;
+    else if (accel & MPEG2_ACCEL_X86_MMX)
+	mpeg2_mc = mpeg2_mc_mmx;
+    else
+#endif
+#ifdef ARCH_PPC
+    if (accel & MPEG2_ACCEL_PPC_ALTIVEC)
+	mpeg2_mc = mpeg2_mc_altivec;
+    else
+#endif
+#ifdef ARCH_ALPHA
+    if (accel & MPEG2_ACCEL_ALPHA)
+	mpeg2_mc = mpeg2_mc_alpha;
+    else
+#endif
+#ifdef ARCH_SPARC
+    if (accel & MPEG2_ACCEL_SPARC_VIS)
+	mpeg2_mc = mpeg2_mc_vis;
+    else
+#endif
+	mpeg2_mc = mpeg2_mc_c;	/* fallback when no flag above matched */
+}
+/* Averages of two and four samples in which halves round up.  The arguments are substituted without parentheses, so callers pass parenthesized expressions. */
+#define avg2(a,b) ((a+b+1)>>1)
+#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
+/* Predictors for sample i of the current row, using `ref` and `stride` of the enclosing function: o = the ref sample, x = average with the next sample, y = average with the sample one stride further, xy = average of all four. */
+#define predict_o(i) (ref[i])
+#define predict_x(i) (avg2 (ref[i], ref[i+1]))
+#define predict_y(i) (avg2 (ref[i], (ref+stride)[i]))
+#define predict_xy(i) (avg4 (ref[i], ref[i+1], \
+			     (ref+stride)[i], (ref+stride)[i+1]))
+/* Store operators used as the `op` of MC_FUNC: put overwrites dest[i], avg averages the prediction with the current dest[i]. */
+#define put(predictor,i) dest[i] = predictor (i)
+#define avg(predictor,i) dest[i] = avg2 (predictor (i), dest[i])
+
+/* mc function template: MC_FUNC (op,xy) defines MC_<op>_<xy>_16_c and MC_<op>_<xy>_8_c, which apply `op` with predict_<xy> to 16 resp. 8 samples per row and step ref/dest by `stride` for `height` rows; the do/while runs at least once, so height must be >= 1 */
+
+#define MC_FUNC(op,xy)							\
+static void MC_##op##_##xy##_16_c (uint8_t * dest, const uint8_t * ref,	\
+				   const int stride, int height)	\
+{									\
+    do {								\
+	op (predict_##xy, 0);						\
+	op (predict_##xy, 1);						\
+	op (predict_##xy, 2);						\
+	op (predict_##xy, 3);						\
+	op (predict_##xy, 4);						\
+	op (predict_##xy, 5);						\
+	op (predict_##xy, 6);						\
+	op (predict_##xy, 7);						\
+	op (predict_##xy, 8);						\
+	op (predict_##xy, 9);						\
+	op (predict_##xy, 10);						\
+	op (predict_##xy, 11);						\
+	op (predict_##xy, 12);						\
+	op (predict_##xy, 13);						\
+	op (predict_##xy, 14);						\
+	op (predict_##xy, 15);						\
+	ref += stride;							\
+	dest += stride;							\
+    } while (--height);							\
+}									\
+static void MC_##op##_##xy##_8_c (uint8_t * dest, const uint8_t * ref,	\
+				  const int stride, int height)		\
+{									\
+    do {								\
+	op (predict_##xy, 0);						\
+	op (predict_##xy, 1);						\
+	op (predict_##xy, 2);						\
+	op (predict_##xy, 3);						\
+	op (predict_##xy, 4);						\
+	op (predict_##xy, 5);						\
+	op (predict_##xy, 6);						\
+	op (predict_##xy, 7);						\
+	ref += stride;							\
+	dest += stride;							\
+    } while (--height);							\
+}
+
+/* definitions of the actual mc functions: every combination of put/avg with the o, x, y and xy predictors, each in a 16-wide and an 8-wide variant */
+
+MC_FUNC (put,o)
+MC_FUNC (avg,o)
+MC_FUNC (put,x)
+MC_FUNC (avg,x)
+MC_FUNC (put,y)
+MC_FUNC (avg,y)
+MC_FUNC (put,xy)
+MC_FUNC (avg,xy)
+/* NOTE(review): MPEG2_MC_EXTERN is defined outside this file; presumably it gathers the functions above into the mpeg2_mc_c table used by mpeg2_mc_init - confirm */
+MPEG2_MC_EXTERN (c)
diff --git a/src/video_dec/libmpeg2new/libmpeg2/motion_comp_alpha.c b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_alpha.c
new file mode 100644
index 000000000..73f6625d2
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_alpha.c
@@ -0,0 +1,253 @@
+/*
+ * motion_comp_alpha.c
+ * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_ALPHA
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+#include "../include/alpha_asm.h"
+/* Average of a and b taken on all bytes of the quadword at once, halves rounding up: per byte (a | b) - ((a ^ b) >> 1).  The mask removes each byte's low bit before the shift so that nothing moves into the neighbouring byte.  Assumes BYTE_VEC (x) replicates x into every byte - TODO confirm in alpha_asm.h. */
+static inline uint64_t avg2 (uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC (0xfe)) >> 1);
+}
+
+// Load two unaligned quadwords (16 bytes) from addr into ret_l and ret_r: three ldq_u loads at addr+0, +8 and +16 are merged pairwise with extql/extqh.
+// This macro only works if addr is actually unaligned. addr is expanded several times, so pass an expression without side effects.
+#define ULOAD16(ret_l,ret_r,addr)			\
+    do {						\
+	uint64_t _l = ldq_u (addr +  0);		\
+	uint64_t _m = ldq_u (addr +  8);		\
+	uint64_t _r = ldq_u (addr + 16);		\
+	ret_l = extql (_l, addr) | extqh (_m, addr);	\
+	ret_r = extql (_m, addr) | extqh (_r, addr);	\
+    } while (0)
+
+// Load two aligned quadwords from addr: ret_l from addr, ret_r from addr + 8 (MAKE_OP uses this only when addr is a multiple of 8).
+#define ALOAD16(ret_l,ret_r,addr)			\
+    do {						\
+	ret_l = ldq (addr);				\
+	ret_r = ldq (addr + 8);				\
+    } while (0)
+/* 8-byte-wide row loop without filtering: each pass LOADs one quadword from `pixels`, STOREs it to `block` and steps both by line_size; runs h times (h >= 1).  LOAD16 is unused.  pixels, block, line_size and h come from the function generated by MAKE_OP. */
+#define OP8(LOAD,LOAD16,STORE)			\
+    do {					\
+	STORE (LOAD (pixels), block);		\
+	pixels += line_size;			\
+	block += line_size;			\
+    } while (--h)
+/* 16-byte-wide row loop without filtering: each pass reads two quadwords with LOAD16 and STOREs them to block and block + 8; runs h times (h >= 1).  LOAD is unused. */
+#define OP16(LOAD,LOAD16,STORE)			\
+    do {					\
+	uint64_t l, r;				\
+	LOAD16 (l, r, pixels);			\
+	STORE (l, block);			\
+	STORE (r, block + 8);			\
+	pixels += line_size;			\
+	block += line_size;			\
+    } while (--h)
+/* 8-byte-wide loop that averages every byte with the following one: p1 is p0 moved down by one byte with pixels[8] placed in the top byte (this lines up neighbouring pixels when the quadword is little-endian - presumably the case here, confirm), and avg2 (p0, p1) is stored.  Runs h times (h >= 1). */
+#define OP8_X2(LOAD,LOAD16,STORE)			\
+    do {						\
+	uint64_t p0, p1;				\
+							\
+	p0 = LOAD (pixels);				\
+	p1 = p0 >> 8 | ((uint64_t) pixels[8] << 56);	\
+	STORE (avg2 (p0, p1), block);			\
+	pixels += line_size;				\
+	block += line_size;				\
+    } while (--h)
+/* 16-byte-wide variant of OP8_X2: the left quadword is averaged with itself moved down one byte (top byte taken from the low byte of p1), the right quadword likewise with pixels[16] as the incoming byte.  Runs h times (h >= 1). */
+#define OP16_X2(LOAD,LOAD16,STORE)				\
+    do {							\
+	uint64_t p0, p1;					\
+								\
+	LOAD16 (p0, p1, pixels);				\
+	STORE (avg2(p0, p0 >> 8 | p1 << 56), block);		\
+	STORE (avg2(p1, p1 >> 8 | (uint64_t) pixels[16] << 56),	\
+	       block + 8);					\
+	pixels += line_size;					\
+	block += line_size;					\
+    } while (--h)
+/* 8-byte-wide loop that stores the average of each source row with the row one line_size further.  Two rows are loaded up front and each pass loads one row ahead before storing; on the final pass line_size is set to 0 so that the look-ahead load re-reads the current row instead of advancing.  Modifies line_size and h. */
+#define OP8_Y2(LOAD,LOAD16,STORE)		\
+    do {					\
+	uint64_t p0, p1;			\
+	p0 = LOAD (pixels);			\
+	pixels += line_size;			\
+	p1 = LOAD (pixels);			\
+	do {					\
+	    uint64_t av = avg2 (p0, p1);	\
+	    if (--h == 0) line_size = 0;	/* last pass: stop advancing */ \
+	    pixels += line_size;		\
+	    p0 = p1;				\
+	    p1 = LOAD (pixels);			\
+	    STORE (av, block);			\
+	    block += line_size;			\
+	} while (h);				\
+    } while (0)
+/* 16-byte-wide variant of OP8_Y2: left and right quadwords of each row are averaged with those of the row one line_size further.  As there, line_size is zeroed on the final pass so the look-ahead load does not advance; line_size and h are modified. */
+#define OP16_Y2(LOAD,LOAD16,STORE)		\
+    do {					\
+	uint64_t p0l, p0r, p1l, p1r;		\
+	LOAD16 (p0l, p0r, pixels);		\
+	pixels += line_size;			\
+	LOAD16 (p1l, p1r, pixels);		\
+	do {					\
+	    uint64_t avl, avr;			\
+	    if (--h == 0) line_size = 0;	/* last pass: stop advancing */ \
+	    avl = avg2 (p0l, p1l);		\
+	    avr = avg2 (p0r, p1r);		\
+	    p0l = p1l;				\
+	    p0r = p1r;				\
+	    pixels += line_size;		\
+	    LOAD16 (p1l, p1r, pixels);		\
+	    STORE (avl, block);			\
+	    STORE (avr, block + 8);		\
+	    block += line_size;			\
+	} while (h);				\
+    } while (0)
+/* 8-byte-wide loop combining each byte, the following byte and the same two bytes one row further.  For every row the horizontal pair is kept as a high part ph (each byte with its low two bits cleared, shifted right by 2, summed) and a low part pl (sum of the low two bits); the stored value is ph + nph + (((pl + npl + 2) >> 2) & 3) per byte, with n* belonging to the next row.  Reads h + 1 rows and stores h. */
+#define OP8_XY2(LOAD,LOAD16,STORE)				\
+    do {							\
+	uint64_t pl, ph;					\
+	uint64_t p1 = LOAD (pixels);				\
+	uint64_t p2 = p1 >> 8 | ((uint64_t) pixels[8] << 56);	\
+								\
+	ph = (((p1 & ~BYTE_VEC (0x03)) >> 2) +			\
+	      ((p2 & ~BYTE_VEC (0x03)) >> 2));			\
+	pl = ((p1 & BYTE_VEC (0x03)) +				\
+	      (p2 & BYTE_VEC (0x03)));				\
+								\
+	do {							\
+	    uint64_t npl, nph;					\
+								\
+	    pixels += line_size;				\
+	    p1 = LOAD (pixels);					\
+	    p2 = (p1 >> 8) | ((uint64_t) pixels[8] << 56);	\
+	    nph = (((p1 & ~BYTE_VEC (0x03)) >> 2) +		\
+	           ((p2 & ~BYTE_VEC (0x03)) >> 2));		\
+	    npl = ((p1 & BYTE_VEC (0x03)) +			\
+	           (p2 & BYTE_VEC (0x03)));			\
+								\
+	    STORE (ph + nph +					\
+		   (((pl + npl + BYTE_VEC (0x02)) >> 2) &	\
+		    BYTE_VEC (0x03)), block);			\
+								\
+	    block += line_size;					\
+            pl = npl;						\
+	    ph = nph;						\
+	} while (--h);						\
+    } while (0)
+/* 16-byte-wide variant of OP8_XY2: the same high-part / low-part sums are kept separately for the left (p0, p1) and right (p2, p3) quadwords; p1 takes its top byte from p2 and p3 takes its top byte from pixels[16].  Reads h + 1 rows and stores h. */
+#define OP16_XY2(LOAD,LOAD16,STORE)				\
+    do {							\
+	uint64_t p0, p1, p2, p3, pl_l, ph_l, pl_r, ph_r;	\
+	LOAD16 (p0, p2, pixels);				\
+	p1 = p0 >> 8 | (p2 << 56);				\
+	p3 = p2 >> 8 | ((uint64_t)pixels[16] << 56);		\
+								\
+	ph_l = (((p0 & ~BYTE_VEC (0x03)) >> 2) +		\
+	        ((p1 & ~BYTE_VEC (0x03)) >> 2));		\
+	pl_l = ((p0 & BYTE_VEC (0x03)) +			\
+	        (p1 & BYTE_VEC(0x03)));				\
+	ph_r = (((p2 & ~BYTE_VEC (0x03)) >> 2) +		\
+	        ((p3 & ~BYTE_VEC (0x03)) >> 2));		\
+	pl_r = ((p2 & BYTE_VEC (0x03)) +			\
+	        (p3 & BYTE_VEC (0x03)));			\
+								\
+	do {							\
+	    uint64_t npl_l, nph_l, npl_r, nph_r;		\
+								\
+	    pixels += line_size;				\
+	    LOAD16 (p0, p2, pixels);				\
+	    p1 = p0 >> 8 | (p2 << 56);				\
+	    p3 = p2 >> 8 | ((uint64_t)pixels[16] << 56);	\
+	    nph_l = (((p0 & ~BYTE_VEC (0x03)) >> 2) +		\
+		     ((p1 & ~BYTE_VEC (0x03)) >> 2));		\
+	    npl_l = ((p0 & BYTE_VEC (0x03)) +			\
+		     (p1 & BYTE_VEC (0x03)));			\
+	    nph_r = (((p2 & ~BYTE_VEC (0x03)) >> 2) +		\
+		     ((p3 & ~BYTE_VEC (0x03)) >> 2));		\
+	    npl_r = ((p2 & BYTE_VEC (0x03)) +			\
+		     (p3 & BYTE_VEC (0x03)));			\
+								\
+	    STORE (ph_l + nph_l +				\
+		   (((pl_l + npl_l + BYTE_VEC (0x02)) >> 2) &	\
+		    BYTE_VEC(0x03)), block);			\
+	    STORE (ph_r + nph_r +				\
+		   (((pl_r + npl_r + BYTE_VEC (0x02)) >> 2) &	\
+		    BYTE_VEC(0x03)), block + 8);		\
+								\
+	    block += line_size;					\
+	    pl_l = npl_l;					\
+	    ph_l = nph_l;					\
+	    pl_r = npl_r;					\
+	    ph_r = nph_r;					\
+	} while (--h);						\
+    } while (0)
+/* Defines static void MC_<OPNAME>_<SUFF>_<SIZE>_alpha (block, pixels, line_size, h).  The body is OPKIND expanded twice: with the uldq / ULOAD16 loaders when `pixels` is not a multiple of 8, with ldq / ALOAD16 otherwise.  line_size and h are plain ints because some OPKIND bodies modify them. */
+#define MAKE_OP(OPNAME,SIZE,SUFF,OPKIND,STORE)				\
+static void MC_ ## OPNAME ## _ ## SUFF ## _ ## SIZE ## _alpha		\
+	(uint8_t *restrict block, const uint8_t *restrict pixels,	\
+	 int line_size, int h)						\
+{									\
+    if ((uint64_t) pixels & 0x7) {					\
+	OPKIND (uldq, ULOAD16, STORE);					\
+    } else {								\
+	OPKIND (ldq, ALOAD16, STORE);					\
+    }									\
+}
+/* Generates the eight functions of one operation: widths 8 and 16, each with the o (plain), x, y and xy loop bodies, all using the given STORE. */
+#define PIXOP(OPNAME,STORE)			\
+    MAKE_OP (OPNAME, 8,  o,  OP8,      STORE);	\
+    MAKE_OP (OPNAME, 8,  x,  OP8_X2,   STORE);	\
+    MAKE_OP (OPNAME, 8,  y,  OP8_Y2,   STORE);	\
+    MAKE_OP (OPNAME, 8,  xy, OP8_XY2,  STORE);	\
+    MAKE_OP (OPNAME, 16, o,  OP16,     STORE);	\
+    MAKE_OP (OPNAME, 16, x,  OP16_X2,  STORE);	\
+    MAKE_OP (OPNAME, 16, y,  OP16_Y2,  STORE);	\
+    MAKE_OP (OPNAME, 16, xy, OP16_XY2, STORE);
+/* STORE is expanded at each PIXOP use with whatever definition is current, so redefining it between the two PIXOP lines switches the generated functions from put to avg. */
+#define STORE(l,b) stq (l, b)	/* put: write the quadword as is */
+PIXOP (put, STORE);
+#undef STORE
+#define STORE(l,b) stq (avg2 (l, ldq (b)), b);	/* avg: average with what is already at b; note that this definition carries its own trailing semicolon */
+PIXOP (avg, STORE);
+/* Function table for the Alpha implementation: the first initializer lists the put functions, the second the avg functions, each ordered 16-wide o, x, y, xy and then 8-wide o, x, y, xy.  NOTE(review): the member layout of mpeg2_mc_t is declared elsewhere - confirm the order matches. */
+mpeg2_mc_t mpeg2_mc_alpha = {
+    { MC_put_o_16_alpha, MC_put_x_16_alpha,
+      MC_put_y_16_alpha, MC_put_xy_16_alpha,
+      MC_put_o_8_alpha, MC_put_x_8_alpha,
+      MC_put_y_8_alpha, MC_put_xy_8_alpha },
+    { MC_avg_o_16_alpha, MC_avg_x_16_alpha,
+      MC_avg_y_16_alpha, MC_avg_xy_16_alpha,
+      MC_avg_o_8_alpha, MC_avg_x_8_alpha,
+      MC_avg_y_8_alpha, MC_avg_xy_8_alpha }
+};
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/motion_comp_altivec.c b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_altivec.c
new file mode 100644
index 000000000..cc1b72f56
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_altivec.c
@@ -0,0 +1,1010 @@
+/*
+ * motion_comp_altivec.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_PPC
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+/* Short names for the AltiVec vector types used below (s/u = signed/unsigned, number = element width in bits). */
+typedef vector signed char vector_s8_t;
+typedef vector unsigned char vector_u8_t;
+typedef vector signed short vector_s16_t;
+typedef vector unsigned short vector_u16_t;
+typedef vector signed int vector_s32_t;
+typedef vector unsigned int vector_u32_t;
+
+#ifndef COFFEE_BREAK	/* Workarounds for gcc suckage */
+/* vec_ld, vec_and and vec_avg are rebound to static inline wrappers fixed to vector_u8_t; my_vec_ld additionally casts the const away from the source pointer before calling the real vec_ld. */
+static inline vector_u8_t my_vec_ld (int const A, const uint8_t * const B)
+{
+    return vec_ld (A, (uint8_t *)B);
+}
+#undef vec_ld
+#define vec_ld my_vec_ld
+
+static inline vector_u8_t my_vec_and (vector_u8_t const A, vector_u8_t const B)
+{
+    return vec_and (A, B);
+}
+#undef vec_and
+#define vec_and my_vec_and
+
+static inline vector_u8_t my_vec_avg (vector_u8_t const A, vector_u8_t const B)
+{
+    return vec_avg (A, B);
+}
+#undef vec_avg
+#define vec_avg my_vec_avg
+
+#endif
+/* Copies a 16-byte-wide block of `height` rows from `ref` (any alignment) to `dest` (written with vec_st).  Every source row is rebuilt from the two aligned vectors covering ref..ref+15 through vec_perm with the vec_lvsl pattern; that pattern is computed once from `ref` and reused for all rows, which relies on every row having the same alignment.  Two rows are handled per loop pass, the loads of the next row preceding the store of the previous one; the pass count is (height >> 1) - 1 in a do/while, so height must be even and at least 4. */
+static void MC_put_o_16_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{
+    vector_u8_t perm, ref0, ref1, tmp;
+
+    perm = vec_lvsl (0, ref);	/* permute pattern derived from the alignment of ref */
+
+    height = (height >> 1) - 1;	/* loop passes; the first load and the last two stores are outside the loop */
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);	/* aligned vector containing the last byte of the row */
+    ref += stride;
+    tmp = vec_perm (ref0, ref1, perm);
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	vec_st (tmp, 0, dest);
+	tmp = vec_perm (ref0, ref1, perm);
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp = vec_perm (ref0, ref1, perm);
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    vec_st (tmp, 0, dest);
+    tmp = vec_perm (ref0, ref1, perm);
+    vec_st (tmp, stride, dest);
+}
+/* Copies an 8-byte-wide block of `height` rows from `ref` to `dest`.  Separate permute patterns are kept for even rows (perm0, from ref) and odd rows (perm1, from ref + stride).  The mergeh + pack sequence turns the vec_lvsl pattern into one that yields the same eight source bytes in both halves of the vector, and each row is written as two 32-bit vec_ste stores at dest and dest + 4.  Pass count is (height >> 1) - 1 in a do/while, so height must be even and at least 4. */
+static void MC_put_o_8_altivec (uint8_t * dest, const uint8_t * ref,
+				const int stride, int height)
+{
+    vector_u8_t perm0, perm1, tmp0, tmp1, ref0, ref1;
+
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);
+    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);	/* pattern for even rows */
+    tmp1 = vec_lvsl (stride, ref);
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);	/* pattern for odd rows */
+
+    height = (height >> 1) - 1;
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);	/* aligned vector containing the last byte of the row */
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_perm (ref0, ref1, perm1);
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp0 = vec_perm (ref0, ref1, perm0);
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp1 = vec_perm (ref0, ref1, perm1);
+    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
+}
+/* 16-byte-wide block of `height` rows where every output byte is vec_avg of a source byte and the byte after it.  permA selects bytes ref..ref+15 and permB (permA + 1) selects ref+1..ref+16, which is why the second load is at offset 16.  Same two-rows-per-pass pipelining as the other functions: height must be even and at least 4. */
+static void MC_put_x_16_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{
+    vector_u8_t permA, permB, ref0, ref1, tmp;
+
+    permA = vec_lvsl (0, ref);
+    permB = vec_add (permA, vec_splat_u8 (1));	/* same pattern moved on by one byte */
+
+    height = (height >> 1) - 1;
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    tmp = vec_avg (vec_perm (ref0, ref1, permA),
+		   vec_perm (ref0, ref1, permB));
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	vec_st (tmp, 0, dest);
+	tmp = vec_avg (vec_perm (ref0, ref1, permA),
+		       vec_perm (ref0, ref1, permB));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp = vec_avg (vec_perm (ref0, ref1, permA),
+		       vec_perm (ref0, ref1, permB));
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    vec_st (tmp, 0, dest);
+    tmp = vec_avg (vec_perm (ref0, ref1, permA),
+		   vec_perm (ref0, ref1, permB));
+    vec_st (tmp, stride, dest);
+}
+/* 8-byte-wide block of `height` rows where every output byte is vec_avg of a source byte and the byte after it.  Even rows use perm0A/perm0B (built from ref), odd rows perm1A/perm1B (built from ref + stride); the B patterns are the A patterns plus one.  Rows are written as two 32-bit vec_ste stores.  Height must be even and at least 4. */
+static void MC_put_x_8_altivec (uint8_t * dest, const uint8_t * ref,
+				const int stride, int height)
+{
+    vector_u8_t perm0A, perm0B, perm1A, perm1B, ones, tmp0, tmp1, ref0, ref1;
+
+    ones = vec_splat_u8 (1);
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);
+    perm0A = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
+    perm0B = vec_add (perm0A, ones);
+    tmp1 = vec_lvsl (stride, ref);
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1A = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
+    perm1B = vec_add (perm1A, ones);
+
+    height = (height >> 1) - 1;
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);	/* offset 8 because the B pattern reaches one byte past the row */
+    ref += stride;
+    tmp0 = vec_avg (vec_perm (ref0, ref1, perm0A),
+		    vec_perm (ref0, ref1, perm0B));
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_avg (vec_perm (ref0, ref1, perm1A),
+			vec_perm (ref0, ref1, perm1B));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp0 = vec_avg (vec_perm (ref0, ref1, perm0A),
+			vec_perm (ref0, ref1, perm0B));
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);
+    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp1 = vec_avg (vec_perm (ref0, ref1, perm1A),
+		    vec_perm (ref0, ref1, perm1B));
+    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
+}
+/* 16-byte-wide block of `height` rows where every output row is vec_avg of a source row and the row one stride further, so height + 1 source rows are read.  tmp0 and tmp1 alternately hold the older and the newer of the two rows being averaged.  Height must be even and at least 4 (pass count (height >> 1) - 1 in a do/while). */
+static void MC_put_y_16_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{
+    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp;
+
+    perm = vec_lvsl (0, ref);
+
+    height = (height >> 1) - 1;
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm);	/* source row 0 */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    ref += stride;
+    tmp1 = vec_perm (ref0, ref1, perm);	/* source row 1 */
+    tmp = vec_avg (tmp0, tmp1);
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	vec_st (tmp, 0, dest);
+	tmp0 = vec_perm (ref0, ref1, perm);
+	tmp = vec_avg (tmp0, tmp1);
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp1 = vec_perm (ref0, ref1, perm);
+	tmp = vec_avg (tmp0, tmp1);
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    vec_st (tmp, 0, dest);
+    tmp0 = vec_perm (ref0, ref1, perm);
+    tmp = vec_avg (tmp0, tmp1);
+    vec_st (tmp, stride, dest);
+}
+/* 8-byte-wide block of `height` rows where every output row is vec_avg of a source row and the row one stride further (height + 1 source rows are read).  perm0 serves source rows at even distance from ref, perm1 those at odd distance; rows are written as two 32-bit vec_ste stores.  Height must be even and at least 4. */
+static void MC_put_y_8_altivec (uint8_t * dest, const uint8_t * ref,
+				const int stride, int height)
+{
+    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1;
+
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);
+    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
+    tmp1 = vec_lvsl (stride, ref);
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
+
+    height = (height >> 1) - 1;
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);	/* source row 0 */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    ref += stride;
+    tmp1 = vec_perm (ref0, ref1, perm1);	/* source row 1 */
+    tmp = vec_avg (tmp0, tmp1);
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp0 = vec_perm (ref0, ref1, perm0);
+	tmp = vec_avg (tmp0, tmp1);
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_perm (ref0, ref1, perm1);
+	tmp = vec_avg (tmp0, tmp1);
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);
+    tmp = vec_avg (tmp0, tmp1);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+}
+/* 16-byte-wide block of `height` rows combining each source byte, the byte after it and the same two bytes one stride further (height + 1 source rows are read).  For every source row the horizontal vec_avg (avgN) and vec_xor (xorN) of the A/B bytes are kept; an output row is vec_avg (avg0, avg1) minus a one-bit term, ones & (xor0 | xor1) & (avg0 ^ avg1).  NOTE(review): that term presumably removes the extra rounding of the nested averages - confirm.  Height must be even and at least 4. */
+static void MC_put_xy_16_altivec (uint8_t * dest, const uint8_t * ref,
+				  const int stride, int height)
+{
+    vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
+    vector_u8_t ones;
+
+    ones = vec_splat_u8 (1);
+    permA = vec_lvsl (0, ref);
+    permB = vec_add (permA, ones);	/* same pattern moved on by one byte */
+
+    height = (height >> 1) - 1;
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg1 = vec_avg (A, B);
+    xor1 = vec_xor (A, B);
+    tmp = vec_sub (vec_avg (avg0, avg1),
+		   vec_and (vec_and (ones, vec_or (xor0, xor1)),
+			    vec_xor (avg0, avg1)));
+
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	vec_st (tmp, 0, dest);
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	avg0 = vec_avg (A, B);
+	xor0 = vec_xor (A, B);
+	tmp = vec_sub (vec_avg (avg0, avg1),
+		       vec_and (vec_and (ones, vec_or (xor0, xor1)),
+				vec_xor (avg0, avg1)));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	avg1 = vec_avg (A, B);
+	xor1 = vec_xor (A, B);
+	tmp = vec_sub (vec_avg (avg0, avg1),
+		       vec_and (vec_and (ones, vec_or (xor0, xor1)),
+				vec_xor (avg0, avg1)));
+    } while (--height);
+
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    vec_st (tmp, 0, dest);
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    tmp = vec_sub (vec_avg (avg0, avg1),
+		   vec_and (vec_and (ones, vec_or (xor0, xor1)),
+			    vec_xor (avg0, avg1)));
+    vec_st (tmp, stride, dest);
+}
+
+static void MC_put_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{   /* 8-wide put, half-pel in x and y: out = (a + b + c + d + 2) >> 2 */
+    vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
+    vector_u8_t avg0, avg1, xor0, xor1, tmp, ones;
+    /* perm0x realigns even rows, perm1x odd rows; xB selects one byte later */
+    ones = vec_splat_u8 (1);
+    perm0A = vec_lvsl (0, ref);
+    perm0A = vec_mergeh (perm0A, perm0A);	/* mergeh+pack: first 8 indices in both halves */
+    perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);
+    perm0B = vec_add (perm0A, ones);
+    perm1A = vec_lvsl (stride, ref);	/* alignment of the following row */
+    perm1A = vec_mergeh (perm1A, perm1A);
+    perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
+    perm1B = vec_add (perm1A, ones);
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: horizontal average and xor of source row 0 */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);	/* quadword holding the 9th source byte */
+    ref += stride;
+    A = vec_perm (ref0, ref1, perm0A);
+    B = vec_perm (ref0, ref1, perm0B);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    /* source row 1, then the first output row (kept in tmp, not stored yet) */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);
+    ref += stride;
+    A = vec_perm (ref0, ref1, perm1A);
+    B = vec_perm (ref0, ref1, perm1B);
+    avg1 = vec_avg (A, B);
+    xor1 = vec_xor (A, B);
+    tmp = vec_sub (vec_avg (avg0, avg1),	/* averaging two rounded averages can be */
+		   vec_and (vec_and (ones, vec_or (xor0, xor1)),	/* 1 too high; this */
+			    vec_xor (avg0, avg1)));	/* term subtracts that 1 */
+    /* each output row is written with two 32-bit vec_ste stores (8 bytes) */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	A = vec_perm (ref0, ref1, perm0A);	/* even source row replaces avg0/xor0 */
+	B = vec_perm (ref0, ref1, perm0B);
+	avg0 = vec_avg (A, B);
+	xor0 = vec_xor (A, B);
+	tmp = vec_sub (vec_avg (avg0, avg1),
+		       vec_and (vec_and (ones, vec_or (xor0, xor1)),
+				vec_xor (avg0, avg1)));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	A = vec_perm (ref0, ref1, perm1A);	/* odd source row replaces avg1/xor1 */
+	B = vec_perm (ref0, ref1, perm1B);
+	avg1 = vec_avg (A, B);
+	xor1 = vec_xor (A, B);
+	tmp = vec_sub (vec_avg (avg0, avg1),
+		       vec_and (vec_and (ones, vec_or (xor0, xor1)),
+				vec_xor (avg0, avg1)));
+    } while (--height);
+    /* epilogue: store the pending row, read one more source row, store the last */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    dest += stride;
+    A = vec_perm (ref0, ref1, perm0A);
+    B = vec_perm (ref0, ref1, perm0B);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    tmp = vec_sub (vec_avg (avg0, avg1),
+		   vec_and (vec_and (ones, vec_or (xor0, xor1)),
+			    vec_xor (avg0, avg1)));
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	/* NOTE(review): the vec_ste pairs */
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);	/* assume dest is 8-byte aligned */
+}
+
+#if 0	/* disabled alternative MC_put_xy_8_altivec; never compiled */
+static void MC_put_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{   /* widens to 16 bits and computes (A + B + C + D + 2) >> 2 directly */
+    vector_u8_t permA, permB, ref0, ref1, A, B, C, D, tmp, zero, ones;
+    vector_u16_t splat2, temp;
+
+    ones = vec_splat_u8 (1);
+    permA = vec_lvsl (0, ref);
+    permB = vec_add (permA, ones);
+
+    zero = vec_splat_u8 (0);
+    splat2 = vec_splat_u16 (2);	/* serves as rounding term and as shift count */
+
+    do {	/* one output row per pass; height is used as given */
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	ref0 = vec_ld (0, ref);	/* following row; loaded again on the next pass */
+	ref1 = vec_ld (8, ref);
+	C = vec_perm (ref0, ref1, permA);
+	D = vec_perm (ref0, ref1, permB);
+	/* vec_mergeh (zero, X) pairs a zero byte with each of the first 8 bytes of X */
+	temp = vec_add (vec_add ((vector_u16_t)vec_mergeh (zero, A),
+				(vector_u16_t)vec_mergeh (zero, B)),
+		       vec_add ((vector_u16_t)vec_mergeh (zero, C),
+				(vector_u16_t)vec_mergeh (zero, D)));
+	temp = vec_sr (vec_add (temp, splat2), splat2);
+	tmp = vec_pack (temp, temp);
+	/* NOTE(review): vec_st writes a whole 16-byte vector for an 8-wide block */
+	vec_st (tmp, 0, dest);
+	dest += stride;
+	tmp = vec_avg (vec_perm (ref0, ref1, permA),	/* NOTE(review): overwritten by */
+		       vec_perm (ref0, ref1, permB));	/* vec_pack before any use */
+    } while (--height);
+}
+#endif
+
+static void MC_avg_o_16_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{   /* 16-wide full-pel avg: dest row = vec_avg (dest row, ref row) */
+    vector_u8_t perm, ref0, ref1, tmp, prev;
+
+    perm = vec_lvsl (0, ref);	/* realigns ref; the same vector serves every row */
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: the result for row 0 is computed but not stored yet */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);	/* quadword holding the 16th source byte */
+    ref += stride;
+    prev = vec_ld (0, dest);
+    tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
+    /* each half-pass stores the pending row while loading the next one */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_st (tmp, 0, dest);
+	tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	prev = vec_ld (2*stride, dest);
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
+    } while (--height);
+    /* epilogue: the last two rows */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    prev = vec_ld (stride, dest);
+    vec_st (tmp, 0, dest);
+    tmp = vec_avg (prev, vec_perm (ref0, ref1, perm));
+    vec_st (tmp, stride, dest);	/* NOTE(review): vec_st assumes 16-byte aligned dest rows */
+}
+
+static void MC_avg_o_8_altivec (uint8_t * dest, const uint8_t * ref,
+				const int stride, int height)
+{   /* 8-wide full-pel avg: dest row = vec_avg (dest row, ref row) */
+    vector_u8_t perm0, perm1, tmp0, tmp1, ref0, ref1, prev;
+    /* perm0 realigns even rows, perm1 odd rows; tmp0/tmp1 are scratch here */
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);	/* mergeh+pack: first 8 indices in both halves */
+    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
+    tmp1 = vec_lvsl (stride, ref);	/* alignment of the following row */
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: the result for row 0 is computed but not stored yet */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);	/* quadword holding the 8th source byte */
+    ref += stride;
+    prev = vec_ld (0, dest);
+    tmp0 = vec_avg (prev, vec_perm (ref0, ref1, perm0));
+    /* each row is written with two 32-bit vec_ste stores (8 bytes) */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_avg (prev, vec_perm (ref0, ref1, perm1));	/* odd row */
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp0 = vec_avg (prev, vec_perm (ref0, ref1, perm0));	/* even row */
+    } while (--height);
+    /* epilogue: the last two rows */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    prev = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp1 = vec_avg (prev, vec_perm (ref0, ref1, perm1));
+    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);	/* NOTE(review): the vec_ste pairs */
+    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);	/* assume dest is 8-byte aligned */
+}
+
+static void MC_avg_x_16_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{   /* 16-wide avg, half-pel in x: dest = vec_avg (dest, vec_avg (ref, ref+1)) */
+    vector_u8_t permA, permB, ref0, ref1, tmp, prev;
+
+    permA = vec_lvsl (0, ref);	/* realigns ref; reused for every row */
+    permB = vec_add (permA, vec_splat_u8 (1));	/* same window one byte later */
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: the result for row 0 is computed but not stored yet */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);	/* quadword holding the 17th source byte */
+    prev = vec_ld (0, dest);
+    ref += stride;
+    tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
+				  vec_perm (ref0, ref1, permB)));
+    /* each half-pass stores the pending row while loading the next one */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_st (tmp, 0, dest);
+	tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
+				      vec_perm (ref0, ref1, permB)));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (2*stride, dest);
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
+				      vec_perm (ref0, ref1, permB)));
+    } while (--height);
+    /* epilogue: the last two rows */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    prev = vec_ld (stride, dest);
+    vec_st (tmp, 0, dest);
+    tmp = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, permA),
+				  vec_perm (ref0, ref1, permB)));
+    vec_st (tmp, stride, dest);	/* NOTE(review): vec_st assumes 16-byte aligned dest rows */
+}
+
+static void MC_avg_x_8_altivec (uint8_t * dest, const uint8_t * ref,
+				const int stride, int height)
+{   /* 8-wide avg, half-pel in x: dest = vec_avg (dest, vec_avg (ref, ref+1)) */
+    vector_u8_t perm0A, perm0B, perm1A, perm1B, ones, tmp0, tmp1, ref0, ref1;
+    vector_u8_t prev;
+    /* perm0x realigns even rows, perm1x odd rows; xB selects one byte later */
+    ones = vec_splat_u8 (1);
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);	/* mergeh+pack: first 8 indices in both halves */
+    perm0A = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
+    perm0B = vec_add (perm0A, ones);
+    tmp1 = vec_lvsl (stride, ref);	/* alignment of the following row */
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1A = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
+    perm1B = vec_add (perm1A, ones);
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: the result for row 0 is computed but not stored yet */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);	/* quadword holding the 9th source byte */
+    prev = vec_ld (0, dest);
+    ref += stride;
+    tmp0 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm0A),
+				   vec_perm (ref0, ref1, perm0B)));
+    /* each row is written with two 32-bit vec_ste stores (8 bytes) */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm1A),	/* odd row */
+				       vec_perm (ref0, ref1, perm1B)));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp0 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm0A),	/* even row */
+				       vec_perm (ref0, ref1, perm0B)));
+    } while (--height);
+    /* epilogue: the last two rows */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);
+    prev = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)tmp0, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp0, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp1 = vec_avg (prev, vec_avg (vec_perm (ref0, ref1, perm1A),
+				   vec_perm (ref0, ref1, perm1B)));
+    vec_ste ((vector_u32_t)tmp1, 0, (unsigned int *)dest);	/* NOTE(review): the vec_ste pairs */
+    vec_ste ((vector_u32_t)tmp1, 4, (unsigned int *)dest);	/* assume dest is 8-byte aligned */
+}
+
+static void MC_avg_y_16_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{   /* 16-wide avg, half-pel in y: dest = vec_avg (dest, vec_avg (row, row+1)) */
+    vector_u8_t perm, ref0, ref1, tmp0, tmp1, tmp, prev;
+
+    perm = vec_lvsl (0, ref);	/* realigns ref; the same vector serves every row */
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: source rows 0 and 1 give the first output row (not stored yet) */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);	/* quadword holding the 16th source byte */
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm);
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    tmp1 = vec_perm (ref0, ref1, perm);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    /* tmp0 and tmp1 alternate as the newest source row, so each row loads once */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_st (tmp, 0, dest);
+	tmp0 = vec_perm (ref0, ref1, perm);
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (15, ref);
+	ref += stride;
+	prev = vec_ld (2*stride, dest);
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	tmp1 = vec_perm (ref0, ref1, perm);
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    } while (--height);
+    /* epilogue: store the pending row, read one more source row, store the last */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (15, ref);
+    prev = vec_ld (stride, dest);
+    vec_st (tmp, 0, dest);
+    tmp0 = vec_perm (ref0, ref1, perm);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    vec_st (tmp, stride, dest);	/* NOTE(review): vec_st assumes 16-byte aligned dest rows */
+}
+
+static void MC_avg_y_8_altivec (uint8_t * dest, const uint8_t * ref,
+				const int stride, int height)
+{   /* 8-wide avg, half-pel in y: dest = vec_avg (dest, vec_avg (row, row+1)) */
+    vector_u8_t perm0, perm1, tmp0, tmp1, tmp, ref0, ref1, prev;
+    /* perm0 realigns even rows, perm1 odd rows; tmp0/tmp1 are scratch here */
+    tmp0 = vec_lvsl (0, ref);
+    tmp0 = vec_mergeh (tmp0, tmp0);	/* mergeh+pack: first 8 indices in both halves */
+    perm0 = vec_pack ((vector_u16_t)tmp0, (vector_u16_t)tmp0);
+    tmp1 = vec_lvsl (stride, ref);	/* alignment of the following row */
+    tmp1 = vec_mergeh (tmp1, tmp1);
+    perm1 = vec_pack ((vector_u16_t)tmp1, (vector_u16_t)tmp1);
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: source rows 0 and 1 give the first output row (not stored yet) */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);	/* quadword holding the 8th source byte */
+    ref += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    tmp1 = vec_perm (ref0, ref1, perm1);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    /* each row is written with two 32-bit vec_ste stores (8 bytes) */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp0 = vec_perm (ref0, ref1, perm0);	/* even source row replaces tmp0 */
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (7, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	tmp1 = vec_perm (ref0, ref1, perm1);	/* odd source row replaces tmp1 */
+	tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    } while (--height);
+    /* epilogue: store the pending row, read one more source row, store the last */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (7, ref);
+    prev = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    dest += stride;
+    tmp0 = vec_perm (ref0, ref1, perm0);
+    tmp = vec_avg (prev, vec_avg (tmp0, tmp1));
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	/* NOTE(review): the vec_ste pairs */
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);	/* assume dest is 8-byte aligned */
+}
+
+static void MC_avg_xy_16_altivec (uint8_t * dest, const uint8_t * ref,
+				  const int stride, int height)
+{   /* 16-wide avg, half-pel x+y: dest = vec_avg (dest, (a + b + c + d + 2) >> 2) */
+    vector_u8_t permA, permB, ref0, ref1, A, B, avg0, avg1, xor0, xor1, tmp;
+    vector_u8_t ones, prev;
+
+    ones = vec_splat_u8 (1);
+    permA = vec_lvsl (0, ref);	/* realigns ref; reused for every row */
+    permB = vec_add (permA, ones);	/* same window one byte later */
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: horizontal average and xor of source row 0 */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);	/* quadword holding the 17th source byte */
+    ref += stride;
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    /* source row 1, then the first output row (kept in tmp, not stored yet) */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg1 = vec_avg (A, B);
+    xor1 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),	/* averaging two rounded averages */
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),	/* can be 1 too */
+					   vec_xor (avg0, avg1))));	/* high; this term subtracts it */
+    /* avg0/xor0 and avg1/xor1 alternate as the newest source row */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_st (tmp, 0, dest);
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	avg0 = vec_avg (A, B);
+	xor0 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (16, ref);
+	ref += stride;
+	prev = vec_ld (2*stride, dest);
+	vec_st (tmp, stride, dest);
+	dest += 2*stride;
+	A = vec_perm (ref0, ref1, permA);
+	B = vec_perm (ref0, ref1, permB);
+	avg1 = vec_avg (A, B);
+	xor1 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+    } while (--height);
+    /* epilogue: store the pending row, read one more source row, store the last */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (16, ref);
+    prev = vec_ld (stride, dest);
+    vec_st (tmp, 0, dest);
+    A = vec_perm (ref0, ref1, permA);
+    B = vec_perm (ref0, ref1, permB);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					   vec_xor (avg0, avg1))));
+    vec_st (tmp, stride, dest);	/* NOTE(review): vec_st assumes 16-byte aligned dest rows */
+}
+
+static void MC_avg_xy_8_altivec (uint8_t * dest, const uint8_t * ref,
+				 const int stride, int height)
+{   /* 8-wide avg, half-pel x+y: dest = vec_avg (dest, (a + b + c + d + 2) >> 2) */
+    vector_u8_t perm0A, perm0B, perm1A, perm1B, ref0, ref1, A, B;
+    vector_u8_t avg0, avg1, xor0, xor1, tmp, ones, prev;
+    /* perm0x realigns even rows, perm1x odd rows; xB selects one byte later */
+    ones = vec_splat_u8 (1);
+    perm0A = vec_lvsl (0, ref);
+    perm0A = vec_mergeh (perm0A, perm0A);	/* mergeh+pack: first 8 indices in both halves */
+    perm0A = vec_pack ((vector_u16_t)perm0A, (vector_u16_t)perm0A);
+    perm0B = vec_add (perm0A, ones);
+    perm1A = vec_lvsl (stride, ref);	/* alignment of the following row */
+    perm1A = vec_mergeh (perm1A, perm1A);
+    perm1A = vec_pack ((vector_u16_t)perm1A, (vector_u16_t)perm1A);
+    perm1B = vec_add (perm1A, ones);
+
+    height = (height >> 1) - 1;	/* 2 rows per pass, 2 peeled; needs height >= 4 */
+    /* prologue: horizontal average and xor of source row 0 */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);	/* quadword holding the 9th source byte */
+    ref += stride;
+    A = vec_perm (ref0, ref1, perm0A);
+    B = vec_perm (ref0, ref1, perm0B);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    /* source row 1, then the first output row (kept in tmp, not stored yet) */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);
+    ref += stride;
+    prev = vec_ld (0, dest);
+    A = vec_perm (ref0, ref1, perm1A);
+    B = vec_perm (ref0, ref1, perm1B);
+    avg1 = vec_avg (A, B);
+    xor1 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),	/* averaging two rounded averages */
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),	/* can be 1 too */
+					   vec_xor (avg0, avg1))));	/* high; this term subtracts it */
+    /* each output row is written with two 32-bit vec_ste stores (8 bytes) */
+    do {
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);	/* old contents of the next dest row */
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	A = vec_perm (ref0, ref1, perm0A);	/* even source row replaces avg0/xor0 */
+	B = vec_perm (ref0, ref1, perm0B);
+	avg0 = vec_avg (A, B);
+	xor0 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+
+	ref0 = vec_ld (0, ref);
+	ref1 = vec_ld (8, ref);
+	ref += stride;
+	prev = vec_ld (stride, dest);
+	vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+	vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+	dest += stride;
+	A = vec_perm (ref0, ref1, perm1A);	/* odd source row replaces avg1/xor1 */
+	B = vec_perm (ref0, ref1, perm1B);
+	avg1 = vec_avg (A, B);
+	xor1 = vec_xor (A, B);
+	tmp = vec_avg (prev,
+		       vec_sub (vec_avg (avg0, avg1),
+				vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					 vec_xor (avg0, avg1))));
+    } while (--height);
+    /* epilogue: store the pending row, read one more source row, store the last */
+    ref0 = vec_ld (0, ref);
+    ref1 = vec_ld (8, ref);
+    prev = vec_ld (stride, dest);
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+    dest += stride;
+    A = vec_perm (ref0, ref1, perm0A);
+    B = vec_perm (ref0, ref1, perm0B);
+    avg0 = vec_avg (A, B);
+    xor0 = vec_xor (A, B);
+    tmp = vec_avg (prev, vec_sub (vec_avg (avg0, avg1),
+				  vec_and (vec_and (ones, vec_or (xor0, xor1)),
+					   vec_xor (avg0, avg1))));
+    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);	/* NOTE(review): the vec_ste pairs */
+    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);	/* assume dest is 8-byte aligned */
+}
+
+MPEG2_MC_EXTERN (altivec)	/* macro defined outside this file; presumably exports the MC_*_altivec set - confirm */
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/motion_comp_mlib.c b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_mlib.c
new file mode 100644
index 000000000..71c085029
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_mlib.c
@@ -0,0 +1,190 @@
+/*
+ * motion_comp_mlib.c
+ * Copyright (C) 2000-2003 Håkan Hjort <d95hjort@dtek.chalmers.se>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef LIBMPEG2_MLIB
+
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "mpeg2_internal.h"
+
+static void MC_put_o_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide put: forwards to mediaLib's VideoCopyRef kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16)	/* every height other than 16 takes the 16x8 kernel */
+	{ mlib_VideoCopyRef_U8_U8_16x8 (dest, src, stride); return; }
+    mlib_VideoCopyRef_U8_U8_16x16 (dest, src, stride);
+}
+
+static void MC_put_x_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide put: forwards to mediaLib's VideoInterpX kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16)	/* every height other than 16 takes the 16x8 kernel */
+	{ mlib_VideoInterpX_U8_U8_16x8 (dest, src, stride, stride); return; }
+    mlib_VideoInterpX_U8_U8_16x16 (dest, src, stride, stride);
+}
+
+static void MC_put_y_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide put: forwards to mediaLib's VideoInterpY kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16)	/* every height other than 16 takes the 16x8 kernel */
+	{ mlib_VideoInterpY_U8_U8_16x8 (dest, src, stride, stride); return; }
+    mlib_VideoInterpY_U8_U8_16x16 (dest, src, stride, stride);
+}
+
+static void MC_put_xy_16_mlib (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide put: forwards to mediaLib's VideoInterpXY kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16)	/* every height other than 16 takes the 16x8 kernel */
+	{ mlib_VideoInterpXY_U8_U8_16x8 (dest, src, stride, stride); return; }
+    mlib_VideoInterpXY_U8_U8_16x16 (dest, src, stride, stride);
+}
+
+static void MC_put_o_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide put: forwards to mediaLib's VideoCopyRef kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoCopyRef_U8_U8_8x4 (dest, src, stride); return; }
+    mlib_VideoCopyRef_U8_U8_8x8 (dest, src, stride);
+}
+
+static void MC_put_x_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide put: forwards to mediaLib's VideoInterpX kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoInterpX_U8_U8_8x4 (dest, src, stride, stride); return; }
+    mlib_VideoInterpX_U8_U8_8x8 (dest, src, stride, stride);
+}
+
+static void MC_put_y_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide put: forwards to mediaLib's VideoInterpY kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoInterpY_U8_U8_8x4 (dest, src, stride, stride); return; }
+    mlib_VideoInterpY_U8_U8_8x8 (dest, src, stride, stride);
+}
+
+static void MC_put_xy_8_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide put: forwards to mediaLib's VideoInterpXY kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoInterpXY_U8_U8_8x4 (dest, src, stride, stride); return; }
+    mlib_VideoInterpXY_U8_U8_8x8 (dest, src, stride, stride);
+}
+
+static void MC_avg_o_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide avg: forwards to mediaLib's VideoCopyRefAve kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16)	/* every height other than 16 takes the 16x8 kernel */
+	{ mlib_VideoCopyRefAve_U8_U8_16x8 (dest, src, stride); return; }
+    mlib_VideoCopyRefAve_U8_U8_16x16 (dest, src, stride);
+}
+
+static void MC_avg_x_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide avg: forwards to mediaLib's VideoInterpAveX kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16) {	/* every height other than 16 takes the 16x8 kernel */
+	mlib_VideoInterpAveX_U8_U8_16x8 (dest, src, stride, stride);
+	return;
+    }
+    mlib_VideoInterpAveX_U8_U8_16x16 (dest, src, stride, stride);
+}
+
+static void MC_avg_y_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 16-wide avg: forwards to mediaLib's VideoInterpAveY kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16) {	/* every height other than 16 takes the 16x8 kernel */
+	mlib_VideoInterpAveY_U8_U8_16x8 (dest, src, stride, stride);
+	return;
+    }
+    mlib_VideoInterpAveY_U8_U8_16x16 (dest, src, stride, stride);
+}
+
+static void MC_avg_xy_16_mlib (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* 16-wide avg: forwards to mediaLib's VideoInterpAveXY kernel (16x16 or 16x8) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 16) {	/* every height other than 16 takes the 16x8 kernel */
+	mlib_VideoInterpAveXY_U8_U8_16x8 (dest, src, stride, stride);
+	return;
+    }
+    mlib_VideoInterpAveXY_U8_U8_16x16 (dest, src, stride, stride);
+}
+
+static void MC_avg_o_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide avg: forwards to mediaLib's VideoCopyRefAve kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoCopyRefAve_U8_U8_8x4 (dest, src, stride); return; }
+    mlib_VideoCopyRefAve_U8_U8_8x8 (dest, src, stride);
+}
+
+static void MC_avg_x_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide avg: forwards to mediaLib's VideoInterpAveX kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoInterpAveX_U8_U8_8x4 (dest, src, stride, stride); return; }
+    mlib_VideoInterpAveX_U8_U8_8x8 (dest, src, stride, stride);
+}
+
+static void MC_avg_y_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* 8-wide avg: forwards to mediaLib's VideoInterpAveY kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8)	/* every height other than 8 takes the 8x4 kernel */
+	{ mlib_VideoInterpAveY_U8_U8_8x4 (dest, src, stride, stride); return; }
+    mlib_VideoInterpAveY_U8_U8_8x8 (dest, src, stride, stride);
+}
+
+static void MC_avg_xy_8_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* 8-wide avg: forwards to mediaLib's VideoInterpAveXY kernel (8x8 or 8x4) */
+    uint8_t * const src = (uint8_t *) ref;	/* the mlib call gets a non-const pointer */
+    if (height != 8) {	/* every height other than 8 takes the 8x4 kernel */
+	mlib_VideoInterpAveXY_U8_U8_8x4 (dest, src, stride, stride);
+	return;
+    }
+    mlib_VideoInterpAveXY_U8_U8_8x8 (dest, src, stride, stride);
+}
+
+MPEG2_MC_EXTERN (mlib)	/* macro defined outside this file; presumably exports the MC_*_mlib set - confirm */
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/motion_comp_mmx.c b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_mmx.c
new file mode 100644
index 000000000..8694bdfea
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_mmx.c
@@ -0,0 +1,1005 @@
+/*
+ * motion_comp_mmx.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_X86
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+#include "../include/mmx.h"
+
+#define CPU_MMXEXT 0	/* NOTE(review): not referenced in the part of the file visible here; */
+#define CPU_3DNOW 1	/* presumably selects the MMXEXT / 3DNow! variant further down - confirm */
+
+
+/* MMX code - needs a rewrite */
+
+/*
+ * Motion Compensation frequently needs to average values using the
+ * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
+ * to compute this, but it's been left out of classic MMX.
+ *
+ * We need to be careful of overflows when doing this computation.
+ * Rather than unpacking data to 16-bits, which reduces parallelism,
+ * we use the following formulas:
+ *
+ * (x+y)>>1 == (x&y)+((x^y)>>1)
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ */
+
+/* constants for the byte-average and 4-way rounding helpers below */
+static mmx_t mask1 = {0xfefefefefefefefeLL};	/* and-mask: clears bit 0 of every byte */
+static mmx_t round4 = {0x0002000200020002LL};	/* 2 in each 16-bit lane, added before >> 2 */
+
+/*
+ * This code should probably be compiled with loop unrolling
+ * (i.e., -funroll-loops in gcc) because some of the loops
+ * use a small static number of iterations. This was written
+ * with the assumption that the compiler knows best about when
+ * unrolling will help.
+ */
+
+static inline void mmx_zero_reg (void)
+{
+    /* mm0 = 0; the 4-way averaging helpers unpack bytes against mm0 */
+    pxor_r2r (mm0, mm0);
+}
+
+static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
+				     const uint8_t * src2)
+{
+    /* 8 bytes at once: *dest = (*src1 + *src2 + 1)/ 2, as (x|y)-((x^y)>>1) */
+    /* scratch registers: mm1-mm4 */
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
+
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
+
+    pxor_r2r (mm1, mm3);	/* xor src1 and src2 */
+    pand_m2r (mask1, mm3);	/* drop each byte's bit 0 before the 64-bit shift */
+    psrlq_i2r (1, mm3);		/* /2 */
+    por_r2r (mm2, mm4);		/* or src1 and src2 */
+    psubb_r2r (mm3, mm4);	/* (x|y) - ((x^y)>>1), per byte */
+    movq_r2m (mm4, *dest);	/* store result in dest */
+}
+
+static inline void mmx_interp_average_2_U8 (uint8_t * dest,
+					    const uint8_t * src1,
+					    const uint8_t * src2)
+{
+    /* 8 bytes at once: *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2 */
+    /* scratch registers: mm1-mm6; both averages use (x|y)-((x^y)>>1) */
+    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */
+
+    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */
+
+    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */
+    /* first average: src1 with src2, result left in mm6 */
+    pxor_r2r (mm3, mm5);	/* xor src1 and src2 */
+    pand_m2r (mask1, mm5);	/* drop each byte's bit 0 before the 64-bit shift */
+    psrlq_i2r (1, mm5);		/* /2 */
+    por_r2r (mm4, mm6);		/* or src1 and src2 */
+    psubb_r2r (mm5, mm6);	/* subtract subresults */
+    movq_r2r (mm6, mm5);	/* copy subresult */
+    /* second average: that result with the old dest bytes */
+    pxor_r2r (mm1, mm5);	/* xor srcavg and dest */
+    pand_m2r (mask1, mm5);	/* drop each byte's bit 0 before the 64-bit shift */
+    psrlq_i2r (1, mm5);		/* /2 */
+    por_r2r (mm2, mm6);		/* or srcavg and dest */
+    psubb_r2r (mm5, mm6);	/* subtract subresults */
+    movq_r2m (mm6, *dest);	/* store result in dest */
+}
+
+static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
+				     const uint8_t * src2,
+				     const uint8_t * src3,
+				     const uint8_t * src4)
+{
+    /* 8 bytes at once: *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4 */
+    /* expects mm0 == 0 (see mmx_zero_reg); scratch registers: mm1-mm6 */
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
+    /* unpacking against the zero in mm0 widens each byte to a 16-bit word */
+    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
+    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */
+
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
+
+    /* now have partials in mm1 and mm2 */
+
+    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
+
+    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */
+
+    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
+    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */
+
+    paddw_r2r (mm5, mm1);	/* add lows */
+    paddw_r2r (mm6, mm2);	/* add highs */
+
+    /* now have subtotal in mm1 and mm2 */
+
+    paddw_m2r (round4, mm1);	/* + 2 per word so the shift rounds */
+    psraw_i2r (2, mm1);		/* /4 */
+    paddw_m2r (round4, mm2);	/* + 2 per word so the shift rounds */
+    psraw_i2r (2, mm2);		/* /4 */
+
+    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
+    movq_r2m (mm1, *dest);	/* store result in dest */
+}
+
+static inline void mmx_interp_average_4_U8 (uint8_t * dest,
+					    const uint8_t * src1,
+					    const uint8_t * src2,
+					    const uint8_t * src3,
+					    const uint8_t * src4)
+{
+    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */
+    /* expects mm0 == 0 (see mmx_zero_reg); scratch registers: mm1-mm6 */
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
+    /* unpacking against the zero in mm0 widens each byte to a 16-bit word */
+    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
+    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */
+
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
+
+    /* now have partials in mm1 and mm2 */
+
+    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
+
+    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */
+
+    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
+    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */
+
+    paddw_r2r (mm5, mm1);	/* add lows */
+    paddw_r2r (mm6, mm2);	/* add highs */
+
+    paddw_m2r (round4, mm1);	/* + 2 per word so the shift rounds */
+    psraw_i2r (2, mm1);		/* /4 */
+    paddw_m2r (round4, mm2);	/* + 2 per word so the shift rounds */
+    psraw_i2r (2, mm2);		/* /4 */
+
+    /* now have subtotal/4 in mm1 and mm2 */
+
+    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */
+
+    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
+    movq_r2r (mm1,mm2);		/* copy subresult */
+    /* second stage: byte average with dest, as (x|y)-((x^y)>>1) */
+    pxor_r2r (mm1, mm3);	/* xor srcavg and dest */
+    pand_m2r (mask1, mm3);	/* drop each byte's bit 0 before the 64-bit shift */
+    psrlq_i2r (1, mm3);		/* /2 */
+    por_r2r (mm2, mm4);		/* or srcavg and dest */
+    psubb_r2r (mm3, mm4);	/* subtract subresults */
+    movq_r2m (mm4, *dest);	/* store result in dest */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
+			       const uint8_t * ref, const int stride)
+{   /* per row: mmx_average_2_U8 (dest, dest, ref); helper is defined outside this excerpt */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_average_2_U8 (dest, dest, ref);
+	/* second 8-byte group only for 16-wide blocks; the wrappers pass 16 or 8 */
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, dest+8, ref+8);
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_mmx with width 16 */
+    MC_avg_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_mmx with width 8 */
+    MC_avg_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
+			       const uint8_t * ref, const int stride)
+{   /* copies 8 (or 16 when width == 16) bytes per row from ref to dest through mm1 */
+    mmx_zero_reg ();
+    /* the copy itself never references mm0; do/while means height must be >= 1 */
+    do {
+	movq_m2r (* ref, mm1);	/* load 8 ref bytes */
+	movq_r2m (mm1,* dest);	/* store 8 bytes at curr */
+
+	if (width == 16)
+	    {
+		movq_m2r (* (ref+8), mm1);	/* load 8 ref bytes */
+		movq_r2m (mm1,* (dest+8));	/* store 8 bytes at curr */
+	    }
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_mmx with width 16 */
+    MC_put_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_mmx with width 8 */
+    MC_put_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+/* Half pixel interpolation in the x direction */
+static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{   /* per row: mmx_interp_average_2_U8 on ref and ref+1 (helper defined outside this excerpt) */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_interp_average_2_U8 (dest, ref, ref+1);
+	/* second 8-byte group for 16-wide blocks; its second source starts at ref+9 */
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_x_mmx with width 16 */
+    MC_avg_x_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_x_mmx with width 8 */
+    MC_avg_x_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{   /* per row: mmx_average_2_U8 on ref and ref+1, result written to dest (helper defined outside this excerpt) */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_average_2_U8 (dest, ref, ref+1);
+	/* second 8-byte group for 16-wide blocks; its second source starts at ref+9 */
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, ref+9);
+
+	dest += stride;
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_x_mmx with width 16 */
+    MC_put_x_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_x_mmx with width 8 */
+    MC_put_x_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
+				  const uint8_t * ref, const int stride)
+{   /* per row: mmx_interp_average_4_U8 over ref, ref+1 and the same two positions one row down */
+    const uint8_t * ref_next = ref + stride;
+    /* ref_next always points one row below ref, so height+1 rows of ref are read */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
+
+	if (width == 16)
+	    mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
+				     ref_next+8, ref_next+9);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_xy_mmx with width 16 */
+    MC_avg_xy_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_xy_mmx with width 8 */
+    MC_avg_xy_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
+				  const uint8_t * ref, const int stride)
+{   /* per row: mmx_average_4_U8 over ref, ref+1 and the same two positions one row down */
+    const uint8_t * ref_next = ref + stride;
+    /* ref_next always points one row below ref, so height+1 rows of ref are read */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);
+
+	if (width == 16)
+	    mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_xy_mmx with width 16 */
+    MC_put_xy_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_xy_mmx with width 8 */
+    MC_put_xy_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{   /* per row: mmx_interp_average_2_U8 on ref and the row below it (helper defined outside this excerpt) */
+    const uint8_t * ref_next = ref + stride;
+    /* ref_next always points one row below ref, so height+1 rows of ref are read */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_interp_average_2_U8 (dest, ref, ref_next);
+
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_y_mmx with width 16 */
+    MC_avg_y_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{   /* fixed-width entry point: forwards to MC_avg_y_mmx with width 8 */
+    MC_avg_y_mmx (8, height, dest, ref, stride);
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{   /* per row: mmx_average_2_U8 on ref and the row below it, result written to dest */
+    const uint8_t * ref_next = ref + stride;
+    /* ref_next always points one row below ref, so height+1 rows of ref are read */
+    mmx_zero_reg ();
+    /* do/while tests height after the body, so height must be >= 1 */
+    do {
+	mmx_average_2_U8 (dest, ref, ref_next);
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, ref_next+8);
+
+	dest += stride;
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_y_mmx with width 16 */
+    MC_put_y_mmx (16, height, dest, ref, stride);
+}
+
+static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{   /* fixed-width entry point: forwards to MC_put_y_mmx with width 8 */
+    MC_put_y_mmx (8, height, dest, ref, stride);
+}
+
+
+MPEG2_MC_EXTERN (mmx)	/* macro defined outside this excerpt; presumably publishes the sixteen _mmx entry points above - TODO confirm */
+
+
+
+
+
+
+
+/* CPU_MMXEXT/CPU_3DNOW adaptation layer */
+
+#define pavg_r2r(src,dest)		\
+do {	/* needs an int named cpu in scope where the macro is expanded */	\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_r2r (src, dest);		\
+    else	/* any other cpu value takes the pavgusb form */	\
+	pavgusb_r2r (src, dest);	\
+} while (0)
+
+#define pavg_m2r(src,dest)		\
+do {	/* memory-source twin of pavg_r2r; also needs cpu in scope */	\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_m2r (src, dest);		\
+    else	/* any other cpu value takes the pavgusb form */	\
+	pavgusb_m2r (src, dest);	\
+} while (0)
+
+
+/* Code shared by the CPU_MMXEXT and CPU_3DNOW entry points */
+
+
+static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride)
+{   /* copies 8 bytes per row from ref to dest through mm0 */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride)
+{   /* copies 16 bytes per row from ref to dest through mm0 and mm1 */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	ref += stride;	/* ref is advanced between the loads and the stores */
+	movq_r2m (mm0, *dest);
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
+{   /* per row: dest[0..7] = packed byte average of ref[0..7] and the old dest[0..7] */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*dest, mm0);	/* pavgb or pavgusb, chosen by cpu */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
+{   /* per row: dest[0..15] = packed byte average of ref[0..15] and the old dest[0..15] */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*dest, mm0);	/* pavgb or pavgusb, chosen by cpu */
+	pavg_m2r (*(dest+8), mm1);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int offset,
+			      const int cpu)
+{   /* per row: dest[0..7] = packed byte average of ref[0..7] and ref[offset..offset+7] */
+    do {	/* callers in this file pass offset 1 (x) or stride (y); height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*(ref+offset), mm0);	/* pavgb or pavgusb, chosen by cpu */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int offset,
+			       const int cpu)
+{   /* per row: dest[0..15] = packed byte average of ref[0..15] and ref[offset..offset+15] */
+    do {	/* callers in this file pass offset 1 (x) or stride (y); height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*(ref+offset), mm0);	/* pavgb or pavgusb, chosen by cpu */
+	pavg_m2r (*(ref+offset+8), mm1);
+	movq_r2m (mm0, *dest);
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int offset,
+			      const int cpu)
+{   /* per row: average ref with ref+offset, then average that with the old dest (8 bytes) */
+    do {	/* callers in this file pass offset 1 (x) or stride (y); height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	pavg_m2r (*(ref+offset), mm0);	/* first stage: the two reference samples */
+	pavg_m2r (*dest, mm0);	/* second stage: blend with the existing destination */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int offset,
+			       const int cpu)
+{   /* per row: average ref with ref+offset, then average that with the old dest (16 bytes) */
+    do {	/* callers in this file pass offset 1 (x) or stride (y); height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+8), mm1);
+	pavg_m2r (*(ref+offset), mm0);	/* first stage: the two reference samples */
+	pavg_m2r (*(ref+offset+8), mm1);
+	pavg_m2r (*dest, mm0);	/* second stage: blend with the existing destination */
+	pavg_m2r (*(dest+8), mm1);
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static mmx_t mask_one = {0x0101010101010101LL};	/* 0x01 in every byte; ANDed into mm7 by the four-sample helpers below */
+
+static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
+{   /* 8-wide 2x2 blend: each dest row combines ref rows r and r+1 at x and x+1 */
+    movq_m2r (*ref, mm0);
+    movq_m2r (*(ref+1), mm1);
+    movq_r2r (mm0, mm7);
+    pxor_r2r (mm1, mm7);	/* mm7 = xor of the first row's two samples */
+    pavg_r2r (mm1, mm0);	/* mm0 = horizontal average of the first row */
+    ref += stride;
+    /* mm0/mm7 carry the upper row's average and xor into each pass; height+1 rows are read */
+    do {
+	movq_m2r (*ref, mm2);
+	movq_r2r (mm0, mm5);
+
+	movq_m2r (*(ref+1), mm3);
+	movq_r2r (mm2, mm6);
+
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm3, mm2);
+	/* mm7 = (xor_upper | xor_lower) & (avg_upper ^ avg_lower) & 0x01 per byte */
+	por_r2r (mm6, mm7);
+	pxor_r2r (mm2, mm5);
+
+	pand_r2r (mm5, mm7);
+	pavg_r2r (mm2, mm0);
+
+	pand_m2r (mask_one, mm7);
+	/* presumably undoes the extra round-up of the cascaded averages - TODO confirm */
+	psubusb_r2r (mm7, mm0);
+
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+
+	movq_r2r (mm6, mm7);	/* unroll ! */
+	movq_r2r (mm2, mm0);	/* unroll ! */
+    } while (--height);
+}
+
+static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
+{   /* 16-wide 2x2 blend, done as two independent 8-byte halves per row */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);	/* mm0/mm1: one diagonal pair */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride), mm3);	/* mm2/mm3: the other diagonal pair */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte as the correction */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* average of the two pair averages minus the correction */
+	movq_r2m (mm0, *dest);
+	/* same sequence for bytes 8..15 */
+	movq_m2r (*(ref+8), mm0);
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
+{   /* 8-wide 2x2 blend of ref, then averaged with the existing dest row */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);	/* mm0/mm1: one diagonal pair */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride), mm3);	/* mm2/mm3: the other diagonal pair */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte as the correction */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* average of the two pair averages minus the correction */
+	movq_m2r (*dest, mm1);
+	pavg_r2r (mm1, mm0);	/* final stage: blend with the old destination bytes */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
+{   /* 16-wide 2x2 blend of ref averaged with dest, as two 8-byte halves per row */
+    do {	/* body runs before the test: height must be >= 1 */
+	movq_m2r (*ref, mm0);
+	movq_m2r (*(ref+stride+1), mm1);	/* mm0/mm1: one diagonal pair */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride), mm3);	/* mm2/mm3: the other diagonal pair */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte as the correction */
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);	/* average of the two pair averages minus the correction */
+	movq_m2r (*dest, mm1);
+	pavg_r2r (mm1, mm0);	/* final stage: blend with the old destination bytes */
+	movq_r2m (mm0, *dest);
+	/* same sequence for bytes 8..15 */
+	movq_m2r (*(ref+8), mm0);
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	movq_m2r (*(dest+8), mm1);
+	pavg_r2r (mm1, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* forwards to MC_avg1_16; CPU_MMXEXT makes the pavg macros emit pavgb */
+    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* forwards to MC_avg1_8; CPU_MMXEXT makes the pavg macros emit pavgb */
+    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* plain 16-byte row copy; MC_put1_16 takes no cpu argument */
+    MC_put1_16 (height, dest, ref, stride);
+}
+
+static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* plain 8-byte row copy; MC_put1_8 takes no cpu argument */
+    MC_put1_8 (height, dest, ref, stride);
+}
+
+static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgb variant */
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgb variant */
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgb variant */
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgb variant */
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
+}
+
+static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgb variant */
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgb variant */
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgb variant */
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgb variant */
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_avg4_16; pavgb variant */
+    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_avg4_8; pavgb variant */
+    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_put4_16; pavgb variant */
+    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_put4_8; pavgb variant */
+    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
+}
+
+
+MPEG2_MC_EXTERN (mmxext)	/* macro defined outside this excerpt; presumably publishes the sixteen _mmxext entry points above - TODO confirm */
+
+
+
+static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* forwards to MC_avg1_16; CPU_3DNOW makes the pavg macros emit pavgusb */
+    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* forwards to MC_avg1_8; CPU_3DNOW makes the pavg macros emit pavgusb */
+    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* plain 16-byte row copy; MC_put1_16 takes no cpu argument */
+    MC_put1_16 (height, dest, ref, stride);
+}
+
+static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* plain 8-byte row copy; MC_put1_8 takes no cpu argument */
+    MC_put1_8 (height, dest, ref, stride);
+}
+
+static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgusb variant */
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgusb variant */
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgusb variant */
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* offset 1 pairs each ref byte with its right-hand neighbour; pavgusb variant */
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
+}
+
+static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgusb variant */
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgusb variant */
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgusb variant */
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{   /* offset stride pairs each ref byte with the one a row below; pavgusb variant */
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
+}
+
+static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_avg4_16; pavgusb variant */
+    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_avg4_8; pavgusb variant */
+    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_put4_16; pavgusb variant */
+    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{   /* 2x2 neighbourhood is handled inside MC_put4_8; pavgusb variant */
+    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
+}
+
+
+MPEG2_MC_EXTERN (3dnow)	/* macro defined outside this excerpt; presumably publishes the sixteen _3dnow entry points above - TODO confirm */
+
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/motion_comp_vis.c b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_vis.c
new file mode 100644
index 000000000..e4b61aaa7
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/motion_comp_vis.c
@@ -0,0 +1,2061 @@
+/*
+ * motion_comp_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined(ARCH_SPARC) && defined(ENABLE_VIS)
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+#include "../include/vis.h"
+
+/* The trick used in some of this file is the formula from the MMX
+ * motion comp code, which is:
+ *
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ *
+ * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
+ * We avoid overflows by masking before we do the shift, and we
+ * implement the shift by multiplying by 1/2 using mul8x16.  So in
+ * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
+ * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
+ * the value 0x80808080 is in f8):
+ *
+ *	fxor		f0, f2, f10
+ *	fand		f10, f4, f10
+ *	fmul8x16	f8, f10, f10
+ *	fand		f10, f6, f10
+ *	for		f0, f2, f12
+ *	fpsub16		f12, f10, f10
+ */
+
+#define DUP4(x) {x, x, x, x}	/* initializer repeating x four times */
+#define DUP8(x) {x, x, x, x, x, x, x, x}	/* initializer repeating x eight times */
+static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);	/* every table below holds exactly 8 bytes of data */
+static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
+static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
+static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
+static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);	/* NOTE(review): 0xfe exceeds INT8_MAX; relies on the conversion keeping the 0xfe bit pattern */
+static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
+static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);	/* NOTE(review): 128 exceeds INT8_MAX as well; intended byte is presumably 0x80 */
+static const int16_t constants256_512[] ATTR_ALIGN(8) =
+	{256, 512, 256, 512};
+static const int16_t constants256_1024[] ATTR_ALIGN(8) =
+	{256, 1024, 256, 1024};
+
+#define REF_0		0	/* these numbers are handed to the vis_ macros; presumably FP register indices - TODO confirm in vis.h */
+#define REF_0_1		1
+#define REF_2		2
+#define REF_2_1		3
+#define REF_4		4
+#define REF_4_1		5
+#define REF_6		6
+#define REF_6_1		7
+#define REF_S0		8
+#define REF_S0_1	9
+#define REF_S2		10
+#define REF_S2_1	11
+#define REF_S4		12
+#define REF_S4_1	13
+#define REF_S6		14
+#define REF_S6_1	15
+#define DST_0		16
+#define DST_1		17
+#define DST_2		18
+#define DST_3		19
+#define CONST_1		20	/* CONST_1, CONST_2, CONST_3, CONST_6 and MASK_fe all map to 20: only one can be live at a time */
+#define CONST_2		20
+#define CONST_3		20
+#define CONST_6		20
+#define MASK_fe		20
+#define CONST_128	22	/* CONST_128, CONST_256, CONST_512 and CONST_1024 all map to 22 */
+#define CONST_256	22
+#define CONST_512	22
+#define CONST_1024	22
+#define TMP0		24
+#define TMP1		25
+#define TMP2		26
+#define TMP3		27
+#define TMP4		28
+#define TMP5		29
+#define ZERO		30	/* ZERO and MASK_7f both map to 30 */
+#define MASK_7f		30
+/* from TMP6 on, the numbers advance in steps of two */
+#define TMP6		32
+#define TMP8		34
+#define TMP10		36
+#define TMP12		38
+#define TMP14		40
+#define TMP16		42
+#define TMP18		44
+#define TMP20		46
+#define TMP22		48
+#define TMP24		50
+#define TMP26		52
+#define TMP28		54
+#define TMP30		56
+#define TMP32		58
+
+static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* stores 16 bytes per row to dest, assembled from _ref with faligndata */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* assumes vis_alignaddr () returns _ref rounded down to an 8-byte boundary - TODO confirm in vis.h */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;	/* the third load targets ref+16 only when ref was adjusted */
+	do {	/* 5 cycles */
+		vis_ld64(ref[0], TMP0);
+		/* do/while tests height after the body, so height must be >= 1 */
+		vis_ld64_2(ref, 8, TMP2);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+
+		vis_faligndata(TMP2, TMP4, REF_2);
+		vis_st64_2(REF_2, dest, 8);
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/* stores 8 bytes per row to dest, assembled from _ref with faligndata */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* assumes vis_alignaddr () returns _ref rounded down to an 8-byte boundary - TODO confirm in vis.h */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;	/* the second load targets ref+8 only when ref was adjusted */
+	do {	/* 4 cycles */
+		vis_ld64(ref[0], TMP0);
+		/* do/while tests height after the body, so height must be >= 1 */
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+
+		/* stall */
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+		vis_st64(REF_0, dest[0]);
+		dest += stride;
+	} while (--height);
+}
+
+
+static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* blends 16 reference bytes per row into dest with the (x|y)-((x^y)>>1) sequence from the file header */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	int offset;
+	/* assumes vis_alignaddr () returns _ref rounded down to an 8-byte boundary - TODO confirm in vis.h */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;	/* the third load targets ref+16 only when ref was adjusted */
+	/* prologue: fetch reference row 0 and dest row 0 before entering the loop */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(dest[8], DST_2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP2, TMP4, REF_2);
+
+	vis_ld64(constants128[0], CONST_128);
+	/* each loop pass emits two rows and the code after the loop emits two more */
+	ref += stride;
+	height = (height >> 1) - 1;	/* so height is expected to be even and >= 4; 2 would enter the do/while with a count of 0 */
+
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP6);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_xor(DST_2, REF_2, TMP8);
+
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_or(DST_0, REF_0, TMP10);
+		vis_ld64_2(dest, stride, DST_0);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+
+		vis_or(DST_2, REF_2, TMP12);
+		vis_ld64_2(dest, stride_8, DST_2);
+
+		vis_ld64(ref[0], TMP14);
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+
+		dest += stride;
+		vis_ld64_2(ref, 8, TMP16);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP18);
+		vis_faligndata(TMP2, TMP4, REF_2);
+		ref += stride;
+		/* second row of this pass */
+		vis_xor(DST_0, REF_0, TMP20);
+
+		vis_and(TMP20, MASK_fe, TMP20);
+
+		vis_xor(DST_2, REF_2, TMP22);
+		vis_mul8x16(CONST_128, TMP20, TMP20);
+
+		vis_and(TMP22, MASK_fe, TMP22);
+
+		vis_or(DST_0, REF_0, TMP24);
+		vis_mul8x16(CONST_128, TMP22, TMP22);
+
+		vis_or(DST_2, REF_2, TMP26);
+
+		vis_ld64_2(dest, stride, DST_0);
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_ld64_2(dest, stride_8, DST_2);
+		vis_faligndata(TMP16, TMP18, REF_2);
+
+		vis_and(TMP20, MASK_7f, TMP20);
+
+		vis_and(TMP22, MASK_7f, TMP22);
+
+		vis_psub16(TMP24, TMP20, TMP20);
+		vis_st64(TMP20, dest[0]);
+
+		vis_psub16(TMP26, TMP22, TMP22);
+		vis_st64_2(TMP22, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* epilogue: the last two rows; ref is not advanced any further */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(DST_0, REF_0, TMP6);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_ld64_2(ref, offset, TMP4);
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_xor(DST_2, REF_2, TMP8);
+
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_or(DST_0, REF_0, TMP10);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+
+	vis_or(DST_2, REF_2, TMP12);
+	vis_ld64_2(dest, stride_8, DST_2);
+
+	vis_ld64(ref[0], TMP14);	/* NOTE(review): TMP14 is not read again in this function */
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+
+	dest += stride;
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_2);
+	/* final row */
+	vis_xor(DST_0, REF_0, TMP20);
+
+	vis_and(TMP20, MASK_fe, TMP20);
+
+	vis_xor(DST_2, REF_2, TMP22);
+	vis_mul8x16(CONST_128, TMP20, TMP20);
+
+	vis_and(TMP22, MASK_fe, TMP22);
+
+	vis_or(DST_0, REF_0, TMP24);
+	vis_mul8x16(CONST_128, TMP22, TMP22);
+
+	vis_or(DST_2, REF_2, TMP26);
+
+	vis_and(TMP20, MASK_7f, TMP20);
+
+	vis_and(TMP22, MASK_7f, TMP22);
+
+	vis_psub16(TMP24, TMP20, TMP20);
+	vis_st64(TMP20, dest[0]);
+
+	vis_psub16(TMP26, TMP22, TMP22);
+	vis_st64_2(TMP22, dest, 8);
+}
+
+static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/* blends 8 reference bytes per row into dest with the (x|y)-((x^y)>>1) sequence from the file header */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+	/* assumes vis_alignaddr () returns _ref rounded down to an 8-byte boundary - TODO confirm in vis.h */
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;	/* the second load targets ref+8 only when ref was adjusted */
+	/* prologue: fetch reference row 0 and dest row 0 before entering the loop */
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, offset, TMP2);
+
+	vis_ld64(dest[0], DST_0);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	/* each loop pass emits two rows and the code after the loop emits two more */
+	ref += stride;
+	height = (height >> 1) - 1;	/* so height is expected to be even and >= 4; 2 would enter the do/while with a count of 0 */
+
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(DST_0, REF_0, TMP4);
+
+		vis_ld64_2(ref, offset, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+		ref += stride;
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_ld64(ref[0], TMP12);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		vis_xor(DST_0, REF_0, TMP0);
+		ref += stride;
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_or(DST_0, REF_0, TMP6);
+		vis_ld64_2(dest, stride, DST_0);
+
+		vis_faligndata(TMP12, TMP2, REF_0);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_psub16(TMP6, TMP0, TMP4);
+		vis_st64(TMP4, dest[0]);
+		dest += stride;
+	} while (--height);
+	/* epilogue: the last two rows; ref is not advanced any further */
+	vis_ld64(ref[0], TMP0);
+	vis_xor(DST_0, REF_0, TMP4);
+
+	vis_ld64_2(ref, offset, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(DST_0, REF_0, TMP6);
+	vis_ld64_2(dest, stride, DST_0);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(DST_0, REF_0, TMP0);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, TMP4);
+	vis_st64(TMP4, dest[0]);
+	dest += stride;
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_or(DST_0, REF_0, TMP6);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_psub16(TMP6, TMP0, TMP4);
+	vis_st64(TMP4, dest[0]);
+}
+
+static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/* writes to dest the blend of the 16 bytes at _ref and the 16 bytes at _ref+1, row by row */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;
+	unsigned long off_plus_1 = off + 1;
+	/* off is the byte offset of _ref inside its 8-byte word; off_plus_1 addresses the right-hand neighbours */
+	ref = vis_alignaddr(ref);
+	/* three 8-byte loads per row are unconditional here: ref, ref+8 and ref+16 */
+	vis_ld64(ref[0],    TMP0);
+
+	vis_ld64_2(ref, 8,  TMP2);
+
+	vis_ld64_2(ref, 16, TMP4);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP2, TMP4, REF_4);
+	/* with off == 7 the +1 view starts exactly at the next 8-byte word, so it is copied instead of realigned */
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+	/* each loop pass emits two rows and the code after the loop emits two more */
+	ref += stride;
+	height = (height >> 1) - 1;	/* so height is expected to be even and >= 4; 2 would enter the do/while with a count of 0 */
+
+	do {	/* 34 cycles */
+		vis_ld64(ref[0],    TMP0);
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_ld64_2(ref, 8,  TMP2);
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_ld64_2(ref, 16, TMP4);
+		vis_and(TMP6, MASK_fe, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],    TMP14);
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_ld64_2(ref, 8,  TMP16);
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_ld64_2(ref, 16, TMP18);
+		ref += stride;
+		vis_or(REF_4, REF_6, TMP12);
+		/* the alignment offset was left at off+1 by the previous step, so it is reset to off first */
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+		/* second row of this pass */
+		vis_xor(REF_0, REF_2, TMP6);
+
+		vis_xor(REF_4, REF_6, TMP8);
+
+		vis_and(TMP6, MASK_fe, TMP6);
+
+		vis_mul8x16(CONST_128, TMP6, TMP6);
+		vis_and(TMP8, MASK_fe, TMP8);
+
+		vis_mul8x16(CONST_128, TMP8, TMP8);
+		vis_or(REF_0, REF_2, TMP10);
+
+		vis_or(REF_4, REF_6, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+
+		vis_faligndata(TMP14, TMP16, REF_0);
+
+		vis_faligndata(TMP16, TMP18, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP14, TMP16, REF_2);
+			vis_faligndata(TMP16, TMP18, REF_6);
+		} else {
+			vis_src1(TMP16, REF_2);
+			vis_src1(TMP18, REF_6);
+		}
+
+		vis_and(TMP6, MASK_7f, TMP6);
+
+		vis_and(TMP8, MASK_7f, TMP8);
+
+		vis_psub16(TMP10, TMP6, TMP6);
+		vis_st64(TMP6, dest[0]);
+
+		vis_psub16(TMP12, TMP8, TMP8);
+		vis_st64_2(TMP8, dest, 8);
+		dest += stride;
+	} while (--height);
+	/* epilogue: the last two rows; ref is not advanced any further */
+	vis_ld64(ref[0],    TMP0);
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_ld64_2(ref, 8,  TMP2);
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_ld64_2(ref, 16, TMP4);
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+		vis_faligndata(TMP2, TMP4, REF_6);
+	} else {
+		vis_src1(TMP2, REF_2);
+		vis_src1(TMP4, REF_6);
+	}
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+	dest += stride;
+	/* final row */
+	vis_xor(REF_0, REF_2, TMP6);
+
+	vis_xor(REF_4, REF_6, TMP8);
+
+	vis_and(TMP6, MASK_fe, TMP6);
+
+	vis_mul8x16(CONST_128, TMP6, TMP6);
+	vis_and(TMP8, MASK_fe, TMP8);
+
+	vis_mul8x16(CONST_128, TMP8, TMP8);
+	vis_or(REF_0, REF_2, TMP10);
+
+	vis_or(REF_4, REF_6, TMP12);
+
+	vis_and(TMP6, MASK_7f, TMP6);
+
+	vis_and(TMP8, MASK_7f, TMP8);
+
+	vis_psub16(TMP10, TMP6, TMP6);
+	vis_st64(TMP6, dest[0]);
+
+	vis_psub16(TMP12, TMP8, TMP8);
+	vis_st64_2(TMP8, dest, 8);
+}
+
+static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/*
+	 * Horizontal half-pel "put" for an 8-byte-wide block: every output
+	 * row is built from the reference row and the same row shifted by
+	 * one byte (x and x + 1), and the result overwrites dest.
+	 *
+	 * dest   - destination rows, written with one 8-byte store per row
+	 * _ref   - reference pixels; may be unaligned, two 8-byte words are
+	 *          loaded per row and recombined with faligndata
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; the loop emits two rows per pass and the
+	 *          epilogue the final two, so height must be even and >= 4
+	 *          (height == 2 would start the counter at 0 and the
+	 *          do/while would not stop where intended)
+	 *
+	 * NOTE(review): assuming constants_fe / constants_7f / constants128
+	 * hold what their names suggest, the or/xor/and/mul8x16/psub16
+	 * sequence evaluates (a | b) - (((a ^ b) & 0xfe) >> 1), which equals
+	 * the rounded-up byte average (a + b + 1) >> 1 -- confirm against
+	 * the constant tables.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64(ref[8], TMP2);
+
+	vis_ld64(constants_fe[0], MASK_fe);
+
+	vis_ld64(constants_7f[0], MASK_7f);
+
+	vis_ld64(constants128[0], CONST_128);
+	vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0 = row 0 at offset off */
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);	/* REF_2 = row 0 at offset off + 1 */
+	} else {
+		vis_src1(TMP2, REF_2);	/* off + 1 == 8: the shifted row is exactly the second word */
+	}
+
+	ref += stride;
+	height = (height >> 1) - 1;	/* two rows per pass; the last pair is handled after the loop */
+
+	do {	/* 20 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_and(TMP4, MASK_fe, TMP4);
+		ref += stride;
+
+		vis_ld64(ref[0], TMP8);
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_alignaddr_g0((void *)off);	/* back to offset off: the previous row left off + 1 selected */
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+		} else {
+			vis_src1(TMP2, REF_2);
+		}
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass */
+		dest += stride;
+
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_or(REF_0, REF_2, TMP14);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_alignaddr_g0((void *)off);
+		vis_faligndata(TMP8, TMP10, REF_0);	/* prepare the row consumed at the top of the next pass */
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP8, TMP10, REF_2);
+		} else {
+			vis_src1(TMP10, REF_2);
+		}
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);	/* second row of this pass */
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* epilogue: only one more reference row is fetched */
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_alignaddr_g0((void *)off);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_2);
+	} else {
+		vis_src1(TMP2, REF_2);
+	}
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_or(REF_0, REF_2, TMP14);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);	/* last row */
+	dest += stride;	/* value unused: dest is not read again before returning */
+}
+
+static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/*
+	 * Horizontal half-pel "avg" for a 16-byte-wide block: each output
+	 * row combines the reference row, the reference row shifted by one
+	 * byte, and the bytes already stored in dest, then writes the
+	 * result back to dest.
+	 *
+	 * dest   - destination rows, read and rewritten 16 bytes at a time
+	 * _ref   - reference pixels; may be unaligned, three 8-byte words
+	 *          (ref[0], ref[8], ref[16]) are loaded per row
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; one row per loop pass, must be >= 1
+	 *
+	 * NOTE(review): assuming constants256_512 = {256, 512} and
+	 * constants3 = 3 per lane, and that pack16 with GSR scale 5 amounts
+	 * to a right shift by 2 with saturation, the per-byte result is
+	 * (a + b + 2 * dest + 3) >> 2 -- confirm against the constant tables
+	 * and the VIS manual.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+	do {	/* 26 cycles */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64(ref[8], TMP2);
+
+		vis_alignaddr_g0((void *)off);	/* the previous pass may have left off + 1 selected */
+
+		vis_ld64(ref[16], TMP4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);
+
+		vis_ld64(dest[8], DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+		} else {
+			vis_src1(TMP2, REF_2);	/* off + 1 == 8: the shifted data is exactly the next word */
+			vis_src1(TMP4, REF_6);
+		}
+
+		vis_mul8x16au(REF_0,   CONST_256, TMP0);	/* widen reference bytes to 16-bit lanes */
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO, REF_2_1, TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_mul8x16al(DST_0,   CONST_512, TMP4);	/* widen the existing dest bytes */
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_mul8x16al(DST_1,   CONST_512, TMP6);
+
+		vis_mul8x16au(REF_6,   CONST_256, TMP12);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4,   CONST_256, TMP16);
+
+		vis_padd16(TMP0, CONST_3, TMP8);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP18);
+
+		vis_padd16(TMP2, CONST_3, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_padd16(TMP16, TMP12, TMP0);
+
+		vis_st64(DST_0, dest[0]);	/* left 8 bytes of the row */
+		vis_mul8x16al(DST_2,   CONST_512, TMP4);
+		vis_padd16(TMP18, TMP14, TMP2);
+
+		vis_mul8x16al(DST_3,   CONST_512, TMP6);
+		vis_padd16(TMP0, CONST_3, TMP0);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[8]);	/* right 8 bytes of the row */
+
+		ref += stride;
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/*
+	 * Horizontal half-pel "avg" for an 8-byte-wide block: each output
+	 * row combines the reference row, the reference row shifted by one
+	 * byte, and the bytes already stored in dest.
+	 *
+	 * dest   - destination rows, read and rewritten 8 bytes at a time
+	 * _ref   - reference pixels; may be unaligned, two 8-byte words are
+	 *          loaded per row
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; four rows are processed per loop pass, so it
+	 *          must be a multiple of 4 and >= 4 (a smaller value makes
+	 *          the counter 0 before the do/while decrements it)
+	 *
+	 * NOTE(review): assuming constants256_512 = {256, 512} and
+	 * constants3 = 3 per lane, and that pack16 with GSR scale 5 amounts
+	 * to a right shift by 2 with saturation, the per-byte result is
+	 * (a + b + 2 * dest + 3) >> 2 -- confirm.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+	int stride_times_2 = stride << 1;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_fzero(ZERO);
+	vis_ld64(constants256_512[0], CONST_256);
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+	height >>= 2;	/* loop counter in units of four rows */
+	do {	/* 47 cycles */
+		vis_ld64(ref[0],   TMP0);
+
+		vis_ld64_2(ref, 8, TMP2);
+		ref += stride;
+
+		vis_alignaddr_g0((void *)off);	/* the previous pass may have left off + 1 selected */
+
+		vis_ld64(ref[0],   TMP4);
+		vis_faligndata(TMP0, TMP2, REF_0);	/* row 0 */
+
+		vis_ld64_2(ref, 8, TMP6);
+		ref += stride;
+
+		vis_ld64(ref[0],   TMP8);
+
+		vis_ld64_2(ref, 8, TMP10);
+		ref += stride;
+		vis_faligndata(TMP4, TMP6, REF_4);	/* row 1 */
+
+		vis_ld64(ref[0],   TMP12);
+
+		vis_ld64_2(ref, 8, TMP14);
+		ref += stride;
+		vis_faligndata(TMP8, TMP10, REF_S0);	/* row 2 */
+
+		vis_faligndata(TMP12, TMP14, REF_S4);	/* row 3 */
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+
+			vis_ld64(dest[0], DST_0);
+			vis_faligndata(TMP0, TMP2, REF_2);
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_faligndata(TMP4, TMP6, REF_6);
+
+			vis_faligndata(TMP8, TMP10, REF_S2);
+
+			vis_faligndata(TMP12, TMP14, REF_S6);
+		} else {
+			vis_ld64(dest[0], DST_0);
+			vis_src1(TMP2, REF_2);	/* off + 1 == 8: the shifted data is exactly the next word */
+
+			vis_ld64_2(dest, stride, DST_2);
+			vis_src1(TMP6, REF_6);
+
+			vis_src1(TMP10, REF_S2);
+
+			vis_src1(TMP14, REF_S6);
+		}
+
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_pmerge(ZERO,     REF_2,     TMP4);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+		vis_mul8x16au(REF_4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP16, TMP0);
+		vis_mul8x16au(REF_6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_2, CONST_512, TMP16);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(DST_3, CONST_512, TMP18);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);	/* row 0 done */
+		dest += stride;
+		vis_padd16(TMP10, CONST_3, TMP10);
+
+		vis_ld64_2(dest, stride, DST_0);	/* dest now points at row 1, so this fetches dest row 2 */
+		vis_padd16(TMP8, TMP16, TMP8);
+
+		vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);	/* dest row 3; parked in TMP4 because DST_2 is about to receive row 1's result */
+		vis_padd16(TMP10, TMP18, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);	/* row 1 done */
+		dest += stride;
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_pmerge(ZERO,     REF_S0,     TMP0);
+
+		vis_pmerge(ZERO,     REF_S2,     TMP24);
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+
+		vis_padd16(TMP0, CONST_3, TMP0);
+		vis_mul8x16au(REF_S4, CONST_256, TMP8);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
+
+		vis_padd16(TMP0, TMP24, TMP0);
+		vis_mul8x16au(REF_S6, CONST_256, TMP12);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
+
+		vis_padd16(TMP8, CONST_3, TMP8);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+
+		vis_padd16(TMP10, CONST_3, TMP10);
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+		vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
+
+		vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
+		vis_padd16(TMP0, TMP16, TMP0);
+
+		vis_padd16(TMP2, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);	/* row 2 done */
+		dest += stride;
+
+		vis_padd16(TMP8, TMP20, TMP8);
+
+		vis_padd16(TMP10, TMP22, TMP10);
+		vis_pack16(TMP8, DST_2);
+
+		vis_pack16(TMP10, DST_3);
+		vis_st64(DST_2, dest[0]);	/* row 3 done */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/*
+	 * Vertical half-pel "put" for a 16-byte-wide block: every output
+	 * row is built from two vertically adjacent reference rows (y and
+	 * y + 1) and overwrites dest.
+	 *
+	 * dest   - destination rows, written with two 8-byte stores per row
+	 * _ref   - reference pixels; may be unaligned.  height + 1 rows are
+	 *          read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; the loop emits two rows per pass and the
+	 *          epilogue the final two, so height must be even and >= 4
+	 *          (height == 2 would start the counter at 0)
+	 *
+	 * The alignment set up by vis_alignaddr() is never changed in this
+	 * function, so every faligndata uses the same byte offset.
+	 *
+	 * NOTE(review): assuming constants_fe / constants_7f / constants128
+	 * hold what their names suggest, the or/xor/and/mul8x16/psub16
+	 * sequence evaluates (a | b) - (((a ^ b) & 0xfe) >> 1), which equals
+	 * (a + b + 1) >> 1 per byte -- confirm against the constant tables.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;	/* aligned input: the third word is not needed, so re-read ref[0] instead of bytes 16..23 */
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP6);
+	vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0/REF_4 = row 0 */
+
+	vis_ld64_2(ref, 8, TMP8);
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_ld64_2(ref, offset, TMP10);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP6, TMP8, REF_2);	/* REF_2/REF_6 = row 1 */
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP8, TMP10, REF_6);
+
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* two rows per pass; the last pair is handled after the loop */
+	do {	/* 24 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP12);
+
+		vis_ld64_2(ref, 8, TMP2);
+		vis_xor(REF_4, REF_6, TMP16);
+
+		vis_ld64_2(ref, offset, TMP4);
+		ref += stride;
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_ld64(ref[0], TMP6);
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_ld64_2(ref, 8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0/REF_4 advance by two rows */
+
+		vis_ld64_2(ref, offset, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_and(TMP16, MASK_fe, TMP16);
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+
+		vis_mul8x16(CONST_128, TMP16, TMP16);
+		vis_xor(REF_0, REF_2, TMP0);	/* second output row pairs the old REF_2/REF_6 with the new REF_0/REF_4 */
+
+		vis_xor(REF_4, REF_6, TMP2);
+
+		vis_or(REF_0, REF_2, TMP20);
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_and(TMP16, MASK_7f, TMP16);
+
+		vis_psub16(TMP14, TMP12, TMP12);
+		vis_st64(TMP12, dest[0]);
+
+		vis_psub16(TMP18, TMP16, TMP16);
+		vis_st64_2(TMP16, dest, 8);
+		dest += stride;
+
+		vis_or(REF_4, REF_6, TMP18);
+
+		vis_and(TMP0, MASK_fe, TMP0);
+
+		vis_and(TMP2, MASK_fe, TMP2);
+		vis_mul8x16(CONST_128, TMP0, TMP0);
+
+		vis_faligndata(TMP6, TMP8, REF_2);	/* REF_2/REF_6 advance only after their old values were consumed above */
+		vis_mul8x16(CONST_128, TMP2, TMP2);
+
+		vis_faligndata(TMP8, TMP10, REF_6);
+
+		vis_and(TMP0, MASK_7f, TMP0);
+
+		vis_and(TMP2, MASK_7f, TMP2);
+
+		vis_psub16(TMP20, TMP0, TMP0);
+		vis_st64(TMP0, dest[0]);
+
+		vis_psub16(TMP18, TMP2, TMP2);
+		vis_st64_2(TMP2, dest, 8);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* epilogue: only one more reference row is fetched */
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_ld64_2(ref, 8, TMP2);
+	vis_xor(REF_4, REF_6, TMP16);
+
+	vis_ld64_2(ref, offset, TMP4);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_faligndata(TMP2, TMP4, REF_4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_and(TMP16, MASK_fe, TMP16);
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+
+	vis_mul8x16(CONST_128, TMP16, TMP16);
+	vis_xor(REF_0, REF_2, TMP0);
+
+	vis_xor(REF_4, REF_6, TMP2);
+
+	vis_or(REF_0, REF_2, TMP20);
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_and(TMP16, MASK_7f, TMP16);
+
+	vis_psub16(TMP14, TMP12, TMP12);
+	vis_st64(TMP12, dest[0]);
+
+	vis_psub16(TMP18, TMP16, TMP16);
+	vis_st64_2(TMP16, dest, 8);
+	dest += stride;
+
+	vis_or(REF_4, REF_6, TMP18);
+
+	vis_and(TMP0, MASK_fe, TMP0);
+
+	vis_and(TMP2, MASK_fe, TMP2);
+	vis_mul8x16(CONST_128, TMP0, TMP0);
+
+	vis_mul8x16(CONST_128, TMP2, TMP2);
+
+	vis_and(TMP0, MASK_7f, TMP0);
+
+	vis_and(TMP2, MASK_7f, TMP2);
+
+	vis_psub16(TMP20, TMP0, TMP0);
+	vis_st64(TMP0, dest[0]);	/* last row, left half */
+
+	vis_psub16(TMP18, TMP2, TMP2);
+	vis_st64_2(TMP2, dest, 8);	/* last row, right half */
+}
+
+static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/*
+	 * Vertical half-pel "put" for an 8-byte-wide block: every output
+	 * row is built from two vertically adjacent reference rows (y and
+	 * y + 1) and overwrites dest.
+	 *
+	 * dest   - destination rows, written with one 8-byte store per row
+	 * _ref   - reference pixels; may be unaligned.  height + 1 rows are
+	 *          read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; the loop emits two rows per pass and the
+	 *          epilogue the final two, so height must be even and >= 4
+	 *          (height == 2 would start the counter at 0)
+	 *
+	 * NOTE(review): assuming constants_fe / constants_7f / constants128
+	 * hold what their names suggest, the or/xor/and/mul8x16/psub16
+	 * sequence evaluates (a | b) - (((a ^ b) & 0xfe) >> 1), which equals
+	 * (a + b + 1) >> 1 per byte -- confirm against the constant tables.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	int offset;
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;	/* aligned input: the second word is not needed, so re-read ref[0] instead of bytes 8..15 */
+
+	vis_ld64(ref[0], TMP0);
+
+	vis_ld64_2(ref, offset, TMP2);
+	ref += stride;
+
+	vis_ld64(ref[0], TMP4);
+
+	vis_ld64_2(ref, offset, TMP6);
+	ref += stride;
+
+	vis_ld64(constants_fe[0], MASK_fe);
+	vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0 = row 0 */
+
+	vis_ld64(constants_7f[0], MASK_7f);
+	vis_faligndata(TMP4, TMP6, REF_2);	/* REF_2 = row 1 */
+
+	vis_ld64(constants128[0], CONST_128);
+	height = (height >> 1) - 1;	/* two rows per pass; the last pair is handled after the loop */
+	do {	/* 12 cycles */
+		vis_ld64(ref[0], TMP0);
+		vis_xor(REF_0, REF_2, TMP4);
+
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+		vis_and(TMP4, MASK_fe, TMP4);
+
+		vis_or(REF_0, REF_2, TMP6);
+		vis_mul8x16(CONST_128, TMP4, TMP4);
+
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0 advances by two rows; TMP0/TMP2 are free to reload */
+		vis_ld64(ref[0], TMP0);
+
+		vis_ld64_2(ref, offset, TMP2);
+		ref += stride;
+		vis_xor(REF_0, REF_2, TMP12);	/* second output row pairs the old REF_2 with the new REF_0 */
+
+		vis_and(TMP4, MASK_7f, TMP4);
+
+		vis_and(TMP12, MASK_fe, TMP12);
+
+		vis_mul8x16(CONST_128, TMP12, TMP12);
+		vis_or(REF_0, REF_2, TMP14);
+
+		vis_psub16(TMP6, TMP4, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+
+		vis_faligndata(TMP0, TMP2, REF_2);	/* REF_2 advances only after its old value was consumed above */
+
+		vis_and(TMP12, MASK_7f, TMP12);
+
+		vis_psub16(TMP14, TMP12, DST_0);
+		vis_st64(DST_0, dest[0]);
+		dest += stride;
+	} while (--height);
+
+	vis_ld64(ref[0], TMP0);	/* epilogue: only one more reference row is fetched */
+	vis_xor(REF_0, REF_2, TMP4);
+
+	vis_ld64_2(ref, offset, TMP2);
+	vis_and(TMP4, MASK_fe, TMP4);
+
+	vis_or(REF_0, REF_2, TMP6);
+	vis_mul8x16(CONST_128, TMP4, TMP4);
+
+	vis_faligndata(TMP0, TMP2, REF_0);
+
+	vis_xor(REF_0, REF_2, TMP12);
+
+	vis_and(TMP4, MASK_7f, TMP4);
+
+	vis_and(TMP12, MASK_fe, TMP12);
+
+	vis_mul8x16(CONST_128, TMP12, TMP12);
+	vis_or(REF_0, REF_2, TMP14);
+
+	vis_psub16(TMP6, TMP4, DST_0);
+	vis_st64(DST_0, dest[0]);
+	dest += stride;
+
+	vis_and(TMP12, MASK_7f, TMP12);
+
+	vis_psub16(TMP14, TMP12, DST_0);
+	vis_st64(DST_0, dest[0]);	/* last row */
+}
+
+static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/*
+	 * Vertical half-pel "avg" for a 16-byte-wide block: each output row
+	 * combines two vertically adjacent reference rows with the bytes
+	 * already stored in dest, then writes the result back to dest.
+	 *
+	 * dest   - destination rows, read and rewritten 16 bytes at a time
+	 * _ref   - reference pixels; may be unaligned.  height + 1 rows are
+	 *          read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; two rows per loop pass, so it must be even
+	 *          and >= 2
+	 *
+	 * NOTE(review): assuming constants256_512 = {256, 512} and
+	 * constants3 = 3 per lane, and that pack16 with GSR scale 5 amounts
+	 * to a right shift by 2 with saturation, the per-byte result is
+	 * (a + b + 2 * dest + 3) >> 2 -- confirm.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8 = stride + 8;
+	int stride_16;
+	int offset;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 16 : 0;	/* aligned input: the third word is not needed, so re-read the first one instead */
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64_2(ref, offset, TMP4);
+	stride_16 = stride + offset;	/* same substitution for the rows loaded at ref + stride */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);	/* REF_2/REF_6 = row 0 */
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_6);
+	height >>= 1;	/* loop counter in units of two rows */
+
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP12);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP14);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_pmerge(ZERO,       REF_6,     TMP16);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0/REF_4 = next reference row */
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_pmerge(ZERO,     REF_0,     TMP0);
+		vis_mul8x16au(REF_0_1, CONST_256, TMP2);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_pmerge(ZERO,     REF_4,     TMP4);
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+
+		vis_ld64_2(dest, stride, REF_S0/*DST_4*/);	/* second dest row, held in REF_S0/REF_S2 */
+		vis_faligndata(TMP6, TMP8, REF_2);	/* REF_2/REF_6 = reference row after that */
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
+		vis_faligndata(TMP8, TMP10, REF_6);
+		vis_mul8x16al(DST_0,   CONST_512, TMP20);
+
+		vis_padd16(TMP0, CONST_3, TMP0);	/* TMP0..TMP6 = middle row + rounding, shared by both output rows */
+		vis_mul8x16al(DST_1,   CONST_512, TMP22);
+
+		vis_padd16(TMP2, CONST_3, TMP2);
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP4, CONST_3, TMP4);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_padd16(TMP6, CONST_3, TMP6);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+		vis_mul8x16al(REF_S0,   CONST_512, TMP20);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_mul8x16al(REF_S2,   CONST_512, TMP24);
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_2,   CONST_256, TMP28);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP30);
+
+		vis_padd16(TMP16, TMP4, TMP16);
+		vis_mul8x16au(REF_6,   CONST_256, REF_S4);	/* REF_S4/REF_S6 serve as extra temporaries here */
+
+		vis_padd16(TMP18, TMP6, TMP18);
+		vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
+
+		vis_pack16(TMP12, DST_0);
+		vis_padd16(TMP28, TMP0, TMP12);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass, left half */
+		vis_padd16(TMP30, TMP2, TMP14);
+
+		vis_pack16(TMP16, DST_2);
+		vis_padd16(REF_S4, TMP4, TMP16);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* first row of this pass, right half */
+		dest += stride;
+		vis_padd16(REF_S6, TMP6, TMP18);
+
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_padd16(TMP16, TMP24, TMP16);
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);	/* second row of this pass, left half */
+
+		vis_padd16(TMP18, TMP26, TMP18);
+		vis_pack16(TMP16, DST_2);
+
+		vis_pack16(TMP18, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* second row of this pass, right half */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
+			    const int stride, int height)
+{	/*
+	 * Vertical half-pel "avg" for an 8-byte-wide block: each output row
+	 * combines two vertically adjacent reference rows with the bytes
+	 * already stored in dest, then writes the result back to dest.
+	 *
+	 * dest   - destination rows, read and rewritten 8 bytes at a time
+	 * _ref   - reference pixels; may be unaligned.  height + 1 rows are
+	 *          read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; two rows per loop pass, so it must be even
+	 *          and >= 2
+	 *
+	 * NOTE(review): assuming constants256_512 = {256, 512} and
+	 * constants3 = 3 per lane, and that pack16 with GSR scale 5 amounts
+	 * to a right shift by 2 with saturation, the per-byte result is
+	 * (a + b + 2 * dest + 3) >> 2 -- confirm.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	int stride_8;
+	int offset;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);
+	offset = (ref != _ref) ? 8 : 0;	/* aligned input: the second word is not needed, so re-read the first one instead */
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, offset, TMP2);
+	stride_8 = stride + offset;	/* same substitution for the rows loaded at ref + stride */
+
+	vis_ld64(constants3[0], CONST_3);
+	vis_faligndata(TMP0, TMP2, REF_2);	/* REF_2 = row 0 */
+
+	vis_ld64(constants256_512[0], CONST_256);
+
+	height >>= 1;	/* loop counter in units of two rows */
+	do {	/* 20 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_pmerge(ZERO,       REF_2,     TMP8);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+
+		vis_ld64_2(dest, stride, DST_2);	/* both dest rows of this pass are fetched up front */
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0 = next reference row */
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_mul8x16al(DST_0,   CONST_512, TMP16);
+		vis_pmerge(ZERO,       REF_0,     TMP12);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_mul8x16al(DST_1,   CONST_512, TMP18);
+		vis_pmerge(ZERO,       REF_0_1,   TMP14);
+
+		vis_padd16(TMP12, CONST_3, TMP12);	/* TMP12/TMP14 = middle row + rounding, shared by both output rows */
+		vis_mul8x16al(DST_2,   CONST_512, TMP24);
+
+		vis_padd16(TMP14, CONST_3, TMP14);
+		vis_mul8x16al(DST_3,   CONST_512, TMP26);
+
+		vis_faligndata(TMP4, TMP6, REF_2);	/* REF_2 = reference row after that; also the first row of the next pass */
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_mul8x16au(REF_2,   CONST_256, TMP20);
+
+		vis_padd16(TMP8, TMP16, TMP0);
+		vis_mul8x16au(REF_2_1, CONST_256, TMP22);
+
+		vis_padd16(TMP10, TMP18, TMP2);
+		vis_pack16(TMP0, DST_0);
+
+		vis_pack16(TMP2, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass */
+		dest += stride;
+		vis_padd16(TMP12, TMP20, TMP12);
+
+		vis_padd16(TMP14, TMP22, TMP14);
+
+		vis_padd16(TMP12, TMP24, TMP0);
+
+		vis_padd16(TMP14, TMP26, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64(DST_2, dest[0]);	/* second row of this pass */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+			      const int stride, int height)
+{	/*
+	 * Diagonal (horizontal + vertical) half-pel "put" for a
+	 * 16-byte-wide block: every output byte is built from the four
+	 * reference bytes at (x, y), (x + 1, y), (x, y + 1) and
+	 * (x + 1, y + 1), and the result overwrites dest.
+	 *
+	 * dest   - destination rows, written with two 8-byte stores per row
+	 * _ref   - reference pixels; may be unaligned.  Three 8-byte words
+	 *          per row and height + 1 rows are read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; two rows per loop pass, so it must be even
+	 *          and >= 2
+	 *
+	 * NOTE(review): assuming constants256_512 = {256, 512} and
+	 * constants2 = 2 per lane, and that pack16 with GSR scale 5 amounts
+	 * to a right shift by 2 with saturation, the per-byte result is
+	 * (a + b + c + d + 2) >> 2 -- confirm.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants2[0], CONST_2);
+	vis_faligndata(TMP0, TMP2, REF_S0);	/* REF_S0/REF_S4 = row 0 at offset off */
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);	/* REF_S2/REF_S6 = row 0 at offset off + 1 */
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);	/* off + 1 == 8: the shifted data is exactly the next word */
+		vis_src1(TMP4, REF_S6);
+	}
+
+	height >>= 1;	/* loop counter in units of two rows */
+	do {
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);	/* TMP12..TMP26 = widened copy of the row carried in REF_S* */
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);	/* back to offset off for the faligndata calls below */
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0..REF_6 = next reference row */
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_faligndata(TMP6, TMP8, REF_S0);	/* REF_S0..REF_S6 = row after that, carried into the next pass */
+
+		vis_faligndata(TMP8, TMP10, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16au(REF_0, CONST_256, TMP0);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		vis_padd16(TMP0, CONST_2, TMP8);	/* TMP8/TMP10 = middle-row sum + rounding (left half), reused for the second output row */
+		vis_mul8x16au(REF_4, CONST_256, TMP0);
+
+		vis_padd16(TMP2, CONST_2, TMP10);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP2);
+
+		vis_padd16(TMP8, TMP4, TMP8);
+		vis_mul8x16au(REF_6, CONST_256, TMP4);
+
+		vis_padd16(TMP10, TMP6, TMP10);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP8, TMP12);
+
+		vis_padd16(TMP14, TMP10, TMP14);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass, left half */
+		vis_padd16(TMP0, CONST_2, TMP12);	/* TMP12/TMP14 = middle-row sum + rounding (right half) */
+
+		vis_mul8x16au(REF_S0, CONST_256, TMP0);
+		vis_padd16(TMP2, CONST_2, TMP14);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
+		vis_padd16(TMP12, TMP4, TMP12);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP4);
+		vis_padd16(TMP14, TMP6, TMP14);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
+		vis_padd16(TMP20, TMP12, TMP20);
+
+		vis_padd16(TMP22, TMP14, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* first row of this pass, right half */
+		dest += stride;
+		vis_padd16(TMP0, TMP4, TMP24);
+
+		vis_mul8x16au(REF_S4, CONST_256, TMP0);
+		vis_padd16(TMP2, TMP6, TMP26);
+
+		vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
+		vis_padd16(TMP24, TMP8, TMP24);
+
+		vis_padd16(TMP26, TMP10, TMP26);
+		vis_pack16(TMP24, DST_0);
+
+		vis_pack16(TMP26, DST_1);
+		vis_st64(DST_0, dest[0]);	/* second row of this pass, left half */
+		vis_pmerge(ZERO, REF_S6, TMP4);
+
+		vis_pmerge(ZERO,      REF_S6_1,  TMP6);
+
+		vis_padd16(TMP0, TMP4, TMP0);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+
+		vis_padd16(TMP0, TMP12, TMP0);
+
+		vis_padd16(TMP2, TMP14, TMP2);
+		vis_pack16(TMP0, DST_2);
+
+		vis_pack16(TMP2, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* second row of this pass, right half */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/*
+	 * Diagonal (horizontal + vertical) half-pel "put" for an
+	 * 8-byte-wide block: every output byte is built from the four
+	 * reference bytes at (x, y), (x + 1, y), (x, y + 1) and
+	 * (x + 1, y + 1), and the result overwrites dest.
+	 *
+	 * dest   - destination rows, written with one 8-byte store per row
+	 * _ref   - reference pixels; may be unaligned.  Two 8-byte words
+	 *          per row and height + 1 rows are read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; two rows per loop pass, so it must be even
+	 *          and >= 2
+	 *
+	 * NOTE(review): assuming constants256_512 = {256, 512} and
+	 * constants2 = 2 per lane, and that pack16 with GSR scale 5 amounts
+	 * to a right shift by 2 with saturation, the per-byte result is
+	 * (a + b + c + d + 2) >> 2 -- confirm.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+	int stride_8 = stride + 8;
+
+	vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(constants2[0], CONST_2);
+
+	vis_ld64(constants256_512[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);	/* REF_S0 = row 0 at offset off */
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);	/* REF_S2 = row 0 at offset off + 1 */
+	} else {
+		vis_src1(TMP2, REF_S2);	/* off + 1 == 8: the shifted data is exactly the next word */
+	}
+
+	height >>= 1;	/* loop counter in units of two rows */
+	do {	/* 26 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0,   CONST_256, TMP8);	/* TMP8..TMP14 = widened copy of the row carried in REF_S0/REF_S2 */
+		vis_pmerge(ZERO,        REF_S2,    TMP12);
+
+		vis_alignaddr_g0((void *)off);	/* back to offset off for the faligndata calls below */
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
+		vis_pmerge(ZERO,        REF_S2_1,  TMP14);
+
+		vis_ld64_2(ref, stride, TMP4);
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+		vis_faligndata(TMP0, TMP2, REF_S4);	/* REF_S4 = next reference row */
+
+		vis_pmerge(ZERO, REF_S4, TMP18);
+
+		vis_pmerge(ZERO, REF_S4_1, TMP20);
+
+		vis_faligndata(TMP4, TMP6, REF_S0);	/* REF_S0 = row after that, carried into the next pass */
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+
+		vis_padd16(TMP18, CONST_2, TMP18);	/* TMP18/TMP20 = middle-row sum + rounding, shared by both output rows */
+		vis_mul8x16au(REF_S6,   CONST_256, TMP22);
+
+		vis_padd16(TMP20, CONST_2, TMP20);
+		vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
+
+		vis_mul8x16au(REF_S0,   CONST_256, TMP26);
+		vis_pmerge(ZERO, REF_S0_1, TMP28);
+
+		vis_mul8x16au(REF_S2,   CONST_256, TMP30);
+		vis_padd16(TMP18, TMP22, TMP18);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP8,  TMP18, TMP8);
+
+		vis_padd16(TMP10, TMP20, TMP10);
+
+		vis_padd16(TMP8,  TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+		vis_pack16(TMP8,  DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass */
+		dest += stride;
+		vis_padd16(TMP18, TMP26, TMP18);
+
+		vis_padd16(TMP20, TMP28, TMP20);
+
+		vis_padd16(TMP18, TMP30, TMP18);
+
+		vis_padd16(TMP20, TMP32, TMP20);
+		vis_pack16(TMP18, DST_2);
+
+		vis_pack16(TMP20, DST_3);
+		vis_st64(DST_2, dest[0]);	/* second row of this pass */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
+			      const int stride, int height)
+{	/*
+	 * Diagonal (horizontal + vertical) half-pel "avg" for a
+	 * 16-byte-wide block: every output byte combines the four
+	 * reference bytes at (x, y), (x + 1, y), (x, y + 1) and
+	 * (x + 1, y + 1) with the byte already stored in dest.
+	 *
+	 * dest   - destination rows, read and rewritten 16 bytes at a time
+	 * _ref   - reference pixels; may be unaligned.  Three 8-byte words
+	 *          per row and height + 1 rows are read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; two rows per loop pass, so it must be even
+	 *          and >= 2
+	 *
+	 * NOTE(review): assuming constants256_1024 = {256, 1024} and
+	 * constants6 = 6 per lane, and that pack16 with GSR scale 4 amounts
+	 * to a right shift by 3 with saturation, the per-byte result is
+	 * (a + b + c + d + 4 * dest + 6) >> 3, i.e. a single-step form of
+	 * the two-stage average -- confirm against the constant tables.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+	int stride_8 = stride + 8;
+	int stride_16 = stride + 16;
+
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+
+	vis_ld64(ref[ 0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64(ref[ 8], TMP2);
+
+	vis_ld64(ref[16], TMP4);
+
+	vis_ld64(constants6[0], CONST_6);
+	vis_faligndata(TMP0, TMP2, REF_S0);	/* REF_S0/REF_S4 = row 0 at offset off */
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP2, TMP4, REF_S4);
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);	/* REF_S2/REF_S6 = row 0 at offset off + 1 */
+		vis_faligndata(TMP2, TMP4, REF_S6);
+	} else {
+		vis_src1(TMP2, REF_S2);	/* off + 1 == 8: the shifted data is exactly the next word */
+		vis_src1(TMP4, REF_S6);
+	}
+
+	height >>= 1;	/* loop counter in units of two rows */
+	do {	/* 55 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP12);	/* TMP12..TMP26 = widened copy of the row carried in REF_S* */
+		vis_pmerge(ZERO,      REF_S0_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);	/* back to offset off for the faligndata calls below */
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		vis_mul8x16au(REF_S2, CONST_256, TMP16);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP18);
+
+		vis_ld64_2(ref, stride_16, TMP4);
+		ref += stride;
+		vis_mul8x16au(REF_S4, CONST_256, TMP20);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP22);
+
+		vis_ld64_2(ref, stride, TMP6);
+		vis_mul8x16au(REF_S6, CONST_256, TMP24);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP26);
+
+		vis_ld64_2(ref, stride_8, TMP8);
+		vis_faligndata(TMP0, TMP2, REF_0);	/* REF_0..REF_6 = next reference row */
+
+		vis_ld64_2(ref, stride_16, TMP10);
+		ref += stride;
+		vis_faligndata(TMP2, TMP4, REF_4);
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP6, TMP8, REF_S0);	/* REF_S0..REF_S6 = row after that, carried into the next pass */
+
+		vis_ld64_2(dest, 8, DST_2);
+		vis_faligndata(TMP8, TMP10, REF_S4);
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_2);
+			vis_faligndata(TMP2, TMP4, REF_6);
+			vis_faligndata(TMP6, TMP8, REF_S2);
+			vis_faligndata(TMP8, TMP10, REF_S6);
+		} else {
+			vis_src1(TMP2, REF_2);
+			vis_src1(TMP4, REF_6);
+			vis_src1(TMP8, REF_S2);
+			vis_src1(TMP10, REF_S6);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_0, TMP0);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_0_1,  TMP2);
+
+		vis_mul8x16au(REF_2, CONST_256, TMP4);
+		vis_pmerge(ZERO,      REF_2_1,  TMP6);
+
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);	/* REF_0/REF_2 were fully consumed above and now serve as scratch */
+		vis_padd16(TMP0, CONST_6, TMP0);
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP2, CONST_6, TMP2);
+
+		vis_padd16(TMP0, TMP4, TMP0);	/* TMP0/TMP2 = middle-row sum + rounding (left half), reused for the second output row */
+		vis_mul8x16au(REF_4, CONST_256, TMP4);
+
+		vis_padd16(TMP2, TMP6, TMP2);
+		vis_mul8x16au(REF_4_1, CONST_256, TMP6);
+
+		vis_padd16(TMP12, TMP0, TMP12);
+		vis_mul8x16au(REF_6, CONST_256, TMP8);
+
+		vis_padd16(TMP14, TMP2, TMP14);
+		vis_mul8x16au(REF_6_1, CONST_256, TMP10);
+
+		vis_padd16(TMP12, TMP16, TMP12);
+		vis_mul8x16au(REF_S0, CONST_256, REF_4);	/* REF_4/REF_6 likewise become scratch once read */
+
+		vis_padd16(TMP14, TMP18, TMP14);
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
+
+		vis_padd16(TMP12, TMP30, TMP12);
+
+		vis_padd16(TMP14, TMP32, TMP14);
+		vis_pack16(TMP12, DST_0);
+
+		vis_pack16(TMP14, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass, left half */
+		vis_padd16(TMP4, CONST_6, TMP4);
+
+		vis_ld64_2(dest, stride, DST_0);	/* fetch the second dest row (left half) */
+		vis_padd16(TMP6, CONST_6, TMP6);
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+
+		vis_padd16(TMP4, TMP8, TMP4);	/* TMP4/TMP6 = middle-row sum + rounding (right half) */
+		vis_mul8x16au(REF_S2_1, CONST_256,  TMP14);
+
+		vis_padd16(TMP6, TMP10, TMP6);
+
+		vis_padd16(TMP20, TMP4, TMP20);
+
+		vis_padd16(TMP22, TMP6, TMP22);
+
+		vis_padd16(TMP20, TMP24, TMP20);
+
+		vis_padd16(TMP22, TMP26, TMP22);
+
+		vis_padd16(TMP20, REF_0, TMP20);
+		vis_mul8x16au(REF_S4, CONST_256, REF_0);
+
+		vis_padd16(TMP22, REF_2, TMP22);
+		vis_pack16(TMP20, DST_2);
+
+		vis_pack16(TMP22, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* first row of this pass, right half */
+		dest += stride;
+
+		vis_ld64_2(dest, 8, DST_2);	/* fetch the second dest row (right half) */
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO,      REF_S4_1,  REF_2);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_padd16(REF_4, TMP0, TMP8);
+
+		vis_mul8x16au(REF_S6, CONST_256, REF_4);
+		vis_padd16(REF_6, TMP2, TMP10);
+
+		vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);	/* second row of this pass, left half */
+
+		vis_padd16(REF_0, TMP4, REF_0);
+
+		vis_mul8x16al(DST_2,   CONST_1024, TMP30);
+		vis_padd16(REF_2, TMP6, REF_2);
+
+		vis_mul8x16al(DST_3,   CONST_1024, TMP32);
+		vis_padd16(REF_0, REF_4, REF_0);
+
+		vis_padd16(REF_2, REF_6, REF_2);
+
+		vis_padd16(REF_0, TMP30, REF_0);
+
+		/* stall */
+
+		vis_padd16(REF_2, TMP32, REF_2);
+		vis_pack16(REF_0, DST_2);
+
+		vis_pack16(REF_2, DST_3);
+		vis_st64_2(DST_2, dest, 8);	/* second row of this pass, right half */
+		dest += stride;
+	} while (--height);
+}
+
+static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
+			     const int stride, int height)
+{	/*
+	 * Diagonal (horizontal + vertical) half-pel "avg" for an
+	 * 8-byte-wide block: every output byte combines the four reference
+	 * bytes at (x, y), (x + 1, y), (x, y + 1) and (x + 1, y + 1) with
+	 * the byte already stored in dest.
+	 *
+	 * dest   - destination rows, read and rewritten 8 bytes at a time
+	 * _ref   - reference pixels; may be unaligned.  Two 8-byte words
+	 *          per row and height + 1 rows are read in total
+	 * stride - byte distance between rows, applied to both ref and dest
+	 * height - row count; two rows per loop pass, so it must be even
+	 *          and >= 2
+	 *
+	 * NOTE(review): assuming constants256_1024 = {256, 1024} and
+	 * constants6 = 6 per lane, and that pack16 with GSR scale 4 amounts
+	 * to a right shift by 3 with saturation, the per-byte result is
+	 * (a + b + c + d + 4 * dest + 6) >> 3 -- confirm against the
+	 * constant tables.
+	 */
+	uint8_t *ref = (uint8_t *) _ref;
+	unsigned long off = (unsigned long) ref & 0x7;	/* byte offset of ref inside its 8-byte word */
+	unsigned long off_plus_1 = off + 1;	/* alignment used to extract the x + 1 neighbours */
+	int stride_8 = stride + 8;
+
+	vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
+
+	ref = vis_alignaddr(ref);	/* continue with the pointer returned by vis_alignaddr() */
+
+	vis_ld64(ref[0], TMP0);
+	vis_fzero(ZERO);
+
+	vis_ld64_2(ref, 8, TMP2);
+
+	vis_ld64(constants6[0], CONST_6);
+
+	vis_ld64(constants256_1024[0], CONST_256);
+	vis_faligndata(TMP0, TMP2, REF_S0);	/* REF_S0 = row 0 at offset off */
+
+	if (off != 0x7) {
+		vis_alignaddr_g0((void *)off_plus_1);
+		vis_faligndata(TMP0, TMP2, REF_S2);	/* REF_S2 = row 0 at offset off + 1 */
+	} else {
+		vis_src1(TMP2, REF_S2);	/* off + 1 == 8: the shifted data is exactly the next word */
+	}
+
+	height >>= 1;	/* loop counter in units of two rows */
+	do {	/* 31 cycles */
+		vis_ld64_2(ref, stride, TMP0);
+		vis_mul8x16au(REF_S0, CONST_256, TMP8);	/* TMP8..TMP14 = widened copy of the row carried in REF_S0/REF_S2 */
+		vis_pmerge(ZERO,      REF_S0_1,  TMP10);
+
+		vis_ld64_2(ref, stride_8, TMP2);
+		ref += stride;
+		vis_mul8x16au(REF_S2, CONST_256, TMP12);
+		vis_pmerge(ZERO,      REF_S2_1,  TMP14);
+
+		vis_alignaddr_g0((void *)off);	/* back to offset off for the faligndata calls below */
+
+		vis_ld64_2(ref, stride, TMP4);
+		vis_faligndata(TMP0, TMP2, REF_S4);	/* REF_S4 = next reference row */
+
+		vis_ld64_2(ref, stride_8, TMP6);
+		ref += stride;
+
+		vis_ld64(dest[0], DST_0);
+		vis_faligndata(TMP4, TMP6, REF_S0);	/* REF_S0 = row after that, carried into the next pass */
+
+		vis_ld64_2(dest, stride, DST_2);	/* both dest rows of this pass are fetched up front */
+
+		if (off != 0x7) {
+			vis_alignaddr_g0((void *)off_plus_1);
+			vis_faligndata(TMP0, TMP2, REF_S6);
+			vis_faligndata(TMP4, TMP6, REF_S2);
+		} else {
+			vis_src1(TMP2, REF_S6);
+			vis_src1(TMP6, REF_S2);
+		}
+
+		vis_mul8x16al(DST_0,   CONST_1024, TMP30);
+		vis_pmerge(ZERO, REF_S4, TMP22);
+
+		vis_mul8x16al(DST_1,   CONST_1024, TMP32);
+		vis_pmerge(ZERO,      REF_S4_1,  TMP24);
+
+		vis_mul8x16au(REF_S6, CONST_256, TMP26);
+		vis_pmerge(ZERO,      REF_S6_1,  TMP28);
+
+		vis_mul8x16au(REF_S0, CONST_256, REF_S4);	/* REF_S4/REF_S6 were consumed above and now serve as scratch */
+		vis_padd16(TMP22, CONST_6, TMP22);
+
+		vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
+		vis_padd16(TMP24, CONST_6, TMP24);
+
+		vis_mul8x16al(DST_2,   CONST_1024, REF_0);	/* REF_0/REF_2 hold the widened second dest row */
+		vis_padd16(TMP22, TMP26, TMP22);	/* TMP22/TMP24 = middle-row sum + rounding, shared by both output rows */
+
+		vis_mul8x16al(DST_3,   CONST_1024, REF_2);
+		vis_padd16(TMP24, TMP28, TMP24);
+
+		vis_mul8x16au(REF_S2, CONST_256, TMP26);
+		vis_padd16(TMP8, TMP22, TMP8);
+
+		vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
+		vis_padd16(TMP10, TMP24, TMP10);
+
+		vis_padd16(TMP8, TMP12, TMP8);
+
+		vis_padd16(TMP10, TMP14, TMP10);
+
+		vis_padd16(TMP8, TMP30, TMP8);
+
+		vis_padd16(TMP10, TMP32, TMP10);
+		vis_pack16(TMP8, DST_0);
+
+		vis_pack16(TMP10, DST_1);
+		vis_st64(DST_0, dest[0]);	/* first row of this pass */
+		dest += stride;
+
+		vis_padd16(REF_S4, TMP22, TMP12);
+
+		vis_padd16(REF_S6, TMP24, TMP14);
+
+		vis_padd16(TMP12, TMP26, TMP12);
+
+		vis_padd16(TMP14, TMP28, TMP14);
+
+		vis_padd16(TMP12, REF_0, TMP12);
+
+		vis_padd16(TMP14, REF_2, TMP14);
+		vis_pack16(TMP12, DST_2);
+
+		vis_pack16(TMP14, DST_3);
+		vis_st64(DST_2, dest[0]);	/* second row of this pass */
+		dest += stride;
+	} while (--height);
+}
+
+MPEG2_MC_EXTERN(vis);	/* NOTE(review): macro defined outside this file; presumably emits the exported function table that references the MC_*_vis routines above -- confirm */
+
+#endif  /* defined(ARCH_SPARC) && defined(ENABLE_VIS) */
diff --git a/src/video_dec/libmpeg2new/libmpeg2/mpeg2_internal.h b/src/video_dec/libmpeg2new/libmpeg2/mpeg2_internal.h
new file mode 100644
index 000000000..fec7d4744
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/mpeg2_internal.h
@@ -0,0 +1,302 @@
+/*
+ * mpeg2_internal.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define STATE_INTERNAL_NORETURN ((mpeg2_state_t)-1)	/* -1 as mpeg2_state_t */
+
+/* macroblock modes; every value in this group is a distinct single bit */
+#define MACROBLOCK_INTRA 1
+#define MACROBLOCK_PATTERN 2
+#define MACROBLOCK_MOTION_BACKWARD 4
+#define MACROBLOCK_MOTION_FORWARD 8
+#define MACROBLOCK_QUANT 16
+#define DCT_TYPE_INTERLACED 32
+/* motion_type */
+#define MOTION_TYPE_SHIFT 6
+#define MC_FIELD 1
+#define MC_FRAME 2
+#define MC_16X8 2	/* same numeric value as MC_FRAME */
+#define MC_DMV 3
+
+/* picture structure */
+#define TOP_FIELD 1
+#define BOTTOM_FIELD 2
+#define FRAME_PICTURE 3	/* equals TOP_FIELD | BOTTOM_FIELD */
+
+/* picture coding type */
+#define I_TYPE 1
+#define P_TYPE 2
+#define B_TYPE 3
+#define D_TYPE 4
+
+typedef void mpeg2_mc_fct (uint8_t *, const uint8_t *, int, int); /* entry type of mpeg2_mc_t */
+
+typedef struct {
+    uint8_t * ref[2][3];
+    uint8_t ** ref2[2];
+    int pmv[2][2];
+    int f_code[2];
+} motion_t;	/* embedded in mpeg2_decoder_s as b_motion and f_motion */
+
+typedef void motion_parser_t (mpeg2_decoder_t * decoder,
+			      motion_t * motion,
+			      mpeg2_mc_fct * const * table); /* type behind motion_parser[] */
+
+struct mpeg2_decoder_s {	/* slice-local state first, then sequence/picture settings */
+    /* first, state that carries information from one macroblock to the */
+    /* next inside a slice, and is never used outside of mpeg2_slice() */
+
+    /* bit parsing state */
+    uint32_t bitstream_buf;		/* current 32 bit working set */
+    int bitstream_bits;			/* used bits in working set */
+    const uint8_t * bitstream_ptr;	/* buffer with stream data */
+
+    uint8_t * dest[3];	/* presumably one pointer per Y/U/V plane - confirm */
+
+    int offset;
+    int stride;
+    int uv_stride;
+    int slice_stride;
+    int slice_uv_stride;
+    int stride_frame;
+    unsigned int limit_x;
+    unsigned int limit_y_16;
+    unsigned int limit_y_8;
+    unsigned int limit_y;
+
+    /* Motion vectors */
+    /* The f_ and b_ correspond to the forward and backward motion */
+    /* predictors */
+    motion_t b_motion;
+    motion_t f_motion;
+    motion_parser_t * motion_parser[5];	/* pointers to motion_parser_t functions */
+
+    /* predictor for DC coefficients in intra blocks */
+    int16_t dc_dct_pred[3];
+
+    /* DCT coefficients */
+    int16_t DCTblock[64] ATTR_ALIGN(64);
+
+    uint8_t * picture_dest[3];
+    void (* convert) (void * convert_id, uint8_t * const * src,
+		      unsigned int v_offset);
+    void * convert_id;	/* presumably handed to convert() as its first argument - confirm */
+
+    int dmv_offset;
+    unsigned int v_offset;
+
+    /* now non-slice-specific information */
+
+    /* sequence header stuff */
+    uint16_t * quantizer_matrix[4];
+    uint16_t (* chroma_quantizer[2])[64];
+    uint16_t quantizer_prescale[4][32][64];
+
+    /* The width and height of the picture snapped to macroblock units */
+    int width;
+    int height;
+    int vertical_position_extension;
+    int chroma_format;
+
+    /* picture header stuff */
+
+    /* what type of picture this is (I, P, B, D); cf. I_TYPE .. D_TYPE */
+    int coding_type;
+
+    /* picture coding extension stuff */
+
+    /* quantization factor for intra dc coefficients */
+    int intra_dc_precision;
+    /* top/bottom/both fields; presumably TOP_FIELD, BOTTOM_FIELD or FRAME_PICTURE */
+    int picture_structure;
+    /* bool to indicate all predictions are frame based */
+    int frame_pred_frame_dct;
+    /* bool to indicate whether intra blocks have motion vectors */
+    /* (for concealment) */
+    int concealment_motion_vectors;
+    /* bool to use different vlc tables */
+    int intra_vlc_format;
+    /* used for DMV MC */
+    int top_field_first;
+
+    /* stuff derived from bitstream */
+
+    /* pointer to the zigzag scan we're supposed to be using */
+    const uint8_t * scan;
+
+    int second_field;
+
+    int mpeg1;
+};
+
+typedef struct {	/* wrapper around a single mpeg2_fbuf_t */
+    mpeg2_fbuf_t fbuf;
+} fbuf_alloc_t;	/* mpeg2dec_s keeps three of these in fbuf_alloc[] */
+
+struct mpeg2dec_s {	/* embeds an mpeg2_decoder_t plus info, chunk, tag and fbuf fields */
+    mpeg2_decoder_t decoder;
+
+    mpeg2_info_t info;
+
+    uint32_t shift;
+    int is_display_initialized;
+    mpeg2_state_t (* action) (struct mpeg2dec_s * mpeg2dec);
+    mpeg2_state_t state;
+    uint32_t ext_state;
+
+    /* allocated in init - gcc has problems allocating such big structures */
+    uint8_t * chunk_buffer;
+    /* pointer to start of the current chunk */
+    uint8_t * chunk_start;
+    /* pointer to current position in chunk_buffer */
+    uint8_t * chunk_ptr;
+    /* last start code ? */
+    uint8_t code;
+
+    /* picture tags */
+    uint32_t tag_current, tag2_current, tag_previous, tag2_previous;
+    int num_tags;
+    int bytes_since_tag;
+
+    int first;
+    int alloc_index_user;
+    int alloc_index;
+    uint8_t first_decode_slice;
+    uint8_t nb_decode_slices;
+
+    unsigned int user_data_len;
+
+    mpeg2_sequence_t new_sequence;
+    mpeg2_sequence_t sequence;
+    mpeg2_gop_t new_gop;
+    mpeg2_gop_t gop;
+    mpeg2_picture_t new_picture;
+    mpeg2_picture_t pictures[4];
+    mpeg2_picture_t * picture;
+    /*const*/ mpeg2_fbuf_t * fbuf[3];	/* 0: current fbuf, 1-2: prediction fbufs */
+
+    fbuf_alloc_t fbuf_alloc[3];
+    int custom_fbuf;
+
+    uint8_t * yuv_buf[3][3];
+    int yuv_index;
+    mpeg2_convert_t * convert;
+    void * convert_arg;
+    unsigned int convert_id_size;
+    int convert_stride;
+    void (* convert_start) (void * id, const mpeg2_fbuf_t * fbuf,
+			    const mpeg2_picture_t * picture,
+			    const mpeg2_gop_t * gop);	/* same parameter list as rgb.c's rgb_start */
+
+    uint8_t * buf_start;
+    uint8_t * buf_end;
+
+    int16_t display_offset_x, display_offset_y;
+
+    int copy_matrix;
+    int8_t q_scale_type, scaled[4];
+    uint8_t quantizer_matrix[4][64];	/* 8-bit entries; the decoder struct's namesake holds uint16_t pointers */
+    uint8_t new_quantizer_matrix[4][64];
+};
+
+typedef struct {
+#ifdef ARCH_PPC
+    uint8_t regv[12*16];	/* 192 bytes, present only when ARCH_PPC is defined */
+#endif
+    int dummy;	/* always present, so the struct is never empty */
+} cpu_state_t;
+
+/* cpu_accel.c */
+uint32_t mpeg2_detect_accel (uint32_t accel);
+
+/* cpu_state.c */
+void mpeg2_cpu_state_init (uint32_t accel);
+
+/* decode.c */
+mpeg2_state_t mpeg2_seek_header (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec);
+
+/* header.c - every function here takes the mpeg2dec_t except mpeg2_reset_info */
+void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec);
+void mpeg2_reset_info (mpeg2_info_t * info);
+int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_gop (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_picture (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_extension (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_gop_finalize (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_picture_finalize (mpeg2dec_t * mpeg2dec, uint32_t accels);
+mpeg2_state_t mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_header_end (mpeg2dec_t * mpeg2dec);
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int b_type);
+
+/* idct.c */
+void mpeg2_idct_init (uint32_t accel);
+
+/* idct_mmx.c - the *_add_* variants take an extra leading 'last' argument */
+void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmxext (int last, int16_t * block,
+			    uint8_t * dest, int stride);
+void mpeg2_idct_copy_mmx (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmx (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+void mpeg2_idct_mmx_init (void);
+
+/* idct_altivec.c - same copy/add signature pair as the MMX variants */
+void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_altivec (int last, int16_t * block,
+			     uint8_t * dest, int stride);
+void mpeg2_idct_altivec_init (void);
+
+/* idct_alpha.c - same copy/add signature pair, in mvi and plain alpha flavours */
+void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mvi (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_alpha (int last, int16_t * block,
+			   uint8_t * dest, int stride);
+void mpeg2_idct_alpha_init (void);
+
+/* motion_comp.c */
+void mpeg2_mc_init (uint32_t accel);
+
+typedef struct {	/* MPEG2_MC_EXTERN fills both arrays in the order */
+    mpeg2_mc_fct * put [8];	/* o_16, x_16, y_16, xy_16, o_8, x_8, y_8, xy_8 */
+    mpeg2_mc_fct * avg [8];	/* same ordering, MC_avg_* functions */
+} mpeg2_mc_t;
+
+#define MPEG2_MC_EXTERN(x) mpeg2_mc_t mpeg2_mc_##x = {			  \
+    {MC_put_o_16_##x, MC_put_x_16_##x, MC_put_y_16_##x, MC_put_xy_16_##x, \
+     MC_put_o_8_##x,  MC_put_x_8_##x,  MC_put_y_8_##x,  MC_put_xy_8_##x}, \
+    {MC_avg_o_16_##x, MC_avg_x_16_##x, MC_avg_y_16_##x, MC_avg_xy_16_##x, \
+     MC_avg_o_8_##x,  MC_avg_x_8_##x,  MC_avg_y_8_##x,  MC_avg_xy_8_##x}  \
+}; /* expands to a complete definition of mpeg2_mc_<x>, terminating ';' included */
+
+extern mpeg2_mc_t mpeg2_mc_c;	/* names follow the mpeg2_mc_##x pattern of MPEG2_MC_EXTERN */
+extern mpeg2_mc_t mpeg2_mc_mmx;
+extern mpeg2_mc_t mpeg2_mc_mmxext;
+extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_altivec;
+extern mpeg2_mc_t mpeg2_mc_alpha;
+extern mpeg2_mc_t mpeg2_mc_vis;
diff --git a/src/video_dec/libmpeg2new/libmpeg2/rgb.c b/src/video_dec/libmpeg2new/libmpeg2/rgb.c
new file mode 100644
index 000000000..e4abcacc2
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/rgb.c
@@ -0,0 +1,598 @@
+/*
+ * rgb.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+#include <xine/attributes.h>
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+#include "convert_internal.h"
+
+static const int matrix_coefficients = 6; /* Inverse_Table_6_9 row read by rgb_c_init (6: SMPTE 170M); never written */
+
+static const int Inverse_Table_6_9[8][4] = {	/* columns feed crv, cbu, -cgu, -cgv in rgb_c_init */
+    {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
+    {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
+    {104597, 132201, 25675, 53279}, /* unspecified */
+    {104597, 132201, 25675, 53279}, /* reserved */
+    {104448, 132798, 24759, 53109}, /* FCC */
+    {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
+    {104597, 132201, 25675, 53279}, /* SMPTE 170M */
+    {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
+};
+
+static const uint8_t dither[] ATTR_ALIGN(32) = {	/* read by DSTDITHER through pd = dither + 2 * dithpos */
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+     3,  9, 27, 63,  1,  4, 25, 59,  5, 12, 28, 67,  3,  7, 26, 62,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+    19, 45, 11, 27, 17, 41,  9, 22, 21, 49, 13, 30, 19, 44, 11, 26,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+     0,  2, 24, 57,  6, 15, 30, 70,  0,  1, 23, 55,  6, 14, 29, 69,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+    16, 38,  8, 20, 22, 52, 14, 34, 16, 37,  8, 19, 21, 51, 14, 33,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+     4, 11, 28, 66,  2,  6, 26, 61,  4, 10, 27, 65,  2,  5, 25, 60,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+    20, 47, 12, 29, 18, 43, 10, 25, 20, 46, 12, 28, 18, 42, 10, 23,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+     0,  0, 23, 54,  5, 13, 29, 68,  1,  3, 24, 58,  7, 17, 30, 71,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35,
+    15, 36,  7, 18, 21, 50, 13, 31, 17, 39,  9, 21, 22, 53, 15, 35
+};
+
+static const uint8_t dither_temporal[64] = {	/* rgb_start indexes this with temporal_reference & 63 */
+    0x00, 0x20, 0x21, 0x01, 0x40, 0x60, 0x61, 0x41,
+    0x42, 0x62, 0x63, 0x43, 0x02, 0x22, 0x23, 0x03,
+    0x80, 0xa0, 0xa1, 0x81, 0xc0, 0xe0, 0xe1, 0xc1,
+    0xc2, 0xe2, 0xe3, 0xc3, 0x82, 0xa2, 0xa3, 0x83,
+    0x84, 0xa4, 0xa5, 0x85, 0xc4, 0xe4, 0xe5, 0xc5,
+    0xc6, 0xe6, 0xe7, 0xc7, 0x86, 0xa6, 0xa7, 0x87,
+    0x04, 0x24, 0x25, 0x05, 0x44, 0x64, 0x65, 0x45,
+    0x46, 0x66, 0x67, 0x47, 0x06, 0x26, 0x27, 0x07
+};
+
+typedef struct {	/* rgb_c_init places the lookup tables right behind this struct (id + 1) */
+    convert_rgb_t base;
+    void * table_rV[256];	/* per-V pointer into the red table */
+    void * table_gU[256];	/* per-U pointer into the green table */
+    int table_gV[256];		/* per-V byte offset that RGB() adds to table_gU[U] */
+    void * table_bU[256];	/* per-U pointer into the blue table */
+} convert_rgb_c_t;
+
+#define RGB(type,i)	/* needs U, V, r, g, b, pu, pv, id in the expanding scope */	\
+    U = pu[i];								\
+    V = pv[i];								\
+    r = (type *) id->table_rV[V];					\
+    g = (type *) (((uint8_t *)id->table_gU[U]) + id->table_gV[V]);	\
+    b = (type *) id->table_bU[U];	/* r, g, b are then indexed by luma in the DST* macros */
+
+#define DST(py,dst,i,j)			\
+    Y = py[i];				\
+    dst[i] = r[Y] + g[Y] + b[Y];	/* one packed pixel: sum of the three lookups; j unused */
+
+#define DSTRGB(py,dst,i,j)					\
+    Y = py[i];							\
+    dst[3*i] = r[Y]; dst[3*i+1] = g[Y]; dst[3*i+2] = b[Y];	/* 3 elements per pixel, r first */
+
+#define DSTBGR(py,dst,i,j)					\
+    Y = py[i];							\
+    dst[3*i] = b[Y]; dst[3*i+1] = g[Y]; dst[3*i+2] = r[Y];	/* 3 elements per pixel, b first */
+
+#define DSTDITHER(py,dst,i,j)						  \
+    Y = py[i];								  \
+    dst[i] = r[Y+pd[2*i+96*j]] + g[Y-pd[2*i+96*j]] + b[Y+pd[2*i+1+96*j]];	/* needs pd; j selects a +96 offset */
+
+#define DO(x) x	/* keeps its argument */
+#define SKIP(x)	/* discards its argument */
+
+#define DECLARE_420(func,type,num,DST,DITHER)				\
+static void func (void * _id, uint8_t * const * src,			\
+		  unsigned int v_offset)				\
+{	/* 8 passes, each writing two output rows (dst_1, dst_2) */	\
+    const convert_rgb_c_t * const id = (convert_rgb_c_t *) _id;		\
+    type * dst_1;							\
+    const uint8_t * py_1, * pu, * pv;					\
+    int i;								\
+    DITHER(uint8_t dithpos = id->base.dither_offset;)			\
+									\
+    dst_1 = (type *)(id->base.rgb_ptr + id->base.rgb_slice * v_offset);	\
+    py_1 = src[0];	pu = src[1];	pv = src[2];			\
+									\
+    i = 8;								\
+    do {								\
+	const uint8_t * py_2;						\
+	int j, U, V, Y;							\
+	const type * r, * g, * b;					\
+	type * dst_2;							\
+	DITHER(const uint8_t * const pd = dither + 2 * dithpos;)	\
+									\
+	dst_2 = (type *)((char *)dst_1 + id->base.rgb_stride);		\
+	py_2 = py_1 + id->base.y_stride;				\
+	j = id->base.width;	/* one inner pass covers 8 pixels */	\
+	do {								\
+	    RGB (type, 0)						\
+	    DST (py_1, dst_1, 0, 0)					\
+	    DST (py_1, dst_1, 1, 0)					\
+	    DST (py_2, dst_2, 0, 1)					\
+	    DST (py_2, dst_2, 1, 1)					\
+									\
+	    RGB (type, 1)						\
+	    DST (py_2, dst_2, 2, 1)					\
+	    DST (py_2, dst_2, 3, 1)					\
+	    DST (py_1, dst_1, 2, 0)					\
+	    DST (py_1, dst_1, 3, 0)					\
+									\
+	    RGB (type, 2)						\
+	    DST (py_1, dst_1, 4, 0)					\
+	    DST (py_1, dst_1, 5, 0)					\
+	    DST (py_2, dst_2, 4, 1)					\
+	    DST (py_2, dst_2, 5, 1)					\
+									\
+	    RGB (type, 3)						\
+	    DST (py_2, dst_2, 6, 1)					\
+	    DST (py_2, dst_2, 7, 1)					\
+	    DST (py_1, dst_1, 6, 0)					\
+	    DST (py_1, dst_1, 7, 0)					\
+									\
+	    pu += 4;							\
+	    pv += 4;							\
+	    py_1 += 8;							\
+	    py_2 += 8;							\
+	    dst_1 += 8 * num;						\
+	    dst_2 += 8 * num;						\
+	} while (--j);							\
+	if (--i == id->base.field) { /* restart one frame line down */	\
+	    dst_1 = (type *)(id->base.rgb_ptr +				\
+			     id->base.rgb_slice * (v_offset + 1));	\
+	    py_1 = src[0] + id->base.y_stride_frame;			\
+	    pu = src[1] + id->base.uv_stride_frame;			\
+	    pv = src[2] + id->base.uv_stride_frame;			\
+	} else {							\
+	    py_1 += id->base.y_increm;					\
+	    pu += id->base.uv_increm;					\
+	    pv += id->base.uv_increm;					\
+	    dst_1 = (type *)((char *)dst_1 + id->base.rgb_increm);	\
+	    DITHER(dithpos += id->base.dither_stride;)			\
+	}								\
+    } while (i);							\
+}
+
+DECLARE_420 (rgb_c_32_420, uint32_t, 1, DST, SKIP)
+DECLARE_420 (rgb_c_24_rgb_420, uint8_t, 3, DSTRGB, SKIP)
+DECLARE_420 (rgb_c_24_bgr_420, uint8_t, 3, DSTBGR, SKIP)
+DECLARE_420 (rgb_c_16_420, uint16_t, 1, DST, SKIP)
+DECLARE_420 (rgb_c_8_420, uint8_t, 1, DSTDITHER, DO)	/* the only 4:2:0 variant that keeps the DITHER code */
+
+#define DECLARE_422(func,type,num,DST,DITHER)				\
+static void func (void * _id, uint8_t * const * src,			\
+		  unsigned int v_offset)				\
+{	/* 16 passes, one output row each */				\
+    const convert_rgb_c_t * const id = (convert_rgb_c_t *) _id;		\
+    type * dst;								\
+    const uint8_t * py, * pu, * pv;					\
+    int i;								\
+    DITHER(uint8_t dithpos = id->base.dither_offset;)			\
+									\
+    dst = (type *)(id->base.rgb_ptr + id->base.rgb_stride * v_offset);	\
+    py = src[0];	pu = src[1];	pv = src[2];			\
+									\
+    i = 16;								\
+    do {								\
+	int j, U, V, Y;							\
+	const type * r, * g, * b;					\
+	DITHER(const uint8_t * const pd = dither + 2 * dithpos;)	\
+									\
+	j = id->base.width;	/* one inner pass covers 8 pixels */	\
+	do {								\
+	    RGB (type, 0)						\
+	    DST (py, dst, 0, 0)						\
+	    DST (py, dst, 1, 0)						\
+									\
+	    RGB (type, 1)						\
+	    DST (py, dst, 2, 0)						\
+	    DST (py, dst, 3, 0)						\
+									\
+	    RGB (type, 2)						\
+	    DST (py, dst, 4, 0)						\
+	    DST (py, dst, 5, 0)						\
+									\
+	    RGB (type, 3)						\
+	    DST (py, dst, 6, 0)						\
+	    DST (py, dst, 7, 0)						\
+									\
+	    pu += 4;	/* one chroma sample per two pixels */		\
+	    pv += 4;							\
+	    py += 8;							\
+	    dst += 8 * num;						\
+	} while (--j);							\
+	py += id->base.y_increm;					\
+	pu += id->base.uv_increm;					\
+	pv += id->base.uv_increm;					\
+	dst = (type *)((char *)dst + id->base.rgb_increm);		\
+	DITHER(dithpos += id->base.dither_stride;)			\
+    } while (--i);							\
+}
+
+DECLARE_422 (rgb_c_32_422, uint32_t, 1, DST, SKIP)
+DECLARE_422 (rgb_c_24_rgb_422, uint8_t, 3, DSTRGB, SKIP)
+DECLARE_422 (rgb_c_24_bgr_422, uint8_t, 3, DSTBGR, SKIP)
+DECLARE_422 (rgb_c_16_422, uint16_t, 1, DST, SKIP)
+DECLARE_422 (rgb_c_8_422, uint8_t, 1, DSTDITHER, DO)	/* the only 4:2:2 variant that keeps the DITHER code */
+
+#define DECLARE_444(func,type,num,DST,DITHER)				\
+static void func (void * _id, uint8_t * const * src,			\
+		  unsigned int v_offset)				\
+{	/* 16 passes, one output row each */				\
+    const convert_rgb_c_t * const id = (convert_rgb_c_t *) _id;		\
+    type * dst;								\
+    const uint8_t * py, * pu, * pv;					\
+    int i;								\
+    DITHER(uint8_t dithpos = id->base.dither_offset;)			\
+									\
+    dst = (type *)(id->base.rgb_ptr + id->base.rgb_stride * v_offset);	\
+    py = src[0];	pu = src[1];	pv = src[2];			\
+									\
+    i = 16;								\
+    do {								\
+	int j, U, V, Y;							\
+	const type * r, * g, * b;					\
+	DITHER(const uint8_t * const pd = dither + 2 * dithpos;)	\
+									\
+	j = id->base.width;	/* one inner pass covers 8 pixels */	\
+	do {								\
+	    RGB (type, 0)						\
+	    DST (py, dst, 0, 0)						\
+	    RGB (type, 1)						\
+	    DST (py, dst, 1, 0)						\
+	    RGB (type, 2)						\
+	    DST (py, dst, 2, 0)						\
+	    RGB (type, 3)						\
+	    DST (py, dst, 3, 0)						\
+	    RGB (type, 4)						\
+	    DST (py, dst, 4, 0)						\
+	    RGB (type, 5)						\
+	    DST (py, dst, 5, 0)						\
+	    RGB (type, 6)						\
+	    DST (py, dst, 6, 0)						\
+	    RGB (type, 7)						\
+	    DST (py, dst, 7, 0)						\
+									\
+	    pu += 8;	/* one chroma sample per pixel */		\
+	    pv += 8;							\
+	    py += 8;							\
+	    dst += 8 * num;						\
+	} while (--j);							\
+	py += id->base.y_increm;				   	\
+	pu += id->base.y_increm; /* chroma uses y_increm here */   	\
+	pv += id->base.y_increm;				   	\
+	dst = (type *)((char *)dst + id->base.rgb_increm);		\
+	DITHER(dithpos += id->base.dither_stride;)			\
+    } while (--i);							\
+}
+
+DECLARE_444 (rgb_c_32_444, uint32_t, 1, DST, SKIP)
+DECLARE_444 (rgb_c_24_rgb_444, uint8_t, 3, DSTRGB, SKIP)
+DECLARE_444 (rgb_c_24_bgr_444, uint8_t, 3, DSTBGR, SKIP)
+DECLARE_444 (rgb_c_16_444, uint16_t, 1, DST, SKIP)
+DECLARE_444 (rgb_c_8_444, uint8_t, 1, DSTDITHER, DO)	/* the only 4:4:4 variant that keeps the DITHER code */
+
+static void rgb_start (void * _id, const mpeg2_fbuf_t * fbuf,
+		       const mpeg2_picture_t * picture,
+		       const mpeg2_gop_t * gop)
+{	/* per-picture setup of strides, row increments and dither phase */
+    convert_rgb_t * const conv = (convert_rgb_t *) _id;
+    const int one_field = (picture->nb_fields == 1);
+    const int split = one_field ||	/* true when every stride is doubled */
+	(conv->chroma420 && !(picture->flags & PIC_FLAG_PROGRESSIVE_FRAME));
+    int chroma_stride = conv->uv_stride_frame;
+    conv->rgb_ptr = fbuf->buf[0];
+    conv->y_stride = conv->y_stride_frame;
+    conv->rgb_slice = conv->rgb_stride = conv->rgb_stride_frame;
+    conv->dither_stride = 32;
+    conv->dither_offset = dither_temporal[picture->temporal_reference & 63];
+    conv->field = (split && !one_field) ? (8 >> conv->convert420) : 0;
+    if (split) {
+	chroma_stride <<= 1;
+	conv->y_stride <<= 1;
+	conv->rgb_stride <<= 1;
+	conv->dither_stride <<= 1;
+	conv->dither_offset += 16;
+	if (one_field)
+	    conv->rgb_slice <<= 1;
+	if (one_field && !(picture->flags & PIC_FLAG_TOP_FIELD_FIRST)) {
+	    conv->rgb_ptr += conv->rgb_stride_frame;	/* start one output row down */
+	    conv->dither_offset += 32;
+	}
+    }
+    conv->y_increm = (conv->y_stride << conv->convert420) - conv->y_stride_frame;
+    conv->uv_increm = chroma_stride - conv->uv_stride_frame;
+    conv->rgb_increm = (conv->rgb_stride << conv->convert420) - conv->rgb_stride_min;
+    conv->dither_stride <<= conv->convert420;
+}
+
+static inline int div_round (int dividend, int divisor)
+{
+    const int half = divisor >> 1;	/* bias that turns truncation into rounding */
+    const int magnitude = (dividend > 0) ? dividend : -dividend;
+    const int quotient = (magnitude + half) / divisor;
+    return (dividend > 0) ? quotient : -quotient;	/* ties round away from zero */
+}
+
+static unsigned int rgb_c_init (convert_rgb_c_t * id,
+				mpeg2convert_rgb_order_t order,
+				unsigned int bpp)
+{	/* id == NULL: return the table byte count for bpp; else fill the tables and return 0 */
+    int i;
+    uint8_t table_Y[1024];	/* clamped ramp; table index i reads table_Y[i+384] */
+    uint32_t * table_32 = 0;
+    uint16_t * table_16 = 0;
+    uint8_t * table_8 = 0;
+    uint8_t * table_332 = 0;
+    int entry_size = 0;
+    void * table_r = 0;
+    void * table_g = 0;
+    void * table_b = 0;
+
+    int crv = Inverse_Table_6_9[matrix_coefficients][0];
+    int cbu = Inverse_Table_6_9[matrix_coefficients][1];
+    int cgu = -Inverse_Table_6_9[matrix_coefficients][2];
+    int cgv = -Inverse_Table_6_9[matrix_coefficients][3];
+
+    for (i = 0; i < 1024; i++) {
+	int j;
+
+	j = (76309 * (i - 384 - 16) + 32768) >> 16;	/* 16.16 fixed point, +32768 rounds */
+	table_Y[i] = (j < 0) ? 0 : ((j > 255) ? 255 : j);
+    }
+
+    switch (bpp) {	/* NOTE(review): no default case; callers in this file pass 8/15/16/24/32 */
+    case 32:
+	if (!id)
+	    return (197 + 2*682 + 256 + 132) * sizeof (uint32_t);
+	table_32 = (uint32_t *) (id + 1);	/* tables start right behind *id */
+	entry_size = sizeof (uint32_t);
+	table_r = table_32 + 197;
+	table_b = table_32 + 197 + 685;
+	table_g = table_32 + 197 + 2*682;
+
+	for (i = -197; i < 256+197; i++)
+	    ((uint32_t *) table_r)[i] =
+		table_Y[i+384] << ((order == MPEG2CONVERT_RGB) ? 16 : 0);
+	for (i = -132; i < 256+132; i++)
+	    ((uint32_t *) table_g)[i] = table_Y[i+384] << 8;
+	for (i = -232; i < 256+232; i++)
+	    ((uint32_t *) table_b)[i] =
+		table_Y[i+384] << ((order == MPEG2CONVERT_RGB) ? 0 : 16);
+	break;
+
+    case 24:
+	if (!id)
+	    return (256 + 2*232) * sizeof (uint8_t);
+	table_8 = (uint8_t *) (id + 1);
+	entry_size = sizeof (uint8_t);
+	table_r = table_g = table_b = table_8 + 232;	/* one shared byte table */
+
+	for (i = -232; i < 256+232; i++)
+	    ((uint8_t * )table_b)[i] = table_Y[i+384];
+	break;
+
+    case 15:
+    case 16:
+	if (!id)
+	    return (197 + 2*682 + 256 + 132) * sizeof (uint16_t);
+	table_16 = (uint16_t *) (id + 1);
+	entry_size = sizeof (uint16_t);
+	table_r = table_16 + 197;
+	table_b = table_16 + 197 + 685;
+	table_g = table_16 + 197 + 2*682;
+
+	for (i = -197; i < 256+197; i++) {
+	    int j = table_Y[i+384] >> 3;
+
+	    if (order == MPEG2CONVERT_RGB)
+		j <<= ((bpp==16) ? 11 : 10);
+
+	    ((uint16_t *)table_r)[i] = j;
+	}
+	for (i = -132; i < 256+132; i++) {
+	    int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);	/* 6 green bits at 16 bpp, 5 at 15 */
+
+	    ((uint16_t *)table_g)[i] = j << 5;
+	}
+	for (i = -232; i < 256+232; i++) {
+	    int j = table_Y[i+384] >> 3;
+
+	    if (order == MPEG2CONVERT_BGR)
+		j <<= ((bpp==16) ? 11 : 10);
+
+	    ((uint16_t *)table_b)[i] = j;
+	}
+	break;
+
+    case 8:
+	if (!id)
+	    return (197 + 2*682 + 256 + 232 + 71) * sizeof (uint8_t);
+	table_332 = (uint8_t *) (id + 1);
+	entry_size = sizeof (uint8_t);
+	table_r = table_332 + 197;
+	table_g = table_332 + 197 + 682 + 30;
+	table_b = table_332 + 197 + 2*682;
+
+	for (i = -197; i < 256+197+30; i++)
+	    ((uint8_t *)table_r)[i] = ((table_Y[i+384] * 7 / 255) <<
+				       (order == MPEG2CONVERT_RGB ? 5 : 0));
+	for (i = -132; i < 256+132+30; i++)
+	    ((uint8_t *)table_g)[i-30] = ((table_Y[i+384] * 7 / 255) <<
+					  (order == MPEG2CONVERT_RGB ? 2 : 3));
+	for (i = -232; i < 256+232+71; i++)
+	    ((uint8_t *)table_b)[i] = ((table_Y[i+384] / 85) <<
+				       (order == MPEG2CONVERT_RGB ? 0 : 6));
+	break;
+    }
+
+    for (i = 0; i < 256; i++) {	/* per-chroma-value entry points into the tables built above */
+	id->table_rV[i] = (((uint8_t *)table_r) +
+			   entry_size * div_round (crv * (i-128), 76309));
+	id->table_gU[i] = (((uint8_t *)table_g) +
+			   entry_size * div_round (cgu * (i-128), 76309));
+	id->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
+	id->table_bU[i] = (((uint8_t *)table_b) +
+			   entry_size * div_round (cbu * (i-128), 76309));
+    }
+
+    return 0;
+}
+
+static int rgb_internal (mpeg2convert_rgb_order_t order, unsigned int bpp,
+			 int stage, void * _id, const mpeg2_sequence_t * seq,
+			 int stride, uint32_t accel, void * arg,
+			 mpeg2_convert_init_t * result)
+{	/* shared body of every DECLARE()d mpeg2convert_* entry point; arg is unused */
+    convert_rgb_t * id = (convert_rgb_t *) _id;
+    mpeg2convert_copy_t * copy = (mpeg2convert_copy_t *) 0;
+    unsigned int id_size = sizeof (convert_rgb_t);
+    int chroma420 = (seq->chroma_height < seq->height);
+    int convert420 = 0;
+    int rgb_stride_min = ((bpp + 7) >> 3) * seq->width;	/* bytes per pixel * width */
+
+#ifdef ARCH_X86
+    if (!copy && (accel & MPEG2_ACCEL_X86_MMXEXT)) {
+	convert420 = 0;
+	copy = mpeg2convert_rgb_mmxext (order, bpp, seq);
+    }
+    if (!copy && (accel & MPEG2_ACCEL_X86_MMX)) {
+	convert420 = 0;
+	copy = mpeg2convert_rgb_mmx (order, bpp, seq);
+    }
+#endif
+#ifdef ARCH_SPARC
+    if (!copy && (accel & MPEG2_ACCEL_SPARC_VIS)) {
+	convert420 = chroma420;
+	copy = mpeg2convert_rgb_vis (order, bpp, seq);
+    }
+#endif
+    if (!copy) {	/* no accelerated converter was selected: use the C ones */
+	int src, dest;
+	static void (* rgb_c[3][5]) (void *, uint8_t * const *,
+				     unsigned int) =
+	    {{rgb_c_24_bgr_420, rgb_c_8_420, rgb_c_16_420,
+	      rgb_c_24_rgb_420, rgb_c_32_420},
+	     {rgb_c_24_bgr_422, rgb_c_8_422, rgb_c_16_422,
+	      rgb_c_24_rgb_422, rgb_c_32_422},
+	     {rgb_c_24_bgr_444, rgb_c_8_444, rgb_c_16_444,
+	      rgb_c_24_rgb_444, rgb_c_32_444}};
+
+	convert420 = chroma420;
+	id_size = (sizeof (convert_rgb_c_t) +
+		   rgb_c_init ((convert_rgb_c_t *) id, order, bpp));	/* returns 0 once id is non-NULL */
+	src = ((seq->chroma_width == seq->width) +	/* row: 0 = 420, 1 = 422, 2 = 444 */
+	       (seq->chroma_height == seq->height));
+	dest = ((bpp == 24 && order == MPEG2CONVERT_BGR) ? 0 : (bpp + 7) >> 3);	/* 15 and 16 both give 2 */
+	copy = rgb_c[src][dest];
+    }
+
+    result->id_size = id_size;
+
+    if (stride < rgb_stride_min)
+	stride = rgb_stride_min;
+
+    if (stage == MPEG2_CONVERT_STRIDE)
+	return stride;
+    else if (stage == MPEG2_CONVERT_START) {
+	id->width = seq->width >> 3;	/* counted in groups of 8 pixels */
+	id->y_stride_frame = seq->width;
+	id->uv_stride_frame = seq->chroma_width;
+	id->rgb_stride_frame = stride;
+	id->rgb_stride_min = rgb_stride_min;
+	id->chroma420 = chroma420;
+	id->convert420 = convert420;
+	result->buf_size[0] = stride * seq->height;
+	result->buf_size[1] = result->buf_size[2] = 0;
+	result->start = rgb_start;
+	result->copy = copy;
+    }
+    return 0;
+}
+
+#define DECLARE(func,order,bpp)	/* non-static wrapper binding order and bpp */	\
+int func (int stage, void * id,						\
+	  const mpeg2_sequence_t * sequence, int stride,		\
+	  uint32_t accel, void * arg, mpeg2_convert_init_t * result)	\
+{									\
+    return rgb_internal (order, bpp, stage, id, sequence, stride,	\
+			 accel, arg, result);				\
+}
+
+DECLARE (mpeg2convert_rgb32, MPEG2CONVERT_RGB, 32)
+DECLARE (mpeg2convert_rgb24, MPEG2CONVERT_RGB, 24)
+DECLARE (mpeg2convert_rgb16, MPEG2CONVERT_RGB, 16)
+DECLARE (mpeg2convert_rgb15, MPEG2CONVERT_RGB, 15)
+DECLARE (mpeg2convert_rgb8, MPEG2CONVERT_RGB, 8)
+DECLARE (mpeg2convert_bgr32, MPEG2CONVERT_BGR, 32)
+DECLARE (mpeg2convert_bgr24, MPEG2CONVERT_BGR, 24)
+DECLARE (mpeg2convert_bgr16, MPEG2CONVERT_BGR, 16)
+DECLARE (mpeg2convert_bgr15, MPEG2CONVERT_BGR, 15)
+DECLARE (mpeg2convert_bgr8, MPEG2CONVERT_BGR, 8)
+
+mpeg2_convert_t * mpeg2convert_rgb (mpeg2convert_rgb_order_t order,
+				    unsigned int bpp)
+{	/* returns the converter for (order, bpp), or a null pointer if unsupported */
+    static mpeg2_convert_t * table[5][2] =	/* rows: 15, 8, 16, 24, 32 bpp; columns: RGB, BGR */
+	{{mpeg2convert_rgb15, mpeg2convert_bgr15},
+	 {mpeg2convert_rgb8, mpeg2convert_bgr8},
+	 {mpeg2convert_rgb16, mpeg2convert_bgr16},
+	 {mpeg2convert_rgb24, mpeg2convert_bgr24},
+	 {mpeg2convert_rgb32, mpeg2convert_bgr32}};
+    const int is_bgr = (order == MPEG2CONVERT_BGR);
+    if (!is_bgr && order != MPEG2CONVERT_RGB)
+	return (mpeg2_convert_t *) 0;
+    if (bpp == 15)
+	return table[0][is_bgr];
+    if (bpp < 8 || bpp > 32 || (bpp & 7) != 0)
+	return (mpeg2_convert_t *) 0;
+    return table[bpp >> 3][is_bgr];	/* 8, 16, 24, 32 map to rows 1..4 */
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/rgb_mmx.c b/src/video_dec/libmpeg2new/libmpeg2/rgb_mmx.c
new file mode 100644
index 000000000..6ca7e65a8
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/rgb_mmx.c
@@ -0,0 +1,321 @@
+/*
+ * rgb_mmx.c
+ * Copyright (C) 2000-2003 Silicon Integrated System Corp.
+ * All Rights Reserved.
+ *
+ * Author: Olie Lho <ollie@sis.com.tw>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_X86
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+#include "convert_internal.h"
+#include <xine/attributes.h>
+#include "mmx.h"
+
+#define CPU_MMXEXT 0	/* cpu value passed by the mmxext_* entry points */
+#define CPU_MMX 1	/* cpu value passed by the mmx_* entry points */
+
+/* CPU_MMXEXT/CPU_MMX adaptation layer: selects the 8-byte store macro */
+/* movntq () reads a variable named cpu from the scope it is expanded in. */
+#define movntq(src,dest)	\
+do {				\
+    if (cpu == CPU_MMXEXT)	\
+	movntq_r2m (src, dest);	\
+    else			\
+	movq_r2m (src, dest);	\
+} while (0)
+
+static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
+{
+    static mmx_t mmx_80w = {0x0080008000800080LL};	/* 128 per 16-bit word */
+    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
+    static mmx_t mmx_U_blue = {0x4093409340934093LL};
+    static mmx_t mmx_V_red = {0x3312331233123312LL};
+    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
+    static mmx_t mmx_10w = {0x1010101010101010LL};	/* 16 per byte */
+    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};	/* low byte of a word */
+    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};
+    /* 8 Y + 4 U + 4 V in; results are left in mm0 = B, mm1 = R, mm2 = G */
+    movd_m2r (*pu, mm0);		/* mm0 = 00 00 00 00 u3 u2 u1 u0 */
+    movd_m2r (*pv, mm1);		/* mm1 = 00 00 00 00 v3 v2 v1 v0 */
+    movq_m2r (*py, mm6);		/* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
+    /* XXX might do cache preload for image here */
+
+    /*
+     * Do the multiply part of the conversion
+     * register usage in this part:
+     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen (one value per pixel pair)
+     * mm3 -> scratch for v * v_green, mm4 -> zero
+     * mm6 -> Y even, mm7 -> Y odd
+     */
+
+    punpcklbw_r2r (mm4, mm0);		/* mm0 = u3 u2 u1 u0 */
+    punpcklbw_r2r (mm4, mm1);		/* mm1 = v3 v2 v1 v0 */
+    psubsw_m2r (mmx_80w, mm0);		/* u -= 128 */
+    psubsw_m2r (mmx_80w, mm1);		/* v -= 128 */
+    psllw_i2r (3, mm0);			/* promote precision */
+    psllw_i2r (3, mm1);			/* promote precision */
+    movq_r2r (mm0, mm2);		/* mm2 = u3 u2 u1 u0 */
+    movq_r2r (mm1, mm3);		/* mm3 = v3 v2 v1 v0 */
+    pmulhw_m2r (mmx_U_green, mm2);	/* mm2 = u * u_green */
+    pmulhw_m2r (mmx_V_green, mm3);	/* mm3 = v * v_green */
+    pmulhw_m2r (mmx_U_blue, mm0);	/* mm0 = chroma_b */
+    pmulhw_m2r (mmx_V_red, mm1);	/* mm1 = chroma_r */
+    paddsw_r2r (mm3, mm2);		/* mm2 = chroma_g */
+
+    psubusb_m2r (mmx_10w, mm6);		/* Y -= 16 */
+    movq_r2r (mm6, mm7);		/* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+    pand_m2r (mmx_00ffw, mm6);		/* mm6 =    Y6    Y4    Y2    Y0 */
+    psrlw_i2r (8, mm7);			/* mm7 =    Y7    Y5    Y3    Y1 */
+    psllw_i2r (3, mm6);			/* promote precision */
+    psllw_i2r (3, mm7);			/* promote precision */
+    pmulhw_m2r (mmx_Y_coeff, mm6);	/* mm6 = luma_rgb even */
+    pmulhw_m2r (mmx_Y_coeff, mm7);	/* mm7 = luma_rgb odd */
+
+    /*
+     * Do the addition part of the conversion for even and odd pixels
+     * register usage:
+     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
+     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
+     * mm6 -> Y even, mm7 -> Y odd
+     */
+
+    movq_r2r (mm0, mm3);		/* mm3 = chroma_b */
+    movq_r2r (mm1, mm4);		/* mm4 = chroma_r */
+    movq_r2r (mm2, mm5);		/* mm5 = chroma_g */
+    paddsw_r2r (mm6, mm0);		/* mm0 = B6 B4 B2 B0 */
+    paddsw_r2r (mm7, mm3);		/* mm3 = B7 B5 B3 B1 */
+    paddsw_r2r (mm6, mm1);		/* mm1 = R6 R4 R2 R0 */
+    paddsw_r2r (mm7, mm4);		/* mm4 = R7 R5 R3 R1 */
+    paddsw_r2r (mm6, mm2);		/* mm2 = G6 G4 G2 G0 */
+    paddsw_r2r (mm7, mm5);		/* mm5 = G7 G5 G3 G1 */
+    packuswb_r2r (mm0, mm0);		/* saturate to 0-255 */
+    packuswb_r2r (mm1, mm1);		/* saturate to 0-255 */
+    packuswb_r2r (mm2, mm2);		/* saturate to 0-255 */
+    packuswb_r2r (mm3, mm3);		/* saturate to 0-255 */
+    packuswb_r2r (mm4, mm4);		/* saturate to 0-255 */
+    packuswb_r2r (mm5, mm5);		/* saturate to 0-255 */
+    punpcklbw_r2r (mm3, mm0);		/* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
+    punpcklbw_r2r (mm4, mm1);		/* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
+    punpcklbw_r2r (mm5, mm2);		/* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
+}
+
+static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
+{
+    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};	/* top 5 bits */
+    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};	/* top 6 bits */
+    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};	/* top 5 bits */
+
+    /*
+     * Pack the B/R/G byte planes into eight 16-bit pixels (5 red, 6 green,
+     * 5 blue bits) and store them as two 8-byte writes at image.
+     * in: mm0 -> B, mm1 -> R, mm2 -> G
+     * scratch: mm4 -> zero, mm5 -> B copy, mm7 -> G copy (for pixels 4-7)
+     */
+
+    pand_m2r (mmx_bluemask, mm0);	/* mm0 = b7b6b5b4b3______ */
+    pand_m2r (mmx_greenmask, mm2);	/* mm2 = g7g6g5g4g3g2____ */
+    pand_m2r (mmx_redmask, mm1);	/* mm1 = r7r6r5r4r3______ */
+    psrlq_i2r (3, mm0);			/* mm0 = ______b7b6b5b4b3 */
+    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
+    movq_r2r (mm0, mm5);		/* mm5 = ______b7b6b5b4b3 */
+    movq_r2r (mm2, mm7);		/* mm7 = g7g6g5g4g3g2____ */
+
+    punpcklbw_r2r (mm4, mm2);		/* mm2 = G3 G2 G1 G0 widened to words */
+    punpcklbw_r2r (mm1, mm0);		/* mm0 words = R in high, B in low byte */
+    psllq_i2r (3, mm2);			/* move green above the 5 blue bits */
+    por_r2r (mm2, mm0);			/* merge green into the R/B words */
+    movntq (mm0, *image);		/* pixels 0-3 */
+
+    punpckhbw_r2r (mm4, mm7);		/* same steps on the high halves */
+    punpckhbw_r2r (mm1, mm5);
+    psllq_i2r (3, mm7);
+    por_r2r (mm7, mm5);
+    movntq (mm5, *(image+8));		/* pixels 4-7 */
+}
+
+static inline void mmx_unpack_32rgb (uint8_t * image, const int cpu)
+{
+    /*
+     * Interleave the planes into 32-bit pixels, memory order B, G, R, 0.
+     * in: mm0 -> B, mm1 -> R, mm2 -> G; mm3 -> 0 (cleared here)
+     * mm4 -> GB, mm5 -> AR pixel 4-7,
+     * mm6 -> GB, mm7 -> AR pixel 0-3
+     */
+
+    pxor_r2r (mm3, mm3);
+    movq_r2r (mm0, mm6);
+    movq_r2r (mm1, mm7);
+    movq_r2r (mm0, mm4);
+    movq_r2r (mm1, mm5);
+    punpcklbw_r2r (mm2, mm6);		/* mm6 = G3 B3 G2 B2 G1 B1 G0 B0 */
+    punpcklbw_r2r (mm3, mm7);		/* mm7 = 00 R3 00 R2 00 R1 00 R0 */
+    punpcklwd_r2r (mm7, mm6);
+    movntq (mm6, *image);		/* pixels 0-1 */
+    movq_r2r (mm0, mm6);
+    punpcklbw_r2r (mm2, mm6);		/* rebuild the low G/B pairs */
+    punpckhwd_r2r (mm7, mm6);
+    movntq (mm6, *(image+8));		/* pixels 2-3 */
+    punpckhbw_r2r (mm2, mm4);		/* mm4 = G7 B7 G6 B6 G5 B5 G4 B4 */
+    punpckhbw_r2r (mm3, mm5);		/* mm5 = 00 R7 00 R6 00 R5 00 R4 */
+    punpcklwd_r2r (mm5, mm4);
+    movntq (mm4, *(image+16));		/* pixels 4-5 */
+    movq_r2r (mm0, mm4);
+    punpckhbw_r2r (mm2, mm4);		/* rebuild the high G/B pairs */
+    punpckhwd_r2r (mm5, mm4);
+    movntq (mm4, *(image+24));		/* pixels 6-7 */
+}
+
+static inline void rgb16 (void * const _id, uint8_t * const * src,
+			  const unsigned int v_offset, const int cpu)
+{   /* Convert 16 source lines to 16 bpp output, eight pixels per step. */
+    convert_rgb_t * const conv = (convert_rgb_t *) _id;
+    uint8_t * out = conv->rgb_ptr + conv->rgb_slice * v_offset;
+    uint8_t * luma = src[0];
+    uint8_t * chroma_u = src[1];
+    uint8_t * chroma_v = src[2];
+    int rows = 16;
+
+    do {
+	int groups = conv->width;	/* one group = one yuv2rgb/unpack pair */
+
+	do {
+	    mmx_yuv2rgb (luma, chroma_u, chroma_v);
+	    mmx_unpack_16rgb (out, cpu);
+	    luma += 8;
+	    chroma_u += 4;
+	    chroma_v += 4;
+	    out += 16;
+	} while (--groups);
+
+	out += conv->rgb_increm;
+	luma += conv->y_increm;
+	rows--;
+	if (rows == conv->field) {
+	    out = conv->rgb_ptr + conv->rgb_slice * (v_offset + 1);
+	    luma = src[0] + conv->y_stride_frame;
+	    chroma_u = src[1] + conv->uv_stride_frame;
+	    chroma_v = src[2] + conv->uv_stride_frame;
+	} else if (rows & conv->chroma420) {
+	    chroma_u -= conv->uv_stride_frame;
+	    chroma_v -= conv->uv_stride_frame;
+	} else {
+	    chroma_u += conv->uv_increm;
+	    chroma_v += conv->uv_increm;
+	}
+    } while (rows);
+}
+
+static inline void argb32 (void * const _id, uint8_t * const * src,
+			   const unsigned int v_offset, const int cpu)
+{   /* Convert 16 source lines to 32 bpp output, eight pixels per step. */
+    convert_rgb_t * const conv = (convert_rgb_t *) _id;
+    uint8_t * out = conv->rgb_ptr + conv->rgb_slice * v_offset;
+    uint8_t * luma = src[0];
+    uint8_t * chroma_u = src[1];
+    uint8_t * chroma_v = src[2];
+    int rows = 16;
+
+    do {
+	int groups = conv->width;	/* one group = one yuv2rgb/unpack pair */
+
+	do {
+	    mmx_yuv2rgb (luma, chroma_u, chroma_v);
+	    mmx_unpack_32rgb (out, cpu);
+	    luma += 8;
+	    chroma_u += 4;
+	    chroma_v += 4;
+	    out += 32;
+	} while (--groups);
+
+	out += conv->rgb_increm;
+	luma += conv->y_increm;
+	rows--;
+	if (rows == conv->field) {
+	    out = conv->rgb_ptr + conv->rgb_slice * (v_offset + 1);
+	    luma = src[0] + conv->y_stride_frame;
+	    chroma_u = src[1] + conv->uv_stride_frame;
+	    chroma_v = src[2] + conv->uv_stride_frame;
+	} else if (rows & conv->chroma420) {
+	    chroma_u -= conv->uv_stride_frame;
+	    chroma_v -= conv->uv_stride_frame;
+	} else {
+	    chroma_u += conv->uv_increm;
+	    chroma_v += conv->uv_increm;
+	}
+    } while (rows);
+}
+
+static void mmxext_rgb16 (void * id, uint8_t * const * src,
+			  unsigned int v_offset)
+{   /* rgb16 () with cpu fixed to CPU_MMXEXT */
+    rgb16 (id, src, v_offset, CPU_MMXEXT);
+}
+
+static void mmxext_argb32 (void * id, uint8_t * const * src,
+			   unsigned int v_offset)
+{   /* argb32 () with cpu fixed to CPU_MMXEXT */
+    argb32 (id, src, v_offset, CPU_MMXEXT);
+}
+
+static void mmx_rgb16 (void * id, uint8_t * const * src, unsigned int v_offset)
+{   /* rgb16 () with cpu fixed to CPU_MMX */
+    rgb16 (id, src, v_offset, CPU_MMX);
+}
+
+static void mmx_argb32 (void * id, uint8_t * const * src,
+			unsigned int v_offset)
+{   /* argb32 () with cpu fixed to CPU_MMX */
+    argb32 (id, src, v_offset, CPU_MMX);
+}
+
+mpeg2convert_copy_t * mpeg2convert_rgb_mmxext (int order, int bpp,
+					       const mpeg2_sequence_t * seq)
+{   /* MMXEXT routine for RGB order, 16/32 bpp, chroma narrower than luma. */
+    if (order != MPEG2CONVERT_RGB || seq->chroma_width >= seq->width)
+	return NULL;	/* Fallback to C */
+    switch (bpp) {
+    case 16: return mmxext_rgb16;
+    case 32: return mmxext_argb32;
+    default: return NULL;	/* Fallback to C */
+    }
+}
+
+mpeg2convert_copy_t * mpeg2convert_rgb_mmx (int order, int bpp,
+					    const mpeg2_sequence_t * seq)
+{   /* MMX routine for RGB order, 16/32 bpp, chroma narrower than luma. */
+    if (order != MPEG2CONVERT_RGB || seq->chroma_width >= seq->width)
+	return NULL;	/* Fallback to C */
+    switch (bpp) {
+    case 16: return mmx_rgb16;
+    case 32: return mmx_argb32;
+    default: return NULL;	/* Fallback to C */
+    }
+}
+#endif
diff --git a/src/video_dec/libmpeg2new/libmpeg2/rgb_vis.c b/src/video_dec/libmpeg2new/libmpeg2/rgb_vis.c
new file mode 100644
index 000000000..cbd7c7072
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/rgb_vis.c
@@ -0,0 +1,384 @@
+/*
+ * rgb_vis.c
+ * Copyright (C) 2003 David S. Miller <davem@redhat.com>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#ifdef ARCH_SPARC
+
+#include <stddef.h>
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+#include "convert_internal.h"
+#include <xine/attributes.h>
+#include "vis.h"
+
+/* Based partially upon the MMX yuv2rgb code, see there for credits.
+ *
+ * The difference here is that since we have enough registers we
+ * process both even and odd scanlines in one pass.
+ */
+
+static const uint16_t const_2048[] ATTR_ALIGN(8) = {2048, 2048, 2048, 2048};
+static const uint16_t const_1024[] ATTR_ALIGN(8) = {1024, 1024, 1024, 1024};
+static const uint16_t const_128[] ATTR_ALIGN(8) = {128, 128, 128, 128};
+static const uint8_t const_Ugreen[] ATTR_ALIGN(8) =
+	{0xf3, 0x00, 0xf3, 0x00, 0xf3, 0x00, 0xf3, 0x00};
+static const uint8_t const_Vgreen[] ATTR_ALIGN(8) =
+	{0xe6, 0x00, 0xe6, 0x00, 0xe6, 0x00, 0xe6, 0x00};
+static const uint8_t const_Ublue_Vred[] ATTR_ALIGN(8) =
+	{0x41, 0x41, 0x41, 0x41, 0x33, 0x33, 0x33, 0x33};
+static const uint8_t const_Ycoeff[] ATTR_ALIGN(4) = {0x25, 0x25, 0x25, 0x25};
+/* Operand numbers for the vis_* macros (presumably FP registers); TMP* = scratch */
+#define TMP0		0
+#define TMP1		1
+#define TMP2		2
+#define TMP3		3
+#define TMP4		4
+#define TMP5		5
+#define TMP6		6
+#define TMP7		7
+#define TMP8		8
+#define TMP9		9
+#define TMP10		10
+#define TMP11		11
+#define TMP12		12
+#define TMP13		13
+/* CONST_VRED has no load of its own: presumably 2nd half of the CONST_UBLUE ld64 */
+#define CONST_UBLUE	14
+#define CONST_VRED	15
+#define CONST_2048	16
+/* vis_yuv2rgb results for the luma row at py */
+#define BLUE8_EVEN	18
+#define BLUE8_ODD	19
+#define RED8_EVEN	20
+#define RED8_ODD	21
+#define GREEN8_EVEN	22
+#define GREEN8_ODD	23
+/* vis_yuv2rgb results for the luma row at py + y_stride */
+#define BLUE8_2_EVEN	24
+#define BLUE8_2_ODD	25
+#define RED8_2_EVEN	26
+#define RED8_2_ODD	27
+#define GREEN8_2_EVEN	28
+#define GREEN8_2_ODD	29
+
+#define CONST_YCOEFF	30
+#define ZEROS		31
+/* widened working values: chroma, then luma (PY_0/2 first row, PY_4/6 second) */
+#define PU_0		32
+#define PU_2		34
+#define PV_0		36
+#define PV_2		38
+#define PY_0		40
+#define PY_2		42
+#define PY_4		44
+#define PY_6		46
+
+#define CONST_128	56
+#define CONST_1024	58
+#define CONST_VGREEN	60
+#define CONST_UGREEN	62
+
+static inline void vis_init_consts(void)
+{	/* Load every CONST_* operand and ZEROS; called once per converted slice. */
+	vis_set_gsr(7 << VIS_GSR_SCALEFACT_SHIFT);	/* scale field = 7 */
+
+	vis_ld64(const_2048[0], CONST_2048);
+	vis_ld64(const_1024[0], CONST_1024);
+	vis_ld64(const_Ugreen[0], CONST_UGREEN);
+	vis_ld64(const_Vgreen[0], CONST_VGREEN);
+	vis_fzeros(ZEROS);
+	vis_ld64(const_Ublue_Vred[0], CONST_UBLUE);	/* CONST_VRED: see defines */
+	vis_ld32(const_Ycoeff[0], CONST_YCOEFF);
+	vis_ld64(const_128[0],  CONST_128);
+}
+
+static inline void vis_yuv2rgb(uint8_t *py, uint8_t *pu, uint8_t *pv,
+			       int y_stride)
+{	/* Convert one step of two luma rows; results stay in the *8_* operands. */
+	vis_ld32(pu[0], TMP0);		/* U samples */
+
+	vis_ld32(pv[0], TMP2);		/* V samples */
+
+	vis_ld64(py[0], TMP4);		/* luma, first row */
+	vis_mul8x16au(TMP0, CONST_2048, PU_0);
+
+	vis_ld64_2(py, y_stride, TMP8);	/* luma, presumably at py + y_stride */
+	vis_mul8x16au(TMP2, CONST_2048, PV_0);
+
+	vis_pmerge(TMP4, TMP5, TMP6);
+
+	vis_pmerge(TMP6, TMP7, TMP4);
+
+	vis_pmerge(TMP8, TMP9, TMP10);
+
+	vis_pmerge(TMP10, TMP11, TMP8);
+	vis_mul8x16au(TMP4, CONST_2048, PY_0);
+
+	vis_psub16(PU_0, CONST_1024, PU_0);
+	vis_mul8x16au(TMP5, CONST_2048, PY_2);
+
+	vis_psub16(PV_0, CONST_1024, PV_0);
+	vis_mul8x16au(TMP8, CONST_2048, PY_4);
+
+	vis_psub16(PY_0, CONST_128, PY_0);
+	vis_mul8x16au(TMP9, CONST_2048, PY_6);
+
+	vis_psub16(PY_2, CONST_128, PY_2);
+	vis_mul8x16(CONST_YCOEFF, PY_0, PY_0);
+
+	vis_psub16(PY_4, CONST_128, PY_4);
+	vis_mul8x16(CONST_YCOEFF, PY_2, PY_2);
+
+	vis_psub16(PY_6, CONST_128, PY_6);
+	vis_mul8x16(CONST_YCOEFF, PY_4, PY_4);
+
+	vis_mul8x16(CONST_YCOEFF, PY_6, PY_6);
+	/* chroma terms: TMP10 = green (U part + V part), TMP4 = blue, TMP6 = red */
+	vis_mul8sux16(CONST_UGREEN, PU_0, TMP0);
+
+	vis_mul8sux16(CONST_VGREEN, PV_0, TMP2);
+
+	vis_mul8x16(CONST_UBLUE, PU_0, TMP4);
+
+	vis_mul8x16(CONST_VRED, PV_0, TMP6);
+	vis_padd16(TMP0, TMP2, TMP10);
+	/* add each chroma term to the four luma vectors and pack the sums */
+	vis_padd16(PY_0, TMP4, TMP0);
+
+	vis_padd16(PY_2, TMP4, TMP2);
+	vis_pack16(TMP0, BLUE8_EVEN);
+
+	vis_padd16(PY_4, TMP4, TMP0);
+	vis_pack16(TMP2, BLUE8_ODD);
+
+	vis_padd16(PY_6, TMP4, TMP2);
+	vis_pack16(TMP0, BLUE8_2_EVEN);
+
+	vis_padd16(PY_0, TMP6, TMP0);
+	vis_pack16(TMP2, BLUE8_2_ODD);
+
+	vis_padd16(PY_2, TMP6, TMP2);
+	vis_pack16(TMP0, RED8_EVEN);
+
+	vis_padd16(PY_4, TMP6, TMP0);
+	vis_pack16(TMP2, RED8_ODD);
+
+	vis_padd16(PY_6, TMP6, TMP2);
+	vis_pack16(TMP0, RED8_2_EVEN);
+
+	vis_padd16(PY_0, TMP10, TMP0);
+	vis_pack16(TMP2, RED8_2_ODD);
+
+	vis_padd16(PY_2, TMP10, TMP2);
+	vis_pack16(TMP0, GREEN8_EVEN);
+
+	vis_padd16(PY_4, TMP10, TMP0);
+	vis_pack16(TMP2, GREEN8_ODD);
+
+	vis_padd16(PY_6, TMP10, TMP2);
+	vis_pack16(TMP0, GREEN8_2_EVEN);
+
+	vis_pack16(TMP2, GREEN8_2_ODD);
+	vis_pmerge(BLUE8_EVEN, BLUE8_ODD, BLUE8_EVEN);
+	/* the remaining merges combine each EVEN/ODD pair in place */
+	vis_pmerge(BLUE8_2_EVEN, BLUE8_2_ODD, BLUE8_2_EVEN);
+
+	vis_pmerge(RED8_EVEN, RED8_ODD, RED8_EVEN);
+
+	vis_pmerge(RED8_2_EVEN, RED8_2_ODD, RED8_2_EVEN);
+
+	vis_pmerge(GREEN8_EVEN, GREEN8_ODD, GREEN8_EVEN);
+
+	vis_pmerge(GREEN8_2_EVEN, GREEN8_2_ODD, GREEN8_2_EVEN);
+}
+
+static inline void vis_unpack_32rgb(uint8_t *image, int stride)
+{	/* Store the vis_yuv2rgb results as 32 bytes on each of two output rows. */
+	vis_pmerge(ZEROS, GREEN8_EVEN, TMP0);
+	vis_pmerge(RED8_EVEN, BLUE8_EVEN, TMP2);
+	/* if vis_pmerge interleaves bytes, each pixel is stored as 0, R, G, B */
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);		/* first row, bytes 0-7 */
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);		/* first row, bytes 8-15 */
+
+	vis_pmerge(ZEROS, GREEN8_ODD, TMP8);
+	vis_pmerge(RED8_ODD, BLUE8_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);		/* first row, bytes 16-23 */
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);		/* first row, bytes 24-31 */
+
+	image += stride;			/* move to the second output row */
+
+	vis_pmerge(ZEROS, GREEN8_2_EVEN, TMP0);
+	vis_pmerge(RED8_2_EVEN, BLUE8_2_EVEN, TMP2);
+
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);
+
+	vis_pmerge(ZEROS, GREEN8_2_ODD, TMP8);
+	vis_pmerge(RED8_2_ODD, BLUE8_2_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);
+}
+
+static inline void vis_unpack_32bgr(uint8_t *image, int stride)
+{	/* Same as vis_unpack_32rgb with the red and blue operands exchanged. */
+	vis_pmerge(ZEROS, GREEN8_EVEN, TMP0);
+	vis_pmerge(BLUE8_EVEN, RED8_EVEN, TMP2);
+	/* if vis_pmerge interleaves bytes, each pixel is stored as 0, B, G, R */
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);		/* first row, bytes 0-7 */
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);		/* first row, bytes 8-15 */
+
+	vis_pmerge(ZEROS, GREEN8_ODD, TMP8);
+	vis_pmerge(BLUE8_ODD, RED8_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);		/* first row, bytes 16-23 */
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);		/* first row, bytes 24-31 */
+
+	image += stride;			/* move to the second output row */
+
+	vis_pmerge(ZEROS, GREEN8_2_EVEN, TMP0);
+	vis_pmerge(BLUE8_2_EVEN, RED8_2_EVEN, TMP2);
+
+	vis_pmerge(TMP0, TMP2, TMP4);
+	vis_st64(TMP4, image[0]);
+
+	vis_pmerge(TMP1, TMP3, TMP6);
+	vis_st64_2(TMP6, image, 8);
+
+	vis_pmerge(ZEROS, GREEN8_2_ODD, TMP8);
+	vis_pmerge(BLUE8_2_ODD, RED8_2_ODD, TMP10);
+
+	vis_pmerge(TMP8, TMP10, TMP0);
+	vis_st64_2(TMP0, image, 16);
+
+	vis_pmerge(TMP9, TMP11, TMP2);
+	vis_st64_2(TMP2, image, 24);
+}
+
+static inline void vis_yuv420_argb32(uint8_t *image,
+				     uint8_t *py, uint8_t *pu, uint8_t *pv,
+				     int width, int height, int rgb_stride,
+				     int y_stride, int uv_stride)
+{	/* Walk the picture two lines at a time, eight pixels per step. */
+	const int chroma_skip = uv_stride - (width >> 1);
+	int line_pairs = height >> 1;
+
+	do {
+		int steps = width >> 3;
+		do {
+			vis_yuv2rgb(py, pu, pv, y_stride);
+			vis_unpack_32rgb(image, rgb_stride);
+			image += 32;
+			py += 8;
+			pu += 4;
+			pv += 4;
+		} while (--steps);
+		image += (rgb_stride << 1) - 4 * width;
+		py += (y_stride << 1) - width;
+		pu += chroma_skip;
+		pv += chroma_skip;
+	} while (--line_pairs);
+}
+
+static inline void vis_yuv420_abgr32(uint8_t *image,
+				     uint8_t *py, uint8_t *pu, uint8_t *pv,
+				     int width, int height, int rgb_stride,
+				     int y_stride, int uv_stride)
+{	/* Walk the picture two lines at a time, eight pixels per step. */
+	const int chroma_skip = uv_stride - (width >> 1);
+	int line_pairs = height >> 1;
+
+	do {
+		int steps = width >> 3;
+		do {
+			vis_yuv2rgb(py, pu, pv, y_stride);
+			vis_unpack_32bgr(image, rgb_stride);
+			image += 32;
+			py += 8;
+			pu += 4;
+			pv += 4;
+		} while (--steps);
+		image += (rgb_stride << 1) - 4 * width;
+		py += (y_stride << 1) - width;
+		pu += chroma_skip;
+		pv += chroma_skip;
+	} while (--line_pairs);
+}
+
+static void vis_argb32(void *_id, uint8_t * const *src,
+		       unsigned int v_offset)
+{	/* Convert 16 lines, writing from rgb_stride * v_offset bytes into rgb_ptr. */
+	convert_rgb_t *ctx = (convert_rgb_t *) _id;
+	uint8_t *out = ctx->rgb_ptr + ctx->rgb_stride * v_offset;
+
+	vis_init_consts();
+	vis_yuv420_argb32(out, src[0], src[1], src[2], ctx->width, 16,
+			  ctx->rgb_stride, ctx->y_stride, ctx->y_stride >> 1);
+}
+
+static void vis_abgr32(void *_id, uint8_t * const *src,
+		       unsigned int v_offset)
+{	/* Convert 16 lines, writing from rgb_stride * v_offset bytes into rgb_ptr. */
+	convert_rgb_t *ctx = (convert_rgb_t *) _id;
+	uint8_t *out = ctx->rgb_ptr + ctx->rgb_stride * v_offset;
+
+	vis_init_consts();
+	vis_yuv420_abgr32(out, src[0], src[1], src[2], ctx->width, 16,
+			  ctx->rgb_stride, ctx->y_stride, ctx->y_stride >> 1);
+}
+
+mpeg2convert_copy_t *mpeg2convert_rgb_vis(int order, int bpp,
+					  const mpeg2_sequence_t * seq)
+{	/* VIS routine for 32 bpp when chroma has fewer lines than luma. */
+	if (bpp != 32 || seq->chroma_height >= seq->height)
+		return NULL;	/* Fallback to C */
+
+	if (order == MPEG2CONVERT_RGB)
+		return vis_argb32;
+	if (order == MPEG2CONVERT_BGR)
+		return vis_abgr32;
+	return NULL;	/* Fallback to C */
+}
+
+#endif /* ARCH_SPARC */
diff --git a/src/video_dec/libmpeg2new/libmpeg2/slice.c b/src/video_dec/libmpeg2new/libmpeg2/slice.c
new file mode 100644
index 000000000..ce4508639
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/slice.c
@@ -0,0 +1,2058 @@
+/*
+ * slice.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Peter Gubanov <peter@elecard.net.ru>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "../include/mpeg2.h"
+#include "../include/attributes.h"
+#include "mpeg2_internal.h"
+
+extern mpeg2_mc_t mpeg2_mc;
+extern void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
+extern void (* mpeg2_idct_add) (int last, int16_t * block,
+				uint8_t * dest, int stride);
+extern void (* mpeg2_cpu_state_save) (cpu_state_t * state);
+extern void (* mpeg2_cpu_state_restore) (cpu_state_t * state);
+
+#include "vlc.h"
+
+static inline int get_macroblock_modes (mpeg2_decoder_t * const decoder)
+{   /* Parse macroblock modes for decoder->coding_type; returns the flags. */
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int macroblock_modes;
+    const MBtab * tab;
+
+    switch (decoder->coding_type) {
+    case I_TYPE:
+	/* table lookup, then an optional DCT_TYPE_INTERLACED bit */
+	tab = MB_I + UBITS (bit_buf, 1);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if ((! (decoder->frame_pred_frame_dct)) &&
+	    (decoder->picture_structure == FRAME_PICTURE)) {
+	    macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+	    DUMPBITS (bit_buf, bits, 1);
+	}
+
+	return macroblock_modes;
+
+    case P_TYPE:
+	/* every return path below also sets MACROBLOCK_MOTION_FORWARD */
+	tab = MB_P + UBITS (bit_buf, 5);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (decoder->picture_structure != FRAME_PICTURE) {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
+	} else if (decoder->frame_pred_frame_dct) {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
+		macroblock_modes |= MC_FRAME << MOTION_TYPE_SHIFT;
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
+	}
+
+    case B_TYPE:
+	/* motion type: 2 bits from the stream, or MC_FRAME if frame_pred_frame_dct */
+	tab = MB_B + UBITS (bit_buf, 6);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (decoder->picture_structure != FRAME_PICTURE) {
+	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (decoder->frame_pred_frame_dct) {
+	    /* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
+	    macroblock_modes |= MC_FRAME << MOTION_TYPE_SHIFT;
+	    return macroblock_modes;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_INTRA)
+		goto intra;	/* intra: no motion type, only the DCT type bit */
+	    macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+	    DUMPBITS (bit_buf, bits, 2);
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+	    intra:
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case D_TYPE:
+	/* one bit is dropped; the result is always MACROBLOCK_INTRA */
+	DUMPBITS (bit_buf, bits, 1);
+	return MACROBLOCK_INTRA;
+
+    default:
+	return 0;	/* unknown coding type: no flags, no bits consumed */
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline void get_quantizer_scale (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    /* Read a 5-bit quantizer_scale_code and select the four matching tables. */
+    int quantizer_scale_code;
+
+    quantizer_scale_code = UBITS (bit_buf, 5);
+    DUMPBITS (bit_buf, bits, 5);
+    /* entries 0/1 come from quantizer_prescale, 2/3 from chroma_quantizer */
+    decoder->quantizer_matrix[0] =
+	decoder->quantizer_prescale[0][quantizer_scale_code];
+    decoder->quantizer_matrix[1] =
+	decoder->quantizer_prescale[1][quantizer_scale_code];
+    decoder->quantizer_matrix[2] =
+	decoder->chroma_quantizer[0][quantizer_scale_code];
+    decoder->quantizer_matrix[3] =
+	decoder->chroma_quantizer[1][quantizer_scale_code];
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_motion_delta (mpeg2_decoder_t * const decoder,
+				    const int f_code)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    /* Decode one signed motion vector delta; f_code extra bits refine it. */
+    int delta;
+    int sign;
+    const MVtab * tab;
+
+    if (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 1);	/* leading 1 bit: delta is zero */
+	return 0;
+    } else if (bit_buf >= 0x0c000000) {
+	/* short code via MV_4; all consumed bits are counted in one step */
+	tab = MV_4 + UBITS (bit_buf, 4);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + f_code + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code)
+	    delta += UBITS (bit_buf, f_code);
+	bit_buf <<= f_code;
+
+	return (delta ^ sign) - sign;	/* presumably negates when sign is -1 */
+
+    } else {
+	/* long code via MV_10; refills before reading the residual bits */
+	tab = MV_10 + UBITS (bit_buf, 10);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code) {
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    delta += UBITS (bit_buf, f_code);
+	    DUMPBITS (bit_buf, bits, f_code);
+	}
+
+	return (delta ^ sign) - sign;	/* presumably negates when sign is -1 */
+
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int bound_motion_vector (const int vector, const int f_code)
+{   /* Wrap vector to (5 + f_code) signed bits; shifting as unsigned avoids UB. */
+    return (int32_t) ((uint32_t) vector << (27 - f_code)) >> (27 - f_code);
+}
+
+static inline int get_dmv (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    /* Look up the next two bits in DMV_2, consume tab->len bits, return dmv. */
+    const DMVtab * tab;
+
+    tab = DMV_2 + UBITS (bit_buf, 2);
+    DUMPBITS (bit_buf, bits, tab->len);
+    return tab->dmv;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_coded_block_pattern (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    /* Decode the coded block pattern; returns the cbp field of the table hit. */
+    const CBPtab * tab;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    if (bit_buf >= 0x20000000) {
+	/* top 7 bits are at least 16 here, hence the - 16 table offset */
+	tab = CBP_7 + (UBITS (bit_buf, 7) - 16);
+	DUMPBITS (bit_buf, bits, tab->len);
+	return tab->cbp;
+
+    } else {
+	/* remaining codes are looked up through the 9-bit table */
+	tab = CBP_9 + UBITS (bit_buf, 9);
+	DUMPBITS (bit_buf, bits, tab->len);
+	return tab->cbp;
+    }
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_luma_dc_dct_diff (mpeg2_decoder_t * const decoder)
+{   /* Decode a luma DC differential, scaled by 1 << intra_dc_precision. */
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+    /* NOTE(review): dc_diff looks able to be negative; << on it would be UB. */
+    if (bit_buf < 0xf8000000) {
+	tab = DC_lum_5 + UBITS (bit_buf, 5);	/* short codes: 5-bit table */
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* counts the code and the value bits */
+	    bit_buf <<= tab->len;
+	    dc_diff =
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff << decoder->intra_dc_precision;
+	} else {
+	    DUMPBITS (bit_buf, bits, 3);	/* size 0: only the code is dropped */
+	    return 0;
+	}
+    } else {
+	tab = DC_long + (UBITS (bit_buf, 9) - 0x1e0);	/* long codes */
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len);
+	NEEDBITS (bit_buf, bits, bit_ptr);	/* refill before the value bits */
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff << decoder->intra_dc_precision;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+static inline int get_chroma_dc_dct_diff (mpeg2_decoder_t * const decoder)
+{   /* Decode a chroma DC differential, scaled by 1 << intra_dc_precision. */
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+    /* NOTE(review): dc_diff looks able to be negative; << on it would be UB. */
+    if (bit_buf < 0xf8000000) {
+	tab = DC_chrom_5 + UBITS (bit_buf, 5);	/* short codes: 5-bit table */
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* counts the code and the value bits */
+	    bit_buf <<= tab->len;
+	    dc_diff =
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff << decoder->intra_dc_precision;
+	} else {
+	    DUMPBITS (bit_buf, bits, 2);	/* size 0: only the code is dropped */
+	    return 0;
+	}
+    } else {
+	tab = DC_long + (UBITS (bit_buf, 10) - 0x3e0);	/* long codes */
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len + 1);	/* one bit more than luma */
+	NEEDBITS (bit_buf, bits, bit_ptr);	/* refill before the value bits */
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff << decoder->intra_dc_precision;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+#define SATURATE(val)	/* val *= 16; clamp if not an int16_t */	\
+do {						\
+    val <<= 4;					\
+    if (unlikely (val != (int16_t) val))	\
+	val = (SBITS (val, 1) ^ 2047) << 4; /* presumably +/- limit by sign */ \
+} while (0)
+
+static void get_intra_block_B14 (mpeg2_decoder_t * const decoder,
+				 const uint16_t * const quant_matrix)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* Decode run/level codes into decoder->DCTblock using the DCT_B14* tables. */
+    i = 0;
+    mismatch = ~dest[0];
+
+    bit_buf = decoder->bitstream_buf;	/* work on local copies of the state */
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+
+	normal_code:	/* shared by every table branch below */
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* the + 1 accounts for the sign bit */
+	    val = (tab->level * quant_matrix[j]) >> 4;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;	/* 6-bit run after the escape */
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = (SBITS (bit_buf, 12) * quant_matrix[j]) / 16;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 16;	/* flip bit 4 of dest[63] if the XOR sum has it */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;	/* write the local state back */
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+}
+/* Decode the run/level codes of one intra block into decoder->DCTblock (the intra_vlc_format path). */
+static void get_intra_block_B15 (mpeg2_decoder_t * const decoder,
+				 const uint16_t * const quant_matrix)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* i indexes scan[]; mismatch collects the XOR of every value stored below */
+    i = 0;
+    mismatch = ~dest[0];
+    /* work on local copies of the bitstream state; stored back before returning */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x04000000) {
+	    /* one of the top six bits is set: look the code up in DCT_B15_8 */
+	    tab = DCT_B15_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64) {
+		/* shared tail for every table-coded coefficient (also a goto target) */
+	    normal_code:
+		j = scan[i];
+		bit_buf <<= tab->len;
+		bits += tab->len + 1;	/* tab->len bits plus the 1 bit shifted out below */
+		val = (tab->level * quant_matrix[j]) >> 4;
+
+		/* sign bit, branch-free equivalent of: if (bitstream_get (1)) val = -val; */
+		val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		bit_buf <<= 1;
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    } else {
+
+		/* end of block. I commented out this code because if we */
+		/* don't exit here we will still exit at the later test :) */
+
+		/* if (i >= 128) break;	*/	/* end of block */
+
+		/* escape code: 6-bit run field here, then a 12-bit level field below */
+
+		i += UBITS (bit_buf << 6, 6) - 64;
+		if (i >= 64)
+		    break;	/* illegal, check against buffer overflow */
+
+		j = scan[i];
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		val = (SBITS (bit_buf, 12) * quant_matrix[j]) / 16;
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+
+		continue;
+
+	    }
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B15_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 16;	/* flip bit 4 of dest[63] when bit 4 of the XOR sum is set */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+}
+/* Decode the run/level codes of one non-intra block into decoder->DCTblock; returns the final i. */
+static int get_non_intra_block (mpeg2_decoder_t * const decoder,
+				const uint16_t * const quant_matrix)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* i indexes scan[]; mismatch collects the XOR of every value stored below */
+    i = -1;
+    mismatch = -1;
+    /* work on local copies of the bitstream state; stored back before returning */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+    /* the first code is looked up in DCT_B14DC_5 rather than DCT_B14AC_5 */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) {
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+    /* the loop is only ever entered through entry_1 / entry_2, never from its top */
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* shared tail for every table-coded coefficient (also a goto target) */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* tab->len bits plus the 1 bit shifted out below */
+	    val = ((2 * tab->level + 1) * quant_matrix[j]) >> 5;
+
+	    /* sign bit, branch-free equivalent of: if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code: 6-bit run field here, then a 12-bit level field below */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1;
+	    val = (val * quant_matrix[j]) / 32;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 16;	/* flip bit 4 of dest[63] when bit 4 of the XOR sum is set */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+    return i;
+}
+/* MPEG-1 path: decode the run/level codes of one intra block, using decoder->quantizer_matrix[0]. */
+static void get_mpeg1_intra_block (mpeg2_decoder_t * const decoder)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    const uint16_t * const quant_matrix = decoder->quantizer_matrix[0];
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+
+    i = 0;
+    /* work on local copies of the bitstream state; stored back before returning */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* shared tail for every table-coded coefficient (also a goto target) */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* tab->len bits plus the 1 bit shifted out below */
+	    val = (tab->level * quant_matrix[j]) >> 4;
+
+	    /* oddification: odd values are kept, even values are lowered by one */
+	    val = (val - 1) | 1;
+
+	    /* sign bit, branch-free equivalent of: if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code: 6-bit run field here, then an 8- or 16-bit level below */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {	/* low 7 bits clear: combine with the next 8 bits */
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = (val * quant_matrix[j]) / 16;
+
+	    /* oddification, here applied to an already signed value */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+}
+/* MPEG-1 path: decode one non-intra block using decoder->quantizer_matrix[1]; returns the final i. */
+static int get_mpeg1_non_intra_block (mpeg2_decoder_t * const decoder)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    const uint16_t * const quant_matrix = decoder->quantizer_matrix[1];
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+
+    i = -1;
+    /* work on local copies of the bitstream state; stored back before returning */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+    /* the first code is looked up in DCT_B14DC_5 rather than DCT_B14AC_5 */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    if (bit_buf >= 0x28000000) {
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
+	goto entry_1;
+    } else
+	goto entry_2;
+    /* the loop is only ever entered through entry_1 / entry_2, never from its top */
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* shared tail for every table-coded coefficient (also a goto target) */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;	/* tab->len bits plus the 1 bit shifted out below */
+	    val = ((2 * tab->level + 1) * quant_matrix[j]) >> 5;
+
+	    /* oddification: odd values are kept, even values are lowered by one */
+	    val = (val - 1) | 1;
+
+	    /* sign bit, branch-free equivalent of: if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code: 6-bit run field here, then an 8- or 16-bit level below */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64)
+		break;	/* illegal, check needed to avoid buffer overflow */
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {	/* low 7 bits clear: combine with the next 8 bits */
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = 2 * (val + SBITS (val, 1)) + 1;
+	    val = (val * quant_matrix[j]) / 32;
+
+	    /* oddification, here applied to an already signed value */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+    return i;
+}
+/* Decode one intra block for component cc and hand decoder->DCTblock to mpeg2_idct_copy. */
+static inline void slice_intra_DCT (mpeg2_decoder_t * const decoder,
+				    const int cc,
+				    uint8_t * const dest, const int stride)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    /* DC term: add the decoded difference to this component's running predictor */
+    if (cc != 0)
+	decoder->dc_dct_pred[cc] += get_chroma_dc_dct_diff (decoder);
+    else
+	decoder->dc_dct_pred[0] += get_luma_dc_dct_diff (decoder);
+    decoder->DCTblock[0] = decoder->dc_dct_pred[cc];
+    /* other coefficients: nothing is read for MPEG-1 pictures of coding type D_TYPE */
+    if (!decoder->mpeg1) {
+	const uint16_t * const matrix = decoder->quantizer_matrix[cc ? 2 : 0];
+	if (decoder->intra_vlc_format)
+	    get_intra_block_B15 (decoder, matrix);
+	else
+	    get_intra_block_B14 (decoder, matrix);
+    } else if (decoder->coding_type != D_TYPE)
+	get_mpeg1_intra_block (decoder);
+    mpeg2_idct_copy (decoder->DCTblock, dest, stride);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one non-intra block for component cc and hand it to mpeg2_idct_add. */
+static inline void slice_non_intra_DCT (mpeg2_decoder_t * const decoder,
+					const int cc,
+					uint8_t * const dest, const int stride)
+{
+    /* both block decoders return their final scan index */
+    const int last = decoder->mpeg1
+	? get_mpeg1_non_intra_block (decoder)
+	: get_non_intra_block (decoder,
+			       decoder->quantizer_matrix[cc ? 3 : 1]);
+
+    /* mpeg2_idct_add receives that index as its first argument */
+    mpeg2_idct_add (last, decoder->DCTblock, dest, stride);
+}
+/* Clamp the vector, predict plane 0, then planes 1/2 with the halved vector and size/2; alters motion_x/y. */
+#define MOTION_420(table,ref,motion_x,motion_y,size,y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y_ ## size)) {			      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y_ ## size;	      \
+	motion_y = pos_y - 2 * decoder->v_offset - 2 * y;		      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    table[xy_half] (decoder->dest[0] + y * decoder->stride + decoder->offset, \
+		    ref[0] + (pos_x >> 1) + (pos_y >> 1) * decoder->stride,   \
+		    decoder->stride, size);				      \
+    motion_x /= 2;	motion_y /= 2;					      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      ((((decoder->v_offset + motion_y) >> 1) + y/2) *		      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + y/2 * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      decoder->uv_stride, size/2);			      \
+    table[4+xy_half] (decoder->dest[2] + y/2 * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      decoder->uv_stride, size/2)
+/* Row-interleaved variant: rows step by 2*stride (8 rows plane 0, 4 rows planes 1/2); alters motion_x/y. */
+#define MOTION_FIELD_420(table,ref,motion_x,motion_y,dest_field,op,src_field) \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    table[xy_half] (decoder->dest[0] + dest_field * decoder->stride +	      \
+		    decoder->offset,					      \
+		    (ref[0] + (pos_x >> 1) +				      \
+		     ((pos_y op) + src_field) * decoder->stride),	      \
+		    2 * decoder->stride, 8);				      \
+    motion_x /= 2;	motion_y /= 2;					      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      (((decoder->v_offset >> 1) + (motion_y op) + src_field) *	      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      2 * decoder->uv_stride, 4);			      \
+    table[4+xy_half] (decoder->dest[2] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      2 * decoder->uv_stride, 4)
+/* Applies one vector to both interleaved row sets (start rows 0 and 1, step 2*stride) of every plane. */
+#define MOTION_DMV_420(table,ref,motion_x,motion_y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset,			      \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    motion_x /= 2;	motion_y /= 2;					      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      (((decoder->v_offset >> 1) + (motion_y & ~1)) *		      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + (decoder->offset >> 1),	      \
+		      ref[1] + offset, 2 * decoder->uv_stride, 4);	      \
+    table[4+xy_half] (decoder->dest[1] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),				      \
+		      ref[1] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 4);			      \
+    table[4+xy_half] (decoder->dest[2] + (decoder->offset >> 1),	      \
+		      ref[2] + offset, 2 * decoder->uv_stride, 4);	      \
+    table[4+xy_half] (decoder->dest[2] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),				      \
+		      ref[2] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 4)
+/* Zero-vector case: no clamping; table[0] with 16 rows for plane 0, table[4] with 8 rows for planes 1/2. */
+#define MOTION_ZERO_420(table,ref)					      \
+    table[0] (decoder->dest[0] + decoder->offset,			      \
+	      (ref[0] + decoder->offset +				      \
+	       decoder->v_offset * decoder->stride), decoder->stride, 16);    \
+    offset = ((decoder->offset >> 1) +					      \
+	      (decoder->v_offset >> 1) * decoder->uv_stride);		      \
+    table[4] (decoder->dest[1] + (decoder->offset >> 1),		      \
+	      ref[1] + offset, decoder->uv_stride, 8);			      \
+    table[4] (decoder->dest[2] + (decoder->offset >> 1),		      \
+	      ref[2] + offset, decoder->uv_stride, 8)
+/* Planes 1/2 keep the full row count here: only motion_x is halved and they are called with size rows. */
+#define MOTION_422(table,ref,motion_x,motion_y,size,y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y_ ## size)) {			      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y_ ## size;	      \
+	motion_y = pos_y - 2 * decoder->v_offset - 2 * y;		      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + (pos_y >> 1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + y * decoder->stride + decoder->offset, \
+		    ref[0] + offset, decoder->stride, size);		      \
+    offset = (offset + (motion_x & (motion_x < 0))) >> 1;		      \
+    motion_x /= 2;							      \
+    xy_half = ((pos_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (decoder->dest[1] + y * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      decoder->uv_stride, size);			      \
+    table[4+xy_half] (decoder->dest[2] + y * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      decoder->uv_stride, size)
+/* Row-interleaved variant: 8 rows per plane at a step of two strides; only motion_x is halved for planes 1/2. */
+#define MOTION_FIELD_422(table,ref,motion_x,motion_y,dest_field,op,src_field) \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + ((pos_y op) + src_field) * decoder->stride;	      \
+    table[xy_half] (decoder->dest[0] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[0] + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    offset = (offset + (motion_x & (motion_x < 0))) >> 1;		      \
+    motion_x /= 2;							      \
+    xy_half = ((pos_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (decoder->dest[1] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      2 * decoder->uv_stride, 8);			      \
+    table[4+xy_half] (decoder->dest[2] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      2 * decoder->uv_stride, 8)
+/* Applies one vector to both interleaved row sets of every plane; planes 1/2 halve only motion_x. */
+#define MOTION_DMV_422(table,ref,motion_x,motion_y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset,			      \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    offset = (offset + (motion_x & (motion_x < 0))) >> 1;		      \
+    motion_x /= 2;							      \
+    xy_half = ((pos_y & 1) << 1) | (motion_x & 1);			      \
+    table[4+xy_half] (decoder->dest[1] + (decoder->offset >> 1),	      \
+		      ref[1] + offset, 2 * decoder->uv_stride, 8);	      \
+    table[4+xy_half] (decoder->dest[1] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),				      \
+		      ref[1] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 8);			      \
+    table[4+xy_half] (decoder->dest[2] + (decoder->offset >> 1),	      \
+		      ref[2] + offset, 2 * decoder->uv_stride, 8);	      \
+    table[4+xy_half] (decoder->dest[2] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),				      \
+		      ref[2] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 8)
+/* Zero-vector case: planes 1/2 use half of plane 0's byte offset, table[4] and the full 16 rows. */
+#define MOTION_ZERO_422(table,ref)					      \
+    offset = decoder->offset + decoder->v_offset * decoder->stride;	      \
+    table[0] (decoder->dest[0] + decoder->offset,			      \
+	      ref[0] + offset, decoder->stride, 16);			      \
+    offset >>= 1;							      \
+    table[4] (decoder->dest[1] + (decoder->offset >> 1),		      \
+	      ref[1] + offset, decoder->uv_stride, 16);			      \
+    table[4] (decoder->dest[2] + (decoder->offset >> 1),		      \
+	      ref[2] + offset, decoder->uv_stride, 16)
+/* All three planes share offset, stride, size and table[xy_half]; motion_x/y change only when clamped. */
+#define MOTION_444(table,ref,motion_x,motion_y,size,y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y_ ## size)) {			      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y_ ## size;	      \
+	motion_y = pos_y - 2 * decoder->v_offset - 2 * y;		      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + (pos_y >> 1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + y * decoder->stride + decoder->offset, \
+		    ref[0] + offset, decoder->stride, size);		      \
+    table[xy_half] (decoder->dest[1] + y * decoder->stride + decoder->offset, \
+		    ref[1] + offset, decoder->stride, size);		      \
+    table[xy_half] (decoder->dest[2] + y * decoder->stride + decoder->offset, \
+		    ref[2] + offset, decoder->stride, size)
+/* Row-interleaved variant: the same 8-row call (step 2*stride, same offset) is issued for each plane. */
+#define MOTION_FIELD_444(table,ref,motion_x,motion_y,dest_field,op,src_field) \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + ((pos_y op) + src_field) * decoder->stride;	      \
+    table[xy_half] (decoder->dest[0] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[0] + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    table[xy_half] (decoder->dest[1] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[1] + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    table[xy_half] (decoder->dest[2] + dest_field * decoder->stride +	      \
+		    decoder->offset, ref[2] + offset,			      \
+		    2 * decoder->stride, 8)
+/* Applies one vector to both interleaved row sets (start rows 0 and 1) of each plane, same offset for all. */
+#define MOTION_DMV_444(table,ref,motion_x,motion_y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset,			      \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    table[xy_half] (decoder->dest[1] + decoder->offset,			      \
+		    ref[1] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[1] + decoder->stride + decoder->offset,     \
+		    ref[1] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    table[xy_half] (decoder->dest[2] + decoder->offset,			      \
+		    ref[2] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[2] + decoder->stride + decoder->offset,     \
+		    ref[2] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8)
+/* Zero-vector case: every plane uses the same offset, stride and table[0], as MOTION_444 does for (0,0). */
+#define MOTION_ZERO_444(table,ref)					      \
+    offset = decoder->offset + decoder->v_offset * decoder->stride;	      \
+    table[0] (decoder->dest[0] + decoder->offset,			      \
+	      ref[0] + offset, decoder->stride, 16);			      \
+    table[0] (decoder->dest[1] + decoder->offset,			      \
+	      ref[1] + offset, decoder->stride, 16);			      \
+    table[0] (decoder->dest[2] + decoder->offset,			      \
+	      ref[2] + offset, decoder->stride, 16)
+/* Alias bit_buf, bits and bit_ptr to the decoder's bitstream fields for the code that follows. */
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+/* Read an (x, y) motion vector, store it in motion->pmv[0] and run MOTION_420 against motion->ref[0]. */
+static void motion_mp1 (mpeg2_decoder_t * const decoder,
+			motion_t * const motion,
+			mpeg2_mc_fct * const * const table)
+{
+    int motion_x, motion_y;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* scratch used by MOTION_420 */
+
+    /* x: predictor plus the delta shifted left by f_code[1], then bounded */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_x = get_motion_delta (decoder, motion->f_code[0]);
+    motion_x = motion->pmv[0][0] + (motion_x << motion->f_code[1]);
+    motion_x = bound_motion_vector (motion_x,
+				    motion->f_code[0] + motion->f_code[1]);
+    motion->pmv[0][0] = motion_x;
+
+    /* y: same steps and same f_code entries, applied to pmv[0][1] */
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    motion_y = get_motion_delta (decoder, motion->f_code[0]);
+    motion_y = motion->pmv[0][1] + (motion_y << motion->f_code[1]);
+    motion_y = bound_motion_vector (motion_y,
+				    motion->f_code[0] + motion->f_code[1]);
+    motion->pmv[0][1] = motion_y;
+
+    MOTION_420 (table, motion->ref[0], motion_x, motion_y, 16, 0);
+}
+
+#define MOTION_FUNCTIONS(FORMAT,MOTION,MOTION_FIELD,MOTION_DMV,MOTION_ZERO)   \
+									      \
+static void motion_fr_frame_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+									      \
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);		      \
+}									      \
+									      \
+static void motion_fr_field_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y, field;					      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    field = UBITS (bit_buf, 1);						      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[0][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = ((motion->pmv[0][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[0][1] = motion_y << 1;					      \
+									      \
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 0, & ~1, field); \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    field = UBITS (bit_buf, 1);						      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = ((motion->pmv[1][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[1][1] = motion_y << 1;					      \
+									      \
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 1, & ~1, field); \
+}									      \
+									      \
+static void motion_fr_dmv_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				    motion_t * const motion,		      \
+				    mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y, dmv_x, dmv_y, m, other_x, other_y;		      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    dmv_x = get_dmv (decoder);						      \
+									      \
+    motion_y = ((motion->pmv[0][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1;		      \
+    dmv_y = get_dmv (decoder);						      \
+									      \
+    m = decoder->top_field_first ? 1 : 3;				      \
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;		      \
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;	      \
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 0, | 1, 0); \
+									      \
+    m = decoder->top_field_first ? 3 : 1;				      \
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;		      \
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;	      \
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 1, & ~1, 0);\
+									      \
+    MOTION_DMV (mpeg2_mc.avg, motion->ref[0], motion_x, motion_y);	      \
+}									      \
+									      \
+static void motion_reuse_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				   motion_t * const motion,		      \
+				   mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    motion_x = motion->pmv[0][0];					      \
+    motion_y = motion->pmv[0][1];					      \
+									      \
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);		      \
+}									      \
+									      \
+static void motion_zero_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				  motion_t * const motion,		      \
+				  mpeg2_mc_fct * const * const table)	      \
+{									      \
+    unsigned int offset;						      \
+									      \
+    motion->pmv[0][0] = motion->pmv[0][1] = 0;				      \
+    motion->pmv[1][0] = motion->pmv[1][1] = 0;				      \
+									      \
+    MOTION_ZERO (table, motion->ref[0]);				      \
+}									      \
+									      \
+static void motion_fi_field_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y;						      \
+    uint8_t ** ref_field;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];			      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+									      \
+    MOTION (table, ref_field, motion_x, motion_y, 16, 0);		      \
+}									      \
+									      \
+static void motion_fi_16x8_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				     motion_t * const motion,		      \
+				     mpeg2_mc_fct * const * const table)      \
+{									      \
+    int motion_x, motion_y;						      \
+    uint8_t ** ref_field;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];			      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[0][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[0][1] = motion_y;					      \
+									      \
+    MOTION (table, ref_field, motion_x, motion_y, 8, 0);		      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];			      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_y = motion->pmv[1][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion_y;					      \
+									      \
+    MOTION (table, ref_field, motion_x, motion_y, 8, 8);		      \
+}									      \
+									      \
+static void motion_fi_dmv_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				    motion_t * const motion,		      \
+				    mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y, other_x, other_y;				      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+    NEEDBITS (bit_buf, bits, bit_ptr);					      \
+    other_x = ((motion_x + (motion_x > 0)) >> 1) + get_dmv (decoder);	      \
+									      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+    other_y = (((motion_y + (motion_y > 0)) >> 1) + get_dmv (decoder) +	      \
+	       decoder->dmv_offset);					      \
+									      \
+    MOTION (mpeg2_mc.put, motion->ref[0], motion_x, motion_y, 16, 0);	      \
+    MOTION (mpeg2_mc.avg, motion->ref[1], other_x, other_y, 16, 0);	      \
+}									      \
+
+MOTION_FUNCTIONS (420, MOTION_420, MOTION_FIELD_420, MOTION_DMV_420,
+		  MOTION_ZERO_420)	/* defines the motion_*_420 parsers */
+MOTION_FUNCTIONS (422, MOTION_422, MOTION_FIELD_422, MOTION_DMV_422,
+		  MOTION_ZERO_422)	/* defines the motion_*_422 parsers */
+MOTION_FUNCTIONS (444, MOTION_444, MOTION_FIELD_444, MOTION_DMV_444,
+		  MOTION_ZERO_444)	/* defines the motion_*_444 parsers */
+
+/* frame picture: parse one concealment vector, update forward PMVs, no compensation */
+static void motion_fr_conceal (mpeg2_decoder_t * const decoder)
+{
+    motion_t * const fwd = &(decoder->f_motion);
+    int mv_x, mv_y;
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    mv_x = fwd->pmv[0][0] + get_motion_delta (decoder, fwd->f_code[0]);
+    mv_x = bound_motion_vector (mv_x, fwd->f_code[0]);
+    fwd->pmv[0][0] = mv_x;	/* both predictor rows follow the new vector */
+    fwd->pmv[1][0] = mv_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    mv_y = fwd->pmv[0][1] + get_motion_delta (decoder, fwd->f_code[1]);
+    mv_y = bound_motion_vector (mv_y, fwd->f_code[1]);
+    fwd->pmv[0][1] = mv_y;
+    fwd->pmv[1][1] = mv_y;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+}
+
+/* field picture: skip field_select, parse one concealment vector, update forward PMVs */
+static void motion_fi_conceal (mpeg2_decoder_t * const decoder)
+{
+    motion_t * const fwd = &(decoder->f_motion);
+    int mv_x, mv_y;
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    DUMPBITS (bit_buf, bits, 1); /* remove field_select */
+    mv_x = fwd->pmv[0][0] + get_motion_delta (decoder, fwd->f_code[0]);
+    mv_x = bound_motion_vector (mv_x, fwd->f_code[0]);
+    fwd->pmv[0][0] = mv_x;	/* both predictor rows follow the new vector */
+    fwd->pmv[1][0] = mv_x;
+
+    NEEDBITS (bit_buf, bits, bit_ptr);
+    mv_y = fwd->pmv[0][1] + get_motion_delta (decoder, fwd->f_code[1]);
+    mv_y = bound_motion_vector (mv_y, fwd->f_code[1]);
+    fwd->pmv[0][1] = mv_y;
+    fwd->pmv[1][1] = mv_y;
+
+    DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
+}
+
+#undef bit_buf	/* slice_init and mpeg2_slice below #define their own copies */
+#undef bits
+#undef bit_ptr
+/* call routine per direction flag: forward with mpeg2_mc.put; backward with .avg if forward ran too, else .put */
+#define MOTION_CALL(routine,direction)				\
+do {								\
+    if ((direction) & MACROBLOCK_MOTION_FORWARD)		\
+	routine (decoder, &(decoder->f_motion), mpeg2_mc.put);	\
+    if ((direction) & MACROBLOCK_MOTION_BACKWARD)		\
+	routine (decoder, &(decoder->b_motion),			\
+		 ((direction) & MACROBLOCK_MOTION_FORWARD ?	\
+		  mpeg2_mc.avg : mpeg2_mc.put));		\
+} while (0)
+/* step 16 pixels right; at row end run convert (if set), move dest down, and RETURN from the caller past limit_y */
+#define NEXT_MACROBLOCK							\
+do {									\
+    decoder->offset += 16;						\
+    if (decoder->offset == decoder->width) {				\
+	do { /* just so we can use the break statement */		\
+	    if (decoder->convert) {					\
+		decoder->convert (decoder->convert_id, decoder->dest,	\
+				  decoder->v_offset);			\
+		if (decoder->coding_type == B_TYPE)			\
+		    break;						\
+	    }								\
+	    decoder->dest[0] += decoder->slice_stride;			\
+	    decoder->dest[1] += decoder->slice_uv_stride;		\
+	    decoder->dest[2] += decoder->slice_uv_stride;		\
+	} while (0);							\
+	decoder->v_offset += 16;					\
+	if (decoder->v_offset > decoder->limit_y) {			\
+	    if (mpeg2_cpu_state_restore)				\
+		mpeg2_cpu_state_restore (&cpu_state);			\
+	    return;							\
+	}								\
+	decoder->offset = 0;						\
+    }									\
+} while (0)
+
+/* Bind the three frame buffers to the decoder for the picture about to be decoded, */
+/* then derive strides and limits and select the motion vector parsers. */
+void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
+		      uint8_t * forward_fbuf[3], uint8_t * backward_fbuf[3])
+{
+    const int is_bottom = (decoder->picture_structure == BOTTOM_FIELD);
+    const int is_frame = (decoder->picture_structure == FRAME_PICTURE);
+    int stride = decoder->stride_frame;
+    int height = decoder->height;
+    int offset = is_bottom ? stride : 0;	/* a bottom field starts one row down */
+    int plane;
+
+    /* plane 0 is shifted by offset, planes 1 and 2 by half of it */
+    for (plane = 0; plane < 3; plane++) {
+	const int skip = plane ? (offset >> 1) : offset;
+	decoder->picture_dest[plane] = current_fbuf[plane] + skip;
+	decoder->f_motion.ref[0][plane] = forward_fbuf[plane] + skip;
+	decoder->b_motion.ref[0][plane] = backward_fbuf[plane] + skip;
+    }
+
+    if (!is_frame) {
+	decoder->dmv_offset = is_bottom ? 1 : -1;
+	decoder->f_motion.ref2[0] = decoder->f_motion.ref[is_bottom];
+	decoder->f_motion.ref2[1] = decoder->f_motion.ref[!is_bottom];
+	decoder->b_motion.ref2[0] = decoder->b_motion.ref[is_bottom];
+	decoder->b_motion.ref2[1] = decoder->b_motion.ref[!is_bottom];
+
+	/* second field of a non-B picture: forward ref[1] comes from the current buffer */
+	if (decoder->second_field && (decoder->coding_type != B_TYPE))
+	    forward_fbuf = current_fbuf;
+
+	offset = stride - offset;	/* row offset of the other field */
+	for (plane = 0; plane < 3; plane++) {
+	    const int skip = plane ? (offset >> 1) : offset;
+	    decoder->f_motion.ref[1][plane] = forward_fbuf[plane] + skip;
+	    decoder->b_motion.ref[1][plane] = backward_fbuf[plane] + skip;
+	}
+
+	stride <<= 1;	/* rows of one field are two frame rows apart */
+	height >>= 1;	/* and a field has half as many rows */
+    }
+
+    decoder->stride = stride;
+    decoder->uv_stride = stride >> 1;
+    decoder->slice_stride = 16 * stride;
+    decoder->slice_uv_stride =
+	decoder->slice_stride >> (2 - decoder->chroma_format);
+    decoder->limit_x = 2 * decoder->width - 32;
+    decoder->limit_y_16 = 2 * height - 32;
+    decoder->limit_y_8 = 2 * height - 16;
+    decoder->limit_y = height - 16;
+
+    if (decoder->mpeg1) {
+	decoder->motion_parser[0] = motion_zero_420;
+	decoder->motion_parser[MC_FRAME] = motion_mp1;
+	decoder->motion_parser[4] = motion_reuse_420;
+	return;
+    }
+
+    switch (decoder->chroma_format) {
+    case 0:
+	decoder->motion_parser[0] = motion_zero_420;
+	if (is_frame) {
+	    decoder->motion_parser[MC_FIELD] = motion_fr_field_420;
+	    decoder->motion_parser[MC_FRAME] = motion_fr_frame_420;
+	    decoder->motion_parser[MC_DMV] = motion_fr_dmv_420;
+	} else {
+	    decoder->motion_parser[MC_FIELD] = motion_fi_field_420;
+	    decoder->motion_parser[MC_16X8] = motion_fi_16x8_420;
+	    decoder->motion_parser[MC_DMV] = motion_fi_dmv_420;
+	}
+	decoder->motion_parser[4] = motion_reuse_420;
+	break;
+    case 1:
+	decoder->motion_parser[0] = motion_zero_422;
+	if (is_frame) {
+	    decoder->motion_parser[MC_FIELD] = motion_fr_field_422;
+	    decoder->motion_parser[MC_FRAME] = motion_fr_frame_422;
+	    decoder->motion_parser[MC_DMV] = motion_fr_dmv_422;
+	} else {
+	    decoder->motion_parser[MC_FIELD] = motion_fi_field_422;
+	    decoder->motion_parser[MC_16X8] = motion_fi_16x8_422;
+	    decoder->motion_parser[MC_DMV] = motion_fi_dmv_422;
+	}
+	decoder->motion_parser[4] = motion_reuse_422;
+	break;
+    default:
+	decoder->motion_parser[0] = motion_zero_444;
+	if (is_frame) {
+	    decoder->motion_parser[MC_FIELD] = motion_fr_field_444;
+	    decoder->motion_parser[MC_FRAME] = motion_fr_frame_444;
+	    decoder->motion_parser[MC_DMV] = motion_fr_dmv_444;
+	} else {
+	    decoder->motion_parser[MC_FIELD] = motion_fi_field_444;
+	    decoder->motion_parser[MC_16X8] = motion_fi_16x8_444;
+	    decoder->motion_parser[MC_DMV] = motion_fi_dmv_444;
+	}
+	decoder->motion_parser[4] = motion_reuse_444;
+	break;
+    }
+}
+/* Parse the slice header for macroblock row `code' (1-based) and locate the first macroblock; nonzero return = drop the slice. */
+static inline int slice_init (mpeg2_decoder_t * const decoder, int code)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int offset;
+    const MBAtab * mba;
+    /* every slice starts with fresh DC predictors and zeroed motion vector predictors */
+    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+	decoder->dc_dct_pred[2] = 16384;
+
+    decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+    decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
+    decoder->b_motion.pmv[0][0] = decoder->b_motion.pmv[0][1] = 0;
+    decoder->b_motion.pmv[1][0] = decoder->b_motion.pmv[1][1] = 0;
+    /* optional 3-bit extension is added to the row code from bit 7 upwards */
+    if (decoder->vertical_position_extension) {
+	code += UBITS (bit_buf, 3) << 7;
+	DUMPBITS (bit_buf, bits, 3);
+    }
+    decoder->v_offset = (code - 1) * 16;
+    offset = 0;
+    if (!(decoder->convert) || decoder->coding_type != B_TYPE)
+	offset = (code - 1) * decoder->slice_stride;
+    /* offset stays 0 when convert is set and the picture is B-type; chroma offset is scaled by chroma_format */
+    decoder->dest[0] = decoder->picture_dest[0] + offset;
+    offset >>= (2 - decoder->chroma_format);
+    decoder->dest[1] = decoder->picture_dest[1] + offset;
+    decoder->dest[2] = decoder->picture_dest[2] + offset;
+
+    get_quantizer_scale (decoder);
+
+    /* ignore intra_slice and all the extra data */
+    while (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 9);
+	NEEDBITS (bit_buf, bits, bit_ptr);
+    }
+    /* the 0 bit that ended the loop above is still on top: windows are 6/12 bits wide and len + 1 bits are dumped */
+    /* decode initial macroblock address increment */
+    offset = 0;
+    while (1) {
+	if (bit_buf >= 0x08000000) {
+	    mba = MBA_5 + (UBITS (bit_buf, 6) - 2);
+	    break;
+	} else if (bit_buf >= 0x01800000) {
+	    mba = MBA_11 + (UBITS (bit_buf, 12) - 24);
+	    break;
+	} else switch (UBITS (bit_buf, 12)) {
+	case 8:		/* macroblock_escape */
+	    offset += 33;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	case 15:	/* macroblock_stuffing (MPEG1 only) */
+	    bit_buf &= 0xfffff;	/* so the bit left on top after the dump is 0 */
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	default:	/* error */
+	    return 1;
+	}
+    }
+    DUMPBITS (bit_buf, bits, mba->len + 1);
+    decoder->offset = (offset + mba->mba) << 4;
+    /* an address past the row end is folded into the following rows */
+    while (decoder->offset - decoder->width >= 0) {
+	decoder->offset -= decoder->width;
+	if (!(decoder->convert) || decoder->coding_type != B_TYPE) {
+	    decoder->dest[0] += decoder->slice_stride;
+	    decoder->dest[1] += decoder->slice_uv_stride;
+	    decoder->dest[2] += decoder->slice_uv_stride;
+	}
+	decoder->v_offset += 16;
+    }
+    if (decoder->v_offset > decoder->limit_y)
+	return 1;
+
+    return 0;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one slice: `code' selects the macroblock row (see slice_init), `buffer' holds the slice's bitstream bytes. */
+void mpeg2_slice (mpeg2_decoder_t * const decoder, const int code,
+		  const uint8_t * const buffer)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    cpu_state_t cpu_state;
+
+    bitstream_init (decoder, buffer);
+    /* nonzero: slice_init rejected the slice, nothing to decode */
+    if (slice_init (decoder, code))
+	return;
+    /* optional hook, paired with mpeg2_cpu_state_restore on every return below */
+    if (mpeg2_cpu_state_save)
+	mpeg2_cpu_state_save (&cpu_state);
+    /* one iteration per coded macroblock, plus the skipped ones that follow it */
+    while (1) {
+	int macroblock_modes;
+	int mba_inc;
+	const MBAtab * mba;
+
+	NEEDBITS (bit_buf, bits, bit_ptr);
+
+	macroblock_modes = get_macroblock_modes (decoder);
+
+	/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
+	if (macroblock_modes & MACROBLOCK_QUANT)
+	    get_quantizer_scale (decoder);
+
+	if (macroblock_modes & MACROBLOCK_INTRA) {
+	    /* intra macroblock: every block goes through slice_intra_DCT, no motion compensation */
+	    int DCT_offset, DCT_stride;
+	    int offset;
+	    uint8_t * dest_y;
+
+	    if (decoder->concealment_motion_vectors) {
+		if (decoder->picture_structure == FRAME_PICTURE)
+		    motion_fr_conceal (decoder);
+		else
+		    motion_fi_conceal (decoder);
+	    } else {
+		decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+		decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
+		decoder->b_motion.pmv[0][0] = decoder->b_motion.pmv[0][1] = 0;
+		decoder->b_motion.pmv[1][0] = decoder->b_motion.pmv[1][1] = 0;
+	    }
+	    /* interlaced DCT: the two block rows interleave line by line; otherwise the second starts 8 lines down */
+	    if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		DCT_offset = decoder->stride;
+		DCT_stride = decoder->stride * 2;
+	    } else {
+		DCT_offset = decoder->stride * 8;
+		DCT_stride = decoder->stride;
+	    }
+
+	    offset = decoder->offset;
+	    dest_y = decoder->dest[0] + offset;
+	    slice_intra_DCT (decoder, 0, dest_y, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + 8, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset + 8, DCT_stride);
+	    if (likely (decoder->chroma_format == 0)) {
+		slice_intra_DCT (decoder, 1, decoder->dest[1] + (offset >> 1),
+				 decoder->uv_stride);
+		slice_intra_DCT (decoder, 2, decoder->dest[2] + (offset >> 1),
+				 decoder->uv_stride);
+		if (decoder->coding_type == D_TYPE) {
+		    NEEDBITS (bit_buf, bits, bit_ptr);
+		    DUMPBITS (bit_buf, bits, 1);
+		}
+	    } else if (likely (decoder->chroma_format == 1)) {
+		uint8_t * dest_u = decoder->dest[1] + (offset >> 1);
+		uint8_t * dest_v = decoder->dest[2] + (offset >> 1);
+		DCT_stride >>= 1;
+		DCT_offset >>= 1;
+		slice_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset, DCT_stride);
+	    } else {
+		uint8_t * dest_u = decoder->dest[1] + offset;
+		uint8_t * dest_v = decoder->dest[2] + offset;
+		slice_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + 8, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + 8, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset + 8,
+				 DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset + 8,
+				 DCT_stride);
+	    }
+	} else {
+	    /* non-intra: motion compensation first, then the blocks flagged in the coded block pattern */
+	    motion_parser_t * parser;
+
+	    parser =
+		decoder->motion_parser[macroblock_modes >> MOTION_TYPE_SHIFT];
+	    MOTION_CALL (parser, macroblock_modes);
+
+	    if (macroblock_modes & MACROBLOCK_PATTERN) {
+		unsigned int coded_block_pattern;	/* unsigned: extra pattern bits live in bits 26..31 */
+		int DCT_offset, DCT_stride;
+
+		if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		    DCT_offset = decoder->stride;
+		    DCT_stride = decoder->stride * 2;
+		} else {
+		    DCT_offset = decoder->stride * 8;
+		    DCT_stride = decoder->stride;
+		}
+
+		coded_block_pattern = get_coded_block_pattern (decoder);
+
+		if (likely (decoder->chroma_format == 0)) {
+		    int offset = decoder->offset;
+		    uint8_t * dest_y = decoder->dest[0] + offset;
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + (offset >> 1),
+					     decoder->uv_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + (offset >> 1),
+					     decoder->uv_stride);
+		} else if (likely (decoder->chroma_format == 1)) {
+		    int offset;
+		    uint8_t * dest_y;
+		    /* two more pattern bits, kept where they sit at the top of bit_buf */
+		    coded_block_pattern |= bit_buf & (3U << 30);
+		    DUMPBITS (bit_buf, bits, 2);
+
+		    offset = decoder->offset;
+		    dest_y = decoder->dest[0] + offset;
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+		    /* chroma: half the row pitch; DCT_offset now also folds in the column offset */
+		    DCT_stride >>= 1;
+		    DCT_offset = (DCT_offset + offset) >> 1;
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + (offset >> 1),
+					     DCT_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + (offset >> 1),
+					     DCT_stride);
+		    if (coded_block_pattern & (2U << 30))
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (1U << 30))
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + DCT_offset,
+					     DCT_stride);
+		} else {
+		    int offset;
+		    uint8_t * dest_y, * dest_u, * dest_v;
+		    /* six more pattern bits, kept where they sit at the top of bit_buf */
+		    coded_block_pattern |= bit_buf & (63U << 26);
+		    DUMPBITS (bit_buf, bits, 6);
+
+		    offset = decoder->offset;
+		    dest_y = decoder->dest[0] + offset;
+		    dest_u = decoder->dest[1] + offset;
+		    dest_v = decoder->dest[2] + offset;
+
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		    if (coded_block_pattern & (32U << 26))
+			slice_non_intra_DCT (decoder, 1, dest_u + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (16U << 26))
+			slice_non_intra_DCT (decoder, 2, dest_v + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (8U << 26))
+			slice_non_intra_DCT (decoder, 1, dest_u + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (4U << 26))
+			slice_non_intra_DCT (decoder, 2, dest_v + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (2U << 26))
+			slice_non_intra_DCT (decoder, 1,
+					     dest_u + DCT_offset + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (1U << 26))
+			slice_non_intra_DCT (decoder, 2,
+					     dest_v + DCT_offset + 8,
+					     DCT_stride);
+		}
+	    }
+	    /* a non-intra macroblock resets the DC predictors */
+	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+		decoder->dc_dct_pred[2] = 16384;
+	}
+
+	NEXT_MACROBLOCK;
+	/* address increment: mba_inc ends up as the number of macroblocks skipped before the next coded one */
+	NEEDBITS (bit_buf, bits, bit_ptr);
+	mba_inc = 0;
+	while (1) {
+	    if (bit_buf >= 0x10000000) {
+		mba = MBA_5 + (UBITS (bit_buf, 5) - 2);
+		break;
+	    } else if (bit_buf >= 0x03000000) {
+		mba = MBA_11 + (UBITS (bit_buf, 11) - 24);
+		break;
+	    } else switch (UBITS (bit_buf, 11)) {
+	    case 8:		/* macroblock_escape */
+		mba_inc += 33;
+		/* pass through */
+	    case 15:	/* macroblock_stuffing (MPEG1 only) */
+		DUMPBITS (bit_buf, bits, 11);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		continue;
+	    default:	/* end of slice, or error */
+		if (mpeg2_cpu_state_restore)
+		    mpeg2_cpu_state_restore (&cpu_state);
+		return;
+	    }
+	}
+	DUMPBITS (bit_buf, bits, mba->len);
+	mba_inc += mba->mba;
+
+	if (mba_inc) {
+	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+		decoder->dc_dct_pred[2] = 16384;
+	    /* skipped in a P picture: motion_parser[0], forward only; otherwise motion_parser[4] with the last modes */
+	    if (decoder->coding_type == P_TYPE) {
+		do {
+		    MOTION_CALL (decoder->motion_parser[0],
+				 MACROBLOCK_MOTION_FORWARD);
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    } else {
+		do {
+		    MOTION_CALL (decoder->motion_parser[4], macroblock_modes);
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    }
+	}
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/uyvy.c b/src/video_dec/libmpeg2new/libmpeg2/uyvy.c
new file mode 100644
index 000000000..7f107ffad
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/uyvy.c
@@ -0,0 +1,123 @@
+/*
+ * uyvy.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Regis Duchesne <hpreg@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "mpeg2convert.h"
+
+typedef struct {
+    int width;		/* copied from seq->width by mpeg2convert_uyvy */
+    int stride;		/* set per picture by uyvy_start: width, doubled for a single-field picture */
+    int chroma420;	/* 1 when seq->chroma_height < seq->height, else 0 */
+    uint8_t * out;	/* where uyvy_copy starts writing the current picture */
+} convert_uyvy_t;
+
+/* per-picture setup: record the output buffer and the row step for this picture */
+static void uyvy_start (void * _id, const mpeg2_fbuf_t * fbuf,
+			const mpeg2_picture_t * picture,
+			const mpeg2_gop_t * gop)
+{
+    convert_uyvy_t * const conv = (convert_uyvy_t *) _id;
+    conv->out = fbuf->buf[0];
+    conv->stride = conv->width;
+    if (picture->nb_fields != 1)
+	return;		/* not a single-field picture: stride stays equal to width */
+    if ((picture->flags & PIC_FLAG_TOP_FIELD_FIRST) == 0)
+	conv->out += 2 * conv->stride;	/* begin 2 * width bytes into the buffer */
+    conv->stride *= 2;	/* single field: double the row step */
+}
+
+#ifdef WORDS_BIGENDIAN	/* either way a 32-bit store of PACK(a,b,c,d) puts a,b,c,d in memory in that order */
+#define PACK(a,b,c,d) (((uint32_t) (a) << 24) | ((b) << 16) | ((c) << 8) | (d))
+#else	/* uint32_t cast: a byte >= 0x80 promoted to int must not be shifted into the sign bit */
+#define PACK(a,b,c,d) (((uint32_t) (d) << 24) | ((c) << 16) | ((b) << 8) | (a))
+#endif	/* WORDS_BIGENDIAN */
+
+/* Pack a 16-row band of planar Y, U, V (src[0..2]) into the UYVY output buffer. */
+/* The band is written starting at output row v_offset. */
+/* Each row is handled in blocks of 16 pixels (width >> 4 of them). */
+static void uyvy_copy (void * const _id, uint8_t * const * src,
+		       const unsigned int v_offset)
+{
+    const convert_uyvy_t * const id = (convert_uyvy_t *) _id;
+    const uint8_t * luma = src[0];
+    const uint8_t * u = src[1];
+    const uint8_t * v = src[2];
+    uint8_t * out_row = id->out + 2 * id->stride * v_offset;
+    int rows_left, blocks_left, k;
+
+    for (rows_left = 15; rows_left >= 0; rows_left--) {
+	uint32_t * quad = (uint32_t *) out_row;
+
+	/* 16 pixels per pass: 8 words, each holding U, Y, V, Y */
+	blocks_left = id->width >> 4;
+	do {
+	    for (k = 0; k < 8; k++)
+		quad[k] = PACK (u[k], luma[2 * k], v[k], luma[2 * k + 1]);
+	    luma += 16;
+	    u += 8;
+	    v += 8;
+	    quad += 8;
+	} while (--blocks_left);
+
+	/* rewind to the row start, then step luma and output down one row */
+	luma += id->stride - id->width;
+	u -= id->width >> 1;
+	v -= id->width >> 1;
+
+	out_row += 2 * id->stride;
+
+	/* chroma steps every row, or only when rows_left is even if chroma420 is set */
+	if ((rows_left & id->chroma420) == 0) {
+	    u += id->stride >> 1;
+	    v += id->stride >> 1;
+	}
+    }
+}
+
+/* Converter entry point: reports the instance size when _id is null, else sets it up and fills *result. */
+int mpeg2convert_uyvy (int stage, void * _id, const mpeg2_sequence_t * seq,
+		       int stride, uint32_t accel, void * arg,
+		       mpeg2_convert_init_t * result)
+{
+    convert_uyvy_t * const conv = (convert_uyvy_t *) _id;
+
+    if (seq->width == seq->chroma_width)
+	return 1;	/* chroma as wide as luma is refused */
+    if (!conv) {
+	result->id_size = sizeof (convert_uyvy_t);
+	return 0;
+    }
+    conv->width = seq->width;
+    conv->chroma420 = (seq->height > seq->chroma_height);
+    result->buf_size[0] = seq->width * seq->height * 2;
+    result->buf_size[2] = 0;
+    result->buf_size[1] = 0;
+    result->start = uyvy_start;
+    result->copy = uyvy_copy;
+    return 0;
+}
diff --git a/src/video_dec/libmpeg2new/libmpeg2/vlc.h b/src/video_dec/libmpeg2new/libmpeg2/vlc.h
new file mode 100644
index 000000000..57448ce04
--- /dev/null
+++ b/src/video_dec/libmpeg2new/libmpeg2/vlc.h
@@ -0,0 +1,429 @@
+/*
+ * vlc.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define GETWORD(bit_buf,shift,bit_ptr)	/* OR the next 16 stream bits into bit_buf */ \
+do {		/* bit_ptr[0] is the high byte; the pair lands (shift) bits up */ \
+    bit_buf |= ((bit_ptr[0] << 8) | bit_ptr[1]) << (shift);	\
+    bit_ptr += 2;		/* step over the two bytes just read */	\
+} while (0)
+
+static inline void bitstream_init (mpeg2_decoder_t * decoder, const uint8_t * start)
+{   /* Prime the decoder's bit buffer with the four bytes at start, first byte highest. */
+    /* shift as uint32_t: start[0] << 24 on the promoted int is undefined for bytes >= 0x80 */
+    decoder->bitstream_buf = ((uint32_t) start[0] << 24) | ((uint32_t) start[1] << 16) |
+	((uint32_t) start[2] << 8) | start[3];
+    decoder->bitstream_ptr = start + 4;	/* the first four bytes are now buffered */
+    decoder->bitstream_bits = -16;	/* not > 0, so NEEDBITS will not refill yet */
+}
+
+/* make sure that there are at least 16 valid bits in bit_buf: refill only when bits > 0 */
+#define NEEDBITS(bit_buf,bits,bit_ptr)		\
+do {						\
+    if (unlikely (bits > 0)) {			\
+	GETWORD (bit_buf, bits, bit_ptr);	\
+	bits -= 16;	/* 16 fresh bits were just added */ \
+    }						\
+} while (0)
+
+/* remove num valid bits from bit_buf; bits grows by num, which is what NEEDBITS tests */
+#define DUMPBITS(bit_buf,bits,num)	\
+do {					\
+    bit_buf <<= (num);			\
+    bits += (num);			\
+} while (0)
+
+/* take num bits from the high part of bit_buf and zero extend them (num == 0 would shift by 32: undefined) */
+#define UBITS(bit_buf,num) (((uint32_t)(bit_buf)) >> (32 - (num)))
+
+/* take num bits from the high part of bit_buf and sign extend them (needs an arithmetic >> on int32_t) */
+#define SBITS(bit_buf,num) (((int32_t)(bit_buf)) >> (32 - (num)))
+
+typedef struct {
+    uint8_t modes;	/* OR of MACROBLOCK_* flags, as built in MB_I / MB_P / MB_B */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} MBtab;
+
+typedef struct {
+    uint8_t delta;	/* value stored in MV_4 / MV_10 entries */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} MVtab;
+
+typedef struct {
+    int8_t dmv;		/* signed: DMV_2 holds 0, 1 and -1 */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} DMVtab;
+
+typedef struct {
+    uint8_t cbp;	/* value stored in CBP_7 / CBP_9 entries (0x00..0x3f there) */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} CBPtab;
+
+typedef struct {
+    uint8_t size;	/* value stored in DC_lum_5 / DC_chrom_5 / DC_long entries */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} DCtab;
+
+typedef struct {
+    uint8_t run;	/* first field of every DCT_* entry */
+    uint8_t level;	/* second field of every DCT_* entry */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} DCTtab;
+
+typedef struct {
+    uint8_t mba;	/* value stored in MBA_5 / MBA_11 entries */
+    uint8_t len;	/* presumably the code length in bits -- confirm in the lookup code */
+} MBAtab;
+
+
+#define INTRA MACROBLOCK_INTRA
+#define QUANT MACROBLOCK_QUANT
+/* 2 entries of {modes, len}.  NOTE(review): presumably indexed by the next stream bit -- lookup code not in view. */
+static const MBtab MB_I [] = {
+    {INTRA|QUANT, 2}, {INTRA, 1}
+};
+
+#define MC MACROBLOCK_MOTION_FORWARD
+#define CODED MACROBLOCK_PATTERN
+/* 32 entries; an entry with a smaller len is repeated in a run (len 1 fills 16 slots, len 2 eight, len 3 four). */
+static const MBtab MB_P [] = {
+    {INTRA|QUANT, 6}, {CODED|QUANT, 5}, {MC|CODED|QUANT, 5}, {INTRA,    5},
+    {MC,          3}, {MC,          3}, {MC,             3}, {MC,       3},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1}
+};
+
+#define FWD MACROBLOCK_MOTION_FORWARD
+#define BWD MACROBLOCK_MOTION_BACKWARD
+#define INTER MACROBLOCK_MOTION_FORWARD|MACROBLOCK_MOTION_BACKWARD
+/* 64 entries.  INTER expands without parentheses; below it is only ever OR-ed with other flags. */
+static const MBtab MB_B [] = {
+    {0,                 6}, {INTRA|QUANT,       6},
+    {BWD|CODED|QUANT,   6}, {FWD|CODED|QUANT,   6},
+    {INTER|CODED|QUANT, 5}, {INTER|CODED|QUANT, 5},
+					{INTRA,       5}, {INTRA,       5},
+    {FWD,         4}, {FWD,         4}, {FWD,         4}, {FWD,         4},
+    {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}
+};
+
+#undef INTRA
+#undef QUANT
+#undef MC
+#undef CODED
+#undef FWD
+#undef BWD
+#undef INTER
+
+
+static const MVtab MV_4 [] = {	/* 8 entries of {delta, len}; lookup code not in view */
+    { 3, 6}, { 2, 4}, { 1, 3}, { 1, 3}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}
+};
+
+static const MVtab MV_10 [] = {	/* 48 entries of {delta, len}; the first 12 are all {0, 10} */
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10},
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, {15,10}, {14,10}, {13,10}, {12,10},
+    {11,10}, {10,10}, { 9, 9}, { 9, 9}, { 8, 9}, { 8, 9}, { 7, 9}, { 7, 9},
+    { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7},
+    { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7},
+    { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}
+};
+
+
+static const DMVtab DMV_2 [] = {	/* 4 entries of {dmv, len}; lookup code not in view */
+    { 0, 1}, { 0, 1}, { 1, 2}, {-1, 2}
+};
+
+
+static const CBPtab CBP_7 [] = {	/* 112 entries of {cbp, len}, len 7 down to 3; lookup code not in view */
+    {0x11, 7}, {0x12, 7}, {0x14, 7}, {0x18, 7},
+    {0x21, 7}, {0x22, 7}, {0x24, 7}, {0x28, 7},
+    {0x3f, 6}, {0x3f, 6}, {0x30, 6}, {0x30, 6},
+    {0x09, 6}, {0x09, 6}, {0x06, 6}, {0x06, 6},
+    {0x1f, 5}, {0x1f, 5}, {0x1f, 5}, {0x1f, 5},
+    {0x10, 5}, {0x10, 5}, {0x10, 5}, {0x10, 5},
+    {0x2f, 5}, {0x2f, 5}, {0x2f, 5}, {0x2f, 5},
+    {0x20, 5}, {0x20, 5}, {0x20, 5}, {0x20, 5},
+    {0x07, 5}, {0x07, 5}, {0x07, 5}, {0x07, 5},
+    {0x0b, 5}, {0x0b, 5}, {0x0b, 5}, {0x0b, 5},
+    {0x0d, 5}, {0x0d, 5}, {0x0d, 5}, {0x0d, 5},
+    {0x0e, 5}, {0x0e, 5}, {0x0e, 5}, {0x0e, 5},
+    {0x05, 5}, {0x05, 5}, {0x05, 5}, {0x05, 5},
+    {0x0a, 5}, {0x0a, 5}, {0x0a, 5}, {0x0a, 5},
+    {0x03, 5}, {0x03, 5}, {0x03, 5}, {0x03, 5},
+    {0x0c, 5}, {0x0c, 5}, {0x0c, 5}, {0x0c, 5},
+    {0x01, 4}, {0x01, 4}, {0x01, 4}, {0x01, 4},
+    {0x01, 4}, {0x01, 4}, {0x01, 4}, {0x01, 4},
+    {0x02, 4}, {0x02, 4}, {0x02, 4}, {0x02, 4},
+    {0x02, 4}, {0x02, 4}, {0x02, 4}, {0x02, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3}
+};
+
+static const CBPtab CBP_9 [] = {	/* 64 entries of {cbp, len}, len 9 or 8; the first two hold cbp 0 */
+    {0,    9}, {0x00, 9}, {0x39, 9}, {0x36, 9},
+    {0x37, 9}, {0x3b, 9}, {0x3d, 9}, {0x3e, 9},
+    {0x17, 8}, {0x17, 8}, {0x1b, 8}, {0x1b, 8},
+    {0x1d, 8}, {0x1d, 8}, {0x1e, 8}, {0x1e, 8},
+    {0x27, 8}, {0x27, 8}, {0x2b, 8}, {0x2b, 8},
+    {0x2d, 8}, {0x2d, 8}, {0x2e, 8}, {0x2e, 8},
+    {0x19, 8}, {0x19, 8}, {0x16, 8}, {0x16, 8},
+    {0x29, 8}, {0x29, 8}, {0x26, 8}, {0x26, 8},
+    {0x35, 8}, {0x35, 8}, {0x3a, 8}, {0x3a, 8},
+    {0x33, 8}, {0x33, 8}, {0x3c, 8}, {0x3c, 8},
+    {0x15, 8}, {0x15, 8}, {0x1a, 8}, {0x1a, 8},
+    {0x13, 8}, {0x13, 8}, {0x1c, 8}, {0x1c, 8},
+    {0x25, 8}, {0x25, 8}, {0x2a, 8}, {0x2a, 8},
+    {0x23, 8}, {0x23, 8}, {0x2c, 8}, {0x2c, 8},
+    {0x31, 8}, {0x31, 8}, {0x32, 8}, {0x32, 8},
+    {0x34, 8}, {0x34, 8}, {0x38, 8}, {0x38, 8}
+};
+
+
+static const DCtab DC_lum_5 [] = {	/* 31 entries of {size, len}: one short of 32, so index 31 must not be used */
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
+    {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}
+};
+
+static const DCtab DC_chrom_5 [] = {	/* 31 entries of {size, len}: one short of 32, so index 31 must not be used */
+    {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}
+};
+
+static const DCtab DC_long [] = {	/* 32 entries of {size, len}, sizes 6..11; lookup code not in view */
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, { 7, 6}, { 7, 6},
+    {8, 7}, {8, 7}, {8, 7}, {8, 7}, {9, 8}, {9, 8}, {10, 9}, {11, 9}
+};
+
+
+static const DCTtab DCT_16 [] = {	/* 32 entries of {run, level, len}; every len here is 0 */
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {  2,18, 0}, {  2,17, 0}, {  2,16, 0}, {  2,15, 0},
+    {  7, 3, 0}, { 17, 2, 0}, { 16, 2, 0}, { 15, 2, 0},
+    { 14, 2, 0}, { 13, 2, 0}, { 12, 2, 0}, { 32, 1, 0},
+    { 31, 1, 0}, { 30, 1, 0}, { 29, 1, 0}, { 28, 1, 0}
+};
+
+static const DCTtab DCT_15 [] = {	/* 48 entries of {run, level, len}, len 15 or 14 */
+    {  1,40,15}, {  1,39,15}, {  1,38,15}, {  1,37,15},
+    {  1,36,15}, {  1,35,15}, {  1,34,15}, {  1,33,15},
+    {  1,32,15}, {  2,14,15}, {  2,13,15}, {  2,12,15},
+    {  2,11,15}, {  2,10,15}, {  2, 9,15}, {  2, 8,15},
+    {  1,31,14}, {  1,31,14}, {  1,30,14}, {  1,30,14},
+    {  1,29,14}, {  1,29,14}, {  1,28,14}, {  1,28,14},
+    {  1,27,14}, {  1,27,14}, {  1,26,14}, {  1,26,14},
+    {  1,25,14}, {  1,25,14}, {  1,24,14}, {  1,24,14},
+    {  1,23,14}, {  1,23,14}, {  1,22,14}, {  1,22,14},
+    {  1,21,14}, {  1,21,14}, {  1,20,14}, {  1,20,14},
+    {  1,19,14}, {  1,19,14}, {  1,18,14}, {  1,18,14},
+    {  1,17,14}, {  1,17,14}, {  1,16,14}, {  1,16,14}
+};
+
+static const DCTtab DCT_13 [] = {	/* 48 entries of {run, level, len}, len 13 or 12 */
+    { 11, 2,13}, { 10, 2,13}, {  6, 3,13}, {  4, 4,13},
+    {  3, 5,13}, {  2, 7,13}, {  2, 6,13}, {  1,15,13},
+    {  1,14,13}, {  1,13,13}, {  1,12,13}, { 27, 1,13},
+    { 26, 1,13}, { 25, 1,13}, { 24, 1,13}, { 23, 1,13},
+    {  1,11,12}, {  1,11,12}, {  9, 2,12}, {  9, 2,12},
+    {  5, 3,12}, {  5, 3,12}, {  1,10,12}, {  1,10,12},
+    {  3, 4,12}, {  3, 4,12}, {  8, 2,12}, {  8, 2,12},
+    { 22, 1,12}, { 22, 1,12}, { 21, 1,12}, { 21, 1,12},
+    {  1, 9,12}, {  1, 9,12}, { 20, 1,12}, { 20, 1,12},
+    { 19, 1,12}, { 19, 1,12}, {  2, 5,12}, {  2, 5,12},
+    {  4, 3,12}, {  4, 3,12}, {  1, 8,12}, {  1, 8,12},
+    {  7, 2,12}, {  7, 2,12}, { 18, 1,12}, { 18, 1,12}
+};
+
+static const DCTtab DCT_B14_10 [] = {	/* 8 entries of {run, level, len}, all len 10 */
+    { 17, 1,10}, {  6, 2,10}, {  1, 7,10}, {  3, 3,10},
+    {  2, 4,10}, { 16, 1,10}, { 15, 1,10}, {  5, 2,10}
+};
+
+static const DCTtab DCT_B14_8 [] = {	/* 36 entries of {run, level, len}; the first 4 are { 65, 0,12} */
+    { 65, 0,12}, { 65, 0,12}, { 65, 0,12}, { 65, 0,12},
+    {  3, 2, 7}, {  3, 2, 7}, { 10, 1, 7}, { 10, 1, 7},
+    {  1, 4, 7}, {  1, 4, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6},
+    {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6},
+    {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    { 14, 1, 8}, {  1, 6, 8}, { 13, 1, 8}, { 12, 1, 8},
+    {  4, 2, 8}, {  2, 3, 8}, {  1, 5, 8}, { 11, 1, 8}
+};
+
+static const DCTtab DCT_B14AC_5 [] = {	/* 27 entries; short first row -- presumably indexed with an offset, confirm */
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}
+};
+
+static const DCTtab DCT_B14DC_5 [] = {	/* 27 entries; short first row -- presumably indexed with an offset, confirm */
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}
+};
+
+static const DCTtab DCT_B15_10 [] = {	/* 8 entries of {run, level, len}, len 9 or 10 */
+    {  6, 2, 9}, {  6, 2, 9}, { 15, 1, 9}, { 15, 1, 9},
+    {  3, 4,10}, { 17, 1,10}, { 16, 1, 9}, { 16, 1, 9}
+};
+
+static const DCTtab DCT_B15_8 [] = {	/* 252 entries of {run, level, len}; the first 4 are { 65, 0,12} */
+    { 65, 0,12}, { 65, 0,12}, { 65, 0,12}, { 65, 0,12},
+    {  8, 1, 7}, {  8, 1, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  7, 1, 7}, {  7, 1, 7}, {  3, 2, 7}, {  3, 2, 7},
+    {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6},
+    {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6},
+    {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    {  2, 5, 8}, { 12, 1, 8}, {  1,11, 8}, {  1,10, 8},
+    { 14, 1, 8}, { 13, 1, 8}, {  4, 2, 8}, {  2, 4, 8},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    { 10, 1, 7}, { 10, 1, 7}, {  2, 3, 7}, {  2, 3, 7},
+    { 11, 1, 7}, { 11, 1, 7}, {  1, 8, 7}, {  1, 8, 7},
+    {  1, 9, 7}, {  1, 9, 7}, {  1,12, 8}, {  1,13, 8},
+    {  3, 3, 8}, {  5, 2, 8}, {  1,14, 8}, {  1,15, 8}
+};
+
+
+static const MBAtab MBA_5 [] = {	/* 30 entries of {mba, len}; short first row -- presumably indexed with an offset, confirm */
+		    {6, 5}, {5, 5}, {4, 4}, {4, 4}, {3, 4}, {3, 4},
+    {2, 3}, {2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}
+};
+
+static const MBAtab MBA_11 [] = {	/* 104 entries of {mba, len}, mba 32 down to 7; lookup code not in view */
+    {32, 11}, {31, 11}, {30, 11}, {29, 11},
+    {28, 11}, {27, 11}, {26, 11}, {25, 11},
+    {24, 11}, {23, 11}, {22, 11}, {21, 11},
+    {20, 10}, {20, 10}, {19, 10}, {19, 10},
+    {18, 10}, {18, 10}, {17, 10}, {17, 10},
+    {16, 10}, {16, 10}, {15, 10}, {15, 10},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7}
+};
diff --git a/src/video_dec/libmpeg2new/xine_mpeg2new_decoder.c b/src/video_dec/libmpeg2new/xine_mpeg2new_decoder.c
new file mode 100644
index 000000000..2678168e2
--- /dev/null
+++ b/src/video_dec/libmpeg2new/xine_mpeg2new_decoder.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (C) 2000-2004 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
+ *
+ * stuff needed to turn libmpeg2 into a xine decoder plugin
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "./include/mpeg2.h"
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+
+/*
+#define LOG
+#define LOG_FRAME_ALLOC_FREE
+#define LOG_ENTRY
+#define LOG_FRAME_COUNTER
+*/
+
+#define _x_abort() do {} while (0)  /* empty statement: the _x_abort() calls in this file do not stop execution */
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* only member; filled in by init_plugin() */
+} mpeg2_class_t;
+
+typedef struct {
+  uint32_t id;       /* slot state: 0 = unused, 1 = set after get_frame(), 2 = set after draw() */
+  vo_frame_t * img;  /* frame recorded for this slot when the state was set to 1 */
+} img_state_t;
+
+typedef struct mpeg2_video_decoder_s {
+  video_decoder_t  video_decoder;  /* first member: callbacks cast video_decoder_t * to this struct */
+  mpeg2dec_t      *mpeg2dec;       /* handle returned by mpeg2_init() */
+  mpeg2_class_t   *class;
+  xine_stream_t   *stream;
+  int32_t         force_aspect;    /* from a BUF_SPECIAL_ASPECT buffer; non-zero overrides sequence pixel_width */
+  int             force_pan_scan;  /* written by decode_data; not read anywhere in this file */
+  double          ratio;           /* aspect ratio passed to get_frame() */
+  img_state_t     img_state[30];   /* frame tracking, indexed by the vo_frame_t id field */
+  uint32_t	  frame_number;    /* incremented once per get_frame() call */
+  uint32_t        rff_pattern;     /* repeat_first_field history, newest frame in bit 0 */
+
+} mpeg2_video_decoder_t;
+
+#ifndef LOG_FRAME_ALLOC_FREE
+inline static void mpeg2_video_print_bad_state(img_state_t * img_state) {}
+#else
+/* Debug helper: print every slot whose state is non-zero; more than 3 such slots calls _x_abort(). */
+static void mpeg2_video_print_bad_state(img_state_t * img_state) {
+  int32_t slot, in_use = 0;
+  for (slot = 0; slot < 30; slot++) {
+    if (img_state[slot].id == 0)
+      continue;
+    printf("%d = %u\n", slot, img_state[slot].id);
+    in_use++;
+  }
+  if (in_use > 3) _x_abort();
+  if (in_use == 0) printf("NO FRAMES\n");
+}
+#endif
+
+/* Release every frame whose slot state is non-zero and mark the slot unused again. */
+static void mpeg2_video_free_all(img_state_t * img_state) {
+  int32_t n;
+  printf("libmpeg2new:free_all\n");
+  for(n=0;n<30;n++) {
+    if (img_state[n].id>0) {
+      img_state[n].img->free(img_state[n].img);
+      img_state[n].img = NULL;  /* do not keep a pointer to a released frame */
+      img_state[n].id = 0;
+    }
+  }
+}
+
+
+/* Debug helper: print fbuf and, when it is set, the xine frame stored in fbuf->id. */
+static void mpeg2_video_print_fbuf(const mpeg2_fbuf_t * fbuf) {
+  const vo_frame_t * img;
+
+  printf("%p", (const void *) fbuf);  /* %p requires a pointer to void */
+  if (!fbuf) {
+    printf ("\n");
+    return;
+  }
+  img = (const vo_frame_t *) fbuf->id;
+  if (img)
+    printf (", img=%p, (id=%d)\n", (const void *) img, img->id);
+  else
+    printf (", img=NULL\n");
+}
+
+static void mpeg2_video_decode_data (video_decoder_t *this_gen, buf_element_t *buf_element) {
+  mpeg2_video_decoder_t *this = (mpeg2_video_decoder_t *) this_gen;
+  uint8_t * current = buf_element->content;
+  uint8_t * end = buf_element->content + buf_element->size;
+  const mpeg2_info_t * info;
+  mpeg2_state_t state;
+  vo_frame_t * img;
+  uint32_t picture_structure;
+  int32_t frame_skipping;
+  /* Hand one buffer to libmpeg2 and react to every state mpeg2_parse() reports for it. */
+  /* handle aspect hints from xine-dvdnav */
+  if (buf_element->decoder_flags & BUF_FLAG_SPECIAL) {
+    if (buf_element->decoder_info[1] == BUF_SPECIAL_ASPECT) {
+      this->force_aspect = buf_element->decoder_info[2];
+      if (buf_element->decoder_info[3] == 0x1 && buf_element->decoder_info[2] == 3)
+	/* letterboxing is denied, we have to do pan&scan */
+	this->force_pan_scan = 1;
+      else
+	this->force_pan_scan = 0;
+    }
+    /* a special buffer is never passed on to the parser */
+    return;
+  }
+  /* any other buffer with decoder_flags set is ignored as well */
+  if (buf_element->decoder_flags != 0) return;
+
+#ifdef LOG_ENTRY
+  printf ("libmpeg2: decode_data: enter\n");
+#endif
+
+  mpeg2_buffer (this->mpeg2dec, current, end);
+
+  info = mpeg2_info (this->mpeg2dec);
+  /* loop until the parser reports STATE_BUFFER */
+  while ((state = mpeg2_parse (this->mpeg2dec)) != STATE_BUFFER) {
+    switch (state) {
+      case STATE_SEQUENCE:
+        /* might set nb fbuf, convert format, stride */
+        /* might set fbufs */
+        _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_BITRATE,   info->sequence->byte_rate * 8);
+        _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_WIDTH,     info->sequence->picture_width);
+        _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HEIGHT,    info->sequence->picture_height);
+        _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION,  info->sequence->frame_period / 300);
+        if (this->force_aspect) ((mpeg2_sequence_t *)info->sequence)->pixel_width = this->force_aspect; /* ugly... */
+        switch (info->sequence->pixel_width) {
+	case 3:
+	  this->ratio = 16.0 / 9.0;
+	  break;
+	case 4:
+	  this->ratio = 2.11;
+	  break;
+	case 2:
+	  this->ratio = 4.0 / 3.0;
+	  break;
+	case 1:
+	default:
+	  this->ratio = (double)info->sequence->picture_width/(double)info->sequence->picture_height;
+	  break;
+        }
+        _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_RATIO, (int)(10000*this->ratio));
+
+        if (info->sequence->flags & SEQ_FLAG_MPEG2) {
+          _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "MPEG 2 (libmpeg2new)");
+        } else {
+          _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "MPEG 1 (libmpeg2new)");
+        }
+
+        break;
+      case STATE_PICTURE:
+        /* might skip */
+        /* might set fbuf */
+        if (info->current_picture->nb_fields == 1) {
+          picture_structure = info->current_picture->flags & PIC_FLAG_TOP_FIELD_FIRST ? VO_TOP_FIELD : VO_BOTTOM_FIELD;
+        } else {
+          picture_structure = VO_BOTH_FIELDS;
+        }
+        /* get an output frame from the stream's video_out for this picture */
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                              info->sequence->picture_width,
+                                              info->sequence->picture_height,
+                                              this->ratio,
+                                              XINE_IMGFMT_YV12,
+                                              picture_structure);
+        this->frame_number++;
+#ifdef LOG_FRAME_COUNTER
+        printf("libmpeg2:frame_number=%d\n",this->frame_number);
+#endif
+        img->top_field_first = info->current_picture->flags & PIC_FLAG_TOP_FIELD_FIRST ? 1 : 0;
+        img->repeat_first_field = (info->current_picture->nb_fields > 2) ? 1 : 0;
+        img->duration=info->sequence->frame_period / 300;
+        if( ((this->rff_pattern & 0xff) == 0xaa ||
+             (this->rff_pattern & 0xff) == 0x55) ) {
+          /* special case for ntsc 3:2 pulldown */
+            img->duration += img->duration/4;
+        } else {
+          if( img->repeat_first_field ) {
+            img->duration = (img->duration * info->current_picture->nb_fields) / 2;
+          }
+        }
+        /* NOTE(review): (flags & 7) == 1 is presumably the coding-type test for I pictures -- confirm in mpeg2.h */
+        if ((info->current_picture->flags & 7) == 1) {
+          img->pts=buf_element->pts; /* If an I frame, use PTS */
+        } else {
+          img->pts=0;
+        }
+
+        /* NOTE(review): img->id indexes img_state[30] below without a range check */
+#ifdef LOG_FRAME_ALLOC_FREE
+        printf ("libmpeg2:decode_data:get_frame xine=%p (id=%d)\n", img,img->id);
+#endif
+        if (this->img_state[img->id].id != 0) {
+          printf ("libmpeg2:decode_data:get_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id].id);
+          _x_abort();
+        }
+        /* state 1: frame fetched and handed to libmpeg2, not drawn yet */
+        this->img_state[img->id].id = 1;
+        this->img_state[img->id].img = img;
+
+        mpeg2_set_buf (this->mpeg2dec, img->base, img);
+        break;
+      case STATE_SLICE:
+      case STATE_END:
+#if 0
+    printf("libmpeg2:decode_data:current_fbuf=");
+    mpeg2_video_print_fbuf(info->current_fbuf);
+    printf("libmpeg2:decode_data:display_fbuf=");
+    mpeg2_video_print_fbuf(info->display_fbuf);
+    printf("libmpeg2:decode_data:discard_fbuf=");
+    mpeg2_video_print_fbuf(info->discard_fbuf);
+#endif
+        /* draw current picture */
+        /* might free frame buffer */
+        if (info->display_fbuf && info->display_fbuf->id) {
+          img = (vo_frame_t *) info->display_fbuf->id;
+          /* this should be used to detect any special rff pattern */
+          this->rff_pattern = this->rff_pattern << 1;
+          this->rff_pattern |= img->repeat_first_field;
+
+#ifdef LOG_FRAME_ALLOC_FREE
+          printf ("libmpeg2:decode_data:draw_frame xine=%p, fbuf=%p, id=%d \n", img, info->display_fbuf, img->id);
+#endif
+          if (this->img_state[img->id].id != 1) {
+            printf ("libmpeg2:decode_data:draw_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id].id);
+            _x_abort();
+          }
+          if (this->img_state[img->id].id == 1) {
+            frame_skipping = img->draw (img, this->stream);
+            /* FIXME: Handle skipping */
+            this->img_state[img->id].id = 2;
+          }
+          /* state 2: drawn; the frame is freed once it shows up as discard_fbuf */
+        }
+        if (info->discard_fbuf && !info->discard_fbuf->id) {
+          printf ("libmpeg2:decode_data:BAD free_frame discard: xine=%p, fbuf=%p\n", info->discard_fbuf->id, info->discard_fbuf);
+          //_x_abort();
+        }
+        if (info->discard_fbuf && info->discard_fbuf->id) {
+          img = (vo_frame_t *) info->discard_fbuf->id;
+#ifdef LOG_FRAME_ALLOC_FREE
+          printf ("libmpeg2:decode_data:free_frame xine=%p, fbuf=%p,id=%d\n", img, info->discard_fbuf, img->id);
+#endif
+          if (this->img_state[img->id].id != 2) {
+            printf ("libmpeg2:decode_data:free_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id].id);
+            _x_abort();
+          }
+          if (this->img_state[img->id].id == 2) {
+            img->free(img);
+            this->img_state[img->id].id = 0;
+          }
+        }
+#ifdef LOG_FRAME_ALLOC_FREE
+        mpeg2_video_print_bad_state(this->img_state);
+#endif
+        break;
+      case STATE_GOP:
+        break;
+      default:
+	printf("libmpeg2new: STATE unknown %d\n",state);
+        break;
+   }
+
+ }
+#ifdef LOG_ENTRY
+  printf ("libmpeg2: decode_data: exit\n");
+#endif
+
+}
+
+/* Flush callback: nothing is forwarded to libmpeg2; only the LOG_ENTRY trace runs. */
+static void mpeg2_video_flush (video_decoder_t *this_gen) {
+  (void) this_gen;  /* intentionally unused */
+
+#ifdef LOG_ENTRY
+  printf ("libmpeg2: flush\n");
+#endif
+  /* no libmpeg2 flush entry point is called from here */
+}
+
+static void mpeg2_video_reset (video_decoder_t *this_gen) {
+  mpeg2_video_decoder_t *this = (mpeg2_video_decoder_t *) this_gen;
+  int32_t state;
+  const mpeg2_info_t * info;
+  vo_frame_t * img;
+  int32_t frame_skipping;
+  /* Reset callback: the four locals after this are only used inside the #if 0 region. */
+#ifdef LOG_ENTRY
+  printf ("libmpeg2: reset\n");
+#endif
+  mpeg2_reset (this->mpeg2dec, 1); /* 1 for full reset */
+  mpeg2_video_free_all(this->img_state);
+
+  /* NOTE(review): the disabled region is stale -- it compares img_state[] entries (structs) with integers. */
+#if 0  /* This bit of code does not work yet. */
+  info = mpeg2_info (this->mpeg2dec);
+  state = mpeg2_reset (this->mpeg2dec);
+  printf("reset state1:%d\n",state);
+  if (info->display_fbuf && info->display_fbuf->id) {
+    img = (vo_frame_t *) info->display_fbuf->id;
+
+    if (this->img_state[img->id] != 1) {
+      printf ("libmpeg2:decode_data:draw_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id]);
+      _x_abort();
+    }
+    if (this->img_state[img->id] == 1) {
+      frame_skipping = img->draw (img, this->stream);
+      /* FIXME: Handle skipping */
+      this->img_state[img->id] = 2;
+    }
+  }
+
+  if (info->discard_fbuf && !info->discard_fbuf->id) {
+    printf ("libmpeg2:decode_data:BAD free_frame discard_fbuf=%p\n", info->discard_fbuf);
+    _x_abort();
+  }
+  if (info->discard_fbuf && info->discard_fbuf->id) {
+    img = (vo_frame_t *) info->discard_fbuf->id;
+    if (this->img_state[img->id] != 2) {
+      printf ("libmpeg2:decode_data:free_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id]);
+      _x_abort();
+    }
+    if (this->img_state[img->id] == 2) {
+      img->free(img);
+      this->img_state[img->id] = 0;
+    }
+  }
+  state = mpeg2_parse (this->mpeg2dec);
+  printf("reset state2:%d\n",state);
+  if (info->display_fbuf && info->display_fbuf->id) {
+    img = (vo_frame_t *) info->display_fbuf->id;
+
+    if (this->img_state[img->id] != 1) {
+      printf ("libmpeg2:decode_data:draw_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id]);
+      _x_abort();
+    }
+    if (this->img_state[img->id] == 1) {
+      frame_skipping = img->draw (img, this->stream);
+      /* FIXME: Handle skipping */
+      this->img_state[img->id] = 2;
+    }
+  }
+
+  if (info->discard_fbuf && !info->discard_fbuf->id) {
+    printf ("libmpeg2:decode_data:BAD free_frame discard_fbuf=%p\n", info->discard_fbuf);
+    _x_abort();
+  }
+  if (info->discard_fbuf && info->discard_fbuf->id) {
+    img = (vo_frame_t *) info->discard_fbuf->id;
+    if (this->img_state[img->id] != 2) {
+      printf ("libmpeg2:decode_data:free_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id]);
+      _x_abort();
+    }
+    if (this->img_state[img->id] == 2) {
+      img->free(img);
+      this->img_state[img->id] = 0;
+    }
+  }
+  state = mpeg2_parse (this->mpeg2dec);
+  printf("reset state3:%d\n",state);
+  if (info->display_fbuf && info->display_fbuf->id) {
+    img = (vo_frame_t *) info->display_fbuf->id;
+
+    if (this->img_state[img->id] != 1) {
+      printf ("libmpeg2:decode_data:draw_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id]);
+      _x_abort();
+    }
+    if (this->img_state[img->id] == 1) {
+      frame_skipping = img->draw (img, this->stream);
+      /* FIXME: Handle skipping */
+      this->img_state[img->id] = 2;
+    }
+  }
+
+  if (info->discard_fbuf && !info->discard_fbuf->id) {
+    printf ("libmpeg2:decode_data:BAD free_frame discard_fbuf=%p\n", info->discard_fbuf);
+    _x_abort();
+  }
+  if (info->discard_fbuf && info->discard_fbuf->id) {
+    img = (vo_frame_t *) info->discard_fbuf->id;
+    if (this->img_state[img->id] != 2) {
+      printf ("libmpeg2:decode_data:free_frame id=%d BAD STATE:%d\n", img->id, this->img_state[img->id]);
+      _x_abort();
+    }
+    if (this->img_state[img->id] == 2) {
+      img->free(img);
+      this->img_state[img->id] = 0;
+    }
+  }
+#endif
+
+}
+
+/* Discontinuity callback: nothing is forwarded to libmpeg2; only the LOG_ENTRY trace runs. */
+static void mpeg2_video_discontinuity (video_decoder_t *this_gen) {
+  (void) this_gen;  /* intentionally unused */
+
+#ifdef LOG_ENTRY
+  printf ("libmpeg2: dicontinuity\n");
+#endif
+}
+
+/* Tear down one decoder instance: libmpeg2 handle, tracked frames, video_out, then the struct. */
+static void mpeg2_video_dispose (video_decoder_t *this_gen) {
+  mpeg2_video_decoder_t *this = (mpeg2_video_decoder_t *) this_gen;
+
+#ifdef LOG_ENTRY
+  printf ("libmpeg2: close\n");
+#endif
+
+  mpeg2_close (this->mpeg2dec);
+  /* frames still tracked in img_state were never released; free them as the reset path does */
+  mpeg2_video_free_all (this->img_state);
+  this->stream->video_out->close(this->stream->video_out, this->stream);
+  free (this);
+}
+
+/* Create one decoder instance for stream.  Returns NULL when the instance or the
+ * libmpeg2 handle cannot be allocated. */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  /* calloc zero-fills: counters, force_* fields and every img_state slot start at 0 */
+  mpeg2_video_decoder_t *this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+  this->mpeg2dec = mpeg2_init ();
+  if (!this->mpeg2dec) {
+    free (this);
+    return NULL;
+  }
+  mpeg2_custom_fbuf (this->mpeg2dec, 1);  /* <- Force libmpeg2 to use xine frame buffers. */
+  this->video_decoder.decode_data         = mpeg2_video_decode_data;
+  this->video_decoder.flush               = mpeg2_video_flush;
+  this->video_decoder.reset               = mpeg2_video_reset;
+  this->video_decoder.discontinuity       = mpeg2_video_discontinuity;
+  this->video_decoder.dispose             = mpeg2_video_dispose;
+  this->stream                            = stream;
+  this->class                             = (mpeg2_class_t *) class_gen;
+
+  (stream->video_out->open) (stream->video_out, stream);
+  return &this->video_decoder;
+}
+
+/*
+ * mpeg2 plugin class
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+  /* Allocate and fill the decoder class object; returns NULL if the allocation fails. */
+  mpeg2_class_t *this = calloc(1, sizeof(*this));
+
+  (void) xine; (void) data;  /* not needed by this plugin */
+  if (!this)
+    return NULL;
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "mpeg2new";
+  this->decoder_class.description     = N_("mpeg2 based video decoder plugin");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+  return this;
+}
+/*
+ * exported plugin catalog entry
+ */
+
+static const uint32_t supported_types[] = { BUF_VIDEO_MPEG, 0 };  /* trailing 0: presumably the list terminator -- confirm */
+/* decoder_info_t is initialised positionally: the type list first, then the priority */
+static const decoder_info_t dec_info_mpeg2 = {
+  supported_types,     /* supported types */
+  6                    /* priority        */
+};
+/* one decoder entry for init_plugin, followed by a PLUGIN_NONE row as the last element */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "mpeg2new", XINE_VERSION_CODE, &dec_info_mpeg2, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/libvdpau/Makefile.am b/src/video_dec/libvdpau/Makefile.am
new file mode 100644
index 000000000..781001a04
--- /dev/null
+++ b/src/video_dec/libvdpau/Makefile.am
@@ -0,0 +1,42 @@
+include $(top_srcdir)/misc/Makefile.quiet
+include $(top_srcdir)/misc/Makefile.common
+
+AM_CFLAGS = $(DEFAULT_OCFLAGS) $(VISIBILITY_FLAG)
+AM_LDFLAGS = $(xineplug_ldflags)
+
+noinst_HEADERS = alterh264_decode.h alterh264_bits_reader.h bits_reader.h dpb.h cpb.h h264_parser.h nal.h
+
+if ENABLE_VDPAU
+vdpau_h264_module = xineplug_decode_vdpau_h264.la
+VDPAU_CFLAGS += -D_ISOC99_SOURCE
+
+vdpau_h264_alter_module = xineplug_decode_vdpau_h264_alter.la
+
+vdpau_mpeg12_module = xineplug_decode_vdpau_mpeg12.la
+
+vdpau_vc1_module = xineplug_decode_vdpau_vc1.la
+
+vdpau_mpeg4_module = xineplug_decode_vdpau_mpeg4.la
+endif
+
+xineplug_LTLIBRARIES = $(vdpau_h264_module) $(vdpau_h264_alter_module) $(vdpau_mpeg12_module) $(vdpau_vc1_module) $(vdpau_mpeg4_module)
+
+xineplug_decode_vdpau_h264_alter_la_SOURCES = alterh264_decode.c
+xineplug_decode_vdpau_h264_alter_la_CFLAGS = $(AM_CFLAGS) -fno-strict-aliasing
+xineplug_decode_vdpau_h264_alter_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS)
+
+xineplug_decode_vdpau_h264_la_SOURCES = nal.c dpb.c cpb.c h264_parser.c vdpau_h264.c
+xineplug_decode_vdpau_h264_la_CFLAGS = $(AM_CFLAGS) $(VDPAU_CFLAGS) -fno-strict-aliasing
+xineplug_decode_vdpau_h264_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS) -lm
+
+xineplug_decode_vdpau_mpeg12_la_SOURCES = vdpau_mpeg12.c
+xineplug_decode_vdpau_mpeg12_la_CFLAGS = $(AM_CFLAGS) -fno-strict-aliasing
+xineplug_decode_vdpau_mpeg12_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS)
+
+xineplug_decode_vdpau_vc1_la_SOURCES = vdpau_vc1.c
+xineplug_decode_vdpau_vc1_la_CFLAGS = $(AM_CFLAGS) -fno-strict-aliasing
+xineplug_decode_vdpau_vc1_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS)
+
+xineplug_decode_vdpau_mpeg4_la_SOURCES = vdpau_mpeg4.c
+xineplug_decode_vdpau_mpeg4_la_CFLAGS = $(AM_CFLAGS) -fno-strict-aliasing
+xineplug_decode_vdpau_mpeg4_la_LIBADD = $(XINE_LIB) $(DYNAMIC_LD_LIBS)
diff --git a/src/video_dec/libvdpau/alterh264_bits_reader.h b/src/video_dec/libvdpau/alterh264_bits_reader.h
new file mode 100644
index 000000000..47a26aca1
--- /dev/null
+++ b/src/video_dec/libvdpau/alterh264_bits_reader.h
@@ -0,0 +1,127 @@
+/* kate: tab-indent on; indent-width 4; mixedindent off; indent-mode cstyle; remove-trailing-space on; */
+#ifndef ALTERH264_BITS_READER_H
+#define ALTERH264_BITS_READER_H
+#include <sys/types.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+
+/* Cursor over a byte buffer that is consumed one bit at a time, most significant bit first. */
+typedef struct {
+  uint8_t *buffer, *start;	/* byte being read / first byte of the data */
+  int offbits, length, oflow;	/* bit index in *buffer (0 = MSB), data size in bytes, set once a byte step hits the end */
+} bits_reader_t;
+
+
+/* Attach the reader to buf (len bytes), rewind to the first bit, clear oflow. */
+static inline void
+bits_reader_set (bits_reader_t * br, uint8_t * buf, int len)
+{
+  br->buffer = br->start = buf;
+  br->offbits = 0;
+  br->length = len;
+  br->oflow = 0;
+}
+
+
+/* Bit distance from the read position to the last set bit of the data; 0 if none is found. */
+static inline uint32_t
+more_rbsp_data (bits_reader_t * br)
+{
+  uint8_t *p;
+  int pos;
+
+  /* walk back from the last byte to the current one, testing each byte LSB first */
+  for (p = br->start + br->length; --p >= br->buffer;)
+  {
+    for (pos = 7; pos >= 0; pos--)
+      if (*p & (0x80 >> pos))
+	return ((p - br->buffer) * 8) - br->offbits + pos;
+  }
+  return 0;
+}
+
+
+/* Step to the next byte and reset offbits; returns 0 and sets oflow at the end of the data. */
+static inline uint8_t
+bits_reader_shift (bits_reader_t * br)
+{
+  uint8_t *const last = br->start + br->length - 1;
+
+  br->offbits = 0;
+  if (br->buffer + 1 > last)
+    goto overflow;
+  br->buffer++;
+
+  /* a 0x03 after two zero bytes is stepped over (presumably the emulation prevention byte) */
+  if ((br->buffer[0] == 3) && ((br->buffer - br->start) > 2)
+      && (br->buffer[-2] == 0) && (br->buffer[-1] == 0))
+  {
+    if (br->buffer + 1 > last)
+      goto overflow;
+    br->buffer++;
+  }
+  return 1;
+
+overflow:
+  br->oflow = 1;
+  return 0;
+}
+
+
+/* Read nbits bits, most significant first, and return them as an unsigned value.
+ * When a byte step fails (end of data) the function returns 1, not the value. */
+static inline uint32_t
+read_bits (bits_reader_t * br, int nbits)
+{
+  uint32_t acc = 0;
+
+  for (; nbits; --nbits)
+  {
+    acc <<= 1;
+    if (*br->buffer & (0x80 >> br->offbits))
+      acc |= 1;
+    /* past bit 7: move on to the next byte */
+    if (++br->offbits > 7 && !bits_reader_shift (br))
+      return 1;
+  }
+  return acc;
+}
+
+
+/* Advance the read position by nbits bits; the result of the byte step is not checked. */
+static inline void
+skip_bits (bits_reader_t * br, int nbits)
+{
+  for (; nbits; nbits--)
+  {
+    br->offbits++;
+    if (br->offbits <= 7)
+      continue;
+    bits_reader_shift (br);
+  }
+}
+
+/* Unsigned Exp-Golomb value: count zero bits up to the first 1, then read that many suffix bits. */
+static inline uint32_t
+read_exp_ue (bits_reader_t * br)
+{
+  int leading = -1;
+  uint8_t b;
+
+  for (b = 0; !b; leading++)
+    b = read_bits (br, 1);
+  if (leading > 31)		/* invalid over-long prefix: saturate, shifting by >= 32 is undefined */
+    return 0xFFFFFFFFu;
+  return (1u << leading) - 1 + read_bits (br, leading);
+}
+
+/* Signed Exp-Golomb value: odd codes k give +(k + 1) / 2, even codes k give -(k / 2). */
+static inline int32_t
+read_exp_se (bits_reader_t * br)
+{
+  const uint32_t code = read_exp_ue (br);
+  const uint32_t magnitude = (code + (code & 0x01)) / 2;
+  return (code & 0x01) ? magnitude : -magnitude;
+}
+#endif /* ALTERH264_BITS_READER_H */
diff --git a/src/video_dec/libvdpau/alterh264_decode.c b/src/video_dec/libvdpau/alterh264_decode.c
new file mode 100644
index 000000000..f11162f3e
--- /dev/null
+++ b/src/video_dec/libvdpau/alterh264_decode.c
@@ -0,0 +1,2448 @@
+/* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; remove-trailing-space on;
+ * Copyright (C) 2008 the xine project
+ * Copyright (C) 2008 Christophe Thommeret <hftom@free.fr>
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * alterh264_decode.c, a H264 video stream parser using VDPAU hardware decoder
+ *
+ */
+
+#include "alterh264_decode.h"
+
+
+#define MAX_DPB_SIZE 16
+#define MIN_BUFFER_SIZE 10000
+#define MAX_BUFFER_SIZE 3145728
+
+#define NAL_UNSPECIFIED 0
+#define NAL_SLICE_NO_IDR 1
+#define NAL_SLICE_IDR 5
+#define NAL_SEI 6
+#define NAL_SEQUENCE 7
+#define NAL_PICTURE 8
+#define NAL_ACCES 9
+#define NAL_END_SEQUENCE 10
+#define NAL_END_STREAM 11
+#define NAL_SEQUENCE_EXT 13
+
+#define SLICE_TYPE_P 0
+#define SLICE_TYPE_B 1
+#define SLICE_TYPE_I 2
+#define SLICE_TYPE_SP 3
+#define SLICE_TYPE_SI 4
+
+#define START_IDR_FLAG 1000
+
+#define MAX_POC 2147483647
+
+#define DPB_DRAW_CLEAR   1
+#define DPB_DRAW_REFS   2
+#define DPB_DRAW_CURRENT 3
+
+//#define MAKE_DAT /* do NOT define this unless you know what you are doing */
+#ifdef MAKE_DAT
+static int nframes;
+static FILE *outfile;
+#endif
+
+
+/*-------- DPB -------------------------------------------*/
+/* Debug helper: write one line per used DPB entry to stderr. */
+static void
+dpb_print (sequence_t * sequence)
+{
+  int i;
+
+  for (i = 0; i < MAX_DPB_SIZE; i++)
+  {
+    dpb_frame_t *entry = sequence->dpb[i];
+    vo_frame_t *vo;
+    uint32_t surface;
+
+    if (!entry->used)
+      break;
+    vo = (vo_frame_t *) entry->videoSurface;
+    /* -1 (printed as unsigned) stands for an entry without a surface */
+    surface = vo ? ((vdpau_accel_t *) vo->accel_data)->surface : -1;
+    fprintf (stderr,
+	     "{ i:%d u:%d c:%d pn:%d-%d ir:%d-%d tpoc:%d bpoc:%d sf:%u }\n",
+	     i, entry->used, entry->completed, entry->PicNum[0],
+	     entry->PicNum[1], entry->is_reference[0], entry->is_reference[1],
+	     entry->TopFieldOrderCnt, entry->BottomFieldOrderCnt, surface);
+  }
+}
+
+
+/* Zero the pts of every DPB entry up to the first unused one, zero cur_pic's
+ * pts as well and set cur_pic.drop_pts. */
+static void
+dpb_clear_all_pts (sequence_t * sequence)
+{
+  int n;
+
+  for (n = 0; n < MAX_DPB_SIZE && sequence->dpb[n]->used; n++)
+  {
+    sequence->dpb[n]->pts = 0;
+  }
+
+  sequence->cur_pic.drop_pts = 1;
+  sequence->cur_pic.pts = 0;
+}
+
+/* Free the surface of every DPB entry and zero the entries; cur_pic's surface is
+ * freed too when neither is_reference flag is set (its pointer is left untouched). */
+static void
+dpb_reset (sequence_t * sequence)
+{
+  dpb_frame_t *cur = &sequence->cur_pic;
+  int n;
+
+  for (n = 0; n < MAX_DPB_SIZE; n++)
+  {
+    dpb_frame_t *entry = sequence->dpb[n];
+    if (entry->videoSurface)
+      entry->videoSurface->free (entry->videoSurface);
+    memset (entry, 0, sizeof (dpb_frame_t));
+  }
+  if (!cur->videoSurface || cur->is_reference[0] || cur->is_reference[1])
+    return;
+  cur->videoSurface->free (cur->videoSurface);
+}
+
+
+/* Free the surface of dpb[index], zero the entry and close the gap by moving later entries down. */
+static void
+dpb_remove (sequence_t * sequence, int index)
+{
+  lprintf ("|||||||||||||||||||||||||||||||||||||||| dbp_remove\n");
+  int i;
+  /* the zeroed entry is kept and put back into the array as an unused slot */
+  dpb_frame_t *frame = sequence->dpb[index];
+  if (frame->videoSurface)
+    frame->videoSurface->free (frame->videoSurface);
+  memset (frame, 0, sizeof (dpb_frame_t));
+  for (i = index; i < (MAX_DPB_SIZE - 1); i++)
+  {
+    sequence->dpb[i] = sequence->dpb[i + 1];
+    if (!sequence->dpb[i]->used)	/* reached the unused tail: park the zeroed entry right behind it */
+    {
+      sequence->dpb[i + 1] = frame;
+      break;
+    }
+  }
+  if (i == (MAX_DPB_SIZE - 1))	/* loop ran to the end: the zeroed entry becomes the last slot */
+    sequence->dpb[i] = frame;
+}
+
+
+/* Return the used DPB entry with the highest index, or NULL when no entry is used. */
+static dpb_frame_t *
+dpb_get_prev_ref (sequence_t * sequence)
+{
+  int n;
+
+  for (n = MAX_DPB_SIZE - 1; n >= 0; n--)
+  {
+    dpb_frame_t *entry = sequence->dpb[n];
+
+    if (entry->used)
+      return entry;
+  }
+  return NULL;
+}
+
+/* Draw the not yet drawn DPB frames whose POC (larger of top/bottom) is <= curpoc, lowest POC
+ * first; then draw and free cur_pic (DPB_DRAW_CURRENT) or call dpb_reset() (DPB_DRAW_CLEAR). */
+static void
+dpb_draw_frames (vdpau_h264_alter_decoder_t * this_gen, int32_t curpoc,
+		 int draw_mode)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  int i, index = 0;
+  int32_t poc, tpoc;
+  dpb_frame_t *frame;
+  /* each pass picks the undrawn frame with the smallest POC not above curpoc; stop when none is left */
+  while (index > -1)
+  {
+    index = -1;
+    poc = curpoc;
+    for (i = 0; i < MAX_DPB_SIZE; i++)
+    {
+      frame = seq->dpb[i];
+      if (!frame->used)
+	break;
+      tpoc =
+	(frame->TopFieldOrderCnt >
+	 frame->BottomFieldOrderCnt) ? frame->TopFieldOrderCnt : frame->
+	BottomFieldOrderCnt;
+      if (!frame->videoSurface->drawn && (tpoc <= poc))
+      {
+	poc = tpoc;
+	index = i;
+      }
+    }
+    if ((index > -1) && (poc <= curpoc))
+    {
+      //fprintf(stderr,"|||||||||||||||||||||||||||||||||||||||| dpb_draw_frame = %d\n", poc);
+      frame = seq->dpb[index];
+      frame->videoSurface->pts = frame->pts;
+      //fprintf(stderr,"H264 PTS = %llu\n", frame->pts);
+      frame->videoSurface->top_field_first = frame->top_field_first;
+      frame->videoSurface->draw (frame->videoSurface, this_gen->stream);
+      frame->videoSurface->drawn++;
+      if ((draw_mode != DPB_DRAW_CLEAR) && !frame->is_reference[0]
+	  && !frame->is_reference[1])
+	dpb_remove (seq, index);
+    }
+    else
+      index = -1;
+  }
+  /* drawn frames that are not references were removed above, except in DPB_DRAW_CLEAR mode */
+  if (draw_mode == DPB_DRAW_CURRENT)
+  {
+    //fprintf(stderr,"|||||||||||||||||||||||||||||||||||||||| dpb_draw_frame = %d\n", curpoc);
+    frame = &seq->cur_pic;
+    frame->videoSurface->pts = frame->pts;
+    //fprintf(stderr,"H264 PTS = %llu\n", frame->pts);
+    frame->videoSurface->top_field_first = frame->top_field_first;
+    frame->videoSurface->draw (frame->videoSurface, this_gen->stream);
+    frame->videoSurface->free (frame->videoSurface);
+  }
+  else if (draw_mode == DPB_DRAW_CLEAR)
+    dpb_reset (seq);
+}
+
+
+/* Look for the used DPB entry whose PicNum[0] or PicNum[1] equals pic_num. On a match
+ * its slot is stored in *index and the entry returned; otherwise NULL, *index untouched. */
+static dpb_frame_t *
+dpb_get_PicNum (sequence_t * sequence, int32_t pic_num, int *index)
+{
+  int n;
+
+  /* the scan stops at the first unused entry */
+  for (n = 0; n < MAX_DPB_SIZE && sequence->dpb[n]->used; n++)
+  {
+    dpb_frame_t *entry = sequence->dpb[n];
+
+    if ((entry->PicNum[0] != pic_num) && (entry->PicNum[1] != pic_num))
+      continue;
+    *index = n;
+    return entry;
+  }
+
+  return NULL;
+}
+
+/* Clear both reference flags of the DPB frame carrying picnum; remove it if it was already
+ * drawn, otherwise draw the pending frames up to its POC in DPB_DRAW_REFS mode. */
+static void
+dpb_mmc1 (vdpau_h264_alter_decoder_t * this_gen, int32_t picnum)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  dpb_frame_t *frame;
+  int32_t poc;
+  int index;
+
+  lprintf ("dpb_mmc1\n");
+
+  frame = dpb_get_PicNum (seq, picnum, &index);
+  if (!frame)
+    return;
+
+  frame->is_reference[0] = frame->is_reference[1] = 0;
+  if (frame->videoSurface->drawn)
+  {
+    dpb_remove (seq, index);
+    return;
+  }
+  poc = (frame->TopFieldOrderCnt > frame->BottomFieldOrderCnt) ? frame->TopFieldOrderCnt : frame->BottomFieldOrderCnt;
+  dpb_draw_frames (this_gen, poc, DPB_DRAW_REFS);
+}
+
+/* Store cur_pic in the DPB. A second field overwrites the used entry with the highest index;
+ * otherwise the picture goes to the first unused slot, after evicting one entry if the window is full. */
+static void
+dbp_append (vdpau_h264_alter_decoder_t * this_gen, int second_field)
+{
+  sequence_t *sequence = (sequence_t *) & this_gen->sequence;
+  int i, index = 0, refs = 0;
+  int32_t fnw = MAX_POC;
+  slice_param_t *sl = &sequence->slice_param;
+  pic_param_t *pic = sequence->pic_param[sl->pic_parameter_set_id];
+  seq_param_t *sp = sequence->seq_param[pic->seq_parameter_set_id];
+  dpb_frame_t *tmp = 0, *cur_pic = &sequence->cur_pic;
+  int max = sp->num_ref_frames ? sp->num_ref_frames : 1;
+  max = (max > MAX_DPB_SIZE) ? MAX_DPB_SIZE : max;
+  /* vo and accel are only used by the trace below */
+  vo_frame_t *vo = (vo_frame_t *) cur_pic->videoSurface;
+  vdpau_accel_t *accel = (vdpau_accel_t *) vo->accel_data;
+  lprintf
+    ("|||||||||||||||||||||||||||||||||||||||| dbp_append surface = %d\n",
+     accel->surface);
+  /* second field: copy cur_pic over the used entry with the highest index and give up the surface pointer */
+  if (second_field)
+  {
+    tmp = dpb_get_prev_ref (sequence);
+    if (tmp)
+    {
+      memcpy (tmp, cur_pic, sizeof (dpb_frame_t));
+      cur_pic->videoSurface = NULL;
+    }
+    else
+      fprintf (stderr, "OOPS, no frame to store the second field ?!\n");
+    return;
+  }
+  /* count the used entries and remember the one with the smallest FrameNumWrap */
+  for (i = 0; i < MAX_DPB_SIZE; i++)
+  {
+    if (!sequence->dpb[i]->used)
+      break;
+    if (sequence->dpb[i]->FrameNumWrap < fnw)
+    {
+      fnw = sequence->dpb[i]->FrameNumWrap;
+      index = i;
+    }
+    refs++;
+  }
+  /* window full: unmark that entry, then remove it or let dpb_draw_frames() output it */
+  if (refs >= max)
+  {
+    lprintf ("sliding window\n");
+    tmp = sequence->dpb[index],	/* NOTE(review): comma operator, same effect as ';' here */
+      tmp->is_reference[0] = tmp->is_reference[1] = 0;
+    if (tmp->videoSurface->drawn)
+      dpb_remove (sequence, index);
+    else
+      dpb_draw_frames (this_gen,
+		       (tmp->TopFieldOrderCnt >
+			tmp->BottomFieldOrderCnt) ? tmp->
+		       TopFieldOrderCnt : tmp->BottomFieldOrderCnt,
+		       DPB_DRAW_REFS);
+    /* entries may have moved: look up the first unused slot again */
+    for (i = 0; i < MAX_DPB_SIZE; i++)
+    {
+      if (!sequence->dpb[i]->used)
+	break;
+    }
+  }
+  /* frame pictures give up cur_pic's surface pointer after the copy; field pictures keep it */
+  if (i < MAX_DPB_SIZE)
+  {
+    memcpy (sequence->dpb[i], cur_pic, sizeof (dpb_frame_t));
+    if (!cur_pic->field_pic_flag)
+      cur_pic->videoSurface = NULL;
+  }
+}
+
+/*--------------------------------------------------------*/
+
+
+/* Zero the slice bookkeeping fields (slices_count and slice_mode). */
+static void
+reset_slices (sequence_t * sequence)
+{
+  sequence->slice_mode = 0;
+  sequence->slices_count = 0;
+}
+
+
+/* Put the sequence state back to its start values and empty the DPB. */
+static void
+reset_sequence (sequence_t * sequence)
+{
+  sequence->prevMMC5 = 0;
+  sequence->prevFrameNumOffset = 0;
+  sequence->prevFrameNum = 0;
+
+  sequence->start = -1;
+  sequence->bufseek = 0;
+  sequence->bufpos = 0;
+  sequence->pic_pts = 0;
+  sequence->chroma = 0;
+  sequence->startup_frame = 0;
+  sequence->color_standard = VDP_COLOR_STANDARD_ITUR_BT_601;
+  sequence->reset = VO_NEW_SEQUENCE_FLAG;
+  /* dpb_reset() still reads cur_pic, so cur_pic is wiped only afterwards */
+  reset_slices (sequence);
+  dpb_reset (sequence);
+  memset (&sequence->cur_pic, 0, sizeof (dpb_frame_t));
+}
+
+/* Set seq->ratio to coded_width / coded_height, scaled by the VUI aspect ratio when one is signalled. */
+static void
+set_ratio (sequence_t * seq, seq_param_t * sp)
+{
+  const vui_param_t *vui = &sp->vui;
+
+  if (seq->mode_frame && seq->ratio)
+    return;			/* mode_frame set and a ratio exists: leave it alone */
+  if (!seq->coded_height)
+    seq->coded_height = 1;	/* avoids dividing by zero below */
+  seq->ratio = (double) seq->coded_width / (double) seq->coded_height;
+  if (!vui->aspect_ratio_info)
+    return;
+
+  switch (vui->aspect_ratio_idc)
+  {
+  case ASPECT_1_1:		/* factor 1: ratio stays as computed */
+    break;
+  case ASPECT_12_11:
+    seq->ratio *= 12.0 / 11.0;
+    break;
+  case ASPECT_10_11:
+    seq->ratio *= 10.0 / 11.0;
+    break;
+  case ASPECT_16_11:
+    seq->ratio *= 16.0 / 11.0;
+    break;
+  case ASPECT_40_33:
+    seq->ratio *= 40.0 / 33.0;
+    break;
+  case ASPECT_24_11:
+    seq->ratio *= 24.0 / 11.0;
+    break;
+  case ASPECT_20_11:
+    seq->ratio *= 20.0 / 11.0;
+    break;
+  case ASPECT_32_11:
+    seq->ratio *= 32.0 / 11.0;
+    break;
+  case ASPECT_80_33:
+    seq->ratio *= 80.0 / 33.0;
+    break;
+  case ASPECT_18_11:
+    seq->ratio *= 18.0 / 11.0;
+    break;
+  case ASPECT_15_11:
+    seq->ratio *= 15.0 / 11.0;
+    break;
+  case ASPECT_64_33:
+    seq->ratio *= 64.0 / 33.0;
+    break;
+  case ASPECT_160_99:
+    seq->ratio *= 160.0 / 99.0;
+    break;
+  case ASPECT_4_3:
+    seq->ratio *= 4.0 / 3.0;
+    break;
+  case ASPECT_3_2:
+    seq->ratio *= 3.0 / 2.0;
+    break;
+  case ASPECT_2_1:
+    seq->ratio *= 2.0 / 1.0;
+    break;
+  case ASPECT_EXTENDED_SAR:
+    if (vui->sar_height)
+      seq->ratio *= (double) vui->sar_width / vui->sar_height;
+    break;
+  }
+}
+
+/* Read one scaling list of len entries from br into scaling_list (placed through the zigzag table).
+ * If the very first delta gives next_scale == 0, the default table selected by index is installed instead. */
+static void
+parse_scaling_list (bits_reader_t * br, uint8_t * scaling_list, int len,
+		    int index)
+{
+  int last_scale = 8;
+  int next_scale = 8;
+  int32_t delta_scale;
+  uint8_t use_default_scaling_matrix_flag = 0;
+  int i;
+  /* len == 64 selects the 8x8 scan order, anything else the 4x4 one */
+  const uint8_t *zigzag = (len == 64) ? zigzag_8x8 : zigzag_4x4;
+  /* once next_scale becomes 0 no more deltas are read and last_scale is repeated */
+  for (i = 0; i < len; i++)
+  {
+    if (next_scale != 0)
+    {
+      delta_scale = read_exp_se (br);
+      next_scale = (last_scale + delta_scale + 256) % 256;
+      if (i == 0 && next_scale == 0)
+      {
+	use_default_scaling_matrix_flag = 1;
+	break;
+      }
+    }
+    scaling_list[zigzag[i]] = last_scale =
+      (next_scale == 0) ? last_scale : next_scale;
+  }
+  /* index 0-2: 4x4 intra defaults, 3-5: 4x4 inter, 6: 8x8 intra, 7: 8x8 inter */
+  if (use_default_scaling_matrix_flag)
+  {
+    switch (index)
+    {
+    case 0:
+    case 1:
+    case 2:
+      {
+	for (i = 0; i < sizeof (default_4x4_intra); i++)
+	  scaling_list[zigzag_4x4[i]] = default_4x4_intra[i];
+	break;
+      }
+    case 3:
+    case 4:
+    case 5:
+      {
+	for (i = 0; i < sizeof (default_4x4_inter); i++)
+	  scaling_list[zigzag_4x4[i]] = default_4x4_inter[i];
+	break;
+      }
+    case 6:
+      {
+	for (i = 0; i < sizeof (default_8x8_intra); i++)
+	  scaling_list[zigzag_8x8[i]] = default_8x8_intra[i];
+	break;
+      }
+    case 7:
+      {
+	for (i = 0; i < sizeof (default_8x8_inter); i++)
+	  scaling_list[zigzag_8x8[i]] = default_8x8_inter[i];
+	break;
+      }
+    }
+  }
+}
+
+/* Fall-back for list i when it is not transmitted: lists 0, 3, 6 and 7 take the built-in
+ * default tables, lists 1, 2, 4 and 5 repeat the 4x4 list just before them (i - 1). */
+static void
+scaling_list_fallback_A (uint8_t * scaling_lists_4x4,
+			 uint8_t * scaling_lists_8x8, int i)
+{
+  int j;
+  switch (i)
+  {
+  case 0:
+    {
+      for (j = 0; j < sizeof (default_4x4_intra); j++)
+	scaling_lists_4x4[(i * 16) + zigzag_4x4[j]] = default_4x4_intra[j];
+      break;
+    }
+  case 3:
+    {
+      for (j = 0; j < sizeof (default_4x4_inter); j++)
+	scaling_lists_4x4[(i * 16) + zigzag_4x4[j]] = default_4x4_inter[j];
+      break;
+    }
+  case 1:
+  case 2:
+  case 4:
+  case 5:
+    /* copy exactly one 16-entry list: source and target are adjacent, a longer copy overlaps and overruns */
+    memcpy (&scaling_lists_4x4[i * 16], &scaling_lists_4x4[(i - 1) * 16], 16);
+    break;
+  case 6:
+    {
+      for (j = 0; j < sizeof (default_8x8_intra); j++)
+	scaling_lists_8x8[(i - 6) * 64 + zigzag_8x8[j]] =
+	  default_8x8_intra[j];
+      break;
+    }
+  case 7:
+    {
+      for (j = 0; j < sizeof (default_8x8_inter); j++)
+	scaling_lists_8x8[(i - 6) * 64 + zigzag_8x8[j]] =
+	  default_8x8_inter[j];
+      break;
+    }
+
+  }
+}
+
+
+/* Fall-back for PPS list i: lists 0, 3, 6 and 7 are copied from the SPS, lists 1, 2, 4
+ * and 5 repeat the previous 4x4 list of the PPS itself. Other values of i do nothing. */
+static void
+scaling_list_fallback_B (seq_param_t * sp, pic_param_t * pic, int i)
+{
+  if (i == 0 || i == 3)
+  {
+    /* same 4x4 list of the sequence parameter set */
+    memcpy (pic->scaling_lists_4x4[i], sp->scaling_lists_4x4[i],
+	    sizeof (pic->scaling_lists_4x4[i]));
+  }
+  else if (i > 0 && i < 6)
+  {
+    /* 1, 2, 4, 5: the list just before this one */
+    memcpy (pic->scaling_lists_4x4[i], pic->scaling_lists_4x4[i - 1],
+	    sizeof (pic->scaling_lists_4x4[i]));
+  }
+  else if (i == 6 || i == 7)
+  {
+    const int k = i - 6;	/* position within the 8x8 lists */
+
+    memcpy (pic->scaling_lists_8x8[k], sp->scaling_lists_8x8[k],
+	    sizeof (pic->scaling_lists_8x8[k]));
+  }
+}
+
+/* Parse the VUI fields from seq->br up to and including timing_info; later VUI fields are not read.
+ * Fills aspect ratio, colour and timing fields of *vui and may update seq->color_standard / seq->video_step. */
+static void
+vui_parameters (sequence_t * seq, vui_param_t * vui)
+{
+  bits_reader_t *br = &seq->br;
+
+  vui->aspect_ratio_info = read_bits (br, 1);
+  lprintf ("aspect_ratio_info_present_flag = %d\n", vui->aspect_ratio_info);
+  if (vui->aspect_ratio_info)
+  {
+    vui->aspect_ratio_idc = read_bits (br, 8);
+    lprintf ("aspect_ratio_idc = %d\n", vui->aspect_ratio_idc);
+    if (vui->aspect_ratio_idc == 255)	/* explicit sar_width / sar_height follow */
+    {
+      vui->sar_width = read_bits (br, 16);
+      lprintf ("sar_width = %d\n", vui->sar_width);
+      vui->sar_height = read_bits (br, 16);
+      lprintf ("sar_height = %d\n", vui->sar_height);
+    }
+  }
+  if (read_bits (br, 1))	/* overscan_info_present_flag */
+    skip_bits (br, 1);		/* overscan_appropriate_flag */
+  if (read_bits (br, 1))
+  {				/* video_signal_type_present_flag */
+    skip_bits (br, 3);		/* video_format */
+    skip_bits (br, 1);		/* video_full_range_flag */
+    vui->colour_desc = read_bits (br, 1);
+    lprintf ("colour_desc = %d\n", vui->colour_desc);
+    if (vui->colour_desc)
+    {
+      vui->colour_primaries = read_bits (br, 8);
+      lprintf ("colour_primaries = %d\n", vui->colour_primaries);
+      skip_bits (br, 8);	/* transfer_characteristics */
+      skip_bits (br, 8);	/* matrix_coefficients */
+      switch (vui->colour_primaries)	/* other values leave seq->color_standard unchanged */
+      {
+      case 1:
+	seq->color_standard = VDP_COLOR_STANDARD_ITUR_BT_709;
+	break;
+      case 6:
+      case 7:
+	seq->color_standard = VDP_COLOR_STANDARD_SMPTE_240M;
+	break;
+      }
+    }
+  }
+  if (read_bits (br, 1))
+  {				/* chroma_loc_info_present_flag */
+    read_exp_ue (br);		/* chroma_sample_loc_type_top_field */
+    read_exp_ue (br);		/* chroma_sample_loc_type_bottom_field */
+  }
+  vui->timing_info = read_bits (br, 1);
+  lprintf ("timing_info = %d\n", vui->timing_info);
+  if (vui->timing_info)
+  {
+    vui->num_units_in_tick = read_bits (br, 32);
+    lprintf ("num_units_in_tick = %u\n", vui->num_units_in_tick);
+    vui->time_scale = read_bits (br, 32);
+    lprintf ("time_scale = %u\n", vui->time_scale);
+    if (vui->time_scale > 0)	/* guards the division; video_step is left alone otherwise */
+      seq->video_step =
+	180000. * (double) vui->num_units_in_tick / (double) vui->time_scale;
+  }
+}
+
+/* Parse a sequence parameter set from seq->br into seq->seq_param[id] (allocated on first use) and
+ * derive seq->profile, seq->coded_width / coded_height and, through set_ratio(), seq->ratio. */
+static void
+seq_parameter_set_data (vdpau_h264_alter_decoder_t * this_gen)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  seq_param_t *sp;
+  int i;
+  /* the leading fields are read into locals because the target SPS is only known once its id is read */
+  uint8_t profile_idc = read_bits (&seq->br, 8);
+  lprintf ("profile_idc = %d\n", profile_idc);
+  uint8_t constraint_set0_flag = read_bits (&seq->br, 1);
+  lprintf ("constraint_set0_flag = %d\n", constraint_set0_flag);
+  uint8_t constraint_set1_flag = read_bits (&seq->br, 1);
+  lprintf ("constraint_set1_flag = %d\n", constraint_set1_flag);
+  uint8_t constraint_set2_flag = read_bits (&seq->br, 1);
+  lprintf ("constraint_set2_flag = %d\n", constraint_set2_flag);
+  uint8_t constraint_set3_flag = read_bits (&seq->br, 1);
+  lprintf ("constraint_set3_flag = %d\n", constraint_set3_flag);
+  skip_bits (&seq->br, 4);
+  uint8_t level_idc = read_bits (&seq->br, 8);
+  lprintf ("level_idc = %d\n", level_idc);
+
+  uint8_t seq_parameter_set_id = read_exp_ue (&seq->br);
+  lprintf ("seq_parameter_set_id = %d\n", seq_parameter_set_id);
+  if (seq_parameter_set_id > 31)
+  {
+    lprintf ("OOPS : seq_parameter_set_id > 31 !!\n");
+    return;
+  }
+  /* an existing SPS with this id is reused and overwritten field by field */
+  if (!seq->seq_param[seq_parameter_set_id])
+    seq->seq_param[seq_parameter_set_id] =
+      (seq_param_t *) calloc (1, sizeof (seq_param_t));
+  if (!seq->seq_param[seq_parameter_set_id])
+  {
+    lprintf ("OOPS : can't allocate SPS %d !!\n", seq_parameter_set_id);
+    return;
+  }
+
+  sp = seq->seq_param[seq_parameter_set_id];
+  sp->profile_idc = profile_idc;
+  switch (profile_idc)
+  {
+  case 100:
+    seq->profile = VDP_DECODER_PROFILE_H264_HIGH;
+    break;
+  case 77:
+    seq->profile = VDP_DECODER_PROFILE_H264_MAIN;
+    break;
+  case 66:			/* nvidia's vdpau doesn't support baseline, force main */
+  default:
+    seq->profile = VDP_DECODER_PROFILE_H264_MAIN;
+  }
+  sp->constraint_set0_flag = constraint_set0_flag;
+  sp->constraint_set1_flag = constraint_set1_flag;
+  sp->constraint_set2_flag = constraint_set2_flag;
+  sp->constraint_set3_flag = constraint_set3_flag;
+  sp->level_idc = level_idc;
+  /* every scaling entry starts at 16 until a list is parsed or a fall-back is applied */
+  memset (&sp->scaling_lists_4x4, 16, sizeof (sp->scaling_lists_4x4));
+  memset (&sp->scaling_lists_8x8, 16, sizeof (sp->scaling_lists_8x8));
+  /* values used when the profile-specific fields below are not present */
+  sp->chroma_format_idc = 1;
+  sp->separate_colour_plane_flag = 0;
+  if (sp->profile_idc == 100 || sp->profile_idc == 110
+      || sp->profile_idc == 122 || sp->profile_idc == 244
+      || sp->profile_idc == 44 || sp->profile_idc == 83
+      || sp->profile_idc == 86)
+  {
+    sp->chroma_format_idc = read_exp_ue (&seq->br);
+    lprintf ("chroma_format_idc = %u\n", sp->chroma_format_idc);
+    if (sp->chroma_format_idc == 3)
+    {
+      sp->separate_colour_plane_flag = read_bits (&seq->br, 1);
+      lprintf ("separate_colour_plane_flag = %d\n",
+	       sp->separate_colour_plane_flag);
+    }
+    sp->bit_depth_luma_minus8 = read_exp_ue (&seq->br);
+    lprintf ("bit_depth_luma_minus8 = %u\n", sp->bit_depth_luma_minus8);
+    sp->bit_depth_chroma_minus8 = read_exp_ue (&seq->br);
+    lprintf ("bit_depth_chroma_minus8 = %u\n", sp->bit_depth_chroma_minus8);
+    sp->qpprime_y_zero_transform_bypass_flag = read_bits (&seq->br, 1);
+    lprintf ("qpprime_y_zero_transform_bypass_flag = %u\n",
+	     sp->qpprime_y_zero_transform_bypass_flag);
+    sp->seq_scaling_matrix_present_flag = read_bits (&seq->br, 1);
+    lprintf ("seq_scaling_matrix_present_flag = %u\n",
+	     sp->seq_scaling_matrix_present_flag);
+    if (sp->seq_scaling_matrix_present_flag)
+    {
+      for (i = 0; i < 8; i++)	/* lists 0-5 are 4x4, lists 6-7 are 8x8 */
+      {
+	int scaling_flag = read_bits (&seq->br, 1);
+	if (scaling_flag)
+	{
+	  if (i < 6)
+	    parse_scaling_list (&seq->br, &sp->scaling_lists_4x4[i][0], 16,
+				i);
+	  else
+	    parse_scaling_list (&seq->br, &sp->scaling_lists_8x8[i - 6][0],
+				64, i);
+	}
+	else
+	  scaling_list_fallback_A ((uint8_t *) sp->scaling_lists_4x4,
+				   (uint8_t *) sp->scaling_lists_8x8, i);
+      }
+    }
+  }
+  sp->log2_max_frame_num_minus4 = read_exp_ue (&seq->br);
+  lprintf ("log2_max_frame_num_minus4 = %u\n", sp->log2_max_frame_num_minus4);
+  sp->pic_order_cnt_type = read_exp_ue (&seq->br);
+  lprintf ("pic_order_cnt_type = %u\n", sp->pic_order_cnt_type);
+  if (sp->pic_order_cnt_type == 0)
+  {
+    sp->log2_max_pic_order_cnt_lsb_minus4 = read_exp_ue (&seq->br);
+    lprintf ("log2_max_pic_order_cnt_lsb_minus4 = %u\n",
+	     sp->log2_max_pic_order_cnt_lsb_minus4);
+  }
+  else if (sp->pic_order_cnt_type == 1)
+  {
+    sp->delta_pic_order_always_zero_flag = read_bits (&seq->br, 1);
+    lprintf ("delta_pic_order_always_zero_flag = %u\n",
+	     sp->delta_pic_order_always_zero_flag);
+    sp->offset_for_non_ref_pic = read_exp_se (&seq->br);
+    lprintf ("offset_for_non_ref_pic = %d\n", sp->offset_for_non_ref_pic);
+    sp->offset_for_top_to_bottom_field = read_exp_se (&seq->br);
+    lprintf ("offset_for_top_to_bottom_field = %d\n",
+	     sp->offset_for_top_to_bottom_field);
+    sp->num_ref_frames_in_pic_order_cnt_cycle = read_exp_ue (&seq->br);
+    lprintf ("num_ref_frames_in_pic_order_cnt_cycle = %u\n",
+	     sp->num_ref_frames_in_pic_order_cnt_cycle);
+    /* NOTE(review): the count is not checked against the size of offset_for_ref_frame here - confirm the bound */
+    for (i = 0; i < sp->num_ref_frames_in_pic_order_cnt_cycle; i++)
+    {
+      sp->offset_for_ref_frame[i] = read_exp_se (&seq->br);
+      lprintf ("offset_for_ref_frame[%d] = %d\n", i,
+	       sp->offset_for_ref_frame[i]);
+    }
+  }
+  sp->num_ref_frames = read_exp_ue (&seq->br);
+  if (sp->num_ref_frames > 16)	/* clamped to 16 */
+    sp->num_ref_frames = 16;
+  lprintf ("num_ref_frames = %u\n", sp->num_ref_frames);
+  sp->gaps_in_frame_num_value_allowed_flag = read_bits (&seq->br, 1);
+  lprintf ("gaps_in_frame_num_value_allowed_flag = %u\n",
+	   sp->gaps_in_frame_num_value_allowed_flag);
+  sp->pic_width_in_mbs_minus1 = read_exp_ue (&seq->br);
+  lprintf ("pic_width_in_mbs_minus1 = %u\n", sp->pic_width_in_mbs_minus1);
+  sp->pic_height_in_map_units_minus1 = read_exp_ue (&seq->br);
+  lprintf ("pic_height_in_map_units_minus1 = %u\n",
+	   sp->pic_height_in_map_units_minus1);
+  sp->frame_mbs_only_flag = read_bits (&seq->br, 1);
+  lprintf ("frame_mbs_only_flag = %u\n", sp->frame_mbs_only_flag);
+  /* 16 pixels per macroblock; the height is doubled when frame_mbs_only_flag is 0 */
+  seq->coded_width = (sp->pic_width_in_mbs_minus1 + 1) * 16;
+  seq->coded_height =
+    (2 - sp->frame_mbs_only_flag) * (sp->pic_height_in_map_units_minus1 +
+				     1) * 16;
+
+  if (!sp->frame_mbs_only_flag)
+  {
+    sp->mb_adaptive_frame_field_flag = read_bits (&seq->br, 1);
+    lprintf ("mb_adaptive_frame_field_flag = %u\n",
+	     sp->mb_adaptive_frame_field_flag);
+  }
+  sp->direct_8x8_inference_flag = read_bits (&seq->br, 1);
+  lprintf ("direct_8x8_inference_flag = %u\n", sp->direct_8x8_inference_flag);
+  sp->frame_cropping_flag = read_bits (&seq->br, 1);
+  lprintf ("frame_cropping_flag = %u\n", sp->frame_cropping_flag);
+  if (sp->frame_cropping_flag)
+  {
+    sp->frame_crop_left_offset = read_exp_ue (&seq->br);
+    lprintf ("frame_crop_left_offset = %u\n", sp->frame_crop_left_offset);
+    sp->frame_crop_right_offset = read_exp_ue (&seq->br);
+    lprintf ("frame_crop_right_offset = %u\n", sp->frame_crop_right_offset);
+    sp->frame_crop_top_offset = read_exp_ue (&seq->br);
+    lprintf ("frame_crop_top_offset = %u\n", sp->frame_crop_top_offset);
+    sp->frame_crop_bottom_offset = read_exp_ue (&seq->br);
+    lprintf ("frame_crop_bottom_offset = %u\n", sp->frame_crop_bottom_offset);
+    seq->coded_height -=	/* only the bottom crop offset is applied */
+      (2 - sp->frame_mbs_only_flag) * 2 * sp->frame_crop_bottom_offset;
+  }
+  if (seq->coded_height == 1088)	/* a height of 1088 is reported as 1080 */
+    seq->coded_height = 1080;
+  sp->vui_parameters_present_flag = read_bits (&seq->br, 1);
+  lprintf ("vui_parameters_present_flag = %u\n",
+	   sp->vui_parameters_present_flag);
+  if (sp->vui_parameters_present_flag)
+    vui_parameters (seq, &sp->vui);
+  set_ratio (seq, sp);
+}
+
+
+
+static void
+pic_parameter_set (vdpau_h264_alter_decoder_t * this_gen)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  pic_param_t *pic;
+  seq_param_t *sp;
+  int i;
+
+  uint8_t pic_parameter_set_id = read_exp_ue (&seq->br);
+  lprintf ("pic_parameter_set_id = %u\n", pic_parameter_set_id);
+  if (!seq->pic_param[pic_parameter_set_id])
+    seq->pic_param[pic_parameter_set_id] =
+      (pic_param_t *) calloc (1, sizeof (pic_param_t));
+  if (!seq->pic_param[pic_parameter_set_id])
+  {
+    lprintf ("OOPS : can't allocate PPS %d !!\n", pic_parameter_set_id);
+    return;
+  }
+  pic = seq->pic_param[pic_parameter_set_id];
+
+  uint8_t seq_parameter_set_id = read_exp_ue (&seq->br);
+  lprintf ("seq_parameter_set_id = %u\n", seq_parameter_set_id);
+  if (seq_parameter_set_id > 31)
+  {
+    lprintf ("OOPS : referenced SPS (%d) does not exist !!\n",
+	     seq_parameter_set_id);
+    return;
+  }
+  if (!seq->seq_param[seq_parameter_set_id])
+  {
+    lprintf ("OOPS : referenced SPS (%d) does not exist !!\n",
+	     seq_parameter_set_id);
+    return;
+  }
+
+  pic->seq_parameter_set_id = seq_parameter_set_id;
+  sp = seq->seq_param[pic->seq_parameter_set_id];
+  pic->entropy_coding_mode_flag = read_bits (&seq->br, 1);
+  lprintf ("entropy_coding_mode_flag = %u\n", pic->entropy_coding_mode_flag);
+  pic->pic_order_present_flag = read_bits (&seq->br, 1);
+  lprintf ("pic_order_present_flag = %u\n", pic->pic_order_present_flag);
+  uint8_t num_slice_groups_minus1 = read_exp_ue (&seq->br);
+  lprintf ("num_slice_groups_minus1 = %u\n", num_slice_groups_minus1);
+  if (num_slice_groups_minus1 > 0)
+  {
+    uint8_t slice_group_map_type = read_exp_ue (&seq->br);
+    lprintf ("slice_group_map_type = %u\n", slice_group_map_type);
+    if (!slice_group_map_type)
+    {
+      for (i = 0; i < num_slice_groups_minus1; i++)
+	read_exp_ue (&seq->br);
+    }
+    else if (slice_group_map_type == 2)
+    {
+      for (i = 0; i < num_slice_groups_minus1; i++)
+      {
+	read_exp_ue (&seq->br);
+	read_exp_ue (&seq->br);
+      }
+    }
+    else if (slice_group_map_type == 3 || slice_group_map_type == 4
+	     || slice_group_map_type == 5)
+    {
+      read_bits (&seq->br, 1);
+      read_exp_ue (&seq->br);
+    }
+    else if (slice_group_map_type == 6)
+    {
+      read_exp_ue (&seq->br);
+    }
+  }
+  pic->num_ref_idx_l0_active_minus1 = read_exp_ue (&seq->br);
+  lprintf ("num_ref_idx_l0_active_minus1 = %u\n",
+	   pic->num_ref_idx_l0_active_minus1);
+  pic->num_ref_idx_l1_active_minus1 = read_exp_ue (&seq->br);
+  lprintf ("num_ref_idx_l1_active_minus1 = %u\n",
+	   pic->num_ref_idx_l1_active_minus1);
+  pic->weighted_pred_flag = read_bits (&seq->br, 1);
+  lprintf ("weighted_pred_flag = %u\n", pic->weighted_pred_flag);
+  pic->weighted_bipred_idc = read_bits (&seq->br, 2);
+  lprintf ("weighted_bipred_idc = %u\n", pic->weighted_bipred_idc);
+  pic->pic_init_qp_minus26 = read_exp_se (&seq->br);
+  lprintf ("pic_init_qp_minus26 = %d\n", pic->pic_init_qp_minus26);
+  pic->pic_init_qs_minus26 = read_exp_se (&seq->br);
+  lprintf ("pic_init_qs_minus26 = %d\n", pic->pic_init_qs_minus26);
+  pic->chroma_qp_index_offset = read_exp_se (&seq->br);
+  lprintf ("chroma_qp_index_offset = %d\n", pic->chroma_qp_index_offset);
+  pic->deblocking_filter_control_present_flag = read_bits (&seq->br, 1);
+  lprintf ("deblocking_filter_control_present_flag = %u\n",
+	   pic->deblocking_filter_control_present_flag);
+  pic->constrained_intra_pred_flag = read_bits (&seq->br, 1);
+  lprintf ("constrained_intra_pred_flag = %u\n",
+	   pic->constrained_intra_pred_flag);
+  pic->redundant_pic_cnt_present_flag = read_bits (&seq->br, 1);
+  lprintf ("redundant_pic_cnt_present_flag = %u\n",
+	   pic->redundant_pic_cnt_present_flag);
+
+  uint32_t more = more_rbsp_data (&seq->br);
+  lprintf ("more bits = %u (buflen = %d) (still = %d)\n", more,
+	   seq->br.length, seq->br.start + seq->br.length - seq->br.buffer);
+  if (more)
+  {
+    pic->transform_8x8_mode_flag = read_bits (&seq->br, 1);
+    lprintf ("transform_8x8_mode_flag = %u\n", pic->transform_8x8_mode_flag);
+    pic->pic_scaling_matrix_present_flag = read_bits (&seq->br, 1);
+    lprintf ("pic_scaling_matrix_present_flag = %u\n",
+	     pic->pic_scaling_matrix_present_flag);
+    if (pic->pic_scaling_matrix_present_flag)
+    {
+      for (i = 0; i < 8; i++)
+      {
+	if (i < 6 || pic->transform_8x8_mode_flag)
+	  pic->pic_scaling_list_present_flag[i] = read_bits (&seq->br, 1);
+	else
+	  pic->pic_scaling_list_present_flag[i] = 0;
+
+	if (pic->pic_scaling_list_present_flag[i])
+	{
+	  if (i < 6)
+	    parse_scaling_list (&seq->br, &pic->scaling_lists_4x4[i][0], 16,
+				i);
+	  else
+	    parse_scaling_list (&seq->br, &pic->scaling_lists_8x8[i - 6][0],
+				64, i);
+	}
+	else
+	{
+	  if (!sp->seq_scaling_matrix_present_flag)
+	    scaling_list_fallback_A ((uint8_t *) pic->scaling_lists_4x4,
+				     (uint8_t *) pic->scaling_lists_8x8, i);
+	  else
+	    scaling_list_fallback_B (sp, pic, i);
+	}
+      }
+    }
+    pic->second_chroma_qp_index_offset = read_exp_se (&seq->br);
+    lprintf ("second_chroma_qp_index_offset = %d\n",
+	     pic->second_chroma_qp_index_offset);
+  }
+  else
+  {
+    pic->transform_8x8_mode_flag = 0;
+    pic->pic_scaling_matrix_present_flag = 0;
+    pic->second_chroma_qp_index_offset = pic->chroma_qp_index_offset;
+  }
+}
+
+
+
+/*
+ * Reads past the prediction weight table of a slice header.
+ *
+ * Nothing is stored: every value is pulled from this_gen->sequence.br and
+ * discarded, so the only effect is that the bit reader advances.
+ *
+ *   slice_type       weights for the second list are only read when this
+ *                    equals SLICE_TYPE_B
+ *   ChromaArrayType  non-zero when chroma weights are present
+ *   l0, l1           index of the last entry of list 0 / list 1 (the loops
+ *                    run from 0 up to and including this value)
+ */
+static void
+pred_weight_table (vdpau_h264_alter_decoder_t * this_gen, uint8_t slice_type,
+                   uint8_t ChromaArrayType, uint8_t l0, uint8_t l1)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  const int list_count = (slice_type == SLICE_TYPE_B) ? 2 : 1;
+  int list, entry, k;
+
+  /* one ue(v) for luma, plus one more when chroma is present */
+  read_exp_ue (&seq->br);
+  if (ChromaArrayType)
+    read_exp_ue (&seq->br);
+
+  for (list = 0; list < list_count; list++)
+  {
+    const int last_entry = list ? l1 : l0;
+
+    for (entry = 0; entry <= last_entry; entry++)
+    {
+      /* luma flag, followed by two se(v) values when set */
+      if (read_bits (&seq->br, 1))
+      {
+        for (k = 0; k < 2; k++)
+          read_exp_se (&seq->br);
+      }
+      /* the chroma flag is only read when chroma is present; four se(v)
+         values follow when it is set */
+      if (ChromaArrayType && read_bits (&seq->br, 1))
+      {
+        for (k = 0; k < 4; k++)
+          read_exp_se (&seq->br);
+      }
+    }
+  }
+}
+
+
+
+/*
+ * Reads past the reference picture list reordering commands of the current
+ * slice header.
+ *
+ * The commands are not applied; they are only consumed so that the bit
+ * reader ends up behind them. A command block is read for the first list
+ * unless the slice is of type I or SI, and for the second list when the
+ * slice is of type B. Each block starts with a one-bit "present" flag.
+ */
+static void
+ref_pic_list_reordering (vdpau_h264_alter_decoder_t * this_gen)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  slice_param_t *sl = &seq->slice_param;
+  int list;
+
+  for (list = 0; list < 2; list++)
+  {
+    int has_list;
+    uint32_t idc;
+
+    if (list == 0)
+      has_list = (sl->slice_type != SLICE_TYPE_I)
+        && (sl->slice_type != SLICE_TYPE_SI);
+    else
+      has_list = (sl->slice_type == SLICE_TYPE_B);
+
+    if (!has_list || !read_bits (&seq->br, 1))
+      continue;
+
+    do
+    {
+      idc = read_exp_ue (&seq->br);
+      /* commands 0, 1 and 2 each carry one ue(v) operand, which is dropped */
+      if (idc <= 2)
+        read_exp_ue (&seq->br);
+    }
+    /* 3 terminates the list; the overflow test stops a runaway loop on
+       truncated data */
+    while (idc != 3 && !seq->br.oflow);
+  }
+}
+
+
+
+/*
+ * Parses the decoded reference picture marking part of the slice header.
+ *
+ *   idr  non-zero when the slice belongs to an IDR picture
+ *
+ * For IDR pictures two flags are read and only logged. Otherwise one flag
+ * selects between the sliding window case (handled elsewhere, see below) and
+ * a list of memory management control operations. Of those operations only
+ * 1 and 3 have an effect here: they call dpb_mmc1() with the picture number
+ * computed from the operand. All other operands are read and logged only.
+ */
+static void
+dec_ref_pic_marking (vdpau_h264_alter_decoder_t * this_gen, uint8_t idr)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  uint8_t op;
+
+  if (idr)
+  {
+    uint8_t no_output_of_prior_pics_flag = read_bits (&seq->br, 1);
+    lprintf ("no_output_of_prior_pics_flag = %u\n",
+             no_output_of_prior_pics_flag);
+    uint8_t long_term_reference_flag = read_bits (&seq->br, 1);
+    lprintf ("long_term_reference_flag = %u\n", long_term_reference_flag);
+    return;
+  }
+
+  uint8_t adaptive_ref_pic_marking_mode_flag = read_bits (&seq->br, 1);
+  lprintf ("adaptive_ref_pic_marking_mode_flag = %u\n",
+           adaptive_ref_pic_marking_mode_flag);
+
+  if (!adaptive_ref_pic_marking_mode_flag)
+  {
+    /* second field of an already complete field pair that is a reference:
+       mark both fields as short term references */
+    if (seq->cur_pic.field_pic_flag
+        && (seq->cur_pic.completed == PICTURE_DONE)
+        && (seq->cur_pic.is_reference[0] || seq->cur_pic.is_reference[1]))
+    {
+      seq->cur_pic.is_reference[0] = seq->cur_pic.is_reference[1] =
+        SHORT_TERM_REF;
+      lprintf ("short_ref marking\n");
+    }
+    // sliding window is always performed in dpb_append()
+    return;
+  }
+
+  do
+  {
+    op = read_exp_ue (&seq->br);
+    lprintf ("memory_management_control_operation = %u\n", op);
+
+    switch (op)
+    {
+    case 1:
+    case 3:
+      {
+        uint32_t difference_of_pic_nums_minus1 = read_exp_ue (&seq->br);
+        int32_t pic_num;
+        lprintf ("difference_of_pic_nums_minus1 = %u\n",
+                 difference_of_pic_nums_minus1);
+        pic_num =
+          seq->cur_pic.PicNum[0] - (difference_of_pic_nums_minus1 + 1);
+        dpb_mmc1 (this_gen, pic_num);
+        if (op == 3)
+        {
+          /* operation 3 carries a second operand */
+          uint32_t long_term_frame_idx = read_exp_ue (&seq->br);
+          lprintf ("long_term_frame_idx = %u\n", long_term_frame_idx);
+        }
+      }
+      break;
+    case 2:
+      {
+        uint32_t long_term_pic_num = read_exp_ue (&seq->br);
+        lprintf ("long_term_pic_num = %u\n", long_term_pic_num);
+      }
+      break;
+    case 6:
+      {
+        uint32_t long_term_frame_idx = read_exp_ue (&seq->br);
+        lprintf ("long_term_frame_idx = %u\n", long_term_frame_idx);
+      }
+      break;
+    case 4:
+      {
+        uint32_t max_long_term_frame_idx_plus1 = read_exp_ue (&seq->br);
+        lprintf ("max_long_term_frame_idx_plus1 = %u\n",
+                 max_long_term_frame_idx_plus1);
+      }
+      break;
+    default:
+      /* 0 ends the list; any other value has no operand */
+      break;
+    }
+  }
+  /* also stop when the bit reader ran past its buffer */
+  while (op && !seq->br.oflow);
+}
+
+
+
+/*
+ * Parses the first part of a slice header from seq->br into
+ * seq->slice_param.
+ *
+ *   nal_ref_idc    copied into the slice parameters unchanged
+ *   nal_unit_type  copied into the slice parameters; also decides whether
+ *                  idr_pic_id is read
+ *
+ * When the referenced picture or sequence parameter set has not been
+ * stored yet, seq->cur_pic.missing_header is set and parsing stops.
+ * Parsing ends after the optional num_ref_idx override; this function does
+ * not read anything that follows it in the slice header.
+ */
+static void
+slice_header (vdpau_h264_alter_decoder_t * this_gen, uint8_t nal_ref_idc,
+	      uint8_t nal_unit_type)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  slice_param_t *sl = &seq->slice_param;
+  pic_param_t *pic;
+  seq_param_t *sp;
+
+  sl->nal_ref_idc = nal_ref_idc;
+  sl->nal_unit_type = nal_unit_type;
+
+  read_exp_ue (&seq->br);	/* first_mb_in_slice */
+  /* the raw value is folded into the range 0..4 */
+  sl->slice_type = read_exp_ue (&seq->br) % 5;
+  lprintf ("slice_type = %u\n", sl->slice_type);
+  sl->pic_parameter_set_id = read_exp_ue (&seq->br);
+  lprintf ("pic_parameter_set_id = %u\n", sl->pic_parameter_set_id);
+  /* NOTE(review): the id is used as an array index without a range check
+     in this function - confirm that the field type or the array size
+     makes this safe */
+  if (!seq->pic_param[sl->pic_parameter_set_id])
+  {
+    lprintf ("OOPS : referenced PPS (%d) does not exist !!\n",
+	     sl->pic_parameter_set_id);
+    seq->cur_pic.missing_header = 1;
+    return;
+  }
+  pic = seq->pic_param[sl->pic_parameter_set_id];
+  if (!seq->seq_param[pic->seq_parameter_set_id])
+  {
+    lprintf ("OOPS : referenced SPS (%d) does not exist !!\n",
+	     pic->seq_parameter_set_id);
+    seq->cur_pic.missing_header = 1;
+    return;
+  }
+
+  /* the first I slice seen while no picture is in progress switches
+     startup_frame from 0 to 1 */
+  if (!seq->startup_frame && (sl->slice_type == SLICE_TYPE_I)
+      && !seq->cur_pic.completed)
+    seq->startup_frame = 1;
+
+  sp = seq->seq_param[pic->seq_parameter_set_id];
+  if (sp->separate_colour_plane_flag)
+    read_bits (&seq->br, 2);	/* colour_plane_id */
+  sl->frame_num = read_bits (&seq->br, sp->log2_max_frame_num_minus4 + 4);
+  lprintf ("frame_num = %u\n", sl->frame_num);
+  sl->MaxFrameNum = 1 << (sp->log2_max_frame_num_minus4 + 4);
+
+  /* defaults for the fields that are only conditionally present below */
+  sl->field_pic_flag = sl->bottom_field_flag =
+    sl->delta_pic_order_cnt_bottom = 0;
+  sl->delta_pic_order_cnt[0] = sl->delta_pic_order_cnt[1] = 0;
+
+  if (!sp->frame_mbs_only_flag)
+  {
+    sl->field_pic_flag = read_bits (&seq->br, 1);
+    lprintf ("field_pic_flag = %u\n", sl->field_pic_flag);
+    if (sl->field_pic_flag)
+    {
+      sl->bottom_field_flag = read_bits (&seq->br, 1);
+      lprintf ("bottom_field_flag = %u\n", sl->bottom_field_flag);
+    }
+  }
+  if (nal_unit_type == NAL_SLICE_IDR)
+  {
+    sl->idr_pic_id = read_exp_ue (&seq->br);
+    lprintf ("idr_pic_id = %u\n", sl->idr_pic_id);
+  }
+  if (sp->pic_order_cnt_type == 0)
+  {
+    sl->pic_order_cnt_lsb =
+      read_bits (&seq->br, sp->log2_max_pic_order_cnt_lsb_minus4 + 4);
+    lprintf ("pic_order_cnt_lsb = %u\n", sl->pic_order_cnt_lsb);
+    if (pic->pic_order_present_flag && !sl->field_pic_flag)
+    {
+      sl->delta_pic_order_cnt_bottom = read_exp_se (&seq->br);
+      lprintf ("delta_pic_order_cnt_bottom = %d\n",
+	       sl->delta_pic_order_cnt_bottom);
+    }
+  }
+  if (sp->pic_order_cnt_type == 1 && !sp->delta_pic_order_always_zero_flag)
+  {
+    sl->delta_pic_order_cnt[0] = read_exp_se (&seq->br);
+    lprintf ("delta_pic_order_cnt[0] = %d\n", sl->delta_pic_order_cnt[0]);
+    if (pic->pic_order_present_flag && !sl->field_pic_flag)
+    {
+      sl->delta_pic_order_cnt[1] = read_exp_se (&seq->br);
+      lprintf ("delta_pic_order_cnt[1] = %d\n", sl->delta_pic_order_cnt[1]);
+    }
+  }
+  if (pic->redundant_pic_cnt_present_flag)
+  {
+    sl->redundant_pic_cnt = read_exp_ue (&seq->br);
+    lprintf ("redundant_pic_cnt = %u\n", sl->redundant_pic_cnt);
+  }
+  if (sl->slice_type == SLICE_TYPE_B)
+    skip_bits (&seq->br, 1);	/* direct_spatial_mv_pred_flag */
+
+  /* start from the picture parameter set values; the slice may override
+     them just below */
+  sl->num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1;
+  sl->num_ref_idx_l1_active_minus1 = pic->num_ref_idx_l1_active_minus1;
+
+  if (sl->slice_type == SLICE_TYPE_P || sl->slice_type == SLICE_TYPE_SP
+      || sl->slice_type == SLICE_TYPE_B)
+  {
+    /* one-bit override flag */
+    if (read_bits (&seq->br, 1))
+    {
+      lprintf ("num_ref_idx_active_override_flag = 1\n");
+      sl->num_ref_idx_l0_active_minus1 = read_exp_ue (&seq->br);
+      if (sl->slice_type == SLICE_TYPE_B)
+	sl->num_ref_idx_l1_active_minus1 = read_exp_ue (&seq->br);
+      lprintf ("num_ref_idx_l0_active_minus1 = %u\n",
+	       sl->num_ref_idx_l0_active_minus1);
+      lprintf ("num_ref_idx_l1_active_minus1 = %u\n",
+	       sl->num_ref_idx_l1_active_minus1);
+    }
+  }
+}
+
+
+
+/*
+ * Continues slice header parsing after the reference list reordering
+ * commands: skips the prediction weight table when one is present and then
+ * processes the reference picture marking.
+ *
+ * Does nothing for slices with nal_ref_idc == 0. Relies on the parameter
+ * sets referenced by the current slice being present (slice_header()
+ * flags a missing one through cur_pic.missing_header).
+ */
+static void
+slice_header_post (vdpau_h264_alter_decoder_t * this_gen)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  slice_param_t *sl = &seq->slice_param;
+
+  if (!sl->nal_ref_idc)
+    return;
+
+  pic_param_t *pic = seq->pic_param[sl->pic_parameter_set_id];
+  seq_param_t *sp = seq->seq_param[pic->seq_parameter_set_id];
+
+  const int p_weighted = pic->weighted_pred_flag
+    && ((sl->slice_type == SLICE_TYPE_P)
+        || (sl->slice_type == SLICE_TYPE_SP));
+  const int b_weighted = (pic->weighted_bipred_idc == 1)
+    && (sl->slice_type == SLICE_TYPE_B);
+
+  if (p_weighted || b_weighted)
+  {
+    /* with separate colour planes the table carries no chroma weights */
+    uint8_t chroma =
+      (sp->separate_colour_plane_flag) ? 0 : sp->chroma_format_idc;
+    pred_weight_table (this_gen, sl->slice_type, chroma,
+                       sl->num_ref_idx_l0_active_minus1,
+                       sl->num_ref_idx_l1_active_minus1);
+  }
+
+  /* named constant instead of a bare 5, matching slice_header() and
+     decode_poc() */
+  dec_ref_pic_marking (this_gen,
+                       (sl->nal_unit_type == NAL_SLICE_IDR) ? 1 : 0);
+}
+
+
+
+/*
+ * Fills in the bookkeeping fields of seq->cur_pic for the current slice and
+ * derives its picture order counts (TopFieldOrderCnt / BottomFieldOrderCnt)
+ * according to sp->pic_order_cnt_type:
+ *
+ *   type 0   from pic_order_cnt_lsb and the previous reference picture
+ *            returned by dpb_get_prev_ref()
+ *   type 1   from frame_num and the offsets stored in the sequence
+ *            parameter set
+ *   other    from frame_num alone
+ *
+ * For types other than 0, seq->prevFrameNum and seq->prevFrameNumOffset are
+ * updated for the next picture.
+ *
+ * All intermediate values are computed in 32 bits: MaxFrameNum is
+ * 1 << (log2_max_frame_num_minus4 + 4), which does not fit into 16 bits
+ * once the shift count reaches 16, and FrameNumOffset grows by MaxFrameNum
+ * on every frame_num wrap.
+ * NOTE(review): the result is stored into seq->prevFrameNumOffset, whose
+ * declared width is not visible here - confirm it is wide enough too.
+ */
+static void
+decode_poc (vdpau_h264_alter_decoder_t * this_gen)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  slice_param_t *sl = &seq->slice_param;
+  pic_param_t *pic = seq->pic_param[sl->pic_parameter_set_id];
+  seq_param_t *sp = seq->seq_param[pic->seq_parameter_set_id];
+  int parity = sl->bottom_field_flag ? 1 : 0;
+
+  seq->cur_pic.used = 1;
+  seq->cur_pic.FrameNum = sl->frame_num;
+  seq->cur_pic.is_reference[parity] = sl->nal_ref_idc;
+  seq->cur_pic.field_pic_flag = sl->field_pic_flag;
+
+  if (sl->field_pic_flag)
+  {
+    /* the first field to arrive decides the field order */
+    if (!seq->cur_pic.completed)
+      seq->cur_pic.top_field_first = !parity;
+    seq->cur_pic.completed |=
+      (parity ? PICTURE_BOTTOM_DONE : PICTURE_TOP_DONE);
+  }
+  else
+  {
+    seq->cur_pic.is_reference[!parity] = seq->cur_pic.is_reference[parity];
+    seq->cur_pic.completed = PICTURE_DONE;
+  }
+
+  if (sp->pic_order_cnt_type == 0)
+  {
+    dpb_frame_t *prev_pic = dpb_get_prev_ref (seq);
+    int32_t prevPicOrderCntMsb, prevPicOrderCntLsb;
+    uint32_t MaxPicOrderCntLsb =
+      1 << (sp->log2_max_pic_order_cnt_lsb_minus4 + 4);
+
+    seq->cur_pic.pic_order_cnt_lsb = sl->pic_order_cnt_lsb;
+    seq->cur_pic.top_field_first =
+      (sl->delta_pic_order_cnt_bottom < 0) ? 0 : 1;
+
+    if (!prev_pic)
+    {
+      /* no previous reference: everything starts at zero */
+      seq->cur_pic.PicOrderCntMsb = seq->cur_pic.TopFieldOrderCnt =
+        seq->cur_pic.BottomFieldOrderCnt = 0;
+      return;
+    }
+    if (sl->nal_unit_type == NAL_SLICE_IDR)
+      prevPicOrderCntMsb = prevPicOrderCntLsb = 0;
+    else if (prev_pic->mmc5)
+    {
+      if (!sl->bottom_field_flag)
+      {
+        prevPicOrderCntMsb = 0;
+        prevPicOrderCntLsb = prev_pic->TopFieldOrderCnt;
+      }
+      else
+        prevPicOrderCntMsb = prevPicOrderCntLsb = 0;
+    }
+    else
+    {
+      prevPicOrderCntMsb = prev_pic->PicOrderCntMsb;
+      prevPicOrderCntLsb = prev_pic->pic_order_cnt_lsb;
+    }
+
+    /* detect a wrap of the lsb counter in either direction */
+    if ((sl->pic_order_cnt_lsb < prevPicOrderCntLsb)
+        && ((prevPicOrderCntLsb - sl->pic_order_cnt_lsb) >=
+            (MaxPicOrderCntLsb / 2)))
+      seq->cur_pic.PicOrderCntMsb = prevPicOrderCntMsb + MaxPicOrderCntLsb;
+    else if ((sl->pic_order_cnt_lsb > prevPicOrderCntLsb)
+             && ((sl->pic_order_cnt_lsb - prevPicOrderCntLsb) >
+                 (MaxPicOrderCntLsb / 2)))
+      seq->cur_pic.PicOrderCntMsb = prevPicOrderCntMsb - MaxPicOrderCntLsb;
+    else
+      seq->cur_pic.PicOrderCntMsb = prevPicOrderCntMsb;
+
+    if (!sl->field_pic_flag)
+    {
+      seq->cur_pic.TopFieldOrderCnt =
+        seq->cur_pic.PicOrderCntMsb + sl->pic_order_cnt_lsb;
+      seq->cur_pic.BottomFieldOrderCnt =
+        seq->cur_pic.TopFieldOrderCnt + sl->delta_pic_order_cnt_bottom;
+    }
+    else
+    {
+      if (sl->bottom_field_flag)
+        seq->cur_pic.BottomFieldOrderCnt =
+          seq->cur_pic.PicOrderCntMsb + sl->pic_order_cnt_lsb;
+      else
+        seq->cur_pic.TopFieldOrderCnt =
+          seq->cur_pic.PicOrderCntMsb + sl->pic_order_cnt_lsb;
+    }
+  }
+  else
+  {
+    /* 32 bit on purpose, see the function comment */
+    int32_t FrameNumOffset, prevFrameNumOffset;
+    int32_t MaxFrameNum = 1 << (sp->log2_max_frame_num_minus4 + 4);
+
+    if (sl->nal_unit_type == NAL_SLICE_IDR)
+    {
+      FrameNumOffset = 0;
+    }
+    else
+    {
+      if (seq->prevMMC5)
+        prevFrameNumOffset = 0;
+      else
+        prevFrameNumOffset = seq->prevFrameNumOffset;
+
+      /* frame_num went backwards: it wrapped around */
+      if (seq->prevFrameNum > sl->frame_num)
+        FrameNumOffset = prevFrameNumOffset + MaxFrameNum;
+      else
+        FrameNumOffset = prevFrameNumOffset;
+    }
+
+    if (sp->pic_order_cnt_type == 1)
+    {
+      int32_t absFrameNum = 0;
+      int32_t picOrderCntCycleCnt = 0;
+      int32_t frameNumInPicOrderCntCycle = 0;
+      int32_t expectedDeltaPerPicOrderCntCycle = 0;
+      int32_t expectedPicOrderCnt = 0;
+      int i;
+
+      if (sp->num_ref_frames_in_pic_order_cnt_cycle)
+        absFrameNum = FrameNumOffset + sl->frame_num;
+      if (!sl->nal_ref_idc && (absFrameNum > 0))
+        --absFrameNum;
+
+      for (i = 0; i < sp->num_ref_frames_in_pic_order_cnt_cycle; i++)
+        expectedDeltaPerPicOrderCntCycle += sp->offset_for_ref_frame[i];
+
+      /* absFrameNum can only be positive when the cycle length is
+         non-zero, so the divisions below are safe */
+      if (absFrameNum > 0)
+      {
+        picOrderCntCycleCnt =
+          (absFrameNum - 1) / sp->num_ref_frames_in_pic_order_cnt_cycle;
+        frameNumInPicOrderCntCycle =
+          (absFrameNum - 1) % sp->num_ref_frames_in_pic_order_cnt_cycle;
+        expectedPicOrderCnt =
+          picOrderCntCycleCnt * expectedDeltaPerPicOrderCntCycle;
+        for (i = 0; i < frameNumInPicOrderCntCycle; i++)
+          expectedPicOrderCnt += sp->offset_for_ref_frame[i];
+      }
+      if (!sl->nal_ref_idc)
+        expectedPicOrderCnt += sp->offset_for_non_ref_pic;
+
+      if (!sl->field_pic_flag)
+      {
+        seq->cur_pic.TopFieldOrderCnt =
+          expectedPicOrderCnt + sl->delta_pic_order_cnt[0];
+        seq->cur_pic.BottomFieldOrderCnt =
+          seq->cur_pic.TopFieldOrderCnt + sp->offset_for_top_to_bottom_field +
+          sl->delta_pic_order_cnt[1];
+      }
+      else if (!sl->bottom_field_flag)
+        seq->cur_pic.TopFieldOrderCnt =
+          expectedPicOrderCnt + sl->delta_pic_order_cnt[0];
+      else
+        seq->cur_pic.BottomFieldOrderCnt =
+          expectedPicOrderCnt + sp->offset_for_top_to_bottom_field +
+          sl->delta_pic_order_cnt[1];
+    }
+    else
+    {
+      int32_t tmpPicOrderCnt;
+      if (sl->nal_unit_type == NAL_SLICE_IDR)
+        tmpPicOrderCnt = 0;
+      else if (!sl->nal_ref_idc)
+        tmpPicOrderCnt = 2 * (FrameNumOffset + sl->frame_num) - 1;
+      else
+        tmpPicOrderCnt = 2 * (FrameNumOffset + sl->frame_num);
+
+      if (!sl->field_pic_flag)
+        seq->cur_pic.TopFieldOrderCnt = seq->cur_pic.BottomFieldOrderCnt =
+          tmpPicOrderCnt;
+      else if (sl->bottom_field_flag)
+        seq->cur_pic.BottomFieldOrderCnt = tmpPicOrderCnt;
+      else
+        seq->cur_pic.TopFieldOrderCnt = tmpPicOrderCnt;
+    }
+    seq->prevFrameNum = seq->cur_pic.FrameNum;
+    seq->prevFrameNumOffset = FrameNumOffset;
+  }
+
+  /* only one field decoded so far: give the missing field the same order
+     count as the one that is there */
+  if (seq->cur_pic.completed < PICTURE_DONE)
+  {
+    if (sl->bottom_field_flag)
+      seq->cur_pic.TopFieldOrderCnt = seq->cur_pic.BottomFieldOrderCnt;
+    else
+      seq->cur_pic.BottomFieldOrderCnt = seq->cur_pic.TopFieldOrderCnt;
+  }
+}
+
+
+
+/*
+ * Assigns PicNum to the current picture and recomputes FrameNumWrap and
+ * PicNum[] for the frames stored in seq->dpb[].
+ *
+ * The walk over the DPB starts at slot 0 and stops at the first slot that
+ * is not marked as used.
+ */
+static void
+decode_picnum (vdpau_h264_alter_decoder_t * this_gen)
+{
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  slice_param_t *sl = &seq->slice_param;
+  const int bottom = sl->bottom_field_flag ? 1 : 0;
+  int slot;
+
+  if (seq->cur_pic.field_pic_flag)
+    seq->cur_pic.PicNum[bottom] = 2 * seq->cur_pic.FrameNum + 1;
+  else
+    seq->cur_pic.PicNum[0] = seq->cur_pic.FrameNum;
+
+  for (slot = 0; slot < MAX_DPB_SIZE; slot++)
+  {
+    dpb_frame_t *ref = seq->dpb[slot];
+
+    if (!ref->used)
+      break;
+
+    /* frames with a frame number above the current one get MaxFrameNum
+       subtracted */
+    if (ref->FrameNum <= seq->cur_pic.FrameNum)
+      ref->FrameNumWrap = ref->FrameNum;
+    else
+      ref->FrameNumWrap = ref->FrameNum - sl->MaxFrameNum;
+
+    if (sl->field_pic_flag)
+    {
+      /* the field with the same parity as the current one gets the odd
+         number */
+      ref->PicNum[0] = 2 * ref->FrameNumWrap + (bottom ? 0 : 1);
+      ref->PicNum[1] = 2 * ref->FrameNumWrap + (bottom ? 1 : 0);
+    }
+    else
+    {
+      ref->PicNum[0] = ref->PicNum[1] = ref->FrameNumWrap;
+    }
+  }
+}
+
+
+
+/*
+ * Start-up heuristic: checks whether the DPB holds enough usable frames for
+ * the reference lists of the current slice.
+ *
+ * Frames in the DPB whose surface is not flagged bad_frame are counted as
+ * "past" (order count <= current) or "future" (order count > current). For
+ * field pictures each field flagged as reference counts separately. The
+ * picture is reported bad when a non-I slice needs more past references
+ * than are available, or a B slice more future references.
+ *
+ * The check only runs while seq->startup_frame is below a limit derived
+ * from num_ref_frames; startup_frame is advanced here for every reference
+ * picture that is checked. The limit is printed as "fps" in the message
+ * below but is not computed from any timing information.
+ *
+ * Returns 1 when the picture should be treated as bad, 0 otherwise.
+ */
+static int
+check_ref_list (vdpau_h264_alter_decoder_t * this_gen)
+{
+  int i, bad_frame = 0;
+  dpb_frame_t *frame;
+  sequence_t *seq = (sequence_t *) & this_gen->sequence;
+  slice_param_t *sl = &seq->slice_param;
+  pic_param_t *pic = seq->pic_param[sl->pic_parameter_set_id];
+  seq_param_t *sp = seq->seq_param[pic->seq_parameter_set_id];
+  int prefs = 0;
+  int brefs = 0;
+  int poc, curpoc;
+
+  int fps = (1 + sl->field_pic_flag) * 2 * sp->num_ref_frames;
+
+  if (seq->startup_frame >= fps)
+    return 0;
+
+  curpoc =
+    (seq->cur_pic.TopFieldOrderCnt > seq->cur_pic.BottomFieldOrderCnt)
+    ? seq->cur_pic.TopFieldOrderCnt : seq->cur_pic.BottomFieldOrderCnt;
+
+  /* same bounds as the other DPB walks in this file instead of a bare 15 */
+  for (i = (MAX_DPB_SIZE - 1); i > -1; i--)
+  {
+    int refs;
+
+    frame = seq->dpb[i];
+    if (!frame->used)
+      continue;
+    if (frame->videoSurface->bad_frame)
+      continue;
+
+    poc =
+      (frame->TopFieldOrderCnt > frame->BottomFieldOrderCnt)
+      ? frame->TopFieldOrderCnt : frame->BottomFieldOrderCnt;
+
+    /* a frame counts once; in field mode every reference field counts */
+    if (seq->cur_pic.field_pic_flag)
+      refs = (frame->is_reference[0] ? 1 : 0)
+        + (frame->is_reference[1] ? 1 : 0);
+    else
+      refs = 1;
+
+    if (poc <= curpoc)
+      prefs += refs;
+    else
+      brefs += refs;
+  }
+
+  if (sl->slice_type != SLICE_TYPE_I)
+  {
+    if (prefs < (sl->num_ref_idx_l0_active_minus1 + 1))
+      bad_frame = 1;
+    if (sl->slice_type == SLICE_TYPE_B)
+    {
+      if (brefs < (sl->num_ref_idx_l1_active_minus1 + 1))
+        bad_frame = 1;
+    }
+  }
+
+  if (bad_frame)
+    fprintf (stderr,
+             "******** Missing refframes, dropping. nrf=%d lo=%d prefs=%d l1=%d brefs=%d type=%d (%d fps)\n",
+             sp->num_ref_frames, sl->num_ref_idx_l0_active_minus1 + 1, prefs,
+             sl->num_ref_idx_l1_active_minus1 + 1, brefs, sl->slice_type,
+             fps);
+
+  if (seq->cur_pic.is_reference[0] || seq->cur_pic.is_reference[1])
+    ++seq->startup_frame;
+
+  return bad_frame;
+}
+
+
+
+/*
+ * Hands the collected slices of the current picture to the VDPAU decoder.
+ *
+ *   vd         decoder instance
+ *   bad_frame  stored into the output frame's bad_frame field
+ *
+ * Steps:
+ *   - obtain the output frame: a new one from the video out port for a
+ *     frame or a first field, the surface already attached to cur_pic for
+ *     a second field
+ *   - (re)create the VDPAU decoder when it is invalid or when profile or
+ *     coded size changed
+ *   - fill a VdpPictureInfoH264 from the active parameter sets, the slice
+ *     parameters and the DPB contents
+ *   - submit every slice, each preceded by a 00 00 01 start code
+ *   - publish stream info and send a format change event when ratio, size
+ *     or frame duration changed
+ *   - attach the frame to seq->cur_pic
+ */
+static void
+decode_render (vdpau_h264_alter_decoder_t * vd, int bad_frame)
+{
+  int i, j;
+  VdpPictureInfoH264 info;
+  seq_param_t *sp;
+  pic_param_t *pic;
+  slice_param_t *sl;
+  sequence_t *seq = (sequence_t *) & vd->sequence;
+  vo_frame_t *img;
+
+  if (!seq->cur_pic.field_pic_flag || (seq->cur_pic.completed < PICTURE_DONE))
+  {
+    img =
+      vd->stream->video_out->get_frame (vd->stream->video_out,
+                                        seq->coded_width, seq->coded_height,
+                                        seq->ratio, XINE_IMGFMT_VDPAU,
+                                        VO_BOTH_FIELDS | seq->chroma | seq->
+                                        reset);
+    seq->reset = 0;
+    /* only touch the frame once we know we got one; the NULL check below
+       used to come after this dereference */
+    if (img)
+      img->drawn = 0;
+  }
+  else
+    img = seq->cur_pic.videoSurface;
+
+  if (!img)
+  {                             /* should not happen */
+    fprintf (stderr,
+             "vdpau_h264_alter : !!!!!!!!!!!!!!!!!!!!!! No vo_frame_t !!!!!!!!!!!!!!!!!!!!!!!\n");
+    return;
+  }
+
+  vdpau_accel_t *accel = (vdpau_accel_t *) img->accel_data;
+  if (!seq->accel_vdpau)
+    seq->accel_vdpau = accel;
+
+  /* a changed runtime number invalidates the decoder handle */
+  if (seq->vdp_runtime_nr != *(seq->accel_vdpau->current_vdp_runtime_nr))
+    vd->decoder = VDP_INVALID_HANDLE;
+
+  sl = &vd->sequence.slice_param;
+  pic = vd->sequence.pic_param[sl->pic_parameter_set_id];
+  sp = vd->sequence.seq_param[pic->seq_parameter_set_id];
+
+  VdpStatus st;
+  if (vd->decoder == VDP_INVALID_HANDLE || vd->decoder_profile != seq->profile
+      || vd->decoder_width != seq->coded_width
+      || vd->decoder_height != seq->coded_height)
+  {
+    if (vd->decoder != VDP_INVALID_HANDLE)
+    {
+      accel->vdp_decoder_destroy (vd->decoder);
+      vd->decoder = VDP_INVALID_HANDLE;
+    }
+    st =
+      accel->vdp_decoder_create (accel->vdp_device, seq->profile,
+                                 seq->coded_width, seq->coded_height,
+                                 sp->num_ref_frames, &vd->decoder);
+    if (st != VDP_STATUS_OK)
+      fprintf (stderr, "vdpau_h264_alter : failed to create decoder !! %s\n",
+               accel->vdp_get_error_string (st));
+    else
+    {
+      vd->decoder_profile = seq->profile;
+      vd->decoder_width = seq->coded_width;
+      vd->decoder_height = seq->coded_height;
+      seq->vdp_runtime_nr = seq->accel_vdpau->vdp_runtime_nr;
+    }
+  }
+
+  /* picture level parameters */
+  info.slice_count = seq->slices_count;
+  info.field_order_cnt[0] = seq->cur_pic.TopFieldOrderCnt;
+  info.field_order_cnt[1] = seq->cur_pic.BottomFieldOrderCnt;
+  info.is_reference = sl->nal_ref_idc ? VDP_TRUE : VDP_FALSE;
+  info.frame_num = sl->frame_num;
+  info.field_pic_flag = sl->field_pic_flag;
+  info.bottom_field_flag = sl->bottom_field_flag;
+  info.num_ref_frames = sp->num_ref_frames;
+  info.mb_adaptive_frame_field_flag = sp->mb_adaptive_frame_field_flag
+    && !sl->field_pic_flag;
+  info.constrained_intra_pred_flag = pic->constrained_intra_pred_flag;
+  info.weighted_pred_flag = pic->weighted_pred_flag;
+  info.weighted_bipred_idc = pic->weighted_bipred_idc;
+  info.frame_mbs_only_flag = sp->frame_mbs_only_flag;
+  info.transform_8x8_mode_flag = pic->transform_8x8_mode_flag;
+  info.chroma_qp_index_offset = pic->chroma_qp_index_offset;
+  info.second_chroma_qp_index_offset = pic->second_chroma_qp_index_offset;
+  info.pic_init_qp_minus26 = pic->pic_init_qp_minus26;
+  info.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1;
+  info.num_ref_idx_l1_active_minus1 = pic->num_ref_idx_l1_active_minus1;
+  info.log2_max_frame_num_minus4 = sp->log2_max_frame_num_minus4;
+  info.pic_order_cnt_type = sp->pic_order_cnt_type;
+  info.log2_max_pic_order_cnt_lsb_minus4 =
+    sp->log2_max_pic_order_cnt_lsb_minus4;
+  info.delta_pic_order_always_zero_flag =
+    sp->delta_pic_order_always_zero_flag;
+  info.direct_8x8_inference_flag = sp->direct_8x8_inference_flag;
+  info.entropy_coding_mode_flag = pic->entropy_coding_mode_flag;
+  info.pic_order_present_flag = pic->pic_order_present_flag;
+  info.deblocking_filter_control_present_flag =
+    pic->deblocking_filter_control_present_flag;
+  info.redundant_pic_cnt_present_flag = pic->redundant_pic_cnt_present_flag;
+
+  /* scaling lists come from the picture parameter set when it carries a
+     matrix, from the sequence parameter set otherwise */
+  if (!pic->pic_scaling_matrix_present_flag)
+  {
+    xine_fast_memcpy (info.scaling_lists_4x4, sp->scaling_lists_4x4,
+                      sizeof (info.scaling_lists_4x4));
+    xine_fast_memcpy (info.scaling_lists_8x8, sp->scaling_lists_8x8,
+                      sizeof (info.scaling_lists_8x8));
+  }
+  else
+  {
+    xine_fast_memcpy (info.scaling_lists_4x4, pic->scaling_lists_4x4,
+                      sizeof (info.scaling_lists_4x4));
+    xine_fast_memcpy (info.scaling_lists_8x8, pic->scaling_lists_8x8,
+                      sizeof (info.scaling_lists_8x8));
+  }
+
+  /* reference frames: used DPB slots, walked from the last slot down */
+  j = 0;
+  for (i = (MAX_DPB_SIZE - 1); i > -1; i--)
+  {
+    if (!seq->dpb[i]->used)
+      continue;
+    /* own name so the output frame's accel is not shadowed */
+    vdpau_accel_t *ref_accel =
+      (vdpau_accel_t *) seq->dpb[i]->videoSurface->accel_data;
+    info.referenceFrames[j].surface = ref_accel->surface;
+    info.referenceFrames[j].is_long_term = 0;
+    info.referenceFrames[j].frame_idx = seq->dpb[i]->FrameNum;
+    info.referenceFrames[j].top_is_reference =
+      seq->dpb[i]->is_reference[0] ? VDP_TRUE : VDP_FALSE;
+    info.referenceFrames[j].bottom_is_reference =
+      seq->dpb[i]->is_reference[1] ? VDP_TRUE : VDP_FALSE;
+    info.referenceFrames[j].field_order_cnt[0] =
+      seq->dpb[i]->TopFieldOrderCnt;
+    info.referenceFrames[j].field_order_cnt[1] =
+      seq->dpb[i]->BottomFieldOrderCnt;
+    ++j;
+  }
+  /* mark the remaining entries as unused */
+  for (; j < MAX_DPB_SIZE; j++)
+  {
+    info.referenceFrames[j].surface = VDP_INVALID_HANDLE;
+    info.referenceFrames[j].is_long_term = 0;
+    info.referenceFrames[j].frame_idx = 0;
+    info.referenceFrames[j].top_is_reference = 0;
+    info.referenceFrames[j].bottom_is_reference = 0;
+    info.referenceFrames[j].field_order_cnt[0] = 0;
+    info.referenceFrames[j].field_order_cnt[1] = 0;
+  }
+
+  /* two bitstream buffers per slice: a start code, then the slice data.
+     NOTE(review): the array below has zero length when slices_count is 0 -
+     confirm callers never get here without at least one slice */
+  uint8_t sc[3] = { 0, 0, 1 };
+  VdpBitstreamBuffer vbits[seq->slices_count * 2];
+  for (i = 0; i < seq->slices_count; i++)
+  {
+    vbits[i * 2].struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+    vbits[i * 2].bitstream = sc;
+    vbits[i * 2].bitstream_bytes = 3;
+    vbits[(i * 2) + 1].struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+    vbits[(i * 2) + 1].bitstream = seq->buf + seq->slices[i].buf_offset;
+    vbits[(i * 2) + 1].bitstream_bytes = seq->slices[i].len;
+  }
+  st =
+    accel->vdp_decoder_render (vd->decoder, accel->surface,
+                               (VdpPictureInfo *) & info,
+                               seq->slices_count * 2, vbits);
+  if (st != VDP_STATUS_OK)
+    lprintf ("**********************DECODING failed! - surface = %d - %s\n",
+             accel->surface, accel->vdp_get_error_string (st));
+  else
+    lprintf ("**********************DECODING success! - surface = %d\n",
+             accel->surface);
+
+  if ((seq->ratio != seq->reported_ratio)
+      || (seq->coded_width != seq->reported_coded_width)
+      || (seq->coded_height != seq->reported_coded_height)
+      || (seq->video_step != seq->reported_video_step))
+  {
+    seq->reported_ratio = seq->ratio;
+    seq->reported_coded_width = seq->coded_width;
+    seq->reported_coded_height = seq->coded_height;
+    seq->reported_video_step = seq->video_step;
+    _x_stream_info_set (vd->stream, XINE_STREAM_INFO_VIDEO_WIDTH,
+                        seq->coded_width);
+    _x_stream_info_set (vd->stream, XINE_STREAM_INFO_VIDEO_HEIGHT,
+                        seq->coded_height);
+    _x_stream_info_set (vd->stream, XINE_STREAM_INFO_VIDEO_RATIO,
+                        ((double) 10000 * seq->ratio));
+    _x_stream_info_set (vd->stream, XINE_STREAM_INFO_FRAME_DURATION,
+                        seq->video_step);
+    _x_meta_info_set_utf8 (vd->stream, XINE_META_INFO_VIDEOCODEC,
+                           "H264/AVC (vdpau_alter)");
+    xine_event_t event;
+    xine_format_change_data_t data;
+    /* zero both first so that any member not assigned below is not sent
+       with an indeterminate value */
+    memset (&event, 0, sizeof (event));
+    memset (&data, 0, sizeof (data));
+    event.type = XINE_EVENT_FRAME_FORMAT_CHANGE;
+    event.stream = vd->stream;
+    event.data = &data;
+    event.data_length = sizeof (data);
+    data.width = seq->coded_width;
+    data.height = seq->coded_height;
+    data.aspect = seq->ratio;
+    xine_event_send (vd->stream, &event);
+  }
+
+  accel->color_standard = seq->color_standard;
+
+  /* the pending pts is consumed once the picture is complete */
+  if (seq->cur_pic.completed == PICTURE_DONE)
+  {
+    seq->cur_pic.pts = seq->pic_pts;
+    seq->pic_pts = 0;
+  }
+  if (seq->cur_pic.drop_pts)
+    seq->cur_pic.pts = 0;
+  if (sp->frame_mbs_only_flag)
+    img->progressive_frame = -1;
+  img->bad_frame = bad_frame;
+  img->duration = seq->video_step;
+  seq->cur_pic.videoSurface = img;
+}
+
+
+
+/*
+ * Decodes the picture (frame or field) whose slices have been collected.
+ *
+ * Order of work:
+ *   - drop the picture when a header was missing or start-up has not begun
+ *   - for a second field, verify it pairs with the first one; on mismatch
+ *     the DPB is reset and decoding waits for a new start
+ *   - derive order counts and picture numbers, finish slice header parsing
+ *   - render through VDPAU
+ *   - non-reference pictures are passed to dpb_draw_frames() once
+ *     complete, reference pictures are appended to the DPB
+ *   - cur_pic is cleared once the picture is complete
+ */
+static void
+decode_picture (vdpau_h264_alter_decoder_t * decoder)
+{
+  slice_param_t *sl = &decoder->sequence.slice_param;
+  dpb_frame_t *cur_pic = &decoder->sequence.cur_pic;
+
+  if (cur_pic->missing_header || !decoder->sequence.startup_frame)
+  {
+    memset (cur_pic, 0, sizeof (dpb_frame_t));
+    lprintf ("MISSING_HEADER or !startup_frame\n\n");
+    return;
+  }
+
+  if (cur_pic->completed && cur_pic->field_pic_flag)
+  {
+    /* one field is already there: the new slice must be the opposite
+       field of the same frame */
+    if ((sl->frame_num != cur_pic->FrameNum)
+        || (sl->bottom_field_flag
+            && (cur_pic->completed == PICTURE_BOTTOM_DONE))
+        || (!sl->bottom_field_flag
+            && (cur_pic->completed == PICTURE_TOP_DONE))
+        || !sl->field_pic_flag)
+    {
+      fprintf (stderr, "vdpau_h264_alter : Wrong field, skipping.\n");
+      memset (cur_pic, 0, sizeof (dpb_frame_t));
+      dpb_reset (&decoder->sequence);
+      cur_pic->missing_header = 1;
+      decoder->sequence.startup_frame = 0;
+      return;
+    }
+  }
+
+  /* picture decoding */
+  decode_poc (decoder);
+  lprintf ("TopFieldOrderCnt = %d - BottomFieldOrderCnt = %d\n",
+           cur_pic->TopFieldOrderCnt, cur_pic->BottomFieldOrderCnt);
+  /* named constant instead of a bare 5, matching slice_header() and
+     decode_poc() */
+  if (sl->nal_unit_type == NAL_SLICE_IDR)
+  {
+    dpb_draw_frames (decoder, MAX_POC, DPB_DRAW_CLEAR);
+    decoder->sequence.startup_frame = START_IDR_FLAG;
+  }
+  decode_picnum (decoder);
+  ref_pic_list_reordering (decoder);
+  lprintf ("............................. slices_count = %d\n",
+           decoder->sequence.slices_count);
+
+  decode_render (decoder, check_ref_list (decoder));
+
+  /* dec_ref_pic_marking */
+  slice_header_post (decoder);
+
+  if (!cur_pic->is_reference[0] && !cur_pic->is_reference[1])
+  {
+    if (cur_pic->completed == PICTURE_DONE)
+    {
+      dpb_draw_frames (decoder,
+                       (cur_pic->TopFieldOrderCnt >
+                        cur_pic->BottomFieldOrderCnt)
+                       ? cur_pic->TopFieldOrderCnt
+                       : cur_pic->BottomFieldOrderCnt, DPB_DRAW_CURRENT);
+    }
+  }
+  else
+  {
+    pic_param_t *pic = decoder->sequence.pic_param[sl->pic_parameter_set_id];
+    seq_param_t *sp = decoder->sequence.seq_param[pic->seq_parameter_set_id];
+
+    if (sp->pic_order_cnt_type == 2)
+      dpb_draw_frames (decoder,
+                       (cur_pic->TopFieldOrderCnt >
+                        cur_pic->BottomFieldOrderCnt)
+                       ? cur_pic->TopFieldOrderCnt
+                       : cur_pic->BottomFieldOrderCnt, DPB_DRAW_REFS);
+
+    /* second argument is 1 only for the second field of a field pair */
+    if (!sl->field_pic_flag || cur_pic->completed < PICTURE_DONE)
+      dbp_append (decoder, 0);
+    else
+      dbp_append (decoder, 1);
+  }
+
+  if (cur_pic->completed == PICTURE_DONE)
+    memset (cur_pic, 0, sizeof (dpb_frame_t));
+
+
+  lprintf
+    ("\n___________________________________________________________________________________________\n\n");
+}
+
+
+
+/* Parse one NAL unit (start code already stripped) and dispatch on its type. */
+static int
+parse_startcodes (vdpau_h264_alter_decoder_t * this_gen, uint8_t * buf,
+		  uint32_t len)
+{
+  sequence_t *sequence = (sequence_t *) & this_gen->sequence;
+  bits_reader_set (&sequence->br, buf, len);
+  int ret = 0;
+
+  skip_bits (&sequence->br, 1);	/* forbidden_zero_bit */
+  uint8_t nal_ref_idc = read_bits (&sequence->br, 2);
+  uint8_t nal_unit_type = read_bits (&sequence->br, 5);
+  lprintf ("NAL size = %d, nal_ref_idc = %d, nal_unit_type = %d\n", len,
+	   nal_ref_idc, nal_unit_type);
+
+  switch (nal_unit_type)
+  {
+  case NAL_END_SEQUENCE:
+    break;
+  case NAL_SEQUENCE:
+    seq_parameter_set_data (this_gen);
+    break;
+  case NAL_PICTURE:
+    pic_parameter_set (this_gen);
+    break;
+  case NAL_SLICE_IDR:
+  case NAL_SLICE_NO_IDR:
+    slice_header (this_gen, nal_ref_idc, nal_unit_type);
+    /* slices[] has a fixed size: surplus slices are dropped, not written. */
+    if (sequence->slices_count
+	< (int) (sizeof (sequence->slices) / sizeof (sequence->slices[0])))
+    {
+      sequence->slices[sequence->slices_count].buf_offset = buf - sequence->buf;
+      sequence->slices[sequence->slices_count].len = len;
+      ++sequence->slices_count;
+    }
+    sequence->slice_mode = nal_unit_type;	/* equals the matched case label */
+    break;
+  }
+
+  return ret;
+}
+
+
+
+/* Parse codec private data: NAL length-prefix size, then SPS and PPS sets. */
+static void
+parse_codec_private (vdpau_h264_alter_decoder_t * this_gen, uint8_t * buf,
+		     uint32_t len)
+{
+  sequence_t *sequence = (sequence_t *) & this_gen->sequence;
+  uint8_t *buffer = buf, *end = buf + len;
+  int i;
+
+  lprintf ("parse_codec_private\n");
+  sequence->mode_frame = 1;
+  if (len < 6)			/* the fixed header alone takes 6 bytes */
+    return;
+  bits_reader_set (&sequence->br, buf, len);
+  /* reserved (8), profile_idc (8), 8 more bits, level_idc (8), 6 more bits */
+  skip_bits (&sequence->br, 8 + 8 + 8 + 8 + 6);
+  sequence->frame_header_size = read_bits (&sequence->br, 2) + 1;
+  skip_bits (&sequence->br, 3);
+  uint8_t count = read_bits (&sequence->br, 5);
+  buffer += 6;
+  for (i = 0; i < count; i++)
+  {
+    if (end - buffer < 3) return;	/* 16-bit size + NAL header byte */
+    bits_reader_set (&sequence->br, buffer, end - buffer);
+    uint16_t sps_size = read_bits (&sequence->br, 16);
+    skip_bits (&sequence->br, 8);
+    seq_parameter_set_data (this_gen);
+    if (sps_size + 2 > end - buffer) return;	/* size field overruns data */
+    buffer += sps_size + 2;
+  }
+  if (buffer >= end)		/* no room left for the PPS count byte */
+    return;
+  count = *buffer++;
+  for (i = 0; i < count; i++)
+  {
+    if (end - buffer < 3) return;	/* 16-bit size + NAL header byte */
+    bits_reader_set (&sequence->br, buffer, end - buffer);
+    uint16_t pps_size = read_bits (&sequence->br, 16);
+    skip_bits (&sequence->br, 8);
+    pic_parameter_set (this_gen);
+    if (pps_size + 2 > end - buffer) return;	/* size field overruns data */
+    buffer += pps_size + 2;
+  }
+}
+
+
+
+/* Drop the consumed bytes [0, bufseek) and move the unparsed tail to the front. */
+static void
+flush_buffer (sequence_t * seq)
+{
+  uint32_t remaining = seq->bufpos - seq->bufseek;
+  uint8_t *tmp = (remaining >= (uint32_t) seq->bufseek)	/* regions overlap */
+    ? (uint8_t *) malloc (remaining + MIN_BUFFER_SIZE) : NULL;
+  if (tmp)
+  {
+    lprintf ("buffer too short, have to allocate a new one.\n");
+    xine_fast_memcpy (tmp, seq->buf + seq->bufseek, remaining);
+    free (seq->buf);
+    seq->buf = tmp;
+    seq->bufsize = remaining + MIN_BUFFER_SIZE;
+  }
+  else	/* memmove tolerates overlap, so it also covers a failed malloc */
+    memmove (seq->buf, seq->buf + seq->bufseek, remaining);
+  seq->bufpos -= seq->bufseek;
+  seq->start = -1;
+  seq->bufseek = 0;
+  reset_slices (seq);
+}
+
+
+
+/*
+ * This function receives a buffer of data from the demuxer layer and
+ * figures out how to handle it based on its header flags.
+ */
+static void
+vdpau_h264_alter_decode_data (video_decoder_t * this_gen, buf_element_t * buf)
+{
+  vdpau_h264_alter_decoder_t *this = (vdpau_h264_alter_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t *) & this->sequence;
+
+#ifdef MAKE_DAT
+  fwrite (buf->content, 1, buf->size, outfile);
+#endif
+
+  if (buf->decoder_flags & BUF_FLAG_FRAMERATE)
+  {
+    lprintf ("BUF_FLAG_FRAMERATE\n");
+    seq->video_step = buf->decoder_info[0];
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_ASPECT)
+  {
+    lprintf ("BUF_FLAG_ASPECT\n");
+    seq->ratio =
+      (double) buf->decoder_info[1] / (double) buf->decoder_info[2];
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER)
+  {
+    lprintf ("BUF_FLAG_STDHEADER\n");
+    seq->flag_header = 1;
+    xine_bmiheader *bih = (xine_bmiheader *) buf->content;
+    seq->coded_width = bih->biWidth;
+    seq->coded_height = bih->biHeight;
+    uint8_t *codec_private = buf->content + sizeof (xine_bmiheader);
+    if (bih->biSize > sizeof (xine_bmiheader))	/* else the length would wrap */
+      parse_codec_private (this, codec_private,
+			   bih->biSize - sizeof (xine_bmiheader));
+    return;
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_SPECIAL)
+  {
+    if (buf->decoder_info[1] == BUF_SPECIAL_DECODER_CONFIG)
+    {
+      lprintf ("BUF_SPECIAL_DECODER_CONFIG\n");
+      seq->flag_header = 1;
+      uint8_t *codec_private = buf->decoder_info_ptr[2];
+      uint32_t codec_private_len = buf->decoder_info[2];
+      if (codec_private_len > 0)
+	parse_codec_private (this, codec_private, codec_private_len);
+    }
+    return;
+  }
+
+  if (!buf->size)
+    return;
+
+  int size = seq->bufpos + buf->size;
+  if (seq->bufsize < size)
+  {
+    uint8_t *grown = NULL;
+    if (seq->bufsize <= MAX_BUFFER_SIZE)
+      grown = (uint8_t *) realloc (seq->buf, size + MIN_BUFFER_SIZE);
+    if (!grown)		/* over the cap, or realloc failed (old buffer kept) */
+    {
+      fprintf (stderr,
+	       "vdpau_h264_alter : sorry, can't accumulate so much data, broken stream?\n");
+      reset_sequence (seq);
+      return;
+    }
+    seq->buf = grown;
+    seq->bufsize = size + MIN_BUFFER_SIZE;
+    lprintf ("realloc new size = %d\n", seq->bufsize);
+  }
+  xine_fast_memcpy (seq->buf + seq->bufpos, buf->content, buf->size);
+  seq->bufpos += buf->size;
+  if (buf->decoder_flags & BUF_FLAG_FRAME_START)
+    seq->pic_pts = buf->pts;
+  int frame_end = buf->decoder_flags & BUF_FLAG_FRAME_END;
+  if (seq->mode_frame)
+  {
+    if (!seq->pic_pts)
+      seq->pic_pts = buf->pts;
+    if (frame_end)
+    {
+      if (buf->pts)
+	seq->pic_pts = buf->pts;
+      lprintf ("frame_end && seq->mode_frame\n");
+      uint32_t fhs, j = 0;
+      while (j + seq->frame_header_size < seq->bufpos)	/* prefix + >= 1 byte */
+      {
+	uint32_t s = 0;
+	for (fhs = 0; fhs < seq->frame_header_size; fhs++)
+	  s = (s << 8) | seq->buf[j + fhs];	/* big-endian NAL length */
+	if (s > seq->bufpos - j - seq->frame_header_size)
+	  s = seq->bufpos - j - seq->frame_header_size;	/* clamp to data held */
+	uint8_t tb = *(seq->buf + j + seq->frame_header_size) & 0x1F;
+	if (seq->slice_mode && (tb != seq->slice_mode))
+	{
+	  decode_picture (this);
+	  reset_slices (seq);
+	}
+	parse_startcodes (this, seq->buf + j + seq->frame_header_size, s);
+	j += seq->frame_header_size + s;
+      }
+      if (seq->slice_mode)
+      {
+	decode_picture (this);
+	reset_slices (seq);
+      }
+      seq->bufpos = 0;
+    }
+    return;
+  }
+
+  while (seq->bufpos >= 4 && (uint32_t) seq->bufseek <= seq->bufpos - 4)
+  {
+    uint8_t *buffer = seq->buf + seq->bufseek;
+    if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 1)
+    {
+      if (seq->start < 0)
+      {
+	seq->start = seq->bufseek;
+	uint8_t tb = buffer[3] & 0x1F;
+	if (((tb == NAL_SLICE_NO_IDR) || (tb == NAL_SLICE_IDR))
+	    && !seq->pic_pts)
+	  seq->pic_pts = buf->pts;
+	if (seq->slice_mode && (tb != seq->slice_mode))
+	{
+	  decode_picture (this);
+	  flush_buffer (seq);
+	}
+	if ((tb & 0x1F) == NAL_END_SEQUENCE)
+	{
+	  dpb_print (seq);
+	  dpb_draw_frames (this, MAX_POC, DPB_DRAW_CLEAR);
+	  lprintf ("NAL_END_SEQUENCE\n");
+	  dpb_print (seq);
+	}
+      }
+      else
+      {
+	parse_startcodes (this, seq->buf + seq->start + 3,
+			  seq->bufseek - seq->start - 3);
+	seq->start = -1;
+	--seq->bufseek;
+      }
+    }
+    ++seq->bufseek;
+  }
+
+  if (frame_end && seq->flag_header && (seq->start > -1)
+      && (seq->bufseek > seq->start))
+  {
+    lprintf ("frame_end && seq->start\n");
+    seq->bufseek = seq->bufpos;
+    parse_startcodes (this, seq->buf + seq->start + 3,
+		      seq->bufseek - seq->start - 3);
+    if (seq->slice_mode)
+      decode_picture (this);
+    flush_buffer (seq);
+  }
+}
+
+
+/*
+ * This function is called when xine needs to flush the system.
+ */
+static void
+vdpau_h264_alter_flush (video_decoder_t * this_gen)
+{
+  vdpau_h264_alter_decoder_t *this = (vdpau_h264_alter_decoder_t *) this_gen;
+
+  lprintf ("vdpau_h264_alter_flush\n");	/* debug trace only, not stdout */
+  dpb_draw_frames (this, MAX_POC, DPB_DRAW_REFS);
+}
+
+
+/*
+ * Reset hook of the video decoder: hands the sequence state to reset_sequence().
+ */
+static void
+vdpau_h264_alter_reset (video_decoder_t * this_gen)
+{
+  vdpau_h264_alter_decoder_t *decoder = (vdpau_h264_alter_decoder_t *) this_gen;
+  sequence_t *seq = &decoder->sequence;
+  lprintf ("vdpau_h264_alter_reset\n");
+  reset_sequence (seq);
+}
+
+
+/*
+ * The decoder should forget any stored pts values here.
+ */
+static void
+vdpau_h264_alter_discontinuity (video_decoder_t * this_gen)
+{
+  vdpau_h264_alter_decoder_t *this = (vdpau_h264_alter_decoder_t *) this_gen;
+  lprintf ("vdpau_h264_alter_discontinuity\n");	/* debug trace only, not stdout */
+
+  dpb_clear_all_pts (&this->sequence);
+  this->sequence.reset = VO_NEW_SEQUENCE_FLAG;
+}
+
+
+/*
+ * This function frees the video decoder instance allocated to the decoder.
+ */
+static void
+vdpau_h264_alter_dispose (video_decoder_t * this_gen)
+{
+
+  vdpau_h264_alter_decoder_t *this = (vdpau_h264_alter_decoder_t *) this_gen;
+
+  lprintf ("vdpau_h264_alter_dispose\n");
+
+  if ((this->decoder != VDP_INVALID_HANDLE) && this->sequence.accel_vdpau)
+  {
+    this->sequence.accel_vdpau->vdp_decoder_destroy (this->decoder);
+    this->decoder = VDP_INVALID_HANDLE;
+  }
+  reset_sequence (&this->sequence);
+
+  int i;
+  for (i = 0; i < MAX_DPB_SIZE; i++)
+    free (this->sequence.dpb[i]);
+  /* free (NULL) is a no-op, so the slots need no individual test */
+  for (i = 0; i < 32; i++)
+    free (this->sequence.seq_param[i]);
+  /* pic_param[] has 256 slots; stopping at 255 leaked the last one */
+  for (i = 0; i < 256; i++)
+    free (this->sequence.pic_param[i]);
+
+  this->stream->video_out->close (this->stream->video_out, this->stream);
+
+  free (this->sequence.buf);
+  free (this_gen);
+}
+
+
+/*
+ * Allocates, initializes, and returns a private video decoder structure.
+ * Returns NULL when VDPAU H264 decoding is unavailable or memory runs out.
+ */
+static video_decoder_t *
+open_plugin (video_decoder_class_t * class_gen, xine_stream_t * stream)
+{
+  vdpau_h264_alter_decoder_t *this;
+
+  /* the videoout must be vdpau-capable to support this decoder */
+  if (!(stream->video_driver->get_capabilities (stream->video_driver)
+	& VO_CAP_VDPAU_H264))
+    return NULL;
+
+  /* now check if vdpau has free decoder resource */
+  vo_frame_t *img =
+    stream->video_out->get_frame (stream->video_out, 1920, 1080, 1,
+				  XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS);
+  vdpau_accel_t *accel = (vdpau_accel_t *) img->accel_data;
+  int runtime_nr = accel->vdp_runtime_nr;
+  img->free (img);
+  VdpDecoder decoder;
+  VdpStatus st = accel->vdp_decoder_create (accel->vdp_device,
+					    VDP_DECODER_PROFILE_H264_MAIN,
+					    1920, 1080, 16, &decoder);
+  if (st != VDP_STATUS_OK)
+  {
+    fprintf (stderr, "can't create vdpau decoder!\n");
+    return NULL;
+  }
+  accel->vdp_decoder_destroy (decoder);
+
+  this = calloc (1, sizeof (*this));
+  if (!this)
+    return NULL;
+  this->video_decoder.decode_data = vdpau_h264_alter_decode_data;
+  this->video_decoder.flush = vdpau_h264_alter_flush;
+  this->video_decoder.reset = vdpau_h264_alter_reset;
+  this->video_decoder.discontinuity = vdpau_h264_alter_discontinuity;
+  this->video_decoder.dispose = vdpau_h264_alter_dispose;
+  this->stream = stream;
+  this->class = (vdpau_h264_alter_class_t *) class_gen;
+  /* allocate the DPB slots and the accumulation buffer; undo all on failure */
+  int i, failed = 0;
+  for (i = 0; i < 16; i++)
+    if (!(this->sequence.dpb[i] = calloc (1, sizeof (dpb_frame_t))))
+      failed = 1;
+  this->sequence.bufsize = MIN_BUFFER_SIZE;
+  this->sequence.buf = (uint8_t *) malloc (this->sequence.bufsize);
+  if (failed || !this->sequence.buf)
+  {
+    for (i = 0; i < 16; i++)
+      free (this->sequence.dpb[i]);
+    free (this->sequence.buf);
+    free (this);
+    return NULL;
+  }
+  this->sequence.vdp_runtime_nr = runtime_nr;
+  this->sequence.reset = VO_NEW_SEQUENCE_FLAG;
+  this->sequence.ratio = 0.0;
+  this->sequence.video_step = 3600;
+  this->sequence.coded_width = 1280;
+  this->sequence.coded_height = 720;
+  this->sequence.reported_ratio = 0.0;
+  this->sequence.reported_video_step = 0;
+  this->sequence.reported_coded_width = 0;
+  this->sequence.reported_coded_height = 0;
+  this->sequence.frame_header_size = 4;
+  this->sequence.flag_header = 0;
+  this->sequence.mode_frame = 0;
+  reset_sequence (&this->sequence);
+  this->decoder = VDP_INVALID_HANDLE;
+  this->sequence.accel_vdpau = NULL;
+
+  (stream->video_out->open) (stream->video_out, stream);
+
+#ifdef MAKE_DAT
+  outfile = fopen ("/tmp/h264.dat", "w");
+  nframes = 0;
+#endif
+
+  return &this->video_decoder;
+}
+
+
+/*
+ * This function allocates a private video decoder class and initializes
+ * the class's member functions. Returns NULL if the allocation fails.
+ */
+static void *
+init_plugin (xine_t * xine, void *data)
+{
+
+  vdpau_h264_alter_class_t *this;
+
+  this = calloc (1, sizeof (*this));
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin = open_plugin;
+  this->decoder_class.identifier = "vdpau_h264_alter";
+  this->decoder_class.description =
+    N_
+    ("vdpau_h264_alter: H264 decoder plugin using VDPAU hardware decoding.\n"
+     "Must be used along with video_out_vdpau.");
+  this->decoder_class.dispose = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+
+/*
+ * This is a list of all of the internal xine video buffer types that
+ * this decoder is able to handle. Check src/xine-engine/buffer.h for a
+ * list of valid buffer types (and add a new one if the one you need does
+ * not exist). Terminate the list with a 0.
+ */
+static const uint32_t video_types[] = {
+  BUF_VIDEO_H264,		/* the only buffer type this decoder accepts */
+  0				/* list terminator */
+};
+
+
+/*
+ * This data structure combines the list of supported xine buffer types and
+ * the priority that the plugin should be given with respect to other
+ * plugins that handle the same buffer type. A plugin with priority (n+1)
+ * will be used instead of a plugin with priority (n).
+ */
+static const decoder_info_t dec_info_video = {
+  video_types,			/* supported types (0-terminated list) */
+  9				/* priority: a higher value is preferred */
+};
+
+
+/*
+ * The plugin catalog entry. This is the only information that this plugin
+ * will export to the public.
+ */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* { type, API, "name", version, special_info, init_function } */
+  {PLUGIN_VIDEO_DECODER, 19, "vdpau_h264_alter", XINE_VERSION_CODE,
+   &dec_info_video, init_plugin},
+  {PLUGIN_NONE, 0, "", 0, NULL, NULL}	/* PLUGIN_NONE entry closes the list */
+};
diff --git a/src/video_dec/libvdpau/alterh264_decode.h b/src/video_dec/libvdpau/alterh264_decode.h
new file mode 100644
index 000000000..88f5e638f
--- /dev/null
+++ b/src/video_dec/libvdpau/alterh264_decode.h
@@ -0,0 +1,339 @@
+/* kate: tab-indent on; indent-width 4; mixedindent off; indent-mode cstyle; remove-trailing-space on; */
+#ifndef ALTERH264_DECODE_H
+#define ALTERH264_DECODE_H
+
+//#define LOG
+#define LOG_MODULE "vdpau_h264"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "accel_vdpau.h"
+#include <vdpau/vdpau.h>
+
+#include "alterh264_bits_reader.h"
+
+
+
+enum aspect_ratio		/* presumably the VUI aspect_ratio_idc codes -- TODO confirm against the spec */
+{
+  ASPECT_UNSPECIFIED = 0,
+  ASPECT_1_1,
+  ASPECT_12_11,
+  ASPECT_10_11,
+  ASPECT_16_11,
+  ASPECT_40_33,
+  ASPECT_24_11,
+  ASPECT_20_11,
+  ASPECT_32_11,
+  ASPECT_80_33,
+  ASPECT_18_11,
+  ASPECT_15_11,
+  ASPECT_64_33,
+  ASPECT_160_99,
+  ASPECT_4_3,
+  ASPECT_3_2,
+  ASPECT_2_1,
+  ASPECT_RESERVED,		/* implicit value 17 */
+  ASPECT_EXTENDED_SAR = 255	/* name suggests explicit sar_width/sar_height -- confirm */
+};
+
+
+
+static const uint8_t zigzag_4x4[16] = {	/* permutation of 0..15; presumably scan position -> raster index */
+  0, 1, 4, 8,
+  5, 2, 3, 6,
+  9, 12, 13, 10,
+  7, 11, 14, 15
+};
+
+static const uint8_t zigzag_8x8[64] = {	/* permutation of 0..63; presumably scan position -> raster index */
+  0, 1, 8, 16, 9, 2, 3, 10,
+  17, 24, 32, 25, 18, 11, 4, 5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13, 6, 7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63
+};
+
+static const uint8_t default_4x4_intra[16] = {	/* presumably the H.264 default 4x4 intra scaling list -- confirm */
+  6, 13, 13, 20,
+  20, 20, 28, 28,
+  28, 28, 32, 32,
+  32, 37, 37, 42
+};
+
+static const uint8_t default_4x4_inter[16] = {	/* presumably the H.264 default 4x4 inter scaling list -- confirm */
+  10, 14, 14, 20,
+  20, 20, 24, 24,
+  24, 24, 27, 27,
+  27, 30, 30, 34
+};
+
+static const uint8_t default_8x8_intra[64] = {	/* presumably the H.264 default 8x8 intra scaling list -- confirm */
+  6, 10, 10, 13, 11, 13, 16, 16,
+  16, 16, 18, 18, 18, 18, 18, 23,
+  23, 23, 23, 23, 23, 25, 25, 25,
+  25, 25, 25, 25, 27, 27, 27, 27,
+  27, 27, 27, 27, 29, 29, 29, 29,
+  29, 29, 29, 31, 31, 31, 31, 31,
+  31, 33, 33, 33, 33, 33, 36, 36,
+  36, 36, 38, 38, 38, 40, 40, 42
+};
+
+static const uint8_t default_8x8_inter[64] = {	/* presumably the H.264 default 8x8 inter scaling list -- confirm */
+  9, 13, 13, 15, 13, 15, 17, 17,
+  17, 17, 19, 19, 19, 19, 19, 21,
+  21, 21, 21, 21, 21, 22, 22, 22,
+  22, 22, 22, 22, 24, 24, 24, 24,
+  24, 24, 24, 24, 25, 25, 25, 25,
+  25, 25, 25, 27, 27, 27, 27, 27,
+  27, 28, 28, 28, 28, 28, 30, 30,
+  30, 30, 32, 32, 32, 33, 33, 35
+};
+
+
+
+typedef struct			/* VUI fields kept per SPS; embedded as seq_param_t.vui */
+{
+  uint8_t aspect_ratio_info;
+  uint8_t aspect_ratio_idc;	/* presumably one of enum aspect_ratio -- TODO confirm */
+  uint16_t sar_width;
+  uint16_t sar_height;
+  uint8_t colour_desc;
+  uint8_t colour_primaries;
+  uint8_t timing_info;
+  uint32_t num_units_in_tick;
+  uint32_t time_scale;
+} vui_param_t;
+
+
+
+typedef struct			/* one sequence parameter set; sequence_t.seq_param[] holds up to 32 */
+{
+  uint8_t profile_idc;
+  uint8_t level_idc;
+  uint8_t seq_parameter_set_id;
+  uint8_t constraint_set0_flag;
+  uint8_t constraint_set1_flag;
+  uint8_t constraint_set2_flag;
+  uint8_t constraint_set3_flag;
+  uint8_t chroma_format_idc;
+  uint8_t separate_colour_plane_flag;
+  uint8_t bit_depth_luma_minus8;
+  uint8_t bit_depth_chroma_minus8;
+  uint8_t qpprime_y_zero_transform_bypass_flag;
+  uint8_t seq_scaling_matrix_present_flag;
+  uint8_t scaling_lists_4x4[6][16];
+  uint8_t scaling_lists_8x8[2][64];
+  uint8_t log2_max_frame_num_minus4;
+  uint8_t pic_order_cnt_type;	/* decode_picture draws reference frames early when this is 2 */
+  uint8_t log2_max_pic_order_cnt_lsb_minus4;
+  uint8_t delta_pic_order_always_zero_flag;
+  int32_t offset_for_non_ref_pic;
+  int32_t offset_for_top_to_bottom_field;
+  uint8_t num_ref_frames_in_pic_order_cnt_cycle;
+  int32_t offset_for_ref_frame[256];	/* presumably indexed below num_ref_frames_in_pic_order_cnt_cycle */
+  uint8_t num_ref_frames;
+  uint8_t gaps_in_frame_num_value_allowed_flag;
+  uint8_t pic_width_in_mbs_minus1;
+  uint8_t pic_height_in_map_units_minus1;
+  uint8_t frame_mbs_only_flag;
+  uint8_t mb_adaptive_frame_field_flag;
+  uint8_t direct_8x8_inference_flag;
+  uint8_t frame_cropping_flag;
+  uint16_t frame_crop_left_offset;
+  uint16_t frame_crop_right_offset;
+  uint16_t frame_crop_top_offset;
+  uint16_t frame_crop_bottom_offset;
+  uint8_t vui_parameters_present_flag;
+  vui_param_t vui;
+} seq_param_t;
+
+
+
+typedef struct			/* one picture parameter set; sequence_t.pic_param[] holds up to 256 */
+{
+  uint8_t pic_parameter_set_id;
+  uint8_t seq_parameter_set_id;	/* used by decode_picture as index into sequence_t.seq_param[] */
+  uint8_t entropy_coding_mode_flag;
+  uint8_t pic_order_present_flag;
+  /*uint8_t num_slice_groups_minus1;
+     uint8_t slice_group_map_type;
+     uint16_t run_length_minus1[64];
+     uint16_t top_left[64];
+     uint16_t bottom_right[64];
+     uint8_t slice_group_change_direction_flag;
+     uint16_t slice_group_change_rate_minus1;
+     uint16_t pic_size_in_map_units_minus1;
+     uint8_t slice_group_id[64]; */
+  uint8_t num_ref_idx_l0_active_minus1;
+  uint8_t num_ref_idx_l1_active_minus1;
+  uint8_t weighted_pred_flag;
+  uint8_t weighted_bipred_idc;
+  int8_t pic_init_qp_minus26;
+  int8_t pic_init_qs_minus26;
+  int8_t chroma_qp_index_offset;
+  uint8_t deblocking_filter_control_present_flag;
+  uint8_t constrained_intra_pred_flag;
+  uint8_t redundant_pic_cnt_present_flag;
+  uint8_t transform_8x8_mode_flag;
+  uint8_t pic_scaling_matrix_present_flag;
+  uint8_t pic_scaling_list_present_flag[8];
+  uint8_t scaling_lists_4x4[6][16];
+  uint8_t scaling_lists_8x8[2][64];
+  int8_t second_chroma_qp_index_offset;
+} pic_param_t;
+
+
+
+typedef struct			/* slice header values; stored as sequence_t.slice_param */
+{
+  uint8_t nal_ref_idc;
+  uint8_t nal_unit_type;	/* decode_picture treats value 5 as an IDR picture */
+  uint8_t slice_type;
+  uint8_t pic_parameter_set_id;	/* index into sequence_t.pic_param[] */
+  uint16_t frame_num;
+  uint32_t MaxFrameNum;
+  uint8_t field_pic_flag;
+  uint8_t bottom_field_flag;
+  uint16_t idr_pic_id;
+  uint16_t pic_order_cnt_lsb;
+  int32_t delta_pic_order_cnt_bottom;
+  int32_t delta_pic_order_cnt[2];
+  uint8_t redundant_pic_cnt;
+  uint8_t num_ref_idx_l0_active_minus1;
+  uint8_t num_ref_idx_l1_active_minus1;
+} slice_param_t;
+
+
+#define PICTURE_TOP_DONE    1
+#define PICTURE_BOTTOM_DONE 2
+#define PICTURE_DONE        3
+
+#define SHORT_TERM_REF 1
+#define LONG_TERM_REF  2
+
+typedef struct			/* one picture: sequence_t.cur_pic and each sequence_t.dpb[] slot */
+{
+  uint8_t used;
+  uint8_t missing_header;	/* set by decode_picture after it skips a wrong field */
+  int64_t pts;
+  uint8_t drop_pts;
+  uint8_t completed;		/* compared against PICTURE_TOP_DONE / PICTURE_BOTTOM_DONE / PICTURE_DONE */
+  uint8_t top_field_first;
+  uint16_t FrameNum;
+  int32_t FrameNumWrap;
+  int32_t PicNum[2];		/* 0:top, 1:bottom */
+  uint8_t is_reference[2];	/* 0:top, 1:bottom, short or long term */
+  uint8_t field_pic_flag;
+  int32_t PicOrderCntMsb;
+  int32_t TopFieldOrderCnt;
+  int32_t BottomFieldOrderCnt;
+  uint16_t pic_order_cnt_lsb;
+  uint8_t mmc5;
+
+  vo_frame_t *videoSurface;
+} dpb_frame_t;
+
+
+
+typedef struct			/* location of one slice NAL inside sequence_t.buf */
+{
+  uint32_t buf_offset;		/* parse_startcodes stores (NAL pointer - sequence->buf) here */
+  uint32_t len;			/* NAL length in bytes as passed to parse_startcodes */
+} slice_t;
+
+
+
+typedef struct			/* all parsing and decoding state of one decoder instance */
+{
+  uint32_t coded_width;
+  uint32_t reported_coded_width;
+  uint32_t coded_height;
+  uint32_t reported_coded_height;
+  uint64_t video_step;		/* frame duration in pts units */
+  uint64_t reported_video_step;	/* frame duration in pts units */
+  double ratio;
+  double reported_ratio;
+
+  slice_t slices[68];		/* slice NALs of the picture being collected */
+  int slices_count;		/* number of slices[] entries filled so far */
+  int slice_mode;		/* NAL type of the collected slices; tested as 0 = none */
+
+  seq_param_t *seq_param[32];
+  pic_param_t *pic_param[256];
+  slice_param_t slice_param;
+
+  dpb_frame_t *dpb[16];
+  dpb_frame_t cur_pic;
+  uint16_t prevFrameNum;
+  uint16_t prevFrameNumOffset;
+  uint8_t prevMMC5;
+
+  VdpColorStandard color_standard;
+  int chroma;
+  int top_field_first;
+  VdpDecoderProfile profile;
+
+  uint8_t *buf;			/* accumulate data */
+  int bufseek;			/* start-code scan position inside buf */
+  uint32_t bufsize;		/* allocated size of buf */
+  uint32_t bufpos;		/* number of bytes currently stored in buf */
+  int start;			/* offset of the pending start code, -1 when none */
+
+  int64_t pic_pts;
+
+  bits_reader_t br;
+
+  int vdp_runtime_nr;
+  vdpau_accel_t *accel_vdpau;
+
+  int reset;
+  int startup_frame;
+
+  uint8_t mode_frame;		/* set by parse_codec_private: NALs arrive length-prefixed */
+  uint8_t flag_header;		/* set once a header / decoder-config buffer was seen */
+  uint32_t frame_header_size;	/* bytes per NAL length prefix in mode_frame */
+
+} sequence_t;
+
+
+
+typedef struct			/* plugin class object allocated by init_plugin */
+{
+  video_decoder_class_t decoder_class;
+} vdpau_h264_alter_class_t;
+
+
+
+typedef struct vdpau_mpeg12_decoder_s	/* NOTE(review): tag says mpeg12 but this is the H264 decoder -- likely copy/paste */
+{
+  video_decoder_t video_decoder;	/* parent video decoder structure */
+
+  vdpau_h264_alter_class_t *class;
+  xine_stream_t *stream;
+
+  sequence_t sequence;
+
+  VdpDecoder decoder;		/* VDP_INVALID_HANDLE while no VDPAU decoder exists */
+  VdpDecoderProfile decoder_profile;
+  uint32_t decoder_width;
+  uint32_t decoder_height;
+
+} vdpau_h264_alter_decoder_t;
+
+#endif /* ALTERH264_DECODE_H */
diff --git a/src/video_dec/libvdpau/bits_reader.h b/src/video_dec/libvdpau/bits_reader.h
new file mode 100644
index 000000000..db7cdfc7e
--- /dev/null
+++ b/src/video_dec/libvdpau/bits_reader.h
@@ -0,0 +1,82 @@
+#include <sys/types.h>
+
+
+
+typedef struct {		/* MSB-first bit reader over a byte buffer */
+  uint8_t *buffer, *start;	/* buffer: current byte; start: first byte of the data */
+  int      offbits, length, oflow;	/* bits used in *buffer; data size in bytes; overrun flag */
+} bits_reader_t;
+
+
+
+/* Point the reader at buf[0..len) and clear its bit offset and overflow flag. */
+static void bits_reader_set( bits_reader_t *br, uint8_t *buf, int len )
+{
+  *br = (bits_reader_t) {
+    .buffer = buf, .start = buf, .offbits = 0, .length = len, .oflow = 0
+  };
+}
+
+
+
+/* Read nbits (1..32) MSB-first at the current position and advance past them. */
+static uint32_t read_bits( bits_reader_t *br, int nbits )
+{
+  int i, nbytes;
+  uint64_t acc = 0;
+
+  if ( nbits <= 0 || nbits > 32 )
+    return 0;
+  /* bytes touched by the offbits already consumed plus the nbits wanted */
+  nbytes = (br->offbits + nbits + 7)/8;
+  if ( (br->start + br->length) - br->buffer < nbytes ) {
+    br->oflow = 1;
+    return 0;
+  }
+  /* 64-bit accumulator: up to 5 bytes are needed when offbits > 0 */
+  for ( i=0; i<nbytes; i++ )
+    acc = (acc<<8) | br->buffer[i];
+  acc >>= (nbytes*8) - nbits - br->offbits;
+
+  br->offbits += nbits;
+  br->buffer += br->offbits / 8;
+  br->offbits %= 8;
+
+  return (uint32_t)(acc & (0xFFFFFFFFu >> (32 - nbits)));
+}
+
+
+
+/* Advance the read position by nbits; sets oflow once it passes the data end. */
+static void skip_bits( bits_reader_t *br, int nbits )
+{
+  int total = br->offbits + nbits;	/* bit offset counted from br->buffer */
+  br->buffer += total / 8;
+  br->offbits = total % 8;
+  if ( br->buffer > (br->start + br->length) )
+    br->oflow = 1;
+}
+
+
+
+/* Return the next nbits (1..32) MSB-first without moving the read position. */
+static uint32_t get_bits( bits_reader_t *br, int nbits )
+{
+  int i, nbytes;
+  uint64_t acc = 0;
+
+  if ( nbits <= 0 || nbits > 32 )
+    return 0;
+  /* bytes touched by the offbits already consumed plus the nbits wanted */
+  nbytes = (br->offbits + nbits + 7)/8;
+  if ( (br->start + br->length) - br->buffer < nbytes ) {
+    br->oflow = 1;
+    return 0;
+  }
+  /* 64-bit accumulator: up to 5 bytes are needed when offbits > 0 */
+  for ( i=0; i<nbytes; i++ )
+    acc = (acc<<8) | br->buffer[i];
+  acc >>= (nbytes*8) - nbits - br->offbits;
+
+  return (uint32_t)(acc & (0xFFFFFFFFu >> (32 - nbits)));
+}
diff --git a/src/video_dec/libvdpau/cpb.c b/src/video_dec/libvdpau/cpb.c
new file mode 100644
index 000000000..d06238e5c
--- /dev/null
+++ b/src/video_dec/libvdpau/cpb.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2009 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * cpb.c: Coded Picture Buffer
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "cpb.h"
+
+#include <stdlib.h>
+
+/* Returns a zero-filled coded_picture, or NULL when calloc fails. */
+struct coded_picture* create_coded_picture(void)
+{
+  return calloc(1, sizeof(struct coded_picture));
+}
+
+/* Release the four NAL units attached to pic, then pic itself; NULL is ignored. */
+void free_coded_picture(struct coded_picture *pic)
+{
+  if(pic != NULL) {
+    /* same release order as before: SEI, SPS, PPS, slice */
+    release_nal_unit(pic->sei_nal);
+    release_nal_unit(pic->sps_nal);
+    release_nal_unit(pic->pps_nal);
+    release_nal_unit(pic->slc_nal);
+    free(pic);
+  }
+}
+
diff --git a/src/video_dec/libvdpau/cpb.h b/src/video_dec/libvdpau/cpb.h
new file mode 100644
index 000000000..37dbd94cf
--- /dev/null
+++ b/src/video_dec/libvdpau/cpb.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2009 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * cpb.h: Coded Picture Buffer
+ */
+
+#ifndef CPB_H_
+#define CPB_H_
+
+#include "nal.h"
+
+enum picture_flags {		/* single-bit values; presumably OR-ed into coded_picture.flag_mask */
+  IDR_PIC = 0x01,
+  REFERENCE = 0x02,
+  NOT_EXISTING = 0x04,
+  INTERLACED = 0x08
+};
+
+struct coded_picture		/* created by create_coded_picture(), destroyed by free_coded_picture() */
+{
+  uint32_t flag_mask;		/* presumably a combination of enum picture_flags bits */
+
+  uint32_t max_pic_num;
+  int32_t pic_num;
+
+  uint8_t used_for_long_term_ref;
+  uint32_t long_term_pic_num;
+  uint32_t long_term_frame_idx;
+
+  int32_t top_field_order_cnt;
+  int32_t bottom_field_order_cnt;
+
+  uint8_t repeat_pic;
+
+  /* buffer data for the image slices, which
+   * are passed to the decoder
+   */
+  uint32_t slice_cnt;
+
+  int64_t pts;
+
+  struct nal_unit *sei_nal;	/* the four NAL pointers are released by free_coded_picture() */
+  struct nal_unit *sps_nal;
+  struct nal_unit *pps_nal;
+  struct nal_unit *slc_nal;
+};
+
+struct coded_picture* create_coded_picture(void);
+void free_coded_picture(struct coded_picture *pic);
+
+#endif /* CPB_H_ */
diff --git a/src/video_dec/libvdpau/dpb.c b/src/video_dec/libvdpau/dpb.c
new file mode 100644
index 000000000..c2afd42ba
--- /dev/null
+++ b/src/video_dec/libvdpau/dpb.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * dpb.c: Implementing Decoded Picture Buffer
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpb.h"
+#include "dpb.h"
+#include "nal.h"
+
+#include "h264_parser.h"
+
+#include "accel_vdpau.h"
+
+#include <xine/video_out.h>
+
+//#define DEBUG_DPB
+
+int dp_top_field_first(struct decoded_picture *decoded_pic)
+{
+  /* Returns 1 when the top field of decoded_pic is to be shown first,
+   * 0 when the bottom field is. The default is top field first. */
+  struct coded_picture *first = decoded_pic->coded_pic[0];
+  struct coded_picture *second = decoded_pic->coded_pic[1];
+  int tff = 1;
+
+  /* field pair of opposite parity: the lower order count goes first */
+  if (second != NULL) {
+    int first_is_bottom = first->slc_nal->slc.bottom_field_flag;
+    int second_is_bottom = second->slc_nal->slc.bottom_field_flag;
+    if (!first_is_bottom && second_is_bottom) {
+      if (first->top_field_order_cnt != second->bottom_field_order_cnt)
+        tff = first->top_field_order_cnt < second->bottom_field_order_cnt;
+    } else if (first_is_bottom && !second_is_bottom) {
+      if (first->bottom_field_order_cnt != second->top_field_order_cnt)
+        tff = first->bottom_field_order_cnt > second->top_field_order_cnt;
+    }
+  }
+
+  /* a pic_struct from the SEI picture timing overrides the guess above */
+  if ((first->flag_mask & PIC_STRUCT_PRESENT) && first->sei_nal != NULL) {
+    uint8_t shown = first->sei_nal->sei.pic_timing.pic_struct;
+    if (shown == DISP_TOP_BOTTOM || shown == DISP_TOP_BOTTOM_TOP)
+      tff = 1;
+    else if (shown == DISP_BOTTOM_TOP || shown == DISP_BOTTOM_TOP_BOTTOM)
+      tff = 0;
+    else if (shown == DISP_FRAME)
+      tff = 1;
+  }
+
+  return tff;
+}
+
+/**
+ * ----------------------------------------------------------------------------
+ * decoded picture
+ * ----------------------------------------------------------------------------
+ */
+
+void free_decoded_picture(struct decoded_picture *pic);
+
+struct decoded_picture* init_decoded_picture(struct coded_picture *cpic, vo_frame_t *img)
+{
+  /* New picture for cpic/img holding one lock; NULL if allocation fails. */
+  struct decoded_picture *pic = calloc(1, sizeof(*pic));
+  if (pic == NULL)
+    return NULL;
+  pic->coded_pic[0] = cpic;
+  decoded_pic_check_reference(pic);
+  pic->img = img;
+  pic->lock_counter = 1;
+  return pic;
+}
+
+void decoded_pic_check_reference(struct decoded_picture *pic)
+{
+  /* A reference frame marks both fields, a reference field only its own
+   * parity, so field pairs work in either order (fixes the old TFF-only code).
+   * Coded pictures without REFERENCE leave the flags untouched. */
+  int i;
+  for(i = 0; i < 2; i++) {
+    struct coded_picture *cpic = pic->coded_pic[i];
+    if(cpic && (cpic->flag_mask & REFERENCE)) {
+      int frame = !cpic->slc_nal->slc.field_pic_flag;
+      int bottom = cpic->slc_nal->slc.bottom_field_flag;
+      /* the first picture also clears the opposite parity, as before */
+      if(frame || !bottom || i == 0) pic->top_is_reference = frame || !bottom;
+      if(frame || bottom || i == 0) pic->bottom_is_reference = frame || bottom;
+    }
+  }
+}
+
+void decoded_pic_add_field(struct decoded_picture *pic,
+    struct coded_picture *cpic)
+{
+  /* attach cpic as the second coded picture, then refresh the reference flags */
+  pic->coded_pic[1] = cpic;
+  decoded_pic_check_reference(pic);
+}
+
+void release_decoded_picture(struct decoded_picture *pic)
+{
+  /* Drop one lock and free the picture once the counter reaches zero;
+   * a NULL picture is ignored. lock_counter is unsigned, so the "<= 0"
+   * test below can only ever be true for exactly zero. */
+  if(pic == NULL)
+    return;
+
+  if(--pic->lock_counter <= 0) {
+    free_decoded_picture(pic);
+  }
+}
+
+void lock_decoded_picture(struct decoded_picture *pic)
+{
+  /* Take one more lock on pic; release_decoded_picture() drops it again.
+   * A NULL picture is ignored. */
+  if(pic != NULL) {
+    pic->lock_counter++;
+  }
+}
+
+void free_decoded_picture(struct decoded_picture *pic)
+{
+  /* Destroy pic regardless of its lock counter: hand the image back via its
+   * free() callback, free both coded pictures, then the picture itself. */
+  int field;
+  if(pic == NULL)
+    return;
+  if(pic->img)
+    pic->img->free(pic->img);
+  for(field = 1; field >= 0; field--) {
+    free_coded_picture(pic->coded_pic[field]);
+    pic->coded_pic[field] = NULL;
+  }
+  free(pic);
+}
+
+
+
+
+/**
+ * ----------------------------------------------------------------------------
+ * dpb code starting here
+ * ----------------------------------------------------------------------------
+ */
+
+struct dpb* create_dpb(void)
+{
+  /* Empty DPB with both limits at MAX_DPB_COUNT; NULL if out of memory. */
+  struct dpb *dpb = calloc(1, sizeof(*dpb));
+  if(dpb == NULL)
+    return NULL;
+  dpb->output_list = xine_list_new();
+  dpb->reference_list = xine_list_new();
+  dpb->max_reorder_frames = MAX_DPB_COUNT;
+  dpb->max_dpb_frames = MAX_DPB_COUNT;
+  return dpb;
+}
+
+int dpb_total_frames(struct dpb *dpb)
+{
+  /* Count distinct pictures: everything queued for output plus the
+   * reference pictures that are not on the output list as well. */
+  xine_list_iterator_t it;
+  int total = xine_list_size(dpb->output_list);
+
+  for(it = xine_list_front(dpb->reference_list); it;
+      it = xine_list_next(dpb->reference_list, it)) {
+    struct decoded_picture *ref = xine_list_get_value(dpb->reference_list, it);
+    if(!xine_list_find(dpb->output_list, ref))
+      total++;
+  }
+
+  return total;
+}
+
+void release_dpb(struct dpb *dpb)
+{
+  /* Drop every picture still held, then destroy both lists and the
+   * DPB itself. A NULL dpb is ignored. */
+  if(dpb != NULL) {
+    dpb_free_all(dpb);
+
+    xine_list_delete(dpb->output_list);
+    xine_list_delete(dpb->reference_list);
+    free(dpb);
+  }
+}
+
+struct decoded_picture* dpb_get_next_out_picture(struct dpb *dpb, int do_flush)
+{
+  struct decoded_picture *pic = NULL;;
+  struct decoded_picture *outpic = NULL;
+  /* Picks the next picture to output; it is neither removed nor locked here. */
+  if(!do_flush &&
+      xine_list_size(dpb->output_list) < dpb->max_reorder_frames &&
+      dpb_total_frames(dpb) < dpb->max_dpb_frames) {
+    return NULL; /* not flushing and below both limits: keep buffering */
+  }
+  /* walk the output list from its back to its front, tracking the candidate */
+  xine_list_iterator_t ite = xine_list_back(dpb->output_list);
+  while (ite) {
+    pic = xine_list_get_value(dpb->output_list, ite);
+    /* order counts of the candidate (0 while there is none) and of pic */
+    int32_t out_top_field_order_cnt = outpic != NULL ?
+        outpic->coded_pic[0]->top_field_order_cnt : 0;
+    int32_t top_field_order_cnt = pic->coded_pic[0]->top_field_order_cnt;
+    /* without a second coded picture the first one's top count stands in */
+    int32_t out_bottom_field_order_cnt = outpic != NULL ?
+        (outpic->coded_pic[1] != NULL ?
+          outpic->coded_pic[1]->bottom_field_order_cnt :
+          outpic->coded_pic[0]->top_field_order_cnt) : 0;
+    int32_t bottom_field_order_cnt = pic->coded_pic[1] != NULL ?
+              pic->coded_pic[1]->bottom_field_order_cnt :
+              pic->coded_pic[0]->top_field_order_cnt;
+    /* pic wins if: no candidate yet, both counts <= candidate's, candidate <= 0 < pic, or candidate is IDR */
+    if (outpic == NULL ||
+            (top_field_order_cnt <= out_top_field_order_cnt &&
+                bottom_field_order_cnt <= out_bottom_field_order_cnt) ||
+            (out_top_field_order_cnt <= 0 && top_field_order_cnt > 0 &&
+               out_bottom_field_order_cnt <= 0 && bottom_field_order_cnt > 0) ||
+            outpic->coded_pic[0]->flag_mask & IDR_PIC) {
+      outpic = pic;
+    }
+    /* NOTE(review): the "<= 0 < pic" clause presumably covers counts restarting - confirm */
+    ite = xine_list_prev(dpb->output_list, ite);
+  }
+
+  return outpic;
+}
+
+struct decoded_picture* dpb_get_picture(struct dpb *dpb, uint32_t picnum)
+{
+  /* Find the first reference picture one of whose coded pictures carries
+   * pic_num == picnum; NULL when there is none. No lock is taken. */
+  xine_list_iterator_t it;
+
+  for (it = xine_list_front(dpb->reference_list); it;
+      it = xine_list_next(dpb->reference_list, it)) {
+    struct decoded_picture *cand = xine_list_get_value(dpb->reference_list, it);
+    struct coded_picture *second = cand->coded_pic[1];
+
+    if (cand->coded_pic[0]->pic_num == picnum)
+      return cand;
+    if (second != NULL && second->pic_num == picnum)
+      return cand;
+  }
+
+  return NULL;
+}
+
+struct decoded_picture* dpb_get_picture_by_ltpn(struct dpb *dpb,
+    uint32_t longterm_picnum)
+{
+  /* Find the first reference picture with a coded picture whose
+   * long_term_pic_num matches; used_for_long_term_ref is not consulted. */
+  xine_list_iterator_t it;
+
+  for (it = xine_list_front(dpb->reference_list); it;
+      it = xine_list_next(dpb->reference_list, it)) {
+    struct decoded_picture *cand = xine_list_get_value(dpb->reference_list, it);
+    struct coded_picture *second = cand->coded_pic[1];
+
+    if (cand->coded_pic[0]->long_term_pic_num == longterm_picnum)
+      return cand;
+    if (second != NULL && second->long_term_pic_num == longterm_picnum)
+      return cand;
+  }
+
+  return NULL;
+}
+
+struct decoded_picture* dpb_get_picture_by_ltidx(struct dpb *dpb,
+    uint32_t longterm_idx)
+{
+  /* Find the first reference picture with a coded picture whose
+   * long_term_frame_idx matches; used_for_long_term_ref is not consulted. */
+  xine_list_iterator_t it;
+
+  for (it = xine_list_front(dpb->reference_list); it;
+      it = xine_list_next(dpb->reference_list, it)) {
+    struct decoded_picture *cand = xine_list_get_value(dpb->reference_list, it);
+    struct coded_picture *second = cand->coded_pic[1];
+
+    if (cand->coded_pic[0]->long_term_frame_idx == longterm_idx)
+      return cand;
+    if (second != NULL && second->long_term_frame_idx == longterm_idx)
+      return cand;
+  }
+
+  return NULL;
+}
+
+int dpb_set_unused_ref_picture_byltpn(struct dpb *dpb, uint32_t longterm_picnum)
+{
+  /* Clear used_for_long_term_ref on the coded pictures of the first reference
+   * picture matching longterm_picnum, and take that picture off the reference
+   * list once none of its coded pictures is a long-term reference any more.
+   * Returns 0 when a match was found, -1 otherwise. */
+  xine_list_iterator_t it;
+
+  for (it = xine_list_front(dpb->reference_list); it;
+      it = xine_list_next(dpb->reference_list, it)) {
+    struct decoded_picture *cand = xine_list_get_value(dpb->reference_list, it);
+    struct coded_picture *first = cand->coded_pic[0];
+    struct coded_picture *second = cand->coded_pic[1];
+    int hit = 0;
+
+    if (first->long_term_pic_num == longterm_picnum) {
+      first->used_for_long_term_ref = 0;
+      hit = 1;
+    }
+    if (second != NULL && second->long_term_pic_num == longterm_picnum) {
+      second->used_for_long_term_ref = 0;
+      hit = 1;
+    }
+    if (!hit)
+      continue;
+
+    /* returning right away keeps the iterator from being used after removal */
+    if (!first->used_for_long_term_ref &&
+        (second == NULL || !second->used_for_long_term_ref))
+      dpb_unmark_reference_picture(dpb, cand);
+    return 0;
+  }
+
+  return -1;
+}
+
+int dpb_set_unused_ref_picture_bylidx(struct dpb *dpb, uint32_t longterm_idx)
+{
+  /* Clear used_for_long_term_ref on the coded pictures of the first reference
+   * picture whose long_term_frame_idx matches longterm_idx, and take that
+   * picture off the reference list once none of its coded pictures is a
+   * long-term reference any more. Returns 0 on a match, -1 otherwise. */
+  xine_list_iterator_t it;
+
+  for (it = xine_list_front(dpb->reference_list); it;
+      it = xine_list_next(dpb->reference_list, it)) {
+    struct decoded_picture *cand = xine_list_get_value(dpb->reference_list, it);
+    struct coded_picture *first = cand->coded_pic[0];
+    struct coded_picture *second = cand->coded_pic[1];
+    int hit = 0;
+
+    if (first->long_term_frame_idx == longterm_idx) {
+      first->used_for_long_term_ref = 0;
+      hit = 1;
+    }
+    if (second != NULL && second->long_term_frame_idx == longterm_idx) {
+      second->used_for_long_term_ref = 0;
+      hit = 1;
+    }
+    if (!hit)
+      continue;
+
+    /* returning right away keeps the iterator from being used after removal */
+    if (!first->used_for_long_term_ref &&
+        (second == NULL || !second->used_for_long_term_ref))
+      dpb_unmark_reference_picture(dpb, cand);
+    return 0;
+  }
+
+  return -1;
+}
+
+int dpb_set_unused_ref_picture_lidx_gt(struct dpb *dpb, int32_t longterm_idx)
+{
+  /* Clear used_for_long_term_ref on every coded picture whose
+   * long_term_frame_idx is >= longterm_idx and unmark the reference pictures
+   * left without a long-term coded picture. Always returns -1, as before.
+   * NOTE(review): longterm_idx is compared as unsigned, so a negative value
+   * matches (almost) nothing - confirm callers never pass one. */
+  struct decoded_picture *pic = NULL;
+  xine_list_iterator_t ite = xine_list_front(dpb->reference_list);
+  while (ite) {
+    /* fetch the successor now: unmarking pic removes ite from the list */
+    xine_list_iterator_t next = xine_list_next(dpb->reference_list, ite);
+    uint8_t found = 0;
+    pic = xine_list_get_value(dpb->reference_list, ite);
+    if (pic->coded_pic[0]->long_term_frame_idx >= longterm_idx) {
+      pic->coded_pic[0]->used_for_long_term_ref = 0;
+      found = 1;
+    }
+    if ((pic->coded_pic[1] != NULL &&
+          pic->coded_pic[1]->long_term_frame_idx >= longterm_idx)) {
+      pic->coded_pic[1]->used_for_long_term_ref = 0;
+      found = 1;
+    }
+    if(found && !pic->coded_pic[0]->used_for_long_term_ref &&
+        (pic->coded_pic[1] == NULL ||
+            !pic->coded_pic[1]->used_for_long_term_ref)) {
+      dpb_unmark_reference_picture(dpb, pic);
+    }
+    ite = next;
+  }
+  return -1;
+}
+
+
+int dpb_unmark_picture_delayed(struct dpb *dpb, struct decoded_picture *pic)
+{
+  /* Take pic off the output list and drop the lock that list held.
+   * Returns 0 on success, -1 if pic is NULL or not on the list. */
+  xine_list_iterator_t pos;
+
+  if(pic == NULL)
+    return -1;
+  pos = xine_list_find(dpb->output_list, pic);
+  if(!pos)
+    return -1;
+  xine_list_remove(dpb->output_list, pos);
+  release_decoded_picture(pic);
+  return 0;
+}
+
+int dpb_unmark_reference_picture(struct dpb *dpb, struct decoded_picture *pic)
+{
+  /* Take pic off the reference list and drop the lock that list held.
+   * Returns 0 on success, -1 if pic is NULL or not on the list. */
+  xine_list_iterator_t pos;
+
+  if(pic == NULL)
+    return -1;
+  pos = xine_list_find(dpb->reference_list, pic);
+  if(!pos)
+    return -1;
+  xine_list_remove(dpb->reference_list, pos);
+  release_decoded_picture(pic);
+  return 0;
+}
+
+/*static int dpb_remove_picture_by_img(struct dpb *dpb, vo_frame_t *remimg)
+{
+  int retval = -1;
+  struct decoded_picture *pic = NULL;
+
+  xine_list_iterator_t ite = xine_list_front(dpb->output_list);
+  while (ite) {
+    pic = xine_list_get_value(dpb->output_list, ite);
+
+    if (pic->img == remimg) {
+      dpb_unmark_picture_delayed(dpb, pic);
+      dpb->used--;
+      retval = 0;
+    }
+
+    ite = xine_list_next(dpb->output_list, ite);
+  }
+
+  return retval;
+}*/
+
+
+int dpb_add_picture(struct dpb *dpb, struct decoded_picture *pic, uint32_t num_ref_frames)
+{
+  /* Queue pic for output and, if one of its coded pictures is flagged
+   * REFERENCE, also put it on the reference list. Each list takes its own
+   * lock on pic; the caller keeps the lock it already holds.
+   * When the reference list then holds more than num_ref_frames pictures the
+   * oldest entry is dropped (sliding window). Always returns 0.
+   *
+   * The debug output below is guarded with #ifdef because DEBUG_DPB is
+   * meant to be defined without a value, which "#if DEBUG_DPB" rejects. */
+
+  /* add the pic to the output picture list, as no
+   * pic would be immediately drawn.
+   * acquire a lock for this list
+   */
+  lock_decoded_picture(pic);
+  xine_list_push_back(dpb->output_list, pic);
+
+
+  /* check if the pic is a reference pic,
+   * if it is it should be added to the reference
+   * list. another lock has to be acquired in that case
+   */
+  if (pic->coded_pic[0]->flag_mask & REFERENCE ||
+      (pic->coded_pic[1] != NULL &&
+          pic->coded_pic[1]->flag_mask & REFERENCE)) {
+    lock_decoded_picture(pic);
+    xine_list_push_back(dpb->reference_list, pic);
+
+    /*
+     * always apply the sliding window reference removal, if more reference
+     * frames than expected are in the list. we will always remove the oldest
+     * reference frame
+     */
+    if(xine_list_size(dpb->reference_list) > num_ref_frames) {
+      struct decoded_picture *discard = xine_list_get_value(dpb->reference_list, xine_list_front(dpb->reference_list));
+      dpb_unmark_reference_picture(dpb, discard);
+    }
+  }
+
+#ifdef DEBUG_DPB
+  printf("DPB list sizes: Total: %2d, Output: %2d, Reference: %2d\n",
+      dpb_total_frames(dpb), (int)xine_list_size(dpb->output_list),
+      (int)xine_list_size(dpb->reference_list));
+#endif
+
+  return 0;
+}
+
+int dpb_flush(struct dpb *dpb)
+{
+  /* Empty the reference list, dropping the lock it held on every picture;
+   * the output list is not touched. Always returns 0. */
+  xine_list_iterator_t head;
+
+  /* CAUTION: the head is re-read on every round. Stepping on with
+   * xine_list_next() would not yield the expected item, as the current
+   * one has just been deleted.
+   */
+  for(head = xine_list_front(dpb->reference_list); head;
+      head = xine_list_front(dpb->reference_list)) {
+    dpb_unmark_reference_picture(dpb,
+        xine_list_get_value(dpb->reference_list, head));
+  }
+
+  return 0;
+}
+
+void dpb_free_all(struct dpb *dpb)
+{
+  /* Empty the output list and then the reference list, dropping the lock
+   * each list held on its pictures.
+   * Both loops re-read the list head on every round instead of stepping
+   * with xine_list_next(), because the current entry has just been removed. */
+  xine_list_iterator_t head;
+
+  for(head = xine_list_front(dpb->output_list); head;
+      head = xine_list_front(dpb->output_list)) {
+    dpb_unmark_picture_delayed(dpb,
+        xine_list_get_value(dpb->output_list, head));
+  }
+
+  for(head = xine_list_front(dpb->reference_list); head;
+      head = xine_list_front(dpb->reference_list)) {
+    dpb_unmark_reference_picture(dpb,
+        xine_list_get_value(dpb->reference_list, head));
+  }
+}
+
+void dpb_clear_all_pts(struct dpb *dpb)
+{
+  /* Zero the pts of all queued output images; pictures without an image are skipped. */
+  xine_list_iterator_t ite;
+  for(ite = xine_list_front(dpb->output_list); ite; ite = xine_list_next(dpb->output_list, ite)) {
+    struct decoded_picture *pic = xine_list_get_value(dpb->output_list, ite);
+    if(pic->img != NULL)
+      pic->img->pts = 0;
+  }
+}
+
+int fill_vdpau_reference_list(struct dpb *dpb, VdpReferenceFrameH264 *reflist)
+{
+  /* Fill reflist (assumed to hold 16 entries) from the reference list, newest
+   * picture first, and pad the rest with invalid handles. Returns the number
+   * of entries actually used; references beyond the 16th are ignored so that
+   * reflist cannot be overrun. */
+  struct decoded_picture *pic = NULL;
+  int i = 0;
+  int used_refframes = 0;
+
+  xine_list_iterator_t ite = xine_list_back(dpb->reference_list);
+  while (ite && i < 16) {
+    pic = xine_list_get_value(dpb->reference_list, ite);
+    reflist[i].surface = ((vdpau_accel_t*)pic->img->accel_data)->surface;
+    reflist[i].is_long_term = pic->coded_pic[0]->used_for_long_term_ref ||
+        (pic->coded_pic[1] != NULL && pic->coded_pic[1]->used_for_long_term_ref);
+    reflist[i].frame_idx = pic->coded_pic[0]->used_for_long_term_ref ?
+        pic->coded_pic[0]->long_term_pic_num :
+        pic->coded_pic[0]->slc_nal->slc.frame_num;
+    reflist[i].top_is_reference = pic->top_is_reference;
+    reflist[i].bottom_is_reference = pic->bottom_is_reference;
+    reflist[i].field_order_cnt[0] = pic->coded_pic[0]->top_field_order_cnt;
+    reflist[i].field_order_cnt[1] = pic->coded_pic[1] != NULL ?
+        pic->coded_pic[1]->bottom_field_order_cnt :
+        pic->coded_pic[0]->bottom_field_order_cnt;
+    i++;
+    ite = xine_list_prev(dpb->reference_list, ite);
+  }
+
+  used_refframes = i;
+
+  // fill all other frames with invalid handles
+  while(i < 16) {
+    reflist[i].bottom_is_reference = VDP_FALSE;
+    reflist[i].top_is_reference = VDP_FALSE;
+    reflist[i].frame_idx = 0;
+    reflist[i].is_long_term = VDP_FALSE;
+    reflist[i].surface = VDP_INVALID_HANDLE;
+    reflist[i].field_order_cnt[0] = 0;
+    reflist[i].field_order_cnt[1] = 0;
+    i++;
+  }
+  return used_refframes;
+}
diff --git a/src/video_dec/libvdpau/dpb.h b/src/video_dec/libvdpau/dpb.h
new file mode 100644
index 000000000..356bcbf70
--- /dev/null
+++ b/src/video_dec/libvdpau/dpb.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * dpb.h: Decoded Picture Buffer
+ */
+
+#ifndef DPB_H_
+#define DPB_H_
+
+#define MAX_DPB_COUNT 16
+
+#include "nal.h"
+#include "cpb.h"
+#include <xine/video_out.h>
+#include <xine/list.h>
+
+#define USED_FOR_REF (top_is_reference || bottom_is_reference)
+
+/**
+ * ----------------------------------------------------------------------------
+ * decoded picture
+ * ----------------------------------------------------------------------------
+ */
+
+struct decoded_picture {
+  vo_frame_t *img; /* this is the image we block, to make sure
+                    * the surface is not double-used */
+
+  /**
+   * a decoded picture always holds a whole frame or a
+   * field pair, so it can contain up to 2 coded pics;
+   * coded_pic[1] is NULL unless a second one was added
+   */
+  struct coded_picture *coded_pic[2];
+
+  int32_t frame_num_wrap;
+  /* maintained by decoded_pic_check_reference() */
+  uint8_t top_is_reference;
+  uint8_t bottom_is_reference;
+  /* starts at 1; release_decoded_picture() frees the picture at 0 */
+  uint32_t lock_counter;
+};
+
+struct decoded_picture* init_decoded_picture(struct coded_picture *cpic,
+    vo_frame_t *img);
+void release_decoded_picture(struct decoded_picture *pic);
+void lock_decoded_picture(struct decoded_picture *pic);
+void decoded_pic_check_reference(struct decoded_picture *pic);
+void decoded_pic_add_field(struct decoded_picture *pic,
+    struct coded_picture *cpic);
+
+
+/**
+ * ----------------------------------------------------------------------------
+ * dpb code starting here
+ * ----------------------------------------------------------------------------
+ */
+
+/* Decoded Picture Buffer */
+struct dpb {
+  xine_list_t *reference_list; /* pictures currently marked as reference */
+  xine_list_t *output_list;    /* pictures waiting to be output */
+
+  int max_reorder_frames; /* output_list size at which a picture is output */
+  int max_dpb_frames;     /* dpb_total_frames() value at which one is output */
+};
+
+struct dpb* create_dpb(void);
+void release_dpb(struct dpb *dpb);
+
+/**
+ * calculates the total number of frames in the dpb.
+ * when frames are used for reference and are not drawn
+ * yet, the result is less than reference_list-size +
+ * output_list-size, as such frames are counted only once
+ */
+int dpb_total_frames(struct dpb *dpb);
+
+struct decoded_picture* dpb_get_next_out_picture(struct dpb *dpb, int do_flush);
+
+struct decoded_picture* dpb_get_picture(struct dpb *dpb, uint32_t picnum);
+struct decoded_picture* dpb_get_picture_by_ltpn(struct dpb *dpb, uint32_t longterm_picnum);
+struct decoded_picture* dpb_get_picture_by_ltidx(struct dpb *dpb, uint32_t longterm_idx);
+
+int dpb_set_unused_ref_picture_byltpn(struct dpb *dpb, uint32_t longterm_picnum);
+int dpb_set_unused_ref_picture_bylidx(struct dpb *dpb, uint32_t longterm_idx);
+int dpb_set_unused_ref_picture_lidx_gt(struct dpb *dpb, int32_t longterm_idx);
+
+int dpb_unmark_picture_delayed(struct dpb *dpb, struct decoded_picture *pic);
+int dpb_unmark_reference_picture(struct dpb *dpb, struct decoded_picture *pic);
+
+int dpb_add_picture(struct dpb *dpb, struct decoded_picture *pic, uint32_t num_ref_frames);
+int dpb_flush(struct dpb *dpb);
+void dpb_free_all(struct dpb *dpb);
+void dpb_clear_all_pts(struct dpb *dpb);
+
+int fill_vdpau_reference_list(struct dpb *dpb, VdpReferenceFrameH264 *reflist);
+
+int dp_top_field_first(struct decoded_picture *decoded_pic);
+
+#endif /* DPB_H_ */
diff --git a/src/video_dec/libvdpau/h264_parser.c b/src/video_dec/libvdpau/h264_parser.c
new file mode 100644
index 000000000..d495bf483
--- /dev/null
+++ b/src/video_dec/libvdpau/h264_parser.c
@@ -0,0 +1,2038 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * h264_parser.c: Almost full-featured H.264 NAL parser
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "h264_parser.h"
+#include "nal.h"
+#include "cpb.h"
+
+/* default scaling_lists according to Table 7-2; NOTE(review): presumably in zig-zag scan order - confirm */
+uint8_t default_4x4_intra[16] = { 6, 13, 13, 20, 20, 20, 28, 28, 28, 28, 32,
+    32, 32, 37, 37, 42 };
+
+uint8_t default_4x4_inter[16] = { 10, 14, 14, 20, 20, 20, 24, 24, 24, 24, 27,
+    27, 27, 30, 30, 34 };
+
+uint8_t default_8x8_intra[64] = { 6, 10, 10, 13, 11, 13, 16, 16, 16, 16, 18,
+    18, 18, 18, 18, 23, 23, 23, 23, 23, 23, 25, 25, 25, 25, 25, 25, 25, 27, 27,
+    27, 27, 27, 27, 27, 27, 29, 29, 29, 29, 29, 29, 29, 31, 31, 31, 31, 31, 31,
+    33, 33, 33, 33, 33, 36, 36, 36, 36, 38, 38, 38, 40, 40, 42 };
+
+uint8_t default_8x8_inter[64] = { 9, 13, 13, 15, 13, 15, 17, 17, 17, 17, 19,
+    19, 19, 19, 19, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 24, 24,
+    24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 27, 27, 27, 27, 27, 27,
+    28, 28, 28, 28, 28, 30, 30, 30, 30, 32, 32, 32, 33, 33, 35 };
+
+struct buf_reader
+{
+  uint8_t *buf;     /* start of the data */
+  uint8_t *cur_pos; /* byte currently being read */
+  int len;          /* size of buf in bytes */
+  int cur_offset;   /* bits still unread in *cur_pos; 8 = whole byte left */
+};
+
+struct h264_parser* init_parser();
+
+static inline uint32_t read_bits(struct buf_reader *buf, int len);
+uint32_t read_exp_golomb(struct buf_reader *buf);
+int32_t read_exp_golomb_s(struct buf_reader *buf);
+
+void calculate_pic_order(struct h264_parser *parser, struct coded_picture *pic,
+    struct slice_header *slc);
+void skip_scaling_list(struct buf_reader *buf, int size);
+void parse_scaling_list(struct buf_reader *buf, uint8_t *scaling_list,
+    int length, int index);
+
+struct nal_unit* parse_nal_header(struct buf_reader *buf,
+    struct coded_picture *pic, struct h264_parser *parser);
+static void sps_scaling_list_fallback(struct seq_parameter_set_rbsp *sps,
+    int i);
+static void pps_scaling_list_fallback(struct seq_parameter_set_rbsp *sps,
+    struct pic_parameter_set_rbsp *pps, int i);
+
+uint8_t parse_sps(struct buf_reader *buf, struct seq_parameter_set_rbsp *sps);
+void interpret_sps(struct coded_picture *pic, struct h264_parser *parser);
+
+void parse_vui_parameters(struct buf_reader *buf,
+    struct seq_parameter_set_rbsp *sps);
+void parse_hrd_parameters(struct buf_reader *buf, struct hrd_parameters *hrd);
+
+uint8_t parse_pps(struct buf_reader *buf, struct pic_parameter_set_rbsp *pps);
+void interpret_pps(struct coded_picture *pic);
+
+void parse_sei(struct buf_reader *buf, struct sei_message *sei,
+    struct h264_parser *parser);
+void interpret_sei(struct coded_picture *pic);
+
+uint8_t parse_slice_header(struct buf_reader *buf, struct nal_unit *slc_nal,
+    struct h264_parser *parser);
+void interpret_slice_header(struct h264_parser *parser, struct nal_unit *slc_nal);
+
+void parse_ref_pic_list_reordering(struct buf_reader *buf,
+    struct slice_header *slc);
+
+void calculate_pic_nums(struct h264_parser *parser, struct coded_picture *cpic);
+void execute_ref_pic_marking(struct coded_picture *cpic,
+    uint32_t memory_management_control_operation,
+    uint32_t marking_nr,
+    struct h264_parser *parser);
+void parse_pred_weight_table(struct buf_reader *buf, struct slice_header *slc,
+    struct h264_parser *parser);
+void parse_dec_ref_pic_marking(struct buf_reader *buf,
+    struct nal_unit *slc_nal);
+
+/* here goes the parser implementation */
+
+static void decode_nal(uint8_t **ret, int *len_ret, uint8_t *buf, int buf_len)
+{
+  /* Copy buf to a malloc'ed *ret (handed to the caller), dropping the 0x03 of
+   * each 00 00 03; *len_ret is the new length, 0 with *ret NULL if malloc fails. */
+  // TODO: rework without copying
+  uint8_t *end = &buf[buf_len];
+  uint8_t *pos = malloc(buf_len);
+  *ret = pos;
+  *len_ret = 0;
+  if (pos == NULL)
+    return;
+  while (buf < end) {
+    /* "end - buf > 3" never forms a pointer before buf, unlike "end - 3" */
+    if (end - buf > 3 && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0x03) {
+      *pos++ = 0x00;
+      *pos++ = 0x00;
+      buf += 3;
+    } else
+      *pos++ = *buf++;
+  }
+  *len_ret = pos - *ret;
+}
+
+#if 0
+static inline void dump_bits(const char *label, const struct buf_reader *buf, int bits)
+{
+  struct buf_reader lbuf;
+  memcpy(&lbuf, buf, sizeof(struct buf_reader));
+
+  int i;
+  printf("%s: 0b", label);
+  for(i=0; i < bits; i++)
+    printf("%d", read_bits(&lbuf, 1));
+  printf("\n");
+}
+#endif
+
+/**
+ * @return number of bits consumed so far: 8 for every byte before cur_pos
+ *         plus the bits already taken from the byte at cur_pos
+ */
+static inline uint32_t bits_read(struct buf_reader *buf)
+{
+  int consumed = (buf->cur_pos - buf->buf)*8;
+
+  consumed += 8 - buf->cur_offset;
+  return consumed;
+}
+
+/* skips an emulation prevention byte (the 0x03 of 00 00 03) at the read
+ * position; cur_pos is only dereferenced while it lies inside the buffer */
+static inline void skip_emulation_prevention_three_byte(struct buf_reader *buf)
+{
+  const int pos = (int)(buf->cur_pos - buf->buf);
+  if(pos > 2 && pos < buf->len &&
+      buf->cur_pos[-2] == 0x00 && buf->cur_pos[-1] == 0x00 && buf->cur_pos[0] == 0x03) {
+    buf->cur_pos++;
+  }
+}
+
+/*
+ * read len bits (at most 32, the size of the mask table), MSB first
+ * @return right aligned bits; whatever was gathered if the buffer runs out
+ */
+static inline uint32_t read_bits(struct buf_reader *buf, int len)
+{
+  static uint32_t i_mask[33] = { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f,
+      0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff,
+      0x1ffff, 0x3ffff, 0x7ffff, 0xfffff, 0x1fffff, 0x3fffff, 0x7fffff,
+      0xffffff, 0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff, 0x1fffffff,
+      0x3fffffff, 0x7fffffff, 0xffffffff };
+  /* i_mask[n] keeps the n lowest bits */
+  int i_shr;
+  uint32_t bits = 0;
+  /* i_shr >= 0: the request fits into what is left of the current byte */
+  while (len > 0 && (buf->cur_pos - buf->buf) < buf->len) {
+    if ((i_shr = buf->cur_offset - len) >= 0) {
+      bits |= (*buf->cur_pos >> i_shr) & i_mask[len];
+      buf->cur_offset -= len;
+      if (buf->cur_offset == 0) {
+        buf->cur_pos++;
+        buf->cur_offset = 8;
+        /* byte used up: step over an emulation prevention byte, if any */
+        skip_emulation_prevention_three_byte(buf);
+      }
+      return bits;
+    }
+    else { /* take the rest of this byte, then go on with the next one */
+      bits |= (*buf->cur_pos & i_mask[buf->cur_offset]) << -i_shr;
+      len -= buf->cur_offset;
+      buf->cur_pos++;
+      buf->cur_offset = 8;
+      /* step over an emulation prevention byte, if any */
+      skip_emulation_prevention_three_byte(buf);
+    }
+  }
+  return bits;
+}
+
+/* Scans backwards from the last byte of buf for the rbsp stop bit (the last
+ * 1 bit) and returns how many 0 bits follow it; 0 if no 1 bit exists. */
+static inline int rbsp_trailing_bits(uint8_t *buf, int buf_len)
+{
+  int parsed_bits = 0;
+  int i;
+
+  /* buf_len counts the bytes left to inspect, so the scan stops at buf[0]
+   * instead of running in front of the buffer when every byte is zero */
+  for(; buf_len > 0; buf_len--) {
+    uint8_t cur_val = buf[buf_len - 1];
+    for(i = 0; i < 8; i++) {
+      if (cur_val&1)
+        return parsed_bits+i;
+      cur_val>>=1;
+    }
+    parsed_bits += 8;
+  }
+
+  lprintf("rbsp trailing bits could not be found\n");
+  return 0;
+}
+
+uint32_t read_exp_golomb(struct buf_reader *buf)
+{
+  /* ue(v): count leading zero bits (at most 32), then read that many bits */
+  int leading_zero_bits = 0;
+  while (read_bits(buf, 1) == 0 && leading_zero_bits < 32)
+    leading_zero_bits++;
+
+  /* 64-bit shift: shifting an int 1 by 31 or 32 is undefined behaviour */
+  return (uint32_t)((((uint64_t)1) << leading_zero_bits) - 1 +
+      read_bits(buf, leading_zero_bits));
+}
+
+int32_t read_exp_golomb_s(struct buf_reader *buf)
+{
+  /* se(v): odd code numbers k map to +(k+1)/2, even ones to -(k/2) */
+  uint32_t ue = read_exp_golomb(buf);
+  return ue & 0x01 ? (ue + 1) / 2 : -(ue / 2);
+}
+
+
+/**
+ * parses the NAL header data and calls the subsequent
+ * parser methods that handle specific NAL units
+ */
+struct nal_unit* parse_nal_header(struct buf_reader *buf,
+    struct coded_picture *pic, struct h264_parser *parser)
+{
+  /* pic is currently unused. Returns the new nal_unit, or NULL for an empty
+   * buffer or when no nal_unit could be created. */
+  if (buf->len < 1)
+    return NULL;
+
+  struct nal_unit *nal = create_nal_unit();
+  if (nal == NULL)
+    return NULL;
+
+  /* header byte: bits 6-5 are nal_ref_idc, bits 4-0 the nal_unit_type */
+  nal->nal_ref_idc = (buf->buf[0] >> 5) & 0x03;
+  nal->nal_unit_type = buf->buf[0] & 0x1f;
+  /* payload parsing starts right behind the header byte */
+  buf->cur_pos = buf->buf + 1;
+
+  switch (nal->nal_unit_type) {
+    case NAL_SPS:
+      parse_sps(buf, &nal->sps);
+      break;
+    case NAL_PPS:
+      parse_pps(buf, &nal->pps);
+      break;
+    case NAL_SLICE:
+    case NAL_PART_A:
+    case NAL_PART_B:
+    case NAL_PART_C:
+    case NAL_SLICE_IDR:
+      parse_slice_header(buf, nal, parser);
+      break;
+    case NAL_SEI:
+      memset(&(nal->sei), 0x00, sizeof(struct sei_message));
+      parse_sei(buf, &nal->sei, parser);
+      break;
+    default:
+      break;
+  }
+
+  return nal;
+}
+
+/**
+ * Derives the top and bottom field order counts of a coded picture
+ * according to ITU-T Rec. H.264 (11/2007) chapter 8.2.1, p104f.
+ *
+ * The PPS and SPS referenced by the slice are looked up in the parser's
+ * buffers; when either one is missing an error is logged and pic is left
+ * untouched. Only pic_order_cnt_type 0 and 2 are implemented, every other
+ * type is merely logged as unsupported.
+ */
+void calculate_pic_order(struct h264_parser *parser, struct coded_picture *pic,
+    struct slice_header *slc)
+{
+  /* resolve the parameter sets this slice refers to */
+  struct nal_unit *pps_nal =
+      nal_buffer_get_by_pps_id(parser->pps_buffer, slc->pic_parameter_set_id);
+  if (!pps_nal) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: calculate_pic_order: pic_parameter_set_id %d not found in buffers\n",
+        slc->pic_parameter_set_id);
+    return;
+  }
+
+  struct nal_unit *sps_nal =
+      nal_buffer_get_by_sps_id(parser->sps_buffer, pps_nal->pps.seq_parameter_set_id);
+  if (!sps_nal) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: calculate_pic_order: seq_parameter_set_id %d not found in buffers\n",
+        pps_nal->pps.seq_parameter_set_id);
+    return;
+  }
+
+  struct seq_parameter_set_rbsp *sps = &sps_nal->sps;
+
+  switch (sps->pic_order_cnt_type) {
+    case 0: {
+      if (pic->flag_mask & IDR_PIC) {
+        /* an IDR picture restarts the lsb/msb history */
+        parser->prev_pic_order_cnt_lsb = 0;
+        parser->prev_pic_order_cnt_msb = 0;
+
+        // FIXME
+        parser->frame_num_offset = 0;
+      }
+
+      const int lsb_range = 1 << (sps->log2_max_pic_order_cnt_lsb_minus4 + 4);
+      uint32_t poc_msb;
+
+      /* detect a wrap of the lsb counter in either direction */
+      if (slc->pic_order_cnt_lsb < parser->prev_pic_order_cnt_lsb
+          && parser->prev_pic_order_cnt_lsb - slc->pic_order_cnt_lsb
+              >= lsb_range / 2) {
+        poc_msb = parser->prev_pic_order_cnt_msb + lsb_range;
+      } else if (slc->pic_order_cnt_lsb > parser->prev_pic_order_cnt_lsb
+          && parser->prev_pic_order_cnt_lsb - slc->pic_order_cnt_lsb
+              < -lsb_range / 2) {
+        poc_msb = parser->prev_pic_order_cnt_msb - lsb_range;
+      } else {
+        poc_msb = parser->prev_pic_order_cnt_msb;
+      }
+
+      /* frames and top fields carry their own top field order count */
+      if (!(slc->field_pic_flag && slc->bottom_field_flag)) {
+        pic->top_field_order_cnt = poc_msb + slc->pic_order_cnt_lsb;
+        parser->prev_top_field_order_cnt = pic->top_field_order_cnt;
+      }
+
+      /* only reference pictures update the msb history */
+      if (pic->flag_mask & REFERENCE) {
+        parser->prev_pic_order_cnt_msb = poc_msb;
+      }
+
+      pic->bottom_field_order_cnt = 0;
+
+      if (slc->field_pic_flag) {
+        /* TODO: applied to top fields as well; this is not spec compliant, but works... */
+        pic->bottom_field_order_cnt = poc_msb + slc->pic_order_cnt_lsb;
+      } else {
+        pic->bottom_field_order_cnt =
+            pic->top_field_order_cnt + slc->delta_pic_order_cnt_bottom;
+      }
+
+      /* a bottom field takes over the top field order count seen last */
+      if (slc->field_pic_flag && slc->bottom_field_flag) {
+        pic->top_field_order_cnt = parser->prev_top_field_order_cnt;
+      }
+      break;
+    }
+
+    case 2: {
+      uint32_t last_frame_num =
+          parser->last_vcl_nal ? parser->last_vcl_nal->slc.frame_num : 0;
+      uint32_t last_frame_num_offset = parser->frame_num_offset;
+      uint32_t poc = 0;
+
+      /* frame_num_offset grows by max_frame_num whenever frame_num wraps */
+      if (parser->pic->flag_mask & IDR_PIC) {
+        parser->frame_num_offset = 0;
+      } else if (last_frame_num > slc->frame_num) {
+        parser->frame_num_offset = last_frame_num_offset + sps->max_frame_num;
+      } else {
+        parser->frame_num_offset = last_frame_num_offset;
+      }
+
+      /* non-reference pictures get the odd value just below the even one */
+      if (parser->pic->flag_mask & IDR_PIC) {
+        poc = 0;
+      } else if (!(parser->pic->flag_mask & REFERENCE)) {
+        poc = 2 * (parser->frame_num_offset + slc->frame_num)-1;
+      } else {
+        poc = 2 * (parser->frame_num_offset + slc->frame_num);
+      }
+
+      if (!slc->field_pic_flag) {
+        pic->top_field_order_cnt = pic->bottom_field_order_cnt = poc;
+      } else if (slc->bottom_field_flag) {
+        pic->bottom_field_order_cnt = poc;
+      } else {
+        pic->top_field_order_cnt = poc;
+      }
+      break;
+    }
+
+    default:
+      xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+          "FIXME: Unsupported poc_type: %d\n", sps->pic_order_cnt_type);
+      break;
+  }
+}
+
+/* Reads `size` values with read_exp_golomb_s() and throws them away. */
+void skip_scaling_list(struct buf_reader *buf, int size)
+{
+  int remaining = size;
+
+  while (remaining > 0) {
+    read_exp_golomb_s(buf);
+    remaining--;
+  }
+}
+
+/**
+ * Reads one scaling list from the bitstream.
+ *
+ * @param buf          bit reader positioned at the scaling list
+ * @param scaling_list destination; entries are stored through the zigzag
+ *                     table (zigzag_8x8 when length is 64, zigzag_4x4
+ *                     otherwise)
+ * @param length       number of coefficients to decode (16 or 64)
+ * @param index        number of the list (0-5: 4x4, 6-7: 8x8); selects the
+ *                     default table when the stream asks for it
+ *
+ * If the very first decoded scale is 0 the stream requests the default
+ * list for `index`, which is then copied instead of decoding further.
+ */
+void parse_scaling_list(struct buf_reader *buf, uint8_t *scaling_list,
+    int length, int index)
+{
+  const uint8_t *zigzag = (length == 64) ? zigzag_8x8 : zigzag_4x4;
+  int last_scale = 8;
+  int next_scale = 8;
+  int use_default_scaling_matrix = 0;
+  int i;
+
+  for (i = 0; i < length; i++) {
+    if (next_scale != 0) {
+      const int32_t delta_scale = read_exp_golomb_s(buf);
+
+      /* (last + delta + 256) % 256, done in unsigned arithmetic so that a
+       * delta outside the legal -128..127 range (broken stream) can neither
+       * overflow nor produce a negative scale */
+      next_scale = (int)(((uint32_t)last_scale + (uint32_t)delta_scale) & 0xffu);
+
+      if (i == 0 && next_scale == 0) {
+        use_default_scaling_matrix = 1;
+        break;
+      }
+    }
+
+    /* once next_scale hits 0 the last value is repeated for the rest */
+    if (next_scale != 0)
+      last_scale = next_scale;
+    scaling_list[zigzag[i]] = (uint8_t)last_scale;
+  }
+
+  if (!use_default_scaling_matrix)
+    return;
+
+  switch (index) {
+    case 0:
+    case 1:
+    case 2:
+      for (i = 0; i < (int)sizeof(default_4x4_intra); i++)
+        scaling_list[zigzag_4x4[i]] = default_4x4_intra[i];
+      break;
+    case 3:
+    case 4:
+    case 5:
+      for (i = 0; i < (int)sizeof(default_4x4_inter); i++)
+        scaling_list[zigzag_4x4[i]] = default_4x4_inter[i];
+      break;
+    case 6:
+      for (i = 0; i < (int)sizeof(default_8x8_intra); i++)
+        scaling_list[zigzag_8x8[i]] = default_8x8_intra[i];
+      break;
+    case 7:
+      for (i = 0; i < (int)sizeof(default_8x8_inter); i++)
+        scaling_list[zigzag_8x8[i]] = default_8x8_inter[i];
+      break;
+    default:
+      /* unknown list number: leave scaling_list as it is */
+      break;
+  }
+}
+
+/*
+ * Fills SPS scaling list i when the stream does not carry it: lists 0, 3, 6
+ * and 7 receive the built-in default tables (stored through the zigzag
+ * tables), lists 1, 2, 4 and 5 are copied from the list right before them.
+ * Any other value of i is ignored.
+ */
+static void sps_scaling_list_fallback(struct seq_parameter_set_rbsp *sps, int i)
+{
+  int k;
+
+  if (i == 1 || i == 2 || i == 4 || i == 5) {
+    memcpy(sps->scaling_lists_4x4[i], sps->scaling_lists_4x4[i-1],
+        sizeof(sps->scaling_lists_4x4[i]));
+    return;
+  }
+
+  if (i == 0) {
+    for (k = 0; k < sizeof(default_4x4_intra); k++)
+      sps->scaling_lists_4x4[i][zigzag_4x4[k]] = default_4x4_intra[k];
+  } else if (i == 3) {
+    for (k = 0; k < sizeof(default_4x4_inter); k++)
+      sps->scaling_lists_4x4[i][zigzag_4x4[k]] = default_4x4_inter[k];
+  } else if (i == 6) {
+    for (k = 0; k < sizeof(default_8x8_intra); k++)
+      sps->scaling_lists_8x8[i - 6][zigzag_8x8[k]] = default_8x8_intra[k];
+  } else if (i == 7) {
+    for (k = 0; k < sizeof(default_8x8_inter); k++)
+      sps->scaling_lists_8x8[i - 6][zigzag_8x8[k]] = default_8x8_inter[k];
+  }
+}
+
+/*
+ * Fills PPS scaling list i when the PPS does not carry it: lists 0, 3, 6
+ * and 7 are taken from the same list of the SPS, lists 1, 2, 4 and 5 from
+ * the PPS list right before them. Any other value of i is ignored.
+ */
+static void pps_scaling_list_fallback(struct seq_parameter_set_rbsp *sps, struct pic_parameter_set_rbsp *pps, int i)
+{
+  if (i < 0 || i > 7)
+    return;
+
+  if (i >= 6) {
+    memcpy(pps->scaling_lists_8x8[i - 6], sps->scaling_lists_8x8[i - 6],
+        sizeof(pps->scaling_lists_8x8[i - 6]));
+    return;
+  }
+
+  if (i == 0 || i == 3) {
+    memcpy(pps->scaling_lists_4x4[i], sps->scaling_lists_4x4[i],
+        sizeof(pps->scaling_lists_4x4[i]));
+  } else {
+    memcpy(pps->scaling_lists_4x4[i], pps->scaling_lists_4x4[i - 1],
+        sizeof(pps->scaling_lists_4x4[i]));
+  }
+}
+
+
+/**
+ * Parses a sequence parameter set from buf into sps.
+ *
+ * All scaling list entries are preset to 16 before the optional scaling
+ * matrix is read. pic_width/pic_height are stored in pixels (16 per coded
+ * unit); the height is doubled when frame_mbs_only_flag is 0 and a height
+ * of 1088 is reported as 1080.
+ *
+ * Always returns 0.
+ */
+uint8_t parse_sps(struct buf_reader *buf, struct seq_parameter_set_rbsp *sps)
+{
+  int has_chroma_info;
+  int idx;
+
+  sps->profile_idc = read_bits(buf, 8);
+  sps->constraint_setN_flag = read_bits(buf, 4);
+  read_bits(buf, 4); /* the other 4 bits of this byte are not kept */
+  sps->level_idc = read_bits(buf, 8);
+
+  sps->seq_parameter_set_id = read_exp_golomb(buf);
+
+  memset(sps->scaling_lists_4x4, 16, sizeof(sps->scaling_lists_4x4));
+  memset(sps->scaling_lists_8x8, 16, sizeof(sps->scaling_lists_8x8));
+
+  /* only these profiles carry chroma format, bit depth and scaling matrix */
+  switch (sps->profile_idc) {
+    case 44:
+    case 83:
+    case 86:
+    case 100:
+    case 110:
+    case 122:
+    case 244:
+      has_chroma_info = 1;
+      break;
+    default:
+      has_chroma_info = 0;
+      break;
+  }
+
+  if (!has_chroma_info) {
+    sps->chroma_format_idc = 1;
+  } else {
+    sps->chroma_format_idc = read_exp_golomb(buf);
+    if (sps->chroma_format_idc == 3) {
+      sps->separate_colour_plane_flag = read_bits(buf, 1);
+    }
+
+    sps->bit_depth_luma_minus8 = read_exp_golomb(buf);
+    sps->bit_depth_chroma_minus8 = read_exp_golomb(buf);
+    sps->qpprime_y_zero_transform_bypass_flag = read_bits(buf, 1);
+    sps->seq_scaling_matrix_present_flag = read_bits(buf, 1);
+
+    if (sps->seq_scaling_matrix_present_flag) {
+      for (idx = 0; idx < 8; idx++) {
+        sps->seq_scaling_list_present_flag[idx] = read_bits(buf, 1);
+
+        if (!sps->seq_scaling_list_present_flag[idx]) {
+          sps_scaling_list_fallback(sps, idx);
+        } else if (idx < 6) {
+          parse_scaling_list(buf, sps->scaling_lists_4x4[idx], 16, idx);
+        } else {
+          parse_scaling_list(buf, sps->scaling_lists_8x8[idx - 6], 64, idx);
+        }
+      }
+    }
+  }
+
+  sps->log2_max_frame_num_minus4 = read_exp_golomb(buf);
+  sps->max_frame_num = 1 << (sps->log2_max_frame_num_minus4 + 4);
+
+  /* picture order count parameters; type 2 has none */
+  sps->pic_order_cnt_type = read_exp_golomb(buf);
+  if (!sps->pic_order_cnt_type) {
+    sps->log2_max_pic_order_cnt_lsb_minus4 = read_exp_golomb(buf);
+  } else if (sps->pic_order_cnt_type == 1) {
+    sps->delta_pic_order_always_zero_flag = read_bits(buf, 1);
+    sps->offset_for_non_ref_pic = read_exp_golomb_s(buf);
+    sps->offset_for_top_to_bottom_field = read_exp_golomb_s(buf);
+    sps->num_ref_frames_in_pic_order_cnt_cycle = read_exp_golomb(buf);
+    for (idx = 0; idx < sps->num_ref_frames_in_pic_order_cnt_cycle; idx++) {
+      sps->offset_for_ref_frame[idx] = read_exp_golomb_s(buf);
+    }
+  }
+
+  sps->num_ref_frames = read_exp_golomb(buf);
+  sps->gaps_in_frame_num_value_allowed_flag = read_bits(buf, 1);
+
+  /* pic_width_in_mbs_minus1 and pic_height_in_map_units_minus1 are turned
+   * into pixel sizes right away instead of being stored */
+  sps->pic_width = 16 * (read_exp_golomb(buf) + 1);
+  sps->pic_height = 16 * (read_exp_golomb(buf) + 1);
+
+  sps->frame_mbs_only_flag = read_bits(buf, 1);
+
+  /* compute the height correctly even for interlaced material */
+  sps->pic_height = (2 - sps->frame_mbs_only_flag) * sps->pic_height;
+  if (sps->pic_height == 1088) {
+    sps->pic_height = 1080;
+  }
+
+  if (!sps->frame_mbs_only_flag) {
+    sps->mb_adaptive_frame_field_flag = read_bits(buf, 1);
+  }
+
+  sps->direct_8x8_inference_flag = read_bits(buf, 1);
+
+  sps->frame_cropping_flag = read_bits(buf, 1);
+  if (sps->frame_cropping_flag) {
+    sps->frame_crop_left_offset = read_exp_golomb(buf);
+    sps->frame_crop_right_offset = read_exp_golomb(buf);
+    sps->frame_crop_top_offset = read_exp_golomb(buf);
+    sps->frame_crop_bottom_offset = read_exp_golomb(buf);
+  }
+
+  sps->vui_parameters_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters_present_flag) {
+    parse_vui_parameters(buf, sps);
+  }
+
+  return 0;
+}
+
+/* Applies the values of the picture's SPS to the parser state and the
+ * picture: updates PIC_STRUCT_PRESENT and CPB_DPB_DELAYS_PRESENT in
+ * parser->flag_mask and, if the picture already has a slice, derives
+ * max_pic_num and curr_pic_num. Logs a warning and does nothing when the
+ * picture has no SPS attached.
+ */
+void interpret_sps(struct coded_picture *pic, struct h264_parser *parser)
+{
+  if (!pic->sps_nal) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "WARNING: Picture contains no seq_parameter_set\n");
+    return;
+  }
+
+  struct seq_parameter_set_rbsp *sps = &pic->sps_nal->sps;
+
+  const int pic_struct_present = sps->vui_parameters_present_flag &&
+      sps->vui_parameters.pic_struct_present_flag;
+  const int hrd_present = sps->vui_parameters_present_flag &&
+      (sps->vui_parameters.nal_hrd_parameters_present_flag ||
+       sps->vui_parameters.vc1_hrd_parameters_present_flag);
+
+  if (pic_struct_present)
+    parser->flag_mask |= PIC_STRUCT_PRESENT;
+  else
+    parser->flag_mask &= ~PIC_STRUCT_PRESENT;
+
+  if (hrd_present)
+    parser->flag_mask |= CPB_DPB_DELAYS_PRESENT;
+  else
+    parser->flag_mask &= ~(CPB_DPB_DELAYS_PRESENT);
+
+  if (!pic->slc_nal)
+    return;
+
+  struct slice_header *slc = &pic->slc_nal->slc;
+  if (slc->field_pic_flag != 0) {
+    /* field picture: twice the numbering range, odd picture number */
+    pic->max_pic_num = 2 * sps->max_frame_num;
+    parser->curr_pic_num = 2 * slc->frame_num + 1;
+  } else {
+    pic->max_pic_num = sps->max_frame_num;
+    parser->curr_pic_num = slc->frame_num;
+  }
+}
+
+/**
+ * Parses the type and size of an SEI message and, for payload type 1
+ * (pic_timing), the payload itself. Other payload types are left unparsed.
+ *
+ * The field lengths of the pic_timing payload are taken from the SPS that
+ * nal_buffer_get_last() returns for parser->sps_buffer; when that buffer is
+ * empty an error is logged and nothing is parsed.
+ *
+ * cpb_removal_delay and dpb_output_delay are variable-length fields whose
+ * sizes are signalled in the HRD parameters (cpb_removal_delay_length_minus1
+ * + 1 and dpb_output_delay_length_minus1 + 1 bits). Reading them with a
+ * fixed width would misalign everything after them, pic_struct included.
+ * Where the SPS carries no HRD parameters the lengths fall back to 24 bits,
+ * the value H.264 Annex E infers for absent length fields.
+ */
+void parse_sei(struct buf_reader *buf, struct sei_message *sei,
+    struct h264_parser *parser)
+{
+  /* NumClockTS for pic_struct 0..8; larger (reserved) values carry none */
+  static const uint8_t num_clock_ts_by_pic_struct[9] =
+      { 1, 1, 1, 2, 2, 3, 3, 2, 3 };
+  uint8_t tmp;
+
+  struct nal_unit *sps_nal =
+      nal_buffer_get_last(parser->sps_buffer);
+
+  if (sps_nal == NULL) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: parse_sei: seq_parameter_set_id not found in buffers\n");
+    return;
+  }
+
+  struct seq_parameter_set_rbsp *sps = &sps_nal->sps;
+
+  /* type and size are coded as a run of 0xff bytes plus one final byte */
+  sei->payload_type = 0;
+  while((tmp = read_bits(buf, 8)) == 0xff) {
+    sei->payload_type += 255;
+  }
+  sei->last_payload_type_byte = tmp;
+  sei->payload_type += sei->last_payload_type_byte;
+
+  sei->payload_size = 0;
+  while((tmp = read_bits(buf, 8)) == 0xff) {
+    sei->payload_size += 255;
+  }
+  sei->last_payload_size_byte = tmp;
+  sei->payload_size += sei->last_payload_size_byte;
+
+  /* only pic_timing is implemented */
+  if (sei->payload_type != 1)
+    return;
+
+  /* field lengths: NAL HRD first, then VCL HRD, else the inferred 24 bits */
+  int cpb_removal_delay_length = 24;
+  int dpb_output_delay_length = 24;
+  int time_offset_length = 24;
+
+  if (sps->vui_parameters_present_flag) {
+    if (sps->vui_parameters.nal_hrd_parameters_present_flag) {
+      cpb_removal_delay_length =
+          sps->vui_parameters.nal_hrd_parameters.cpb_removal_delay_length_minus1 + 1;
+      dpb_output_delay_length =
+          sps->vui_parameters.nal_hrd_parameters.dpb_output_delay_length_minus1 + 1;
+      time_offset_length =
+          sps->vui_parameters.nal_hrd_parameters.time_offset_length;
+    } else if (sps->vui_parameters.vc1_hrd_parameters_present_flag) {
+      cpb_removal_delay_length =
+          sps->vui_parameters.vc1_hrd_parameters.cpb_removal_delay_length_minus1 + 1;
+      dpb_output_delay_length =
+          sps->vui_parameters.vc1_hrd_parameters.dpb_output_delay_length_minus1 + 1;
+      time_offset_length =
+          sps->vui_parameters.vc1_hrd_parameters.time_offset_length;
+    }
+  }
+
+  if (parser->flag_mask & CPB_DPB_DELAYS_PRESENT) {
+    sei->pic_timing.cpb_removal_delay = read_bits(buf, cpb_removal_delay_length);
+    sei->pic_timing.dpb_output_delay = read_bits(buf, dpb_output_delay_length);
+  }
+
+  if (!(parser->flag_mask & PIC_STRUCT_PRESENT))
+    return;
+
+  sei->pic_timing.pic_struct = read_bits(buf, 4);
+
+  uint8_t NumClockTs = 0;
+  if (sei->pic_timing.pic_struct < 9)
+    NumClockTs = num_clock_ts_by_pic_struct[sei->pic_timing.pic_struct];
+
+  int i;
+  for (i = 0; i < NumClockTs; i++) {
+    if (!read_bits(buf, 1)) /* clock_timestamp_flag == 0: nothing follows */
+      continue;
+
+    sei->pic_timing.ct_type = read_bits(buf, 2);
+    sei->pic_timing.nuit_field_based_flag = read_bits(buf, 1);
+    sei->pic_timing.counting_type = read_bits(buf, 5);
+    sei->pic_timing.full_timestamp_flag = read_bits(buf, 1);
+    sei->pic_timing.discontinuity_flag = read_bits(buf, 1);
+    sei->pic_timing.cnt_dropped_flag = read_bits(buf, 1);
+    sei->pic_timing.n_frames = read_bits(buf, 8);
+
+    if (sei->pic_timing.full_timestamp_flag) {
+      sei->pic_timing.seconds_value = read_bits(buf, 6);
+      sei->pic_timing.minutes_value = read_bits(buf, 6);
+      sei->pic_timing.hours_value = read_bits(buf, 5);
+    } else if (read_bits(buf, 1)) { /* seconds_flag */
+      sei->pic_timing.seconds_value = read_bits(buf, 6);
+
+      if (read_bits(buf, 1)) { /* minutes_flag */
+        sei->pic_timing.minutes_value = read_bits(buf, 6);
+
+        if (read_bits(buf, 1)) { /* hours_flag */
+          sei->pic_timing.hours_value = read_bits(buf, 5);
+        }
+      }
+    }
+
+    /* time_offset is only coded when its signalled length is non-zero */
+    if (time_offset_length > 0)
+      sei->pic_timing.time_offset = read_bits(buf, time_offset_length);
+    else
+      sei->pic_timing.time_offset = 0;
+  }
+}
+
+/*
+ * Translates the pic_struct value of the picture's pic_timing SEI into the
+ * INTERLACED bit of pic->flag_mask and into pic->repeat_pic. Does nothing
+ * unless the picture has both an SPS and an SEI attached and the SPS
+ * announces pic_struct in its VUI parameters.
+ */
+void interpret_sei(struct coded_picture *pic)
+{
+  if (!pic->sps_nal || !pic->sei_nal)
+    return;
+
+  struct seq_parameter_set_rbsp *sps = &pic->sps_nal->sps;
+  struct sei_message *sei = &pic->sei_nal->sei;
+
+  if (!sps->vui_parameters_present_flag ||
+      !sps->vui_parameters.pic_struct_present_flag)
+    return;
+
+  int interlaced;
+  int repeat = -1; /* -1: leave pic->repeat_pic as it is */
+
+  switch (sei->pic_timing.pic_struct) {
+    case DISP_FRAME:
+      interlaced = 0;
+      repeat = 0;
+      break;
+    case DISP_TOP:
+    case DISP_BOTTOM:
+    case DISP_TOP_BOTTOM:
+    case DISP_BOTTOM_TOP:
+      interlaced = 1;
+      break;
+    case DISP_TOP_BOTTOM_TOP:
+    case DISP_BOTTOM_TOP_BOTTOM:
+      interlaced = 1;
+      repeat = 1;
+      break;
+    case DISP_FRAME_DOUBLING:
+      interlaced = 0;
+      repeat = 2;
+      break;
+    case DISP_FRAME_TRIPLING:
+      interlaced = 0;
+      repeat = 3;
+      break;
+    default:
+      /* any other value changes nothing */
+      return;
+  }
+
+  if (interlaced)
+    pic->flag_mask |= INTERLACED;
+  else
+    pic->flag_mask &= ~INTERLACED;
+
+  if (repeat >= 0)
+    pic->repeat_pic = repeat;
+}
+
+/*
+ * Parses the VUI parameters that follow an SPS into sps->vui_parameters.
+ * The fields are read strictly in bitstream order; optional groups are only
+ * read when the flag preceding them is set.
+ */
+void parse_vui_parameters(struct buf_reader *buf,
+    struct seq_parameter_set_rbsp *sps)
+{
+  /* --- aspect ratio --- */
+  sps->vui_parameters.aspect_ration_info_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.aspect_ration_info_present_flag == 1) {
+    sps->vui_parameters.aspect_ratio_idc = read_bits(buf, 8);
+
+    /* an explicit sample aspect ratio follows only for this idc */
+    if (sps->vui_parameters.aspect_ratio_idc == ASPECT_EXTENDED_SAR) {
+      sps->vui_parameters.sar_width = read_bits(buf, 16);
+      sps->vui_parameters.sar_height = read_bits(buf, 16);
+    }
+  }
+
+  /* --- overscan --- */
+  sps->vui_parameters.overscan_info_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.overscan_info_present_flag)
+    sps->vui_parameters.overscan_appropriate_flag = read_bits(buf, 1);
+
+  /* --- video signal type --- */
+  sps->vui_parameters.video_signal_type_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.video_signal_type_present_flag) {
+    sps->vui_parameters.video_format = read_bits(buf, 3);
+    sps->vui_parameters.video_full_range_flag = read_bits(buf, 1);
+
+    sps->vui_parameters.colour_description_present = read_bits(buf, 1);
+    if (sps->vui_parameters.colour_description_present) {
+      sps->vui_parameters.colour_primaries = read_bits(buf, 8);
+      sps->vui_parameters.transfer_characteristics = read_bits(buf, 8);
+      sps->vui_parameters.matrix_coefficients = read_bits(buf, 8);
+    }
+  }
+
+  /* --- chroma sample location --- */
+  sps->vui_parameters.chroma_loc_info_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.chroma_loc_info_present_flag) {
+    sps->vui_parameters.chroma_sample_loc_type_top_field =
+        read_exp_golomb(buf);
+    sps->vui_parameters.chroma_sample_loc_type_bottom_field =
+        read_exp_golomb(buf);
+  }
+
+  /* --- timing info --- */
+  sps->vui_parameters.timing_info_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.timing_info_present_flag) {
+    const uint32_t units_in_tick = read_bits(buf, 32);
+    const uint32_t scale = read_bits(buf, 32);
+
+    sps->vui_parameters.num_units_in_tick = units_in_tick;
+    sps->vui_parameters.time_scale = scale;
+    sps->vui_parameters.fixed_frame_rate_flag = read_bits(buf, 1);
+  }
+
+  /* --- HRD parameters, first set then second set --- */
+  sps->vui_parameters.nal_hrd_parameters_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.nal_hrd_parameters_present_flag)
+    parse_hrd_parameters(buf, &sps->vui_parameters.nal_hrd_parameters);
+
+  sps->vui_parameters.vc1_hrd_parameters_present_flag = read_bits(buf, 1);
+  if (sps->vui_parameters.vc1_hrd_parameters_present_flag)
+    parse_hrd_parameters(buf, &sps->vui_parameters.vc1_hrd_parameters);
+
+  /* low_delay_hrd_flag is coded when at least one HRD set was present */
+  if (sps->vui_parameters.nal_hrd_parameters_present_flag
+      || sps->vui_parameters.vc1_hrd_parameters_present_flag)
+    sps->vui_parameters.low_delay_hrd_flag = read_bits(buf, 1);
+
+  sps->vui_parameters.pic_struct_present_flag = read_bits(buf, 1);
+
+  /* --- bitstream restrictions --- */
+  sps->vui_parameters.bitstream_restriction_flag = read_bits(buf, 1);
+  if (!sps->vui_parameters.bitstream_restriction_flag)
+    return;
+
+  sps->vui_parameters.motion_vectors_over_pic_boundaries = read_bits(buf, 1);
+  sps->vui_parameters.max_bytes_per_pic_denom = read_exp_golomb(buf);
+  sps->vui_parameters.max_bits_per_mb_denom = read_exp_golomb(buf);
+  sps->vui_parameters.log2_max_mv_length_horizontal = read_exp_golomb(buf);
+  sps->vui_parameters.log2_max_mv_length_vertical = read_exp_golomb(buf);
+  sps->vui_parameters.num_reorder_frames = read_exp_golomb(buf);
+  sps->vui_parameters.max_dec_frame_buffering = read_exp_golomb(buf);
+}
+
+/**
+ * Parses one set of HRD parameters from buf into hrd.
+ *
+ * cpb_cnt_minus1 comes straight from the bitstream and drives the number of
+ * per-CPB entries written. H.264 limits it to 0..31, so at most 32 entries
+ * are parsed; a larger value can only come from a damaged stream and must
+ * not be allowed to index past the per-CPB arrays. hrd->cpb_cnt_minus1
+ * itself is stored as read.
+ */
+void parse_hrd_parameters(struct buf_reader *buf, struct hrd_parameters *hrd)
+{
+  enum { HRD_MAX_CPB_CNT = 32 };
+  uint32_t cpb_count;
+  uint32_t i;
+
+  hrd->cpb_cnt_minus1 = read_exp_golomb(buf);
+  hrd->bit_rate_scale = read_bits(buf, 4);
+  hrd->cpb_size_scale = read_bits(buf, 4);
+
+  /* number of entries to parse, capped at the spec maximum */
+  cpb_count = HRD_MAX_CPB_CNT;
+  if (hrd->cpb_cnt_minus1 < HRD_MAX_CPB_CNT)
+    cpb_count = (uint32_t)hrd->cpb_cnt_minus1 + 1;
+
+  for (i = 0; i < cpb_count; i++) {
+    hrd->bit_rate_value_minus1[i] = read_exp_golomb(buf);
+    hrd->cpb_size_value_minus1[i] = read_exp_golomb(buf);
+    hrd->cbr_flag[i] = read_bits(buf, 1);
+  }
+
+  hrd->initial_cpb_removal_delay_length_minus1 = read_bits(buf, 5);
+  hrd->cpb_removal_delay_length_minus1 = read_bits(buf, 5);
+  hrd->dpb_output_delay_length_minus1 = read_bits(buf, 5);
+  hrd->time_offset_length = read_bits(buf, 5);
+}
+
+/* Ceil(Log2(value)): the smallest b with 2^b >= value (0 for value <= 1). */
+static int pps_ceil_log2(uint64_t value)
+{
+  int bits = 0;
+
+  while (((uint64_t)1 << bits) < value)
+    bits++;
+
+  return bits;
+}
+
+/**
+ * Parses a picture parameter set from buf into pps.
+ *
+ * Slice group handling:
+ *  - map type 0: up to 64 run lengths are stored, further ones are read and
+ *    dropped;
+ *  - map type 2: the top_left/bottom_right pairs are read and dropped so
+ *    that the fields behind them are parsed from the right position;
+ *  - map types 3-5: change direction and rate are stored;
+ *  - map type 6: one slice_group_id of Ceil(Log2(num_slice_groups_minus1+1))
+ *    bits is coded per map unit (pic_size_in_map_units_minus1 + 1 of them);
+ *    as many ids as fit into pps->slice_group_id are stored, the rest is
+ *    read and dropped.
+ *
+ * All scaling list entries are preset to 16; the optional trailing part
+ * (transform_8x8_mode_flag, scaling matrix, second chroma QP offset) is
+ * only read when more than one bit is left before the RBSP trailing bits.
+ *
+ * Always returns 0.
+ */
+uint8_t parse_pps(struct buf_reader *buf, struct pic_parameter_set_rbsp *pps)
+{
+  pps->pic_parameter_set_id = read_exp_golomb(buf);
+  pps->seq_parameter_set_id = read_exp_golomb(buf);
+  pps->entropy_coding_mode_flag = read_bits(buf, 1);
+  pps->pic_order_present_flag = read_bits(buf, 1);
+
+  pps->num_slice_groups_minus1 = read_exp_golomb(buf);
+  if (pps->num_slice_groups_minus1 > 0) {
+    pps->slice_group_map_type = read_exp_golomb(buf);
+    if (pps->slice_group_map_type == 0) {
+      int i_group;
+      for (i_group = 0; i_group <= pps->num_slice_groups_minus1; i_group++) {
+        if (i_group < 64)
+          pps->run_length_minus1[i_group] = read_exp_golomb(buf);
+        else { // FIXME: skips if more than 64 groups exist
+          lprintf("Error: Only 64 slice_groups are supported\n");
+          read_exp_golomb(buf);
+        }
+      }
+    }
+    else if (pps->slice_group_map_type == 2) {
+      /* top_left[i] / bottom_right[i]: not used here, but they have to be
+       * consumed to keep the reader aligned */
+      int i_group;
+      for (i_group = 0; i_group < pps->num_slice_groups_minus1; i_group++) {
+        read_exp_golomb(buf);
+        read_exp_golomb(buf);
+      }
+    }
+    else if (pps->slice_group_map_type == 3 || pps->slice_group_map_type == 4
+        || pps->slice_group_map_type == 5) {
+      pps->slice_group_change_direction_flag = read_bits(buf, 1);
+      pps->slice_group_change_rate_minus1 = read_exp_golomb(buf);
+    }
+    else if (pps->slice_group_map_type == 6) {
+      pps->pic_size_in_map_units_minus1 = read_exp_golomb(buf);
+
+      const int id_bits =
+          pps_ceil_log2((uint64_t)pps->num_slice_groups_minus1 + 1);
+      const uint64_t id_capacity =
+          sizeof(pps->slice_group_id) / sizeof(pps->slice_group_id[0]);
+      const uint64_t map_units =
+          (uint64_t)pps->pic_size_in_map_units_minus1 + 1;
+      const int total_bits = buf->len * 8;
+      uint64_t unit;
+
+      /* the bits_read() check stops a bogus map unit count from spinning
+       * long after the NAL unit has been used up */
+      for (unit = 0; unit < map_units && bits_read(buf) < total_bits; unit++) {
+        const uint32_t group_id = read_bits(buf, id_bits);
+        if (unit < id_capacity)
+          pps->slice_group_id[unit] = group_id;
+      }
+    }
+  }
+
+  pps->num_ref_idx_l0_active_minus1 = read_exp_golomb(buf);
+  pps->num_ref_idx_l1_active_minus1 = read_exp_golomb(buf);
+  pps->weighted_pred_flag = read_bits(buf, 1);
+  pps->weighted_bipred_idc = read_bits(buf, 2);
+  pps->pic_init_qp_minus26 = read_exp_golomb_s(buf);
+  pps->pic_init_qs_minus26 = read_exp_golomb_s(buf);
+  pps->chroma_qp_index_offset = read_exp_golomb_s(buf);
+  pps->deblocking_filter_control_present_flag = read_bits(buf, 1);
+  pps->constrained_intra_pred_flag = read_bits(buf, 1);
+  pps->redundant_pic_cnt_present_flag = read_bits(buf, 1);
+
+  /* payload bits of the NAL unit versus bits consumed so far */
+  int bit_length = (buf->len*8)-rbsp_trailing_bits(buf->buf, buf->len);
+  int bit_read = bits_read(buf);
+
+  memset(pps->scaling_lists_4x4, 16, sizeof(pps->scaling_lists_4x4));
+  memset(pps->scaling_lists_8x8, 16, sizeof(pps->scaling_lists_8x8));
+  if (bit_length-bit_read > 1) {
+    pps->transform_8x8_mode_flag = read_bits(buf, 1);
+    pps->pic_scaling_matrix_present_flag = read_bits(buf, 1);
+    if (pps->pic_scaling_matrix_present_flag) {
+      int i;
+      for (i = 0; i < 8; i++) {
+        /* flags of the 8x8 lists are only coded with transform_8x8_mode */
+        if(i < 6 || pps->transform_8x8_mode_flag)
+          pps->pic_scaling_list_present_flag[i] = read_bits(buf, 1);
+        else
+          pps->pic_scaling_list_present_flag[i] = 0;
+
+        if (pps->pic_scaling_list_present_flag[i]) {
+          if (i < 6)
+            parse_scaling_list(buf, pps->scaling_lists_4x4[i], 16, i);
+          else
+            parse_scaling_list(buf, pps->scaling_lists_8x8[i - 6], 64, i);
+        }
+      }
+    }
+
+    pps->second_chroma_qp_index_offset = read_exp_golomb_s(buf);
+  } else
+    pps->second_chroma_qp_index_offset = pps->chroma_qp_index_offset;
+
+  return 0;
+}
+
+/*
+ * Writes the built-in default table into PPS scaling list i (0: 4x4 intra,
+ * 3: 4x4 inter, 6: 8x8 intra, 7: 8x8 inter), stored through the zigzag
+ * tables. Other values of i are ignored.
+ */
+static void pps_default_scaling_list(struct pic_parameter_set_rbsp *pps, int i)
+{
+  int j;
+
+  switch (i) {
+    case 0:
+      for (j = 0; j < (int)sizeof(default_4x4_intra); j++)
+        pps->scaling_lists_4x4[0][zigzag_4x4[j]] = default_4x4_intra[j];
+      break;
+    case 3:
+      for (j = 0; j < (int)sizeof(default_4x4_inter); j++)
+        pps->scaling_lists_4x4[3][zigzag_4x4[j]] = default_4x4_inter[j];
+      break;
+    case 6:
+      for (j = 0; j < (int)sizeof(default_8x8_intra); j++)
+        pps->scaling_lists_8x8[0][zigzag_8x8[j]] = default_8x8_intra[j];
+      break;
+    case 7:
+      for (j = 0; j < (int)sizeof(default_8x8_inter); j++)
+        pps->scaling_lists_8x8[1][zigzag_8x8[j]] = default_8x8_inter[j];
+      break;
+    default:
+      break;
+  }
+}
+
+/**
+ * Completes the scaling lists of the picture's PPS.
+ *
+ * Without a scaling matrix in the PPS all lists are copied from the SPS.
+ * With one, every list the PPS does not carry itself is filled in:
+ *  - lists 1, 2, 4, 5 from the PPS list right before them;
+ *  - lists 0, 3, 6, 7 from the SPS when the SPS has a scaling matrix
+ *    (H.264 fall-back rule set B), otherwise from the default tables
+ *    (rule set A).
+ *
+ * Logs a warning and does nothing when the picture lacks an SPS or a PPS.
+ */
+void interpret_pps(struct coded_picture *pic)
+{
+  if(pic->sps_nal == NULL) {
+    lprintf("WARNING: Picture contains no seq_parameter_set\n");
+    return;
+  } else if(pic->pps_nal == NULL) {
+    lprintf("WARNING: Picture contains no pic_parameter_set\n");
+    return;
+  }
+
+  struct seq_parameter_set_rbsp *sps = &pic->sps_nal->sps;
+  struct pic_parameter_set_rbsp *pps = &pic->pps_nal->pps;
+
+  if (!pps->pic_scaling_matrix_present_flag) {
+    memcpy(pps->scaling_lists_4x4, sps->scaling_lists_4x4,
+        sizeof(pps->scaling_lists_4x4));
+    memcpy(pps->scaling_lists_8x8, sps->scaling_lists_8x8,
+        sizeof(pps->scaling_lists_8x8));
+    return;
+  }
+
+  int i;
+  for (i = 0; i < 8; i++) {
+    if (pps->pic_scaling_list_present_flag[i])
+      continue;
+
+    /* heads of each list group have nothing in the PPS to inherit from */
+    const int is_group_head = (i == 0 || i == 3 || i >= 6);
+
+    if (is_group_head && !sps->seq_scaling_matrix_present_flag)
+      pps_default_scaling_list(pps, i);
+    else
+      pps_scaling_list_fallback(sps, pps, i);
+  }
+}
+
+/**
+ * Parses a slice header from buf into slc_nal->slc.
+ *
+ * The PPS named by the slice and the SPS named by that PPS are looked up in
+ * the parser's buffers, since several header fields depend on them. If one
+ * of them is missing an error is logged and -1 is returned (which the
+ * uint8_t return type turns into 255); otherwise the result is 0.
+ */
+uint8_t parse_slice_header(struct buf_reader *buf, struct nal_unit *slc_nal,
+    struct h264_parser *parser)
+{
+  struct slice_header *slc = &slc_nal->slc;
+
+  slc->first_mb_in_slice = read_exp_golomb(buf);
+  /* the raw value is run through slice_type(), because the list is doubled */
+  slc->slice_type = slice_type(read_exp_golomb(buf));
+  slc->pic_parameter_set_id = read_exp_golomb(buf);
+
+  /* fetch the parameter sets this slice depends on */
+  struct nal_unit *pps_nal =
+      nal_buffer_get_by_pps_id(parser->pps_buffer, slc->pic_parameter_set_id);
+  if (!pps_nal) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: parse_slice_header: pic_parameter_set_id %d not found in buffers\n",
+        slc->pic_parameter_set_id);
+    return -1;
+  }
+  struct pic_parameter_set_rbsp *pps = &pps_nal->pps;
+
+  struct nal_unit *sps_nal =
+      nal_buffer_get_by_sps_id(parser->sps_buffer, pps->seq_parameter_set_id);
+  if (!sps_nal) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: parse_slice_header: seq_parameter_set_id %d not found in buffers\n",
+        pps->seq_parameter_set_id);
+    return -1;
+  }
+  struct seq_parameter_set_rbsp *sps = &sps_nal->sps;
+
+  const int is_b_slice = (slc->slice_type == SLICE_B);
+  const int is_p_or_sp_slice =
+      (slc->slice_type == SLICE_P || slc->slice_type == SLICE_SP);
+
+  if (sps->separate_colour_plane_flag) {
+    slc->colour_plane_id = read_bits(buf, 2);
+  }
+
+  slc->frame_num = read_bits(buf, sps->log2_max_frame_num_minus4 + 4);
+
+  /* field flags are only coded when the sequence may contain fields */
+  slc->field_pic_flag = 0;
+  slc->bottom_field_flag = 0;
+  if (!sps->frame_mbs_only_flag) {
+    slc->field_pic_flag = read_bits(buf, 1);
+    if (slc->field_pic_flag) {
+      slc->bottom_field_flag = read_bits(buf, 1);
+    }
+  }
+
+  if (slc_nal->nal_unit_type == NAL_SLICE_IDR) {
+    slc->idr_pic_id = read_exp_golomb(buf);
+  }
+
+  /* picture order count fields for poc type 0 ... */
+  if (!sps->pic_order_cnt_type) {
+    slc->pic_order_cnt_lsb = read_bits(buf,
+        sps->log2_max_pic_order_cnt_lsb_minus4 + 4);
+    if (pps->pic_order_present_flag && !slc->field_pic_flag) {
+      slc->delta_pic_order_cnt_bottom = read_exp_golomb_s(buf);
+    }
+  }
+
+  /* ... and for poc type 1 */
+  if (sps->pic_order_cnt_type == 1 && !sps->delta_pic_order_always_zero_flag) {
+    slc->delta_pic_order_cnt[0] = read_exp_golomb_s(buf);
+    if (pps->pic_order_present_flag && !slc->field_pic_flag) {
+      slc->delta_pic_order_cnt[1] = read_exp_golomb_s(buf);
+    }
+  }
+
+  if (pps->redundant_pic_cnt_present_flag == 1) {
+    slc->redundant_pic_cnt = read_exp_golomb(buf);
+  }
+
+  if (is_b_slice) {
+    slc->direct_spatial_mv_pred_flag = read_bits(buf, 1);
+  }
+
+  /* start from the PPS values, the slice may override them below */
+  slc->num_ref_idx_l0_active_minus1 = pps->num_ref_idx_l0_active_minus1;
+  slc->num_ref_idx_l1_active_minus1 = pps->num_ref_idx_l1_active_minus1;
+
+  if (is_p_or_sp_slice || is_b_slice) {
+    slc->num_ref_idx_active_override_flag = read_bits(buf, 1);
+
+    if (slc->num_ref_idx_active_override_flag == 1) {
+      slc->num_ref_idx_l0_active_minus1 = read_exp_golomb(buf);
+
+      if (is_b_slice) {
+        slc->num_ref_idx_l1_active_minus1 = read_exp_golomb(buf);
+      }
+    }
+  }
+
+  /* --- ref_pic_list_reordering --- */
+  parse_ref_pic_list_reordering(buf, slc);
+
+  /* --- pred_weight_table --- */
+  if ((pps->weighted_pred_flag && is_p_or_sp_slice)
+      || (pps->weighted_bipred_idc == 1 && is_b_slice)) {
+    parse_pred_weight_table(buf, slc, parser);
+  }
+
+  /* --- dec_ref_pic_marking, reference slices only --- */
+  if (slc_nal->nal_ref_idc == 0) {
+    slc->dec_ref_pic_marking_count = 0;
+  } else {
+    parse_dec_ref_pic_marking(buf, slc_nal);
+  }
+
+  return 0;
+}
+
+/**
+ * Attaches the PPS named by the slice, and the SPS named by that PPS, to the
+ * parser's current picture, replacing whatever was attached before.
+ *
+ * If either parameter set cannot be found in the parser's buffers an error
+ * is logged and the picture is left unchanged.
+ */
+void interpret_slice_header(struct h264_parser *parser, struct nal_unit *slc_nal)
+{
+  struct coded_picture *pic = parser->pic;
+  struct slice_header *slc = &slc_nal->slc;
+
+  /* retrieve sps and pps from the buffers */
+  struct nal_unit *pps_nal =
+      nal_buffer_get_by_pps_id(parser->pps_buffer, slc->pic_parameter_set_id);
+
+  if (pps_nal == NULL) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: interpret_slice_header: pic_parameter_set_id %d not found in buffers\n",
+        slc->pic_parameter_set_id);
+    return;
+  }
+
+  struct nal_unit *sps_nal =
+      nal_buffer_get_by_sps_id(parser->sps_buffer, pps_nal->pps.seq_parameter_set_id);
+
+  if (sps_nal == NULL) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: interpret_slice_header: seq_parameter_set_id %d not found in buffers\n",
+        pps_nal->pps.seq_parameter_set_id);
+    return;
+  }
+
+  /* Take the new locks before dropping the old ones: the picture usually
+   * already holds the very same units, and releasing first could drop the
+   * last lock on a unit that is locked again right afterwards.
+   * NOTE(review): this matters if release_nal_unit() frees a unit when its
+   * last lock goes away - confirm against its implementation. */
+  lock_nal_unit(sps_nal);
+  lock_nal_unit(pps_nal);
+
+  if (pic->sps_nal) {
+    release_nal_unit(pic->sps_nal);
+  }
+  if (pic->pps_nal) {
+    release_nal_unit(pic->pps_nal);
+  }
+
+  pic->sps_nal = sps_nal;
+  pic->pps_nal = pps_nal;
+}
+
+/*
+ * Reads reordering commands for one reference list until the end marker.
+ * Commands 0 and 1 carry abs_diff_pic_num_minus1, command 2 carries
+ * long_term_pic_num; each command overwrites the previous one in
+ * slc->ref_pic_list_reordering. The loop ends on command 3 and also on any
+ * larger value, which H.264 does not define for this parser's slice types -
+ * a damaged stream must not keep the loop going until a 3 happens to
+ * turn up.
+ */
+static void parse_reordering_commands(struct buf_reader *buf,
+    struct slice_header *slc)
+{
+  for (;;) {
+    slc->ref_pic_list_reordering.reordering_of_pic_nums_idc
+        = read_exp_golomb(buf);
+
+    if (slc->ref_pic_list_reordering.reordering_of_pic_nums_idc == 0
+        || slc->ref_pic_list_reordering.reordering_of_pic_nums_idc == 1) {
+      slc->ref_pic_list_reordering.abs_diff_pic_num_minus1
+          = read_exp_golomb(buf);
+    }
+    else if (slc->ref_pic_list_reordering.reordering_of_pic_nums_idc == 2) {
+      slc->ref_pic_list_reordering.long_term_pic_num = read_exp_golomb(buf);
+    }
+    else {
+      /* 3: regular end of the command list; > 3: invalid, give up */
+      break;
+    }
+  }
+}
+
+/**
+ * Parses the ref_pic_list_reordering part of a slice header into
+ * slc->ref_pic_list_reordering.
+ *
+ * List 0 commands are coded for every slice type except I and SI, list 1
+ * commands for B slices only. Only the flags and the command read last are
+ * kept in slc.
+ */
+void parse_ref_pic_list_reordering(struct buf_reader *buf, struct slice_header *slc)
+{
+  if (slc->slice_type != SLICE_I && slc->slice_type != SLICE_SI) {
+    slc->ref_pic_list_reordering.ref_pic_list_reordering_flag_l0 = read_bits(
+        buf, 1);
+
+    if (slc->ref_pic_list_reordering.ref_pic_list_reordering_flag_l0 == 1)
+      parse_reordering_commands(buf, slc);
+  }
+
+  if (slc->slice_type == SLICE_B) {
+    slc->ref_pic_list_reordering.ref_pic_list_reordering_flag_l1 = read_bits(
+        buf, 1);
+
+    if (slc->ref_pic_list_reordering.ref_pic_list_reordering_flag_l1 == 1)
+      parse_reordering_commands(buf, slc);
+  }
+}
+
+/**
+ * Parses the pred_weight_table part of a slice header into
+ * slc->pred_weight_table.
+ *
+ * The SPS is needed to decide whether chroma weights are coded; it is found
+ * through the PPS named by the slice. If either parameter set is missing
+ * an error is logged and nothing is parsed.
+ *
+ * The number of entries per list comes from the bitstream
+ * (num_ref_idx_lX_active_minus1). H.264 limits it to 0..31, so at most 32
+ * entries per list are parsed; a larger count can only come from a damaged
+ * stream and must not index past the weight tables.
+ *
+ * Weights and offsets whose flag is 0 are left untouched.
+ */
+void parse_pred_weight_table(struct buf_reader *buf, struct slice_header *slc,
+    struct h264_parser *parser)
+{
+  enum { PWT_MAX_REF_IDX = 31 };
+
+  /* retrieve sps and pps from the buffers */
+  struct nal_unit *pps_nal =
+      nal_buffer_get_by_pps_id(parser->pps_buffer, slc->pic_parameter_set_id);
+
+  if (pps_nal == NULL) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: parse_pred_weight_table: pic_parameter_set_id %d not found in buffers\n",
+        slc->pic_parameter_set_id);
+    return;
+  }
+
+  struct nal_unit *sps_nal =
+      nal_buffer_get_by_sps_id(parser->sps_buffer, pps_nal->pps.seq_parameter_set_id);
+
+  if (sps_nal == NULL) {
+    xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+        "ERR: parse_pred_weight_table: seq_parameter_set_id %d not found in buffers\n",
+        pps_nal->pps.seq_parameter_set_id);
+    return;
+  }
+
+  struct seq_parameter_set_rbsp *sps = &sps_nal->sps;
+
+  slc->pred_weight_table.luma_log2_weight_denom = read_exp_golomb(buf);
+
+  /* with separately coded colour planes there is no chroma to weight */
+  uint32_t ChromaArrayType = sps->chroma_format_idc;
+  if(sps->separate_colour_plane_flag)
+    ChromaArrayType = 0;
+
+  if (ChromaArrayType != 0)
+    slc->pred_weight_table.chroma_log2_weight_denom = read_exp_golomb(buf);
+
+  uint32_t last_l0 = slc->num_ref_idx_l0_active_minus1;
+  if (last_l0 > PWT_MAX_REF_IDX)
+    last_l0 = PWT_MAX_REF_IDX;
+
+  uint32_t i;
+  for (i = 0; i <= last_l0; i++) {
+    uint8_t luma_weight_l0_flag = read_bits(buf, 1);
+
+    if (luma_weight_l0_flag == 1) {
+      slc->pred_weight_table.luma_weight_l0[i] = read_exp_golomb_s(buf);
+      slc->pred_weight_table.luma_offset_l0[i] = read_exp_golomb_s(buf);
+    }
+
+    if (ChromaArrayType != 0) {
+      uint8_t chroma_weight_l0_flag = read_bits(buf, 1);
+
+      if (chroma_weight_l0_flag == 1) {
+        int j;
+        for (j = 0; j < 2; j++) {
+          slc->pred_weight_table.chroma_weight_l0[i][j]
+              = read_exp_golomb_s(buf);
+          slc->pred_weight_table.chroma_offset_l0[i][j]
+              = read_exp_golomb_s(buf);
+        }
+      }
+    }
+  }
+
+  if ((slc->slice_type % 5) == SLICE_B) {
+    /* FIXME: Being spec-compliant here and loop to num_ref_idx_l0_active_minus1
+     * will break Divx7 files. Keep this in mind if any other streams are broken
+     */
+    uint32_t last_l1 = slc->num_ref_idx_l1_active_minus1;
+    if (last_l1 > PWT_MAX_REF_IDX)
+      last_l1 = PWT_MAX_REF_IDX;
+
+    for (i = 0; i <= last_l1; i++) {
+      uint8_t luma_weight_l1_flag = read_bits(buf, 1);
+
+      if (luma_weight_l1_flag == 1) {
+        slc->pred_weight_table.luma_weight_l1[i] = read_exp_golomb_s(buf);
+        slc->pred_weight_table.luma_offset_l1[i] = read_exp_golomb_s(buf);
+      }
+
+      if (ChromaArrayType != 0) {
+        uint8_t chroma_weight_l1_flag = read_bits(buf, 1);
+
+        if (chroma_weight_l1_flag == 1) {
+          int j;
+          for (j = 0; j < 2; j++) {
+            slc->pred_weight_table.chroma_weight_l1[i][j]
+                = read_exp_golomb_s(buf);
+            slc->pred_weight_table.chroma_offset_l1[i][j]
+                = read_exp_golomb_s(buf);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Recomputes the picture numbers of all pictures in the dpb reference list
+ * relative to the given current picture, following the PicNum derivation of
+ * ITU-T H264 11/2007, 8.2.4.1 p112f.
+ *
+ * Short-term pictures get frame_num_wrap / pic_num, long-term pictures get
+ * long_term_pic_num. cpic must carry a slice NAL.
+ */
+void calculate_pic_nums(struct h264_parser *parser, struct coded_picture *cpic)
+{
+  struct slice_header *cur_slc = &cpic->slc_nal->slc;
+  xine_list_iterator_t it;
+
+  for (it = xine_list_front(parser->dpb->reference_list); it;
+       it = xine_list_next(parser->dpb->reference_list, it)) {
+    struct decoded_picture *ref =
+        xine_list_get_value(parser->dpb->reference_list, it);
+    int idx;
+
+    /* a decoded picture holds up to two coded pictures (fields) */
+    for (idx = 0; idx < 2; idx++) {
+      struct coded_picture *field = ref->coded_pic[idx];
+      if (field == NULL)
+        continue;
+
+      struct slice_header *ref_slc = &field->slc_nal->slc;
+      struct seq_parameter_set_rbsp *ref_sps = &field->sps_nal->sps;
+
+      if (field->used_for_long_term_ref) {
+        /* long-term reference */
+        field->long_term_pic_num = field->long_term_frame_idx;
+        if (ref_slc->bottom_field_flag == cur_slc->bottom_field_flag)
+          field->long_term_pic_num++;
+        continue;
+      }
+
+      /* short-term reference: frame numbers above the current one wrapped */
+      int32_t wrapped = 0;
+      if (ref_slc->frame_num > cur_slc->frame_num)
+        wrapped = ref_slc->frame_num - ref_sps->max_frame_num;
+      else
+        wrapped = ref_slc->frame_num;
+
+      if (idx == 0)
+        ref->frame_num_wrap = wrapped;
+
+      if (cur_slc->field_pic_flag == 0) {
+        field->pic_num = wrapped;
+        continue;
+      }
+
+      /* current picture is a field: numbers are doubled and get +1 when
+       * the condition below holds */
+      int bump = (ref_slc->field_pic_flag == 1 &&
+                  cur_slc->bottom_field_flag == ref_slc->bottom_field_flag) ||
+                 (ref_slc->field_pic_flag == 0 && !cur_slc->bottom_field_flag);
+
+      field->pic_num = 2 * wrapped;
+      if (bump)
+        field->pic_num++;
+    }
+  }
+}
+
+/**
+ * Applies one decoded reference picture marking step of a coded picture to
+ * the dpb (ITU-T H.264 8.2.5).
+ *
+ * For IDR pictures only long_term_reference_flag of the marking entry is
+ * evaluated. For all other pictures the given memory management control
+ * operation (1..6) is executed; any other value is ignored.
+ *
+ * @param cpic        coded picture the marking belongs to; nothing is done
+ *                    when it has no slice NAL
+ * @param memory_management_control_operation  operation to execute
+ * @param marking_nr  index of the entry in slc.dec_ref_pic_marking[] that
+ *                    carries the operands of the operation
+ * @param parser      parser owning the dpb and the POC/pic_num state
+ */
+void execute_ref_pic_marking(struct coded_picture *cpic,
+    uint32_t memory_management_control_operation,
+    uint32_t marking_nr,
+    struct h264_parser *parser)
+{
+  /**
+   * according to NOTE 6, p83 the dec_ref_pic_marking
+   * structure is identical for all slice headers within
+   * a coded picture, so we can simply use the last
+   * slice_header we saw in the pic
+   */
+  if (!cpic->slc_nal)
+    return;
+  struct slice_header *slc = &cpic->slc_nal->slc;
+  struct dpb *dpb = parser->dpb;
+
+  calculate_pic_nums(parser, cpic);
+
+  if (cpic->flag_mask & IDR_PIC) {
+    if(slc->dec_ref_pic_marking[marking_nr].long_term_reference_flag) {
+      cpic->used_for_long_term_ref = 1;
+      dpb_set_unused_ref_picture_lidx_gt(dpb, 0);
+    } else {
+      /* -1: no long-term frame indices at all */
+      dpb_set_unused_ref_picture_lidx_gt(dpb, -1);
+    }
+    return;
+  }
+
+  /* MMC operation == 1 : 8.2.5.4.1, p. 120 */
+  if (memory_management_control_operation == 1) {
+    // short-term -> unused for reference
+    int32_t pic_num_x = (parser->curr_pic_num
+        - (slc->dec_ref_pic_marking[marking_nr].difference_of_pic_nums_minus1 + 1));
+        //% cpic->max_pic_num;
+    struct decoded_picture* pic = NULL;
+    if ((pic = dpb_get_picture(dpb, pic_num_x)) != NULL) {
+      if (cpic->slc_nal->slc.field_pic_flag == 0) {
+        dpb_unmark_reference_picture(dpb, pic);
+      } else {
+
+        if (pic->coded_pic[0]->slc_nal->slc.field_pic_flag == 1) {
+          /* field pair: drop one field, unmark the picture once both are gone */
+          if (pic->top_is_reference)
+            pic->top_is_reference = 0;
+          else if (pic->bottom_is_reference)
+            pic->bottom_is_reference = 0;
+
+          if(!pic->top_is_reference && !pic->bottom_is_reference)
+            dpb_unmark_reference_picture(dpb, pic);
+        } else {
+          pic->top_is_reference = pic->bottom_is_reference = 0;
+          dpb_unmark_reference_picture(dpb, pic);
+        }
+      }
+    } else {
+        xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+            "H264: mmc 1 failed: %d not existent - curr_pic: %d\n",
+            pic_num_x, parser->curr_pic_num);
+    }
+  } else if (memory_management_control_operation == 2) {
+    // long-term -> unused for reference
+    struct decoded_picture* pic = dpb_get_picture_by_ltpn(dpb,
+        slc->dec_ref_pic_marking[marking_nr].long_term_pic_num);
+    if (pic != NULL) {
+      if (cpic->slc_nal->slc.field_pic_flag == 0)
+        dpb_set_unused_ref_picture_byltpn(dpb,
+            slc->dec_ref_pic_marking[marking_nr].long_term_pic_num);
+      else {
+
+        if (pic->coded_pic[0]->slc_nal->slc.field_pic_flag == 1) {
+          if (pic->top_is_reference)
+            pic->top_is_reference = 0;
+          else if (pic->bottom_is_reference)
+            pic->bottom_is_reference = 0;
+
+          if(!pic->top_is_reference && !pic->bottom_is_reference) {
+            dpb_set_unused_ref_picture_byltpn(dpb,
+                slc->dec_ref_pic_marking[marking_nr].long_term_pic_num);
+          }
+        } else {
+          pic->top_is_reference = pic->bottom_is_reference = 0;
+          dpb_set_unused_ref_picture_byltpn(dpb,
+              slc->dec_ref_pic_marking[marking_nr].long_term_pic_num);
+        }
+      }
+    }
+  } else if (memory_management_control_operation == 3) {
+    // short-term -> long-term, set long-term frame index
+    uint32_t pic_num_x = parser->curr_pic_num
+        - (slc->dec_ref_pic_marking[marking_nr].difference_of_pic_nums_minus1 + 1);
+
+    /* A picture already holding the target long-term frame index loses it.
+     * Operation 3 carries long_term_frame_idx (long_term_pic_num is only
+     * parsed for operation 2), so the lookup has to use the frame index. */
+    struct decoded_picture* pic = dpb_get_picture_by_ltidx(dpb,
+        slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx);
+    if (pic != NULL)
+      dpb_set_unused_ref_picture_bylidx(dpb,
+          slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx);
+
+    pic = dpb_get_picture(dpb, pic_num_x);
+    if (pic) {
+      if (pic->coded_pic[0]->slc_nal->slc.field_pic_flag == 0) {
+        pic->coded_pic[0]->long_term_frame_idx
+            = slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx;
+        pic->coded_pic[0]->long_term_pic_num = pic->coded_pic[0]->long_term_frame_idx;
+      }
+      else {
+        /* only the field whose pic_num matches becomes long-term */
+        if(pic->coded_pic[0]->pic_num == pic_num_x) {
+          pic->coded_pic[0]->long_term_frame_idx
+              = slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx;
+          pic->coded_pic[0]->long_term_pic_num = pic->coded_pic[0]->long_term_frame_idx * 2 + 1;
+        } else if(pic->coded_pic[1] != NULL &&
+            pic->coded_pic[1]->pic_num == pic_num_x) {
+          pic->coded_pic[1]->long_term_frame_idx
+              = slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx;
+          pic->coded_pic[1]->long_term_pic_num = pic->coded_pic[1]->long_term_frame_idx * 2 + 1;
+        }
+      }
+    }
+    else {
+      xprintf(parser->xine, XINE_VERBOSITY_DEBUG,
+          "memory_management_control_operation: 3 failed. No such picture.\n");
+    }
+
+  } else if (memory_management_control_operation == 4) {
+    /* set max-long-term frame index,
+     * mark all long-term pictures with long-term frame idx
+     * greater max-long-term frame idx as unused for ref */
+    if (slc->dec_ref_pic_marking[marking_nr].max_long_term_frame_idx_plus1 == 0)
+      /* "no long-term frame indices": index 0 has to go as well */
+      dpb_set_unused_ref_picture_lidx_gt(dpb, -1);
+    else
+      dpb_set_unused_ref_picture_lidx_gt(dpb,
+          slc->dec_ref_pic_marking[marking_nr].max_long_term_frame_idx_plus1 - 1);
+  } else if (memory_management_control_operation == 5) {
+    /* mark all ref pics as unused for reference,
+     * set max-long-term frame index = no long-term frame idxs */
+    dpb_flush(dpb);
+
+    if (!slc->bottom_field_flag) {
+      parser->prev_pic_order_cnt_lsb = cpic->top_field_order_cnt;
+      parser->prev_pic_order_cnt_msb = 0;
+    } else {
+      parser->prev_pic_order_cnt_lsb = 0;
+      parser->prev_pic_order_cnt_msb = 0;
+    }
+  } else if (memory_management_control_operation == 6) {
+    /* mark current picture as used for long-term ref,
+     * assign long-term frame idx to it */
+    struct decoded_picture* pic = dpb_get_picture_by_ltidx(dpb,
+        slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx);
+    if (pic != NULL)
+      dpb_set_unused_ref_picture_bylidx(dpb,
+          slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx);
+
+    cpic->long_term_frame_idx = slc->dec_ref_pic_marking[marking_nr].long_term_frame_idx;
+    cpic->used_for_long_term_ref = 1;
+
+    if (slc->field_pic_flag == 0) {
+      cpic->long_term_pic_num = cpic->long_term_frame_idx;
+    }
+    else {
+      cpic->long_term_pic_num = cpic->long_term_frame_idx * 2 + 1;
+    }
+
+  }
+}
+
+/**
+ * Parses the dec_ref_pic_marking() syntax of a slice header into
+ * slc_nal->slc.dec_ref_pic_marking[] and sets dec_ref_pic_marking_count.
+ *
+ * For IDR slices the two IDR flags are stored in entry 0 and the count is
+ * set to 1. For other slices the memory management control operations are
+ * stored one per entry; the terminating operation 0 is not counted.
+ *
+ * At most 10 entries are kept (presumably the capacity of the array, which
+ * is declared elsewhere). If the stream carries more operations, storing
+ * restarts at entry 0 and the earlier operations are dropped.
+ *
+ * @param buf      bit reader positioned at dec_ref_pic_marking()
+ * @param slc_nal  slice NAL unit receiving the result; ignored if NULL
+ */
+void parse_dec_ref_pic_marking(struct buf_reader *buf,
+    struct nal_unit *slc_nal)
+{
+  if (!slc_nal)
+    return;
+
+  struct slice_header *slc = &slc_nal->slc;
+  uint32_t last_op = 0;
+  int i = 0;
+
+  slc->dec_ref_pic_marking_count = 0;
+
+  if (slc_nal->nal_unit_type == NAL_SLICE_IDR) {
+    slc->dec_ref_pic_marking[i].no_output_of_prior_pics_flag = read_bits(buf, 1);
+    slc->dec_ref_pic_marking[i].long_term_reference_flag = read_bits(buf, 1);
+    /* makes the count below come out as 1 */
+    i+=2;
+  } else {
+    slc->dec_ref_pic_marking[i].adaptive_ref_pic_marking_mode_flag = read_bits(
+        buf, 1);
+
+    if (slc->dec_ref_pic_marking[i].adaptive_ref_pic_marking_mode_flag) {
+      do {
+        slc->dec_ref_pic_marking[i].memory_management_control_operation
+            = read_exp_golomb(buf);
+        /* keep the operation in a local: i may be reset below, so the
+         * loop condition must not index the array with i-1 */
+        last_op = slc->dec_ref_pic_marking[i].memory_management_control_operation;
+
+        if (last_op == 1 || last_op == 3)
+          slc->dec_ref_pic_marking[i].difference_of_pic_nums_minus1
+              = read_exp_golomb(buf);
+
+        if (last_op == 2)
+          slc->dec_ref_pic_marking[i].long_term_pic_num = read_exp_golomb(buf);
+
+        if (last_op == 3 || last_op == 6)
+          slc->dec_ref_pic_marking[i].long_term_frame_idx = read_exp_golomb(buf);
+
+        if (last_op == 4)
+          slc->dec_ref_pic_marking[i].max_long_term_frame_idx_plus1
+              = read_exp_golomb(buf);
+
+        i++;
+        /* only wrap when more operations follow; a terminator stored in
+         * the last slot must not throw away the operations before it */
+        if(i >= 10 && last_op != 0) {
+          lprintf("Error: Not more than 10 MMC operations supported per slice. Dropping some.\n");
+          i = 0;
+        }
+      } while (last_op != 0);
+    }
+  }
+
+  slc->dec_ref_pic_marking_count = (i>0) ? (i-1) : 0;
+}
+
+/* ----------------- NAL parser ----------------- */
+
+/**
+ * Allocates a zero-initialised parser together with its coded picture,
+ * SPS/PPS buffers (32 entries each) and dpb.
+ *
+ * @param xine  xine instance stored in the parser (used for logging)
+ * @return the new parser, or NULL if the parser itself could not be
+ *         allocated; release it with free_parser()
+ */
+struct h264_parser* init_parser(xine_t *xine)
+{
+  /* the struct embeds two MAX_FRAME_SIZE buffers, so this is a large request */
+  struct h264_parser *parser = calloc(1, sizeof(struct h264_parser));
+  if (parser == NULL)
+    return NULL;
+
+  parser->pic = create_coded_picture();
+  parser->position = NON_VCL;
+  parser->last_vcl_nal = NULL;
+  parser->sps_buffer = create_nal_buffer(32);
+  parser->pps_buffer = create_nal_buffer(32);
+  parser->xine = xine;
+  parser->dpb = create_dpb();
+
+  return parser;
+}
+
+/**
+ * Puts the parser back into its initial parsing state: buffered data is
+ * discarded, the remembered VCL NAL is released, the picture order and
+ * numbering state is cleared and the picture under construction is
+ * replaced by a fresh one. The SPS/PPS buffers and the dpb are not touched.
+ */
+void reset_parser(struct h264_parser *parser)
+{
+  /* forget all buffered bitstream data */
+  parser->position = NON_VCL;
+  parser->buf_len = 0;
+  parser->prebuf_len = 0;
+  parser->next_nal_position = 0;
+  parser->last_nal_res = 0;
+
+  if (parser->last_vcl_nal != NULL)
+    release_nal_unit(parser->last_vcl_nal);
+  parser->last_vcl_nal = NULL;
+
+  /* picture order count / picture number state */
+  parser->prev_pic_order_cnt_lsb = 0;
+  parser->prev_pic_order_cnt_msb = 0;
+  parser->prev_top_field_order_cnt = 0;
+  parser->frame_num_offset = 0;
+  parser->curr_pic_num = 0;
+  parser->flag_mask = 0;
+
+  if (parser->pic == NULL)
+    return;
+
+  /* start over with an empty coded picture */
+  free_coded_picture(parser->pic);
+  parser->pic = create_coded_picture();
+}
+
+/**
+ * Destroys a parser created by init_parser() together with everything it
+ * owns: the picture under construction, the remembered VCL NAL, the dpb,
+ * both parameter set buffers and the NAL size length buffer.
+ *
+ * @param parser  parser to destroy; NULL is ignored
+ */
+void free_parser(struct h264_parser *parser)
+{
+  if (parser == NULL)
+    return;
+
+  /* the parser holds its own reference on the last VCL NAL (see parse_nal) */
+  if (parser->last_vcl_nal != NULL)
+    release_nal_unit(parser->last_vcl_nal);
+
+  /* parser->pic is always parser-owned: completed pictures are handed out
+   * through parse_nal/parse_frame and replaced by a new one */
+  if (parser->pic != NULL)
+    free_coded_picture(parser->pic);
+
+  dpb_free_all(parser->dpb);
+  release_dpb(parser->dpb);
+  free_nal_buffer(parser->pps_buffer);
+  free_nal_buffer(parser->sps_buffer);
+
+  /* allocated by parse_codec_private; still NULL if that was never called */
+  free(parser->nal_size_length_buf);
+  free(parser);
+}
+
+/**
+ * Parses codec private data laid out as: 5 header bytes carrying profile,
+ * level and the NAL length field size, a 5 bit SPS count, the SPS NAL units
+ * (each prefixed by a 16 bit big-endian size), an 8 bit PPS count and the
+ * PPS NAL units (same prefix). This looks like the ISO/IEC 14496-15
+ * AVCDecoderConfigurationRecord - TODO confirm against the demuxers.
+ *
+ * Sets parser->nal_size_length, which switches parse_frame/seek_for_nal to
+ * length-prefixed NAL units, and feeds every parameter set to parse_nal.
+ * A truncated record is parsed as far as it is complete.
+ *
+ * @param parser     parser to configure
+ * @param inbuf      codec private data
+ * @param inbuf_len  number of bytes in inbuf
+ */
+void parse_codec_private(struct h264_parser *parser, uint8_t *inbuf, int inbuf_len)
+{
+  struct buf_reader bufr;
+  struct coded_picture *dummy = NULL;
+  struct nal_unit *nal;
+  uint8_t sps_count;
+  uint8_t pps_count;
+  int i;
+
+  /* the fixed part (header + SPS count) takes 6 bytes */
+  if (inbuf == NULL || inbuf_len < 6)
+    return;
+
+  // FIXME: Might be broken!
+  nal = calloc(1, sizeof(struct nal_unit));
+  if (nal == NULL)
+    return;
+
+  bufr.buf = inbuf;
+  bufr.cur_pos = inbuf;
+  bufr.cur_offset = 8;
+  bufr.len = inbuf_len;
+
+  /* reserved */
+  read_bits(&bufr, 8);
+  nal->sps.profile_idc = read_bits(&bufr, 8);
+  read_bits(&bufr, 8);
+  nal->sps.level_idc = read_bits(&bufr, 8);
+  read_bits(&bufr, 6);
+
+  parser->nal_size_length = read_bits(&bufr, 2) + 1;
+  /* do not leak the buffer of an earlier call */
+  free(parser->nal_size_length_buf);
+  parser->nal_size_length_buf = calloc(1, parser->nal_size_length);
+  read_bits(&bufr, 3);
+  sps_count = read_bits(&bufr, 5);
+
+  inbuf += 6;
+  inbuf_len -= 6;
+
+  /* The size prefixes are read straight from inbuf: the bit reader does not
+   * advance over the parameter set payloads, so it cannot be used for any
+   * entry but the first. */
+  for(i = 0; i < sps_count; i++) {
+    int sps_size;
+
+    if (inbuf_len < 2)
+      goto done;
+    sps_size = (inbuf[0] << 8) | inbuf[1];
+    inbuf += 2;
+    inbuf_len -= 2;
+
+    if (sps_size > inbuf_len)
+      goto done;
+    if (sps_size > 0)
+      parse_nal(inbuf, sps_size, parser, &dummy);
+    if (dummy != NULL) {
+      /* nobody consumes pictures completed while reading parameter sets */
+      free_coded_picture(dummy);
+      dummy = NULL;
+    }
+    inbuf += sps_size;
+    inbuf_len -= sps_size;
+  }
+
+  if (inbuf_len < 1)
+    goto done;
+  pps_count = inbuf[0];
+  inbuf += 1;
+  inbuf_len -= 1;
+
+  for(i = 0; i < pps_count; i++) {
+    int pps_size;
+
+    if (inbuf_len < 2)
+      goto done;
+    pps_size = (inbuf[0] << 8) | inbuf[1];
+    inbuf += 2;
+    inbuf_len -= 2;
+
+    if (pps_size > inbuf_len)
+      goto done;
+    if (pps_size > 0)
+      parse_nal(inbuf, pps_size, parser, &dummy);
+    if (dummy != NULL) {
+      free_coded_picture(dummy);
+      dummy = NULL;
+    }
+    inbuf += pps_size;
+    inbuf_len -= pps_size;
+  }
+
+done:
+  /* the unit carrying profile/level from the header goes to the SPS buffer */
+  nal_buffer_append(parser->sps_buffer, nal);
+}
+
+/**
+ * Runs all reference picture marking operations stored in the slice header
+ * of a coded picture, in order, through execute_ref_pic_marking().
+ *
+ * For reference pictures the picture's pic_order_cnt_lsb is remembered in
+ * the parser first. Pictures without a slice NAL are ignored.
+ *
+ * @param parser   parser owning the dpb
+ * @param picture  coded picture whose marking operations are executed
+ */
+void process_mmc_operations(struct h264_parser *parser, struct coded_picture *picture)
+{
+  /* same guard as execute_ref_pic_marking: no slice, no marking */
+  if (picture == NULL || picture->slc_nal == NULL)
+    return;
+
+  struct slice_header *slc = &picture->slc_nal->slc;
+
+  if (picture->flag_mask & REFERENCE) {
+    parser->prev_pic_order_cnt_lsb = slc->pic_order_cnt_lsb;
+  }
+
+  int i;
+  for(i = 0; i < slc->dec_ref_pic_marking_count; i++) {
+    execute_ref_pic_marking(
+        picture,
+        slc->dec_ref_pic_marking[i].memory_management_control_operation,
+        i,
+        parser);
+  }
+}
+
+/*
+ * Appends one NAL unit to the frame buffer parser->buf. In length-prefixed
+ * mode (nal_size_length > 0) a 00 00 01 start code is written in front of it.
+ * Returns 1 on success, 0 if the unit (plus start code) does not fit.
+ */
+static int append_nal_to_frame_buf(struct h264_parser *parser,
+    const uint8_t *nal_data, int32_t nal_len)
+{
+  static const uint8_t start_seq[3] = { 0x00, 0x00, 0x01 };
+  const uint32_t prefix_len = (parser->nal_size_length > 0) ? 3 : 0;
+
+  if (nal_len < 0 ||
+      parser->buf_len + prefix_len + (uint32_t)nal_len > MAX_FRAME_SIZE)
+    return 0;
+
+  if (prefix_len > 0) {
+    xine_fast_memcpy(parser->buf + parser->buf_len, start_seq, prefix_len);
+    parser->buf_len += prefix_len;
+  }
+
+  xine_fast_memcpy(parser->buf + parser->buf_len, nal_data, nal_len);
+  parser->buf_len += nal_len;
+  return 1;
+}
+
+/**
+ * Feeds a chunk of bitstream data to the parser.
+ *
+ * The data is appended to parser->prebuf, which is then split into NAL
+ * units. Each unit is parsed and, if it belongs to a coded picture, copied
+ * to parser->buf. As soon as a coded picture is complete the function
+ * returns it; data following it stays in prebuf for the next call.
+ *
+ * @param parser     parser state
+ * @param inbuf      new bitstream data
+ * @param inbuf_len  number of bytes in inbuf
+ * @param pts        presentation time stamp of the data, 0 if unknown
+ * @param ret_buf    receives a malloc'ed copy of the completed frame's NAL
+ *                   units or NULL (presumably to be freed by the caller)
+ * @param ret_len    receives the size of *ret_buf or 0
+ * @param ret_pic    receives the completed coded picture or NULL
+ * @return always inbuf_len
+ */
+int parse_frame(struct h264_parser *parser, uint8_t *inbuf, int inbuf_len,
+    int64_t pts,
+    uint8_t **ret_buf, uint32_t *ret_len, struct coded_picture **ret_pic)
+{
+  int32_t next_nal = 0;
+  int32_t offset = 0;
+  int start_seq_len = 3;
+
+  *ret_pic = NULL;
+  *ret_buf = NULL;
+  *ret_len = 0;
+
+  /* length-prefixed NALs: the prefix has nal_size_length bytes */
+  if(parser->nal_size_length > 0)
+    start_seq_len = offset = parser->nal_size_length;
+
+  if (parser->prebuf_len + inbuf_len > MAX_FRAME_SIZE) {
+    xprintf(parser->xine, XINE_VERBOSITY_LOG,"h264_parser: prebuf underrun\n");
+    parser->prebuf_len = 0;
+    return inbuf_len;
+  }
+
+  /* copy the whole inbuf to the prebuf,
+   * then search for a nal-start sequence in the prebuf,
+   * if it's in there, parse the nal and append to parser->buf
+   * or return a frame */
+
+  xine_fast_memcpy(parser->prebuf + parser->prebuf_len, inbuf, inbuf_len);
+  parser->prebuf_len += inbuf_len;
+
+  while((next_nal = seek_for_nal(parser->prebuf+start_seq_len-offset,
+      parser->prebuf_len-start_seq_len+offset, parser)) > 0) {
+
+    /* bytes the NAL occupies at the front of prebuf, prefix included */
+    const int32_t consumed = next_nal + start_seq_len - offset;
+    /* bytes copied to the frame buffer, starting at prebuf+offset */
+    const int32_t payload = consumed - offset;
+
+    struct coded_picture *completed_pic = NULL;
+
+    if(!parser->nal_size_length &&
+        (parser->prebuf[0] != 0x00 || parser->prebuf[1] != 0x00 ||
+            parser->prebuf[2] != 0x01)) {
+      xprintf(parser->xine, XINE_VERBOSITY_LOG, "Broken NAL, skip it.\n");
+      parser->last_nal_res = 2;
+    } else {
+      parser->last_nal_res = parse_nal(parser->prebuf+start_seq_len,
+          next_nal, parser, &completed_pic);
+    }
+
+    if (completed_pic != NULL &&
+        completed_pic->slice_cnt > 0 &&
+        parser->buf_len > 0) {
+
+      //lprintf("Frame complete: %d bytes\n", parser->buf_len);
+      *ret_buf = malloc(parser->buf_len);
+      if (*ret_buf != NULL) {
+        *ret_len = parser->buf_len;
+        xine_fast_memcpy(*ret_buf, parser->buf, parser->buf_len);
+        *ret_pic = completed_pic;
+      } else {
+        /* cannot hand the frame out: drop it instead of crashing */
+        xprintf(parser->xine, XINE_VERBOSITY_LOG,
+            "h264_parser: out of memory, dropping frame\n");
+        free_coded_picture(completed_pic);
+      }
+
+      parser->buf_len = 0;
+
+      if (pts != 0) {
+        parser->pic->pts = pts;
+      }
+
+      /**
+       * if the new coded picture started with a VCL nal
+       * we have to copy this to buffer for the next picture
+       * now.
+       */
+      if(parser->last_nal_res == 1) {
+        if (!append_nal_to_frame_buf(parser, parser->prebuf+offset, payload)) {
+          xprintf(parser->xine, XINE_VERBOSITY_LOG, "h264_parser: buf underrun!\n");
+        }
+      }
+
+      memmove(parser->prebuf, parser->prebuf+consumed, parser->prebuf_len-consumed);
+      parser->prebuf_len -= consumed;
+
+      return inbuf_len;
+    }
+
+    /* a picture was closed but cannot be returned (no slices or no data):
+     * the parser already switched to a new picture, so release this one */
+    if (completed_pic != NULL) {
+      free_coded_picture(completed_pic);
+    }
+
+    /* got a new nal, which is part of the current
+     * coded picture. add it to buf
+     */
+    if (parser->last_nal_res < 2) {
+      if (!append_nal_to_frame_buf(parser, parser->prebuf+offset, payload)) {
+        xprintf(parser->xine, XINE_VERBOSITY_LOG, "h264_parser: buf underrun!\n");
+        parser->buf_len = 0;
+        *ret_len = 0;
+        *ret_buf = NULL;
+        return inbuf_len;
+      }
+    }
+
+    /* the nal is dealt with (copied or not relevant), remove it from prebuf */
+    memmove(parser->prebuf, parser->prebuf+consumed, parser->prebuf_len-consumed);
+    parser->prebuf_len -= consumed;
+  }
+
+  if (pts != 0) {
+    parser->pic->pts = pts;
+  }
+
+  *ret_buf = NULL;
+  *ret_len = 0;
+  return inbuf_len;
+}
+
+
+/*
+ * Frame boundary detection according to
+ * ITU-T Rec. H264 (11/2007) chapt 7.4.1.2.4, p65:
+ * returns 1 if the slice nal belongs to another primary coded picture
+ * than the previous slice last_nal, 0 otherwise.
+ */
+static int starts_new_coded_picture(const struct nal_unit *nal,
+    const struct nal_unit *last_nal)
+{
+  if (nal->slc.frame_num != last_nal->slc.frame_num)
+    return 1;
+
+  if (nal->slc.pic_parameter_set_id != last_nal->slc.pic_parameter_set_id)
+    return 1;
+
+  if (nal->slc.field_pic_flag != last_nal->slc.field_pic_flag)
+    return 1;
+
+  if (nal->slc.bottom_field_flag != last_nal->slc.bottom_field_flag)
+    return 1;
+
+  /* nal_ref_idc differs with one of the two being 0 */
+  if (nal->nal_ref_idc != last_nal->nal_ref_idc &&
+      (nal->nal_ref_idc == 0 || last_nal->nal_ref_idc == 0))
+    return 1;
+
+  if (nal->sps.pic_order_cnt_type == 0
+      && last_nal->sps.pic_order_cnt_type == 0
+      && (nal->slc.pic_order_cnt_lsb != last_nal->slc.pic_order_cnt_lsb
+          || nal->slc.delta_pic_order_cnt_bottom
+              != last_nal->slc.delta_pic_order_cnt_bottom))
+    return 1;
+
+  if (nal->sps.pic_order_cnt_type == 1
+      && last_nal->sps.pic_order_cnt_type == 1
+      && (nal->slc.delta_pic_order_cnt[0]
+              != last_nal->slc.delta_pic_order_cnt[0]
+          || nal->slc.delta_pic_order_cnt[1]
+              != last_nal->slc.delta_pic_order_cnt[1]))
+    return 1;
+
+  /* exactly one of the two is an IDR slice */
+  if (nal->nal_unit_type != last_nal->nal_unit_type
+      && (nal->nal_unit_type == NAL_SLICE_IDR
+          || last_nal->nal_unit_type == NAL_SLICE_IDR))
+    return 1;
+
+  if (nal->nal_unit_type == NAL_SLICE_IDR
+      && last_nal->nal_unit_type == NAL_SLICE_IDR
+      && nal->slc.idr_pic_id != last_nal->slc.idr_pic_id)
+    return 1;
+
+  return 0;
+}
+
+/**
+ * Parses a single NAL unit and updates the parser state with it.
+ *
+ * SPS/PPS units are stored in the parameter set buffers, SEI and slice
+ * units are attached to the picture under construction. When the unit
+ * shows that the previous access unit has ended, that picture is handed
+ * out through completed_picture and a new one is started.
+ *
+ * @param buf      NAL unit data, starting with the NAL header
+ * @param buf_len  number of bytes in buf
+ * @param parser   parser state
+ * @param completed_picture  receives the finished coded picture or NULL
+ *
+ * @return 0: NAL is part of coded picture
+ *         2: NAL is not part of coded picture
+ *         1: NAL is the beginning of a new coded picture
+ *         3: NAL is marked as END_OF_SEQUENCE
+ */
+int parse_nal(uint8_t *buf, int buf_len, struct h264_parser *parser,
+    struct coded_picture **completed_picture)
+{
+  int ret = 0;
+
+  struct buf_reader bufr;
+
+  bufr.buf = buf;
+  bufr.cur_pos = buf;
+  bufr.cur_offset = 8;
+  bufr.len = buf_len;
+
+  *completed_picture = NULL;
+
+  struct nal_unit *nal = parse_nal_header(&bufr, parser->pic, parser);
+
+  /**
+   * we detect the start of a new access unit if
+   * a non-vcl nal unit is received after a vcl
+   * nal unit
+   * NAL_END_OF_SEQUENCE terminates the current
+   * access unit
+   */
+  if (nal->nal_unit_type >= NAL_SLICE &&
+      nal->nal_unit_type <= NAL_SLICE_IDR) {
+    parser->position = VCL;
+  } else if ((parser->position == VCL &&
+      nal->nal_unit_type >= NAL_SEI &&
+      nal->nal_unit_type <= NAL_PPS) ||
+      nal->nal_unit_type == NAL_AU_DELIMITER ||
+      nal->nal_unit_type == NAL_END_OF_SEQUENCE) {
+    /* start of a new access unit! */
+    *completed_picture = parser->pic;
+    parser->pic = create_coded_picture();
+
+    if(parser->last_vcl_nal != NULL) {
+      release_nal_unit(parser->last_vcl_nal);
+      parser->last_vcl_nal = NULL;
+    }
+    parser->position = NON_VCL;
+  } else {
+    parser->position = NON_VCL;
+  }
+
+  switch(nal->nal_unit_type) {
+    case NAL_SPS:
+      nal_buffer_append(parser->sps_buffer, nal);
+      break;
+    case NAL_PPS:
+      nal_buffer_append(parser->pps_buffer, nal);
+      break;
+    case NAL_SEI: {
+      /* the newest SEI replaces the one stored in the picture */
+      if (parser->pic != NULL) {
+        if(parser->pic->sei_nal) {
+          release_nal_unit(parser->pic->sei_nal);
+        }
+        lock_nal_unit(nal);
+        parser->pic->sei_nal = nal;
+        interpret_sei(parser->pic);
+      }
+      break;
+    }
+    default:
+      break;
+  }
+
+  /**
+   * in case of an access unit which does not contain any
+   * non-vcl nal units we have to detect the new access
+   * unit through the algorithm for detecting first vcl nal
+   * units of a primary coded picture
+   */
+  if (parser->position == VCL && parser->last_vcl_nal != NULL &&
+      nal->nal_unit_type >= NAL_SLICE && nal->nal_unit_type <= NAL_SLICE_IDR) {
+    ret = starts_new_coded_picture(nal, parser->last_vcl_nal);
+
+    /* the slice opens a new picture: hand out the old one, unless the
+     * access unit detection above already did */
+    if (ret && *completed_picture == NULL) {
+      *completed_picture = parser->pic;
+      parser->pic = create_coded_picture();
+    }
+
+  } else if (nal->nal_unit_type == NAL_PPS || nal->nal_unit_type == NAL_SPS) {
+    ret = 2;
+  } else if (nal->nal_unit_type == NAL_AU_DELIMITER) {
+    ret = 2;
+  } else if (nal->nal_unit_type == NAL_END_OF_SEQUENCE) {
+    ret = 3;
+  } else if (nal->nal_unit_type >= NAL_SEI) {
+    ret = 2;
+  }
+
+  if (parser->pic) {
+
+    if (nal->nal_unit_type == NAL_SLICE_IDR) {
+      parser->pic->flag_mask |= IDR_PIC;
+    }
+
+    /* reference flag is only set for slice NALs,
+     * as PPS/SPS/SEI only references are not relevant
+     * for the vdpau decoder.
+     */
+    if (nal->nal_ref_idc &&
+        nal->nal_unit_type >= NAL_SLICE &&
+        nal->nal_unit_type <= NAL_SLICE_IDR) {
+      parser->pic->flag_mask |= REFERENCE;
+    } else if (!nal->nal_ref_idc &&
+        nal->nal_unit_type >= NAL_SLICE &&
+        nal->nal_unit_type <= NAL_PART_C) {
+      /* remove reference flag if a picture is not
+       * continuously flagged as reference. */
+      parser->pic->flag_mask &= ~REFERENCE;
+    }
+
+    if (nal->nal_unit_type >= NAL_SLICE &&
+        nal->nal_unit_type <= NAL_SLICE_IDR) {
+      /* one reference for parser->last_vcl_nal ... */
+      lock_nal_unit(nal);
+      if(parser->last_vcl_nal) {
+        release_nal_unit(parser->last_vcl_nal);
+      }
+      parser->last_vcl_nal = nal;
+
+      parser->pic->slice_cnt++;
+      if(parser->pic->slc_nal) {
+        release_nal_unit(parser->pic->slc_nal);
+      }
+      /* ... and one for the picture */
+      lock_nal_unit(nal);
+      parser->pic->slc_nal = nal;
+
+      interpret_slice_header(parser, nal);
+    }
+
+    if (*completed_picture != NULL &&
+        (*completed_picture)->slice_cnt > 0) {
+      calculate_pic_order(parser, *completed_picture,
+          &((*completed_picture)->slc_nal->slc));
+      interpret_sps(*completed_picture, parser);
+      interpret_pps(*completed_picture);
+    }
+  }
+
+  /* drop the reference obtained from parse_nal_header */
+  release_nal_unit(nal);
+  return ret;
+}
+
+/**
+ * Locates the end of the NAL unit at the start of buf.
+ *
+ * Length-prefixed mode (parser->nal_size_length > 0): the unit's end is the
+ * value of its length field plus the size of that field. If buf does not
+ * hold that many bytes yet, the value is cached in
+ * parser->next_nal_position and -1 is returned.
+ *
+ * Start code mode: returns the offset of the first 00 00 01 sequence in
+ * buf, or 1 if the first byte equals NAL_END_OF_SEQUENCE.
+ *
+ * @return offset as described above, -1 if nothing was found
+ */
+int seek_for_nal(uint8_t *buf, int buf_len, struct h264_parser *parser)
+{
+  int pos;
+
+  if (buf_len <= 0)
+    return -1;
+
+  if (parser->nal_size_length > 0) {
+    uint32_t nal_end;
+
+    if (buf_len < parser->nal_size_length)
+      return -1;
+
+    /* reuse the length read by an earlier call, if there is one */
+    nal_end = parser->next_nal_position;
+    if (nal_end == 0) {
+      struct buf_reader reader;
+
+      reader.buf = buf;
+      reader.cur_pos = buf;
+      reader.cur_offset = 8;
+      reader.len = buf_len;
+
+      nal_end = read_bits(&reader, parser->nal_size_length*8)+parser->nal_size_length;
+    }
+
+    if (nal_end > buf_len) {
+      /* incomplete, remember the length until more data arrived */
+      parser->next_nal_position = nal_end;
+      return -1;
+    }
+
+    parser->next_nal_position = 0;
+    return nal_end;
+  }
+
+  /* NAL_END_OF_SEQUENCE has only 1 byte, so
+   * we do not need to search for the next start sequence */
+  if (buf[0] == NAL_END_OF_SEQUENCE)
+    return 1;
+
+  for (pos = 0; pos + 2 < buf_len; pos++) {
+    if (buf[pos] != 0x00 || buf[pos + 1] != 0x00 || buf[pos + 2] != 0x01)
+      continue;
+
+    //lprintf("found nal at: %d\n", pos);
+    return pos;
+  }
+
+  return -1;
+}
diff --git a/src/video_dec/libvdpau/h264_parser.h b/src/video_dec/libvdpau/h264_parser.h
new file mode 100644
index 000000000..49bc56bab
--- /dev/null
+++ b/src/video_dec/libvdpau/h264_parser.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * h264_parser.h: Almost full-features H264 NAL-Parser
+ */
+
+#ifndef NAL_PARSER_H_
+#define NAL_PARSER_H_
+
+#include <stdlib.h>
+
+#include <xine/xine_internal.h>
+#include "nal.h"
+#include "dpb.h"
+
+/* size in bytes of the parser's buf and prebuf arrays; parenthesised so the
+ * macro can be used inside larger expressions */
+#define MAX_FRAME_SIZE  (1024*1024)
+
+/* specifies whether the parser last parsed
+ * non-vcl or vcl nal units. depending on
+ * this the access unit boundaries are detected
+ */
+enum parser_position {
+    NON_VCL, /* last nal unit was not a slice */
+    VCL      /* last nal unit was a slice */
+};
+
+/* bit flags; presumably the values stored in h264_parser.flag_mask and named
+ * after the stream properties they signal - the users are not part of this
+ * file, TODO confirm */
+enum parser_flags {
+    CPB_DPB_DELAYS_PRESENT = 0x01,
+    PIC_STRUCT_PRESENT = 0x02
+};
+
+/* complete state of one H264 parser instance, see init_parser() */
+struct h264_parser {
+    /* nal units of the coded picture that is currently collected;
+     * parse_frame returns a copy once the picture is complete */
+    uint8_t buf[MAX_FRAME_SIZE];
+    uint32_t buf_len;
+
+    /* prebuf is used to store the currently
+     * processed nal unit */
+    uint8_t prebuf[MAX_FRAME_SIZE];
+    uint32_t prebuf_len;
+    /* end offset of a length-prefixed nal unit that was not completely
+     * available yet (set by seek_for_nal), 0 if none is pending */
+    uint32_t next_nal_position;
+
+    /* result of the last parse_nal call made by parse_frame */
+    uint8_t last_nal_res;
+
+    /* size in bytes of the length field in front of each nal unit as set
+     * by parse_codec_private; 0 means nal units are found by start codes */
+    uint8_t nal_size_length;
+    /* NOTE(review): not referenced by the parser code visible here */
+    uint32_t next_nal_size;
+    /* allocated by parse_codec_private with nal_size_length bytes */
+    uint8_t *nal_size_length_buf;
+    /* NOTE(review): not referenced by the parser code visible here */
+    uint8_t have_nal_size_length_buf;
+
+    enum parser_position position;
+
+    /* coded picture that is currently assembled */
+    struct coded_picture *pic;
+
+    /* last slice nal unit, used to detect picture boundaries in parse_nal */
+    struct nal_unit *last_vcl_nal;
+    struct nal_buffer *sps_buffer;
+    struct nal_buffer *pps_buffer;
+
+    /* picture order count state carried from picture to picture */
+    uint32_t prev_pic_order_cnt_lsb;
+    uint32_t prev_pic_order_cnt_msb;
+    uint32_t frame_num_offset;
+
+    int32_t prev_top_field_order_cnt;
+
+    /* picture number of the current picture, the base for the picture
+     * numbers computed in execute_ref_pic_marking */
+    uint32_t curr_pic_num;
+
+    /* presumably a combination of enum parser_flags values - TODO confirm */
+    uint16_t flag_mask;
+
+    /* this is dpb used for reference frame
+     * heading to vdpau + unordered frames
+     */
+    struct dpb *dpb;
+
+    /* xine instance, used for log output */
+    xine_t *xine;
+};
+
+/* parses one nal unit. returns 0 if it is part of the current coded picture,
+ * 1 if it starts a new coded picture, 2 if it is not part of a coded picture
+ * and 3 for END_OF_SEQUENCE. a picture finished by this nal unit is returned
+ * in completed_picture, which is set to NULL otherwise.
+ */
+int parse_nal(uint8_t *buf, int buf_len, struct h264_parser *parser,
+    struct coded_picture **completed_picture);
+
+/* returns the offset of the next nal boundary in buf or -1 if none was
+ * found in the available data.
+ */
+int seek_for_nal(uint8_t *buf, int buf_len, struct h264_parser *parser);
+
+/* create, reset and destroy a parser */
+struct h264_parser* init_parser(xine_t *xine);
+void reset_parser(struct h264_parser *parser);
+void free_parser(struct h264_parser *parser);
+
+/* feeds inbuf to the parser. if a coded picture got complete, ret_buf
+ * (malloc'ed), ret_len and ret_pic describe it, otherwise they are set to
+ * NULL/0. always returns inbuf_len.
+ */
+int parse_frame(struct h264_parser *parser, uint8_t *inbuf, int inbuf_len,
+    int64_t pts,
+    uint8_t **ret_buf, uint32_t *ret_len, struct coded_picture **ret_pic);
+
+/* this has to be called after decoding the frame delivered by parse_frame,
+ * but before adding a decoded frame to the dpb.
+ */
+void process_mmc_operations(struct h264_parser *parser, struct coded_picture *picture);
+
+/* reads the nal length field size and the contained SPS/PPS nal units from
+ * codec private data.
+ */
+void parse_codec_private(struct h264_parser *parser, uint8_t *inbuf, int inbuf_len);
+
+#endif
diff --git a/src/video_dec/libvdpau/nal.c b/src/video_dec/libvdpau/nal.c
new file mode 100644
index 000000000..c3693c5f2
--- /dev/null
+++ b/src/video_dec/libvdpau/nal.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * nal.c: nal-structure utility functions
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nal.h"
+#include <xine/xine_internal.h>
+
+/**
+ * allocates an empty nal buffer that holds at most max_size nal units
+ *
+ * returns NULL if the allocation fails. the result is released with
+ * free_nal_buffer()
+ */
+struct nal_buffer* create_nal_buffer(uint8_t max_size)
+{
+    struct nal_buffer *nal_buffer = calloc(1, sizeof(*nal_buffer));
+
+    /* do not write through a failed allocation */
+    if (nal_buffer == NULL)
+      return NULL;
+
+    nal_buffer->max_size = max_size;
+
+    return nal_buffer;
+}
+
+/**
+ * destroys a nal buffer. all referenced nals are released
+ *
+ * a NULL nal_buffer is accepted and ignored, like free(NULL)
+ */
+void free_nal_buffer(struct nal_buffer *nal_buffer)
+{
+  struct nal_unit *nal;
+
+  if (nal_buffer == NULL)
+    return;
+
+  nal = nal_buffer->first;
+
+  while (nal) {
+    /* fetch the successor first, release_nal_unit() may free the unit */
+    struct nal_unit *delete = nal;
+    nal = nal->next;
+    release_nal_unit(delete);
+  }
+
+  free(nal_buffer);
+}
+
+/**
+ * appends a nal unit to the end of the buffer
+ *
+ * if the buffer already holds max_size units, the oldest unit (first) is
+ * removed and released before the new one is linked in. the buffer takes
+ * its own lock on the appended unit.
+ */
+void nal_buffer_append(struct nal_buffer *nal_buffer, struct nal_unit *nal)
+{
+  /* only evict if there is something to evict: with max_size == 0 the
+   * buffer counts as full while first is still NULL */
+  if(nal_buffer->used == nal_buffer->max_size && nal_buffer->first != NULL) {
+    nal_buffer_remove(nal_buffer, nal_buffer->first);
+  }
+
+  if (nal_buffer->first == NULL) {
+    nal->prev = NULL;
+    nal_buffer->first = nal;
+  } else if (nal_buffer->last != NULL) {
+    nal->prev = nal_buffer->last;
+    nal_buffer->last->next = nal;
+  } else {
+    lprintf("ERR: nal_buffer is in a broken state\n");
+    return;
+  }
+
+  /* the new unit is the tail now, it must not carry a stale next pointer */
+  nal->next = NULL;
+  nal_buffer->last = nal;
+
+  lock_nal_unit(nal);
+  nal_buffer->used++;
+}
+
+/**
+ * unlinks a nal unit from the buffer and releases the lock the buffer
+ * holds on it
+ *
+ * nal is expected to be an element of nal_buffer. a NULL nal is ignored,
+ * callers pass nal_buffer->first which is NULL for an empty buffer.
+ */
+void nal_buffer_remove(struct nal_buffer *nal_buffer, struct nal_unit *nal)
+{
+  if (nal == NULL)
+    return;
+
+  if (nal == nal_buffer->first && nal == nal_buffer->last) {
+    /* only element */
+    nal_buffer->first = nal_buffer->last = NULL;
+  } else {
+    if (nal == nal_buffer->first) {
+      nal_buffer->first = nal->next;
+      if (nal_buffer->first != NULL)
+        nal_buffer->first->prev = NULL;
+    } else if (nal->prev != NULL) {
+      nal->prev->next = nal->next;
+    }
+
+    if (nal == nal_buffer->last) {
+      nal_buffer->last = nal->prev;
+      if (nal_buffer->last != NULL)
+        nal_buffer->last->next = NULL;
+    } else if (nal->next != NULL) {
+      nal->next->prev = nal->prev;
+    }
+  }
+
+  /* detach before releasing, the unit may live on with other holders */
+  nal->next = nal->prev = NULL;
+  release_nal_unit(nal);
+
+  nal_buffer->used--;
+}
+
+/**
+ * empties the buffer: the first unit is removed (and released) over and
+ * over until the used counter reaches zero
+ */
+void nal_buffer_flush(struct nal_buffer *nal_buffer)
+{
+  for (;;) {
+    if (nal_buffer->used == 0)
+      break;
+
+    nal_buffer_remove(nal_buffer, nal_buffer->first);
+  }
+}
+
+/**
+ * returns the unit at the tail of the buffer (the one appended last),
+ * or NULL if the buffer is empty
+ */
+struct nal_unit *nal_buffer_get_last(struct nal_buffer *nal_buffer)
+{
+  struct nal_unit *tail = nal_buffer->last;
+
+  return tail;
+}
+
+/**
+ * looks up a NAL_SPS unit by its seq_parameter_set_id
+ *
+ * the buffer is searched from the newest unit to the oldest one, so the
+ * most recently appended match is returned. returns NULL if no unit matches.
+ */
+struct nal_unit* nal_buffer_get_by_sps_id(struct nal_buffer *nal_buffer,
+    uint32_t seq_parameter_set_id)
+{
+  struct nal_unit *cur;
+
+  for (cur = nal_buffer->last; cur != NULL; cur = cur->prev) {
+    if (cur->nal_unit_type != NAL_SPS)
+      continue;
+
+    if (cur->sps.seq_parameter_set_id == seq_parameter_set_id)
+      return cur;
+  }
+
+  return NULL;
+}
+
+/**
+ * looks up a NAL_PPS unit by its pic_parameter_set_id
+ *
+ * the buffer is searched from the newest unit to the oldest one, so the
+ * most recently appended match is returned. returns NULL if no unit matches.
+ */
+struct nal_unit* nal_buffer_get_by_pps_id(struct nal_buffer *nal_buffer,
+    uint32_t pic_parameter_set_id)
+{
+  struct nal_unit *cur;
+
+  for (cur = nal_buffer->last; cur != NULL; cur = cur->prev) {
+    if (cur->nal_unit_type != NAL_PPS)
+      continue;
+
+    if (cur->pps.pic_parameter_set_id == pic_parameter_set_id)
+      return cur;
+  }
+
+  return NULL;
+}
+
+/**
+ * create a new, zeroed nal unit with a lock_counter of 1
+ *
+ * returns NULL if the allocation fails. the unit is given back with
+ * release_nal_unit()
+ */
+struct nal_unit* create_nal_unit(void)
+{
+  struct nal_unit *nal = calloc(1, sizeof(*nal));
+
+  /* do not write through a failed allocation */
+  if (nal == NULL)
+    return NULL;
+
+  nal->lock_counter = 1;
+
+  return nal;
+}
+
+/**
+ * registers one more holder of the nal unit; every lock has to be
+ * matched by a release_nal_unit()
+ */
+void lock_nal_unit(struct nal_unit *nal)
+{
+  nal->lock_counter += 1;
+}
+
+/**
+ * drops one lock from the nal unit and frees it once no lock is left
+ *
+ * a NULL nal is ignored
+ */
+void release_nal_unit(struct nal_unit *nal)
+{
+  if(!nal)
+    return;
+
+  /* lock_counter is unsigned: decrementing a counter that is already 0
+   * would wrap around and the unit would never be freed */
+  if(nal->lock_counter > 0)
+    nal->lock_counter--;
+
+  if(nal->lock_counter == 0) {
+    free(nal);
+  }
+}
+
+/**
+ * copies src into dest. the whole structure is copied, including the
+ * embedded sei, sps, pps and slc data. afterwards dest is detached from
+ * any buffer (prev/next are NULL) and carries a single lock.
+ */
+void copy_nal_unit(struct nal_unit *dest, struct nal_unit *src)
+{
+  xine_fast_memcpy(dest, src, sizeof(struct nal_unit));
+
+  /* list links and lock state of src do not carry over to the copy */
+  dest->lock_counter = 1;
+  dest->next = NULL;
+  dest->prev = NULL;
+}
diff --git a/src/video_dec/libvdpau/nal.h b/src/video_dec/libvdpau/nal.h
new file mode 100644
index 000000000..f40617cd0
--- /dev/null
+++ b/src/video_dec/libvdpau/nal.h
@@ -0,0 +1,501 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * nal.h: H264 NAL structures
+ */
+
+#ifndef NAL_H_
+#define NAL_H_
+#include <stdint.h>
+#include <vdpau/vdpau.h>
+
+/* nal unit types known to the parser; the numeric value of each constant
+ * is compared against nal_unit.nal_unit_type */
+enum nal_unit_types
+{
+  NAL_UNSPECIFIED     = 0,
+  NAL_SLICE           = 1,
+  NAL_PART_A          = 2,
+  NAL_PART_B          = 3,
+  NAL_PART_C          = 4,
+  NAL_SLICE_IDR       = 5,
+  NAL_SEI             = 6,
+  NAL_SPS             = 7,
+  NAL_PPS             = 8,
+  NAL_AU_DELIMITER    = 9,
+  NAL_END_OF_SEQUENCE = 10,
+  NAL_END_OF_STREAM   = 11,
+  NAL_FILLER_DATA     = 12,
+  NAL_SPS_EXT         = 13
+};
+
+/* values of the pic_struct field of the pic_timing sei message */
+enum pic_struct {
+  DISP_FRAME             = 0,
+  DISP_TOP               = 1,
+  DISP_BOTTOM            = 2,
+  DISP_TOP_BOTTOM        = 3,
+  DISP_BOTTOM_TOP        = 4,
+  DISP_TOP_BOTTOM_TOP    = 5,
+  DISP_BOTTOM_TOP_BOTTOM = 6,
+  DISP_FRAME_DOUBLING    = 7,
+  DISP_FRAME_TRIPLING    = 8
+};
+
+/* values of the ct_type field of the pic_timing sei message */
+enum ct_type {
+    CT_PROGRESSIVE = 0,
+    CT_INTERLACED  = 1,
+    CT_UNKNOWN     = 2,
+    CT_RESERVED    = 3
+};
+
+/* base slice types. the coded values 5-9 repeat the types 0-4, use the
+ * slice_type() helper before comparing against these constants
+ */
+enum slice_types
+{
+  SLICE_P  = 0,
+  SLICE_B  = 1,
+  SLICE_I  = 2,
+  SLICE_SP = 3,
+  SLICE_SI = 4
+};
+
+/* values of vui_parameters.aspect_ratio_idc; the constant names spell
+ * out the sample aspect ratio as width_height */
+enum aspect_ratio
+{
+  ASPECT_UNSPECIFIED  = 0,
+  ASPECT_1_1          = 1,
+  ASPECT_12_11        = 2,
+  ASPECT_10_11        = 3,
+  ASPECT_16_11        = 4,
+  ASPECT_40_33        = 5,
+  ASPECT_24_11        = 6,
+  ASPECT_20_11        = 7,
+  ASPECT_32_11        = 8,
+  ASPECT_80_33        = 9,
+  ASPECT_18_11        = 10,
+  ASPECT_15_11        = 11,
+  ASPECT_64_33        = 12,
+  ASPECT_160_99       = 13,
+  ASPECT_4_3          = 14,
+  ASPECT_3_2          = 15,
+  ASPECT_2_1          = 16,
+  ASPECT_RESERVED     = 17,
+  ASPECT_EXTENDED_SAR = 255
+};
+
+/* zigzag scan of a 4x4 block: entry i is the raster index (x + y*4) of
+ * the i-th position in scan order */
+static const uint8_t zigzag_4x4[16] = {
+   0,  1,  4,  8,  5,  2,  3,  6,
+   9, 12, 13, 10,  7, 11, 14, 15,
+};
+
+/* zigzag scan of an 8x8 block: entry i is the raster index (x + y*8) of
+ * the i-th position in scan order */
+static const uint8_t zigzag_8x8[64] = {
+   0,  1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+/* folds the repeated slice type values 5-9 onto 0-4 so that the result
+ * can be compared against enum slice_types; values of 10 and above are
+ * handed back unchanged */
+static inline uint32_t slice_type(uint32_t slice_type)
+{
+  if (slice_type >= 10)
+    return slice_type;
+
+  return slice_type % 5;
+}
+
+#if 0
+/* debug helper (compiled out): prints the name of a slice type,
+ * one of enum slice_types, to stdout */
+static inline void print_slice_type(uint32_t slice_type)
+{
+  if (slice_type == SLICE_P)
+    printf("SLICE_P\n");
+  else if (slice_type == SLICE_B)
+    printf("SLICE_B\n");
+  else if (slice_type == SLICE_I)
+    printf("SLICE_I\n");
+  else if (slice_type == SLICE_SP)
+    printf("SLICE_SP\n");
+  else if (slice_type == SLICE_SI)
+    printf("SLICE_SI\n");
+  else
+    printf("Unknown SLICE\n");
+}
+#endif
+
+/*
+ * HRD parameters. Embedded twice in the vui_parameters of
+ * struct seq_parameter_set_rbsp (nal_hrd_parameters, vc1_hrd_parameters).
+ */
+struct hrd_parameters
+{
+  uint32_t cpb_cnt_minus1;
+  uint8_t bit_rate_scale;
+  uint8_t cpb_size_scale;
+
+  /* arrays with room for 32 entries each
+   * NOTE(review): presumably indexed 0..cpb_cnt_minus1, in which case the
+   * parser has to keep cpb_cnt_minus1 below 32 - confirm */
+  uint32_t bit_rate_value_minus1[32];
+  uint32_t cpb_size_value_minus1[32];
+  uint8_t cbr_flag[32];
+
+  /* NOTE(review): by their names these give the bit lengths of other
+   * syntax elements (e.g. the delays in the pic_timing sei) - confirm */
+  uint8_t initial_cpb_removal_delay_length_minus1;
+  uint8_t cpb_removal_delay_length_minus1;
+  uint8_t dpb_output_delay_length_minus1;
+  uint8_t time_offset_length;
+};
+
+/*
+ * Fields of a sequence parameter set (SPS), including its VUI parameters.
+ * Comments of the form "if(...)" name the condition under which the
+ * following fields are meaningful.
+ */
+struct seq_parameter_set_rbsp
+{
+  uint8_t profile_idc; // 0xff
+  uint8_t constraint_setN_flag; // 0x0f
+  uint8_t level_idc; // 0xff
+  uint32_t seq_parameter_set_id;
+  uint32_t chroma_format_idc;
+  uint8_t separate_colour_plane_flag; // 0x01
+  uint32_t bit_depth_luma_minus8;
+  uint32_t bit_depth_chroma_minus8;
+  uint8_t qpprime_y_zero_transform_bypass_flag;
+  uint8_t seq_scaling_matrix_present_flag;
+
+  /* if(seq_scaling_matrix_present_flag) */
+  uint8_t seq_scaling_list_present_flag[8];
+
+  /* six 4x4 lists and two 8x8 lists, same shape as in
+   * struct pic_parameter_set_rbsp */
+  uint8_t scaling_lists_4x4[6][16];
+  uint8_t scaling_lists_8x8[2][64];
+  /* endif */
+
+  uint32_t log2_max_frame_num_minus4;
+  uint32_t max_frame_num;
+  uint32_t pic_order_cnt_type;
+  // if pic_order_cnt_type==0
+  uint32_t log2_max_pic_order_cnt_lsb_minus4;
+  // else
+  uint8_t delta_pic_order_always_zero_flag;
+  int32_t offset_for_non_ref_pic;
+  int32_t offset_for_top_to_bottom_field;
+  /* the counter is 8 bit wide, so the 256 entries below cover every
+   * value it can take */
+  uint8_t num_ref_frames_in_pic_order_cnt_cycle;
+  int32_t offset_for_ref_frame[256];
+  // TODO: some more ignored here
+  uint32_t num_ref_frames;
+  uint8_t gaps_in_frame_num_value_allowed_flag;
+  /*uint32_t    pic_width_in_mbs_minus1;
+   uint32_t    pic_height_in_map_units_minus1;*/
+  /* stored instead of the two commented-out syntax elements above;
+   * vdpau_decoder_init() uses them as frame width and height */
+  uint32_t pic_width;
+  uint32_t pic_height;
+  uint8_t frame_mbs_only_flag;
+  uint8_t mb_adaptive_frame_field_flag;
+  uint8_t direct_8x8_inference_flag;
+  uint8_t frame_cropping_flag;
+  uint32_t frame_crop_left_offset;
+  uint32_t frame_crop_right_offset;
+  uint32_t frame_crop_top_offset;
+  uint32_t frame_crop_bottom_offset;
+  uint8_t vui_parameters_present_flag;
+
+  /* vui_parameters */
+  struct
+  {
+    /* the spelling "ration" (sic) is part of the field name and is used
+     * by set_ratio() in vdpau_h264.c */
+    uint8_t aspect_ration_info_present_flag;
+
+    /* aspect_ration_info_present_flag == 1 */
+    /* aspect_ratio_idc takes the values of enum aspect_ratio; sar_width
+     * and sar_height are used for ASPECT_EXTENDED_SAR */
+    uint8_t aspect_ratio_idc;
+    uint16_t sar_width;
+    uint16_t sar_height;
+
+    uint8_t overscan_info_present_flag;
+    /* overscan_info_present_flag == 1 */
+    uint8_t overscan_appropriate_flag;
+
+    uint8_t video_signal_type_present_flag;
+    /* video_signal_type_present_flag == 1 */
+    uint8_t video_format;
+    uint8_t video_full_range_flag;
+    uint8_t colour_description_present;
+    /* colour_description_present == 1 */
+    uint8_t colour_primaries;
+    uint8_t transfer_characteristics;
+    uint8_t matrix_coefficients;
+
+    uint8_t chroma_loc_info_present_flag;
+    /* chroma_loc_info_present_flag == 1 */
+    uint8_t chroma_sample_loc_type_top_field;
+    uint8_t chroma_sample_loc_type_bottom_field;
+
+    uint8_t timing_info_present_flag;
+    /* timing_info_present_flag == 1 */
+    uint32_t num_units_in_tick;
+    uint32_t time_scale;
+    uint8_t fixed_frame_rate_flag;
+
+    uint8_t nal_hrd_parameters_present_flag;
+    struct hrd_parameters nal_hrd_parameters;
+
+    uint8_t vc1_hrd_parameters_present_flag;
+    struct hrd_parameters vc1_hrd_parameters;
+
+    uint8_t low_delay_hrd_flag;
+
+    uint8_t pic_struct_present_flag;
+    uint8_t bitstream_restriction_flag;
+
+    /* bitstream_restriction_flag == 1 */
+    uint8_t motion_vectors_over_pic_boundaries;
+    uint32_t max_bytes_per_pic_denom;
+    uint32_t max_bits_per_mb_denom;
+    uint32_t log2_max_mv_length_horizontal;
+    uint32_t log2_max_mv_length_vertical;
+    uint32_t num_reorder_frames;
+    uint32_t max_dec_frame_buffering;
+  } vui_parameters;
+
+};
+
+/*
+ * Fields of a picture parameter set (PPS). Comments name the condition
+ * under which the following fields are meaningful.
+ */
+struct pic_parameter_set_rbsp
+{
+  uint32_t pic_parameter_set_id;
+  /* id of the SPS this PPS refers to */
+  uint32_t seq_parameter_set_id;
+  uint8_t entropy_coding_mode_flag;
+  uint8_t pic_order_present_flag;
+
+  uint32_t num_slice_groups_minus1;
+
+  /* num_slice_groups_minus1 > 0 */
+  uint32_t slice_group_map_type;
+
+  /* NOTE(review): the slice group arrays below have room for 64 entries
+   * each; presumably the parser bounds its indices accordingly - confirm */
+
+  /* slice_group_map_type == 1 */
+  uint32_t run_length_minus1[64];
+
+  /* slice_group_map_type == 2 */
+  uint32_t top_left[64];
+  uint32_t bottom_right[64];
+
+  /* slice_group_map_type == 3,4,5 */
+  uint8_t slice_group_change_direction_flag;
+  uint32_t slice_group_change_rate_minus1;
+
+  /* slice_group_map_type == 6 */
+  uint32_t pic_size_in_map_units_minus1;
+  uint8_t slice_group_id[64];
+
+  uint32_t num_ref_idx_l0_active_minus1;
+  uint32_t num_ref_idx_l1_active_minus1;
+  uint8_t weighted_pred_flag;
+  uint8_t weighted_bipred_idc;
+  int32_t pic_init_qp_minus26;
+  int32_t pic_init_qs_minus26;
+  int32_t chroma_qp_index_offset;
+  uint8_t deblocking_filter_control_present_flag;
+  uint8_t constrained_intra_pred_flag;
+  uint8_t redundant_pic_cnt_present_flag;
+
+  /* if(more_rbsp_data) */
+  uint8_t transform_8x8_mode_flag;
+  uint8_t pic_scaling_matrix_present_flag;
+
+  /* if(pic_scaling_matrix_present_flag) */
+  uint8_t pic_scaling_list_present_flag[8];
+
+  /* copied verbatim into VdpPictureInfoH264 by
+   * fill_vdpau_pictureinfo_h264() in vdpau_h264.c */
+  uint8_t scaling_lists_4x4[6][16];
+  uint8_t scaling_lists_8x8[2][64];
+
+  int32_t second_chroma_qp_index_offset;
+};
+
+/*struct clock_timestamp {
+  uint8_t ct_type;
+  uint8_t nuit_fiel_based_flag;
+  uint8_t counting_type;
+  uint8_t full_timestamp_flag;
+  uint8_t discontinuity_flag;
+  uint8_t cnt_dropped_flag;
+  uint8_t n_frames
+};*/
+
+/* sei contains several additional info, we do
+ * only care for pic_timing, to handle display
+ * reordering
+ */
+struct sei_message
+{
+  uint32_t payload_type;
+  uint8_t last_payload_type_byte;
+  uint32_t payload_size;
+  uint8_t last_payload_size_byte;
+
+  struct
+  {
+    /* cpb_dpb_delays_present_flag == 1 */
+    /* NOTE(review): the bit lengths of these two delays come from
+     * struct hrd_parameters (*_length_minus1); 8 bits may be too narrow
+     * to hold them - confirm against the parser */
+    uint8_t cpb_removal_delay;
+    uint8_t dpb_output_delay;
+
+    /* one of enum pic_struct */
+    uint8_t pic_struct;
+    /* one of enum ct_type (0..3): needs two bits, a single bit would
+     * turn CT_UNKNOWN and CT_RESERVED into CT_PROGRESSIVE/CT_INTERLACED */
+    uint8_t ct_type : 2;
+    uint8_t nuit_field_based_flag : 1;
+    uint8_t counting_type : 5;
+    uint8_t full_timestamp_flag : 1;
+    uint8_t discontinuity_flag : 1;
+    uint8_t cnt_dropped_flag : 1;
+    uint8_t n_frames;
+
+    uint8_t seconds_value : 6;
+    uint8_t minutes_value : 6;
+    uint8_t hours_value : 5;
+
+    int32_t time_offset;
+  } pic_timing;
+};
+
+/*
+ * Fields of a slice header. Comments name the condition under which the
+ * following fields are meaningful.
+ */
+struct slice_header
+{
+  uint32_t first_mb_in_slice;
+  /* raw coded value; pass it through the slice_type() helper before
+   * comparing against enum slice_types */
+  uint32_t slice_type;
+  uint32_t pic_parameter_set_id;
+  uint8_t colour_plane_id;
+  uint32_t frame_num;
+  uint8_t field_pic_flag;
+  uint8_t bottom_field_flag;
+  uint32_t idr_pic_id;
+
+  /* sps->pic_order_cnt_type == 0 */
+  uint32_t pic_order_cnt_lsb;
+  int32_t delta_pic_order_cnt_bottom;
+  /* sps->pic_order_cnt_type == 1 && !sps->delta_pic_order_always_zero_flag */
+  int32_t delta_pic_order_cnt[2];
+
+  /* pps->redundant_pic_cnt_present_flag == 1 */
+  int32_t redundant_pic_cnt;
+
+  /* slice_type == B */
+  uint8_t direct_spatial_mv_pred_flag;
+
+  /* slice_type == P, SP, B */
+  uint8_t num_ref_idx_active_override_flag;
+  /* num_ref_idx_active_override_flag == 1 */
+  uint32_t num_ref_idx_l0_active_minus1;
+  /* slice type == B */
+  uint32_t num_ref_idx_l1_active_minus1;
+
+  /* ref_pic_list_reordering */
+  struct
+  {
+    /* slice_type != I && slice_type != SI */
+    uint8_t ref_pic_list_reordering_flag_l0;
+
+    /* slice_type == B */
+    uint8_t ref_pic_list_reordering_flag_l1;
+
+    /* ref_pic_list_reordering_flag_l0 == 1 */
+    uint32_t reordering_of_pic_nums_idc;
+
+    /* reordering_of_pic_nums_idc == 0, 1 */
+    uint32_t abs_diff_pic_num_minus1;
+
+    /* reordering_of_pic_nums_idc == 2 */
+    uint32_t long_term_pic_num;
+  } ref_pic_list_reordering;
+
+  /* pred_weight_table */
+  struct
+  {
+    uint32_t luma_log2_weight_denom;
+
+    /* chroma_format_idc != 0 */
+    uint32_t chroma_log2_weight_denom;
+
+    /* 32 entries per reference list; the chroma tables hold two values
+     * per entry */
+    int32_t luma_weight_l0[32];
+    int32_t luma_offset_l0[32];
+
+    int32_t chroma_weight_l0[32][2];
+    int32_t chroma_offset_l0[32][2];
+
+    int32_t luma_weight_l1[32];
+    int32_t luma_offset_l1[32];
+
+    int32_t chroma_weight_l1[32][2];
+    int32_t chroma_offset_l1[32][2];
+  } pred_weight_table;
+
+  /* dec_ref_pic_marking */
+  struct
+  {
+
+    /* nal_unit_type == NAL_SLICE_IDR */
+    uint8_t no_output_of_prior_pics_flag;
+    uint8_t long_term_reference_flag;
+
+    /* else */
+    uint8_t adaptive_ref_pic_marking_mode_flag;
+    uint32_t memory_management_control_operation;
+
+    uint32_t difference_of_pic_nums_minus1;
+    uint32_t long_term_pic_num;
+    uint32_t long_term_frame_idx;
+    uint32_t max_long_term_frame_idx_plus1;
+  } dec_ref_pic_marking[10];
+  /* NOTE(review): presumably the number of valid entries in
+   * dec_ref_pic_marking[]; the parser then has to cap it at 10 - confirm */
+  uint32_t dec_ref_pic_marking_count;
+};
+
+/*
+ * One parsed nal unit. The union around the payload structures is
+ * commented out, so every nal_unit carries a complete sei, sps, pps and
+ * slc structure side by side, whatever nal_unit_type says.
+ */
+struct nal_unit {
+    uint8_t nal_ref_idc; // 0x03
+    enum nal_unit_types nal_unit_type; // 0x1f
+
+    //union {
+      struct sei_message sei;
+      struct seq_parameter_set_rbsp sps;
+      struct pic_parameter_set_rbsp pps;
+      struct slice_header slc;
+    //};
+
+    /* list links, maintained by the nal_buffer_* functions in nal.c */
+    struct nal_unit *prev;
+    struct nal_unit *next;
+
+    /* number of holders: set to 1 by create_nal_unit(), raised by
+     * lock_nal_unit(), lowered by release_nal_unit() which frees the unit */
+    uint32_t lock_counter;
+};
+
+/*
+ * Bounded, doubly linked list of nal units. first is the oldest unit,
+ * last the most recently appended one; nal_buffer_append() drops first
+ * when used has reached max_size.
+ */
+struct nal_buffer {
+    struct nal_unit *first;
+    struct nal_unit *last;
+
+    /* both counters are 8 bit wide, a buffer holds at most 255 units */
+    uint8_t max_size;
+    uint8_t used;
+};
+
+/* nal_buffer life cycle and modification (see nal.c). append takes a lock
+ * on the unit, remove and flush release it again */
+struct nal_buffer* create_nal_buffer(uint8_t max_size);
+void free_nal_buffer(struct nal_buffer *nal_buffer);
+void nal_buffer_append(struct nal_buffer *nal_buffer, struct nal_unit *nal);
+void nal_buffer_remove(struct nal_buffer *nal_buffer, struct nal_unit *nal);
+void nal_buffer_flush(struct nal_buffer *nal_buffer);
+
+/* lookups; the by-id variants search from the newest unit to the oldest
+ * and return NULL if nothing matches */
+struct nal_unit* nal_buffer_get_by_sps_id(struct nal_buffer *nal_buffer,
+    uint32_t seq_parameter_set_id);
+struct nal_unit* nal_buffer_get_by_pps_id(struct nal_buffer *nal_buffer,
+    uint32_t pic_parameter_set_id);
+struct nal_unit* nal_buffer_get_last(struct nal_buffer *nal_buffer);
+
+/* nal units are reference counted through lock_counter: create hands out
+ * a unit with one lock, release frees it when the last lock is dropped */
+struct nal_unit* create_nal_unit(void);
+void lock_nal_unit(struct nal_unit *nal);
+void release_nal_unit(struct nal_unit *nal);
+void copy_nal_unit(struct nal_unit *dest, struct nal_unit *src);
+
+#endif /* NAL_H_ */
diff --git a/src/video_dec/libvdpau/vdpau_h264.c b/src/video_dec/libvdpau/vdpau_h264.c
new file mode 100644
index 000000000..25ed62295
--- /dev/null
+++ b/src/video_dec/libvdpau/vdpau_h264.c
@@ -0,0 +1,1014 @@
+/*
+ * Copyright (C) 2008 Julian Scheel
+ *
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; remove-trailing-space on;
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * vdpau_h264.c: H264 Video Decoder utilizing nvidia VDPAU engine
+ */
+
+#define LOG_MODULE "vdpau_h264"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vdpau/vdpau.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+#include "accel_vdpau.h"
+#include "h264_parser.h"
+#include "dpb.h"
+#include "cpb.h"
+
+//#define DEBUG_H264
+
+/* parenthesized so that the macro expands as one value inside larger
+ * expressions (e.g. x / VIDEOBUFSIZE) */
+#define VIDEOBUFSIZE (128*1024)
+
+/* plugin class; it only wraps the generic xine decoder class */
+typedef struct {
+  video_decoder_class_t   decoder_class;
+} vdpau_h264_class_t;
+
+/*
+ * Per-stream state of the VDPAU H.264 decoder plugin. video_decoder has
+ * to stay the first member: the plugin functions receive a
+ * video_decoder_t pointer and cast it to this type.
+ */
+typedef struct vdpau_h264_decoder_s {
+  video_decoder_t   video_decoder;  /* parent video decoder structure */
+
+  vdpau_h264_class_t *class;
+  xine_stream_t    *stream;
+
+  /* these are traditional variables in a video decoder object */
+  uint64_t          video_step;  /* frame duration in pts units */
+  uint64_t          reported_video_step;  /* video_step as last reported via XINE_STREAM_INFO_FRAME_DURATION */
+
+  int               width;       /* the width of a video frame */
+  int               height;      /* the height of a video frame */
+  double            ratio;       /* the width to height ratio */
+
+
+  struct h264_parser *nal_parser;  /* h264 nal parser. extracts stream data for vdpau */
+
+  struct decoded_picture *incomplete_pic;
+  uint32_t          last_top_field_order_cnt;
+
+  int               have_frame_boundary_marks;
+  int               wait_for_frame_start;
+
+  VdpDecoder        decoder;
+  int               decoder_started;
+  int               progressive_cnt; /* count of progressive marked frames in line */
+
+  VdpColorStandard  color_standard;
+  VdpDecoderProfile profile;
+  vdpau_accel_t     *vdpau_accel;
+
+  xine_t            *xine;
+
+  /* picture whose sps/pps/slice data feeds set_ratio(),
+   * fill_vdpau_pictureinfo_h264() and vdpau_decoder_init() */
+  struct coded_picture *completed_pic;
+  vo_frame_t        *dangling_img;
+
+  uint8_t           *codec_private;
+  uint32_t          codec_private_len;
+
+  int               vdp_runtime_nr;
+
+  int               reset;
+
+} vdpau_h264_decoder_t;
+
+/* forward declarations, both functions are defined further down in this file */
+static void vdpau_h264_reset (video_decoder_t *this_gen);
+static void vdpau_h264_flush (video_decoder_t *this_gen);
+
+/**************************************************************************
+ * vdpau_h264 specific decode functions
+ *************************************************************************/
+
+/**************************************************************************
+ * xine video plugin functions
+ *************************************************************************/
+
+#ifdef DEBUG_H264
+/**
+ * Debug helper, only compiled with DEBUG_H264: prints the scalar fields
+ * of a VdpPictureInfoH264 to stdout, followed by its scaling lists and
+ * every reference frame slot that holds a valid surface.
+ */
+static inline void dump_pictureinfo_h264(VdpPictureInfoH264 *pic)
+{
+  int i, j;
+
+  printf("C: slice_count: %d\n", pic->slice_count);
+  printf("C: field_order_cnt[0]: %d\n", pic->field_order_cnt[0]);
+  printf("C: field_order_cnt[1]: %d\n", pic->field_order_cnt[1]);
+  printf("C: is_reference: %d\n", pic->is_reference);
+  printf("C: frame_num: %d\n", pic->frame_num);
+  printf("C: field_pic_flag: %d\n", pic->field_pic_flag);
+  printf("C: bottom_field_flag: %d\n", pic->bottom_field_flag);
+  printf("C: num_ref_frames: %d\n", pic->num_ref_frames);
+  printf("C: mb_adaptive_frame_field_flag: %d\n", pic->mb_adaptive_frame_field_flag);
+  printf("C: constrained_intra_pred_flag: %d\n", pic->constrained_intra_pred_flag);
+  printf("C: weighted_pred_flag: %d\n", pic->weighted_pred_flag);
+  printf("C: weighted_bipred_idc: %d\n", pic->weighted_bipred_idc);
+  printf("C: frame_mbs_only_flag: %d\n", pic->frame_mbs_only_flag);
+  printf("C: transform_8x8_mode_flag: %d\n", pic->transform_8x8_mode_flag);
+  printf("C: chroma_qp_index_offset: %d\n", pic->chroma_qp_index_offset);
+  printf("C: second_chroma_qp_index_offset: %d\n", pic->second_chroma_qp_index_offset);
+  printf("C: pic_init_qp_minus26: %d\n", pic->pic_init_qp_minus26);
+  printf("C: num_ref_idx_l0_active_minus1: %d\n", pic->num_ref_idx_l0_active_minus1);
+  printf("C: num_ref_idx_l1_active_minus1: %d\n", pic->num_ref_idx_l1_active_minus1);
+  printf("C: log2_max_frame_num_minus4: %d\n", pic->log2_max_frame_num_minus4);
+  printf("C: pic_order_cnt_type: %d\n", pic->pic_order_cnt_type);
+  printf("C: log2_max_pic_order_cnt_lsb_minus4: %d\n", pic->log2_max_pic_order_cnt_lsb_minus4);
+  printf("C: delta_pic_order_always_zero_flag: %d\n", pic->delta_pic_order_always_zero_flag);
+  printf("C: direct_8x8_inference_flag: %d\n", pic->direct_8x8_inference_flag);
+  printf("C: entropy_coding_mode_flag: %d\n", pic->entropy_coding_mode_flag);
+  printf("C: pic_order_present_flag: %d\n", pic->pic_order_present_flag);
+  printf("C: deblocking_filter_control_present_flag: %d\n", pic->deblocking_filter_control_present_flag);
+  printf("C: redundant_pic_cnt_present_flag: %d\n", pic->redundant_pic_cnt_present_flag);
+
+  for(i = 0; i < 6; i++) {
+    printf("C: scalint_list4x4[%d]:\nC:", i);
+    for(j = 0; j < 16; j++) {
+      printf(" [%d]", pic->scaling_lists_4x4[i][j]);
+      if(j%8 == 0)
+        printf("\nC:");
+    }
+    printf("C: \n");
+  }
+  for(i = 0; i < 2; i++) {
+    printf("C: scalint_list8x8[%d]:\nC:", i);
+    for(j = 0; j < 64; j++) {
+      printf(" [%d] ", pic->scaling_lists_8x8[i][j]);
+      if(j%8 == 0)
+        printf("\nC:");
+    }
+    printf("C: \n");
+  }
+
+  /* slots without a surface are unused and skipped */
+  for(i = 0; i < 16; i++) {
+    if(pic->referenceFrames[i].surface != VDP_INVALID_HANDLE) {
+      printf("C: -------------------\n");
+      printf("C: Reference Frame %d:\n", i);
+      printf("C: frame_idx: %d\n", pic->referenceFrames[i].frame_idx);
+      printf("C: field_order_cnt[0]: %d\n", pic->referenceFrames[i].field_order_cnt[0]);
+      /* print the bottom field value under the [1] label, not [0] again */
+      printf("C: field_order_cnt[1]: %d\n", pic->referenceFrames[i].field_order_cnt[1]);
+      printf("C: is_long_term: %d\n", pic->referenceFrames[i].is_long_term);
+      printf("C: top_is_reference: %d\n", pic->referenceFrames[i].top_is_reference);
+      printf("C: bottom_is_reference: %d\n", pic->referenceFrames[i].bottom_is_reference);
+    }
+  }
+  printf("C: ---------------------------------------------------------------\n");
+}
+#endif
+
+/**
+ * Computes this->ratio, the display aspect ratio of the stream.
+ *
+ * The ratio starts as width / height of the frame and is multiplied by
+ * the sample aspect ratio signalled in the VUI parameters of the completed
+ * picture's SPS, if there is one. aspect_ratio_idc values without a known
+ * ratio (unspecified, reserved) leave the frame ratio untouched, and so
+ * does an extended SAR with a zero width or height, which would otherwise
+ * yield a ratio of 0 or a division by zero.
+ *
+ * Expects this->completed_pic with a valid sps_nal and a non-zero
+ * this->height.
+ */
+static void set_ratio(video_decoder_t *this_gen)
+{
+  /* sample aspect ratios indexed by aspect_ratio_idc */
+  static const double sar_table[] = {
+    [ASPECT_1_1]    = 1.0,
+    [ASPECT_12_11]  = 12.0/11.0,
+    [ASPECT_10_11]  = 10.0/11.0,
+    [ASPECT_16_11]  = 16.0/11.0,
+    [ASPECT_40_33]  = 40.0/33.0,
+    [ASPECT_24_11]  = 24.0/11.0,
+    [ASPECT_20_11]  = 20.0/11.0,
+    [ASPECT_32_11]  = 32.0/11.0,
+    [ASPECT_80_33]  = 80.0/33.0,
+    [ASPECT_18_11]  = 18.0/11.0,
+    [ASPECT_15_11]  = 15.0/11.0,
+    [ASPECT_64_33]  = 64.0/33.0,
+    [ASPECT_160_99] = 160.0/99.0,
+    [ASPECT_4_3]    = 4.0/3.0,
+    [ASPECT_3_2]    = 3.0/2.0,
+    [ASPECT_2_1]    = 2.0/1.0,
+  };
+
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *)this_gen;
+  const struct seq_parameter_set_rbsp *sps = &this->completed_pic->sps_nal->sps;
+  uint8_t idc;
+
+  this->ratio = (double)this->width / (double)this->height;
+
+  if(!sps->vui_parameters.aspect_ration_info_present_flag)
+    return;
+
+  idc = sps->vui_parameters.aspect_ratio_idc;
+
+  if(idc == ASPECT_EXTENDED_SAR) {
+    /* the stream carries its own ratio; ignore it if it is degenerate */
+    if(sps->vui_parameters.sar_width != 0 && sps->vui_parameters.sar_height != 0) {
+      this->ratio *=
+        (double)sps->vui_parameters.sar_width/
+        (double)sps->vui_parameters.sar_height;
+    }
+  } else if(idc >= ASPECT_1_1 && idc <= ASPECT_2_1) {
+    this->ratio *= sar_table[idc];
+  }
+}
+
+/**
+ * Fills a VdpPictureInfoH264 for the completed picture of the decoder.
+ *
+ * The values are taken from the slice header, SPS and PPS nal units
+ * attached to this->completed_pic; slice_count is passed through as is.
+ * The reference frame list is filled from the parser's dpb.
+ */
+static void fill_vdpau_pictureinfo_h264(video_decoder_t *this_gen, uint32_t slice_count, VdpPictureInfoH264 *pic)
+{
+  vdpau_h264_decoder_t *dec = (vdpau_h264_decoder_t *)this_gen;
+  struct coded_picture *cpic = dec->completed_pic;
+
+  struct seq_parameter_set_rbsp *sps = &cpic->sps_nal->sps;
+  struct pic_parameter_set_rbsp *pps = &cpic->pps_nal->pps;
+  struct slice_header *slc = &cpic->slc_nal->slc;
+
+  /* picture level values */
+  pic->slice_count = slice_count;
+  pic->field_order_cnt[0] = cpic->top_field_order_cnt;
+  pic->field_order_cnt[1] = cpic->bottom_field_order_cnt;
+  if (cpic->flag_mask & REFERENCE)
+    pic->is_reference = VDP_TRUE;
+  else
+    pic->is_reference = VDP_FALSE;
+
+  /* slice header */
+  pic->frame_num = slc->frame_num;
+  pic->field_pic_flag = slc->field_pic_flag;
+  pic->bottom_field_flag = slc->bottom_field_flag;
+
+  /* sequence parameter set */
+  pic->num_ref_frames = sps->num_ref_frames;
+  pic->mb_adaptive_frame_field_flag = sps->mb_adaptive_frame_field_flag && !slc->field_pic_flag;
+  pic->frame_mbs_only_flag = sps->frame_mbs_only_flag;
+  pic->log2_max_frame_num_minus4 = sps->log2_max_frame_num_minus4;
+  pic->pic_order_cnt_type = sps->pic_order_cnt_type;
+  pic->log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_pic_order_cnt_lsb_minus4;
+  pic->delta_pic_order_always_zero_flag = sps->delta_pic_order_always_zero_flag;
+  pic->direct_8x8_inference_flag = sps->direct_8x8_inference_flag;
+
+  /* picture parameter set */
+  pic->constrained_intra_pred_flag = pps->constrained_intra_pred_flag;
+  pic->weighted_pred_flag = pps->weighted_pred_flag;
+  pic->weighted_bipred_idc = pps->weighted_bipred_idc;
+  pic->transform_8x8_mode_flag = pps->transform_8x8_mode_flag;
+  pic->chroma_qp_index_offset = pps->chroma_qp_index_offset;
+  pic->second_chroma_qp_index_offset = pps->second_chroma_qp_index_offset;
+  pic->pic_init_qp_minus26 = pps->pic_init_qp_minus26;
+  pic->num_ref_idx_l0_active_minus1 = pps->num_ref_idx_l0_active_minus1;
+  pic->num_ref_idx_l1_active_minus1 = pps->num_ref_idx_l1_active_minus1;
+  pic->entropy_coding_mode_flag = pps->entropy_coding_mode_flag;
+  pic->pic_order_present_flag = pps->pic_order_present_flag;
+  pic->deblocking_filter_control_present_flag = pps->deblocking_filter_control_present_flag;
+  pic->redundant_pic_cnt_present_flag = pps->redundant_pic_cnt_present_flag;
+
+  /* the scaling lists come from the PPS, sized by the destination arrays */
+  memcpy(pic->scaling_lists_4x4, pps->scaling_lists_4x4, sizeof(pic->scaling_lists_4x4));
+  memcpy(pic->scaling_lists_8x8, pps->scaling_lists_8x8, sizeof(pic->scaling_lists_8x8));
+
+  /* set num_ref_frames to the number of actually available reference frames,
+   * if this is not set generation 3 decoders will fail.
+   * (the assignment itself is disabled, only the list is filled) */
+  /*pic->num_ref_frames =*/
+  fill_vdpau_reference_list(dec->nal_parser->dpb, pic->referenceFrames);
+
+}
+
+/**
+ * Decides whether the stream should be treated as progressive.
+ *
+ * Both coded pictures of dpic are inspected. A pic_struct from the
+ * pic_timing sei is used first when present, then the slice header and
+ * SPS flags. The first picture found to be interlaced ends the
+ * inspection. this->progressive_cnt counts consecutive progressive
+ * pictures and is reset by an interlaced one.
+ *
+ * Returns non-zero once at least 5 pictures in a row were progressive.
+ */
+static int check_progressive(video_decoder_t *this_gen, struct decoded_picture *dpic)
+{
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *)this_gen;
+  int is_progressive = 0;
+  int idx;
+
+  for(idx = 0; idx < 2; idx++) {
+    struct coded_picture *pic = dpic->coded_pic[idx];
+
+    if (pic == NULL)
+      continue;
+
+    if ((pic->flag_mask & PIC_STRUCT_PRESENT) && pic->sei_nal != NULL) {
+      uint8_t pic_struct = pic->sei_nal->sei.pic_timing.pic_struct;
+
+      if (pic_struct == DISP_FRAME) {
+        is_progressive = 1;
+        continue;
+      }
+
+      if (pic_struct == DISP_TOP_BOTTOM || pic_struct == DISP_BOTTOM_TOP) {
+        is_progressive = 0;
+        break;
+      }
+
+      /* any other pic_struct falls through to the slice based checks */
+
+      /* FIXME: seems unreliable, maybe it's has to be interpreted more complex */
+      /*if (pic->sei_nal->sei.pic_timing.ct_type == CT_INTERLACED) {
+        return 0;
+      } else if (pic->sei_nal->sei.pic_timing.ct_type == CT_PROGRESSIVE) {
+        return 1;
+      } */
+    }
+
+    if (pic->slc_nal->slc.field_pic_flag && pic->pps_nal->pps.pic_order_present_flag) {
+      int32_t delta = pic->slc_nal->slc.delta_pic_order_cnt_bottom;
+
+      if (delta == 1 || delta == -1) {
+        is_progressive = 0;
+        break;
+      }
+
+      is_progressive = 1;
+      continue;
+    }
+
+    if (!pic->slc_nal->slc.field_pic_flag && pic->sps_nal->sps.frame_mbs_only_flag)
+      is_progressive = 1;
+  }
+
+  if (is_progressive)
+    this->progressive_cnt++;
+  else
+    this->progressive_cnt = 0;
+
+  /* only switch to progressive mode if at least 5
+   * frames in order were marked as progressive */
+  return (this->progressive_cnt >= 5);
+}
+
+/*
+ * Publishes the stream format and creates the VDPAU decoder object.
+ *
+ * Takes the picture size from the SPS of this->completed_pic unless a width
+ * is already known, updates the stream info / meta info, sends a
+ * XINE_EVENT_FRAME_FORMAT_CHANGE event, selects the VDPAU profile and
+ * fetches the vdpau accel structure through a temporary output frame.
+ *
+ * Returns 0 when VdpDecoderCreate fails (this->decoder is then reset to
+ * VDP_INVALID_HANDLE so that callers ignoring the result never render with
+ * an unspecified handle) and 1 otherwise. Note that 1 is also returned when
+ * vdp_runtime_nr is not positive, in which case no decoder is created.
+ */
+static int vdpau_decoder_init(video_decoder_t *this_gen)
+{
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *)this_gen;
+  vo_frame_t *img;
+
+  if(this->width == 0) {
+    this->width = this->completed_pic->sps_nal->sps.pic_width;
+    this->height = this->completed_pic->sps_nal->sps.pic_height;
+  }
+
+  set_ratio(this_gen);
+
+  _x_stream_info_set( this->stream, XINE_STREAM_INFO_VIDEO_WIDTH, this->width );
+  _x_stream_info_set( this->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, this->height );
+  _x_stream_info_set( this->stream, XINE_STREAM_INFO_VIDEO_RATIO, ((double)10000*this->ratio) );
+  _x_stream_info_set( this->stream, XINE_STREAM_INFO_FRAME_DURATION, (this->reported_video_step = this->video_step) );
+  _x_meta_info_set_utf8( this->stream, XINE_META_INFO_VIDEOCODEC, "H264/AVC (vdpau)" );
+
+  /* clear both structures first so that members which are not filled in
+   * below never reach the event listeners uninitialised */
+  xine_event_t event;
+  xine_format_change_data_t data;
+  memset(&event, 0, sizeof(event));
+  memset(&data, 0, sizeof(data));
+  event.type = XINE_EVENT_FRAME_FORMAT_CHANGE;
+  event.stream = this->stream;
+  event.data = &data;
+  event.data_length = sizeof(data);
+  data.width = this->width;
+  data.height = this->height;
+  data.aspect = this->ratio;
+  xine_event_send( this->stream, &event );
+
+  switch(this->completed_pic->sps_nal->sps.profile_idc) {
+    case 100:
+      this->profile = VDP_DECODER_PROFILE_H264_HIGH;
+      break;
+    case 77:
+      this->profile = VDP_DECODER_PROFILE_H264_MAIN;
+      break;
+    case 66:
+    default:
+      // nvidia's VDPAU doesn't support BASELINE. But most (every?) streams marked BASELINE do not use BASELINE specifics,
+      // so, just force MAIN.
+      //this->profile = VDP_DECODER_PROFILE_H264_BASELINE;
+      this->profile = VDP_DECODER_PROFILE_H264_MAIN;
+      break;
+  }
+
+  // Level 4.1 limits:
+  // the value is only reported in the log, the decoder itself is always
+  // created for 16 references below.
+  int ref_frames = 0;
+  if(this->completed_pic->sps_nal->sps.num_ref_frames) {
+    ref_frames = this->completed_pic->sps_nal->sps.num_ref_frames;
+  } else {
+    uint32_t round_width = (this->width + 15) & ~15;
+    uint32_t round_height = (this->height + 15) & ~15;
+    uint32_t surf_size = (round_width * round_height * 3) / 2;
+    /* a zero sized surface (e.g. height still unknown) must not divide */
+    if (surf_size > 0) {
+      ref_frames = (12 * 1024 * 1024) / surf_size;
+    }
+  }
+
+  if (ref_frames > 16) {
+      ref_frames = 16;
+  }
+
+  xprintf(this->xine, XINE_VERBOSITY_LOG, "Allocate %d reference frames\n",
+      ref_frames);
+  /* get the vdpau context from vo */
+  //(this->stream->video_out->open) (this->stream->video_out, this->stream);
+  img = this->stream->video_out->get_frame (this->stream->video_out,
+                                    this->width, this->height,
+                                    this->ratio,
+                                    XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS | this->reset);
+  this->reset = 0;
+
+  this->vdpau_accel = (vdpau_accel_t*)img->accel_data;
+
+  /* the frame was only needed to get at its accel data */
+  img->free(img);
+  img = NULL;
+
+  if(this->vdpau_accel->vdp_runtime_nr > 0) {
+   xprintf(this->xine, XINE_VERBOSITY_LOG,
+       "Create decoder: vdp_device: %d, profile: %d, res: %dx%d\n",
+       this->vdpau_accel->vdp_device, this->profile, this->width, this->height);
+
+   VdpStatus status = this->vdpau_accel->vdp_decoder_create(this->vdpau_accel->vdp_device,
+       this->profile, this->width, this->height, 16, &this->decoder);
+
+   if(status != VDP_STATUS_OK) {
+     xprintf(this->xine, XINE_VERBOSITY_LOG, "vdpau_h264: ERROR: VdpDecoderCreate returned status != OK (%s)\n", this->vdpau_accel->vdp_get_error_string(status));
+     /* make sure nobody renders with whatever the failed call left behind */
+     this->decoder = VDP_INVALID_HANDLE;
+     return 0;
+   }
+  }
+  return 1;
+}
+
+/*
+ * Hands every picture the dpb is willing to output to the video out.
+ *
+ * flush is passed on to dpb_get_next_out_picture(); each returned picture
+ * gets its field order and progressive flag set, is drawn and is then
+ * unmarked as delayed in the dpb.
+ */
+static void draw_frames(video_decoder_t *this_gen, int flush)
+{
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *)this_gen;
+
+  for (;;) {
+    struct decoded_picture *next = dpb_get_next_out_picture(this->nal_parser->dpb, flush);
+    if (next == NULL)
+      break;
+
+    next->img->top_field_first = dp_top_field_first(next);
+    next->img->progressive_frame = check_progressive(this_gen, next);
+#ifdef DEBUG_H264
+    printf("progressive: %d\n", next->img->progressive_frame);
+#endif
+    if (flush) {
+      xprintf(this->xine, XINE_VERBOSITY_DEBUG,
+          "h264 flush, draw pts: %"PRId64"\n", next->img->pts);
+    }
+
+    next->img->draw(next->img, this->stream);
+    dpb_unmark_picture_delayed(this->nal_parser->dpb, next);
+  }
+}
+
+/*
+ * Releases the input of a picture that is not going to be decoded: the
+ * bitstream buffer handed over by the parser and the coded picture itself.
+ */
+static void vdpau_h264_discard_input(vdpau_h264_decoder_t *this, VdpBitstreamBuffer *vdp_buffer)
+{
+  free((uint8_t*)vdp_buffer->bitstream);
+  vdp_buffer->bitstream = NULL;
+  vdp_buffer->bitstream_bytes = 0;
+
+  if (this->completed_pic) {
+    free_coded_picture(this->completed_pic);
+    this->completed_pic = NULL;
+  }
+}
+
+/*
+ * Decodes this->completed_pic from vdp_buffer into a VDPAU surface and
+ * feeds the result into the dpb.
+ *
+ * vdp_buffer->bitstream is owned by this function and is released on every
+ * path. this->completed_pic is either handed over to a decoded picture or
+ * freed; it is always NULL on return.
+ *
+ * Returns 0 when the picture was dropped (decoder not started yet and the
+ * picture is no reference, or VDPAU was preempted), 1 otherwise - this
+ * includes a failing VdpDecoderRender call, which is only logged.
+ */
+static int vdpau_decoder_render(video_decoder_t *this_gen, VdpBitstreamBuffer *vdp_buffer, uint32_t slice_count)
+{
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *)this_gen;
+  vo_frame_t *img = NULL;
+
+  /* if we wait for a second field for this frame, we
+   * have to render to the same surface again.
+   */
+  if (this->incomplete_pic) {
+    img = this->incomplete_pic->img;
+  }
+
+  // FIXME: what is if this is the second field of a field coded
+  // picture? - should we keep the first field in dpb?
+  if(this->completed_pic->flag_mask & IDR_PIC) {
+    dpb_flush(this->nal_parser->dpb);
+    if(this->incomplete_pic) {
+      release_decoded_picture(this->incomplete_pic);
+      this->incomplete_pic = NULL;
+      /* the frame belonged to the picture just released, do not render
+       * into it any more (same as in the broken-second-field case below) */
+      img = NULL;
+    }
+  }
+
+  struct seq_parameter_set_rbsp *sps = &this->completed_pic->sps_nal->sps;
+  struct slice_header *slc = &this->completed_pic->slc_nal->slc;
+
+  /* derive the frame duration from the VUI timing info if nothing else
+   * provided one; a zero time_scale would yield inf/NaN and is skipped */
+  if(sps->vui_parameters_present_flag &&
+      sps->vui_parameters.timing_info_present_flag &&
+      sps->vui_parameters.time_scale != 0 &&
+      this->video_step == 0) {
+    this->video_step = 2*90000/(1/((double)sps->vui_parameters.num_units_in_tick/(double)sps->vui_parameters.time_scale));
+  }
+
+  /* go and decode a frame */
+
+  /* check if we expect a second field, but got a frame */
+  if (this->incomplete_pic && img) {
+    if ((this->completed_pic->slc_nal->slc.frame_num !=
+        this->incomplete_pic->coded_pic[0]->slc_nal->slc.frame_num) ||
+        !slc->field_pic_flag) {
+      xprintf(this->xine, XINE_VERBOSITY_DEBUG, "H264 warning: Expected a second field, stream might be broken\n");
+
+      /* remove this pic from dpb, as it is not complete */
+      dpb_unmark_picture_delayed(this->nal_parser->dpb, this->incomplete_pic);
+      dpb_unmark_reference_picture(this->nal_parser->dpb, this->incomplete_pic);
+
+      release_decoded_picture(this->incomplete_pic);
+      this->incomplete_pic = NULL;
+      img = NULL;
+    }
+  }
+
+
+  VdpPictureInfoH264 pic;
+
+  fill_vdpau_pictureinfo_h264(this_gen, slice_count, &pic);
+
+#ifdef DEBUG_H264
+  dump_pictureinfo_h264(&pic);
+
+  int i;
+  printf("E: Bytes used: %d\n", vdp_buffer->bitstream_bytes);
+  printf("E: Decode data: \nE:");
+  for(i = 0; i < ((vdp_buffer->bitstream_bytes < 20) ? vdp_buffer->bitstream_bytes : 20); i++) {
+    printf("%02x ", ((uint8_t*)vdp_buffer->bitstream)[i]);
+    if((i+1) % 10 == 0)
+      printf("\nE:");
+  }
+  printf("\n...\n");
+  for(i = vdp_buffer->bitstream_bytes - 20; i < vdp_buffer->bitstream_bytes; i++) {
+    printf("%02x ", ((uint8_t*)vdp_buffer->bitstream)[i]);
+    if((i+1) % 10 == 0)
+      printf("\nE:");
+  }
+  printf("\nE: ---------------------------------------------------------------\n");
+#endif
+
+  /* decoding only starts with a reference picture; anything before that
+   * is dropped together with its input data */
+  if(!this->decoder_started && !pic.is_reference) {
+    vdpau_h264_discard_input(this, vdp_buffer);
+    return 0;
+  }
+
+  this->decoder_started = 1;
+
+  if(img == NULL) {
+    img = this->stream->video_out->get_frame (this->stream->video_out,
+                                              this->width, this->height,
+                                              this->ratio,
+                                              XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS);
+    this->vdpau_accel = (vdpau_accel_t*)img->accel_data;
+
+    img->duration  = this->video_step;
+    img->pts       = this->completed_pic->pts;
+
+    if (this->dangling_img) {
+      xprintf(this->xine, XINE_VERBOSITY_LOG,
+          "broken stream: current img wasn't processed -- freeing it!\n");
+      this->dangling_img->free(this->dangling_img);
+    }
+    /* remember the frame until a decoded picture takes it over */
+    this->dangling_img = img;
+  } else {
+    if (img->pts == 0) {
+      img->pts = this->completed_pic->pts;
+    }
+  }
+
+  if(this->vdp_runtime_nr != *(this->vdpau_accel->current_vdp_runtime_nr)) {
+    xprintf(this->xine, XINE_VERBOSITY_LOG,
+        "VDPAU was preempted. Reinitialise the decoder.\n");
+    /* this picture is lost; release its data before the parser that
+     * produced it is replaced by vdpau_h264_reset() */
+    vdpau_h264_discard_input(this, vdp_buffer);
+    this->decoder = VDP_INVALID_HANDLE;
+    vdpau_h264_reset(this_gen);
+    this->vdp_runtime_nr = this->vdpau_accel->vdp_runtime_nr;
+    return 0;
+  }
+
+  VdpVideoSurface surface = this->vdpau_accel->surface;
+
+  /*xprintf(this->xine, XINE_VERBOSITY_DEBUG,
+      "Decode: NUM: %d, REF: %d, BYTES: %d, PTS: %lld\n", pic.frame_num, pic.is_reference, vdp_buffer->bitstream_bytes, this->completed_pic->pts);*/
+  VdpStatus status = this->vdpau_accel->vdp_decoder_render(this->decoder,
+      surface, (VdpPictureInfo*)&pic, 1, vdp_buffer);
+
+  /* free the image data */
+  free((uint8_t*)vdp_buffer->bitstream);
+  vdp_buffer->bitstream = NULL;
+
+  process_mmc_operations(this->nal_parser, this->completed_pic);
+
+  if(status != VDP_STATUS_OK)
+  {
+    xprintf(this->xine, XINE_VERBOSITY_LOG, "vdpau_h264: Decoder failure: %s\n",  this->vdpau_accel->vdp_get_error_string(status));
+    if (this->dangling_img)
+      this->dangling_img->free(this->dangling_img);
+    img = NULL;
+    this->dangling_img = NULL;
+    free_coded_picture(this->completed_pic);
+    this->completed_pic = NULL;
+  }
+  else {
+    img->bad_frame = 0;
+
+    if(!img->progressive_frame && this->completed_pic->repeat_pic)
+      img->repeat_first_field = 1;
+    //else if(img->progressive_frame && this->nal_parser->current_nal->repeat_pic)
+    //  img->duration *= this->nal_parser->current_nal->repeat_pic;
+
+    /* only bt601 and bt709 handled so far. others seem to be rarely used */
+    if(sps->vui_parameters.colour_description_present) {
+      switch (sps->vui_parameters.colour_primaries) {
+        case 1:
+          this->color_standard = VDP_COLOR_STANDARD_ITUR_BT_709;
+          break;
+        case 5:
+        case 6:
+        default:
+          this->color_standard = VDP_COLOR_STANDARD_ITUR_BT_601;
+          break;
+      }
+    }
+
+    this->vdpau_accel->color_standard = this->color_standard;
+
+    struct decoded_picture *decoded_pic = NULL;
+
+
+    uint8_t draw_frame = 0;
+    if (!slc->field_pic_flag) { /* frame coded: simply add to dpb */
+      decoded_pic = init_decoded_picture(this->completed_pic, img);
+      this->completed_pic = NULL;
+      this->dangling_img = NULL;
+
+      dpb_add_picture(this->nal_parser->dpb, decoded_pic, sps->num_ref_frames);
+
+      draw_frame = 1;
+    } else { /* field coded: check for second field */
+      if (!this->incomplete_pic) {
+        decoded_pic = init_decoded_picture(this->completed_pic, img);
+        this->completed_pic = NULL;
+        this->dangling_img = NULL;
+        this->incomplete_pic = decoded_pic;
+        /* extra lock for the incomplete_pic pointer kept by the decoder */
+        lock_decoded_picture(this->incomplete_pic);
+
+        dpb_add_picture(this->nal_parser->dpb, decoded_pic, sps->num_ref_frames);
+
+        /* don't do a draw yet as the field was incomplete */
+        draw_frame = 0;
+      } else {
+        decoded_pic = this->incomplete_pic;
+        lock_decoded_picture(decoded_pic);
+
+        /* picture is complete now */
+        release_decoded_picture(this->incomplete_pic);
+        this->incomplete_pic = NULL;
+        this->dangling_img = NULL;
+
+        decoded_pic_add_field(decoded_pic, this->completed_pic);
+        this->completed_pic = NULL;
+
+        draw_frame = 1;
+      }
+    }
+
+    /* drop the local reference to the decoded picture */
+    release_decoded_picture(decoded_pic);
+
+    /* draw the next frame in display order */
+    if (draw_frame) {
+      draw_frames(this_gen, 0);
+    }
+  }
+
+  return 1;
+}
+
+/*
+ * Replaces the decoder's private copy of the codec configuration data.
+ *
+ * The previous copy is released first. When len is 0, data is NULL or the
+ * allocation fails, no copy is kept and codec_private_len is 0, so that
+ * vdpau_h264_reset() will not try to re-parse it.
+ */
+static void vdpau_h264_store_codec_private(vdpau_h264_decoder_t *this,
+    const uint8_t *data, uint32_t len)
+{
+  free(this->codec_private);
+  this->codec_private = NULL;
+  this->codec_private_len = 0;
+
+  if (data == NULL || len == 0)
+    return;
+
+  this->codec_private = malloc(len);
+  if (this->codec_private == NULL)
+    return;
+
+  memcpy(this->codec_private, data, len);
+  this->codec_private_len = len;
+}
+
+/*
+ * This function receives a buffer of data from the demuxer layer and
+ * figures out how to handle it based on its header flags.
+ *
+ * Header buffers (BUF_FLAG_STDHEADER, BUF_SPECIAL_DECODER_CONFIG) carry
+ * codec configuration which is stored and parsed. Everything else is fed
+ * to the NAL parser; each completed picture is rendered once a decoder
+ * exists, otherwise it is dropped together with its bitstream data.
+ */
+static void vdpau_h264_decode_data (video_decoder_t *this_gen,
+  buf_element_t *buf) {
+
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *) this_gen;
+
+  VdpBitstreamBuffer vdp_buffer;
+  vdp_buffer.struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+  vdp_buffer.bitstream = NULL;
+  vdp_buffer.bitstream_bytes = 0;
+
+  /* a video decoder does not care about this flag (?) */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if(buf->decoder_flags & BUF_FLAG_FRAME_START || buf->decoder_flags & BUF_FLAG_FRAME_END)
+    this->have_frame_boundary_marks = 1;
+
+  if (buf->decoder_flags & BUF_FLAG_FRAMERATE) {
+    this->video_step = buf->decoder_info[0];
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, this->video_step);
+  }
+
+  if (this->video_step != this->reported_video_step){
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, (this->reported_video_step = this->video_step));
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER) { /* need to initialize */
+    this->have_frame_boundary_marks = 0;
+
+    xine_bmiheader *bih = (xine_bmiheader*)buf->content;
+    this->width                         = bih->biWidth;
+    this->height                        = bih->biHeight;
+
+    /* codec private data follows the header; a biSize smaller than the
+     * header itself must not wrap around to a huge length.
+     * NOTE(review): biSize is not checked against buf->size here - confirm
+     * that demuxers always deliver the complete header. */
+    uint8_t *codec_private = buf->content + sizeof(xine_bmiheader);
+    uint32_t codec_private_len = 0;
+    if (bih->biSize > 0 && (size_t)bih->biSize > sizeof(xine_bmiheader)) {
+      codec_private_len = bih->biSize - sizeof(xine_bmiheader);
+    }
+    vdpau_h264_store_codec_private(this, codec_private, codec_private_len);
+
+    if(codec_private_len > 0) {
+      parse_codec_private(this->nal_parser, codec_private, codec_private_len);
+    }
+  } else if (buf->decoder_flags & BUF_FLAG_SPECIAL) {
+    this->have_frame_boundary_marks = 0;
+
+    if(buf->decoder_info[1] == BUF_SPECIAL_DECODER_CONFIG) {
+      uint8_t *codec_private = buf->decoder_info_ptr[2];
+      uint32_t codec_private_len = buf->decoder_info[2];
+      vdpau_h264_store_codec_private(this, codec_private, codec_private_len);
+
+      if(codec_private != NULL && codec_private_len > 0) {
+        parse_codec_private(this->nal_parser, codec_private, codec_private_len);
+      }
+    } else if (buf->decoder_info[1] == BUF_SPECIAL_PALETTE) {
+      xprintf(this->xine, XINE_VERBOSITY_LOG,
+          "SPECIAL PALETTE is not yet handled\n");
+    } else
+      xprintf(this->xine, XINE_VERBOSITY_LOG,
+          "UNKNOWN SPECIAL HEADER\n");
+
+  } else {
+    /* regular stream data: run it through the parser, picture by picture */
+    int len = 0;
+
+    while(len < buf->size && !(this->wait_for_frame_start && !(buf->decoder_flags & BUF_FLAG_FRAME_START))) {
+      this->wait_for_frame_start = 0;
+
+      /* never carry the buffer of a previous picture into this round */
+      vdp_buffer.bitstream = NULL;
+      vdp_buffer.bitstream_bytes = 0;
+
+      len += parse_frame(this->nal_parser, buf->content + len, buf->size - len,
+          buf->pts,
+          (uint8_t**)&vdp_buffer.bitstream, &vdp_buffer.bitstream_bytes, &this->completed_pic);
+
+      /* the first picture with a usable SPS triggers the decoder setup */
+      if(this->decoder == VDP_INVALID_HANDLE &&
+          this->completed_pic &&
+          this->completed_pic->sps_nal != NULL &&
+          this->completed_pic->sps_nal->sps.pic_width > 0 &&
+          this->completed_pic->sps_nal->sps.pic_height > 0) {
+
+        vdpau_decoder_init(this_gen);
+      }
+
+      if(this->completed_pic &&
+          this->completed_pic->sps_nal != NULL &&
+          this->completed_pic->sps_nal->sps.vui_parameters_present_flag &&
+          this->completed_pic->sps_nal->sps.vui_parameters.bitstream_restriction_flag) {
+
+        this->nal_parser->dpb->max_reorder_frames =
+            this->completed_pic->sps_nal->sps.vui_parameters.num_reorder_frames + 1;
+        this->nal_parser->dpb->max_dpb_frames = this->completed_pic->sps_nal->sps.vui_parameters.max_dec_frame_buffering + 1;
+
+        xprintf(this->xine, XINE_VERBOSITY_DEBUG,
+                    "max reorder count: %d, max dpb count %d\n",
+                    this->nal_parser->dpb->max_reorder_frames,
+                    this->nal_parser->dpb->max_dpb_frames);
+      }
+
+      if(this->decoder != VDP_INVALID_HANDLE &&
+          this->completed_pic != NULL &&
+          vdp_buffer.bitstream_bytes > 0 &&
+          this->completed_pic->slc_nal != NULL &&
+          this->completed_pic->pps_nal != NULL) {
+        /* the render function takes over bitstream and coded picture */
+        vdpau_decoder_render(this_gen, &vdp_buffer, this->completed_pic->slice_cnt);
+      } else {
+        /* not decodable: release whatever the parser handed out */
+        if (vdp_buffer.bitstream_bytes > 0) {
+          free((uint8_t*)vdp_buffer.bitstream);
+          vdp_buffer.bitstream = NULL;
+          vdp_buffer.bitstream_bytes = 0;
+        }
+        if (this->completed_pic != NULL) {
+          free_coded_picture(this->completed_pic);
+          this->completed_pic = NULL;
+        }
+      }
+
+      /* in case the last nal was detected as END_OF_SEQUENCE
+       * we will flush the dpb, so that all pictures get drawn
+       */
+      if(this->nal_parser->last_nal_res == 3) {
+        xprintf(this->xine, XINE_VERBOSITY_DEBUG,
+            "END_OF_SEQUENCE, flush buffers\n");
+        vdpau_h264_flush(this_gen);
+      }
+    }
+  }
+
+  if(buf->decoder_flags & BUF_FLAG_FRAME_END)
+    this->wait_for_frame_start = 0;
+}
+
+/*
+ * This function is called when xine needs to flush the system.
+ *
+ * A frame that was never handed to a decoded picture and a picture still
+ * waiting for its second field are dropped, then every picture left in the
+ * dpb is drawn and the dpb is emptied. The next frame request will carry
+ * VO_NEW_SEQUENCE_FLAG.
+ */
+static void vdpau_h264_flush (video_decoder_t *this_gen) {
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t*) this_gen;
+
+  if(this->dangling_img){
+    this->dangling_img->free(this->dangling_img);
+    this->dangling_img = NULL;
+  }
+
+  if (this->incomplete_pic) {
+    release_decoded_picture(this->incomplete_pic);
+    this->incomplete_pic = NULL;
+  }
+
+  draw_frames(this_gen, 1);
+  dpb_free_all(this->nal_parser->dpb);
+  this->reset = VO_NEW_SEQUENCE_FLAG;
+}
+
+/*
+ * This function resets the video decoder.
+ *
+ * The dpb is emptied, the VDPAU decoder object is destroyed, the NAL parser
+ * is replaced by a fresh one (which is fed the stored codec private data
+ * again, if there is any) and the colour standard, frame duration and
+ * progressive counter return to their start values. Pending pictures and
+ * frames are dropped. A new decoder object is not created here; that
+ * happens in vdpau_h264_decode_data() once the handle is invalid.
+ */
+static void vdpau_h264_reset (video_decoder_t *this_gen) {
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *) this_gen;
+
+  dpb_free_all(this->nal_parser->dpb);
+
+  if (this->decoder != VDP_INVALID_HANDLE) {
+    this->vdpau_accel->vdp_decoder_destroy( this->decoder );
+    this->decoder = VDP_INVALID_HANDLE;
+  }
+
+  // Doing a full parser reinit here works more reliably than
+  // resetting
+
+  //reset_parser(this->nal_parser);
+  free_parser(this->nal_parser);
+  this->nal_parser = init_parser(this->xine);
+
+  this->color_standard = VDP_COLOR_STANDARD_ITUR_BT_601;
+  this->video_step = 0;
+
+  /* the new parser knows nothing about the stream yet, so replay the
+   * configuration data saved by vdpau_h264_decode_data() */
+  if(this->codec_private_len > 0) {
+    parse_codec_private(this->nal_parser, this->codec_private, this->codec_private_len);
+
+    /* if the stream does not contain frame boundary marks we
+     * have to hope that the next nal will start with the next
+     * incoming buf... seems to work, though...
+     */
+    this->wait_for_frame_start = this->have_frame_boundary_marks;
+  }
+
+  /* drop the first field of a picture whose second field never arrived */
+  if (this->incomplete_pic) {
+    release_decoded_picture(this->incomplete_pic);
+    this->incomplete_pic = NULL;
+  }
+
+  /* give back a frame that no decoded picture took over */
+  if (this->dangling_img) {
+    this->dangling_img->free(this->dangling_img);
+    this->dangling_img = NULL;
+  }
+
+  this->progressive_cnt = 0;
+  /* OR-ed into the flags of the next frame request in vdpau_decoder_init() */
+  this->reset = VO_NEW_SEQUENCE_FLAG;
+}
+
+/*
+ * Discontinuity handler: the pts values stored for the pictures in the dpb
+ * are cleared and the next frame request is flagged as a new sequence.
+ */
+static void vdpau_h264_discontinuity (video_decoder_t *this_gen) {
+  vdpau_h264_decoder_t *decoder = (vdpau_h264_decoder_t *) this_gen;
+
+  /* picked up (and cleared) by vdpau_decoder_init() when it asks for a frame */
+  decoder->reset = VO_NEW_SEQUENCE_FLAG;
+
+  dpb_clear_all_pts(decoder->nal_parser->dpb);
+}
+
+/*
+ * This function frees the video decoder instance allocated to the decoder.
+ *
+ * Pending pictures and frames are dropped, the dpb is emptied, the VDPAU
+ * decoder object is destroyed, the video out is closed for this stream and
+ * finally the parser, the saved codec private data and the instance itself
+ * are released.
+ */
+static void vdpau_h264_dispose (video_decoder_t *this_gen) {
+
+  vdpau_h264_decoder_t *this = (vdpau_h264_decoder_t *) this_gen;
+
+  if (this->incomplete_pic) {
+    release_decoded_picture(this->incomplete_pic);
+    this->incomplete_pic = NULL;
+  }
+
+  if (this->dangling_img) {
+    this->dangling_img->free(this->dangling_img);
+    this->dangling_img = NULL;
+  }
+
+  dpb_free_all(this->nal_parser->dpb);
+
+  if (this->decoder != VDP_INVALID_HANDLE) {
+    this->vdpau_accel->vdp_decoder_destroy( this->decoder );
+    this->decoder = VDP_INVALID_HANDLE;
+  }
+
+  this->stream->video_out->close( this->stream->video_out, this->stream );
+
+  free_parser (this->nal_parser);
+
+  /* copy made by vdpau_h264_decode_data(); NULL when no header was seen */
+  free (this->codec_private);
+  this->codec_private = NULL;
+  this->codec_private_len = 0;
+
+  free (this_gen);
+}
+
+/*
+ * This function allocates, initializes, and returns a private video
+ * decoder structure.
+ *
+ * Returns NULL when the video out driver lacks VO_CAP_VDPAU_H264, when a
+ * trial VdpDecoderCreate call (1920x1080, MAIN profile) fails, or when the
+ * instance or its parser cannot be allocated.
+ */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+
+  vdpau_h264_decoder_t  *this ;
+
+  /* the videoout must be vdpau-capable to support this decoder */
+  if ( !(stream->video_driver->get_capabilities(stream->video_driver) & VO_CAP_VDPAU_H264) )
+	  return NULL;
+
+  /* now check if vdpau has free decoder resource */
+  vo_frame_t *img = stream->video_out->get_frame( stream->video_out, 1920, 1080, 1, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS );
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  int runtime_nr = accel->vdp_runtime_nr;
+  img->free(img);
+  VdpDecoder decoder;
+  VdpStatus st = accel->vdp_decoder_create( accel->vdp_device, VDP_DECODER_PROFILE_H264_MAIN, 1920, 1080, 16, &decoder );
+  if ( st!=VDP_STATUS_OK ) {
+    lprintf( "can't create vdpau decoder.\n" );
+    return NULL;
+  }
+
+  /* the probe decoder is not needed any further */
+  accel->vdp_decoder_destroy( decoder );
+
+  this = calloc(1, sizeof(*this));
+  if ( !this )
+    return NULL;
+
+  this->nal_parser = init_parser(stream->xine);
+  if ( !this->nal_parser ) {
+    /* every other entry point dereferences the parser unconditionally */
+    free( this );
+    return NULL;
+  }
+
+  this->video_decoder.decode_data         = vdpau_h264_decode_data;
+  this->video_decoder.flush               = vdpau_h264_flush;
+  this->video_decoder.reset               = vdpau_h264_reset;
+  this->video_decoder.discontinuity       = vdpau_h264_discontinuity;
+  this->video_decoder.dispose             = vdpau_h264_dispose;
+
+  this->stream                            = stream;
+  this->xine                              = stream->xine;
+  this->class                             = (vdpau_h264_class_t *) class_gen;
+
+  /* the real decoder is created once the first SPS has been parsed */
+  this->decoder                           = VDP_INVALID_HANDLE;
+  this->vdp_runtime_nr                    = runtime_nr;
+  this->color_standard                    = VDP_COLOR_STANDARD_ITUR_BT_601;
+  this->progressive_cnt                   = 0;
+
+  this->reset = VO_NEW_SEQUENCE_FLAG;
+
+  (this->stream->video_out->open) (this->stream->video_out, this->stream);
+
+  return &this->video_decoder;
+}
+
+/*
+ * This function allocates a private video decoder class and initializes
+ * the class's member functions.
+ *
+ * Neither argument is used. Returns NULL when the class cannot be
+ * allocated.
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+
+  vdpau_h264_class_t *this;
+
+  this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "vdpau_h264";
+  this->decoder_class.description     =
+	N_("vdpau_h264: h264 decoder plugin using VDPAU hardware decoding.\n"
+	   "Must be used along with video_out_vdpau.");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+/*
+ * The internal xine video buffer types this decoder is able to handle:
+ * H.264 only. See xine's buffer.h for the list of valid buffer types.
+ * The list is terminated with a 0.
+ */
+static const uint32_t video_types[] = {
+  BUF_VIDEO_H264,
+  0
+};
+
+/*
+ * This data structure combines the list of supported xine buffer types and
+ * the priority that the plugin should be given with respect to other
+ * plugins that handle the same buffer type. A plugin with priority (n+1)
+ * will be used instead of a plugin with priority (n); this decoder
+ * registers with priority 7.
+ */
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types */
+  7                    /* priority        */
+};
+
+/*
+ * The plugin catalog entry. This is the only information that this plugin
+ * will export to the public. The list ends with a PLUGIN_NONE entry.
+ */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* { type, API, "name", version, special_info, init_function } */
+  { PLUGIN_VIDEO_DECODER | PLUGIN_MUST_PRELOAD, 19, "vdpau_h264", XINE_VERSION_CODE, &dec_info_video, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/libvdpau/vdpau_mpeg12.c b/src/video_dec/libvdpau/vdpau_mpeg12.c
new file mode 100644
index 000000000..1067f8634
--- /dev/null
+++ b/src/video_dec/libvdpau/vdpau_mpeg12.c
@@ -0,0 +1,1101 @@
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; remove-trailing-space on;
+ * Copyright (C) 2008 the xine project
+ * Copyright (C) 2008 Christophe Thommeret <hftom@free.fr>
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * vdpau_mpeg12.c, a mpeg1/2 video stream parser using VDPAU hardware decoder
+ *
+ */
+
+/*#define LOG*/
+#define LOG_MODULE "vdpau_mpeg12"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "accel_vdpau.h"
+#include "bits_reader.h"
+
+#include <vdpau/vdpau.h>
+
+/* start code values, going by their names; presumably the byte that follows
+ * the 00 00 01 start code prefix - TODO confirm against the parser code */
+#define sequence_header_code    0xb3
+#define sequence_error_code     0xb4
+#define sequence_end_code       0xb7
+#define group_start_code        0xb8
+#define extension_start_code    0xb5
+#define user_data_start_code    0xb2
+#define picture_start_code      0x00
+#define begin_slice_start_code  0x01
+#define end_slice_start_code    0xaf
+
+/* identifiers of the individual extensions; presumably read from the data
+ * following extension_start_code - TODO confirm */
+#define sequence_ext_sc         1
+#define quant_matrix_ext_sc     3
+#define picture_coding_ext_sc   8
+#define sequence_display_ext_sc 2
+
+/* picture coding types */
+#define I_FRAME   1
+#define P_FRAME   2
+#define B_FRAME   3
+
+/* picture structure values: single field (top or bottom) or whole frame */
+#define PICTURE_TOP     1
+#define PICTURE_BOTTOM  2
+#define PICTURE_FRAME   3
+
+/*#define MAKE_DAT*/ /* do NOT define this unless you know what you are doing */
+#ifdef MAKE_DAT
+static int nframes;
+static FILE *outfile;
+#endif
+
+
+
+/* default intra quant matrix, in zig-zag order.
+ * sequence_header() copies it into the picture infos through
+ * mpeg2_scan_norm[] when the stream does not load its own matrix. */
+static const uint8_t default_intra_quantizer_matrix[64] = {
+    8,
+    16, 16,
+    19, 16, 19,
+    22, 22, 22, 22,
+    22, 22, 26, 24, 26,
+    27, 27, 27, 26, 26, 26,
+    26, 27, 27, 27, 29, 29, 29,
+    34, 34, 34, 29, 29, 29, 27, 27,
+    29, 29, 32, 32, 34, 34, 37,
+    38, 37, 35, 35, 34, 35,
+    38, 38, 40, 40, 40,
+    48, 48, 46, 46,
+    56, 56, 58,
+    69, 69,
+    83
+};
+
+/* Maps a position in zig-zag scan order (the array index) to the matrix
+ * index it is stored at; used by sequence_header() when filling the
+ * quantizer matrices.
+ * NOTE(review): this table is neither static nor const although nothing in
+ * view modifies it or needs external linkage - check for users in other
+ * files before narrowing it to "static const". */
+uint8_t mpeg2_scan_norm[64] = {
+    /* Zig-Zag scan pattern */
+     0, 1, 8,16, 9, 2, 3,10,
+    17,24,32,25,18,11, 4, 5,
+    12,19,26,33,40,48,41,34,
+    27,20,13, 6, 7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+
+
+/*
+ * State of the picture currently being collected: the VDPAU picture infos
+ * for up to two fields plus the buffer holding the slice data.
+ */
+typedef struct {
+  VdpPictureInfoMPEG1Or2  vdp_infos; /* first field, also used for frame */
+  VdpPictureInfoMPEG1Or2  vdp_infos2; /* second field */
+  int                     slices_count, slices_count2; /* presumably the slice counts of first/second field - TODO confirm */
+  uint8_t                 *slices; /* slice data buffer, allocated in init_picture() */
+  int                     slices_size; /* allocated size of slices in bytes */
+  int                     slices_pos, slices_pos_top; /* positions inside slices; cleared by reset_picture() */
+
+  int                     progressive_frame;
+  int                     repeat_first_field;
+} picture_t;
+
+
+
+/*
+ * Everything the parser knows about the current video sequence: the values
+ * read from the sequence header and its extensions, the data accumulation
+ * buffer, the picture being assembled and the reference frames.
+ */
+typedef struct {
+  uint32_t    coded_width;
+  uint32_t    coded_height;
+
+  double      video_step; /* frame duration in pts units */
+  double      reported_video_step; /* frame duration in pts units */
+  double      ratio;
+   
+  VdpDecoderProfile profile;
+  /* raw header fields, filled in by sequence_header() and, presumably, by
+   * the extension parsers outside this view */
+  int         horizontal_size_value;
+  int         vertical_size_value;
+  int         aspect_ratio_information;
+  int         frame_rate_code;
+  int         progressive_sequence;
+  int         chroma;
+  int         horizontal_size_extension;
+  int         vertical_size_extension;
+  int         frame_rate_extension_n;
+  int         frame_rate_extension_d;
+  int         display_horizontal_size;
+  int         display_vertical_size;
+  int         top_field_first;
+
+  int         have_header; /* set to 1 by sequence_header() */
+  int         have_display_extension; /* cleared by sequence_header() */
+
+  uint8_t     *buf; /* accumulate data */
+  int         bufseek;
+  uint32_t    bufsize;
+  uint32_t    bufpos;
+  int         start; /* -1 after reset_sequence() with free_refs set */
+
+  picture_t   picture;
+  vo_frame_t  *forward_ref; /* reference frames, freed by reset_sequence() */
+  vo_frame_t  *backward_ref;
+
+  int64_t    cur_pts, seq_pts;
+
+  vdpau_accel_t *accel_vdpau;
+
+  bits_reader_t  br; /* bit reader used for header parsing */
+
+  int         vdp_runtime_nr;
+  int         reset; /* VO_NEW_SEQUENCE_FLAG after a full reset_sequence() */
+
+} sequence_t;
+
+
+
+/* Plugin class; carries nothing beyond the generic decoder class. */
+typedef struct {
+  video_decoder_class_t   decoder_class;
+} vdpau_mpeg12_class_t;
+
+
+
+/*
+ * One decoder instance: the generic decoder interface, the parser state and
+ * the VDPAU decoder object together with the parameters recorded for it.
+ */
+typedef struct vdpau_mpeg12_decoder_s {
+  video_decoder_t         video_decoder;  /* parent video decoder structure */
+
+  vdpau_mpeg12_class_t    *class;
+  xine_stream_t           *stream;
+
+  sequence_t              sequence;       /* parser state, see sequence_t */
+
+  VdpDecoder              decoder;
+  /* presumably the parameters the current decoder object was created
+   * with - TODO confirm against the creation code */
+  VdpDecoderProfile       decoder_profile;
+  uint32_t                decoder_width;
+  uint32_t                decoder_height;
+
+} vdpau_mpeg12_decoder_t;
+
+
+static void picture_ready( vdpau_mpeg12_decoder_t *vd, uint8_t end_of_sequence );
+
+
+
+/*
+ * Puts a picture back into its start state: both picture infos get the
+ * same default coding parameters and the slice bookkeeping is cleared.
+ * The slice buffer and the quantizer matrices are left alone.
+ */
+static void reset_picture( picture_t *pic )
+{
+  VdpPictureInfoMPEG1Or2 *infos[2] = { &pic->vdp_infos, &pic->vdp_infos2 };
+  int n;
+
+  lprintf( "reset_picture\n" );
+
+  for ( n=0; n<2; ++n ) {
+    infos[n]->picture_structure = 0;
+    infos[n]->intra_dc_precision = 0;
+    infos[n]->frame_pred_frame_dct = 1;
+    infos[n]->concealment_motion_vectors = 0;
+    infos[n]->intra_vlc_format = 0;
+    infos[n]->alternate_scan = 0;
+    infos[n]->q_scale_type = 0;
+    infos[n]->top_field_first = 1;
+  }
+
+  pic->slices_count = pic->slices_count2 = 0;
+  pic->slices_pos = pic->slices_pos_top = 0;
+  pic->progressive_frame = 0;
+  pic->repeat_first_field = 0;
+}
+
+
+
+/*
+ * Allocates the initial slice buffer (2048 bytes) of a picture and resets
+ * the picture.
+ *
+ * When the allocation fails the recorded size is 0, so slices_size never
+ * claims more room than the buffer really has.
+ */
+static void init_picture( picture_t *pic )
+{
+  pic->slices_size = 2048;
+  pic->slices = malloc(pic->slices_size);
+  if ( !pic->slices )
+    pic->slices_size = 0;
+  reset_picture( pic );
+}
+
+
+
+/*
+ * Forgets all pts values of the sequence and of its reference frames.
+ *
+ * With free_refs set this is a full reset in addition: the accumulation
+ * buffer positions are rewound, both reference frames are freed and the
+ * next frame is flagged as the start of a new sequence.
+ */
+static void reset_sequence( sequence_t *sequence, int free_refs )
+{
+  vo_frame_t **refs[2] = { &sequence->forward_ref, &sequence->backward_ref };
+  int n;
+
+  sequence->seq_pts = 0;
+  sequence->cur_pts = 0;
+  for ( n=0; n<2; ++n ) {
+    if ( *refs[n] )
+      (*refs[n])->pts = 0;
+  }
+
+  if ( free_refs ) {
+    sequence->start = -1;
+    sequence->bufseek = 0;
+    sequence->bufpos = 0;
+
+    for ( n=0; n<2; ++n ) {
+      if ( *refs[n] )
+        (*refs[n])->free( *refs[n] );
+      *refs[n] = NULL;
+    }
+
+    sequence->top_field_first = 0;
+    sequence->reset = VO_NEW_SEQUENCE_FLAG;
+  }
+}
+
+
+
+/*
+ * Returns the sequence to its defaults (no header seen, MPEG1 profile,
+ * chroma 0, frame duration 3600) and performs a full reset_sequence(),
+ * which frees the reference frames. Despite the name nothing else is
+ * released here; the debug message reads "init_sequence".
+ */
+static void free_sequence( sequence_t *sequence )
+{
+  lprintf( "init_sequence\n" );
+
+  sequence->video_step = 3600;
+  sequence->chroma = 0;
+  sequence->profile = VDP_DECODER_PROFILE_MPEG1;
+  sequence->have_header = 0;
+
+  reset_sequence( sequence, 1 );
+}
+
+
+
+/*
+ * Parses a sequence header from buf (len bytes).
+ *
+ * The sequence falls back to the MPEG1 profile without size or display
+ * extension. Sizes, aspect ratio information and frame rate code are
+ * stored in the sequence; bit rate, marker bit, vbv buffer size and the
+ * constrained parameters flag are read and only logged. Both picture infos
+ * receive the same intra and non intra quantizer matrices: either the ones
+ * loaded from the stream (given in zig-zag order) or the defaults.
+ */
+static void sequence_header( vdpau_mpeg12_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *seq = (sequence_t*)&this_gen->sequence;
+  VdpPictureInfoMPEG1Or2 *first = &seq->picture.vdp_infos;
+  VdpPictureInfoMPEG1Or2 *second = &seq->picture.vdp_infos2;
+  int load_matrix, pos;
+  int skipped;
+
+  if ( !seq->have_header )
+    seq->have_header = 1;
+
+  seq->have_display_extension = 0;
+  seq->vertical_size_extension = 0;
+  seq->horizontal_size_extension = 0;
+  seq->profile = VDP_DECODER_PROFILE_MPEG1;
+
+  bits_reader_set( &seq->br, buf, len );
+
+  seq->horizontal_size_value = read_bits( &seq->br, 12 );
+  lprintf( "horizontal_size_value: %d\n", seq->horizontal_size_value );
+  seq->vertical_size_value = read_bits( &seq->br, 12 );
+  lprintf( "vertical_size_value: %d\n", seq->vertical_size_value );
+  seq->aspect_ratio_information = read_bits( &seq->br, 4 );
+  lprintf( "aspect_ratio_information: %d\n", seq->aspect_ratio_information );
+  seq->frame_rate_code = read_bits( &seq->br, 4 );
+  lprintf( "frame_rate_code: %d\n", seq->frame_rate_code );
+
+  /* the next four fields are consumed for logging purposes only */
+  skipped = read_bits( &seq->br, 18 );
+  lprintf( "bit_rate_value: %d\n", skipped );
+  skipped = read_bits( &seq->br, 1 );
+  lprintf( "marker_bit: %d\n", skipped );
+  skipped = read_bits( &seq->br, 10 );
+  lprintf( "vbv_buffer_size_value: %d\n", skipped );
+  skipped = read_bits( &seq->br, 1 );
+  lprintf( "constrained_parameters_flag: %d\n", skipped );
+
+  /* intra matrix: from the stream if announced, the default otherwise;
+   * both sources are in zig-zag order and stored via mpeg2_scan_norm[] */
+  load_matrix = read_bits( &seq->br, 1 );
+  lprintf( "load_intra_quantizer_matrix: %d\n", load_matrix );
+  for ( pos=0; pos<64; ++pos ) {
+    const int dst = mpeg2_scan_norm[pos];
+
+    if ( load_matrix )
+      first->intra_quantizer_matrix[dst] = read_bits( &seq->br, 8 );
+    else
+      first->intra_quantizer_matrix[dst] = default_intra_quantizer_matrix[pos];
+    second->intra_quantizer_matrix[dst] = first->intra_quantizer_matrix[dst];
+  }
+
+  /* non intra matrix: from the stream if announced, flat 16 otherwise */
+  load_matrix = read_bits( &seq->br, 1 );
+  lprintf( "load_non_intra_quantizer_matrix: %d\n", load_matrix );
+  if ( !load_matrix ) {
+    memset( first->non_intra_quantizer_matrix, 16, 64 );
+    memset( second->non_intra_quantizer_matrix, 16, 64 );
+  }
+  else {
+    for ( pos=0; pos<64; ++pos ) {
+      const int dst = mpeg2_scan_norm[pos];
+
+      first->non_intra_quantizer_matrix[dst] = read_bits( &seq->br, 8 );
+      second->non_intra_quantizer_matrix[dst] = first->non_intra_quantizer_matrix[dst];
+    }
+  }
+}
+
+
+
+/* Derive coded size, video_step and aspect ratio from the parsed sequence fields. The first time a
+ * header has been seen they are published as stream info plus a format-change event; afterwards
+ * only a changed frame duration is re-reported. */
+static void process_sequence_mpeg12_dependent_data( vdpau_mpeg12_decoder_t *this_gen )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  int frame_rate_value_n, frame_rate_value_d;
+  /* the size values carry the 12 low bits, so the extension bits start at bit 12 */
+  sequence->coded_width  = sequence->horizontal_size_value | (sequence->horizontal_size_extension << 12);
+  sequence->coded_height = sequence->vertical_size_value   | (sequence->vertical_size_extension   << 12);
+
+  switch ( sequence->frame_rate_code ) {
+    case 1:  frame_rate_value_n = 24; frame_rate_value_d = 1001; break; /* 23.976.. */
+    case 2:  frame_rate_value_n = 24; frame_rate_value_d = 1000; break; /* 24 */
+    case 3:  frame_rate_value_n = 25; frame_rate_value_d = 1000; break; /* 25 */
+    case 4:  frame_rate_value_n = 30; frame_rate_value_d = 1001; break; /* 29.97.. */
+    case 5:  frame_rate_value_n = 30; frame_rate_value_d = 1000; break; /* 30 */
+    case 6:  frame_rate_value_n = 50; frame_rate_value_d = 1000; break; /* 50 */
+    case 7:  frame_rate_value_n = 60; frame_rate_value_d = 1001; break; /* 59.94.. */
+    case 8:  frame_rate_value_n = 60; frame_rate_value_d = 1000; break; /* 60 */
+    default: frame_rate_value_n = 50; frame_rate_value_d = 1000; /* assume 50 */
+  }
+
+  sequence->video_step = 90.0 * (frame_rate_value_d * (sequence->frame_rate_extension_d + 1))
+                              / (frame_rate_value_n * (sequence->frame_rate_extension_n + 1)); /* e.g. 3600 for 25 fps */
+
+  if ( sequence->profile==VDP_DECODER_PROFILE_MPEG1 ) {
+    double pel_aspect_ratio; /* height / width */
+    switch ( sequence->aspect_ratio_information ) { /* every case breaks so the selected value is kept */
+      case  1: pel_aspect_ratio = 1.0000; break;
+      case  2: pel_aspect_ratio = 0.6735; break;
+      case  3: pel_aspect_ratio = 0.7031; break;
+      case  4: pel_aspect_ratio = 0.7615; break;
+      case  5: pel_aspect_ratio = 0.8055; break;
+      case  6: pel_aspect_ratio = 0.8437; break;
+      case  7: pel_aspect_ratio = 0.8935; break;
+      case  8: pel_aspect_ratio = 0.9157; break;
+      case  9: pel_aspect_ratio = 0.9815; break;
+      case 10: pel_aspect_ratio = 1.0255; break;
+      case 11: pel_aspect_ratio = 1.0695; break;
+      case 12: pel_aspect_ratio = 1.0950; break;
+      case 13: pel_aspect_ratio = 1.1575; break;
+      case 14: pel_aspect_ratio = 1.2015; break;
+      default: pel_aspect_ratio = 1.0000; break; /* fallback */
+    }
+    sequence->ratio = ((double)sequence->coded_width/(double)sequence->coded_height)/pel_aspect_ratio;
+  }
+  else {
+    switch ( sequence->aspect_ratio_information ) {
+      case 1:  sequence->ratio = sequence->have_display_extension
+                               ? ((double)sequence->display_horizontal_size/(double)sequence->display_vertical_size)/1.0
+                               : ((double)sequence->coded_width/(double)sequence->coded_height)/1.0;
+                               break;
+      case 2:  sequence->ratio = 4.0/3.0;  break;
+      case 3:  sequence->ratio = 16.0/9.0; break;
+      case 4:  sequence->ratio = 2.21;     break;
+      default: sequence->ratio = ((double)sequence->coded_width/(double)sequence->coded_height)/1.0;
+    }
+  }
+
+  if ( sequence->have_header == 1 ) {
+    sequence->have_header = 2; /* 2 = stream info already published */
+    _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_VIDEO_WIDTH, sequence->coded_width );
+    _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, sequence->coded_height );
+    _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_VIDEO_RATIO, ((double)10000*sequence->ratio) );
+    _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_FRAME_DURATION, (sequence->reported_video_step = sequence->video_step) );
+    _x_meta_info_set_utf8( this_gen->stream, XINE_META_INFO_VIDEOCODEC, "MPEG1/2 (vdpau)" );
+    xine_event_t event;
+    xine_format_change_data_t data;
+    event.type = XINE_EVENT_FRAME_FORMAT_CHANGE;
+    event.stream = this_gen->stream;
+    event.data = &data;
+    event.data_length = sizeof(data);
+    data.width = sequence->coded_width;
+    data.height = sequence->coded_height;
+    data.aspect = sequence->ratio;
+    xine_event_send( this_gen->stream, &event );
+  }
+  else if ( sequence->have_header == 2 && sequence->reported_video_step != sequence->video_step ) {
+    _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_FRAME_DURATION, (sequence->reported_video_step = sequence->video_step) );
+  }
+}
+/* Handle a picture start code. A picture collected so far (a frame, or both fields of a field pair) */
+/* is passed to picture_ready() and cleared first; the header is then parsed into vdp_infos, or into */
+/* vdp_infos2 when only the first field of a pair has been seen so far.                              */
+static void picture_header( vdpau_mpeg12_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  /* recompute size, frame duration and aspect ratio from the sequence fields parsed so far */
+  process_sequence_mpeg12_dependent_data(this_gen);
+  /* with the MPEG-1 profile every picture is handled as a frame picture */
+  if ( sequence->profile==VDP_DECODER_PROFILE_MPEG1 )
+    sequence->picture.vdp_infos.picture_structure = PICTURE_FRAME;
+
+  VdpPictureInfoMPEG1Or2 *infos = &sequence->picture.vdp_infos;
+  /* decide from the picture_structure values stored so far whether a picture is complete */
+  if ( sequence->picture.vdp_infos.picture_structure==PICTURE_FRAME ) {
+	picture_ready( this_gen, 0 );
+    reset_picture( &sequence->picture );
+  }
+  else if ( sequence->picture.vdp_infos.picture_structure && sequence->picture.vdp_infos2.picture_structure ) { /* both fields described */
+	picture_ready( this_gen, 0 );
+    reset_picture( &sequence->picture );
+  }
+  else if ( sequence->picture.vdp_infos.picture_structure ) { /* only the first field so far: this header starts the second */
+    infos = &sequence->picture.vdp_infos2;
+	sequence->picture.slices_pos_top = sequence->picture.slices_pos; /* slices copied from here on belong to the second field */
+
+    sequence->cur_pts = 0; /* ignore pts of second field */
+  }
+
+  /* take over pts for next issued image */ 
+  if ( sequence->cur_pts ) {
+    sequence->seq_pts = sequence->cur_pts;
+    sequence->cur_pts = 0;
+  }
+  /* parse the header fields in bitstream order */
+  bits_reader_set( &sequence->br, buf, len );
+  int tmp = read_bits( &sequence->br, 10 );
+  lprintf( "temporal_reference: %d\n", tmp );
+  infos->picture_coding_type = read_bits( &sequence->br, 3 );
+  lprintf( "picture_coding_type: %d\n", infos->picture_coding_type );
+  infos->forward_reference = VDP_INVALID_HANDLE; /* the references are filled in later by decode_picture() */
+  infos->backward_reference = VDP_INVALID_HANDLE;
+  skip_bits( &sequence->br, 16 ); /* 16 bits skipped unparsed (presumably vbv_delay -- TODO confirm) */
+  if ( infos->picture_coding_type > I_FRAME ) { /* coding types above I_FRAME carry forward vector info */
+    infos->full_pel_forward_vector = read_bits( &sequence->br, 1 );
+    infos->f_code[0][0] = infos->f_code[0][1] = read_bits( &sequence->br, 3 );
+    if ( infos->picture_coding_type==B_FRAME ) { /* B pictures additionally carry backward vector info */
+      infos->full_pel_backward_vector = read_bits( &sequence->br, 1 );
+      infos->f_code[1][0] = infos->f_code[1][1] = read_bits( &sequence->br, 3 );
+    }
+  }
+  else {
+    infos->full_pel_forward_vector = 0;
+    infos->full_pel_backward_vector = 0;
+  }
+}
+
+/* Parse an MPEG-2 sequence extension: profile, progressive_sequence, chroma format, the picture size
+ * extension bits (read later by process_sequence_mpeg12_dependent_data) and the frame rate extension. */
+static void sequence_extension( sequence_t *sequence, uint8_t *buf, int len )
+{
+  bits_reader_set( &sequence->br, buf, len );
+  int tmp = read_bits( &sequence->br, 4 );
+  lprintf( "extension_start_code_identifier: %d\n", tmp );
+  skip_bits( &sequence->br, 1 ); /* one bit ahead of the profile is not evaluated */
+  switch ( read_bits( &sequence->br, 3 ) ) {
+    case 5: sequence->profile = VDP_DECODER_PROFILE_MPEG2_SIMPLE; break;
+    default: sequence->profile = VDP_DECODER_PROFILE_MPEG2_MAIN;
+  }
+  skip_bits( &sequence->br, 4 ); /* four bits after the profile are not evaluated (presumably the level) */
+  sequence->progressive_sequence = read_bits( &sequence->br, 1 );
+  lprintf( "progressive_sequence: %d\n", sequence->progressive_sequence );
+  if ( read_bits( &sequence->br, 2 ) == 2 ) /* chroma format code 2 selects 4:2:2; other codes leave chroma unchanged */
+    sequence->chroma = VO_CHROMA_422;
+  sequence->horizontal_size_extension = read_bits( &sequence->br, 2 ); /* stored: these are the high bits of the width */
+  lprintf( "horizontal_size_extension: %d\n", sequence->horizontal_size_extension );
+  sequence->vertical_size_extension = read_bits( &sequence->br, 2 ); /* stored: these are the high bits of the height */
+  lprintf( "vertical_size_extension: %d\n", sequence->vertical_size_extension );
+  tmp = read_bits( &sequence->br, 12 );
+  lprintf( "bit_rate_extension: %d\n", tmp );
+  tmp = read_bits( &sequence->br, 1 );
+  lprintf( "marker_bit: %d\n", tmp );
+  tmp = read_bits( &sequence->br, 8 );
+  lprintf( "vbv_buffer_size_extension: %d\n", tmp );
+  tmp = read_bits( &sequence->br, 1 );
+  lprintf( "low_delay: %d\n", tmp );
+  sequence->frame_rate_extension_n = read_bits( &sequence->br, 2 );
+  lprintf( "frame_rate_extension_n: %d\n", sequence->frame_rate_extension_n );
+  sequence->frame_rate_extension_d = read_bits( &sequence->br, 5 );
+  lprintf( "frame_rate_extension_d: %d\n", sequence->frame_rate_extension_d );
+}
+
+/* Parse an MPEG-2 picture coding extension into the info struct of the field being described; */
+/* repeat_first_field and progressive_frame are stored once per picture instead.                */
+static void picture_coding_extension( sequence_t *sequence, uint8_t *buf, int len )
+{
+  VdpPictureInfoMPEG1Or2 *infos = &sequence->picture.vdp_infos;
+  if ( infos->picture_structure && infos->picture_structure!=PICTURE_FRAME ) /* a first field is already described */
+    infos = &sequence->picture.vdp_infos2;
+  /* the fields below are read strictly in bitstream order; do not reorder the read_bits() calls */
+  bits_reader_set( &sequence->br, buf, len );
+  int tmp = read_bits( &sequence->br, 4 );
+  lprintf( "extension_start_code_identifier: %d\n", tmp );
+  infos->f_code[0][0] = read_bits( &sequence->br, 4 ); /* replaces the f_code values set by picture_header() */
+  infos->f_code[0][1] = read_bits( &sequence->br, 4 );
+  infos->f_code[1][0] = read_bits( &sequence->br, 4 );
+  infos->f_code[1][1] = read_bits( &sequence->br, 4 );
+  lprintf( "f_code_0_0: %d\n", infos->f_code[0][0] );
+  lprintf( "f_code_0_1: %d\n", infos->f_code[0][1] );
+  lprintf( "f_code_1_0: %d\n", infos->f_code[1][0] );
+  lprintf( "f_code_1_1: %d\n", infos->f_code[1][1] );
+  infos->intra_dc_precision = read_bits( &sequence->br, 2 );
+  lprintf( "intra_dc_precision: %d\n", infos->intra_dc_precision );
+  infos->picture_structure = read_bits( &sequence->br, 2 ); /* picture_header() uses this to detect field pairs */
+  lprintf( "picture_structure: %d\n", infos->picture_structure );
+  infos->top_field_first = read_bits( &sequence->br, 1 );
+  lprintf( "top_field_first: %d\n", infos->top_field_first );
+  infos->frame_pred_frame_dct = read_bits( &sequence->br, 1 );
+  lprintf( "frame_pred_frame_dct: %d\n", infos->frame_pred_frame_dct );
+  infos->concealment_motion_vectors = read_bits( &sequence->br, 1 );
+  lprintf( "concealment_motion_vectors: %d\n", infos->concealment_motion_vectors );
+  infos->q_scale_type = read_bits( &sequence->br, 1 );
+  lprintf( "q_scale_type: %d\n", infos->q_scale_type );
+  infos->intra_vlc_format = read_bits( &sequence->br, 1 );
+  lprintf( "intra_vlc_format: %d\n", infos->intra_vlc_format );
+  infos->alternate_scan = read_bits( &sequence->br, 1 );
+  lprintf( "alternate_scan: %d\n", infos->alternate_scan );
+  sequence->picture.repeat_first_field = read_bits( &sequence->br, 1 ); /* kept on the picture, not per field */
+  lprintf( "repeat_first_field: %d\n", sequence->picture.repeat_first_field );
+  tmp = read_bits( &sequence->br, 1 ); /* only traced, not stored */
+  lprintf( "chroma_420_type: %d\n", tmp );
+  sequence->picture.progressive_frame = read_bits( &sequence->br, 1 ); /* kept on the picture, not per field */
+  lprintf( "progressive_frame: %d\n", sequence->picture.progressive_frame );
+}
+
+
+
+/*
+ * Parse a quant matrix extension. Only a matrix whose load flag is set is replaced (in both
+ * vdp_infos and vdp_infos2); with the flag clear the matrix currently in use is kept, so a custom
+ * matrix taken from the sequence header is not overwritten with default values here.
+ */
+static void quant_matrix_extension( sequence_t *sequence, uint8_t *buf, int len )
+{
+  VdpPictureInfoMPEG1Or2 *infos = &sequence->picture.vdp_infos;
+  VdpPictureInfoMPEG1Or2 *infos2 = &sequence->picture.vdp_infos2;
+  int i, j;
+
+  bits_reader_set( &sequence->br, buf, len );
+  skip_bits( &sequence->br, 4 ); /* extension_start_code_identifier, already examined by parse_code() */
+
+  i = read_bits( &sequence->br, 1 );
+  lprintf( "load_intra_quantizer_matrix: %d\n", i );
+  if ( i ) {
+    for ( j=0; j<64; ++j ) {
+      infos2->intra_quantizer_matrix[mpeg2_scan_norm[j]] = infos->intra_quantizer_matrix[mpeg2_scan_norm[j]] = read_bits( &sequence->br, 8 );
+    }
+  }
+  /* flag clear: no change -- the defaults are only (re)installed by sequence_header() */
+
+  i = read_bits( &sequence->br, 1 );
+  lprintf( "load_non_intra_quantizer_matrix: %d\n", i );
+  if ( i ) {
+    for ( j=0; j<64; ++j ) {
+      infos2->non_intra_quantizer_matrix[mpeg2_scan_norm[j]] = infos->non_intra_quantizer_matrix[mpeg2_scan_norm[j]] = read_bits( &sequence->br, 8 );
+    }
+  }
+}
+/* Append one slice (buf starts at its start code) to the picture's slice buffer, growing it when needed. */
+static void copy_slice( sequence_t *sequence, uint8_t *buf, int len )
+{
+  int size = sequence->picture.slices_pos+len;
+  if ( sequence->picture.slices_size < size ) {
+    uint8_t *grown = realloc( sequence->picture.slices, size+1024 ); /* keep the old pointer until this succeeded */
+    if ( !grown ) return; /* out of memory: drop this slice instead of copying through NULL */
+    sequence->picture.slices = grown;
+    sequence->picture.slices_size = size+1024;
+  }
+  xine_fast_memcpy( sequence->picture.slices+sequence->picture.slices_pos, buf, len );
+  sequence->picture.slices_pos += len;
+  if ( sequence->picture.slices_pos_top ) /* set by picture_header() once the second field has started */
+    sequence->picture.slices_count2++;
+  else
+    sequence->picture.slices_count++;
+}
+
+
+
+/* Dispatch one start-code unit (buf points at its 00 00 01 prefix, len is its size); always returns 0. */
+static int parse_code( vdpau_mpeg12_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  if ( len < 4 ) return 0; /* truncated unit: buf[3] would lie outside of it */
+  if ( !sequence->have_header && buf[3]!=sequence_header_code ) {
+    lprintf( " ----------- no sequence header yet.\n" );
+    return 0;
+  }
+  if ( (buf[3] >= begin_slice_start_code) && (buf[3] <= end_slice_start_code) ) {
+    lprintf( " ----------- slice_start_code\n" );
+    copy_slice( sequence, buf, len ); /* slices are collected with their start code */
+    return 0;
+  }
+
+  switch ( buf[3] ) {
+    case sequence_header_code:
+      lprintf( " ----------- sequence_header_code\n" );
+      sequence_header( this_gen, buf+4, len-4 );
+      break;
+    case extension_start_code:
+      if ( len < 5 ) break; /* no room for the extension identifier in buf[4] */
+      switch ( buf[4]>>4 ) {
+        case sequence_ext_sc:
+          lprintf( " ----------- sequence_extension_start_code\n" );
+          sequence_extension( sequence, buf+4, len-4 );
+          break;
+        case quant_matrix_ext_sc:
+          lprintf( " ----------- quant_matrix_extension_start_code\n" );
+          quant_matrix_extension( sequence, buf+4, len-4 );
+          break;
+        case picture_coding_ext_sc:
+          lprintf( " ----------- picture_coding_extension_start_code\n" );
+          picture_coding_extension( sequence, buf+4, len-4 );
+          break;
+        case sequence_display_ext_sc:
+          lprintf( " ----------- sequence_display_extension_start_code\n" );
+          break;
+      }
+      break;
+    case user_data_start_code:
+      lprintf( " ----------- user_data_start_code\n" );
+      break;
+    case group_start_code:
+      lprintf( " ----------- group_start_code\n" );
+      break;
+    case picture_start_code:
+      lprintf( " ----------- picture_start_code\n" );
+      picture_header( this_gen, buf+4, len-4 );
+      break;
+    case sequence_error_code:
+      lprintf( " ----------- sequence_error_code\n" );
+      break;
+    case sequence_end_code:
+      lprintf( " ----------- sequence_end_code\n" );
+      break;
+  }
+  return 0;
+}
+
+/* Submit the collected slices to the VDPAU decoder, (re)creating it first when profile or size    */
+/* changed. A field pair is rendered in two calls onto the same surface: vdp_infos, then vdp_infos2. */
+static void decode_render( vdpau_mpeg12_decoder_t *vd, vdpau_accel_t *accel )
+{
+  sequence_t *seq = (sequence_t*)&vd->sequence;
+  picture_t *pic = (picture_t*)&seq->picture;
+  /* hand the per-field slice counters gathered by copy_slice() to VDPAU */
+  pic->vdp_infos.slice_count = pic->slices_count;
+  pic->vdp_infos2.slice_count = pic->slices_count2;
+  /* the decoder is created lazily and recreated whenever profile, width or height differ */
+  VdpStatus st;
+  if ( vd->decoder==VDP_INVALID_HANDLE || vd->decoder_profile!=seq->profile || vd->decoder_width!=seq->coded_width || vd->decoder_height!=seq->coded_height ) {
+    if ( vd->decoder!=VDP_INVALID_HANDLE ) {
+      accel->vdp_decoder_destroy( vd->decoder );
+      vd->decoder = VDP_INVALID_HANDLE;
+    }
+    st = accel->vdp_decoder_create( accel->vdp_device, seq->profile, seq->coded_width, seq->coded_height, 2, &vd->decoder);
+    if ( st!=VDP_STATUS_OK ) /* NOTE(review): only traced; rendering below is still attempted */
+      lprintf( "failed to create decoder !! %s\n", accel->vdp_get_error_string( st ) );
+    else {
+      vd->decoder_profile = seq->profile;
+      vd->decoder_width = seq->coded_width;
+      vd->decoder_height = seq->coded_height;
+      seq->vdp_runtime_nr = accel->vdp_runtime_nr; /* remember which runtime this decoder was created under */
+    }
+  }
+  /* first render call: the whole frame, or the first field (bytes up to slices_pos_top) */
+  VdpBitstreamBuffer vbit;
+  vbit.struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+  vbit.bitstream = pic->slices;
+  vbit.bitstream_bytes = (pic->vdp_infos.picture_structure==PICTURE_FRAME)? pic->slices_pos : pic->slices_pos_top;
+  st = accel->vdp_decoder_render( vd->decoder, accel->surface, (VdpPictureInfo*)&pic->vdp_infos, 1, &vbit );
+  if ( st!=VDP_STATUS_OK )
+    lprintf( "decoder failed : %d!! %s\n", st, accel->vdp_get_error_string( st ) );
+  else {
+    lprintf( "DECODER SUCCESS : frame_type:%d, slices=%d, slices_bytes=%d, current=%d, forwref:%d, backref:%d, pts:%lld\n",
+      pic->vdp_infos.picture_coding_type, pic->vdp_infos.slice_count, vbit.bitstream_bytes, accel->surface, pic->vdp_infos.forward_reference, pic->vdp_infos.backward_reference, seq->cur_pts );
+    VdpPictureInfoMPEG1Or2 *info = &pic->vdp_infos;
+    lprintf("%d %d %d %d %d %d %d %d %d %d %d %d %d\n", info->intra_dc_precision, info->frame_pred_frame_dct, info->concealment_motion_vectors,
+      info->intra_vlc_format, info->alternate_scan, info->q_scale_type, info->top_field_first, info->full_pel_forward_vector,
+      info->full_pel_backward_vector, info->f_code[0][0], info->f_code[0][1], info->f_code[1][0], info->f_code[1][1] );
+  }
+  /* second render call, field pictures only: the bytes collected after slices_pos_top */
+  if ( pic->vdp_infos.picture_structure != PICTURE_FRAME ) {
+    pic->vdp_infos2.backward_reference = VDP_INVALID_HANDLE;
+    pic->vdp_infos2.forward_reference = VDP_INVALID_HANDLE;
+    if ( pic->vdp_infos2.picture_coding_type==P_FRAME ) {
+      if ( pic->vdp_infos.picture_coding_type==I_FRAME ) /* P field after an I field: predict from the surface just rendered */
+        pic->vdp_infos2.forward_reference = accel->surface;
+      else
+        pic->vdp_infos2.forward_reference = pic->vdp_infos.forward_reference;
+    }
+    else if ( pic->vdp_infos.picture_coding_type==B_FRAME ) { /* NOTE(review): tests the first field's type, the branch above the second's -- confirm */
+      pic->vdp_infos2.forward_reference = pic->vdp_infos.forward_reference;
+      pic->vdp_infos2.backward_reference = pic->vdp_infos.backward_reference;
+    }
+    vbit.struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+    vbit.bitstream = pic->slices+pic->slices_pos_top;
+    vbit.bitstream_bytes = pic->slices_pos-pic->slices_pos_top;
+    st = accel->vdp_decoder_render( vd->decoder, accel->surface, (VdpPictureInfo*)&pic->vdp_infos2, 1, &vbit );
+    if ( st!=VDP_STATUS_OK )
+      lprintf( "decoder failed : %d!! %s\n", st, accel->vdp_get_error_string( st ) );
+    else
+      lprintf( "DECODER SUCCESS : frame_type:%d, slices=%d, current=%d, forwref:%d, backref:%d, pts:%lld\n",
+        pic->vdp_infos2.picture_coding_type, pic->vdp_infos2.slice_count, accel->surface, pic->vdp_infos2.forward_reference, pic->vdp_infos2.backward_reference, seq->cur_pts );
+  }
+}
+
+/* Decode the collected picture into a new output frame and pass frames on for display. Pictures   */
+/* other than B are held back as reference frames; B pictures are drawn and released right away.    */
+static void decode_picture( vdpau_mpeg12_decoder_t *vd, uint8_t end_of_sequence )
+{
+  sequence_t *seq = (sequence_t*)&vd->sequence;
+  picture_t *pic = (picture_t*)&seq->picture;
+  vdpau_accel_t *ref_accel;
+
+  if ( seq->profile == VDP_DECODER_PROFILE_MPEG1 )
+    pic->vdp_infos.picture_structure=PICTURE_FRAME;
+  /* resolve reference surfaces; a picture whose references are missing is skipped silently */
+  if ( pic->vdp_infos.picture_coding_type==P_FRAME ) {
+    if ( seq->backward_ref ) { /* the most recently stored reference is the forward reference of a P picture */
+      ref_accel = (vdpau_accel_t*)seq->backward_ref->accel_data;
+      pic->vdp_infos.forward_reference = ref_accel->surface;
+    }
+    else
+      return;
+  }
+  else if ( pic->vdp_infos.picture_coding_type==B_FRAME ) { /* B pictures need both stored references */
+    if ( seq->forward_ref ) {
+      ref_accel = (vdpau_accel_t*)seq->forward_ref->accel_data;
+      pic->vdp_infos.forward_reference = ref_accel->surface;
+    }
+    else
+      return;
+    if ( seq->backward_ref ) {
+      ref_accel = (vdpau_accel_t*)seq->backward_ref->accel_data;
+      pic->vdp_infos.backward_reference = ref_accel->surface;
+    }
+    else
+      return;
+  }
+  /* fetch the output frame; seq->reset carries VO_NEW_SEQUENCE_FLAG after a reset */
+  int still_image = (end_of_sequence) ? VO_STILL_IMAGE : 0;
+  vo_frame_t *img = vd->stream->video_out->get_frame( vd->stream->video_out, seq->coded_width, seq->coded_height,
+                                                      seq->ratio, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS | seq->chroma | seq->reset | still_image );
+  seq->reset = 0;                                                      /* the flag is passed with one frame only */
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  if ( !seq->accel_vdpau ) /* remember the accel of the first frame obtained */
+    seq->accel_vdpau = accel;
+  /* runtime number changed: drop the references and forget the decoder handle without destroying it */
+  if( seq->vdp_runtime_nr != *(seq->accel_vdpau->current_vdp_runtime_nr) ) {
+    seq->accel_vdpau = accel;
+    if ( seq->forward_ref )
+      seq->forward_ref->free( seq->forward_ref );
+    seq->forward_ref = NULL;
+    if ( seq->backward_ref )
+      seq->backward_ref->free( seq->backward_ref );
+    seq->backward_ref = NULL;
+    vd->decoder = VDP_INVALID_HANDLE; /* decode_render() will create a new decoder */
+  }
+
+  decode_render( vd, accel );
+
+#ifdef MAKE_DAT
+  if ( nframes==0 ) {
+    fwrite( &seq->coded_width, 1, sizeof(seq->coded_width), outfile );
+    fwrite( &seq->coded_height, 1, sizeof(seq->coded_height), outfile );
+    fwrite( &seq->ratio, 1, sizeof(seq->ratio), outfile );
+    fwrite( &seq->profile, 1, sizeof(seq->profile), outfile );
+  }
+
+  if ( nframes++ < 25 ) {
+    fwrite( &pic->vdp_infos, 1, sizeof(pic->vdp_infos), outfile );
+    fwrite( &pic->slices_pos, 1, sizeof(pic->slices_pos), outfile );
+    fwrite( pic->slices, 1, pic->slices_pos, outfile );
+  }
+#endif
+  /* fill in the frame properties */
+  img->drawn = 0;
+  img->pts = seq->seq_pts; /* pts taken over in picture_header() */
+  seq->seq_pts = 0; /* reset */
+  img->bad_frame = 0;
+
+  if ( end_of_sequence ) { /* the stored reference is released, so an I picture below is drawn immediately */
+    if ( seq->backward_ref )
+      seq->backward_ref->free( seq->backward_ref );
+    seq->backward_ref = NULL;
+  }
+
+#if 0
+  /* trying to deal with (french) buggy streams that randomly set bottom_field_first
+     while stream is top_field_first. So we assume that when top_field_first
+     is set one time, the stream _is_ top_field_first. */
+  lprintf("pic->vdp_infos.top_field_first = %d\n", pic->vdp_infos.top_field_first);
+  if ( pic->vdp_infos.top_field_first )
+    seq->top_field_first = 1;
+  img->top_field_first = seq->top_field_first;
+#else
+  img->top_field_first = pic->vdp_infos.top_field_first;
+#endif
+
+  /* progressive_frame is unreliable with most mpeg2 streams */
+  if ( pic->vdp_infos.picture_structure!=PICTURE_FRAME )
+    img->progressive_frame = 0;
+  else
+    img->progressive_frame = pic->progressive_frame;
+
+  img->repeat_first_field = pic->repeat_first_field;
+  /* frame duration starts from video_step and is stretched below for repeat_first_field */
+  double duration = seq->video_step;
+
+  if ( img->repeat_first_field ) {
+    if( !seq->progressive_sequence && pic->progressive_frame ) {
+      /* decoder should output 3 fields, so adjust duration to
+         count on this extra field time */
+      duration *= 3;
+      duration /= 2;
+    } else if ( seq->progressive_sequence ) {
+      /* for progressive sequences the output should repeat the
+         frame 1 or 2 times depending on top_field_first flag. */
+      duration *= (pic->vdp_infos.top_field_first ? 3 : 2);
+    }
+  }
+
+  img->duration = (int)(duration + .5); /* rounded to the nearest integer */
+  /* output: pictures other than B become the new backward_ref; the reference they replace is drawn now */
+  if ( pic->vdp_infos.picture_coding_type!=B_FRAME ) {
+    if ( pic->vdp_infos.picture_coding_type==I_FRAME && !seq->backward_ref ) { /* I picture with no stored reference: draw at once, with pts 0 */
+      img->pts = 0;
+      img->draw( img, vd->stream );
+      ++img->drawn;
+    }
+    if ( seq->forward_ref ) { /* the older reference is no longer needed */
+      seq->forward_ref->drawn = 0;
+      seq->forward_ref->free( seq->forward_ref );
+    }
+    seq->forward_ref = seq->backward_ref;
+    if ( seq->forward_ref && !seq->forward_ref->drawn ) { /* draw the previous reference unless it was already drawn */
+      seq->forward_ref->draw( seq->forward_ref, vd->stream );
+    }
+    seq->backward_ref = img; /* kept until the next picture that is not a B picture */
+  }
+  else { /* B pictures are never used as references */
+    img->draw( img, vd->stream );
+    img->free( img );
+  }
+}
+
+
+
+/* Decode the collected picture, unless it has no slices or its second field has none yet. */
+static void picture_ready( vdpau_mpeg12_decoder_t *vd, uint8_t end_of_sequence )
+{
+  picture_t *pic = (picture_t*)&vd->sequence.picture;
+  int first_empty = !pic->slices_count;
+  int second_missing = pic->vdp_infos2.picture_structure && !pic->slices_count2;
+  if ( !first_empty && !second_missing )
+    decode_picture( vd, end_of_sequence );
+}
+
+
+
+/*
+ * This function receives a buffer of data from the demuxer layer, appends it to the
+ * sequence buffer and hands every complete start-code unit found there to parse_code().
+ */
+static void vdpau_mpeg12_decode_data (video_decoder_t *this_gen, buf_element_t *buf)
+{
+  vdpau_mpeg12_decoder_t *this = (vdpau_mpeg12_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t*)&this->sequence;
+
+  /* preview buffers shall not be decoded and drawn -- use them only to supply stream information */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if ( !buf->size )
+    return;
+  if ( buf->pts )
+    seq->cur_pts = buf->pts;
+
+  int size = seq->bufpos+buf->size;
+  if ( seq->bufsize < size ) {
+    uint8_t *grown = realloc( seq->buf, size+1024 ); /* keep the old pointer until this succeeded */
+    if ( !grown )
+      return; /* out of memory: the old buffer stays valid and this chunk is dropped */
+    seq->buf = grown;
+    seq->bufsize = size+1024;
+  }
+  xine_fast_memcpy( seq->buf+seq->bufpos, buf->content, buf->size );
+  seq->bufpos += buf->size;
+
+  while ( seq->bufseek <= seq->bufpos-4 ) {
+    uint8_t *buffer = seq->buf+seq->bufseek;
+    if ( buffer[0]==0 && buffer[1]==0 && buffer[2]==1 ) {
+      if ( seq->start<0 ) {
+        seq->start = seq->bufseek;
+      }
+      else {
+        parse_code( this, seq->buf+seq->start, seq->bufseek-seq->start );
+        /* drop the consumed unit by sliding the rest to the front; the ranges overlap, hence memmove */
+        memmove( seq->buf, seq->buf+seq->bufseek, seq->bufpos-seq->bufseek );
+        seq->bufpos -= seq->bufseek;
+        seq->start = -1;
+        seq->bufseek = -1; /* the ++ below restarts the scan at offset 0 */
+      }
+    }
+    ++seq->bufseek;
+  }
+
+  /* still image detection -- don't wait for further data if buffer ends in sequence end code */
+  if (seq->start >= 0 && seq->buf[seq->start + 3] == sequence_end_code) {
+    decode_picture(this, 1);
+    parse_code(this, seq->buf+seq->start, 4);
+    seq->start = -1;
+  }
+}
+
+/*
+ * This function is called when xine needs to flush the system.
+ */
+static void vdpau_mpeg12_flush (video_decoder_t *this_gen) {
+  (void)this_gen; /* the instance is not needed: this implementation only traces the call */
+
+  lprintf( "vdpau_mpeg12_flush\n" );
+}
+
+/*
+ * Full decoder reset: forget timestamps, buffered bitstream data and both reference frames.
+ */
+static void vdpau_mpeg12_reset (video_decoder_t *this_gen) {
+  vdpau_mpeg12_decoder_t *decoder = (vdpau_mpeg12_decoder_t *) this_gen;
+  sequence_t *seq = &decoder->sequence;
+  lprintf( "vdpau_mpeg12_reset\n" );
+  reset_sequence( seq, 1 );
+}
+
+/*
+ * Discontinuity: forget stored pts values (also on the held reference frames), keep everything else.
+ */
+static void vdpau_mpeg12_discontinuity (video_decoder_t *this_gen) {
+  vdpau_mpeg12_decoder_t *decoder = (vdpau_mpeg12_decoder_t *) this_gen;
+  sequence_t *seq = &decoder->sequence;
+  lprintf( "vdpau_mpeg12_discontinuity\n" );
+  reset_sequence( seq, 0 );
+}
+
+/*
+ * Tear down a decoder instance: destroy the VDPAU decoder, release the reference frames,
+ * close the video output for this stream and free the buffers owned by the instance.
+ */
+static void vdpau_mpeg12_dispose (video_decoder_t *this_gen) {
+  vdpau_mpeg12_decoder_t *this = (vdpau_mpeg12_decoder_t *) this_gen;
+  sequence_t *seq = &this->sequence;
+
+  lprintf( "vdpau_mpeg12_dispose\n" );
+  /* the decoder is destroyed through the accel functions remembered from an earlier frame */
+  if ( seq->accel_vdpau && this->decoder!=VDP_INVALID_HANDLE ) {
+    seq->accel_vdpau->vdp_decoder_destroy( this->decoder );
+    this->decoder = VDP_INVALID_HANDLE;
+  }
+
+  free_sequence( seq );
+  this->stream->video_out->close( this->stream->video_out, this->stream );
+
+  free( seq->picture.slices );
+  free( seq->buf );
+  free( this_gen );
+}
+
+/*
+ * This function allocates, initializes, and returns a private video
+ * decoder structure. It returns NULL when the video output lacks VDPAU MPEG-1/2
+ * support, when no VDPAU decoder can be created, or when memory runs out.
+ */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  vdpau_mpeg12_decoder_t  *this ;
+
+  lprintf( "open_plugin\n" );
+
+  /* the videoout must be vdpau-capable to support this decoder */
+  if ( !(stream->video_driver->get_capabilities(stream->video_driver) & VO_CAP_VDPAU_MPEG12) )
+    return NULL;
+
+  /* now check if vdpau has free decoder resource */
+  vo_frame_t *img = stream->video_out->get_frame( stream->video_out, 1920, 1080, 1, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS );
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  int runtime_nr = accel->vdp_runtime_nr;
+  VdpDecoder decoder;
+  VdpStatus st = accel->vdp_decoder_create( accel->vdp_device, VDP_DECODER_PROFILE_MPEG2_MAIN, 1920, 1080, 2, &decoder );
+  if ( st==VDP_STATUS_OK )
+    accel->vdp_decoder_destroy( decoder );
+  img->free(img); /* release the frame only after the last use of its accel_data */
+  if ( st!=VDP_STATUS_OK ) {
+    lprintf( "can't create vdpau decoder.\n" );
+    return NULL;
+  }
+
+  this = calloc(1, sizeof(*this));
+  if ( !this )
+    return NULL;
+
+  this->video_decoder.decode_data         = vdpau_mpeg12_decode_data;
+  this->video_decoder.flush               = vdpau_mpeg12_flush;
+  this->video_decoder.reset               = vdpau_mpeg12_reset;
+  this->video_decoder.discontinuity       = vdpau_mpeg12_discontinuity;
+  this->video_decoder.dispose             = vdpau_mpeg12_dispose;
+
+  this->stream                            = stream;
+  this->class                             = (vdpau_mpeg12_class_t *) class_gen;
+
+  this->sequence.buf = malloc( 1024 );
+  this->sequence.bufsize = this->sequence.buf ? 1024 : 0; /* no capacity if malloc failed; decode_data reallocates */
+  this->sequence.forward_ref = NULL;
+  this->sequence.backward_ref = NULL;
+  this->sequence.vdp_runtime_nr = runtime_nr;
+  free_sequence( &this->sequence ); /* also serves as the initial setup of the sequence defaults */
+  this->sequence.ratio = 1;
+  this->sequence.reset = VO_NEW_SEQUENCE_FLAG;
+
+  init_picture( &this->sequence.picture );
+  this->decoder = VDP_INVALID_HANDLE; /* the real decoder is created on demand in decode_render() */
+  this->sequence.accel_vdpau = NULL;
+  (stream->video_out->open)(stream->video_out, stream);
+
+#ifdef MAKE_DAT
+  outfile = fopen( "/tmp/mpg.dat","w");
+  nframes = 0;
+#endif
+
+  return &this->video_decoder;
+}
+
+/*
+ * This function allocates a private video decoder class and initializes
+ * the class's member functions. It returns NULL if the allocation fails.
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+
+  vdpau_mpeg12_class_t *this = calloc(1, sizeof(*this));
+  if ( !this )
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "vdpau_mpeg12";
+  this->decoder_class.description     =
+	N_("vdpau_mpeg12: mpeg1/2 decoder plugin using VDPAU hardware decoding.\n"
+	   "Must be used along with video_out_vdpau.");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+/*
+ * This is a list of all of the internal xine video buffer types that
+ * this decoder is able to handle. Check src/xine-engine/buffer.h for a
+ * list of valid buffer types (and add a new one if the one you need does
+ * not exist). Terminate the list with a 0.
+ */
+static const uint32_t video_types[] = {
+  BUF_VIDEO_MPEG, /* the only buffer type this plugin registers for */
+  0 /* list terminator */
+};
+
+/*
+ * This data structure combines the list of supported xine buffer types and
+ * the priority that the plugin should be given with respect to other
+ * plugins that handle the same buffer type. A plugin with priority (n+1)
+ * will be used instead of a plugin with priority (n).
+ */
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types: the 0-terminated list defined above */
+  8                    /* priority        */
+};
+
+/*
+ * The plugin catalog entry. This is the only information that this plugin
+ * will export to the public.
+ */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* { type, API, "name", version, special_info, init_function } */
+  { PLUGIN_VIDEO_DECODER, 19, "vdpau_mpeg12", XINE_VERSION_CODE, &dec_info_video, init_plugin }, /* the MPEG-1/2 VDPAU video decoder */
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL } /* PLUGIN_NONE entry closing the list */
+};
diff --git a/src/video_dec/libvdpau/vdpau_mpeg4.c b/src/video_dec/libvdpau/vdpau_mpeg4.c
new file mode 100644
index 000000000..4d7dee1ed
--- /dev/null
+++ b/src/video_dec/libvdpau/vdpau_mpeg4.c
@@ -0,0 +1,1194 @@
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; remove-trailing-space on;
+ *
+ * Copyright (C) 2010 the xine project
+ * Copyright (C) 2010 Christophe Thommeret <hftom@free.fr>
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * vdpau_mpeg4.c, a mpeg4-part-2 video stream parser using VDPAU hardware decoder
+ *
+ */
+
+/*#define LOG*/
+#define LOG_MODULE "vdpau_mpeg4"
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "accel_vdpau.h"
+#include "bits_reader.h"
+
+#include <vdpau/vdpau.h>
+
+#define begin_vo_start_code         0x00 /* start-code values: compared with the byte after 00 00 01 in parse_code() */
+#define end_vo_start_code           0x1f
+#define begin_vol_start_code        0x20
+#define end_vol_start_code          0x2f
+#define viso_sequence_start_code    0xb0
+#define viso_sequence_end_code      0xb1
+#define viso_start_code             0xb5
+#define group_start_code            0xb3
+#define user_data_start_code        0xb2
+#define vop_start_code              0xb6
+
+#define I_FRAME   0 /* values of vdp_infos.vop_coding_type */
+#define P_FRAME   1
+#define B_FRAME   2
+
+#define PICTURE_TOP     1
+#define PICTURE_BOTTOM  2
+#define PICTURE_FRAME   3
+
+#define SHAPE_RECT    0 /* values of picture_t.vol_shape (2-bit field read in video_object_layer()) */
+#define SHAPE_BIN     1
+#define SHAPE_BINONLY 2
+#define SHAPE_GRAY    3
+
+#define SPRITE_STATIC 1 /* values picture_t.sprite_enable is compared against */
+#define SPRITE_GMC    2
+
+static int nframe; /* running picture counter shown in decode_render()'s debug output */
+
+/*#define MAKE_DAT*/ /* do NOT define this unless you know what you are doing */
+#ifdef MAKE_DAT
+static int nframes; /* pictures dumped so far; decode_picture() dumps only the first 25 */
+static FILE *outfile; /* dump target (not opened in this part of the file) */
+#endif
+
+
+
+/* default intra quant matrix, in zig-zag order; video_object_layer() stores it through mpeg_scan_norm[] */
+static const uint8_t default_intra_quantizer_matrix[64] = {
+    8,
+    17, 17,
+    20, 18, 18,
+    19, 19, 21, 21,
+    22, 22, 22, 21, 21,
+    23, 23, 23, 23, 23, 23,
+    25, 24, 24, 24, 24, 25, 25,
+    27, 27, 26, 26, 26, 26, 26, 27,
+    28, 28, 28, 28, 28, 28, 28,
+    30, 30, 30, 30, 30, 30,
+    32, 32, 32, 32, 32,
+    35, 35, 35, 35,
+    38, 38, 38,
+    41, 41,
+    45
+};
+
+/* default non intra quant matrix, in zig-zag order; video_object_layer() stores it through mpeg_scan_norm[] */
+static const uint8_t default_non_intra_quantizer_matrix[64] = {
+    16,
+    17, 17,
+    18, 18, 18,
+    19, 19, 19, 19,
+    20, 20, 20, 20, 20,
+    21, 21, 21, 21, 21, 21,
+    22, 22, 22, 22, 22, 22, 22,
+    23, 23, 23, 23, 23, 23, 23, 23,
+    24, 24, 24, 25, 24, 24, 24,
+    25, 26, 26, 26, 26, 25,
+    27, 27, 27, 27, 27,
+    28, 28, 28, 28,
+    30, 30, 30,
+    31, 31,
+    33
+};
+
+uint8_t mpeg_scan_norm[64] = { /* NOTE(review): has external linkage and is writable; only read in this part of the file */
+    /* Zig-Zag scan pattern: entry j is the row-major (8x8) index of the j-th coefficient in zig-zag order */
+     0, 1, 8,16, 9, 2, 3,10,
+    17,24,32,25,18,11, 4, 5,
+    12,19,26,33,40,48,41,34,
+    27,20,13, 6, 7,14,21,28,
+    35,42,49,56,57,50,43,36,
+    29,22,15,23,30,37,44,51,
+    58,59,52,45,38,31,39,46,
+    53,60,61,54,47,55,62,63
+};
+
+
+
+typedef struct {
+  VdpPictureInfoMPEG4Part2  vdp_infos; /* first field, also used for frame */
+
+  int                     viso_verid;                     /* version id read in visual_object() */
+  int                     newpred_enable;                 /* VOL flag; video_object_plane() gives up on the VOP when set */
+  int                     reduced_resolution_vop_enable;  /* VOL flag; adds one skipped bit to the VOP header */
+  int                     vol_shape;                      /* one of SHAPE_*; VOP parsing stops unless SHAPE_RECT */
+  int                     complexity_estimation_disable;  /* VOL flag; VOP parsing stops when it is 0 */
+  int                     sprite_enable;                  /* VOL field, compared against SPRITE_* */
+  int                     quant_precision;                /* bit width skipped for the VOP quantiser (default 5) */
+
+  int                     progressive_frame;              /* set to 1 by reset_picture() */
+} picture_t;
+
+
+
+typedef struct {
+  uint32_t    coded_width;  /* picture size read from the VOL header */
+  uint32_t    coded_height;
+
+  uint64_t    video_step; /* frame duration in pts units */
+  double      ratio;      /* (coded_width * par_w) / (coded_height * par_h) */
+  VdpDecoderProfile profile;
+  int         chroma;
+  int         top_field_first;
+
+  int         have_header; /* set once video_object_layer() has published the metadata */
+
+  uint8_t     *buf; /* accumulate data */
+  int         bufseek;  /* position of the start-code scan inside buf */
+  uint32_t    bufsize;  /* allocated size of buf */
+  uint32_t    bufpos;   /* number of valid bytes in buf */
+  int         start;    /* offset of the start code awaiting its end, -1 if none */
+
+  picture_t   picture;
+  vo_frame_t  *forward_ref;   /* the I/P frame before backward_ref (may be NULL) */
+  vo_frame_t  *backward_ref;  /* the most recent I/P frame (may be NULL) */
+
+  int64_t    cur_pts, seq_pts; /* pts of incoming data / pts given to the VOP being decoded */
+
+  vdpau_accel_t *accel_vdpau;
+
+  VdpColorStandard  color_standard;
+
+  bits_reader_t  br;
+
+  int         vdp_runtime_nr; /* runtime number recorded when the decoder was created */
+  int         reset;
+
+  int         have_codec_name; /* codec_name was derived from a user-data string */
+  char        codec_name[256];
+
+  int         fixed_vop_time_increment;
+  int         time_increment_bits; /* width of the VOP time increment field */
+  int         last_time_base;
+  int         time_base;
+  int         time;
+  int         last_non_b_time;
+  int         t_frame;
+
+} sequence_t;
+
+
+
+typedef struct {
+  video_decoder_class_t   decoder_class; /* the generic class is the only member */
+} vdpau_mpeg4_class_t;
+
+
+
+typedef struct vdpau_mpeg4_decoder_s {
+  video_decoder_t         video_decoder;  /* parent video decoder structure */
+
+  vdpau_mpeg4_class_t    *class;
+  xine_stream_t           *stream;
+
+  sequence_t              sequence;       /* all parser and timing state */
+
+  VdpDecoder              decoder;         /* VDPAU decoder handle, (re)created in decode_render() */
+  VdpDecoderProfile       decoder_profile; /* profile and size the current decoder was created with; */
+  uint32_t                decoder_width;   /* a mismatch with the sequence triggers re-creation */
+  uint32_t                decoder_height;
+
+} vdpau_mpeg4_decoder_t;
+
+
+
+static void reset_picture( picture_t *pic )
+{
+  /* Put every per-picture parameter back to its default value. */
+  VdpPictureInfoMPEG4Part2 *vi = &pic->vdp_infos;
+
+  lprintf( "reset_picture\n" );
+
+  /* parser-side state */
+  pic->viso_verid = 1;
+  pic->vol_shape = SHAPE_RECT;
+  pic->quant_precision = 5;
+  pic->progressive_frame = 1;
+  pic->complexity_estimation_disable = 1;
+  pic->newpred_enable = pic->reduced_resolution_vop_enable = 0;
+
+  /* values passed on to the VDPAU decoder */
+  vi->vop_fcode_forward = vi->vop_fcode_backward = 1;
+  vi->top_field_first = 1;
+  vi->vop_coding_type = 0;
+  vi->vop_time_increment_resolution = 0;
+  vi->quant_type = vi->quarter_sample = vi->short_video_header = 0;
+  vi->interlaced = vi->alternate_vertical_scan_flag = 0;
+  vi->resync_marker_disable = vi->rounding_control = 0;
+  vi->trd[0] = vi->trd[1] = vi->trb[0] = vi->trb[1] = 0;
+}
+
+
+
+static void init_picture( picture_t *pic )
+{
+  reset_picture( pic ); /* initialising is the same as resetting to defaults */
+}
+
+
+
+static void reset_sequence( sequence_t *sequence, int free_refs )
+{
+  /* Always: forget pending timestamps, including those stored in the held references. */
+  sequence->seq_pts = 0;
+  sequence->cur_pts = 0;
+  if ( sequence->backward_ref )
+    sequence->backward_ref->pts = 0;
+  if ( sequence->forward_ref )
+    sequence->forward_ref->pts = 0;
+
+  if ( free_refs ) {
+    /* Full reset: hand both reference frames back ... */
+    if ( sequence->forward_ref )
+      sequence->forward_ref->free( sequence->forward_ref );
+    if ( sequence->backward_ref )
+      sequence->backward_ref->free( sequence->backward_ref );
+    sequence->forward_ref = sequence->backward_ref = NULL;
+
+    /* ... drop the accumulated bitstream ... */
+    sequence->bufpos = 0;
+    sequence->bufseek = 0;
+    sequence->start = -1;
+    /* ... and restart the timing state and output defaults. */
+    sequence->last_time_base = sequence->time_base = 0;
+    sequence->time = sequence->last_non_b_time = sequence->t_frame = 0;
+    sequence->top_field_first = 0;
+    sequence->reset = VO_NEW_SEQUENCE_FLAG;
+    sequence->color_standard = VDP_COLOR_STANDARD_ITUR_BT_709;
+  }
+}
+
+
+
+static void free_sequence( sequence_t *sequence )
+{
+  /* NOTE(review): despite its name this sets the sequence to its defaults (and logs "init_sequence"). */
+  lprintf( "init_sequence\n" );
+  strcpy( sequence->codec_name, "MPEG4 / XviD / DivX (vdpau)" );
+  sequence->have_codec_name = sequence->have_header = 0;
+  sequence->chroma = 0;
+  sequence->video_step = 3600;
+  sequence->profile = VDP_DECODER_PROFILE_MPEG4_PART2_ASP;
+  reset_sequence( sequence, 1 );
+}
+
+
+
+static void update_metadata( vdpau_mpeg4_decoder_t *this_gen )
+{
+  sequence_t *sequence = &this_gen->sequence;
+  xine_stream_t *stream = this_gen->stream;
+
+  /* Publish size, aspect ratio (x10000), frame duration and codec name on the stream. */
+  _x_stream_info_set( stream, XINE_STREAM_INFO_VIDEO_WIDTH, sequence->coded_width );
+  _x_stream_info_set( stream, XINE_STREAM_INFO_VIDEO_HEIGHT, sequence->coded_height );
+  _x_stream_info_set( stream, XINE_STREAM_INFO_VIDEO_RATIO, ((double)10000*sequence->ratio) );
+  _x_stream_info_set( stream, XINE_STREAM_INFO_FRAME_DURATION, sequence->video_step );
+  _x_meta_info_set_utf8( stream, XINE_META_INFO_VIDEOCODEC, sequence->codec_name );
+
+  /* Designated initialisers zero every member not named, so no stack garbage is sent with the event. */
+  xine_format_change_data_t data = { .width = sequence->coded_width, .height = sequence->coded_height, .aspect = sequence->ratio };
+  xine_event_t event = {
+    .type = XINE_EVENT_FRAME_FORMAT_CHANGE, .stream = stream,
+    .data = &data, .data_length = sizeof(data)
+  };
+  xine_event_send( stream, &event );
+}
+
+
+
+static void visual_object( vdpau_mpeg4_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  /* Parse a visual object header: optional version id, then the colour standard if one is signalled. */
+  sequence_t *sequence = &this_gen->sequence;
+  bits_reader_t *br = &sequence->br;
+  bits_reader_set( br, buf, len );
+  if ( read_bits( br, 1 ) ) { /* a version id follows */
+    sequence->picture.viso_verid = read_bits( br, 4 );
+    lprintf("visual_object_verid: %d\n", sequence->picture.viso_verid);
+    skip_bits( br, 3 );
+  }
+  if ( read_bits( br, 4 ) != 1 ) /* only type 1 is examined further */
+    return;
+  if ( !read_bits( br, 1 ) )
+    return;
+  skip_bits( br, 4 );
+  if ( !read_bits( br, 1 ) )
+    return;
+  if ( read_bits( br, 8 ) == 7 ) {
+    lprintf("color_standard: smpte_240M\n");
+    sequence->color_standard = VDP_COLOR_STANDARD_SMPTE_240M;
+  }
+  skip_bits( br, 16 );
+}
+
+
+
+static void video_object_layer( vdpau_mpeg4_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  /* Parse a VOL header (size, aspect, timing, quant matrices, tool flags); publishes metadata the first time. */
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  picture_t *picture = (picture_t*)&sequence->picture;
+  bits_reader_set( &sequence->br, buf, len );
+  int vol_verid = 1;
+
+  picture->vdp_infos.short_video_header = 0;
+  sequence->t_frame = 0;
+
+  skip_bits( &sequence->br, 9 );
+  if ( read_bits( &sequence->br, 1 ) ) {
+    vol_verid = read_bits( &sequence->br, 4 );
+    lprintf("video_object_layer_verid: %d\n", vol_verid);
+    skip_bits( &sequence->br, 3 );
+  }
+  double parw=1, parh=1;
+  int ar = read_bits( &sequence->br, 4 );
+  lprintf("aspect_ratio_info: %d\n", ar);
+  switch ( ar ) {
+    case 1: parw = parh = 1; break;
+    case 2: parw = 12; parh = 11; break;
+    case 3: parw = 10; parh = 11; break;
+    case 4: parw = 16; parh = 11; break;
+    case 5: parw = 40; parh = 33; break;
+    case 15: {
+      parw = read_bits( &sequence->br, 8 );
+      parh = read_bits( &sequence->br, 8 );
+      break;
+    }
+  }
+  lprintf("parw: %f, parh: %f\n", parw, parh);
+  if ( read_bits( &sequence->br, 1 ) ) {
+    skip_bits( &sequence->br, 3 );
+    if ( read_bits( &sequence->br, 1 ) ) {
+      read_bits( &sequence->br, 16 );
+      read_bits( &sequence->br, 16 );
+      read_bits( &sequence->br, 16 );
+      read_bits( &sequence->br, 15 );
+      read_bits( &sequence->br, 16 );
+    }
+  }
+
+  picture->vol_shape = read_bits( &sequence->br, 2 );
+  if ( (picture->vol_shape == SHAPE_GRAY) && (vol_verid != 1) ) {
+    skip_bits( &sequence->br, 4 );
+    fprintf(stderr, "vdpau_mpeg4: unsupported SHAPE_GRAY!\n");
+  }
+  skip_bits( &sequence->br, 1 );
+  picture->vdp_infos.vop_time_increment_resolution = read_bits( &sequence->br, 16 );
+  lprintf("vop_time_increment_resolution: %d\n", picture->vdp_infos.vop_time_increment_resolution);
+  /* the VOP time increment takes values in [0, resolution): smallest bit count covering them, at least 1 */
+  int length=1, max=2;
+  while ( max < picture->vdp_infos.vop_time_increment_resolution ) {
+    ++length;
+    max *= 2;
+  }
+  sequence->time_increment_bits = length;
+  skip_bits( &sequence->br, 1 );
+
+  if ( read_bits( &sequence->br, 1 ) ) {
+    sequence->fixed_vop_time_increment = read_bits( &sequence->br, sequence->time_increment_bits );
+  }
+  else
+    sequence->fixed_vop_time_increment = 1;
+  /* both values come from the stream: keep the previous video_step rather than divide by zero */
+  if ( (sequence->fixed_vop_time_increment > 0) && (sequence->fixed_vop_time_increment <= picture->vdp_infos.vop_time_increment_resolution) )
+    sequence->video_step = 90000 / (picture->vdp_infos.vop_time_increment_resolution / sequence->fixed_vop_time_increment);
+  lprintf("fixed_vop_time_increment: %d\n", sequence->fixed_vop_time_increment);
+  lprintf("video_step: %d\n", (int)sequence->video_step);
+
+  if ( picture->vol_shape != SHAPE_BINONLY ) {
+    if ( picture->vol_shape == SHAPE_RECT ) {
+      skip_bits( &sequence->br, 1 );
+      sequence->coded_width = read_bits( &sequence->br, 13 );
+      lprintf("vol_width: %d\n", sequence->coded_width);
+      skip_bits( &sequence->br, 1 );
+      sequence->coded_height = read_bits( &sequence->br, 13 );
+      lprintf("vol_height: %d\n", sequence->coded_height);
+      skip_bits( &sequence->br, 1 );
+    }
+    sequence->ratio = ((double)sequence->coded_width * parw) / ((double)sequence->coded_height * parh);
+    lprintf("aspect_ratio: %f\n", sequence->ratio);
+    picture->vdp_infos.interlaced = read_bits( &sequence->br, 1 );
+    skip_bits( &sequence->br, 1 );
+
+    picture->sprite_enable = 0;
+    if ( vol_verid == 1 )
+      picture->sprite_enable = read_bits( &sequence->br, 1 );
+    else
+      picture->sprite_enable = read_bits( &sequence->br, 2 );
+
+    if ( (picture->sprite_enable == SPRITE_STATIC) || (picture->sprite_enable == SPRITE_GMC) ) {
+      if ( picture->sprite_enable != SPRITE_GMC ) {
+        skip_bits( &sequence->br, 14 );
+        skip_bits( &sequence->br, 14 );
+        skip_bits( &sequence->br, 14 );
+        skip_bits( &sequence->br, 14 );
+      }
+      skip_bits( &sequence->br, 9 );
+      if ( picture->sprite_enable != SPRITE_GMC )
+        skip_bits( &sequence->br, 1 );
+    }
+    if ( (vol_verid != 1) && (picture->vol_shape != SHAPE_RECT) )
+      skip_bits( &sequence->br, 1 );
+
+    if ( read_bits( &sequence->br, 1 ) ) {
+      picture->quant_precision = read_bits( &sequence->br, 4 );
+      skip_bits( &sequence->br, 4 );
+    }
+    else
+      picture->quant_precision = 5;
+
+    if ( picture->vol_shape == SHAPE_GRAY )
+      skip_bits( &sequence->br, 3 );
+
+    picture->vdp_infos.quant_type = read_bits( &sequence->br, 1 );
+
+    /* load default matrices */
+    int j;
+    for ( j=0; j<64; ++j ) {
+      sequence->picture.vdp_infos.intra_quantizer_matrix[mpeg_scan_norm[j]] = default_intra_quantizer_matrix[j];
+      sequence->picture.vdp_infos.non_intra_quantizer_matrix[mpeg_scan_norm[j]] = default_non_intra_quantizer_matrix[j];
+    }
+    if ( picture->vdp_infos.quant_type ) {
+      int val, last = 0;
+      if ( read_bits( &sequence->br, 1 ) ) { /* load_intra_quant_matrix */
+        lprintf("load_intra_quant_matrix\n");
+        for ( j=0; j<64; ++j ) {
+          val = read_bits( &sequence->br, 8 );
+          if ( !val )
+            break;
+          last = sequence->picture.vdp_infos.intra_quantizer_matrix[mpeg_scan_norm[j]] = val; /* stored like the defaults above */
+        }
+        for ( ; j<64; ++j ) /* a 0 ends the list: the rest repeats the last value */
+          sequence->picture.vdp_infos.intra_quantizer_matrix[mpeg_scan_norm[j]] = last;
+      }
+      if ( read_bits( &sequence->br, 1 ) ) { /* load_non_intra_quant_matrix */
+        lprintf("load_non_intra_quant_matrix\n");
+        for ( j=0, last=0; j<64; ++j ) { /* last must not carry over from the intra matrix */
+          val = read_bits( &sequence->br, 8 );
+          if ( !val )
+            break;
+          last = sequence->picture.vdp_infos.non_intra_quantizer_matrix[mpeg_scan_norm[j]] = val;
+        }
+        for ( ; j<64; ++j )
+          sequence->picture.vdp_infos.non_intra_quantizer_matrix[mpeg_scan_norm[j]] = last;
+      }
+      if ( picture->vol_shape == SHAPE_GRAY ) { /* FIXME */
+        fprintf(stderr, "vdpau_mpeg4: grayscale shape not supported!\n");
+        return;
+      }
+    }
+    if ( vol_verid != 1 )
+      sequence->picture.vdp_infos.quarter_sample = read_bits( &sequence->br, 1 );
+    else
+      sequence->picture.vdp_infos.quarter_sample = 0;
+
+    picture->complexity_estimation_disable = read_bits( &sequence->br, 1 );
+    if ( !picture->complexity_estimation_disable ) { /* define_vop_complexity_estimation_header */
+      int estimation_method = read_bits( &sequence->br, 2 );
+      if ( (estimation_method == 0) || (estimation_method == 1) ){
+        if ( !read_bits( &sequence->br, 1 ) )
+          skip_bits( &sequence->br, 6 );
+        if ( !read_bits( &sequence->br, 1 ) )
+          skip_bits( &sequence->br, 4 );
+        skip_bits( &sequence->br, 1 );
+        if ( !read_bits( &sequence->br, 1 ) )
+          skip_bits( &sequence->br, 4 );
+        if ( !read_bits( &sequence->br, 1 ) )
+          skip_bits( &sequence->br, 6 );
+        skip_bits( &sequence->br, 1 );
+        if ( estimation_method == 1 ) {
+          if ( !read_bits( &sequence->br, 1 ) )
+            skip_bits( &sequence->br, 2 );
+        }
+      }
+    }
+
+    picture->vdp_infos.resync_marker_disable = read_bits( &sequence->br, 1 );
+
+    if ( read_bits( &sequence->br, 1 ) )
+      skip_bits( &sequence->br, 1 );
+    if ( vol_verid != 1 ) {
+      picture->newpred_enable = read_bits( &sequence->br, 1 );
+      if ( picture->newpred_enable )
+        skip_bits( &sequence->br, 3 );
+      picture->reduced_resolution_vop_enable = read_bits( &sequence->br, 1 );
+    }
+    else {
+      picture->newpred_enable = 0;
+      picture->reduced_resolution_vop_enable = 0;
+    }
+    /* .... */
+  }
+  else {
+    if ( vol_verid != 1 ) {
+      if ( read_bits( &sequence->br, 1 ) )
+        skip_bits( &sequence->br, 24 );
+    }
+    picture->vdp_infos.resync_marker_disable = read_bits( &sequence->br, 1 );
+  }
+
+  if ( !sequence->have_header ) {
+    update_metadata( this_gen );
+    sequence->have_header = 1;
+  }
+}
+
+
+#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b)) /* a/b rounded to nearest for b > 0; evaluates a and b more than once */
+
+static void video_object_plane( vdpau_mpeg4_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  /* Parse a VOP header: coding type, timing (trd/trb) and the per-picture fields stored in vdp_infos. */
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  picture_t *picture = (picture_t*)&sequence->picture;
+  bits_reader_set( &sequence->br, buf, len );
+  int time_inc=0, time_increment;
+
+  sequence->seq_pts = sequence->cur_pts; /* the pending pts is given to this VOP ... */
+  sequence->cur_pts = 0;                 /* ... and cleared so it is not used twice */
+  picture->vdp_infos.vop_coding_type = read_bits( &sequence->br, 2 );
+  while ( read_bits( &sequence->br, 1 ) ) /* count the run of 1 bits into time_inc */
+    ++time_inc;
+
+  skip_bits( &sequence->br, 1 );
+
+  if ( sequence->time_increment_bits == 0 || !(get_bits( &sequence->br, sequence->time_increment_bits + 1) & 1) ) {
+    for ( sequence->time_increment_bits = 1; sequence->time_increment_bits < 16; ++sequence->time_increment_bits ) {
+      if ( picture->vdp_infos.vop_coding_type == P_FRAME ) {
+        if ( (get_bits( &sequence->br, sequence->time_increment_bits + 6 ) & 0x37) == 0x30 )
+          break;
+      }
+      else {
+        if ( (get_bits( &sequence->br, sequence->time_increment_bits + 5 ) & 0x1f) == 0x18 )
+          break;
+      }
+      fprintf(stderr, "Headers are not complete, guessing time_increment_bits: %d\n", sequence->time_increment_bits);
+    }
+  }
+
+  time_increment = read_bits( &sequence->br, sequence->time_increment_bits );
+
+  if ( picture->vdp_infos.vop_coding_type != B_FRAME ) {
+    sequence->last_time_base = sequence->time_base;
+    sequence->time_base += time_inc;
+    sequence->time = sequence->time_base * picture->vdp_infos.vop_time_increment_resolution + time_increment;
+    if ( sequence->time < sequence->last_non_b_time ) { /* time went backwards: advance by one time-base unit */
+      ++sequence->time_base;
+      sequence->time += picture->vdp_infos.vop_time_increment_resolution;
+    }
+    picture->vdp_infos.trd[0] = sequence->time - sequence->last_non_b_time; /* distance between the two latest non-B VOPs */
+    sequence->last_non_b_time = sequence->time;
+  }
+  else {
+    sequence->time = (sequence->last_time_base + time_inc) * picture->vdp_infos.vop_time_increment_resolution + time_increment;
+    picture->vdp_infos.trb[0] = picture->vdp_infos.trd[0] - (sequence->last_non_b_time - sequence->time);
+    if ( (picture->vdp_infos.trd[0] <= picture->vdp_infos.trb[0] ) || (picture->vdp_infos.trd[0] <= (picture->vdp_infos.trd[0] - picture->vdp_infos.trb[0])) || (picture->vdp_infos.trd[0] <= 0) ) {
+      /* FIXME: inconsistent trd/trb values are detected here but not handled */
+    }
+    if ( sequence->t_frame == 0 )
+      sequence->t_frame = picture->vdp_infos.trb[0];
+    if ( sequence->t_frame == 0 ) /* never leave a zero divisor for ROUNDED_DIV below */
+      sequence->t_frame = 1;
+    picture->vdp_infos.trd[1] = (  ROUNDED_DIV(sequence->last_non_b_time, sequence->t_frame) - ROUNDED_DIV(sequence->last_non_b_time - picture->vdp_infos.trd[0], sequence->t_frame));
+    picture->vdp_infos.trb[1] = (  ROUNDED_DIV(sequence->time, sequence->t_frame) - ROUNDED_DIV(sequence->last_non_b_time - picture->vdp_infos.trd[0], sequence->t_frame));
+    if ( picture->vdp_infos.interlaced ) {
+      /* FIXME: interlaced B frames get no special treatment here */
+    }
+  }
+
+  /*if ( sequence->fixed_vop_time_increment )
+    sequence->seq_pts = ( sequence->time + sequence->fixed_vop_time_increment/2 ) / sequence->fixed_vop_time_increment;*/
+  
+  skip_bits( &sequence->br, 1 );
+  if ( !read_bits( &sequence->br, 1 ) )
+    return; /* vop_coded == 0 */
+
+  if ( picture->newpred_enable ) { /* FIXME */
+    fprintf(stderr, "vdpau_mpeg4: newpred_enable, dunno what to do !!!\n");
+    return;
+  }
+
+  if ( (picture->vol_shape != SHAPE_BINONLY) && (picture->vdp_infos.vop_coding_type == P_FRAME) )
+    picture->vdp_infos.rounding_control = read_bits( &sequence->br, 1 );
+  else
+    picture->vdp_infos.rounding_control = 0;
+
+  if ( picture->reduced_resolution_vop_enable && (picture->vol_shape == SHAPE_RECT) && (picture->vdp_infos.vop_coding_type != B_FRAME) )
+    skip_bits( &sequence->br, 1 );
+  if ( picture->vol_shape != SHAPE_RECT ) { /* FIXME */
+    fprintf(stderr, "vdpau_mpeg4: vol_shape != SHAPE_RECT, return\n");
+    return;
+  }
+
+  if ( picture->vol_shape != SHAPE_BINONLY ) {
+    if ( !picture->complexity_estimation_disable ) { /* FIXME */
+      fprintf(stderr, "vdpau_mpeg4: TODO: read_vop_complexity_estimation_header\n");
+      return;
+    }
+  }
+
+  if ( picture->vol_shape != SHAPE_BINONLY ) {
+    skip_bits( &sequence->br, 3 );
+    if ( picture->vdp_infos.interlaced ) {
+      picture->vdp_infos.top_field_first = read_bits( &sequence->br, 1 );
+      picture->vdp_infos.alternate_vertical_scan_flag = read_bits( &sequence->br, 1 );
+    }
+  }
+
+  if ( picture->vol_shape != SHAPE_BINONLY ) {
+    skip_bits( &sequence->br, picture->quant_precision );
+    if ( picture->vol_shape == SHAPE_GRAY ) { /* FIXME */
+      fprintf(stderr, "vdpau_mpeg4: unsupported SHAPE_GRAY!\n");
+      return;
+    }
+    if ( picture->vdp_infos.vop_coding_type != I_FRAME )
+      picture->vdp_infos.vop_fcode_forward = read_bits( &sequence->br, 3 );
+    if ( picture->vdp_infos.vop_coding_type == B_FRAME )
+      picture->vdp_infos.vop_fcode_backward = read_bits( &sequence->br, 3 );
+  }
+}
+
+
+
+static void gop_header( vdpau_mpeg4_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  /* Read the hours/minutes/seconds time code and restart time_base from it, in seconds. */
+  sequence_t *sequence = &this_gen->sequence;
+  bits_reader_t *br = &sequence->br;
+  int hours, minutes, seconds;
+
+  bits_reader_set( br, buf, len );
+  hours = read_bits( br, 5 );
+  minutes = read_bits( br, 6 );
+  skip_bits( br, 1 ); /* one bit sits between minutes and seconds */
+  seconds = read_bits( br, 6 );
+  sequence->time_base = (hours * 60 + minutes) * 60 + seconds;
+}
+
+
+
+static void user_data( vdpau_mpeg4_decoder_t *this_gen, uint8_t *buffer, int len )
+{
+  /* code from ffmpeg's mpeg4videodec.c */
+
+  /*
+   * Identify the encoder from the user-data string and store a readable
+   * codec name ("MPEG4 / <encoder> <version> (vdpau)") in the sequence.
+   * Once a name has been recognised, later user-data units are ignored.
+   */
+  sequence_t *seq = &this_gen->sequence;
+  char buf[256];
+  int i, e, ver = 0, build = 0, ver2 = 0, ver3 = 0;
+  char last;
+
+  if ( seq->have_codec_name )
+    return;
+
+  /* bounded, NUL-terminated copy of the payload so that sscanf() can be used on it */
+  for( i=0; i<255 && i<len; i++ ) {
+    if ( buffer[i] == 0 )
+      break;
+    buf[i]= buffer[i];
+  }
+  buf[i]=0;
+
+  /* divx detection */
+  e = sscanf(buf, "DivX%dBuild%d%c", &ver, &build, &last);
+  if ( e < 2 )
+    e=sscanf(buf, "DivX%db%d%c", &ver, &build, &last);
+  if ( e >= 2 ) {
+    /* format straight into codec_name: buf must stay intact for the checks below */
+    snprintf( seq->codec_name, sizeof(seq->codec_name), "MPEG4 / DivX %d (vdpau)", ver );
+    seq->have_codec_name = 1;
+  }
+
+  /* ffmpeg detection */
+  e = sscanf(buf, "FFmpe%*[^b]b%d", &build) + 3;
+  if ( e != 4 )
+    e=sscanf(buf, "FFmpeg v%d.%d.%d / libavcodec build: %d", &ver, &ver2, &ver3, &build);
+  if ( e != 4 ) {
+    e=sscanf(buf, "Lavc%d.%d.%d", &ver, &ver2, &ver3)+1;
+    if ( e > 1 )
+      build= (ver<<16) + (ver2<<8) + ver3;
+  }
+  if ( e == 4 ) {
+    snprintf( seq->codec_name, sizeof(seq->codec_name), "MPEG4 / FFmpeg %d (vdpau)", build );
+    seq->have_codec_name = 1;
+  }
+  else if ( strcmp(buf, "ffmpeg") == 0 ) {
+    /* bare "ffmpeg" tag: reported with the fixed build number 4600 */
+    snprintf( seq->codec_name, sizeof(seq->codec_name), "MPEG4 / FFmpeg 4600 (vdpau)" );
+    seq->have_codec_name = 1;
+  }
+
+  /* Xvid detection */
+  e = sscanf(buf, "XviD%d", &build);
+  if ( e == 1 ) {
+    snprintf( seq->codec_name, sizeof(seq->codec_name), "MPEG4 / XviD %d (vdpau)", build );
+    seq->have_codec_name = 1;
+  }
+
+  /* republish the stream information (done whether or not a name was recognised) */
+  update_metadata( this_gen );
+}
+
+
+
+static int parse_code( vdpau_mpeg4_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  /* Dispatch one start-code unit; returns 1 only after a VOP start code was handled, else 0. */
+  sequence_t *sequence = &this_gen->sequence;
+  const uint8_t code = buf[3];        /* byte following the 00 00 01 prefix */
+  uint8_t *payload = buf + 4;
+  const int payload_len = len - 4;
+
+  if ( (code >= begin_vo_start_code) && (code <= end_vo_start_code) ) {
+    lprintf( " ----------- vo_start_code\n" );
+    return 0;
+  }
+  if ( (code >= begin_vol_start_code) && (code <= end_vol_start_code) ) {
+    lprintf( " ----------- vol_start_code\n" );
+    video_object_layer( this_gen, payload, payload_len );
+    return 0;
+  }
+  /* Codes handled even before a VOL header has been seen. */
+  if ( code == viso_sequence_start_code ) {
+    lprintf( " ----------- viso_sequence_start_code\n" );
+  }
+  else if ( code == viso_sequence_end_code ) {
+    lprintf( " ----------- viso_sequence_end_code\n" );
+  }
+  else if ( code == viso_start_code ) {
+    lprintf( " ----------- viso_start_code\n" );
+    visual_object( this_gen, payload, payload_len );
+  }
+
+  /* Everything below is ignored until have_header is set. */
+  if ( !sequence->have_header )
+    return 0;
+
+  if ( code == group_start_code ) {
+    lprintf( " ----------- group_start_code\n" );
+    gop_header( this_gen, payload, payload_len );
+  }
+  else if ( code == user_data_start_code ) {
+    lprintf( " ----------- user_data_start_code\n" );
+    user_data( this_gen, payload, payload_len );
+  }
+  else if ( code == vop_start_code ) {
+    lprintf( " ----------- vop_start_code\n" );
+    video_object_plane( this_gen, payload, payload_len );
+    return 1;
+  }
+  return 0;
+}
+
+
+
+static void decode_render( vdpau_mpeg4_decoder_t *vd, vdpau_accel_t *accel, uint8_t *buf, int len )
+{
+  /* Make sure a VDPAU decoder matching the sequence exists, then decode buf[0..len) into accel->surface. */
+  sequence_t *seq = &vd->sequence;
+  picture_t *pic = &seq->picture;
+  VdpStatus st;
+
+  if ( vd->decoder==VDP_INVALID_HANDLE || vd->decoder_profile!=seq->profile || vd->decoder_width!=seq->coded_width || vd->decoder_height!=seq->coded_height ) {
+    if ( vd->decoder!=VDP_INVALID_HANDLE ) {
+      accel->vdp_decoder_destroy( vd->decoder );
+      vd->decoder = VDP_INVALID_HANDLE;
+    }
+    st = accel->vdp_decoder_create( accel->vdp_device, seq->profile, seq->coded_width, seq->coded_height, 2, &vd->decoder);
+    if ( st!=VDP_STATUS_OK ) {
+      fprintf(stderr, "vdpau_mpeg4: failed to create decoder !! %s\n", accel->vdp_get_error_string( st ) );
+      vd->decoder = VDP_INVALID_HANDLE;
+      return; /* rendering with an invalid handle can only fail; creation is retried on the next picture */
+    }
+    lprintf( "decoder created.\n" );
+    vd->decoder_profile = seq->profile;
+    vd->decoder_width = seq->coded_width;
+    vd->decoder_height = seq->coded_height;
+    seq->vdp_runtime_nr = accel->vdp_runtime_nr;
+  }
+
+  /* Per-picture parameter dump: sent through lprintf so that it no longer goes to stdout unconditionally. */
+  ++nframe;
+  lprintf("%d: %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", nframe, pic->vdp_infos.vop_coding_type, pic->vdp_infos.vop_time_increment_resolution, pic->vdp_infos.vop_fcode_forward, pic->vdp_infos.vop_fcode_backward, pic->vdp_infos.resync_marker_disable, pic->vdp_infos.interlaced, pic->vdp_infos.quant_type, pic->vdp_infos.quarter_sample, pic->vdp_infos.short_video_header, pic->vdp_infos.rounding_control, pic->vdp_infos.alternate_vertical_scan_flag, len, pic->vdp_infos.trd[0], pic->vdp_infos.trd[1], pic->vdp_infos.trb[0], pic->vdp_infos.trb[1]);
+
+  VdpBitstreamBuffer vbit = { .struct_version = VDP_BITSTREAM_BUFFER_VERSION, .bitstream = buf, .bitstream_bytes = len };
+  st = accel->vdp_decoder_render( vd->decoder, accel->surface, (VdpPictureInfo*)&pic->vdp_infos, 1, &vbit );
+  if ( st!=VDP_STATUS_OK )
+    fprintf(stderr, "vdpau_mpeg4: decoder failed : %d!! %s\n", st, accel->vdp_get_error_string( st ) );
+  else {
+    lprintf( "DECODER SUCCESS : vop_coding_type=%d, bytes=%d, current=%d, forwref:%d, backref:%d, pts:%lld\n",
+              pic->vdp_infos.vop_coding_type, vbit.bitstream_bytes, accel->surface, pic->vdp_infos.forward_reference, pic->vdp_infos.backward_reference, seq->seq_pts );
+  }
+}
+
+
+
+static void decode_picture( vdpau_mpeg4_decoder_t *vd )
+{
+  /* Decode the accumulated data into a new output frame and update the I/P reference pair. */
+  sequence_t *seq = (sequence_t*)&vd->sequence;
+  picture_t *pic = (picture_t*)&seq->picture;
+  vdpau_accel_t *ref_accel;
+  uint8_t *buf = seq->buf;
+  int len = seq->bufpos; /* everything accumulated so far is handed to the decoder */
+
+  pic->vdp_infos.forward_reference = VDP_INVALID_HANDLE;
+  pic->vdp_infos.backward_reference = VDP_INVALID_HANDLE;
+
+  if ( pic->vdp_infos.vop_coding_type == P_FRAME ) {
+    if ( seq->backward_ref ) { /* a P frame predicts from the most recent I/P frame */
+      ref_accel = (vdpau_accel_t*)seq->backward_ref->accel_data;
+      pic->vdp_infos.forward_reference = ref_accel->surface;
+    }
+    else {
+      /* no reference to predict from: the picture is dropped */
+      return;
+    }
+  }
+  else if ( pic->vdp_infos.vop_coding_type == B_FRAME ) {
+    if ( seq->forward_ref ) {
+      ref_accel = (vdpau_accel_t*)seq->forward_ref->accel_data;
+      pic->vdp_infos.forward_reference = ref_accel->surface;
+    }
+    else {
+      /* a B frame needs both references: the picture is dropped */
+      return;
+    }
+    if ( seq->backward_ref ) {
+      ref_accel = (vdpau_accel_t*)seq->backward_ref->accel_data;
+      pic->vdp_infos.backward_reference = ref_accel->surface;
+    }
+    else {
+      /* a B frame needs both references: the picture is dropped */
+      return;
+    }
+  }
+
+  vo_frame_t *img = vd->stream->video_out->get_frame( vd->stream->video_out, seq->coded_width, seq->coded_height, seq->ratio, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS );
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  if ( !seq->accel_vdpau )
+    seq->accel_vdpau = accel;
+
+  if( seq->vdp_runtime_nr != *(seq->accel_vdpau->current_vdp_runtime_nr) ) { /* runtime number changed since the decoder was created */
+    seq->accel_vdpau = accel;
+    if ( seq->forward_ref )
+      seq->forward_ref->free( seq->forward_ref );
+    seq->forward_ref = NULL;
+    if ( seq->backward_ref )
+      seq->backward_ref->free( seq->backward_ref );
+    seq->backward_ref = NULL;
+    vd->decoder = VDP_INVALID_HANDLE; /* makes decode_render() create a new decoder */
+  }
+
+  decode_render( vd, accel, buf, len );
+
+
+#ifdef MAKE_DAT
+  if ( nframes==0 ) {
+    fwrite( &seq->coded_width, 1, sizeof(seq->coded_width), outfile );
+    fwrite( &seq->coded_height, 1, sizeof(seq->coded_height), outfile );
+    fwrite( &seq->ratio, 1, sizeof(seq->ratio), outfile );
+    fwrite( &seq->profile, 1, sizeof(seq->profile), outfile );
+  }
+
+  if ( nframes++ < 25 ) {
+    fwrite( &pic->vdp_infos, 1, sizeof(pic->vdp_infos), outfile );
+    fwrite( &len, 1, sizeof(len), outfile );
+    fwrite( buf, 1, len, outfile );
+    printf( "picture_type = %d\n", pic->vdp_infos.picture_type);
+  }
+#endif
+
+  if ( pic->vdp_infos.interlaced ) {
+    img->progressive_frame = 0;
+    img->top_field_first = pic->vdp_infos.top_field_first;
+  }
+  else {
+    img->progressive_frame = -1; /* set to -1 to let the vo know that it MUST NOT deinterlace */
+    img->top_field_first = 1;
+  }
+  img->pts = seq->seq_pts;
+  img->bad_frame = 0;
+  if ( seq->video_step > 900 ) /* some buggy streams */
+    img->duration = seq->video_step;
+  accel->color_standard = seq->color_standard;
+
+  if ( pic->vdp_infos.vop_coding_type < B_FRAME ) { /* I or P: becomes the new backward reference */
+    if ( pic->vdp_infos.vop_coding_type == I_FRAME && !seq->backward_ref ) { /* first I frame: drawn at once, with pts 0 */
+      img->pts = 0;
+      img->draw( img, vd->stream );
+      ++img->drawn;
+    }
+    if ( seq->forward_ref ) {
+      seq->forward_ref->drawn = 0;
+      seq->forward_ref->free( seq->forward_ref );
+    }
+    seq->forward_ref = seq->backward_ref;
+    if ( seq->forward_ref && !seq->forward_ref->drawn ) { /* the previous I/P frame is drawn only now */
+      seq->forward_ref->draw( seq->forward_ref, vd->stream );
+    }
+    seq->backward_ref = img;
+  }
+  else { /* B frame: drawn and released immediately, never kept as a reference */
+    img->draw( img, vd->stream );
+    img->free( img );
+  }
+}
+
+
+
+
+/*
+ * Append one demuxer buffer to the sequence's accumulation buffer and scan
+ * the accumulated bytes for start codes (00 00 01).
+ *
+ * The first start code found marks the beginning of a unit.  When the next
+ * one is found, the bytes in between are passed to parse_code(); if that
+ * returns non-zero, decode_picture() is called.  The consumed bytes are then
+ * dropped so that the buffer begins at the newest start code again.
+ *
+ * this_gen: decoder instance (really a vdpau_mpeg4_decoder_t).
+ * buf:      buffer from the demuxer; preview buffers and empty buffers are
+ *           ignored.  A non-zero buf->pts becomes the current pts.
+ *
+ * If memory cannot be obtained the affected data is dropped and the function
+ * returns; the decoder stays usable.
+ */
+static void vdpau_mpeg4_decode_data (video_decoder_t *this_gen, buf_element_t *buf)
+{
+  vdpau_mpeg4_decoder_t *this = (vdpau_mpeg4_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t*)&this->sequence;
+
+  /* preview buffers shall not be decoded and drawn -- use them only to supply stream information */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if ( !buf->size )
+    return;
+
+  if ( buf->pts )
+    seq->cur_pts = buf->pts;
+
+  int size = seq->bufpos+buf->size;
+  if ( seq->bufsize < size ) {
+    /* grow through a temporary: assigning realloc()'s result straight to
+     * seq->buf would leak the old block and leave a NULL buffer on failure */
+    uint8_t *grown = realloc( seq->buf, size+1024 );
+    if ( !grown ) {
+      lprintf( "out of memory, dropping buffer\n" );
+      return;
+    }
+    seq->buf = grown;
+    seq->bufsize = size+1024;
+  }
+  xine_fast_memcpy( seq->buf+seq->bufpos, buf->content, buf->size );
+  seq->bufpos += buf->size;
+
+  /* Compare as signed ints.  NOTE(review): the declared type of bufpos is
+   * outside this chunk; if it is unsigned, "bufpos-4" wraps around whenever
+   * fewer than 4 bytes are buffered and the scan would run past the buffer.
+   * The casts are harmless if the fields are already signed. */
+  while ( (int)seq->bufseek+4 <= (int)seq->bufpos ) {
+    uint8_t *buffer = seq->buf+seq->bufseek;
+    if ( buffer[0]==0 && buffer[1]==0 && buffer[2]==1 ) {
+      if ( seq->start<0 ) {
+        /* first start code: remember where the unit begins */
+        seq->start = seq->bufseek;
+      }
+      else {
+        /* second start code: [start, bufseek) is one complete unit */
+        if ( parse_code( this, seq->buf+seq->start, seq->bufseek-seq->start ) ) {
+          decode_picture( this );
+        }
+        /* keep only the bytes from the newest start code onwards */
+        uint8_t *tmp = (uint8_t*)malloc(seq->bufsize);
+        if ( !tmp ) {
+          lprintf( "out of memory, dropping accumulated data\n" );
+          seq->bufpos = 0;
+          seq->start = -1;
+          seq->bufseek = 0;
+          return;
+        }
+        xine_fast_memcpy( tmp, seq->buf+seq->bufseek, seq->bufpos-seq->bufseek );
+        seq->bufpos -= seq->bufseek;
+        seq->start = -1;
+        seq->bufseek = -1; /* becomes 0 after the increment below */
+        free( seq->buf );
+        seq->buf = tmp;
+      }
+    }
+    ++seq->bufseek;
+  }
+}
+
+/*
+ * This function is called when xine needs to flush the system.
+ * This implementation only logs the call.
+ */
+static void vdpau_mpeg4_flush (video_decoder_t *this_gen) {
+  (void)this_gen; /* nothing in the decoder is touched here */
+
+  lprintf( "vdpau_mpeg4_flush\n" );
+}
+
+/*
+ * Reset hook of the video decoder interface: logs the call and hands the
+ * decoder's sequence state to reset_sequence() with its second argument
+ * set to 1.
+ */
+static void vdpau_mpeg4_reset (video_decoder_t *this_gen) {
+  vdpau_mpeg4_decoder_t *decoder = (vdpau_mpeg4_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t*)&decoder->sequence;
+
+  lprintf( "vdpau_mpeg4_reset\n" );
+
+  reset_sequence( seq, 1 );
+}
+
+/*
+ * Discontinuity hook: the decoder should forget any stored pts values here.
+ * Logs the call and hands the decoder's sequence state to reset_sequence()
+ * with its second argument set to 0.
+ */
+static void vdpau_mpeg4_discontinuity (video_decoder_t *this_gen) {
+  vdpau_mpeg4_decoder_t *decoder = (vdpau_mpeg4_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t*)&decoder->sequence;
+
+  lprintf( "vdpau_mpeg4_discontinuity\n" );
+
+  reset_sequence( seq, 0 );
+}
+
+/*
+ * Tear down a decoder instance created by open_plugin():
+ * destroy the VDPAU decoder if one exists, release the sequence state,
+ * close the video output port for this stream, then free the accumulation
+ * buffer and the instance itself.
+ */
+static void vdpau_mpeg4_dispose (video_decoder_t *this_gen) {
+
+  vdpau_mpeg4_decoder_t *dec = (vdpau_mpeg4_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t*)&dec->sequence;
+
+  lprintf( "vdpau_mpeg4_dispose\n" );
+
+  /* the destroy entry point lives in the accel struct, so both are needed */
+  if ( seq->accel_vdpau && dec->decoder!=VDP_INVALID_HANDLE ) {
+    seq->accel_vdpau->vdp_decoder_destroy( dec->decoder );
+    dec->decoder = VDP_INVALID_HANDLE;
+  }
+
+  free_sequence( seq );
+
+  dec->stream->video_out->close( dec->stream->video_out, dec->stream );
+
+  free( seq->buf );
+  free( dec );
+}
+
+/*
+ * This function allocates, initializes, and returns a private video
+ * decoder structure.
+ *
+ * Returns NULL when the video driver does not report VO_CAP_VDPAU_MPEG4,
+ * when a probe VDPAU decoder (MPEG-4 part 2 ASP, 1920x1080) cannot be
+ * created, or when memory allocation fails.  The probe decoder is destroyed
+ * again right away; the instance starts with decoder == VDP_INVALID_HANDLE.
+ */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+
+  vdpau_mpeg4_decoder_t  *this ;
+
+  lprintf( "open_plugin\n" );
+
+  /* the videoout must be vdpau-capable to support this decoder */
+  if ( !(stream->video_driver->get_capabilities(stream->video_driver) & VO_CAP_VDPAU_MPEG4) )
+    return NULL;
+
+  /* now check if vdpau has free decoder resource */
+  vo_frame_t *img = stream->video_out->get_frame( stream->video_out, 1920, 1080, 1, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS );
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  int runtime_nr = accel->vdp_runtime_nr;
+  /* NOTE(review): accel is still used below after the frame is released;
+   * presumably accel_data outlives img->free() -- confirm against the vo driver */
+  img->free(img);
+  VdpDecoder decoder;
+  VdpStatus st = accel->vdp_decoder_create( accel->vdp_device, VDP_DECODER_PROFILE_MPEG4_PART2_ASP, 1920, 1080, 2, &decoder );
+  if ( st!=VDP_STATUS_OK ) {
+    lprintf( "can't create vdpau decoder.\n" );
+    return NULL;
+  }
+
+  accel->vdp_decoder_destroy( decoder );
+
+  this = (vdpau_mpeg4_decoder_t *) calloc(1, sizeof(vdpau_mpeg4_decoder_t));
+  if ( !this )
+    return NULL;
+
+  this->video_decoder.decode_data         = vdpau_mpeg4_decode_data;
+  this->video_decoder.flush               = vdpau_mpeg4_flush;
+  this->video_decoder.reset               = vdpau_mpeg4_reset;
+  this->video_decoder.discontinuity       = vdpau_mpeg4_discontinuity;
+  this->video_decoder.dispose             = vdpau_mpeg4_dispose;
+
+  this->stream                            = stream;
+  this->class                             = (vdpau_mpeg4_class_t *) class_gen;
+
+  this->sequence.bufsize = 1024;
+  this->sequence.buf = (uint8_t*)malloc(this->sequence.bufsize);
+  if ( !this->sequence.buf ) {
+    /* nothing else has been acquired yet, so only the instance is released */
+    free( this );
+    return NULL;
+  }
+  this->sequence.forward_ref = 0;
+  this->sequence.backward_ref = 0;
+  this->sequence.vdp_runtime_nr = runtime_nr;
+  free_sequence( &this->sequence );
+  this->sequence.ratio = 1;
+  this->sequence.reset = VO_NEW_SEQUENCE_FLAG;
+
+  init_picture( &this->sequence.picture );
+
+  this->decoder = VDP_INVALID_HANDLE;
+  this->sequence.accel_vdpau = NULL;
+
+  (stream->video_out->open)(stream->video_out, stream);
+
+#ifdef MAKE_DAT
+  outfile = fopen( "/tmp/mpeg4.dat","w");
+  nframes = 0;
+#endif
+  nframe = 0;
+
+  return &this->video_decoder;
+}
+
+/*
+ * This function allocates a private video decoder class and initializes
+ * the class's member functions.
+ *
+ * Neither argument is used.  Returns the new class, or NULL if the
+ * allocation fails.
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+
+  vdpau_mpeg4_class_t *this;
+
+  (void)xine;
+  (void)data;
+
+  this = (vdpau_mpeg4_class_t *) calloc(1, sizeof(vdpau_mpeg4_class_t));
+  if ( !this )
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "vdpau_mpeg4";
+  this->decoder_class.description     =
+	N_("vdpau_mpeg4: mpeg4 part 2 decoder plugin using VDPAU hardware decoding.\n"
+	   "Must be used along with video_out_vdpau.");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+/*
+ * List of the internal xine video buffer types that this decoder is able
+ * to handle: generic MPEG-4 plus the XviD, DivX 5 and 3ivx buffer types.
+ * Check src/xine-engine/buffer.h for the list of valid buffer types (and
+ * add a new one if the one you need does not exist).
+ * The list must be terminated with a 0.
+ */
+static const uint32_t video_types[] = {
+  BUF_VIDEO_MPEG4,
+  BUF_VIDEO_XVID,
+  BUF_VIDEO_DIVX5,
+  BUF_VIDEO_3IVX,
+  0
+};
+
+/*
+ * This data structure combines the list of supported xine buffer types
+ * (video_types above) with the priority that the plugin should be given
+ * with respect to other plugins that handle the same buffer type. A plugin
+ * with priority (n+1) will be used instead of a plugin with priority (n).
+ * This plugin registers with priority 0.
+ */
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types */
+  0                    /* priority        */
+};
+
+/*
+ * The plugin catalog entry. This is the only information that this plugin
+ * will export to the public.
+ * It registers one video decoder named "vdpau_mpeg4" (API 19) whose init
+ * function is init_plugin; the PLUGIN_NONE entry terminates the list.
+ */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* { type, API, "name", version, special_info, init_function } */
+  { PLUGIN_VIDEO_DECODER, 19, "vdpau_mpeg4", XINE_VERSION_CODE, &dec_info_video, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
+};
diff --git a/src/video_dec/libvdpau/vdpau_vc1.c b/src/video_dec/libvdpau/vdpau_vc1.c
new file mode 100644
index 000000000..fe6ce26b4
--- /dev/null
+++ b/src/video_dec/libvdpau/vdpau_vc1.c
@@ -0,0 +1,1176 @@
+/*
+ * Copyright (C) 2008 the xine project
+ * Copyright (C) 2008 Christophe Thommeret <hftom@free.fr>
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * vdpau_vc1.c, a vc1 video stream parser using VDPAU hardware decoder
+ *
+ */
+
+/*#define LOG*/
+#define LOG_MODULE "vdpau_vc1"
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "accel_vdpau.h"
+#include "bits_reader.h"
+
+#include <vdpau/vdpau.h>
+
+#define sequence_header_code    0x0f
+#define sequence_end_code       0x0a
+#define entry_point_code        0x0e
+#define frame_start_code        0x0d
+#define field_start_code        0x0c
+#define slice_start_code        0x0b
+
+#define PICTURE_FRAME            0
+#define PICTURE_FRAME_INTERLACE  2
+#define PICTURE_FIELD_INTERLACE  3
+
+#define I_FRAME   0
+#define P_FRAME   1
+#define B_FRAME   3
+#define BI_FRAME  4
+
+#define FIELDS_I_I    0
+#define FIELDS_I_P    1
+#define FIELDS_P_I    2
+#define FIELDS_P_P    3
+#define FIELDS_B_B    4
+#define FIELDS_B_BI   5
+#define FIELDS_BI_B   6
+#define FIELDS_BI_BI  7
+
+#define MODE_STARTCODE  0
+#define MODE_FRAME      1
+
+/*#define MAKE_DAT*/ /* do NOT define this unless you know what you are doing */
+#ifdef MAKE_DAT
+static int nframes;
+static FILE *outfile;
+#endif
+
+
+
+/*
+ * Aspect factors indexed by the 4-bit aspect code read in
+ * sequence_header_advanced().  Only indices 1..13 are looked up there; the
+ * selected factor is multiplied by coded_width/coded_height to obtain the
+ * display ratio.  Index 0 is a placeholder (code 0 is never looked up).
+ * NOTE(review): the table has external linkage (not static) -- confirm
+ * whether another translation unit relies on that.
+ */
+const double aspect_ratio[] = {
+  0.0,
+  1.0,
+  12./11.,
+  10./11.,
+  16./11.,
+  40./33.,
+  24./11.,
+  20./11.,
+  32./11.,
+  80./33.,
+  18./11.,
+  15./11.,
+  64./33.,
+  160./99.
+};
+
+
+
+/*
+ * Per-picture state: the parameter block handed to the VDPAU decoder plus
+ * the bookkeeping the parser needs around it.
+ */
+typedef struct {
+  VdpPictureInfoVC1       vdp_infos;              /* passed to vdp_decoder_render() */
+  int                     slices;                 /* reset to 1 by reset_picture(), incremented in parse_code() */
+  int                     fptype;                 /* 3-bit field pair type (FIELDS_*), field-interlaced pictures only */
+  int                     field;                  /* byte offset of the field start code in the picture data, 0 if none */
+  int                     header_size;            /* not referenced in the code visible here */
+  int                     hrd_param_flag;         /* read in sequence_header_advanced(), tested in entry_point() */
+  int                     hrd_num_leaky_buckets;  /* number of 8-bit entries skipped in entry_point() */
+  int                     repeat_first_field;     /* read in picture_header_advanced() */
+  int                     top_field_first;        /* read in picture_header_advanced() */
+  int                     skipped;                /* set when the picture header signals a skipped picture */
+} picture_t;
+
+
+
+/*
+ * Stream-level parser state: sequence parameters, the byte accumulation
+ * buffer with its scan positions, the current picture and the reference
+ * frames.
+ */
+typedef struct {
+  uint32_t    coded_width;  /* set by sequence_header_advanced() / entry_point() */
+  uint32_t    coded_height;
+
+  uint64_t    video_step; /* frame duration in pts units */
+  uint64_t    reported_video_step; /* frame duration in pts units */
+  double      ratio;        /* display aspect ratio, 0 until a header supplies one */
+  VdpDecoderProfile profile;
+
+  int         mode;         /* MODE_STARTCODE or MODE_FRAME */
+  int         have_header;  /* set to 1 by update_metadata() once stream info was published */
+
+  uint8_t     *buf; /* accumulate data */
+  int         bufseek;      /* scan position in buf; end of the current unit in start-code mode */
+  int         start;        /* offset of the current unit's start code in buf, -1 if none */
+  int         code_start, current_code; /* start-code values tested in parse_code() */
+  uint32_t    bufsize;      /* presumably the allocated size of buf -- TODO confirm */
+  uint32_t    bufpos;       /* number of bytes accumulated in buf */
+
+  picture_t   picture;
+  vo_frame_t  *forward_ref;   /* reference frames, released in reset_sequence() */
+  vo_frame_t  *backward_ref;
+
+  int64_t    seq_pts;
+  int64_t    cur_pts;
+
+  vdpau_accel_t *accel_vdpau;
+
+  bits_reader_t br;           /* bit reader shared by all header parsers */
+
+  int         vdp_runtime_nr; /* copied from accel->vdp_runtime_nr when the decoder is created */
+
+} sequence_t;
+
+
+
+/* Plugin class object; it carries nothing beyond the generic decoder class. */
+typedef struct {
+  video_decoder_class_t   decoder_class;
+} vdpau_vc1_class_t;
+
+
+
+/*
+ * One decoder instance.  decoder_profile/width/height remember the
+ * parameters the current VDPAU decoder was created with, so decode_render()
+ * can tell when it has to be recreated.
+ */
+typedef struct vdpau_vc1_decoder_s {
+  video_decoder_t         video_decoder;  /* parent video decoder structure */
+
+  vdpau_vc1_class_t    *class;
+  xine_stream_t           *stream;
+
+  sequence_t              sequence;       /* parser state */
+
+  VdpDecoder              decoder;        /* VDP_INVALID_HANDLE while no decoder exists */
+  VdpDecoderProfile       decoder_profile;
+  uint32_t                decoder_width;
+  uint32_t                decoder_height;
+
+} vdpau_vc1_decoder_t;
+
+
+
+/* Zero the whole picture state, including the embedded VDPAU info block. */
+static void init_picture( picture_t *pic )
+{
+  memset( pic, 0, sizeof *pic );
+}
+
+
+
+/*
+ * Prepare the picture state for the next picture: the slice counter goes
+ * back to 1.  No other field is touched.
+ */
+static void reset_picture( picture_t *pic )
+{
+  pic->slices = 1;
+}
+
+
+
+/*
+ * Drop all accumulated bytes, start-code bookkeeping and timestamps, give
+ * both reference frames back, and reset the picture state.  Sequence
+ * parameters (size, profile, ratio, ...) are left alone.
+ */
+static void reset_sequence( sequence_t *sequence )
+{
+  vo_frame_t **refs[2] = { &sequence->forward_ref, &sequence->backward_ref };
+  int n;
+
+  lprintf( "reset_sequence\n" );
+
+  /* accumulation buffer and scan positions */
+  sequence->bufpos = 0;
+  sequence->bufseek = 0;
+  sequence->start = -1;
+  sequence->current_code = 0;
+  sequence->code_start = 0;
+
+  /* timestamps */
+  sequence->cur_pts = 0;
+  sequence->seq_pts = 0;
+
+  /* reference frames: forward first, then backward */
+  for ( n=0; n<2; ++n ) {
+    vo_frame_t *frame = *refs[n];
+    if ( frame )
+      frame->free( frame );
+    *refs[n] = NULL;
+  }
+
+  reset_picture( &sequence->picture );
+}
+
+
+
+/*
+ * Put the sequence into its "no header seen yet" state: simple profile,
+ * unknown ratio and frame duration, no HRD parameters; then perform a
+ * reset_sequence().
+ */
+static void init_sequence( sequence_t *sequence )
+{
+  lprintf( "init_sequence\n" );
+
+  sequence->profile = VDP_DECODER_PROFILE_VC1_SIMPLE;
+  sequence->have_header = 0;
+  sequence->video_step = 0;
+  sequence->ratio = 0;
+  sequence->picture.hrd_param_flag = 0;
+
+  reset_sequence( sequence );
+}
+
+
+
+/*
+ * Publish the stream parameters once per sequence.
+ *
+ * On the first call after have_header was cleared, width, height, ratio
+ * (scaled by 10000), frame duration and codec name are written to the
+ * stream info, and a XINE_EVENT_FRAME_FORMAT_CHANGE event is sent.
+ * Later calls do nothing until have_header is cleared again.
+ */
+static void update_metadata( vdpau_vc1_decoder_t *this_gen )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+
+  if ( sequence->have_header )
+    return;
+
+  sequence->have_header = 1;
+  _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_VIDEO_WIDTH, sequence->coded_width );
+  _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, sequence->coded_height );
+  _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_VIDEO_RATIO, ((double)10000*sequence->ratio) );
+  _x_stream_info_set( this_gen->stream, XINE_STREAM_INFO_FRAME_DURATION, (sequence->reported_video_step = sequence->video_step) );
+  _x_meta_info_set_utf8( this_gen->stream, XINE_META_INFO_VIDEOCODEC, "VC1/WMV9 (vdpau)" );
+
+  xine_event_t event;
+  xine_format_change_data_t data;
+  /* Clear both structs first so that any member not assigned below is sent
+   * as zero instead of as indeterminate stack contents. */
+  memset( &event, 0, sizeof(event) );
+  memset( &data, 0, sizeof(data) );
+  event.type = XINE_EVENT_FRAME_FORMAT_CHANGE;
+  event.stream = this_gen->stream;
+  event.data = &data;
+  event.data_length = sizeof(data);
+  data.width = sequence->coded_width;
+  data.height = sequence->coded_height;
+  data.aspect = sequence->ratio;
+  xine_event_send( this_gen->stream, &event );
+}
+
+
+
+/*
+ * Parse an advanced-profile sequence header.
+ *
+ * buf/len: header payload (after the start code, emulation prevention
+ *          bytes already removed by the caller where applicable).
+ *          Payloads shorter than 5 bytes are ignored.
+ *
+ * Sets the profile to VDP_DECODER_PROFILE_VC1_ADVANCED, fills the
+ * sequence-level members of vdp_infos, the coded size and, when present in
+ * the header, the aspect ratio and frame duration.  Ends with
+ * update_metadata().
+ */
+static void sequence_header_advanced( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  lprintf( "sequence_header_advanced\n" );
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+
+  if ( len < 5 )
+    return;
+
+  sequence->profile = VDP_DECODER_PROFILE_VC1_ADVANCED;
+  lprintf("VDP_DECODER_PROFILE_VC1_ADVANCED\n");
+  bits_reader_set( &sequence->br, buf, len );
+  skip_bits( &sequence->br, 15 );
+  sequence->picture.vdp_infos.postprocflag = read_bits( &sequence->br, 1 );
+  /* the size is stored as (value+1)*2 */
+  sequence->coded_width = (read_bits( &sequence->br, 12 )+1)<<1;
+  sequence->coded_height = (read_bits( &sequence->br, 12 )+1)<<1;
+  sequence->picture.vdp_infos.pulldown = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.interlace = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.tfcntrflag = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.finterpflag = read_bits( &sequence->br, 1 );
+  skip_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.psf = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.maxbframes = 7;
+  if ( read_bits( &sequence->br, 1 ) ) {
+    int ar=0;
+    /* two 14-bit fields are consumed here; their values are not used */
+    (void)read_bits( &sequence->br, 14 );
+    (void)read_bits( &sequence->br, 14 );
+    if ( read_bits( &sequence->br, 1 ) ) {
+      ar = read_bits( &sequence->br, 4 );
+    }
+    if ( ar==15 ) {
+      /* explicit ratio given as two 8-bit values */
+      double w = read_bits( &sequence->br, 8 );
+      double h = read_bits( &sequence->br, 8 );
+      if ( h > 0 ) { /* keep the previous ratio rather than divide by zero */
+        sequence->ratio = w/h;
+        lprintf("aspect_ratio (w/h) = %f\n", sequence->ratio);
+      }
+    }
+    else if ( ar && ar<14 ) {
+      sequence->ratio = sequence->coded_width*aspect_ratio[ar]/sequence->coded_height;
+      lprintf("aspect_ratio = %f\n", sequence->ratio);
+    }
+
+    if ( read_bits( &sequence->br, 1 ) ) {
+      if ( read_bits( &sequence->br, 1 ) ) {
+        /* 16-bit frame rate form: consumed and logged only */
+        int exp = read_bits( &sequence->br, 16 );
+        lprintf("framerate exp = %d\n", exp);
+      }
+      else {
+        double nr, dr;
+        switch ( read_bits( &sequence->br, 8 ) ) {
+          case 1: nr = 24000; break;
+          case 2: nr = 25000; break;
+          case 3: nr = 30000; break;
+          case 4: nr = 50000; break;
+          case 5: nr = 60000; break;
+          default: nr = 0;
+        }
+        switch ( read_bits( &sequence->br, 4 ) ) {
+          case 2: dr = 1001; break;
+          default: dr = 1000;
+        }
+        /* An unknown numerator code leaves nr at 0; 90000/(0/dr) is
+         * infinite and converting that to an integer is undefined, so the
+         * previous video_step is kept in that case. */
+        if ( nr > 0 ) {
+          sequence->video_step = 90000/(nr/dr);
+          lprintf("framerate = %f video_step = %d\n", nr/dr, (int)sequence->video_step);
+        }
+      }
+    }
+    if ( read_bits( &sequence->br, 1 ) ) {
+      int col = read_bits( &sequence->br, 8 );
+      lprintf("color_standard = %d\n", col);
+      skip_bits( &sequence->br, 16 );
+    }
+  }
+  sequence->picture.hrd_param_flag = read_bits( &sequence->br, 1 );
+  if ( sequence->picture.hrd_param_flag )
+    sequence->picture.hrd_num_leaky_buckets = read_bits( &sequence->br, 5 );
+
+  update_metadata( this_gen );
+}
+
+
+
+/*
+ * Parse a sequence header.
+ *
+ * The first two bits select the profile.  Value 3 (advanced) is handed to
+ * sequence_header_advanced(), which parses the same payload with its own
+ * layout.  Value 2 is decoded as main profile with a warning on stderr.
+ * For simple/main the sequence-level members of vdp_infos are filled and
+ * update_metadata() is called.
+ *
+ * buf/len: header payload; payloads shorter than 4 bytes are ignored.
+ */
+static void sequence_header( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  lprintf( "sequence_header\n" );
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+
+  if ( len < 4 )
+    return;
+
+  bits_reader_set( &sequence->br, buf, len );
+  switch ( read_bits( &sequence->br, 2 ) ) {
+    case 0: sequence->profile = VDP_DECODER_PROFILE_VC1_SIMPLE; lprintf("VDP_DECODER_PROFILE_VC1_SIMPLE\n"); break;
+    case 1: sequence->profile = VDP_DECODER_PROFILE_VC1_MAIN; lprintf("VDP_DECODER_PROFILE_VC1_MAIN\n"); break;
+    case 2: sequence->profile = VDP_DECODER_PROFILE_VC1_MAIN; fprintf(stderr, "vc1_complex profile not supported by vdpau, forcing vc1_main, expect corruption!.\n"); break;
+    case 3:
+      /* a void function must not "return expr;" -- call, then return */
+      sequence_header_advanced( this_gen, buf, len );
+      return;
+    default: return; /* illegal value, broken header? */
+  }
+  skip_bits( &sequence->br, 10 );
+  sequence->picture.vdp_infos.loopfilter = read_bits( &sequence->br, 1 );
+  skip_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.multires = read_bits( &sequence->br, 1 );
+  skip_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.fastuvmc = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.extended_mv = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.dquant = read_bits( &sequence->br, 2 );
+  sequence->picture.vdp_infos.vstransform = read_bits( &sequence->br, 1 );
+  skip_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.overlap = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.syncmarker = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.rangered = read_bits( &sequence->br, 1 );
+  sequence->picture.vdp_infos.maxbframes = read_bits( &sequence->br, 3 );
+  sequence->picture.vdp_infos.quantizer = read_bits( &sequence->br, 2 );
+  sequence->picture.vdp_infos.finterpflag = read_bits( &sequence->br, 1 );
+
+  update_metadata( this_gen );
+}
+
+
+
+/*
+ * Parse an entry point header and refresh the entry-point-level members of
+ * vdp_infos.  One 8-bit field per leaky bucket is skipped when the sequence
+ * header announced HRD parameters.  The coded size is updated when the
+ * header carries one, and the range mapping values are read when their
+ * flags are set.
+ */
+static void entry_point( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  lprintf( "entry_point\n" );
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  bits_reader_t *br = &sequence->br;
+  VdpPictureInfoVC1 *info = &sequence->picture.vdp_infos;
+
+  bits_reader_set( br, buf, len );
+  skip_bits( br, 2 );
+
+  info->panscan_flag = read_bits( br, 1 );
+  info->refdist_flag = read_bits( br, 1 );
+  info->loopfilter   = read_bits( br, 1 );
+  info->fastuvmc     = read_bits( br, 1 );
+  info->extended_mv  = read_bits( br, 1 );
+  info->dquant       = read_bits( br, 2 );
+  info->vstransform  = read_bits( br, 1 );
+  info->overlap      = read_bits( br, 1 );
+  info->quantizer    = read_bits( br, 2 );
+
+  if ( sequence->picture.hrd_param_flag ) {
+    int bucket = 0;
+    while ( bucket < sequence->picture.hrd_num_leaky_buckets ) {
+      skip_bits( br, 8 );
+      ++bucket;
+    }
+  }
+
+  /* optional coded size, stored as (value+1)*2 */
+  if ( read_bits( br, 1 ) ) {
+    sequence->coded_width = (read_bits( br, 12 )+1)<<1;
+    sequence->coded_height = (read_bits( br, 12 )+1)<<1;
+  }
+
+  if ( info->extended_mv )
+    info->extended_dmv = read_bits( br, 1 );
+
+  info->range_mapy_flag = read_bits( br, 1 );
+  if ( info->range_mapy_flag )
+    info->range_mapy = read_bits( br, 3 );
+
+  info->range_mapuv_flag = read_bits( br, 1 );
+  if ( info->range_mapuv_flag )
+    info->range_mapuv = read_bits( br, 3 );
+}
+
+
+
+/*
+ * Parse the start of a simple/main profile picture header, just far enough
+ * to determine info->picture_type (I, P, B or BI).
+ *
+ * When rangered is enabled in the sequence, it is rewritten per picture as
+ * (bit<<1)+1, so it stays non-zero for the following pictures.
+ */
+static void picture_header( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  VdpPictureInfoVC1 *info = &(sequence->picture.vdp_infos);
+
+  lprintf("picture_header\n");
+
+  bits_reader_set( &sequence->br, buf, len );
+  skip_bits( &sequence->br, 2 );
+
+  if ( info->finterpflag )
+    skip_bits( &sequence->br, 1 );
+  if ( info->rangered ) {
+    /*info->rangered &= ~2;
+    info->rangered |= get_bits( buf,off++,1 ) << 1;*/
+    info->rangered = (read_bits( &sequence->br, 1 ) << 1) +1;
+  }
+  if ( !info->maxbframes ) {
+    /* no B frames in this sequence: a single bit tells P from I */
+    if ( read_bits( &sequence->br, 1 ) )
+      info->picture_type = P_FRAME;
+    else
+      info->picture_type = I_FRAME;
+  }
+  else {
+    /* 1 -> P, 01 -> I, 00 -> B */
+    if ( read_bits( &sequence->br, 1 ) )
+      info->picture_type = P_FRAME;
+    else {
+      if ( read_bits( &sequence->br, 1 ) )
+        info->picture_type = I_FRAME;
+      else
+        info->picture_type = B_FRAME;
+    }
+  }
+  if ( info->picture_type == B_FRAME ) {
+    /* a 3-bit code of 7 is extended by 4 more bits; the all-ones 7-bit
+     * value (127) marks a BI picture */
+    int code = read_bits( &sequence->br, 3 );
+    if ( code==7 ) {
+      code = (code<<4) | read_bits( &sequence->br, 4 );
+      if ( code==127 )
+        info->picture_type = BI_FRAME;
+    }
+  }
+}
+
+
+
+/*
+ * Parse the start of an advanced profile picture header.
+ *
+ * For interlaced sequences the frame coding mode is read first (0 ->
+ * progressive, 10 -> frame interlaced, 11 -> field interlaced).  The picture
+ * type then comes either from the 3-bit field pair type (field interlaced)
+ * or from a variable-length code: 0 -> P, 10 -> B, 110 -> I, 1110 -> BI,
+ * 1111 -> skipped picture (treated as P).  Optionally the 8-bit frame
+ * counter is skipped and the top_field_first / repeat_first_field bits are
+ * read.
+ */
+static void picture_header_advanced( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  picture_t *pic = (picture_t*)&sequence->picture;
+  VdpPictureInfoVC1 *info = &(sequence->picture.vdp_infos);
+  bits_reader_t *br = &sequence->br;
+
+  lprintf("picture_header_advanced\n");
+
+  bits_reader_set( br, buf, len );
+
+  if ( info->interlace ) {
+    lprintf("frame->interlace=1\n");
+    if ( !read_bits( br, 1 ) ) {
+      lprintf("progressive frame\n");
+      info->frame_coding_mode = PICTURE_FRAME;
+    }
+    else if ( !read_bits( br, 1 ) ) {
+      lprintf("frame interlaced\n");
+      info->frame_coding_mode = PICTURE_FRAME_INTERLACE;
+    }
+    else {
+      lprintf("field interlaced\n");
+      info->frame_coding_mode = PICTURE_FIELD_INTERLACE;
+    }
+  }
+
+  if ( info->interlace && info->frame_coding_mode == PICTURE_FIELD_INTERLACE ) {
+    /* the picture type follows the type of the first field of the pair */
+    pic->fptype = read_bits( br, 3 );
+    switch ( pic->fptype ) {
+      case FIELDS_I_I:
+      case FIELDS_I_P:
+        info->picture_type = I_FRAME;
+        break;
+      case FIELDS_P_I:
+      case FIELDS_P_P:
+        info->picture_type = P_FRAME;
+        break;
+      case FIELDS_B_B:
+      case FIELDS_B_BI:
+        info->picture_type = B_FRAME;
+        break;
+      default:
+        info->picture_type = BI_FRAME;
+    }
+  }
+  else {
+    /* count leading 1 bits, reading at most four bits and stopping at the
+     * first 0 */
+    int ones;
+    for ( ones=0; ones<4; ++ones ) {
+      if ( !read_bits( br, 1 ) )
+        break;
+    }
+    switch ( ones ) {
+      case 0:
+        info->picture_type = P_FRAME;
+        break;
+      case 1:
+        info->picture_type = B_FRAME;
+        break;
+      case 2:
+        info->picture_type = I_FRAME;
+        break;
+      case 3:
+        info->picture_type = BI_FRAME;
+        break;
+      default:
+        info->picture_type = P_FRAME;
+        pic->skipped = 1;
+    }
+  }
+
+  if ( info->tfcntrflag ) {
+    lprintf("tfcntrflag=1\n");
+    skip_bits( br, 8 );
+  }
+
+  if ( info->pulldown && info->interlace ) {
+    pic->top_field_first = read_bits( br, 1 );
+    pic->repeat_first_field = read_bits( br, 1 );
+  }
+}
+
+
+
+/*
+ * Scan a block of header data for start codes and parse every sequence
+ * header and entry point found in it.  If no sequence header has been
+ * accepted afterwards, the whole block is tried as a bare sequence header
+ * (without start code).
+ */
+static void parse_header( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  int off;
+
+  for ( off=0; off < (len-4); ++off ) {
+    const uint8_t *p = buf+off;
+
+    if ( p[0]!=0 || p[1]!=0 || p[2]!=1 )
+      continue;
+
+    /* payload begins right after the 4-byte start code */
+    if ( p[3]==sequence_header_code )
+      sequence_header( this_gen, buf+off+4, len-off-4 );
+    else if ( p[3]==entry_point_code )
+      entry_point( this_gen, buf+off+4, len-off-4 );
+  }
+
+  if ( !sequence->have_header )
+    sequence_header( this_gen, buf, len );
+}
+
+
+
+/*
+ * Copy src to dst while dropping the 0x03 byte of every 00 00 03 sequence.
+ *
+ * src, src_len: input bytes.
+ * dst:          output buffer; must have room for src_len bytes.
+ * dst_len:      receives the number of bytes written to dst.
+ *
+ * A 00 00 03 pattern is only recognised while at least one more byte
+ * follows it; the last three input bytes are always copied verbatim.
+ */
+static void remove_emulation_prevention( uint8_t *src, uint8_t *dst, int src_len, int *dst_len )
+{
+  int i;
+  int len = 0;
+
+  for ( i=0; i<src_len-3; ++i ) {
+    if ( src[i]==0 && src[i+1]==0 && src[i+2]==3 ) {
+      lprintf("removed emulation prevention byte\n");
+      dst[len++] = src[i];
+      dst[len++] = src[i+1];
+      i += 2; /* together with the loop increment this steps over the 0x03 */
+    }
+    else {
+      /* one byte per step; copying more here would only be overwritten by
+       * the following iterations */
+      dst[len++] = src[i];
+    }
+  }
+  for ( ; i<src_len; ++i )
+    dst[len++] = src[i];
+  *dst_len = len;
+}
+
+
+
+/*
+ * Handle one start-code delimited unit.
+ *
+ * buf/len: the unit, beginning with its 4-byte start code (buf[3] is the
+ *          code value).
+ *
+ * Returns  1 when a complete frame has been collected and should be decoded,
+ *         -1 when the unit is a further field/slice of the frame being
+ *            collected (the slice counter is incremented),
+ *          0 otherwise.
+ *
+ * Until a sequence header has been accepted, everything except a sequence
+ * header is ignored.  Sequence headers and entry points are parsed from a
+ * temporary copy with the emulation prevention bytes removed.
+ */
+static int parse_code( vdpau_vc1_decoder_t *this_gen, uint8_t *buf, int len )
+{
+  sequence_t *sequence = (sequence_t*)&this_gen->sequence;
+  int dst_len;
+  uint8_t *tmp;
+
+  if ( !sequence->have_header && buf[3]!=sequence_header_code )
+    return 0;
+
+  if ( sequence->code_start == frame_start_code ) {
+    if ( sequence->current_code==field_start_code || sequence->current_code==slice_start_code ) {
+      sequence->picture.slices++;
+      return -1;
+    }
+    return 1; /* frame complete, decode */
+  }
+
+  /* a unit without payload cannot carry a header worth parsing */
+  if ( len < 4 )
+    return 0;
+
+  switch ( buf[3] ) {
+    case sequence_header_code:
+      lprintf("sequence_header_code\n");
+      tmp = malloc( len );
+      if ( !tmp )
+        break; /* out of memory: skip this header */
+      remove_emulation_prevention( buf, tmp, len, &dst_len );
+      sequence_header( this_gen, tmp+4, dst_len-4 );
+      free( tmp );
+      break;
+    case entry_point_code:
+      lprintf("entry_point_code\n");
+      tmp = malloc( len );
+      if ( !tmp )
+        break; /* out of memory: skip this header */
+      remove_emulation_prevention( buf, tmp, len, &dst_len );
+      entry_point( this_gen, tmp+4, dst_len-4 );
+      free( tmp );
+      break;
+    case sequence_end_code:
+      lprintf("sequence_end_code\n");
+      break;
+    case frame_start_code:
+      lprintf("frame_start_code, len=%d\n", len);
+      break;
+    case field_start_code:
+      lprintf("field_start_code\n");
+      break;
+    case slice_start_code:
+      lprintf("slice_start_code, len=%d\n", len);
+      break;
+    default:
+      break;
+  }
+  return 0;
+}
+
+
+
+/*
+ * Submit the current picture to the VDPAU decoder.
+ *
+ * vd:      decoder instance; its VDPAU decoder is (re)created here whenever
+ *          none exists or profile/size differ from the current sequence.
+ * accel:   acceleration data of the target frame (supplies the surface and
+ *          the VDPAU entry points).
+ * buf/len: picture bitstream.
+ *
+ * For field pictures (pic->field != 0) the data is rendered in two passes:
+ * first the bytes before the field start code, then the bytes after it, with
+ * picture type and references adjusted for the second field according to
+ * pic->fptype.  The frame-level picture type is restored afterwards.
+ * Failures are reported on stderr.  If no decoder can be created, nothing is
+ * rendered.
+ */
+static void decode_render( vdpau_vc1_decoder_t *vd, vdpau_accel_t *accel, uint8_t *buf, int len )
+{
+  sequence_t *seq = (sequence_t*)&vd->sequence;
+  picture_t *pic = (picture_t*)&seq->picture;
+
+  VdpStatus st;
+  if ( vd->decoder==VDP_INVALID_HANDLE || vd->decoder_profile!=seq->profile || vd->decoder_width!=seq->coded_width || vd->decoder_height!=seq->coded_height ) {
+    if ( vd->decoder!=VDP_INVALID_HANDLE ) {
+      accel->vdp_decoder_destroy( vd->decoder );
+      vd->decoder = VDP_INVALID_HANDLE;
+    }
+    st = accel->vdp_decoder_create( accel->vdp_device, seq->profile, seq->coded_width, seq->coded_height, 2, &vd->decoder);
+    if ( st!=VDP_STATUS_OK ) {
+      fprintf(stderr, "vdpau_vc1: failed to create decoder !! %s\n", accel->vdp_get_error_string( st ) );
+      /* rendering with an invalid decoder handle cannot succeed */
+      vd->decoder = VDP_INVALID_HANDLE;
+      return;
+    }
+    lprintf( "decoder created.\n" );
+    vd->decoder_profile = seq->profile;
+    vd->decoder_width = seq->coded_width;
+    vd->decoder_height = seq->coded_height;
+    seq->vdp_runtime_nr = accel->vdp_runtime_nr;
+  }
+
+  VdpBitstreamBuffer vbit;
+  vbit.struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+  vbit.bitstream = buf;
+  vbit.bitstream_bytes = len;
+  if ( pic->field )
+    vbit.bitstream_bytes = pic->field; /* first field ends at the field start code */
+  st = accel->vdp_decoder_render( vd->decoder, accel->surface, (VdpPictureInfo*)&pic->vdp_infos, 1, &vbit );
+  if ( st!=VDP_STATUS_OK )
+    fprintf(stderr, "vdpau_vc1: decoder failed : %d!! %s\n", st, accel->vdp_get_error_string( st ) );
+  else {
+    lprintf( "DECODER SUCCESS : slices=%d, slices_bytes=%d, current=%d, forwref:%d, backref:%d, pts:%lld\n",
+              pic->vdp_infos.slice_count, vbit.bitstream_bytes, accel->surface, pic->vdp_infos.forward_reference, pic->vdp_infos.backward_reference, (long long)seq->seq_pts );
+  }
+  VdpPictureInfoVC1 *info = &(seq->picture.vdp_infos);
+  lprintf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", info->slice_count, info->picture_type, info->frame_coding_mode,
+           info->postprocflag, info->pulldown, info->interlace, info->tfcntrflag, info->finterpflag, info->psf, info->dquant, info->panscan_flag, info->refdist_flag,
+           info->quantizer, info->extended_mv, info->extended_dmv, info->overlap, info->vstransform, info->loopfilter, info->fastuvmc, info->range_mapy_flag, info->range_mapy,
+           info->range_mapuv_flag, info->range_mapuv, info->multires, info->syncmarker, info->rangered, info->maxbframes, info->deblockEnable, info->pquant );
+
+  if ( pic->field ) {
+    int old_type = pic->vdp_infos.picture_type;
+    /* picture type and references of the second field of the pair */
+    switch ( pic->fptype ) {
+      case FIELDS_I_I:
+      case FIELDS_P_I:
+        pic->vdp_infos.picture_type = I_FRAME;
+        pic->vdp_infos.backward_reference = VDP_INVALID_HANDLE;
+        pic->vdp_infos.forward_reference = VDP_INVALID_HANDLE;
+        break;
+      case FIELDS_I_P:
+        /* the P field refers to the I field just rendered into this surface */
+        pic->vdp_infos.forward_reference = accel->surface;
+        pic->vdp_infos.picture_type = P_FRAME;
+        break;
+      case FIELDS_P_P:
+        if ( seq->backward_ref )
+          pic->vdp_infos.forward_reference = ((vdpau_accel_t*)seq->backward_ref->accel_data)->surface;
+        pic->vdp_infos.picture_type = P_FRAME;
+        break;
+      case FIELDS_B_B:
+      case FIELDS_BI_B:
+        pic->vdp_infos.picture_type = B_FRAME;
+        break;
+      default:
+        pic->vdp_infos.picture_type = BI_FRAME;
+    }
+    /* second field: everything after the 4-byte field start code */
+    vbit.bitstream = buf+pic->field+4;
+    vbit.bitstream_bytes = len-pic->field-4;
+    st = accel->vdp_decoder_render( vd->decoder, accel->surface, (VdpPictureInfo*)&pic->vdp_infos, 1, &vbit );
+    if ( st!=VDP_STATUS_OK )
+      fprintf(stderr, "vdpau_vc1: decoder failed : %d!! %s\n", st, accel->vdp_get_error_string( st ) );
+    else {
+      lprintf( "DECODER SUCCESS (second field): slices=%d, slices_bytes=%d, current=%d, forwref:%d, backref:%d, pts:%lld\n",
+                pic->vdp_infos.slice_count, vbit.bitstream_bytes, accel->surface, pic->vdp_infos.forward_reference, pic->vdp_infos.backward_reference, (long long)seq->seq_pts );
+    }
+    lprintf("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", info->slice_count, info->picture_type, info->frame_coding_mode,
+             info->postprocflag, info->pulldown, info->interlace, info->tfcntrflag, info->finterpflag, info->psf, info->dquant, info->panscan_flag, info->refdist_flag,
+             info->quantizer, info->extended_mv, info->extended_dmv, info->overlap, info->vstransform, info->loopfilter, info->fastuvmc, info->range_mapy_flag, info->range_mapy,
+             info->range_mapuv_flag, info->range_mapuv, info->multires, info->syncmarker, info->rangered, info->maxbframes, info->deblockEnable, info->pquant );
+
+    pic->vdp_infos.picture_type = old_type;
+  }
+}
+
+
+
+/*
+ * Look for a field start code (00 00 01 followed by field_start_code) in
+ * buf.  Returns the byte offset of its first byte, or 0 when none is found
+ * within the first len-4 positions.  vd is not used.
+ */
+static int search_field( vdpau_vc1_decoder_t *vd, uint8_t *buf, int len )
+{
+  int pos = 0;
+
+  lprintf("search_fields, len=%d\n", len);
+
+  while ( pos < len-4 ) {
+    const uint8_t *p = buf+pos;
+    if ( p[0]==0 && p[1]==0 && p[2]==1 && p[3]==field_start_code ) {
+      lprintf("found field_start_code at %d\n", pos);
+      return pos;
+    }
+    ++pos;
+  }
+  return 0;
+}
+
+
+
+static void decode_picture( vdpau_vc1_decoder_t *vd )
+{
+  /* Parses the buffered picture header, selects the reference surfaces, renders through
+   * decode_render() and hands frames to video_out; bails out when a reference is missing. */
+  sequence_t *seq = (sequence_t*)&vd->sequence;
+  picture_t *pic = (picture_t*)&seq->picture;
+  vdpau_accel_t *ref_accel;
+  int field;
+  uint8_t *buf;
+  int len;
+
+  pic->skipped = 0;
+  pic->field = 0;
+
+  if ( seq->mode == MODE_FRAME ) {
+    buf = seq->buf;  /* frame mode: the whole accumulated buffer is one picture */
+    len = seq->bufpos;
+    if ( seq->profile==VDP_DECODER_PROFILE_VC1_ADVANCED )
+      picture_header_advanced( vd, buf, len );
+    else
+      picture_header( vd, buf, len );
+
+    if ( len < 2 )
+      pic->skipped = 1;
+  }
+  else {
+    seq->picture.vdp_infos.slice_count = seq->picture.slices;
+    buf = seq->buf+seq->start+4;  /* start-code mode: skip the 4-byte start code */
+    len = seq->bufseek-seq->start-4;
+    if ( seq->profile==VDP_DECODER_PROFILE_VC1_ADVANCED ) {
+      int tmplen = (len>50) ? 50 : len;  /* the header is parsed from at most 50 bytes */
+      uint8_t *tmp = malloc( (tmplen>0) ? tmplen : 1 );  /* never request a zero or negative size */
+      if ( !tmp ) { reset_picture( &seq->picture ); return; }  /* out of memory: drop this picture */
+      remove_emulation_prevention( buf, tmp, tmplen, &tmplen );
+      picture_header_advanced( vd, tmp, tmplen );
+      free( tmp );
+    }
+    else
+      picture_header( vd, buf, len );
+
+    if ( len < 2 )
+      pic->skipped = 1;
+  }
+
+  if ( pic->skipped )
+    pic->vdp_infos.picture_type = P_FRAME;  /* skipped pictures are handled as P pictures */
+
+  if ( pic->vdp_infos.interlace && pic->vdp_infos.frame_coding_mode == PICTURE_FIELD_INTERLACE ) {
+    if ( !(field = search_field( vd, buf, len )) )  /* 0 means no field start code was found */
+      lprintf("error, no fields found!\n");
+    else
+      pic->field = field;
+  }
+
+  pic->vdp_infos.forward_reference = VDP_INVALID_HANDLE;
+  pic->vdp_infos.backward_reference = VDP_INVALID_HANDLE;
+
+  if ( pic->vdp_infos.picture_type==P_FRAME ) {
+    if ( seq->backward_ref ) {  /* the most recently stored reference serves as forward reference */
+      ref_accel = (vdpau_accel_t*)seq->backward_ref->accel_data;
+      pic->vdp_infos.forward_reference = ref_accel->surface;
+    }
+    else {
+      reset_picture( &seq->picture );
+      return;
+    }
+  }
+  else if ( pic->vdp_infos.picture_type>=B_FRAME ) {  /* these types need both references */
+    if ( seq->forward_ref ) {
+      ref_accel = (vdpau_accel_t*)seq->forward_ref->accel_data;
+      pic->vdp_infos.forward_reference = ref_accel->surface;
+    }
+    else {
+      reset_picture( &seq->picture );
+      return;
+    }
+    if ( seq->backward_ref ) {
+      ref_accel = (vdpau_accel_t*)seq->backward_ref->accel_data;
+      pic->vdp_infos.backward_reference = ref_accel->surface;
+    }
+    else {
+      reset_picture( &seq->picture );
+      return;
+    }
+  }
+
+  vo_frame_t *img = vd->stream->video_out->get_frame( vd->stream->video_out, seq->coded_width, seq->coded_height,
+                                                      seq->ratio, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS );
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  if ( !seq->accel_vdpau )
+    seq->accel_vdpau = accel;
+
+  if( seq->vdp_runtime_nr != *(seq->accel_vdpau->current_vdp_runtime_nr) ) {
+    seq->accel_vdpau = accel;  /* runtime number changed: drop references, invalidate the decoder */
+    if ( seq->forward_ref )
+      seq->forward_ref->free( seq->forward_ref );
+    seq->forward_ref = NULL;
+    if ( seq->backward_ref )
+      seq->backward_ref->free( seq->backward_ref );
+    seq->backward_ref = NULL;
+    vd->decoder = VDP_INVALID_HANDLE;
+  }
+
+  decode_render( vd, accel, buf, len );
+
+#ifdef MAKE_DAT
+  if ( nframes==0 ) {
+	fwrite( &seq->coded_width, 1, sizeof(seq->coded_width), outfile );
+	fwrite( &seq->coded_height, 1, sizeof(seq->coded_height), outfile );
+	fwrite( &seq->ratio, 1, sizeof(seq->ratio), outfile );
+	fwrite( &seq->profile, 1, sizeof(seq->profile), outfile );
+  }
+
+  if ( nframes++ < 25 ) {
+	fwrite( &pic->vdp_infos, 1, sizeof(pic->vdp_infos), outfile );
+	fwrite( &len, 1, sizeof(len), outfile );
+	fwrite( buf, 1, len, outfile );
+	printf( "picture_type = %d\n", pic->vdp_infos.picture_type);
+  }
+#endif
+
+  if ( pic->vdp_infos.interlace && pic->vdp_infos.frame_coding_mode ) {
+    img->progressive_frame = 0;
+    img->top_field_first = pic->top_field_first;
+  }
+  else {
+    img->progressive_frame = 1;
+    img->top_field_first = 1;
+  }
+  img->pts = seq->seq_pts;
+  img->bad_frame = 0;
+  img->duration = seq->video_step;
+  accel->color_standard = VDP_COLOR_STANDARD_ITUR_BT_709;
+
+  if ( pic->vdp_infos.picture_type<B_FRAME ) {  /* this picture becomes the new backward reference */
+    if ( pic->vdp_infos.picture_type==I_FRAME && !seq->backward_ref ) {
+      img->pts = 0;  /* I picture with no stored reference: drawn at once with pts 0 */
+      img->draw( img, vd->stream );
+      ++img->drawn;
+    }
+    if ( seq->forward_ref ) {
+      seq->forward_ref->drawn = 0;
+      seq->forward_ref->free( seq->forward_ref );
+    }
+    seq->forward_ref = seq->backward_ref;
+    if ( seq->forward_ref && !seq->forward_ref->drawn ) {
+      seq->forward_ref->draw( seq->forward_ref, vd->stream );  /* the previous reference is drawn only now */
+    }
+    seq->backward_ref = img;
+  }
+  else {
+    img->draw( img, vd->stream );  /* not kept as a reference: draw and release immediately */
+    img->free( img );
+  }
+
+  seq->seq_pts +=seq->video_step;
+  reset_picture( &seq->picture );
+}
+
+
+
+/*
+ * Receives one buffer from the demuxer: handles the framerate/aspect/header flags, appends
+ * the payload to the sequence buffer and decodes complete frames or start-code units.
+ */
+static void vdpau_vc1_decode_data (video_decoder_t *this_gen, buf_element_t *buf)
+{
+  vdpau_vc1_decoder_t *this = (vdpau_vc1_decoder_t *) this_gen;
+  sequence_t *seq = (sequence_t*)&this->sequence;
+
+  /* a video decoder does not care about this flag (?) */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW) {
+    lprintf("BUF_FLAG_PREVIEW\n");
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_FRAMERATE) {
+    lprintf("BUF_FLAG_FRAMERATE=%d\n", buf->decoder_info[0]);
+    if ( buf->decoder_info[0] > 0 ) {
+      this->sequence.video_step = buf->decoder_info[0];
+      _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, this->sequence.video_step);
+    }
+  }
+
+  if (this->sequence.reported_video_step != this->sequence.video_step){  /* step changed since last report */
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, (this->sequence.reported_video_step = this->sequence.video_step));
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_HEADER) {
+    lprintf("BUF_FLAG_HEADER\n");
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_ASPECT) {
+    lprintf("BUF_FLAG_ASPECT\n");
+    seq->ratio = (double)buf->decoder_info[1]/(double)buf->decoder_info[2];
+    lprintf("arx=%d ary=%d ratio=%f\n", buf->decoder_info[1], buf->decoder_info[2], seq->ratio);
+  }
+
+  if ( !buf->size )
+    return;
+
+  seq->cur_pts = buf->pts;
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER) {
+    lprintf("BUF_FLAG_STDHEADER\n");
+    xine_bmiheader *bih = (xine_bmiheader *) buf->content;
+    int bs = sizeof( xine_bmiheader );
+    seq->coded_width = bih->biWidth;
+    seq->coded_height = bih->biHeight;
+    lprintf( "width=%d height=%d\n", bih->biWidth, bih->biHeight );
+    if ( buf->size > bs ) {  /* bytes following the bitmap header are passed to parse_header() */
+      seq->mode = MODE_FRAME;
+      parse_header( this, buf->content+bs, buf->size-bs );
+    }
+    return;
+  }
+
+  int size = seq->bufpos+buf->size;
+  if ( seq->bufsize < size ) {
+    uint8_t *grown = realloc( seq->buf, size+10000 );
+    if ( !grown ) return;  /* out of memory: keep the old buffer intact and drop this chunk */
+    seq->buf = grown;
+    seq->bufsize = size+10000;
+    lprintf("sequence buffer realloced = %d\n", seq->bufsize );
+  }
+  xine_fast_memcpy( seq->buf+seq->bufpos, buf->content, buf->size );
+  seq->bufpos += buf->size;
+
+  if (buf->decoder_flags & BUF_FLAG_FRAME_START) {
+    lprintf("BUF_FLAG_FRAME_START\n");
+    seq->seq_pts = buf->pts;
+    seq->mode = MODE_FRAME;
+    if ( seq->bufpos > 3 ) {
+      if ( seq->buf[0]==0 && seq->buf[1]==0 && seq->buf[2]==1 ) {  /* data begins with a 00 00 01 prefix */
+        seq->mode = MODE_STARTCODE;
+      }
+    }
+  }
+
+  if ( seq->mode == MODE_FRAME ) {
+    if ( buf->decoder_flags & BUF_FLAG_FRAME_END ) {
+      lprintf("BUF_FLAG_FRAME_END\n");
+      decode_picture( this );
+      seq->bufpos = 0;
+    }
+    return;
+  }
+
+  while ( seq->bufseek <= seq->bufpos-4 ) {  /* 4 bytes are needed: 00 00 01 plus the code byte */
+    uint8_t *buffer = seq->buf+seq->bufseek;
+    if ( buffer[0]==0 && buffer[1]==0 && buffer[2]==1 ) {
+      seq->current_code = buffer[3];
+      lprintf("current_code = %d\n", seq->current_code);
+      if ( seq->start<0 ) {  /* first start code of a unit: remember where it begins */
+        seq->start = seq->bufseek;
+        seq->code_start = buffer[3];
+        lprintf("code_start = %d\n", seq->code_start);
+        if ( seq->cur_pts )
+          seq->seq_pts = seq->cur_pts;
+      }
+      else {  /* next start code found: the unit [start, bufseek) is complete */
+        int res = parse_code( this, seq->buf+seq->start, seq->bufseek-seq->start );
+        if ( res==1 ) {
+          seq->mode = MODE_STARTCODE;
+          decode_picture( this );
+          parse_code( this, seq->buf+seq->start, seq->bufseek-seq->start );
+        }
+        if ( res!=-1 ) {
+          /* drop the consumed unit: slide the unparsed tail to the buffer start, in place */
+          int tail = seq->bufpos-seq->bufseek, k;
+          uint8_t *dst = seq->buf, *src = dst+seq->bufseek;
+          for ( k=0; k<tail; ++k )  /* forward copy is safe because dst <= src */
+            dst[k] = src[k];
+          seq->bufpos = tail;
+          seq->start = -1;
+          seq->bufseek = -1;  /* incremented back to 0 at the end of the loop body */
+        }
+      }
+    }
+    ++seq->bufseek;
+  }
+}
+
+
+
+/*
+ * This function is called when xine needs to flush the system.
+ * It currently only logs the call.
+ */
+static void vdpau_vc1_flush (video_decoder_t *this_gen) {
+  (void)this_gen;  /* unused */
+  lprintf( "vdpau_vc1_flush\n" );
+}
+
+/*
+ * Resets the video decoder: the call is logged and the sequence state is
+ * handed to reset_sequence().
+ */
+static void vdpau_vc1_reset (video_decoder_t *this_gen) {
+  vdpau_vc1_decoder_t *vd = (vdpau_vc1_decoder_t *) this_gen;
+  lprintf( "vdpau_vc1_reset\n" );
+  reset_sequence( &vd->sequence );
+}
+
+/*
+ * The decoder should forget any stored pts values here.
+ * NOTE(review): currently only logs; seq_pts/cur_pts are left untouched - confirm intent.
+ */
+static void vdpau_vc1_discontinuity (video_decoder_t *this_gen) {
+  (void)this_gen;  /* unused */
+  lprintf( "vdpau_vc1_discontinuity\n" );
+}
+
+/*
+ * Tears down a decoder instance: destroys the VDPAU decoder if one exists,
+ * resets the sequence, closes video_out and frees the instance memory.
+ */
+static void vdpau_vc1_dispose (video_decoder_t *this_gen) {
+  vdpau_vc1_decoder_t *vd = (vdpau_vc1_decoder_t *) this_gen;
+  xine_stream_t *stream;
+
+  lprintf( "vdpau_vc1_dispose\n" );
+  /* the destroy call is reached through the accel function table, so both must exist */
+  if ( vd->decoder!=VDP_INVALID_HANDLE && vd->sequence.accel_vdpau ) {
+    vd->sequence.accel_vdpau->vdp_decoder_destroy( vd->decoder );
+    vd->decoder = VDP_INVALID_HANDLE;
+  }
+  reset_sequence( &vd->sequence );
+
+  stream = vd->stream;
+  stream->video_out->close( stream->video_out, stream );
+  free( vd->sequence.buf );
+  free( this_gen );
+}
+
+/*
+ * Allocates, initializes and returns a private video decoder structure, or NULL
+ * if the driver lacks VDPAU VC-1 support, no decoder can be created, or memory runs out.
+ */
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  vdpau_vc1_decoder_t  *this ;
+  lprintf( "open_plugin\n" );
+
+  /* the videoout must be vdpau-capable to support this decoder */
+  if ( !(stream->video_driver->get_capabilities(stream->video_driver) & VO_CAP_VDPAU_VC1) )
+    return NULL;
+
+  /* now check if vdpau has free decoder resource */
+  vo_frame_t *img = stream->video_out->get_frame( stream->video_out, 1920, 1080, 1, XINE_IMGFMT_VDPAU, VO_BOTH_FIELDS );
+  vdpau_accel_t *accel = (vdpau_accel_t*)img->accel_data;
+  int runtime_nr = accel->vdp_runtime_nr;
+  img->free(img);
+  VdpDecoder decoder;
+  VdpStatus st = accel->vdp_decoder_create( accel->vdp_device, VDP_DECODER_PROFILE_VC1_MAIN, 1920, 1080, 2, &decoder );
+  if ( st!=VDP_STATUS_OK ) {
+    lprintf( "can't create vdpau decoder.\n" );
+    return NULL;
+  }
+
+  accel->vdp_decoder_destroy( decoder );  /* the probe decoder is not kept */
+
+  this = (vdpau_vc1_decoder_t *) calloc(1, sizeof(vdpau_vc1_decoder_t));
+  if ( !this )
+    return NULL;
+  this->sequence.bufsize = 10000;
+  this->sequence.buf = (uint8_t*)malloc(this->sequence.bufsize);
+  if ( !this->sequence.buf ) {  /* do not hand out a decoder without its data buffer */
+    free( this );
+    return NULL;
+  }
+
+  this->video_decoder.decode_data         = vdpau_vc1_decode_data;
+  this->video_decoder.flush               = vdpau_vc1_flush;
+  this->video_decoder.reset               = vdpau_vc1_reset;
+  this->video_decoder.discontinuity       = vdpau_vc1_discontinuity;
+  this->video_decoder.dispose             = vdpau_vc1_dispose;
+  this->stream                            = stream;
+  this->class                             = (vdpau_vc1_class_t *) class_gen;
+  this->sequence.forward_ref = 0;
+  this->sequence.backward_ref = 0;
+  this->sequence.vdp_runtime_nr = runtime_nr;
+  init_sequence( &this->sequence );
+  init_picture( &this->sequence.picture );
+
+  this->decoder = VDP_INVALID_HANDLE;  /* no VDPAU decoder is created in this function */
+  this->sequence.accel_vdpau = NULL;
+  this->sequence.mode = MODE_STARTCODE;
+
+  (stream->video_out->open)(stream->video_out, stream);
+
+#ifdef MAKE_DAT
+  outfile = fopen( "/tmp/vc1.dat","w");
+  nframes = 0;
+#endif
+
+  return &this->video_decoder;
+}
+
+/*
+ * This function allocates a private video decoder class and initializes
+ * the class's member functions. Returns NULL when the allocation fails.
+ */
+static void *init_plugin (xine_t *xine, void *data) {
+  vdpau_vc1_class_t *this;
+
+  this = (vdpau_vc1_class_t *) calloc(1, sizeof(vdpau_vc1_class_t));
+  if ( !this )
+    return NULL;
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "vdpau_vc1";
+  this->decoder_class.description     =
+	N_("vdpau_vc1: vc1 decoder plugin using VDPAU hardware decoding.\n"
+	   "Must be used along with video_out_vdpau.");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+/*
+ * The internal xine video buffer types that this decoder is able to
+ * handle. Check src/xine-engine/buffer.h for the list of valid buffer
+ * types (and add a new one if the one you need does not exist).
+ * The list must be terminated with a 0.
+ */
+static const uint32_t video_types[] = {
+  BUF_VIDEO_VC1, BUF_VIDEO_WMV9,
+  0  /* list terminator */
+};
+
+/*
+ * This data structure combines the list of supported xine buffer types and
+ * the priority that the plugin should be given with respect to other
+ * plugins that handle the same buffer type. A plugin with priority (n+1)
+ * will be used instead of a plugin with priority (n).
+ */
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types */
+  8                    /* priority        */
+};
+
+/*
+ * The plugin catalog entry. This is the only information that this plugin
+ * exports to the public.
+ */
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* { type, API, "name", version, special_info, init_function } */
+  { PLUGIN_VIDEO_DECODER, 19, "vdpau_vc1", XINE_VERSION_CODE, &dec_info_video, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }  /* PLUGIN_NONE entry closes the list */
+};
diff --git a/src/video_dec/rgb.c b/src/video_dec/rgb.c
new file mode 100644
index 000000000..678ce8194
--- /dev/null
+++ b/src/video_dec/rgb.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (C) 2000-2003 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * Raw RGB "Decoder" by Mike Melanson (melanson@pcisys.net)
+ * Actually, this decoder just converts a raw RGB image to a YUY2 map
+ * suitable for display under xine.
+ *
+ * This decoder deals with raw RGB data from Microsoft and Quicktime files.
+ * Data from a MS file can be 32-, 24-, 16-, or 8-bit. The latter can also
+ * be grayscale, depending on whether a palette is present. Data from a QT
+ * file can be 32-, 24-, 16-, 8-, 4-, 2-, or 1-bit. Any resolutions <= 8
+ * can also be greyscale depending on what the QT file specifies.
+ *
+ * One more catch: Raw RGB from a Microsoft file is upside down. This is
+ * indicated by a negative height parameter.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define LOG_MODULE "rgb"
+#define LOG_VERBOSE
+/*
+#define LOG
+*/
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* generic decoder class record; no extra class state */
+} rgb_class_t;
+
+typedef struct rgb_decoder_s {
+  video_decoder_t   video_decoder;  /* parent video decoder structure */
+
+  rgb_class_t      *class;       /* class this instance was opened from */
+  xine_stream_t    *stream;      /* stream whose video_out receives the frames */
+
+  /* these are traditional variables in a video decoder object */
+  uint64_t          video_step;  /* frame duration in pts units */
+  int               decoder_ok;  /* current decoder status */
+  int               skipframes;  /* not referenced anywhere in this file */
+
+  unsigned char    *buf;         /* the accumulated buffer data */
+  int               bufsize;     /* the maximum size of buf */
+  int               size;        /* the current size of buf */
+
+  int               width;       /* the width of a video frame */
+  int               height;      /* the height of a video frame */
+  double            ratio;       /* the width to height ratio */
+  int               bytes_per_pixel;  /* (bit_depth + 1) / 8; 0 for depths below 7 bits */
+  int               bit_depth;   /* biBitCount from the header, masked to 5 bits if above 32 */
+  int               upside_down; /* set when the header height was negative */
+
+  unsigned char     yuv_palette[256 * 4];  /* 256 entries; bytes 0..2 of each hold Y, U, V */
+  yuv_planes_t      yuv_planes;  /* per-pixel Y/U/V planes filled before the YUY2 conversion */
+
+} rgb_decoder_t;
+
+static void rgb_decode_data (video_decoder_t *this_gen,
+  buf_element_t *buf) {
+  /* Handles one demuxer buffer: applies palette, frame-rate and header updates; otherwise
+   * accumulates payload and, on BUF_FLAG_FRAME_END, converts the frame to YUY2 and draws
+   * it. Preview buffers are ignored. */
+  rgb_decoder_t *this = (rgb_decoder_t *) this_gen;
+  xine_bmiheader *bih;
+  palette_entry_t *palette;
+  int i;
+  int pixel_ptr, row_ptr;
+  int palette_index;
+  int buf_ptr;
+  unsigned int packed_pixel;
+  unsigned char r, g, b;
+  int pixels_left;
+  unsigned char pixel_byte = 0;
+
+  vo_frame_t *img; /* video out frame */
+
+  /* a video decoder does not care about this flag (?) */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if ((buf->decoder_flags & BUF_FLAG_SPECIAL) &&
+    (buf->decoder_info[1] == BUF_SPECIAL_PALETTE)) {
+    palette = (palette_entry_t *)buf->decoder_info_ptr[2];
+    for (i = 0; i < 256 && i < buf->decoder_info[2]; i++) { /* yuv_palette holds 256 entries */
+      this->yuv_palette[i * 4 + 0] =
+        COMPUTE_Y(palette[i].r, palette[i].g, palette[i].b);
+      this->yuv_palette[i * 4 + 1] =
+        COMPUTE_U(palette[i].r, palette[i].g, palette[i].b);
+      this->yuv_palette[i * 4 + 2] =
+        COMPUTE_V(palette[i].r, palette[i].g, palette[i].b);
+    }
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_FRAMERATE) {
+    this->video_step = buf->decoder_info[0];
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, this->video_step);
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER) { /* need to initialize */
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+
+    bih = (xine_bmiheader *) buf->content;
+    this->width = (bih->biWidth + 3) & ~0x03;   /* rounded up to a multiple of 4 */
+    this->height = (bih->biHeight + 3) & ~0x03;
+    if (this->height < 0) {
+      this->upside_down = 1;
+      this->height = -this->height;
+    } else {
+      this->upside_down = 0;
+    }
+    this->ratio = (double)this->width/(double)this->height;
+
+    this->bit_depth = bih->biBitCount;
+    if (this->bit_depth > 32)
+      this->bit_depth &= 0x1F;
+    lprintf("width = %d, height = %d, bit_depth = %d\n", this->width, this->height, this->bit_depth);
+
+    this->bytes_per_pixel = (this->bit_depth + 1) / 8; /* round this number up in case of 15 */
+
+    free (this->buf);
+
+    /* minimal buffer size */
+    this->bufsize = this->width * this->height * this->bytes_per_pixel;
+    this->buf = calloc(1, this->bufsize);
+    this->size = 0;
+
+    init_yuv_planes(&this->yuv_planes, this->width, this->height);
+
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+    this->decoder_ok = 1;
+
+    /* load the stream/meta info */
+    _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Raw RGB");
+
+    return;
+  } else if (this->decoder_ok) {
+
+    if (this->size + buf->size > this->bufsize) {
+      unsigned char *grown = realloc (this->buf, this->size + 2 * buf->size);
+      if (!grown)
+        return; /* out of memory: keep the old buffer and drop this chunk */
+      this->buf = grown;
+      this->bufsize = this->size + 2 * buf->size;
+    }
+    xine_fast_memcpy (&this->buf[this->size], buf->content, buf->size);
+    this->size += buf->size;
+
+    if (buf->decoder_flags & BUF_FLAG_FRAME_END) {
+      img = this->stream->video_out->get_frame (this->stream->video_out,
+                                        this->width, this->height,
+                                        this->ratio, XINE_IMGFMT_YUY2,
+                                        VO_BOTH_FIELDS);
+
+      img->duration  = this->video_step;
+      img->pts       = buf->pts;
+      img->bad_frame = 0;
+
+      /* iterate through each row */
+      buf_ptr = 0;
+      if (this->upside_down) {
+        for (row_ptr = this->yuv_planes.row_width * (this->yuv_planes.row_count - 1);
+          row_ptr >= 0; row_ptr -= this->yuv_planes.row_width) {
+          for (pixel_ptr = 0; pixel_ptr < this->width; pixel_ptr++) {
+
+            if (this->bytes_per_pixel == 1) {
+
+              palette_index = this->buf[buf_ptr++];
+
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 0];
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 1];
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 2];
+
+            } else if (this->bytes_per_pixel == 2) {
+
+              /* ABGR1555 format, little-endian order */
+              packed_pixel = _X_LE_16(&this->buf[buf_ptr]);
+              buf_ptr += 2;
+              UNPACK_BGR15(packed_pixel, r, g, b);
+
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                COMPUTE_Y(r, g, b);
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                COMPUTE_U(r, g, b);
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                COMPUTE_V(r, g, b);
+
+            } else {
+
+              /* BGR24 or BGRA32 */
+              b = this->buf[buf_ptr++];
+              g = this->buf[buf_ptr++];
+              r = this->buf[buf_ptr++];
+
+              /* the next line takes care of 'A' in the 32-bit case */
+              buf_ptr += this->bytes_per_pixel - 3;
+
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                COMPUTE_Y(r, g, b);
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                COMPUTE_U(r, g, b);
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                COMPUTE_V(r, g, b);
+
+            }
+          }
+        }
+      } else {
+
+        for (row_ptr = 0; row_ptr < this->yuv_planes.row_width * this->yuv_planes.row_count; row_ptr += this->yuv_planes.row_width) {
+          pixels_left = 0; /* sub-byte depths start every row on a fresh byte */
+          for (pixel_ptr = 0; pixel_ptr < this->width; pixel_ptr++) {
+
+            if (this->bit_depth == 1) {
+
+              if (pixels_left == 0) {
+                pixels_left = 8;
+                pixel_byte = this->buf[buf_ptr++]; /* index the buffer; never move this->buf itself */
+              }
+
+              if (pixel_byte & 0x80) {
+                this->yuv_planes.y[row_ptr + pixel_ptr] =
+                  this->yuv_palette[1 * 4 + 0];
+                this->yuv_planes.u[row_ptr + pixel_ptr] =
+                  this->yuv_palette[1 * 4 + 1];
+                this->yuv_planes.v[row_ptr + pixel_ptr] =
+                  this->yuv_palette[1 * 4 + 2];
+              } else {
+                this->yuv_planes.y[row_ptr + pixel_ptr] =
+                  this->yuv_palette[0 * 4 + 0];
+                this->yuv_planes.u[row_ptr + pixel_ptr] =
+                  this->yuv_palette[0 * 4 + 1];
+                this->yuv_planes.v[row_ptr + pixel_ptr] =
+                  this->yuv_palette[0 * 4 + 2];
+              }
+              pixels_left--;
+              pixel_byte <<= 1;
+
+            } else if (this->bit_depth == 2) {
+
+              if (pixels_left == 0) {
+                pixels_left = 4;
+                pixel_byte = this->buf[buf_ptr++];
+              }
+
+              palette_index = (pixel_byte & 0xC0) >> 6;
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 0];
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 1];
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 2];
+
+              pixels_left--;
+              pixel_byte <<= 2;
+
+            } else if (this->bit_depth == 4) {
+
+              if (pixels_left == 0) {
+                pixels_left = 2;
+                pixel_byte = this->buf[buf_ptr++];
+              }
+
+              palette_index = (pixel_byte & 0xF0) >> 4;
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 0];
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 1];
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 2];
+
+              pixels_left--;
+              pixel_byte <<= 4;
+
+            } else if (this->bytes_per_pixel == 1) {
+
+              palette_index = this->buf[buf_ptr++];
+
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 0];
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 1];
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                this->yuv_palette[palette_index * 4 + 2];
+
+            } else if (this->bytes_per_pixel == 2) {
+
+              /* ARGB1555 format, big-endian order */
+              packed_pixel = _X_BE_16(&this->buf[buf_ptr]);
+              buf_ptr += 2;
+              UNPACK_RGB15(packed_pixel, r, g, b);
+
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                COMPUTE_Y(r, g, b);
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                COMPUTE_U(r, g, b);
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                COMPUTE_V(r, g, b);
+
+            } else {
+
+              /* RGB24 or ARGB32; the next line takes care of 'A' in the
+               * 32-bit case */
+              buf_ptr += this->bytes_per_pixel - 3;
+
+              r = this->buf[buf_ptr++];
+              g = this->buf[buf_ptr++];
+              b = this->buf[buf_ptr++];
+
+              this->yuv_planes.y[row_ptr + pixel_ptr] =
+                COMPUTE_Y(r, g, b);
+              this->yuv_planes.u[row_ptr + pixel_ptr] =
+                COMPUTE_U(r, g, b);
+              this->yuv_planes.v[row_ptr + pixel_ptr] =
+                COMPUTE_V(r, g, b);
+
+            }
+          }
+        }
+      }
+
+      yuv444_to_yuy2(&this->yuv_planes, img->base[0], img->pitches[0]);
+
+      img->draw(img, this->stream);
+      img->free(img);
+
+      this->size = 0;
+    }
+  }
+}
+
+/*
+ * This function is called when xine needs to flush the system. The body
+ * is empty: this decoder does nothing on flush.
+ */
+static void rgb_flush (video_decoder_t *this_gen) {
+}
+
+/*
+ * Resets the video decoder by discarding the frame data accumulated so
+ * far; the buffer itself and the stream parameters are kept.
+ */
+static void rgb_reset (video_decoder_t *this_gen) {
+  /* size is the fill level of buf, so zeroing it restarts accumulation */
+  ((rgb_decoder_t *) this_gen)->size = 0;
+}
+
+static void rgb_discontinuity (video_decoder_t *this_gen) { /* no-op: no pts is stored in the decoder, frames take buf->pts */
+}
+
+/*
+ * Releases a decoder instance: frees the data buffer, closes video_out if
+ * the decoder had been initialised, then frees the instance itself.
+ */
+static void rgb_dispose (video_decoder_t *this_gen) {
+  rgb_decoder_t *dec = (rgb_decoder_t *) this_gen;
+  const int was_open = dec->decoder_ok;
+
+  free (dec->buf);
+  dec->decoder_ok = 0;
+  if (was_open)
+    dec->stream->video_out->close(dec->stream->video_out, dec->stream);
+  /* NOTE(review): yuv_planes set up by init_yuv_planes() is not released here - confirm ownership */
+  free (this_gen);
+}
+
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  /* Creates a zeroed decoder instance bound to stream; returns NULL when allocation fails. */
+  rgb_decoder_t *this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->video_decoder.decode_data         = rgb_decode_data;
+  this->video_decoder.flush               = rgb_flush;
+  this->video_decoder.reset               = rgb_reset;
+  this->video_decoder.discontinuity       = rgb_discontinuity;
+  this->video_decoder.dispose             = rgb_dispose;
+  this->size                              = 0;
+
+  this->stream                            = stream;
+  this->class                             = (rgb_class_t *) class_gen;
+
+  this->decoder_ok    = 0;    /* set to 1 once a BUF_FLAG_STDHEADER buffer has been processed */
+  this->buf           = NULL; /* allocated when the header arrives */
+
+  return &this->video_decoder;
+}
+
+static void *init_plugin (xine_t *xine, void *data) {
+  /* Builds the decoder class record; returns NULL when allocation fails. */
+  rgb_class_t *this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "RGB";
+  this->decoder_class.description     = N_("Raw RGB video decoder plugin");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+/*
+ * exported plugin catalog entry
+ */
+
+static const uint32_t video_types[] = {
+  BUF_VIDEO_RGB,       /* the only buffer type handled by this decoder */
+  0                    /* list terminator */
+ };
+
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types */
+  1                    /* priority        */
+};
+
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "rgb", XINE_VERSION_CODE, &dec_info_video, init_plugin },
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL }  /* PLUGIN_NONE entry closes the list */
+};
diff --git a/src/video_dec/yuv.c b/src/video_dec/yuv.c
new file mode 100644
index 000000000..c1a8b1829
--- /dev/null
+++ b/src/video_dec/yuv.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (C) 2000-2004 the xine project
+ *
+ * This file is part of xine, a free video player.
+ *
+ * xine is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * xine is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
+ *
+ * YUV "Decoder" by Mike Melanson (melanson@pcisys.net)
+ * Actually, this decoder just reorganizes chunks of raw YUV data in such
+ * a way that xine can display them.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <xine/xine_internal.h>
+#include <xine/video_out.h>
+#include <xine/buffer.h>
+#include <xine/xineutils.h>
+#include "bswap.h"
+
+#define VIDEOBUFSIZE (128*1024)  /* initial size, in bytes, of the frame accumulation buffer */
+
+typedef struct {
+  video_decoder_class_t   decoder_class;  /* generic class part, filled in by init_plugin(); the only member */
+} yuv_class_t;
+
+typedef struct yuv_decoder_s {
+  video_decoder_t   video_decoder;  /* parent video decoder structure */
+
+  yuv_class_t      *class;       /* class object handed to open_plugin() */
+  xine_stream_t    *stream;      /* stream handed to open_plugin(); used to reach video_out */
+
+  /* these are traditional variables in a video decoder object */
+  uint64_t          video_step;  /* frame duration in pts units */
+  int               decoder_ok;  /* 1 once a header was processed and video_out opened */
+  int               skipframes;  /* not referenced anywhere in this file */
+
+  unsigned char    *buf;         /* accumulates the chunks of a frame split over several buffers */
+  int               bufsize;     /* number of bytes allocated for buf */
+  int               size;        /* number of bytes of buf currently holding frame data */
+
+  int               width;       /* frame width, rounded up to a multiple of 4 */
+  int               height;      /* frame height, rounded up to a multiple of 4 */
+  double            ratio;       /* aspect ratio passed to get_frame() */
+
+  int               progressive;      /* copied from decoder_info[3] of the header; not read in this file */
+  int               top_field_first;  /* copied from decoder_info[4] of the header; not read in this file */
+
+} yuv_decoder_t;
+
+/**************************************************************************
+ * xine video plugin functions
+ *************************************************************************/
+
+/*
+ * Handles one buffer from the demuxer layer according to its header flags:
+ *  - BUF_FLAG_PREVIEW buffers are ignored;
+ *  - BUF_FLAG_FRAMERATE updates the frame duration from decoder_info[0];
+ *  - BUF_FLAG_STDHEADER (re)initialises the decoder from the xine_bmiheader
+ *    found in buf->content, then returns;
+ *  - otherwise, once initialised, non-BUF_FLAG_SPECIAL buffers are frame
+ *    data: chunks are gathered until BUF_FLAG_FRAME_END, then the frame is
+ *    converted according to buf->type and handed to the video output.
+ *
+ * Running out of memory does not crash the decoder: a failed buffer
+ * allocation is retried when data arrives, and a failed realloc() drops the
+ * partially assembled frame while this->buf keeps owning the old block.
+ */
+static void yuv_decode_data (video_decoder_t *this_gen,
+  buf_element_t *buf) {
+  yuv_decoder_t *this = (yuv_decoder_t *) this_gen;
+  xine_bmiheader *bih;
+
+  vo_frame_t *img; /* video out frame */
+
+  /* a video decoder does not care about this flag (?) */
+  if (buf->decoder_flags & BUF_FLAG_PREVIEW)
+    return;
+
+  if (buf->decoder_flags & BUF_FLAG_FRAMERATE) {
+    this->video_step = buf->decoder_info[0];
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_FRAME_DURATION, this->video_step);
+  }
+
+  if (buf->decoder_flags & BUF_FLAG_STDHEADER) { /* need to initialize */
+    (this->stream->video_out->open) (this->stream->video_out, this->stream);
+
+    /* frame dimensions are rounded up to the next multiple of 4 */
+    bih = (xine_bmiheader *) buf->content;
+    this->width = (bih->biWidth + 3) & ~0x03;
+    this->height = (bih->biHeight + 3) & ~0x03;
+
+    if (buf->decoder_flags & BUF_FLAG_ASPECT)
+      this->ratio = (double)buf->decoder_info[1] / (double)buf->decoder_info[2];
+    else
+      this->ratio = (double)this->width / (double)this->height;
+
+    this->progressive = buf->decoder_info[3];
+    this->top_field_first = buf->decoder_info[4];
+
+    /* Start over with a fresh accumulation buffer.  Should the allocation
+     * fail, a capacity of 0 is recorded so the data path below allocates
+     * on demand instead of copying through a NULL pointer. */
+    free (this->buf);
+    this->buf = malloc(VIDEOBUFSIZE);
+    this->bufsize = this->buf ? VIDEOBUFSIZE : 0;
+    this->size = 0;
+
+    this->decoder_ok = 1;
+
+    /* load the stream/meta info */
+    switch (buf->type) {
+      case BUF_VIDEO_YUY2:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Raw YUY2");
+        break;
+      case BUF_VIDEO_YV12:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Raw YV12");
+        break;
+      case BUF_VIDEO_YVU9:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Raw YVU9");
+        break;
+      case BUF_VIDEO_GREY:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Greyscale YUV");
+        break;
+      case BUF_VIDEO_I420:
+        _x_meta_info_set_utf8(this->stream, XINE_META_INFO_VIDEOCODEC, "Raw I420");
+        break;
+    }
+
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_WIDTH,  this->width);
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_HEIGHT, this->height);
+    _x_stream_info_set(this->stream, XINE_STREAM_INFO_VIDEO_RATIO,  this->ratio*10000);
+
+    return;
+  } else if (this->decoder_ok && !(buf->decoder_flags & BUF_FLAG_SPECIAL)) {
+    uint8_t *src;
+
+    /* if buffer contains an entire frame then there's no need to copy it
+     * into our internal buffer */
+    if ((buf->decoder_flags & BUF_FLAG_FRAME_START) &&
+        (buf->decoder_flags & BUF_FLAG_FRAME_END))
+      src = buf->content;
+    else {
+      if (!this->buf || this->size + buf->size > this->bufsize) {
+        /* grow via a temporary: on failure the old block stays owned by
+         * this->buf (no leak) and nothing is written through NULL */
+        int            new_bufsize = this->size + 2 * buf->size;
+        unsigned char *new_buf     = realloc (this->buf, new_bufsize);
+
+        if (!new_buf) {
+          this->size = 0; /* give up on the frame being assembled */
+          return;
+        }
+        this->buf     = new_buf;
+        this->bufsize = new_bufsize;
+      }
+
+      xine_fast_memcpy (&this->buf[this->size], buf->content, buf->size);
+      this->size += buf->size;
+      src = this->buf;
+    }
+
+    if (buf->decoder_flags & BUF_FLAG_FRAME_END) {
+      if (buf->type == BUF_VIDEO_YUY2) {
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                          this->width, this->height,
+                                          this->ratio, XINE_IMGFMT_YUY2, VO_BOTH_FIELDS);
+
+        yuy2_to_yuy2(
+         /* src */
+          src, this->width*2,
+         /* dst */
+          img->base[0], img->pitches[0],
+         /* width x height */
+          this->width, this->height);
+      } else if (buf->type == BUF_VIDEO_YV12) {
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                          this->width, this->height,
+                                          this->ratio, XINE_IMGFMT_YV12, VO_BOTH_FIELDS);
+
+        yv12_to_yv12(
+         /* Y */
+          src, this->width,
+          img->base[0], img->pitches[0],
+         /* U */
+          src + (this->width * this->height * 5/4), this->width/2,
+          img->base[1], img->pitches[1],
+         /* V */
+          src + (this->width * this->height), this->width/2,
+          img->base[2], img->pitches[2],
+         /* width x height */
+          this->width, this->height);
+      } else if (buf->type == BUF_VIDEO_I420) {
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                          this->width, this->height,
+                                          this->ratio, XINE_IMGFMT_YV12, VO_BOTH_FIELDS);
+
+        yv12_to_yv12(
+         /* Y */
+          src, this->width,
+          img->base[0], img->pitches[0],
+         /* U */
+          src + (this->width * this->height), this->width/2,
+          img->base[1], img->pitches[1],
+         /* V */
+          src + (this->width * this->height * 5/4), this->width/2,
+          img->base[2], img->pitches[2],
+         /* width x height */
+          this->width, this->height);
+      } else if (buf->type == BUF_VIDEO_YVU9) {
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                          this->width, this->height,
+                                          this->ratio, XINE_IMGFMT_YV12, VO_BOTH_FIELDS);
+
+        yuv9_to_yv12(
+         /* Y */
+          src, this->width,
+          img->base[0], img->pitches[0],
+         /* U */
+          src + (this->width * this->height), this->width / 4,
+          img->base[1], img->pitches[1],
+         /* V */
+          src + (this->width * this->height) +
+            (this->width * this->height / 16), this->width / 4,
+          img->base[2], img->pitches[2],
+         /* width x height */
+          this->width, this->height);
+      } else if (buf->type == BUF_VIDEO_GREY) {
+        int y;
+
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                          this->width, this->height,
+                                          this->ratio, XINE_IMGFMT_YV12, VO_BOTH_FIELDS);
+
+        /* Copy the luma plane and fill both chroma planes with neutral 0x80
+         * one line at a time, stepping by the frame's pitches, which may
+         * differ from the picture width (the other conversions allow this). */
+        for (y = 0; y < this->height; y++)
+          xine_fast_memcpy (img->base[0] + y * img->pitches[0],
+                            src + y * this->width, this->width);
+        for (y = 0; y < this->height / 2; y++) {
+          memset (img->base[1] + y * img->pitches[1], 0x80, this->width / 2);
+          memset (img->base[2] + y * img->pitches[2], 0x80, this->width / 2);
+        }
+      } else {
+        /* just allocate something to avoid compiler warnings */
+        img = this->stream->video_out->get_frame (this->stream->video_out,
+                                          this->width, this->height,
+                                          this->ratio, XINE_IMGFMT_YV12, VO_BOTH_FIELDS);
+      }
+
+      img->duration  = this->video_step;
+      img->pts       = buf->pts;
+      img->bad_frame = 0;
+
+      img->draw(img, this->stream);
+      img->free(img);
+
+      this->size = 0;
+    }
+  }
+}
+
+/*
+ * Flush hook of the decoder interface; currently a no-op for this decoder.
+ */
+static void yuv_flush (video_decoder_t *this_gen) {
+  (void) this_gen;
+}
+
+/*
+ * Reset hook: forgets whatever part of a frame has been accumulated so far.
+ */
+static void yuv_reset (video_decoder_t *this_gen) {
+  yuv_decoder_t *decoder = (yuv_decoder_t *) this_gen;
+
+  decoder->size = 0;
+}
+
+static void yuv_discontinuity (video_decoder_t *this_gen) { /* discontinuity hook: empty, no state is touched */
+}
+
+/*
+ * Tears down a decoder instance: data buffer, video output (if opened), self.
+ */
+static void yuv_dispose (video_decoder_t *this_gen) {
+  yuv_decoder_t *decoder = (yuv_decoder_t *) this_gen;
+  xine_stream_t *stream  = decoder->stream;
+
+  free (decoder->buf);
+  if (decoder->decoder_ok) {
+    decoder->decoder_ok = 0;
+    stream->video_out->close(stream->video_out, stream);
+  }
+
+  free (this_gen);
+}
+
+static video_decoder_t *open_plugin (video_decoder_class_t *class_gen, xine_stream_t *stream) {
+  /* Creates one YUV decoder instance bound to `stream`.  Returns NULL when
+   * the instance cannot be allocated, rather than writing through NULL. */
+  yuv_decoder_t *this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->video_decoder.decode_data         = yuv_decode_data;
+  this->video_decoder.flush               = yuv_flush;
+  this->video_decoder.reset               = yuv_reset;
+  this->video_decoder.discontinuity       = yuv_discontinuity;
+  this->video_decoder.dispose             = yuv_dispose;
+
+  this->stream                            = stream;
+  this->class                             = (yuv_class_t *) class_gen;
+
+  /* size and decoder_ok are already 0 after calloc() */
+  this->buf                               = NULL;
+
+  return &this->video_decoder;
+}
+
+static void *init_plugin (xine_t *xine, void *data) {
+  /* Builds the decoder class object; returns NULL if it cannot be allocated. */
+  yuv_class_t *this = calloc(1, sizeof(*this));
+  if (!this)
+    return NULL;
+
+  this->decoder_class.open_plugin     = open_plugin;
+  this->decoder_class.identifier      = "YUV";
+  this->decoder_class.description     = N_("Raw YUV video decoder plugin");
+  this->decoder_class.dispose         = default_video_decoder_class_dispose;
+
+  return this;
+}
+
+/*
+ * exported plugin catalog entry
+ */
+
+static const uint32_t video_types[] = {
+  BUF_VIDEO_YUY2,      /* drawn as an XINE_IMGFMT_YUY2 frame */
+  BUF_VIDEO_YV12,      /* planar; chroma planes read in the opposite order to I420 */
+  BUF_VIDEO_YVU9,      /* planar with quarter-width chroma lines; converted to YV12 */
+  BUF_VIDEO_GREY,      /* luma only; both chroma planes are filled with 0x80 */
+  BUF_VIDEO_I420,      /* planar; drawn as an XINE_IMGFMT_YV12 frame */
+  0                    /* NOTE(review): presumably the end-of-list marker - confirm */
+ };
+
+static const decoder_info_t dec_info_video = {
+  video_types,         /* supported types: the video_types[] table above */
+  1                    /* priority - NOTE(review): ordering semantics not visible here */
+};
+
+const plugin_info_t xine_plugin_info[] EXPORTED = {
+  /* type, API, "name", version, special_info, init_function */
+  { PLUGIN_VIDEO_DECODER, 19, "yuv", XINE_VERSION_CODE, &dec_info_video, init_plugin }, /* the YUV decoder */
+  { PLUGIN_NONE, 0, "", 0, NULL, NULL } /* NOTE(review): presumably the list terminator - confirm */
+};